User:XLinkBot/Code/LinkParser.pl

#!/usr/bin/perl

# Detach from the launching process: the parent (which sees a true child pid
# from fork) exits immediately, and only the forked child runs the rest of
# the script. If fork fails it returns undef and the script simply carries on
# in the original process.
exit if fork;

use POE qw (Component::Client::TCP);
use HTML::Entities;
use LWP::UserAgent;
use perlwikipedia;
use strict;

# Wiki client; server_input uses it to fetch a page's text when the diff URL
# of an edit does not go through index.php.
my $editor=Perlwikipedia->new("LinkParser","LinkParser");

# HTTP client used to download rendered diffs.
my $diffFetcher=LWP::UserAgent->new;
$diffFetcher->agent("LinkParser/2.0");

# Runtime settings, filled from the "linkwatcher-config" file below.
my %settings;

# Debug output is off unless a "debug=..." line in the config file enables it.
$settings{'debug'} = 0;

# NOTE: debug is still 0 at this point, so this line only prints if the
# default above is changed.
print "Reading config file...\n" if $settings{'debug'};

# Config format: one "key=value" pair per line; lines starting with '#' are
# skipped. Three-argument open with a lexical handle avoids the global
# bareword handle and any mode injection through the file name, and the
# while loop reads one line at a time instead of slurping the whole file.
open( my $config_fh, '<', 'linkwatcher-config' )
    or die "Can't open LinkWatcher config: $!";
while ( my $line = <$config_fh> ) {
    next if $line =~ /^#/;
    if ( $line =~ /(.+?)=(.+)/ ) {
        $settings{$1} = $2;
    }
}
close($config_fh);

# "prefixes" is a '|'-separated list; a missing setting yields an empty list.
my @prefixes = split( /\|/,
    defined $settings{'prefixes'} ? $settings{'prefixes'} : '' );

print ("Prefixes: " . join(" - ", @prefixes) . "\n") if $settings{'debug'};

print "done\n" if $settings{'debug'};


# TCP port of the local server, taken from the first command-line argument.
my $server_port = shift @ARGV;

# Pause before connecting (presumably to give the server time to start
# listening -- TODO confirm).
sleep 4;

# File-level state shared with the event handlers defined further down.
my $number_of_edits = 0;
my ( $heap, $kernel );

# Set up the client session: it connects to the server on localhost and
# routes connection and input events to the handlers below.
POE::Component::Client::TCP->new(
    RemoteAddress => '127.0.0.1',
    RemotePort    => $server_port,
    Connected     => \&connected,
    ServerInput   => \&server_input,
);

# Run the event loop until no sessions remain, then finish successfully.
POE::Kernel->run();
exit 0;

# ServerInput handler: called with every line received from the server.
#
# Recognised input:
#   "EDIT [[page]] [[lang:User:name]] http://diff-url size"
#       Work out which http:// links the edit added (links that also appear
#       on the removed side of the diff are not counted), report them back
#       as a "PARSED ..." line when there are any, then send "REQUEST" to
#       ask for the next edit.
#   anything containing "NOEDIT"
#       Wait one second, then send "REQUEST" again.
# Any other input is ignored.
#
# Once more than 50 edits have been handled, the client is shut down instead
# of requesting further work; POE::Kernel->run() then returns and the
# top-level "exit 0" ends the process.
sub server_input {
    my ( $heap, $kernel, $input ) = @_[ HEAP, KERNEL, ARG0 ];

    if ($input =~ m{EDIT \[\[(.+)\]\] \[\[(.+):User:(.+?)\]\] (http:\/\/.+) (.+)}) {
        my ( $pagename, $lang, $username, $diffurl, $size )
            = ( $1, $2, $3, $4, $5 );
        $number_of_edits++;

        my $addedTotal   = "";
        my $removedTotal = "";

        if ($diffurl =~ m/index\.php/) {
            # Regular diff: fetch the rendered diff table and collect the
            # text of the added and the deleted cells.
            my $diffUrl     = "$diffurl&diffonly=1&action=render";
            my $diffContent = $diffFetcher->get($diffUrl)->content;
            print ("$diffContent\n") if $settings{'debug'};

            my @addedPre   = $diffContent =~ m/<td class=.diff-addedline.><div>(.*?)<\/div><\/td>/sg;
            my @removedPre = $diffContent =~ m/<td class=.diff-deletedline.><div>(.*?)<\/div><\/td>/sg;

            # Both sides get the same clean-up. In particular the inline
            # <del> markers on the removed side must go too, otherwise a
            # removed URL is cut short at the '<' and no longer matches its
            # counterpart on the added side.
            $addedTotal   = lc( _strip_diff_markup( join( ' ', @addedPre ) ) );
            $removedTotal = lc( _strip_diff_markup( join( ' ', @removedPre ) ) );
            print ("Added data: $addedTotal\n") if $settings{'debug'};
        }
        else {
            # No index.php diff to fetch: treat the whole current page text
            # as added and nothing as removed.
            $addedTotal = lc( $editor->get_text($pagename) );
        }

        # decode_entities() in void context rewrites its argument in place.
        decode_entities($addedTotal);
        decode_entities($removedTotal);

        my @addedlinks   = _extract_links($addedTotal);
        my @removedlinks = _extract_links($removedTotal);

        if ( @addedlinks && @removedlinks ) {
            print("----\nDIFF $diffurl ".join(" ",@addedlinks)." - ".join(" ",@removedlinks)."\n----\n") if $settings{'debug'};
        }

        # A link counts as really added only when it is absent from the
        # removed side; the hash makes that a single pass over each list.
        my %was_removed        = map { ( $_ => 1 ) } @removedlinks;
        my @really_added_links = grep { !$was_removed{$_} } @addedlinks;

        if (@addedlinks) {
            print ("DIFF $diffurl ".join(" ",@really_added_links)."\n----\n") if $settings{'debug'};
        }

        if (@really_added_links) {
            my $message="PARSED [[$pagename]] $diffurl $size [[$lang:User:$username]] |" . join(" ",@really_added_links) . "|";
            $heap->{server}->put($message);
        }

        if ( $number_of_edits > 50 ) {
            # put() only queues output, so exiting right here would drop a
            # PARSED line that was just queued. Ask the client component to
            # shut down instead (it is expected to flush pending output
            # before closing the socket) and do not request another edit
            # that nobody would be left to process.
            $kernel->yield("shutdown");
            return;
        }

        $heap->{server}->put("REQUEST");
    }
    elsif ($input =~ m{NOEDIT}) {
        # Nothing to do right now: back off briefly before asking again.
        sleep 1;
        $heap->{server}->put("REQUEST");
    }
}

# Remove the change-highlighting tags (<span>, <ins> and <del> with class
# "diffchange", optionally "diffchange-inline", plus their closing tags)
# from a piece of diff HTML and return the cleaned text.
sub _strip_diff_markup {
    my ($text) = @_;
    $text =~ s{<(?:span|ins|del) class=.diffchange(?: diffchange-inline)?.>}{}g;
    $text =~ s{</(?:span|ins|del)>}{}g;
    return $text;
}

# Return every http:// URL found in the given text, in order of appearance.
sub _extract_links {
    my ($text) = @_;
    my @links = $text =~ m{(http://[^\s\]\[\{\}\\\|^~`<>]+)}sgi;
    return @links;
}

# Connected handler (registered with the TCP client component): stores the
# kernel and heap in the file-level variables and sends the first "REQUEST"
# line to the server.
sub connected {
    $kernel = $_[KERNEL];
    $heap   = $_[HEAP];
    $heap->{server}->put("REQUEST");
}

# Event handler that stores the kernel and heap in the file-level variables
# and sends a "REQUEST" line to the server. It is not registered with any
# event in the visible part of the script.
sub request_edit {
    $kernel = $_[KERNEL];
    $heap   = $_[HEAP];
    $heap->{server}->put("REQUEST");
}