User:AnomieBOT/source/tasks/ReplaceExternalLinks3.pm

package tasks::ReplaceExternalLinks3;

=pod

=begin metadata

Bot:     AnomieBOT
Task:    ReplaceExternalLinks3
BRFA:    Wikipedia:Bots/Requests for approval/AnomieBOT 50
Status:  Completed 2011-12-28
Created: 2011-01-06

Process pages linking to <nowiki>http://www.nr.nps.gov/</nowiki>:
* Replace links beginning with "<nowiki>http://www.nr.nps.gov/multiples/</nowiki>" with the corresponding link starting "<nowiki>http://pdfhost.focus.nps.gov/docs/NRHP/Text/</nowiki>".
* Replace {{tl|cite web}} templates with url <nowiki>http://www.nr.nps.gov/</nowiki> with {{tl|NRISref}}.
* Tag other {{tl|cite web}} templates and non-{{tl|cite web}} links with {{tl|NRIS dead link}}.

=end metadata

=cut

use utf8;
use strict;

use Data::Dumper;
use POSIX;
use Date::Parse;
use AnomieBOT::Task qw/:time/;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

# Constructor.
#
# Builds the object through the parent class constructor, adds an 'iter'
# slot initialised to undef (run() stores its page-list iterator there so the
# listing can be resumed across calls), and re-blesses the object into $class.
sub new {
    my ($class)=@_;

    my $self=$class->SUPER::new();
    $self->{'iter'}=undef;

    return bless $self, $class;
}

=pod

=for info
Approved 2011-01-28.<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 50]]

=cut

# Approval status reported to the bot framework.
#
# Always returns -1. The metadata block at the top of the file lists this task
# as "Completed"; presumably a negative value tells the framework the task is
# no longer active -- confirm against AnomieBOT::Task.
sub approved {
    my $status=-1;
    return $status;
}

# Main entry point of the task.
#
# Iterates over every mainspace page that the exturlusage list reports as
# linking to www.nr.nps.gov and, for each page:
#  * rewrites links under http://www.nr.nps.gov/multiples/ to the
#    corresponding http://pdfhost.focus.nps.gov/docs/NRHP/Text/ location;
#  * replaces {{cite web}} (or any redirect to it) whose url is exactly
#    http://www.nr.nps.gov/ with {{NRISref|<version or date>}};
#  * appends {{NRIS dead link}} to other {{cite web}} templates pointing at
#    that host, to bracketed external links and to bare external links;
#  * inside {{Infobox NRHP}} (or any redirect to it) replaces
#    "{{convert|0.9|acre}}" with "less than one acre".
# The edit summary lists each kind of change with its count.
#
# Parameters:
#   $self - the task object; $self->{'iter'} keeps the page iterator between
#           calls so the listing resumes where the previous call stopped.
#   $api  - the AnomieBOT API object used for all wiki access.
#
# Returns 60 after a failed lookup or list query, 300 when the task has been
# shut off, 0 when halting or when the 5-minute time slice is used up, and
# undef once the iterator is exhausted. NOTE(review): presumably the number is
# the delay in seconds before the framework calls run() again and undef means
# "finished" -- confirm against AnomieBOT::Task.
sub run {
    my ($self, $api)=@_;

    $api->task('ReplaceExternalLinks3', 0, 10, qw/d::Redirects d::Templates d::Nowiki/);

    my $screwup='Errors? [[User:'.$api->user.'/shutoff/ReplaceExternalLinks3]]';

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;

    # Known citation dates (ISO format) and the {{NRISref}} parameter that
    # replaces each of them.
    my %nris_version=(
        '2010-07-09' => '2010a',
        '2009-03-13' => '2009a',
        '2008-04-24' => '2008b',
        '2008-04-15' => '2008a',
        '2007-06-30' => '2007b',
        '2007-01-23' => '2007a',
        '2006-03-15' => '2006a',
    );

    # Get list of citation templates
    my %templates=$api->redirects_to_resolved(
        'Template:Cite web',
    );
    if(exists($templates{''})){
        $api->warn("Failed to get citation template redirects: ".$templates{''}{'error'}."\n");
        return 60;
    }

    # Get list of infobox templates
    my %infoboxes=$api->redirects_to_resolved(
        'Template:Infobox NRHP',
    );
    if(exists($infoboxes{''})){
        $api->warn("Failed to get infobox template redirects: ".$infoboxes{''}{'error'}."\n");
        return 60;
    }

    # Get target template
    my %t=$api->resolve_redirects('Template:NRISref');
    if(exists($t{''})){
        $api->warn("Failed to get NRISref template redirect: ".$t{''}{'error'}."\n");
        return 60;
    }
    my $NRISref=$t{'Template:NRISref'};
    $NRISref=~s/Template://;

    # Create the page iterator only once; later calls resume the same listing.
    if(!defined($self->{'iter'})){
        $self->{'iter'}=$api->iterator(
            list        => 'exturlusage',
            eunamespace => 0,
            euprop      => 'title',
            euquery     => 'www.nr.nps.gov',
            eulimit     => '1000', # exturlusage has issues with big lists
        );
    }
    while(my $pg=$self->{'iter'}->next){
        if(!$pg->{'_ok_'}){
            $api->warn("Failed to retrieve page list for ".$self->{'iter'}->iterval.": ".$pg->{'error'}."\n");
            return 60;
        }

        return 0 if $api->halting;
        my $page=$pg->{'title'};
        my $tok=$api->edittoken($page, EditRedir => 1);
        if($tok->{'code'} eq 'shutoff'){
            $api->warn("Task disabled: ".$tok->{'content'}."\n");
            return 300;
        }
        if($tok->{'code'} ne 'success'){
            $api->warn("Failed to get edit token for $page: ".$tok->{'error'}."\n");
            next;
        }
        if(exists($tok->{'missing'})){
            $api->warn("WTF? $page does not exist?\n");
            next;
        }

        my $intxt=$tok->{'revisions'}[0]{'slots'}{'main'}{'*'};
        my $outtxt=$intxt;
        # Counters for the edit summary: cite templates replaced, moved links
        # updated, 0.9-acre infobox values repaired, links tagged as dead.
        my ($fix,$fix2,$fix9,$mark)=(0,0,0,0);

        # Replace simple moved links
        $fix2+=($outtxt=~s!http://www\.nr\.nps\.gov/multiples/!http://pdfhost.focus.nps.gov/docs/NRHP/Text/!g);

        # Replace the citation templates
        my $nowiki;
        $outtxt=$api->process_templates($outtxt, sub {
            # Only the first three callback arguments are needed here.
            my ($name, $params, $wikitext)=@_;

            if(exists($infoboxes{"Template:$name"})){
                $fix9+=($wikitext=~s/\Q{{convert|0.9|acre}}\E/less than one acre/g);
                return $wikitext;
            }

            # The callback returns undef for templates it does not rewrite.
            return undef unless exists($templates{"Template:$name"});

            my ($url,$date,$dt)=('','no date specified','');
            foreach my $p ($api->process_paramlist(@$params)){
                $p->{'name'}=~s/^\s+|\s+$//g;
                $p->{'value'}=~s/^\s+|\s+$//g;
                if($p->{'name'} eq 'url'){
                    $url=$p->{'value'};
                } elsif($p->{'name'} eq 'date'){
                    $dt=$p->{'value'};
                }
            }
            if($url=~m!^http://www\.nr\.nps\.gov/?$!){
                # Assume UTC for date strings that carry no time zone, so the
                # gmtime formatting below yields the date as written no matter
                # which zone the host runs in.
                my $d=str2time($dt, 'UTC');
                if(defined($d)){
                    $d=strftime('%F', gmtime($d));
                    $date=$nris_version{$d} if exists($nris_version{$d});
                }
                # Unparseable date: fall back to the raw parameter text.
                $d//=$dt;
                #$api->warn("Unknown date $d in $page\n") if $date eq 'no date specified';
                $date=$d if($date eq 'no date specified' && $d ne '');
                $fix++;
                return "{{$NRISref|$date}}";
            }
            if($url=~m!^http://www\.nr\.nps\.gov/!){
                $mark++;
                return $wikitext."{{NRIS dead link}}";
            }
            return undef;
        });

        # Hide cite web templates, we already processed them
        ($outtxt,$nowiki)=$api->strip_templates($outtxt, sub {
            my $name=shift;
            return exists($templates{"Template:$name"});
        }, {}, $nowiki);

        # Mark any bracketed external link.
        $mark+=($outtxt=~s!(\[http://www\.nr\.nps\.gov(?:[/:][^][<>\x22\x00-\x20\x7F]*)?(?: *[^\]\x00-\x08\x0a-\x1F]*?)\])!$1\{{NRIS dead link}}!g);

        # Hide all bracketed external links.
        ($outtxt,$nowiki)=$api->strip_regex(qr{\[http://[^][<>\x22\x00-\x20\x7F]+ *[^\]\x00-\x08\x0a-\x1F]*?\]}, $outtxt, $nowiki);

        # Mark any bare external link.
        $mark+=($outtxt=~s!\b(http://www\.nr\.nps\.gov(?:[/:][^][<>\x22\x00-\x20\x7F]*)?)! fixExtLink($1) !ge);

        # Unstrip
        $outtxt=$api->replace_stripped($outtxt,$nowiki);

        # Avoid doubling up on the template
        my $ct=0;
        do {
            $ct=($outtxt=~s/\{\{NRIS dead link\}\}\s*\{\{NRIS dead link\}\}/{{NRIS dead link}}/g);
            $mark-=$ct;
        } while($ct>0);

        if($outtxt ne $intxt){
            my @summary=();
            push @summary, "replacing $fix NRIS {{cite web}} template".($fix==1?'':'s')." with {{$NRISref}}" if $fix;
            push @summary, "updating $fix2 moved NRIS link".($fix2==1?'':'s') if $fix2;
            # Pluralise on the number of links marked, not on the number of templates replaced.
            push @summary, "marking $mark NRIS link".($mark==1?'':'s')." with {{NRIS dead link}}" if $mark;
            push @summary, "repairing $fix9 [[User talk:Elkman#NRHP places having area of .9 acres, etc.|incorrect data entry code".($fix9==1?'':'s')."]]" if $fix9;
            unless(@summary){
                $api->warn("Changes made with no summary for $page, not editing\n");
                next;
            }
            # Join as "a and b" for two items, "a, b, and c" for three or more.
            $summary[$#summary]='and '.$summary[$#summary] if @summary>1;
            my $summary=ucfirst(join((@summary>2)?', ':' ', @summary));
            $api->log("$summary in $page");
            my $r=$api->edit($tok, $outtxt, "$summary. $screwup", 1, 1);
            if($r->{'code'} ne 'success'){
                $api->warn("Write failed on $page: ".$r->{'error'}."\n");
                next;
            }
        }

        # If we've been at it long enough, let another task have a go.
        return 0 if time()>=$endtime;
    }

    # The listing is exhausted: drop the iterator so a later call starts over.
    $api->log("May be DONE!");
    $self->{'iter'}=undef;
    return undef;
}

# Duplicate MediaWiki's post-processing of a bare external link, then wrap it.
#
# Takes the raw text matched as a bare URL and peels off, in this order:
#   1. everything from the first "<", ">", "&lt;" or "&gt;" onwards;
#   2. a trailing run of the separator characters , ; . : ! ? -- plus ")"
#      when what remains of the URL contains no "(";
#   3. everything from the first "{{" onwards (there shouldn't be a template
#      inside the url).
# Returns "[URL URL]{{NRIS dead link}}" followed by the peeled-off text, kept
# in the order in which it appeared in the input.
sub fixExtLink {
    my ($link)=@_;
    my $tail='';

    if($link=~s/((?:[<>]|&[lg]t;).*$)//){
        $tail=$1.$tail;
    }

    # A closing parenthesis only counts as punctuation when the URL has no
    # opening one.
    my $sep=',;\.:!?'.($link=~/\(/ ? '' : ')');
    if($link=~s/([$sep]+$)//){
        $tail=$1.$tail;
    }

    if($link=~s/(\{\{.*$)//){
        $tail=$1.$tail;
    }

    return '['.$link.' '.$link.']{{NRIS dead link}}'.$tail;
}

1;