User:ImageRemovalBot/removebot-followup.pl

Source code for ImageRemovalBot's second-pass removal, to deal with protected pages and delays in updating the "image usage" table in the database. Requires User:FairuseBot/Pearle.pm and User:FairuseBot/libBot.pm.

#!/usr/bin/perl


# RemoveBot Followup
#
# A bot to remove deleted images from pages.  After a 24-hour delay, it re-checks the images RemoveBot was unable to remove.

use strict;
use warnings;

use lib '/home/mark/perllib';
use lib '/home/mark/Desktop/Projects/Wikibots/dev/common';
#use Date::Calc qw();
#use URI::Escape;
use Fcntl qw(:flock);

use libBot;


# Working directory holding this bot's log, cookie, PID and followup files.
my $homedir = '/home/mark/removebot';
# When true, a pending talkpage message makes the bot stop mid-run.
my $permit_interruptions = 0;

# Hand the account name, password, log path and cookie path to Pearle,
# then tell the bot library which username it is running as.
Pearle::init(
	"ImageRemovalBot",
	"<INSERT PASSWORD HERE>",
	"$homedir/removebot-followup.log",
	"$homedir/followup-cookies.txt",
);
config(username => "ImageRemovalBot");

# Nothing useful can be done without a session.
exit unless Pearle::login();

# Check for a running copy.
#
# The PID file holds the process ID written by the previous run.  If that
# process still exists and its command line names this script, the earlier
# run has not finished and this one backs off.  A stale file (process gone,
# or the PID now belongs to something else) is simply overwritten below.
if(-e "$homedir/pid-followup")
{
	# Possible other copy.  Compare PIDs
	my $pid;
	if(open my $pid_in, "<", "$homedir/pid-followup")
	{
		$pid = <$pid_in>;
		close $pid_in;
	}
	else
	{
		warn "Unable to read $homedir/pid-followup: $!\n";
	}

	# Only a purely numeric PID is ever handed to the shell; an empty or
	# corrupt PID file is treated as stale.
	if(defined($pid) and $pid =~ /^\s*(\d+)\s*$/ and $1 != $$)
	{
		my $previous_pid = $1;
		# "-o args=" prints the full command line without a header.  The
		# default "ps -p" output shows only the (possibly truncated) command
		# name, which would not contain the script's file name.
		my $psresult = `ps -o args= -p $previous_pid`;
		if(defined($psresult) and $psresult =~ /removebot-followup\.pl/)
		{
			botwarnlog("*Previous run is taking longer than normal\n");
			exit;
		}
	}
}

# Record our own PID for the next run's check.
open my $pid_out, ">", "$homedir/pid-followup"
	or die "Unable to write $homedir/pid-followup: $!\n";
print $pid_out $$;
close $pid_out
	or die "Unable to finish writing $homedir/pid-followup: $!\n";

# Names of the images whose followup entry is more than 24 hours old; these
# are the ones handled in this run.
my @images;

# Process the followup log.
#
# Each line is "<epoch seconds> <image title>".  Entries older than a day are
# moved into @images; newer entries are written back so a later run picks
# them up.  Lines that do not match that shape are dropped.
if(-e "$homedir/followup.log")
{
	my @new_images;
	my $cutoff = time() - 86400;

	# One read/write handle under one exclusive lock covers the read, the
	# truncate and the rewrite.  Reopening with ">" would empty the file
	# before any lock is held and lose entries appended in between.
	# NOTE(review): this only protects against writers that also flock the
	# file -- confirm the first-pass bot does.
	open my $log_fh, "+<", "$homedir/followup.log"
		or die "Unable to open $homedir/followup.log: $!\n";
	flock($log_fh, LOCK_EX)
		or die "Unable to lock $homedir/followup.log: $!\n";

	# Read the log in
	while(my $line = <$log_fh>)
	{
		# "." never matches the newline, so $image needs no chomp.
		my ($date, $image) = $line =~ /(\d+) (.*)/;
		next unless defined($date);	# Malformed line

		if($date < $cutoff)
		{
			push @images, $image;
		}
		else
		{
			push @new_images, "$date $image\n";
		}
	}

	# Write out a log containing the entries we aren't going to process now
	seek($log_fh, 0, 0)
		or die "Unable to rewind $homedir/followup.log: $!\n";
	truncate($log_fh, 0)
		or die "Unable to truncate $homedir/followup.log: $!\n";
	print $log_fh @new_images;

	# Closing the handle flushes the data and releases the lock.
	close $log_fh
		or die "Unable to finish writing $homedir/followup.log: $!\n";
}
else
{
	Pearle::myLog(2, "No images in followup log\n");
	# Nothing to do; don't leave our PID file behind.
	unlink "$homedir/pid-followup";
	exit;
}

# Second-pass removal over every image collected in @images.
#
# For each image the loop:
#   1. queries the API for the image's status and its usage list,
#   2. skips it if it is no longer missing, is served from the shared
#      repository, or is already unused,
#   3. builds a wikitext regex for the image and asks RemoveImageFromPage to
#      strip it from each page that uses it,
#   4. re-queries the usage list and logs a warning if any use remains.
# Files without a recognised image extension are only reported, never edited.
{
	# Extensions the bot is willing to edit.
	my $image_ext = qr/(\.jpg|\.jpeg|\.png|\.gif|\.svg)$/i;

	my $removal_prefix = "Deleted image removed:";
	my $removal_comment = "Removing deleted image";

	Pearle::myLog(2, "Beginning set at " . time() . "\n");

	print join "\n", @images;
	print "\n", scalar(@images), " images found\n";

	if(scalar(@images) == 0)
	{
		Pearle::myLog(1, "*No images in log need processing\n");
	}

	foreach my $image (@images)
	{
		# Fetch image info
		my $image_data = Pearle::APIQuery(titles => [$image], prop => 'imageinfo', meta => 'userinfo', uiprop => ['hasmsg'], 			# Basic data
		                                  list => 'imageusage', iutitle => $image, iunamespace => [0, 10, 12, 14, 100], iulimit => 500);	# Image usage

		if($permit_interruptions and DoIHaveMessages($image_data))
		{
			Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
			# NOTE(review): images not yet handled in this run have already
			# been dropped from followup.log and are not re-queued here.
			unlink "$homedir/pid-followup";
			exit;
		}

		# Verify the image is still deleted
		if($image_data !~ /missing=""/)
		{
			Pearle::myLog(2, "*Image [[:$image]] has been re-uploaded.\n");
			next;
		}

		# Images from Commons.  May have been masked by the deleted version.
		if($image_data =~ /imagerepository="shared"/)
		{
			Pearle::myLog(2, "*Commons image [[:$image]] found\n");
			next;
		}

		my @pages = GetPageList($image_data);

		if(scalar(@pages) == 0)
		{
			notelog("Image $image is already orphaned\n");
			next;
		}

		# Sanity check, part 1: there must be a file name after the prefix.
		# This runs before any regex is built, because an undefined or empty
		# name would give a removal pattern that is not tied to this image.
		my ($raw_image) = $image =~ /Image:(.*)/;
		if(!defined($raw_image) or $raw_image eq '')
		{
			botwarnlog("*Parse error on image [[:$image]] (no file name after Image: prefix)\n");
			next;
		}

		# Sanity check, part 2: the generated pattern must match the title it
		# was generated from.
		$raw_image = MakeWikiRegex($raw_image);
		if(!defined($raw_image) or $image !~ /$raw_image/)
		{
			my $shown = defined($raw_image) ? $raw_image : '';
			botwarnlog("*Parse error on image [[:$image]] ($shown)\n");
			next;
		}

		my $is_image = ($image =~ $image_ext) ? 1 : 0;
		my $image_regex;
		if(!$is_image)
		{
			# NOTE(review): "(:?" is a capturing group with an optional colon;
			# "(?:" may have been intended.  Left as-is because this pattern
			# is only logged -- non-image files are skipped below.
			$image_regex = "[ _]*(:?[Ii][Mm][Aa][Gg][Ee]|[Mm][Ee][Dd][Ii][Aa])[ _]*:[ _]*${raw_image}[ _]*";
			Pearle::myLog(2, "*Non-image media file [[:$image]] found.\n");
		}
		else
		{
			$image_regex = "[ _]*[Ii][Mm][Aa][Gg][Ee][ _]*:[ _]*${raw_image}[ _]*";
		}
		Pearle::myLog(3, "Image regex: $image_regex\n");

		if(!$is_image)
		{
			# Report the file and the pages using it, then move on.
			my $page_list = "( ";
			foreach my $page_entry (@pages)
			{
				$page_list .= "[[:$page_entry]] ";
			}
			$page_list .= ")";
			wikilog("User talk:ImageRemovalBot/media", "*[[:$image]] $page_list\n");
			next;	# Non-image files are too hard to work with
		}

		# Turn the first "image" in the edit summary into a link to the file.
		my $parsed_removal_comment = $removal_comment;
		$parsed_removal_comment =~ s/image/[[:$image|image]]/;

		foreach my $page (@pages)
		{
			notelog("Page for removal: $page\n");
			my $hits = RemoveImageFromPage($image, $page, $image_regex, $removal_prefix, $parsed_removal_comment);
			if($hits)	# Only rate-limit when the page was actually edited
			{
				Pearle::myLog(2, "Removed image $image from article $page ($hits times)\n");
				Pearle::limit();
			}
		}

		# Verify removal
		# Portal removal is too hard to get correct, and we don't really care about it.
		# Template removal isn't possible, and the template usage has already been logged.
		$image_data = Pearle::APIQuery(list => 'imageusage', iutitle => $image, iunamespace => [0, 12, 14], iulimit => 500);
		@pages = GetPageList($image_data);

		if(scalar(@pages) != 0)
		{
			botwarnlog("*Unable to remove all instances of [[:$image]]\n");
			Pearle::myLog(2, "*Unable to remove all instances of [[:$image]]\n");
		}

	}
	Pearle::myLog(2, "Finished with followup set.\n");
}

#print "Finished.  Total $total_images removed, $total_processed processed.\n";

# Normal end of run: remove the PID file written at startup.
unlink("$homedir/pid-followup");