User:Squidonius/userpage/microbes code

use strict;
use warnings;
use constant N=>"\n";
use constant T=>"\t";

my $diag=0;
my $set='AC'; #AC DL MR SZ

$set=$ARGV[0] if $ARGV[0];
$diag=$ARGV[1] if $ARGV[1];

sub download {
	my $home=shift;
	require LWP::UserAgent;
	my $ua = LWP::UserAgent->new;
	$ua->timeout(1000);  #Internet is under 200kbps in NZ
	$ua->proxy(['http', 'ftp'], 'http://tur-cache.massey.ac.nz:8080/');  
	$ua->env_proxy;
	my $response = $ua->get($home);
	return retry_download($home,$response->status_line) if ! $response->is_success;
	my $reply=$response->decoded_content;
	return retry_download($home,'Message empty') if length($reply)<10;
	return retry_download($home,'Serverside error') if ($reply=~m/Error\+11\+\(Resource\+temporarily\+unavailable\)/i);
	return $reply;  #really the internet actually works here?!
}

sub retry_download {
	my $home=shift;
	print 'Issue with '.$home.N;
	print shift(@_).N;
	print '1 minute pause...'.N;
	sleep 60;
	return download($home);
	
}


sub parse_LPSN_page {
	
	#split into paragraphs...
	my $file=shift;
	my $name=shift;
	my $cut='<a href="#r"><img border="0" src="top.jpg" width="23" height="11" alt="image"></a>';
	my $col='<font color="#0000FF">';
	my $colH='<font color="#FF0000">';
	
	
	$file=~ s/.*Number of species cited in this file//sm;
	$file=~ s/Copyright.*//sm;
	my $total=0;
	($file=~ m/^\:\s+(\d+)/) ? ($total=$1):(print 'Error... Tally not found'.N.$file.N);
	my @species=split/\Q$cut\E/,$file;
	my $about=shift(@species);
	
	my ($author,$type,$ety,$doi,@spp); my $em='';
	if ($about=~ m/$colH<i><b>$name<\/b><\/i><\/font>(.*)/) {$author=$1; $author=~s/[\r\n]//g} else {print "Cannot find authority with $col<i><b>$name<\/b><\/i><\/font>\n"; $em.=' parse error for authority'}
	if ($about=~ m/$col<b>Type<\/b><\/font> $col<b>species<\/b>\:<\/font> <a .*?\/a> <i>$name<\/i> <i>(\w+)<\/i>/) {$type=$1} else {print "Cannot find type epitet\n"; $em.=' parse error for type species'}
	if ($about=~ m/$col<b>Etymology<\/b>\:<\/font>(.*)/) {$ety=$1; $ety=~s/[\r\n]//g} else {print "Cannot find Etymology\n"; $em.=' parse error for etymology'}
	if ($about=~ m/http\:\/\/dx\.doi\.org\/(.*?)\"/) {$doi=$1;} 
	elsif ($about=~ m/(http\:\/\/ijs\.sgmjournals\.org\/cgi\/reprint\/.*?)\"/) {$doi=$1;}
	else {print "Cannot find Doi\n"; $em.=' parse error for doi'}
	if ($about=~ m/---\&gt\;/) {print "Emendment detected\n"; $em.=' emendment'}

	my $first=substr($name, 0, 1);
	
	foreach my $sp (@species) {
		my ($ln,$la,$le,$ld,$ll); my $lx='';
		if ($sp=~ m/$name<\/b><\/i><\/font> $colH<i><b>(\w+)<\/b><\/i><\/font>(.*)/) {($ln,$la)=($1,$2); $la=~s/[\r\n]//g} else {next; print "Cannot find authority in $sp\n"; $lx.=' parse error for name'}
		if ($sp=~ m/$col<b>Etymology<\/b>\:?<\/font>(.*)/) {$le=$1; $le=~s/[\r\n]//g} else {print "Cannot find Etymology in $sp\n"; $lx.=' parse error for etymology'}
		if ($sp=~ m/http\:\/\/dx\.doi\.org\/(.*?)\"/) {$ld=$1;}
		elsif ($sp=~ m/(http\:\/\/ijs\.sgmjournals\.org\/cgi\/reprint\/.*?)\"/) {$ld=$1;}
		else {print "Cannot find Doi\n"; $lx.=' parse error for doi'}
		if ($sp=~ m/---\&gt\;/) {print "Emendment detected\n"; $lx.=' emendment'}
		$ll="* \[\[$name $ln|$first. $ln\]\] ($la\;$le)<ref>\{\{cite doi\|$ld\}\}</ref>";
		$ll.="<!-- Manual check required due to$lx! -->" if $lx;
		push(@spp,$ll)
	}
	return ($total,$author,substr($name,0,1).'. '.$type,$ety,$doi, $em,@spp);
}




#########################


my $q='&quot;';

open(WIKI,'>',$set.'_wiki.txt') or die;


open(LIST,$set.'.html') or die "cannot open file\n\a";

foreach (grep(/Domain/,split(/<\/p>/,do { local $/; <LIST> }))) {
	s/[\n\r]//msg;
	s/Division or phylum/Division/;
	s/Domain or empire/Domain/;
	s/\&nbsp\;/ /g;
	s/<span .*?>//g;  #they have no use, I think
	my @lines=split/<br \/>/;
	
	my $temp=shift(@{[grep(/<a name/,@lines)]});
	$temp=~m/<a href\=\"(\w+\/\w+.html)\"><font color\=\"\#FF0000\">(\w+)<\/font>/;
	my ($glink,$genus)=($1,$2);
	print "Error with $temp\n\a" if ! $glink;
	
	print "Parsing $genus\n" if $diag;
	
	my %tax=(genus=>[$genus,$glink]);
	my ($total,$author,$type,$ety,$doi,$error,@spp);
	
	if ($glink) {
		if (!-e $glink) {open(PAGE,'>',$glink) or die "cannot make file $glink\n\a";print "downloading $genus from $glink\n"; print PAGE download('http://www.bacterio.cict.fr/'.$glink);close (PAGE);}
		
		open(PAGE,$glink) or die "cannot open file $glink\n\a";
		($total,$author,$type,$ety,$doi,$error,@spp)=parse_LPSN_page(do { local $/; <PAGE> },$genus);
		close (PAGE);
	} else {print "HELP!"}
	
	foreach my $rank qw(Family Suborder Order Subclass Class Division Domain) {
		my ($link,$taxa);
		$temp=shift(@{[grep(/$rank\:/,@lines)]});
		if (($temp !~ m/$rank\:\W+$/)&&($temp=~ m/$rank\:/)) {
			if ($temp=~m/$rank\:\s+<a href\=\"([\w\#\/\.]+)\">(\w+)<\/a>/) {($link,$taxa)=($1,$2)}
			elsif ($temp=~m/$rank\:\s+$q<a href\=\"([\w\#\/\.]+)\">(\w+)<\/a>$q/) {($link,$taxa)=($1,'"'.$2.'"')}  #odd bug with $q
			elsif ($temp=~m/$rank\:\s+$q<a href\=\"([\w\#\/\.]+)\"><font.*?>(\w+)<\/font><\/a>$q/) {($link,$taxa)=($1,'"'.$2.'"')} #does this mean anything different
			elsif ($temp=~m/$rank\:\s+$q<i>(\w+)<\/i>$q/) {$taxa=$1}
			else {print "\nImminent  error\r";}
			$error.=" $rank error" if ! $taxa;
			$tax{$rank}=[$taxa,$link];			
		}
	}
	
	print WIKI "\n\n==\[\[$genus\]\]==\n<nowiki>\{\{italic title\}\}\n";
	print WIKI "<!--Errors: $error -->\n" if $error;
	print WIKI "\{\{Taxobox\n\| color \= lightgrey\n\| name \= ''$genus''\n";
	print WIKI "\| domain \= \[\[".$tax{Domain}->[0]."\]\]\n"  if $tax{Domain}->[0];
	print WIKI "\| phylum \= \[\[".$tax{Division}->[0]."\]\]\n"  if $tax{Division}->[0];
	print WIKI "\| classis \= \[\[".$tax{Class}->[0]."\]\]\n" if $tax{Class}->[0];
	print WIKI "\| subclassis \= \[\[".$tax{Subclass}->[0]."\]\]\n" if $tax{Subclass}->[0];
	print WIKI "\| ordo \= \[\[".$tax{Order}->[0]."\]\]\n"  if $tax{Order}->[0];
	print WIKI "\| subordo \= \[\[".$tax{Suborder}->[0]."\]\]\n" if $tax{Suborder}->[0];
	print WIKI "\| familia \= \[\[".$tax{Family}->[0]."\]\\n"  if $tax{Family}->[0];
	print WIKI "\| genus \= ''$genus''\n";
	print WIKI "\| binomial_authority \= $author<ref>\{\{cite doi\|$doi\}\}<\/ref>" if $author;
	print WIKI "\| type_species \= $type \n"  if $type;
	print WIKI "\| subdivision_ranks \= Species \n\}\}\n";
	my $tp=$tax{Division}->[0]; $tp=~s/\"//g; my $td=$tax{Domain}->[0]; $td=~s/\"//g;
	print WIKI "'''''$genus''''' is a genus in the phylum \[\[".$tp.']] ([['.$td.']]).<ref>{{lpsn|classification'.lc($set).'.html|Classification of Genera '.$set.'}}</ref>'.N;
	print WIKI "The etymology of the genus is $ety.<ref name=main>\{\{lpsn\|$glink\|$genus\}\}<\/ref>\nThe genus contains $total species (including basonyms and synonyms), namely<ref name=main/>\n".join(N,@spp).N;
	print WIKI '==See Also=='.N.'* [[Bacterial taxonomy]]\n* [[Microbiology]]'.N;
	print WIKI '== References =='.N.'{{reflist}}'.N.'[[Category:'.$td.']]'.N.'[[Category:'.$tp.']]</nowiki>'.N;
}


print 'Done'.N;