User:J. Johnson/Sortrefs

#!/usr/bin/perl -w
# 2013-06-29 JJ
# Reads a list of citation templates from stdin and sorts by author-date.
# Record starts with '*' in first column, includes preceding non-blank lines.
# Also includes any single preceding line starting with "<" or "{".
# Looks for 'last1-4', 'author1-4', 'last', 'author', 'year', 'date' params.
# Warns if no identifiable author is found (must be in sequence).
# Warns if 'coauthor' is used with less than four authors. 
# ---------------------
# Not perfect, but close enough.
#====================================================================
sub gotrec;

$Debug = 0;
$suppress = 0;
$harv = 0; $harv = "";
$start = $end = $new = $special = 0;
$linenum = 0;
$recn = 0;

# = Get command line arguments.
while (@ARGV) {
  $_ = shift; 
  if (/^(-\?|-h\b|-help)$/) {
      print <<"===End of help text===";

Sorts a list of Wikipedia citation templates (any kind) by author-date. Data
is read from stdin (through a "pipe") and written to stdout. Each citation
template must follow an asterisk ('*') in the first column. The sorted record
extends to the next record, except: a records includes any preceding line that
starts with "<" or "{" in the first column, and any preceding non-blank lines
provided they follow a blank line.

Sorting is done on the last name and first initial of up to four authors and
a year. Proper sorting requires proper use of the metadata: the last names
(and ONLY the last names) are expected in the 'last', 'last1', 'last2',
'last3', and 'last4' parameters, or in the corresponding 'author' parameters.
The frequent practice of putting whole names, or even multiple names, into
these parameters, or into the 'coauthors' parameter, will cause anomalous
sorting. Use of 'coauthors' without 'last1-4' triggers a warning, as well
as the absence of any identifiable author, or 'author' with semi-colons.

Accented letters sort after plain ASCII, may depend on locale.  Inclusion of
square brackets or comments may impair sorting. 

Adding "-d -d" on the command line will display the template data; 
"-s" will suppress the normal output. "-harv" produces Harv templates.

===End of help text===
  exit 0;
  }
  elsif (/^-d\b/) {
      $Debug++;
  }
  elsif (/^-harv\b/) {
      $harv++;
  }
  elsif (/^-s\b/) {
      $suppress++;
  }
} #while argv

print "=" x 50, "\n";
print "Debug level $Debug\n" if $Debug;
$Debug2 = 2 if ($Debug > 1);

#-------------------------------------------------------------------
# = Loop through input, determine start and end lines of each record.
while (<>) {
  @lines = (@lines, $_); 
  $linenum++;
  if (/^\s*$/) { 	# Blank line, goes into prior record.
    $end = $linenum; # if !$end;  # Reset $end at each blank line.
    	# (Alternately: could reset only at first blank line.)
    	# Defer making record until ^* line, as might reset $end.
    $new = $linenum+1;  # Of _new_ rec. Blank line always resets.
    next;
  }
  if (!$new && /^[<{]/o) {  # Special line (if not already covered).
    $special = 1;
    next;
  }
  if (/^\*/) {  	# New record.
    # - Finish prior record.
    #$end = $linenum-1 if !$end;   # Don't reset if already ended.
    if (!$end) { 	# May already have been set.
      if ($special) { 
        $end = $linenum-2;
      } else { 
        $end = $linenum-1;
      }
    }
    gotrec if $start;   # Old record, if one was started.

    # - New record.
    $recn++;
    if ($new) { 
      $start = $new;
    } elsif ($special) { 
      $start = $linenum-1;
    } else { 
      $start = $linenum;
    }
    $new = $end = 0; 	# New record.
  }
  $special = 0;  	# Reset if not caught line by following ^*.
  next if $new; 	# Skip processing of prepended lines.

  # - Assemble record as a single line.
  chomp ($record .= $_); # Removes newlines.
}

# - Catch last record.
$end = $linenum-1 if !$end;   # Don't reset if already ended.
gotrec if $recn;  # Were any records started?

print "-" x 50, "\n" if $Debug2;
exit if $suppress;  #--------------

# = Output.
$start = $end = $cflag = 0;
print "## Found $recn records in $linenum lines. ----------------\n";
foreach $key (sort keys(%RP)) {
  ($start, $end, $aflag, $aaflag, $cflag, $yflag,$harv) = 
  	split (/,/, $RP{$key});

  # - Warnings.
  print "* XX Following citation lacks an identifiable author.\n" if $aflag;
  print "* XX Following citation appears to misuse 'author='.\n" if $aaflag;
  print "* XX Following citation appears to misuse 'coauthors='.\n" if $cflag;
  print "* XX Following citation lacks a publication year.\n" if $yflag;
    print "-- key:  '$key'\n" if $Debug;
    print "   lines: $start-$end\n" if $Debug2;
    if ($harv) {
      #print "<!-- {{Harvnb|$harv|p= }} -->\n" if $harv;
      print "<!-- {{Harvnb|$harv|p= }} ";
      print "* Incomplete! " if ($aflag || $aaflag || $cflag || $yflag);
      print " -->\n";
    }
  
  print @lines[$start-1..$end-1]; 
}
print "-- sortrefs: Found $recn citation records.\n";

exit; # ----------------

#---------------------------
# Subroutines
#---------------------------
# = Got a record.
sub gotrec {
  $lastP = "!\\s*last";
  $authorP = "\\s*author";
  $endP = "\\s*[!}]";

  # - Extract last names. Loop through last$n, author$n, last, author.
  #   Try to grab first initial of first name.
  $key = ""; $aflag = $aaflag = 0; $cflag = 1;
  foreach $n (1 .. 4) { $last{$n} = "" }; 
  $record =~ tr/|/!/; 
  foreach $n (1 .. 4) {
    $last{$n} = $f = "";
    #xx Are separate $n variables needed?
    #if ( $record =~ /!\s*last$n\s*=\s*([^!}]+)\s*[!}]/ ) {   	# last$n?
    if ( $record =~ /$lastP$n\s*=\s*([^!}]+)$endP/ ) {   	# last$n?
      $last{$n} = $1;
      if ( $record =~ /!\s*first$n\s*=\s*([^!}])/ ) {  # First initial.
        $f = $1;
      }
    } elsif ($record =~ /$authorP$n\s*=\s*([^!}]+)$endP/ ) {   # author$n?
      $last{$n} = $1;
    }
    if ( $n == 1 && !$last{1}) {  # Check if unnumbered last/author was used.
      if ( $record =~ /$lastP\s*=\s*([^!}]+)$endP/ ) {   	# last?
        $last{$n} = $1;
        if ( $record =~ /!\s*first\s*=\s*([^!}])/ ) {  # First initial.
          $f = $1;
        }
      } elsif ($record =~ /$authorP\s*=\s*([^!}]+)$endP/ ) {    # author?
        $last{$n} = $1;
	$aaflag = 1 if ($record =~ /[^\d]{2};/);  # Unaccompanied semicolons?
      }
    }
    last if !$last{$n}; # Expect 'last(n)' to be consecutive.
    $name = $last{$n}; 
    $name =~ s/\s+$//o; # Trim any trailing white space.
    $name .= "=$f" if ($f);

    # - 
    $key .= "$name+";
    #print "++ $recn: last$n= '$last{$n}'\n" if $Debug2;
    $cflag = 0 if ($n == 4);
  }# foreach 1..4
  $aflag = 1 if !$key;
  
  # - Check for use of coauthors= instead of last2..4.
  if ($cflag && $record !~ /!\s*coauthors?\s*=\s*([^!}]+)$endP/ ) {
    $cflag = 0;
  }

  # - Get year.
  $year = ""; $yflag = 0;
  if (      $record =~ /!\s*year\s*=\s*([^!}]+)$endP/ ) {
    $year = $1;
  } elsif ( $record =~ /!\s*date\s*=\s*([^!}]+)$endP/ ) {
    $date = $1;
    if ( $date =~ /([12]\d\d\d[a-z]?)/ ) {
      $year = $1;
    }
  }
  if ($year) { $key .= $year; } else { $yflag = 1; }
  #print "++++ year= $year\n" if $Debug2;
  
  if ($harv) { 
    $harv = $key;
    $harv =~ s/=.\+/\+/g;
    $harv =~ s/\+/\|/g;
  }

  # = Adjust key. 
  # "BrownSmith" should come before "BrowningSmith" (last2 is a sub-key);
  # delimiter must sort _before_ letters and space. 
  # Unicode::Collate handles accented chrs inconsistently.
  # Conversion of accented and other chars. to plain ASCII using tr fails.
  $key =~ tr/A-Z/a-z/;  

  $RP{$key} = "$start,$end,$aflag,$aaflag,$cflag,$yflag,$harv";

  print "-- key: '$key'\n" if $Debug2;
  printf ("#%03d >> %s\n", $recn, $record) if $Debug2;
  
  $record = "";
  return;
}
#---------------------------
###