import os, re, time
# Directory that is scanned for saved-book files.
bookdir = "books"

# Lower-case link prefixes (the text before the first ":") that cause a link
# to be counted as a bad link when a book is classified.
badprefix = set([
    "user", "user talk",
    "wikipedia", "wikipedia talk",
    "template",
    "category",
    "portal", "portal talk",
    "help", "help talk",
])

# Book titles that are skipped entirely and never appear in the report.
whitelist = set([
    "Wikipedia:Books/The Missing Manual",
    "User:Sue Gardner/Books/Welcome",
    "User:Sue Gardner/Books/BLP",
    "User:Miya/Books/Helps and Extensions",
    "User:BookSpace/Books/Sandbox1",
    "User:BookSpace/Books/Sandbox2",
    "User:BookSpace/Books/Sandbox3",
    "User:BookSpace/Books/Sandbox4",
    "User:BookSpace/Books/Sandbox5",
    "User:BookSpace/Books/Sandbox6",
    "User:BookSpace/Books/Sandbox7",
    "User:BookSpace/Books/Sandbox8",
    "User:BookSpace/Books/Template",
])

# Result buckets, filled with book titles while the book files are scanned.
# Each book that is not whitelisted lands in exactly one of them.
booksNoLinks = set()      # no links at all
booksBadLinks = set()     # links, but every one of them is a bad link
booksOneLink = set()      # exactly one link
booksGoodUser = set()     # remaining books whose title starts with "user:"
booksGoodProject = set()  # remaining "wikipedia:" / "book:" titles
booksGoodWtf = set()      # remaining books with any other title prefix
# Captures the target of each wiki link on a line: everything after "[["
# up to the first "]" or "|", so the label of a piped link is dropped.
_LINK_RE = re.compile(r"\[\[([^]|]*)")


def _is_mainspace(link):
    """Return True when *link* counts as a good (article) link.

    A link is rejected when it is exactly "Main Page" or when the text
    before its first ":" is, case-insensitively, one of ``badprefix``.
    """
    if link == "Main Page":
        return False
    namespace, colon, _rest = link.partition(":")
    # Only a link that really contains a colon has a prefix; without this
    # check plain articles such as "Wikipedia" or "Portal" would be
    # mistaken for namespace prefixes.
    return not (colon and namespace.lower() in badprefix)


# Read every book file and file its title into exactly one result set.
for bf in os.listdir(bookdir):
    links = set()
    # The context manager closes the handle on every path, including the
    # early ``continue`` taken for whitelisted books.
    with open(os.path.join(bookdir, bf)) as book:
        # The first line of a book file is the book's title.
        title = book.readline().strip()
        if title in whitelist:
            continue
        for line in book:
            # Strip first so an unterminated link at the end of a line does
            # not capture the trailing newline.
            for target in _LINK_RE.findall(line.strip()):
                # "Category:" targets are not collected as book links.
                if not target.startswith("Category:"):
                    links.add(target)

    has_good_link = any(_is_mainspace(link) for link in links)
    lowered_title = title.lower()

    # Order matters: the first matching rule wins.
    if not links:
        booksNoLinks.add(title)
    elif not has_good_link:
        booksBadLinks.add(title)
    elif len(links) == 1:
        booksOneLink.add(title)
    elif lowered_title.startswith("user:"):
        booksGoodUser.add(title)
    elif lowered_title.startswith(("wikipedia:", "book:")):
        booksGoodProject.add(title)
    else:
        booksGoodWtf.add(title)
def printLinks(linkSet, title):
    """Print one report section listing *linkSet* as wiki links.

    Writes a blank line, a level-2 heading made of *title* and the number
    of entries, then one "* [[...]]" bullet per entry in sorted order.
    Prints nothing at all when *linkSet* is empty.
    """
    if not linkSet:
        return
    # Single parenthesised argument: identical output whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print("\n== %s (%d) ==" % (title, len(linkSet)))
    for entry in sorted(linkSet):
        print("* [[%s]]" % entry)
# Emit the report on stdout as wiki markup: a table-of-contents marker, a
# timestamp, then one section per non-empty result set.
# Single parenthesised arguments keep the output identical whether ``print``
# is the Python 2 statement or the Python 3 function.
print("__TOC__")
print("Last updated %s" % time.ctime())
printLinks(booksNoLinks, "Books containing no articles")
printLinks(booksOneLink, "Books containing only one article")
printLinks(booksBadLinks, "Books containing no mainspace articles")
printLinks(booksGoodWtf, "Books in totally unexpected places")
printLinks(booksGoodUser, "Otherwise unclassified books in user space")
printLinks(booksGoodProject, "Otherwise unclassified books in project space")