User:WildBot/watchlist monitor.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Producer thread that follows changes to articles on a Wikipedia watchlist.

Each time a watched article changes, it is added to the processing queue.
"""

import codecs, pickle, re, threading
import wikipedia

__metaclass__ = type   # make classes defined here new-style (Python 2)

        
class LastWatchlistCheck:
    """
    Persistently keeps track of the last time the watchlist was acted on
    """
    def __init__(self, site=None):
        if not site:
            site = wikipedia.getSite()
        self.log_filename = wikipedia.config.datafilepath('watchlists',
            'latestcheckwatchlist-%s-%s.dat' % (site.family.name, site.lang))
        self.lasttime = 0
        try:
            f = codecs.open(self.log_filename, 'r', 'utf-8')
            try:
                logtext = f.readline()
                self.lasttime = int(logtext)
            finally:
                f.close()
        except (IOError, ValueError):
            # No saved timestamp yet, or the file was unreadable; start at 0.
            pass

    def put(self, newtime):
        """
        Note and persist the last time the watchlist was acted on
        """
        if int(newtime) > int(self.lasttime):
            self.lasttime = newtime
            try:
                f = codecs.open(self.log_filename, 'w+', 'utf-8')
                try:
                    f.write(unicode(self.lasttime))
                    f.write('\n')
                finally:
                    f.close()
            except IOError:
                # Failing to persist is non-fatal; keep the in-memory value.
                return
        
    def get(self):
        """
        Retrieve the last time the watchlist was acted on
        """
        return self.lasttime
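
# A minimal usage sketch (not part of the bot itself), assuming a default
# site is configured; it shows the round-trip LastWatchlistCheck provides:
#
#   checkpoint = LastWatchlistCheck()
#   print checkpoint.get()          # 0 on the very first run
#   checkpoint.put(20100101120000)  # MediaWiki-style integer timestamp
#   # a later run constructs LastWatchlistCheck() again and get() returns
#   # 20100101120000, read back from the .dat file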
    

class WatchlistProducer(threading.Thread):
    def __init__(self, shutdown, queue, site=None):
        self.shutdown = shutdown
        self.queue = queue
        if site is None:
            site = wikipedia.getSite()
        self.site = site
        # Keep the checkpoint object so run() can persist progress later.
        self.lastcheck = LastWatchlistCheck(site)
        self.latest = self.lastcheck.get()
        threading.Thread.__init__(self)

    def _refreshOld(self, site, sysop=False):
        # get watchlist special page's URL
        path = site.watchlist_address()
        wikipedia.output(u'Retrieving watchlist for %s' % repr(site))
        #wikipedia.put_throttle() # It actually is a get, but a heavy one.
        watchlistHTML = site.getUrl(path, sysop=sysop)
    
        wikipedia.output(u'Parsing watchlist')
        watchlist = []
        for itemR in [re.compile(r'<li><input type="checkbox" name="id\[\]" value="(.+?)" />'),
                      re.compile(r'<li><input name="titles\[\]" type="checkbox" value="(.+?)" />')]:
            for m in itemR.finditer(watchlistHTML):
                pageName = m.group(1)
                watchlist.append(pageName)
    
        # Save the watchlist to disk
        # The file is stored in the watchlists subdir. Create if necessary.
        if sysop:
            f = open(wikipedia.config.datafilepath('watchlists',
                     'watchlist-%s-%s-sysop.dat' % (site.family.name, site.lang)), 'wb')
        else:
            f = open(wikipedia.config.datafilepath('watchlists',
                     'watchlist-%s-%s.dat' % (site.family.name, site.lang)), 'wb')
        try:
            pickle.dump(watchlist, f)
        finally:
            f.close()
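
        # To load the cached list back later (a sketch; the path mirrors the
        # non-sysop save above and assumes the file already exists):
        #
        #   f = open(wikipedia.config.datafilepath('watchlists',
        #            'watchlist-%s-%s.dat' % (site.family.name, site.lang)), 'rb')
        #   watchlist = pickle.load(f)
        #   f.close()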
    
    def watchedpages(self, sysop=False):
        # The continuous feed requires the MediaWiki API (1.10+);
        # api_address() raises NotImplementedError if the family has no API.
        # The old HTML-scraping path (_refreshOld) is kept above for
        # reference, but it cannot stream changes, so there is no fallback.
        if wikipedia.config.use_api and self.site.versionnumber() >= 10:
            self.site.api_address()
        else:
            raise NotImplementedError
        
        # The watchlist is personal, so make sure we are logged in first.
        if not self.site.loggedInAs(sysop=sysop):
            self.site.forceLogin(sysop=sysop)

        wikipedia.output(u'Retrieving watchlist for %s' % repr(self.site))
        #wikipedia.put_throttle() # It actually is a get, but a heavy one.
        while not self.shutdown.isSet():
            if self.latest == 0:
                # First run: no checkpoint yet, so take the newest changes.
                params = {
                    'action': 'query',
                    'list': 'watchlist',
                    'wllimit': wikipedia.config.special_page_limit,
                    'wlexcludeuser': self.site.username(),
                    'wlprop': ['title', 'timestamp'],
                }
            else:
                params = {
                    'action': 'query',
                    'list': 'watchlist',
                    # wlstart is inclusive, so +1 skips the change seen last
                    'wlstart': self.latest + 1,
                    'wldir': 'newer',
                    'wllimit': wikipedia.config.special_page_limit,
                    'wlexcludeuser': self.site.username(),
                    'wlprop': ['title', 'timestamp'],
                }
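
            # The request this builds corresponds roughly to (a sketch;
            # query.GetData joins list values with '|'):
            #   api.php?action=query&list=watchlist&wldir=newer&wlstart=...
            #          &wllimit=...&wlexcludeuser=...&wlprop=title|timestamp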
        
            data = wikipedia.query.GetData(params, self.site, sysop=sysop)
            if 'error' in data:
                raise RuntimeError('ERROR: %s' % data)
            for w in data['query']['watchlist']:
                yield w['title'], wikipedia.parsetime2stamp(w['timestamp'])
            
            if 'query-continue' in data:
                # More results pending: resume from the server-supplied cursor.
                params['wlstart'] = data['query-continue']['watchlist']['wlstart']
            else:
                # Caught up: poll again in 30s, waking early on shutdown.
                self.shutdown.wait(30)
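
        # For reference, a sketch of the response shape the loop above
        # assumes (field order and extra keys vary by MediaWiki version):
        #
        #   {'query': {'watchlist': [
        #        {'title': u'Example', 'timestamp': u'2010-01-01T12:00:00Z'},
        #        ...]},
        #    'query-continue': {'watchlist': {'wlstart': ...}}}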

    def run(self):
        try:
            for (title, timestamp) in self.watchedpages():
                wikipedia.output(u'Watchlist: %s at %s' % (title, timestamp))
                page = wikipedia.Page(self.site, title)
                self.queue.add_page(page)
                if timestamp > self.latest:
                    wikipedia.output(u'Latest was %s and now is %s'
                                     % (self.latest, timestamp))
                    self.latest = timestamp
                    # Persist the checkpoint so a restart resumes from here
                    # rather than re-queueing pages already processed.
                    self.lastcheck.put(timestamp)
        except:
            # Any failure: signal shutdown, wake the consumer with a None
            # sentinel, release the throttle, then re-raise for the traceback.
            self.shutdown.set()
            self.queue.add_page(None)
            wikipedia.stopme()
            raise
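
# A minimal wiring sketch (not part of this file), assuming a consumer-side
# queue object exposing add_page() as the code above expects; PageQueue is a
# hypothetical name for it:
#
#   shutdown = threading.Event()
#   queue = PageQueue()
#   producer = WatchlistProducer(shutdown, queue)
#   producer.start()
#   # ... a consumer thread drains queue, treating None as "stop" ...
#   shutdown.set()    # ask the producer to leave its polling loop
#   producer.join()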