{- Module      :  Main.hs
   License     :  public domain
   Maintainer  :  Gwern Branwen <gwern0@gmail.com>
   Stability   :  unstable
   Portability :  portable
   Functionality: retrieve specified articles from Wikipedia and request WebCite to archive all URLs found.
   USE: Supply on stdin a succession of Wikipedia article names (whitespace in names should be escaped as '_').
        A valid invocation might be, say: '$ echo Fujiwara_no_Teika Fujiwara_no_Shunzei | archive-bot'
        All URLs in [[Fujiwara no Teika]] and [[Fujiwara no Shunzei]] would then be backed up.
        If you wanted to run this on all of Wikipedia, you could take the current 'all-titles-in-ns0'
        gzipped file from [[WP:DUMP]], gunzip it, and then pipe it into archive-bot.
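        (A concrete pipeline, assuming the usual dump file name, might be:
        '$ gunzip -c enwiki-latest-all-titles-in-ns0.gz | archive-bot'.)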
   TODO: send an equivalent request to the Internet Archive.
         Add rate-limiting: requests are currently fired off with no throttling whatsoever
         (a throttling sketch is given at the end of this file).
   BUGS: Issues redundant archive requests (URLs are deduplicated only within a single run).
         main returns as soon as its worker threads are forked, so some archive requests may be
         cut off before they complete (see the synchronization sketch after 'main').
         Currently uses Data.ByteString.Lazy.Char8, which truncates every character to 8 bits and so
         mangles anything outside Latin-1, but Wikipedia definitely exercises the full UTF-8 set. I *would*
         use Data.ByteString.Lazy, but that doesn't have 'lines', 'unlines', and 'words'. Need to ask
         #haskell/Dons what's up. (A possible workaround is sketched just below this comment.) -}
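
{- UTF-8 workaround sketch (an assumption, not tested here): the 'text' package's
   Data.Text.Lazy provides 'lines', 'unlines', 'words', 'pack', and 'unpack', and
   Data.Text.Lazy.IO provides 'getContents' and 'putStrLn', so the B.* calls below
   could be swapped out nearly one-for-one:

   import qualified Data.Text.Lazy    as T (lines, unlines, words, pack, unpack)
   import qualified Data.Text.Lazy.IO as T (getContents, putStrLn)
-}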

module Main where
import Control.Monad (liftM)
import Control.Concurrent (forkIO)
import Text.HTML.TagSoup (parseTags, Tag(TagOpen))
import Text.HTML.Download (openURL)
import qualified Data.ByteString.Lazy.Char8 as B (ByteString(), getContents, lines, unlines, pack, unpack, putStrLn, words)
import Data.List (isPrefixOf)
import Data.Set (toList, fromList)

main :: IO ()
main = do articles <- liftM B.words B.getContents
          urls <- liftM sortNub (mapM fetchArticleText articles)
          mapM_ (forkIO . archiveURL) urls
       where sortNub :: [[B.ByteString]] -> [B.ByteString]
             sortNub = toList . fromList . concat -- concatenate, deduplicate, and sort via Data.Set
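
{- Synchronization sketch (see BUGS above): the runtime kills any threads still running
   when main returns, so forkIO'd requests can be cut short. A minimal fix, assuming one
   MVar per URL, would be a hypothetical 'archiveAll' in place of 'mapM_ (forkIO . archiveURL)':

   import Control.Concurrent.MVar (newEmptyMVar, putMVar, takeMVar)

   archiveAll :: [B.ByteString] -> IO ()
   archiveAll urls = do vars <- mapM (\u -> do v <- newEmptyMVar
                                               forkIO (archiveURL u >> putMVar v ())
                                               return v) urls
                        mapM_ takeMVar vars -- block until every worker signals completion
-}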

-- Download an article's HTML and return the URLs found in it, one per line.
fetchArticleText :: B.ByteString -> IO [B.ByteString]
fetchArticleText article = liftM (B.lines . extractURLs) (openURL (wikipedia ++ B.unpack article))
                           where wikipedia = "http://en.wikipedia.org/wiki/"

-- Pull out every attribute value of an <a> tag that looks like an absolute HTTP URL.
extractURLs :: String -> B.ByteString
extractURLs arg = B.unlines [B.pack x | TagOpen "a" atts <- parseTags arg, (_,x) <- atts, "http://" `isPrefixOf` x]
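
{- A quick sanity check in GHCi (a usage example, not part of the program):

   > extractURLs "<a href=\"http://example.com\">x</a>"
   "http://example.com\n"

   Note that the 'isPrefixOf' test skips https:// and protocol-relative links. -}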

archiveURL :: B.ByteString -> IO ()
archiveURL url = do B.putStrLn url -- NB: with forkIO, only some URLs may print before main exits
                    openURL ("www.webcitation.org/archive?url=" ++ B.unpack url ++ emailAddress)
                    return ()
                 where emailAddress = "&email=foo@bar.com"
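
{- Throttling sketch (see TODO above): the simplest rate limit, assuming sequential
   archiving is acceptable, is to drop forkIO and sleep between requests, with a
   hypothetical 'archiveURLs' in place of 'mapM_ (forkIO . archiveURL)':

   import Control.Concurrent (threadDelay)

   archiveURLs :: [B.ByteString] -> IO ()
   archiveURLs = mapM_ (\u -> archiveURL u >> threadDelay 5000000) -- threadDelay takes microseconds; ~5s apart
-}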