├── LICENSE
├── Network
│   └── URL
│       └── Archiver.hs
├── Setup.lhs
├── archiver.cabal
└── archiver.hs

/LICENSE:
--------------------------------------------------------------------------------
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the author nor the names of his contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------

/Network/URL/Archiver.hs:
--------------------------------------------------------------------------------
module Network.URL.Archiver (checkArchive) where

import Control.Monad (when, void)
import Data.Char (isAlphaNum, isAscii)
import Data.List (isInfixOf)
import Data.Maybe (fromJust)
import Network.Browser (browse, formToRequest, request, Form(..))
import Network.HTTP (RequestMethod(POST))
import Network.HTTP.Conduit (simpleHttp)
import Network.URI (isURI, parseURI)
import Text.Printf (printf)

-- | Open a URL and discard the response body.
pingURL :: String -> IO ()
pingURL = void . simpleHttp

-- | Error-check the URL and then archive it using 'internetArchiveLive', 'wikiwixArchive',
-- 'googleSearch', and 'archiveisArchive'; excludes Tor links.
checkArchive :: String -- ^ URL to archive
             -> IO ()
checkArchive url = when (isURI url && not (".onion/" `isInfixOf` url)) $
                     internetArchiveLive url >> wikiwixArchive url >> googleSearch url >> archiveisArchive url

-- | Request a URL through Internet Archive's on-demand archiving URL.
--
-- This also does a backup archive attempt through the live Internet mirror;
-- this is completely speculative and may result in no archiving.
-- This method is a guess based on my use of their mirror and a banner that is sometimes inserted.
internetArchiveLive :: String -> IO ()
internetArchiveLive url = pingURL ("http://web.archive.org/save/" ++ url) >> pingURL ("http://liveweb.archive.org/" ++ url)

-- | Request a cached copy of the URL from Wikiwix, which archives on demand.
wikiwixArchive :: String -> IO ()
wikiwixArchive url = pingURL ("http://archive.wikiwix.com/cache/?url=" ++ url)

-- | Submit the URL to archive.today by POSTing its archiving form.
archiveisArchive :: String -> IO ()
archiveisArchive url = do let archiveform = Form POST (fromJust $ parseURI "https://archive.ph/submit/") [("url", url), ("submit", "")]
                          void $ browse $ request $ formToRequest archiveform

-- | Can't hurt to let Google know the URL exists.
googleSearch :: String -> IO ()
googleSearch url = pingURL ("http://www.google.com/search?q=" ++ escape url)

-- | Utility function to URL-encode a string for use in URL arguments; copied from somewhere.
escape :: String -> String
escape = concatMap escapeURIChar

escapeURIChar :: Char -> String
escapeURIChar c | isAscii c && isAlphaNum c = [c]
                | otherwise = printf "%%%02X" c
--------------------------------------------------------------------------------
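As a quick orientation before the build files below: a minimal sketch of calling the library from client code, assuming the archiver package is installed (the URL here is an arbitrary placeholder, not anything the package itself references):

    import Network.URL.Archiver (checkArchive)

    main :: IO ()
    main = do
      -- checkArchive validates the string with isURI and skips ".onion/" links itself,
      -- so malformed input is silently ignored rather than raising an error.
      checkArchive "http://example.com/some-page"

Each call fires all four archiving requests sequentially, so a single call may take several network round-trips.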
/Setup.lhs:
--------------------------------------------------------------------------------
#!/usr/bin/env runhaskell
> import Distribution.Simple
> main = defaultMain
--------------------------------------------------------------------------------

/archiver.cabal:
--------------------------------------------------------------------------------
name:            archiver
version:         0.7

license:         BSD3
license-file:    LICENSE
author:          Gwern
maintainer:      Gwern

category:        Documentation, Network
synopsis:        Archive supplied URLs in the Internet Archive
description:     `archiver` is a daemon which will process a specified text file,
                 each line of which is a URL, and will one by one, in random order,
                 request that the URLs be archived or spidered by the Internet Archive,
                 Wikiwix, and archive.today for future reference.
                 (One may optionally specify an arbitrary `sh` command like `wget` to download URLs locally.)
                 .
                 Because the interface is a simple text file, this can be combined
                 with other scripts; for example, a script using SQLite3 to extract
                 visited URLs from Firefox, or a program extracting URLs from Pandoc
                 documents.
                 .
                 For an explanation of the derivation of the code in `Network.URL.Archiver`,
                 see the author's website.

build-type:      Simple
cabal-version:   >= 1.6
stability:       provisional
tested-with:     GHC==6.12.1

source-repository head
  type:     git
  location: git://github.com/gwern/archiver-bot.git

Library
  exposed-modules: Network.URL.Archiver
  build-depends:   base>=4 && <5, network, HTTP, curl, http-conduit
  ghc-options:     -Wall

Executable archiver
  main-is:       archiver.hs
  other-modules: Network.URL.Archiver
  -- Network.URL.Archiver is compiled into the executable too,
  -- so it needs the library's dependencies as well:
  build-depends: base>=4 && <5, containers, bytestring, random, process, network,
                 network-uri, HTTP, curl, http-conduit
--------------------------------------------------------------------------------
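The cabal description's point about the text-file interface is easy to demonstrate: other programs feed the daemon simply by appending lines to its input file. A sketch in Haskell (the "urls.txt" filename is a hypothetical example, not mandated by the package):

    -- Queue a URL for a running archiver daemon by appending a line to its input file;
    -- the daemon re-reads the file on every pass, so new URLs are picked up automatically.
    queueURL :: FilePath -> String -> IO ()
    queueURL file url = appendFile file (url ++ "\n")

    main :: IO ()
    main = queueURL "urls.txt" "http://example.com/interesting-page"

Appending rather than rewriting the file is what makes the daemon safe to combine with concurrent producers such as the Firefox/SQLite3 script mentioned above.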
/archiver.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE ScopedTypeVariables #-}
import Control.Concurrent (threadDelay)
import qualified Control.Exception as CE (catch, IOException)
import Control.Monad (liftM, unless, when)
import Data.List (delete)
import qualified Data.Set as S (fromList, toList)
import Data.Maybe (fromMaybe)
import Network.HTTP (getRequest, simpleHTTP)
import Network.URI (isURI)
import System.Environment (getArgs)
import System.Process (runCommand, terminateProcess)
import qualified Data.ByteString.Char8 as B (length, lines, readFile, unlines, unpack, writeFile, ByteString)
import System.Random (getStdRandom, randomR)

import Network.URL.Archiver (checkArchive)

main :: IO ()
main = do args <- getArgs
          case args of
           [f]     -> archivePage f Nothing  Nothing
           [f,e]   -> archivePage f (Just e) Nothing
           [f,e,n] -> archivePage f (Just e) (Just (read n :: Int))
           _       -> error "must supply a filename"

archivePage :: FilePath -> Maybe String -> Maybe Int -> IO ()
archivePage file sh n = do -- default delay: 48 seconds (converted to microseconds, as threadDelay expects)
       let n' = 1000000 * fromMaybe 48 n
       let loop = archivePage file sh n
       connectedp <- CE.catch (simpleHTTP (getRequest "http://www.webcitation.org")) (\(_::CE.IOException) -> return (Left undefined))
       case connectedp of
        Left _ -> -- Left = ConnError; the network is not working! Sleep and try again later.
                  threadDelay n' >> loop
        Right _ -> do -- we have access to the WWW, it seems; proceed with mission!
               contents <- B.readFile file
               if B.length contents == 0
                then threadDelay n' >> loop -- nothing to do yet; wait for URLs to arrive
                else do
                 (url,rest) <- splitRandom contents
                 let url' = B.unpack url
                 when (isURI url') $ do
                        print url'
                        checkArchive url'
                        hdl <- case sh of
                                Nothing  -> return Nothing
                                Just sh' -> fmap Just $ runCommand (sh' ++ " '" ++ url' ++ "'")
                        -- the archive services ban clients making >=100 requests/hour; choke the loop
                        threadDelay n'
                        case hdl of
                         Nothing   -> return ()
                         Just hdl' -> terminateProcess hdl' -- GC the download process
                 unless (null rest) (writePages file url >> loop) -- stop once the file is exhausted

-- Re-read the possibly-modified 'file' from disk, remove the archived URL (and any
-- duplicate lines), and write it back out for 'archivePage' to read immediately.
writePages :: FilePath -> B.ByteString -> IO ()
writePages file done = do original <- liftM B.lines $ B.readFile file
                          let sorted = S.toList $ S.fromList original
                          let final = B.unlines $ filter (/= done) sorted
                          B.writeFile file final

-- Pick a random entry in the (nonempty) list.
splitRandom :: B.ByteString -> IO (B.ByteString, [B.ByteString])
splitRandom s = do let ss = B.lines s
                   let l = length ss
                   i <- getStdRandom (randomR (0, l-1)) -- l-1 keeps the index in bounds
                   let randpick = ss !! i
                   let removed = Data.List.delete randpick ss
                   return (randpick, removed)
--------------------------------------------------------------------------------
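A note on the default delay in archivePage: threadDelay takes microseconds, so the 48-second default works out to 3600/48 = 75 requests per hour, safely under the >=100 requests/hour ban threshold the comment mentions. A sketch of the same arithmetic:

    -- Convert a per-URL delay in seconds into threadDelay's microseconds,
    -- and show the resulting hourly request rate.
    main :: IO ()
    main = do let delaySeconds = 48 :: Int                -- archivePage's default
              let delayMicros  = 1000000 * delaySeconds   -- the value passed to threadDelay
              print delayMicros                           -- 48000000
              print (3600 `div` delaySeconds)             -- 75 requests/hour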