├── .gitignore
├── Makefile
├── OfflinePages
│   ├── ArchiveHTTPServer.py
│   └── __init__.py
├── README.md
├── bin
│   ├── offline-browse
│   └── offline-create
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
build
dist
*.egg-info

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
SHELL=/bin/bash

clean:
	rm -rf build dist *.egg-info *.pyc

install:
	python setup.py install

test:
	@echo test

.PHONY: clean install test
--------------------------------------------------------------------------------
/OfflinePages/ArchiveHTTPServer.py:
--------------------------------------------------------------------------------
"""Archive HTTP Server.

This module builds on BaseHTTPServer by implementing the standard GET
and HEAD requests in a fairly straightforward manner.

"""

__version__ = "0.1"

__all__ = ["ArchiveHTTPRequestHandler"]

import os
import posixpath
import BaseHTTPServer
import SimpleHTTPServer
import urllib
import mimetypes


class ArchiveHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):

    """Simple HTTP request handler with GET and HEAD commands.

    This serves files from the current directory and any of its
    subdirectories. The MIME type for files is determined by
    calling the .guess_type() method.

    The GET and HEAD requests are identical except that the HEAD
    request omits the actual contents of the file.

    """

    server_version = "ArchiveHTTP/" + __version__

    def send_head(self):
        """Common code for GET and HEAD commands.

        This sends the response code and MIME headers.

        Return value is either a file object (which has to be copied
        to the outputfile by the caller unless the command was HEAD,
        and must be closed by the caller under all circumstances), or
        None, in which case the caller has nothing further to do.

        """
        # Archived filenames may include a query string (e.g. "page?s=1"),
        # so first look the path up literally, then fall back to the path
        # with query parameters and fragments stripped.
        path = self.translate_path(self.path, strict=True)
        if not os.path.isfile(path):
            path = self.translate_path(self.path, strict=False)

        f = None
        if os.path.isdir(path):
            if not self.path.endswith('/'):
                # redirect browser - doing basically what apache does
                self.send_response(301)
                self.send_header("Location", self.path + "/")
                self.end_headers()
                return None
            for index in "index.html", "index.htm":
                index = os.path.join(path, index)
                if os.path.exists(index):
                    path = index
                    break
            else:
                return self.list_directory(path)
        ctype = self.guess_type(path)
        try:
            # Always read in binary mode. Opening files in text mode may cause
            # newline translations, making the actual size of the content
            # transmitted *less* than the content-length!
            f = open(path, 'rb')
        except IOError:
            self.send_error(404, "File not found")
            return None
        self.send_response(200)
        self.send_header("Content-type", ctype)
        fs = os.fstat(f.fileno())
        self.send_header("Content-Length", str(fs[6]))
        self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
        self.end_headers()
        return f

    def translate_path(self, path, strict):
        """Translate a /-separated PATH to the local filename syntax.

        When strict is false, query parameters and fragments are stripped
        before the lookup. Components that mean special things to the
        local file system (e.g. drive or directory names) are ignored.
        (XXX They should probably be diagnosed.)

        """
        # abandon query parameters
        if not strict:
            path = path.split('?', 1)[0]
            path = path.split('#', 1)[0]
        path = posixpath.normpath(urllib.unquote(path))
        words = path.split('/')
        words = filter(None, words)
        path = os.getcwd()
        for word in words:
            drive, word = os.path.splitdrive(word)
            head, word = os.path.split(word)
            if word in (os.curdir, os.pardir): continue
            path = os.path.join(path, word)
        return path

    def guess_type(self, path):
        """Guess the type of a file.

        Argument is a PATH (a filename).

        Return value is a string of the form type/subtype,
        usable for a MIME Content-type header.

        The default implementation looks the file's extension
        up in the table self.extensions_map, using text/html
        as a default; however it would be permissible (if
        slow) to look inside the data to make a better guess.

        """
        path = path.split('?', 1)[0]
        path = path.split('#', 1)[0]
        base, ext = posixpath.splitext(path)
        if ext in self.extensions_map:
            return self.extensions_map[ext]
        ext = ext.lower()
        if ext in self.extensions_map:
            return self.extensions_map[ext]
        else:
            return self.extensions_map['']

    if not mimetypes.inited:
        mimetypes.init()  # try to read system mime.types
    extensions_map = mimetypes.types_map.copy()
    extensions_map.update({
        '': 'text/html',  # Default
        '.py': 'text/plain',
        '.c': 'text/plain',
        '.h': 'text/plain',
        })


def test(HandlerClass=ArchiveHTTPRequestHandler,
         ServerClass=BaseHTTPServer.HTTPServer):
    BaseHTTPServer.test(HandlerClass, ServerClass)


if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
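A minimal sketch of driving this server by hand, assuming an archive has already been unpacked into a directory; this is essentially what `bin/offline-browse` (further below) automates, and the path, filename, and port here are only examples:

```
mkdir -p /tmp/offline-pages
tar xz -C /tmp/offline-pages -f wikipage.offline.tgz
cd /tmp/offline-pages
python -m OfflinePages.ArchiveHTTPServer 8000   # Python 2; serves the current directory
# then point a browser at http://localhost:8000/
```
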
/OfflinePages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iandennismiller/offline-pages/16fe0311b831a81b6e1149fb0a99ec7d7f35fe82/OfflinePages/__init__.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
offline-pages
=============

Offline-pages lets you save an entire website to a file, along with all the media required to view the pages offline. It's like your browser's "Save Page As" feature, except it isn't limited to a single page, so it can handle entire websites. All the inter-links point within the archive (i.e. it is fully self-contained), and the archive is easy to browse offline.

The goals of offline-pages are:

- capture entire sites, instead of single pages
- provide a simple method for viewing the archive with a standard web browser
- store external media and links in a self-contained file
- convert all external references so the archive has no online dependencies

Installation
------------

    git clone git://github.com/iandennismiller/offline-pages.git
    cd offline-pages
    sudo make install

Behind the scenes, `make install` uses setuptools to install a Python library and scripts. Tested on OS X Mountain Lion; other *NIXes are likely to work as well.

Usage
-----

Let's say you want to mirror the Wikipedia article for "Webarchive".

### 1. Create a file containing target URLs

This file can contain as many URLs as you want; they will all be added to the same archive.

```
echo http://en.wikipedia.org/wiki/Webarchive > urls.txt
```

### 2. Use this file as input to the `offline-create` program:

```
offline-create ./urls.txt wikipage
```

### 3. View the results

```
offline-browse wikipage.offline.tgz
```
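
The archive is an ordinary gzipped tarball, so if you only want to check what was captured without starting the built-in server, you can list its contents directly:

```
tar tzf wikipage.offline.tgz | head
```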

Use Case: archiving a forum thread
----------------------------------

Suppose there is a forum thread consisting of hundreds of posts spanning dozens of pages. Offline-pages can create a fully self-contained mirror of all of these pages, so the offline version can be navigated much like the online version. In this case, just create a URLs file containing each of the pages you want to include in the archive.

### example forum: vBulletin

For the purpose of this example, we will look at a vBulletin forum. The base URL for a vBulletin forum thread might look something like this:

```
http://www.example.com/vb/threads/1234-this-thread
```

Subsequent pages of the forum thread simply append "pageX" to the URL, like this:

```
http://www.example.com/vb/threads/1234-this-thread/page2
```

### 1. identify target URLs

Begin with the stable portion of the URL and write all of the URLs to a file at once:

```
export BASE_URL=http://www.example.com/vb/threads/1234-this-thread/page
for i in $(seq 1 40); do echo ${BASE_URL}${i}; done > urls.txt
```

### 2. fix the first URL

If you look at the file, everything looks right except the first URL. Because the loop generated every entry, including the first, the first line currently says:

```
http://www.example.com/vb/threads/1234-this-thread/page1
```

We don't want "page1" there (because that's not how vBulletin works), so edit `urls.txt` and change the first line back to the real URL:

```
http://www.example.com/vb/threads/1234-this-thread
```

However, that won't work correctly either. The remaining URLs require a directory called **1234-this-thread** to hold files such as **page2**, but this first URL would be saved as a plain file named **1234-this-thread**. A file and a directory cannot share the same name, so the archival process would be unable to save **page2**. To fix this, modify the first line one more time and add a harmless query parameter:

```
http://www.example.com/vb/threads/1234-this-thread?s=1
```

Notice the **?s=1** added to the end of the URL. vBulletin ignores it, but it changes the saved filename, so the archival process can create the **1234-this-thread** directory without a conflict.
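
If you would rather not open an editor, both changes to the first line can be applied in one step. This is a sketch using BSD/macOS `sed` (the same flavor the scripts in this repository use); GNU sed takes `-i` without the empty suffix argument:

```
sed -i '' '1s|/page1$|?s=1|' urls.txt
```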

### 3. archive the thread

Now you are ready to archive the thread.

```
offline-create ./urls.txt vbulletin-thread
```
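
Once that finishes, view the result the same way as in the Usage section above; `offline-create` names its output `<name>.offline.tgz`:

```
offline-browse vbulletin-thread.offline.tgz
```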

See also
--------

### WARC (Web ARChive)

[http://en.wikipedia.org/wiki/Web_ARChive](http://en.wikipedia.org/wiki/Web_ARChive)

The Internet Archive created an awesome file format called WARC for storing their web crawls. It seems to offer a great toolkit for very serious work, but it's not easy to use at all. I never figured out how to simply "browse" a web archive using my browser, so even though it might be able to get the job done, it is overly complex for simple tasks.

### HTTrack

[https://secure.wikimedia.org/wikipedia/en/wiki/Httrack](https://secure.wikimedia.org/wikipedia/en/wiki/Httrack)

HTTrack looks like a full-featured and relatively simple site mirroring tool. Unfortunately, it does not compile under OS X Mountain Lion, so I was unable to evaluate how easy its offline archive file format is to use.

### MHTML

[http://en.wikipedia.org/wiki/MHTML](http://en.wikipedia.org/wiki/MHTML)

MHTML can save a single web page's assets (images, CSS, and so on) into one .html file, much the same way email messages can include images and attachments. The drawback is that it covers a single page rather than multiple pages or an entire website, so as soon as you click a link you are kicked out of the archive and back onto the live Internet.

### MAF

[http://en.wikipedia.org/wiki/Mozilla_Archive_Format](http://en.wikipedia.org/wiki/Mozilla_Archive_Format)

Mozilla Archive Format might have provisions for saving multiple files into an archive. It appears to be integrated with Firefox, but I haven't played with it enough to test its capabilities.

### Webarchive (Safari)

[http://en.wikipedia.org/wiki/Webarchive](http://en.wikipedia.org/wiki/Webarchive)

Safari can save pages for offline use, but the end product behaves a lot like a PDF because it is sort of a static page of text. Webarchive has the same single-page drawbacks as MHTML.

--------------------------------------------------------------------------------
/bin/offline-browse:
--------------------------------------------------------------------------------
#!/bin/bash
# Unpack an offline-pages archive into /tmp/offline-pages and serve it locally.

rm -rf /tmp/offline-pages
mkdir -p /tmp/offline-pages
tar xz -C /tmp/offline-pages -f "$1"

# pick a port in the 8000-8099 range and open a browser (macOS `open`) once the server is up
PORT=$(( (RANDOM % 100) + 8000 ))
bash -c "sleep 3; open http://localhost:${PORT}" &
pushd /tmp/offline-pages
python -m OfflinePages.ArchiveHTTPServer $PORT
popd
--------------------------------------------------------------------------------
/bin/offline-create:
--------------------------------------------------------------------------------
#!/bin/bash

UA="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko) \
Chrome/23.0.1271.6 Safari/537.11"
# first URL in the list, rewritten as a root-relative path
FIRST=$(head -n1 "$1" | sed 's/http:\/\//\//')

rm -rf /tmp/offline-pages-www
mkdir -p /tmp/offline-pages-www

# index.html redirects the browser to the first archived page
cat > /tmp/offline-pages-www/index.html <<EOF
<html>
  <head>
    <meta http-equiv="refresh" content="0; url=${FIRST}">
  </head>
</html>
EOF

# mirror every URL in the list, along with page requisites (images, CSS, etc.)
wget -v --span-hosts --timestamping --convert-links --page-requisites \
    --random-wait --wait=3 --directory-prefix="/tmp/offline-pages-www" \
    --user-agent="$UA" --input-file="$1"

# rewrite absolute http:// references into root-relative paths;
# LC_CTYPE=C keeps BSD sed from choking on non-UTF-8 bytes
export LC_CTYPE='C'
grep -ril 'http://' /tmp/offline-pages-www | grep -v jpg | grep -v gif | grep -v png \
    | xargs sed -i '' 's/http:\/\//\//g'

cp $(find /tmp/offline-pages-www -name favicon.ico | head -n1) /tmp/offline-pages-www

tar cz -C /tmp/offline-pages-www -f "$2".offline.tgz .
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

version = '0.1'

setup(name='offline-pages',
      version=version,
      description="offline pages",
      scripts=[
          "bin/offline-browse",
          "bin/offline-create",
      ],
      long_description="""offline pages""",
      classifiers=[],  # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
      keywords='',
      author='',
      author_email='',
      url='',
      #install_requires = [],
      packages=["OfflinePages"],
      license='MIT',
      zip_safe=False,
      )
--------------------------------------------------------------------------------