├── .gitignore
├── Makefile
├── OfflinePages
│   ├── ArchiveHTTPServer.py
│   └── __init__.py
├── README.md
├── bin
│   ├── offline-browse
│   └── offline-create
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
build
dist
*.egg-info

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
SHELL=/bin/bash

clean:
	rm -rf build dist *.egg-info *.pyc

install:
	python setup.py install

test:
	@echo test

.PHONY: clean install test
--------------------------------------------------------------------------------
/OfflinePages/ArchiveHTTPServer.py:
--------------------------------------------------------------------------------
"""Archive HTTP Server.

This module builds on BaseHTTPServer by implementing the standard GET
and HEAD requests in a fairly straightforward manner.

"""

__version__ = "0.1"

__all__ = ["ArchiveHTTPRequestHandler"]

import os
import posixpath
import BaseHTTPServer
import SimpleHTTPServer
import urllib
import mimetypes


class ArchiveHTTPRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):

    """Simple HTTP request handler with GET and HEAD commands.

    This serves files from the current directory and any of its
    subdirectories. The MIME type for files is determined by
    calling the .guess_type() method.

    The GET and HEAD requests are identical except that the HEAD
    request omits the actual contents of the file.

    """

    server_version = "ArchiveHTTP/" + __version__

    def send_head(self):
        """Common code for GET and HEAD commands.

        This sends the response code and MIME headers.

        Return value is either a file object (which has to be copied
        to the outputfile by the caller unless the command was HEAD,
        and must be closed by the caller under all circumstances), or
        None, in which case the caller has nothing further to do.

        """
        # Archived filenames may include a query string (e.g. "page?s=1"),
        # so first look the path up literally, then fall back to the path
        # with query parameters and fragments stripped.
        path = self.translate_path(self.path, strict=True)
        if not os.path.isfile(path):
            path = self.translate_path(self.path, strict=False)

        f = None
        if os.path.isdir(path):
            if not self.path.endswith('/'):
                # redirect browser - doing basically what apache does
                self.send_response(301)
                self.send_header("Location", self.path + "/")
                self.end_headers()
                return None
            for index in "index.html", "index.htm":
                index = os.path.join(path, index)
                if os.path.exists(index):
                    path = index
                    break
            else:
                return self.list_directory(path)
        ctype = self.guess_type(path)
        try:
            # Always read in binary mode. Opening files in text mode may cause
            # newline translations, making the actual size of the content
            # transmitted *less* than the content-length!
            f = open(path, 'rb')
        except IOError:
            self.send_error(404, "File not found")
            return None
        self.send_response(200)
        self.send_header("Content-type", ctype)
        fs = os.fstat(f.fileno())
        self.send_header("Content-Length", str(fs[6]))
        self.send_header("Last-Modified", self.date_time_string(fs.st_mtime))
        self.end_headers()
        return f

    def translate_path(self, path, strict):
        """Translate a /-separated PATH to the local filename syntax.

        When strict is false, query parameters and fragments are stripped
        before the lookup. Components that mean special things to the
        local file system (e.g. drive or directory names) are ignored.
        (XXX They should probably be diagnosed.)

        """
        # abandon query parameters
        if not strict:
            path = path.split('?', 1)[0]
            path = path.split('#', 1)[0]
        path = posixpath.normpath(urllib.unquote(path))
        words = path.split('/')
        words = filter(None, words)
        path = os.getcwd()
        for word in words:
            drive, word = os.path.splitdrive(word)
            head, word = os.path.split(word)
            if word in (os.curdir, os.pardir): continue
            path = os.path.join(path, word)
        return path

    def guess_type(self, path):
        """Guess the type of a file.

        Argument is a PATH (a filename).

        Return value is a string of the form type/subtype,
        usable for a MIME Content-type header.

        The default implementation looks the file's extension
        up in the table self.extensions_map, using text/html
        as a default; however it would be permissible (if
        slow) to look inside the data to make a better guess.

        """
        path = path.split('?', 1)[0]
        path = path.split('#', 1)[0]
        base, ext = posixpath.splitext(path)
        if ext in self.extensions_map:
            return self.extensions_map[ext]
        ext = ext.lower()
        if ext in self.extensions_map:
            return self.extensions_map[ext]
        else:
            return self.extensions_map['']

    if not mimetypes.inited:
        mimetypes.init()  # try to read system mime.types
    extensions_map = mimetypes.types_map.copy()
    extensions_map.update({
        '': 'text/html',  # Default
        '.py': 'text/plain',
        '.c': 'text/plain',
        '.h': 'text/plain',
        })


def test(HandlerClass=ArchiveHTTPRequestHandler,
         ServerClass=BaseHTTPServer.HTTPServer):
    BaseHTTPServer.test(HandlerClass, ServerClass)


if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
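A minimal sketch of driving this server by hand, assuming an archive has already been unpacked into a directory; this is essentially what `bin/offline-browse` (further below) automates, and the path, filename, and port here are only examples:

```
mkdir -p /tmp/offline-pages
tar xz -C /tmp/offline-pages -f wikipage.offline.tgz
cd /tmp/offline-pages
python -m OfflinePages.ArchiveHTTPServer 8000   # Python 2; serves the current directory
# then point a browser at http://localhost:8000/
```
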
/OfflinePages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iandennismiller/offline-pages/16fe0311b831a81b6e1149fb0a99ec7d7f35fe82/OfflinePages/__init__.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
offline-pages
=============

Offline-pages lets you save an entire website to a file, along with all the media required to view the pages offline. It's like your browser's "Save Page As" feature, except it isn't limited to a single page, so it can handle entire websites. All the inter-links point within the archive (i.e. it is fully self-contained), and the archive is easy to browse offline.

The goals of offline-pages are:

- capture entire sites, instead of single pages
- provide a simple method for viewing the archive with a standard web browser
- store external media and links in a self-contained file
- convert all external references so the archive has no online dependencies

Installation
------------

    git clone git://github.com/iandennismiller/offline-pages.git
    cd offline-pages
    sudo make install

Behind the scenes, `make install` uses setuptools to install a Python library and scripts. Tested on OS X Mountain Lion; other *NIXes are likely to work as well.

Usage
-----

Let's say you want to mirror the Wikipedia article for "Webarchive".

### 1. Create a file containing target URLs

This file can contain as many URLs as you want; they will all be added to the same archive.

```
echo http://en.wikipedia.org/wiki/Webarchive > urls.txt
```

### 2. Use this file as input to the `offline-create` program:

```
offline-create ./urls.txt wikipage
```

### 3. View the results

```
offline-browse wikipage.offline.tgz
```
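
The archive is an ordinary gzipped tarball, so if you only want to check what was captured without starting the built-in server, you can list its contents directly:

```
tar tzf wikipage.offline.tgz | head
```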

Use Case: archiving a forum thread
----------------------------------

Suppose there is a forum thread consisting of hundreds of posts spanning dozens of pages. Offline-pages can create a fully self-contained mirror of all of these pages, so the offline version can be navigated much like the online version. In this case, just create a URLs file containing each of the pages you want to include in the archive.

### example forum: vBulletin

For the purpose of this example, we will look at a vBulletin forum. The base URL for a vBulletin forum thread might look something like this:

```
http://www.example.com/vb/threads/1234-this-thread
```

Subsequent pages of the forum thread simply append "pageX" to the URL, like this:

```
http://www.example.com/vb/threads/1234-this-thread/page2
```

### 1. identify target URLs

Begin with the stable portion of the URL and write all of the URLs to a file at once:

```
export BASE_URL=http://www.example.com/vb/threads/1234-this-thread/page
for i in $(seq 1 40); do echo ${BASE_URL}${i}; done > urls.txt
```

### 2. fix the first URL

If you look at the file, everything looks right except the first URL. Because the loop generated every entry, including the first, the first line currently says:

```
http://www.example.com/vb/threads/1234-this-thread/page1
```

We don't want "page1" there (because that's not how vBulletin works), so edit `urls.txt` and change the first line back to the real URL:

```
http://www.example.com/vb/threads/1234-this-thread
```

However, that won't work correctly either. The remaining URLs require a directory called **1234-this-thread** to hold files such as **page2**, but this first URL would be saved as a plain file named **1234-this-thread**. A file and a directory cannot share the same name, so the archival process would be unable to save **page2**. To fix this, modify the first line one more time and add a harmless query parameter:

```
http://www.example.com/vb/threads/1234-this-thread?s=1
```

Notice the **?s=1** added to the end of the URL. vBulletin ignores it, but it changes the saved filename, so the archival process can create the **1234-this-thread** directory without a conflict.
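
If you would rather not open an editor, both changes to the first line can be applied in one step. This is a sketch using BSD/macOS `sed` (the same flavor the scripts in this repository use); GNU sed takes `-i` without the empty suffix argument:

```
sed -i '' '1s|/page1$|?s=1|' urls.txt
```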

### 3. archive the thread

Now you are ready to archive the thread.

```
offline-create ./urls.txt vbulletin-thread
```
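
Once that finishes, view the result the same way as in the Usage section above; `offline-create` names its output `<name>.offline.tgz`:

```
offline-browse vbulletin-thread.offline.tgz
```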

See also
--------

### WARC (Web ARChive)

[http://en.wikipedia.org/wiki/Web_ARChive](http://en.wikipedia.org/wiki/Web_ARChive)

The Internet Archive created an awesome file format called WARC for storing their web crawls. It seems to offer a great toolkit for very serious work, but it's not easy to use at all. I never figured out how to simply "browse" a web archive using my browser, so even though it might be able to get the job done, it is overly complex for simple tasks.

### HTTrack

[https://secure.wikimedia.org/wikipedia/en/wiki/Httrack](https://secure.wikimedia.org/wikipedia/en/wiki/Httrack)

HTTrack looks like a full-featured and relatively simple site mirroring tool. Unfortunately, it does not compile under OS X Mountain Lion, so I was unable to evaluate how easy its offline archive file format is to use.

### MHTML

[http://en.wikipedia.org/wiki/MHTML](http://en.wikipedia.org/wiki/MHTML)

MHTML can save a single web page's assets (images, CSS, and so on) into one .html file, much the same way email messages can include images and attachments. The drawback is that it covers a single page rather than multiple pages or an entire website, so as soon as you click a link you are kicked out of the archive and back onto the live Internet.

### MAF

[http://en.wikipedia.org/wiki/Mozilla_Archive_Format](http://en.wikipedia.org/wiki/Mozilla_Archive_Format)

Mozilla Archive Format might have provisions for saving multiple files into an archive. It appears to be integrated with Firefox, but I haven't played with it enough to test its capabilities.

### Webarchive (Safari)

[http://en.wikipedia.org/wiki/Webarchive](http://en.wikipedia.org/wiki/Webarchive)

Safari can save pages for offline use, but the end product behaves a lot like a PDF because it is sort of a static page of text. Webarchive has the same single-page drawbacks as MHTML.

--------------------------------------------------------------------------------
/bin/offline-browse:
--------------------------------------------------------------------------------
#!/bin/bash
# Unpack an offline-pages archive into /tmp/offline-pages and serve it locally.

rm -rf /tmp/offline-pages
mkdir -p /tmp/offline-pages
tar xz -C /tmp/offline-pages -f "$1"

# pick a port in the 8000-8099 range and open a browser (macOS `open`) once the server is up
PORT=$(( (RANDOM % 100) + 8000 ))
bash -c "sleep 3; open http://localhost:${PORT}" &
pushd /tmp/offline-pages
python -m OfflinePages.ArchiveHTTPServer $PORT
popd
--------------------------------------------------------------------------------
/bin/offline-create:
--------------------------------------------------------------------------------
#!/bin/bash

UA="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko) \
Chrome/23.0.1271.6 Safari/537.11"
# first URL in the list, rewritten as a root-relative path
FIRST=$(head -n1 "$1" | sed 's/http:\/\//\//')

rm -rf /tmp/offline-pages-www
mkdir -p /tmp/offline-pages-www

# index.html redirects the browser to the first archived page
cat > /tmp/offline-pages-www/index.html <<EOF
<html>
  <head>
    <meta http-equiv="refresh" content="0; url=${FIRST}">
  </head>
</html>
EOF

# mirror every URL in the list, along with page requisites (images, CSS, etc.)
wget -v --span-hosts --timestamping --convert-links --page-requisites \
    --random-wait --wait=3 --directory-prefix="/tmp/offline-pages-www" \
    --user-agent="$UA" --input-file="$1"

# rewrite absolute http:// references into root-relative paths;
# LC_CTYPE=C keeps BSD sed from choking on non-UTF-8 bytes
export LC_CTYPE='C'
grep -ril 'http://' /tmp/offline-pages-www | grep -v jpg | grep -v gif | grep -v png \
    | xargs sed -i '' 's/http:\/\//\//g'

cp $(find /tmp/offline-pages-www -name favicon.ico | head -n1) /tmp/offline-pages-www

tar cz -C /tmp/offline-pages-www -f "$2".offline.tgz .
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

version = '0.1'

setup(name='offline-pages',
      version=version,
      description="offline pages",
      scripts=[
          "bin/offline-browse",
          "bin/offline-create",
      ],
      long_description="""offline pages""",
      classifiers=[],  # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
      keywords='',
      author='',
      author_email='',
      url='',
      #install_requires = [],
      packages=["OfflinePages"],
      license='MIT',
      zip_safe=False,
      )
--------------------------------------------------------------------------------