├── .coveragerc
├── .gitignore
├── LICENSE.txt
├── MANIFEST.in
├── Makefile
├── README.md
├── README
│   ├── logo.png
│   └── logo@2x.png
├── dev-requirements.txt
├── requirements.txt
├── setup.cfg
├── setup.py
├── test.py
└── webcrystal.py

/.coveragerc:
--------------------------------------------------------------------------------
 1 | [run]
 2 | 
 3 | # Source files to ignore
 4 | omit =
 5 |     # Ignore 3rd party modules like: django, selenium, markdown, yaml, etc
 6 |     /usr/local/lib/*
 7 |     venv/*
 8 | 
 9 |     # Ignore the test suite itself, which artificially boosts the average
10 |     test.py
11 | 
12 | # Track coverage when running the proxy as a separate Process
13 | concurrency = multiprocessing
14 | 
15 | 
16 | [report]
17 | 
18 | # Ignore files that are 100% covered, like __init__.py
19 | #
20 | # NOTE: This option seems to be ignored by the HTML report generator,
21 | #       although it IS respected by the CLI report generator.
22 | skip_covered = True
23 | 
24 | # Exclude lines from coverage that contain ...
25 | exclude_lines =
26 |     pragma: no cover
27 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | *.pyc
 3 | *.pyo
 4 | 
 5 | # OS X
 6 | .DS_Store
 7 | 
 8 | # Archives
 9 | *.wbcr
10 | 
11 | # PyCharm
12 | .idea
13 | 
14 | # coverage
15 | .coverage
16 | .coverage.*
17 | htmlcov
18 | 
19 | # setup.py
20 | build
21 | dist
22 | MANIFEST
23 | README.rst
24 | *.egg-info
25 | 
26 | # virtualenv
27 | venv
28 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 David Foster
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include README.rst
 2 | include LICENSE.txt
 3 | include MANIFEST.in
 4 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Run all automated tests
 2 | test:
 3 | 	nosetests
 4 | 	@#python3 test.py  # if nose is not available
 5 | 
 6 | # Collect code coverage metrics
 7 | coverage:
 8 | 	coverage run test.py && coverage combine && coverage html
 9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # webcrystal
  2 | 
  3 | <img src="README/logo.png" alt="webcrystal logo">
  4 | 
  5 | webcrystal is:
  6 | 
  7 | 1. An HTTP proxy and web service that saves every web page accessed through it to disk.
  8 | 2. An on-disk archival format for storing websites.
  9 | 
 10 | webcrystal is intended as a tool for archiving websites. It is also intended to be convenient to write HTTP-based and browser-based web scrapers on top of.
 11 | 
 12 | 
 13 | ## Features
 14 | 
 15 | * Compact package: One .py file. Only one dependency (`urllib3`).
 16 | * A simple documented archival format.
 17 | * >95% code coverage, enforced by the test suite.
 18 | * Friendly MIT license.
 19 | * Excellent documentation.
 20 | 
 21 | 
 22 | ## Installation
 23 | 
 24 | * Install [Python 3].
 25 | * From a command-line terminal (Terminal on OS X, Command Prompt on Windows), run the command:
 26 | 
 27 | ```
 28 | pip3 install webcrystal
 29 | ```
 30 | 
 31 | [Python 3]: https://www.python.org/downloads/
 32 | 
 33 | 
 34 | ## Quickstart
 35 | 
 36 | To start the proxy, run a command like:
 37 | 
 38 | ```
 39 | webcrystal.py 9227 xkcd.wbcr http://xkcd.com/
 40 | ```
 41 | 
 42 | Then you can visit <http://localhost:9227/> to have the same effect as visiting <http://xkcd.com/> directly, except that all requests are archived in `xkcd.wbcr/`.
 43 | 
 44 | When you access an HTTP resource through the webcrystal proxy for the first time, it will be fetched from the origin HTTP server and archived locally. All subsequent requests for the same resource will be returned from the archive.
 45 | 
 46 | 
 47 | ## CLI
 48 | 
 49 | To start the webcrystal proxy:
 50 | 
 51 | ```
 52 | webcrystal.py [--help] [--quiet] <port> <archive_dirpath> [<default_origin_domain>]
 53 | ```
 54 | 
 55 | To stop the proxy, press ^C or send a SIGINT signal to it.
 56 | 
 57 | ### Full Syntax
 58 | 
 59 | ```
 60 | webcrystal.py --help
 61 | ```
 62 | 
 63 | This outputs:
 64 | 
 65 | ```
 66 | usage: webcrystal.py [-h] [-q] port archive_dirpath [default_origin_domain]
 67 | 
 68 | An archiving HTTP proxy and web service.
 69 | 
 70 | positional arguments:
 71 |   port                  Port on which to run the HTTP proxy. Suggest 9227
 72 |                         (WBCR).
 73 |   archive_dirpath       Path to the archive directory. Usually has .wbcr
 74 |                         extension.
 75 |   default_origin_domain
 76 |                         Default HTTP domain which the HTTP proxy will redirect
 77 |                         to if no URL is specified.
 78 | 
 79 | optional arguments:
 80 |   -h, --help            Show this help message and exit.
 81 |   -q, --quiet           Suppresses all output.
 82 | ```
 83 | 
 84 | 
 85 | ## HTTP API
 86 | 
 87 | The HTTP API is the primary API for interacting with the webcrystal proxy.
 88 | 
 89 | While the proxy is running, it responds to the following HTTP endpoints.
 90 | 
 91 | Notice that GET is an accepted method for all endpoints, so that they can be easily requested using a regular web browser. Browser accessibility is convenient for manual inspection and browser-based website scrapers.
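For example, the endpoints documented below can be driven from any HTTP client. Here is a minimal sketch in Python using `urllib3` (webcrystal's one dependency); it assumes a proxy already running on port 9227 with the `xkcd.wbcr` archive, as in the Quickstart:

```
import urllib3

http = urllib3.PoolManager()
PROXY = 'http://localhost:9227'

# First access fetches the page from the origin server and archives it.
response = http.request('GET', PROXY + '/_/http/xkcd.com/')
print(response.status)  # 200

# Switch to offline mode: serve only from the archive, never fetch.
http.request('POST', PROXY + '/_offline')
response = http.request('GET', PROXY + '/_/http/xkcd.com/')
print(response.status)  # 200, served from xkcd.wbcr/

# Resume fetching missing resources from the origin server.
http.request('POST', PROXY + '/_online')
```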
 92 | 
 93 | ### `GET,HEAD /`
 94 | 
 95 | Redirects to the home page of the default origin domain if it was specified at the CLI. Returns:
 96 | 
 97 | * HTTP 404 (Not Found) if no default origin domain is specified (the default) or
 98 | * HTTP 307 (Temporary Redirect) to the default origin domain if it is specified.
 99 | 
100 | ### `GET,HEAD /_/http[s]/__PATH__`
101 | 
102 | If in online mode (the default):
103 | 
104 | * The requested resource will be fetched from the origin server and added to the archive if:
105 |     * (1) it is not already archived,
106 |     * (2) a `Cache-Control=no-cache` request header is specified, or
107 |     * (3) a `Pragma=no-cache` request header is specified.
108 | * The newly archived resource will be returned to the client, with all URLs in HTTP headers and content rewritten to point to the proxy.
109 | * If there was a problem communicating with the origin server, returns:
110 |     * HTTP 502 (Bad Gateway), with an HTML page that provides a link to the online version of the content.
111 | 
112 | If in offline mode:
113 | 
114 | * If the resource is in the archive, it will be returned to the client, with all URLs in HTTP headers and content rewritten to point to the proxy.
115 | * If the resource is not in the archive, returns:
116 |     * HTTP 503 (Service Unavailable), with an HTML page that provides a link to the online version of the content.
117 | 
118 | ### `POST,GET /_online`
119 | 
120 | Switches the proxy to online mode.
121 | 
122 | ### `POST,GET /_offline`
123 | 
124 | Switches the proxy to offline mode.
125 | 
126 | ### `GET,HEAD /_raw/http[s]/__PATH__`
127 | 
128 | Returns the specified resource from the archive if it is already archived. Nothing about the resource is rewritten, including any URLs that appear in HTTP headers or content. The intent is that the returned resource be as close to the original response from the origin server as is practical.
129 | 
130 | If the resource is not in the archive, returns:
131 | 
132 | * HTTP 503 (Service Unavailable), with an HTML page that provides a link to the online version of the content.
133 | 
134 | ### `POST,GET /_refresh/http[s]/__PATH__`
135 | 
136 | Refetches the specified URL from the origin server, using the same request headers as the last time it was fetched. Returns:
137 | 
138 | * HTTP 200 (OK) if successful,
139 | * HTTP 404 (Not Found) if the specified URL was not in the archive, or
140 | * HTTP 502 (Bad Gateway) if there was a problem communicating with the origin server.
141 | 
142 | ### `POST,GET /_delete/http[s]/__PATH__`
143 | 
144 | Deletes the specified URL from the archive. Returns:
145 | 
146 | * HTTP 200 (OK) if successful, or
147 | * HTTP 404 (Not Found) if the specified URL was not in the archive.
148 | 
149 | 
150 | ## Archival Format
151 | 
152 | When the proxy is started with a command like:
153 | 
154 | ```
155 | webcrystal.py 9227 website.wbcr
156 | ```
157 | 
158 | it creates an archive in the directory `website.wbcr/` in the following format:
159 | 
160 | 
161 | ### `website.wbcr/index.txt`
162 | 
163 | * Lists the URL of each archived HTTP resource, one per line.
164 | * UTF-8 encoded text file with Unix line endings (`\n`).
165 | 
166 | Example:
167 | 
168 | ```
169 | http://xkcd.com/
170 | http://xkcd.com/s/b0dcca.css
171 | http://xkcd.com/1645/
172 | ```
173 | 
174 | The preceding example archive contains 3 HTTP resources, numbered #1, #2, and #3.
175 | 
176 | 
177 | ### `website.wbcr/1.request_headers.json`
178 | 
179 | * Contains the HTTP request headers sent to the origin HTTP server to obtain HTTP resource #1.
180 | * UTF-8 encoded JSON file. 181 | 182 | Example: 183 | 184 | ``` 185 | {"Accept-Language": "en-us", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Host": "xkcd.com", "Accept-Encoding": "gzip, deflate", "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/601.4.4 (KHTML, like Gecko) Version/9.0.3 Safari/601.4.4"} 186 | ``` 187 | 188 | 189 | ### `website.wbcr/1.response_headers.json` 190 | 191 | * Contains the HTTP response headers received from the origin HTTP server when obtaining HTTP resource #1. 192 | * UTF-8 encoded JSON file. 193 | * Contains an internal "X-Status-Code" header that indicates the HTTP status code received from the origin HTTP server. 194 | 195 | Example: 196 | 197 | ``` 198 | {"Cache-Control": "public", "Connection": "keep-alive", "Accept-Ranges": "bytes", "X-Cache-Hits": "0", "Date": "Tue, 15 Mar 2016 04:37:05 GMT", "Age": "0", "X-Served-By": "cache-sjc3628-SJC", "Content-Type": "text/html", "Server": "lighttpd/1.4.28", "X-Status-Code": "404", "X-Cache": "MISS", "Content-Length": "345", "X-Timer": "S1458016625.375814,VS0,VE148", "Via": "1.1 varnish"} 199 | ``` 200 | 201 | ### `website.wbcr/1.response_body.dat` 202 | 203 | * Contains the contents of the HTTP response body received from the origin HTTP server when obtaining HTTP resource #1. 204 | * Binary file. 205 | 206 | 207 | ## Contributing 208 | 209 | ### Install Dev Requirements 210 | 211 | ``` 212 | pip3 install -r dev-requirements.txt 213 | ``` 214 | 215 | ### Run the Tests 216 | 217 | ``` 218 | make test 219 | ``` 220 | 221 | ### Gather Code Coverage Metrics 222 | 223 | ``` 224 | make coverage 225 | open htmlcov/index.html 226 | ``` 227 | 228 | ### Release a New Version 229 | 230 | * Ensure the tests pass. 231 | * Ensure the changelog is updated. 232 | * Bump the version number in `setup.py`. 233 | * `python3 setup.py sdist bdist_wheel upload` 234 | - There are more advanced [upload techniques] that might be used later. 235 | * Tag the release in Git. 236 | 237 | [upload techniques]: https://packaging.python.org/en/latest/distributing/#upload-your-distributions 238 | 239 | 240 | ## Known Limitations 241 | 242 | * Sites that vary the content served at a particular URL depending on whether you are logged in can have only one version of the URL archived. 243 | 244 | 245 | ## Related Projects 246 | 247 | * [Crystal Web Archiver] - An alternative website archiving tool that focuses 248 | on making it easy for humans (rather than for automated crawlers) to 249 | download websites. 250 | 251 | [Crystal Web Archiver]: http://dafoster.net/projects/crystal-web-archiver 252 | 253 | 254 | ## License 255 | 256 | This code is provided under the MIT License. See LICENSE file for details. 257 | 258 | 259 | ## Changelog 260 | 261 | * master 262 | - Add /_raw/ endpoint. 263 | - Improve error reporting when origin server is unavailable. 264 | See HTTP 502 (Bad Gateway) response situations. 265 | * v1.0.1 266 | - More robust support for HTTPS URLs on OS X 10.11. 267 | - Validate HTTPS certificates. 
268 | * v1.0 - Initial release 269 | -------------------------------------------------------------------------------- /README/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidfstr/webcrystal/a2aff76b8998bb58d11ed17006b02247344de321/README/logo.png -------------------------------------------------------------------------------- /README/logo@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davidfstr/webcrystal/a2aff76b8998bb58d11ed17006b02247344de321/README/logo@2x.png -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | nose==1.3.7 2 | coverage==4.0.3 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | urllib3==1.26.4 2 | 3 | # Needed for urllib3 to perform HTTPS certificate validation 4 | certifi==2020.12.5 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | #logging-clear-handlers=true 3 | with-coverage=true 4 | cover-package=webcrystal 5 | cover-min-percentage=96 6 | #cover-erase=true 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import os 3 | import sys 4 | 5 | def is_older_than(file1, file2): 6 | if not os.path.exists(file1) or not os.path.exists(file2): 7 | return False 8 | return os.path.getmtime(file1) < os.path.getmtime(file2) 9 | 10 | # Generate README.rst if missing or out of date 11 | if not os.path.exists('README.rst') or is_older_than('README.rst', 'README.md'): 12 | result = os.system('pandoc --from=markdown --to=rst --output=README.rst README.md') 13 | if result == 0x7f00: 14 | sys.exit('Pandoc is not installed. It is required when changing README.md.') 15 | if result != 0: 16 | sys.exit('Pandoc exited with error code %s while processing README.md.' 
% result) 17 | 18 | with open('README.rst') as file: 19 | long_description = file.read() 20 | 21 | setup( 22 | # Identity 23 | name='webcrystal', 24 | version='1.0.1', 25 | 26 | # Contents 27 | py_modules=['webcrystal'], 28 | scripts=['webcrystal.py'], 29 | 30 | # Metadata 31 | author='David Foster', 32 | author_email='david@dafoster.net', 33 | url='http://dafoster.net/projects/webcrystal/', 34 | description='A website archival tool and format.', 35 | long_description=long_description, 36 | license='MIT', 37 | # see: https://pypi.python.org/pypi?%3Aaction=list_classifiers 38 | classifiers=[ 39 | 'Development Status :: 5 - Production/Stable', 40 | 'Environment :: Web Environment', 41 | 'Intended Audience :: Developers', 42 | 'License :: OSI Approved :: MIT License', 43 | 'Operating System :: OS Independent', 44 | 'Programming Language :: Python', 45 | 'Programming Language :: Python :: 3.4', 46 | 'Topic :: Database', 47 | 'Topic :: Internet :: Proxy Servers', 48 | 'Topic :: Internet :: WWW/HTTP', 49 | 'Topic :: Internet :: WWW/HTTP :: HTTP Servers', 50 | 'Topic :: System :: Archiving', 51 | 'Topic :: System :: Archiving :: Backup', 52 | 'Topic :: System :: Archiving :: Mirroring', 53 | ], 54 | 55 | # Dependencies 56 | install_requires=[ 57 | # Need 1.15.x for header ordering guarantees 58 | 'urllib3>=1.15.1', 59 | 60 | # Needed for urllib3 to fetch HTTPS URLs on OS X 10.11 61 | 'pyopenssl', 'ndg-httpsclient', 'pyasn1', 62 | 63 | # Needed for urllib3 to perform HTTPS certificate validation 64 | 'certifi', 65 | ] 66 | ) -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import webcrystal 2 | from webcrystal import _format_proxy_path as format_proxy_path 3 | from webcrystal import _format_proxy_url as format_proxy_url 4 | from collections import OrderedDict 5 | import gzip 6 | from http.server import HTTPServer, BaseHTTPRequestHandler 7 | from io import BytesIO 8 | from multiprocessing import Process 9 | import os 10 | import os.path 11 | import random 12 | import shutil 13 | import signal 14 | import socket 15 | import sys 16 | import tempfile 17 | from threading import Thread 18 | import time 19 | import unittest 20 | from unittest import mock, skip, TestCase 21 | import urllib3 22 | 23 | 24 | # ------------------------------------------------------------------------------ 25 | # Tests 26 | 27 | http = urllib3.PoolManager(retries=0) 28 | 29 | 30 | _HOST = '127.0.0.1' 31 | _PROXY_PORT = 9000 32 | _DEFAULT_DOMAIN_PORT = 9001 33 | _OTHER_DOMAIN_PORT = 9002 34 | 35 | _PROXY_INFO = webcrystal._ProxyInfo(host=_HOST, port=_PROXY_PORT) 36 | 37 | _DEFAULT_DOMAIN = '%s:%s' % (_HOST, _DEFAULT_DOMAIN_PORT) 38 | _OTHER_DOMAIN = '%s:%s' % (_HOST, _OTHER_DOMAIN_PORT) 39 | 40 | _DEFAULT_DOMAIN_AS_IP = _DEFAULT_DOMAIN 41 | _DEFAULT_DOMAIN_AS_DNS = 'localhost:%s' % _DEFAULT_DOMAIN_PORT 42 | 43 | _PROXY_SERVER_URL = 'http://%s:%s' % (_HOST, _PROXY_PORT) 44 | _DEFAULT_SERVER_URL = 'http://%s' % _DEFAULT_DOMAIN 45 | _OTHER_SERVER_URL = 'http://%s' % _OTHER_DOMAIN 46 | 47 | 48 | def forbid_unless_referer_starts_with(required_referer_prefix, ok_response): 49 | def generate_response(path, headers): 50 | referer = {k.lower(): v for (k, v) in headers.items()}.get('referer') 51 | if referer is None or not referer.startswith(required_referer_prefix): 52 | return dict(status_code=403) # Forbidden 53 | else: 54 | return ok_response 55 | 56 | return generate_response 57 | 58 | def 
forbid_unless_user_agent_is(required_user_agent, ok_response_func): 59 | def generate_response(path, headers): 60 | user_agent = {k.lower(): v for (k, v) in headers.items()}.get('user-agent') 61 | if user_agent != required_user_agent: 62 | return dict(status_code=403) # Forbidden 63 | else: 64 | return ok_response_func(path, headers) 65 | 66 | return generate_response 67 | 68 | def modified_long_ago(ok_response): 69 | def generate_response(path, headers): 70 | if_modified_since = {k.lower(): v for (k, v) in headers.items()}.get('if-modified-since') 71 | if if_modified_since is not None: 72 | return dict(status_code=304) # Not Modified 73 | else: 74 | return ok_response 75 | 76 | return generate_response 77 | 78 | def no_weird_headers(ok_response): 79 | def generate_response(path, headers): 80 | has_weird_headers = 'X-Weird-Request-Header' in headers.keys() 81 | if has_weird_headers: 82 | return dict(status_code=400) # Bad Request 83 | else: 84 | return ok_response 85 | 86 | return generate_response 87 | 88 | def on_host(required_host, ok_response): 89 | def generate_response(path, headers): 90 | host = {k.lower(): v for (k, v) in headers.items()}.get('host') 91 | if host is None or host != required_host: 92 | return dict(status_code=404) # Not Found 93 | else: 94 | return ok_response 95 | 96 | return generate_response 97 | 98 | _default_server_counter = -1 99 | 100 | def get_counter(): 101 | def generate_response(path, headers): 102 | global _default_server_counter 103 | return dict( 104 | body=str(_default_server_counter) 105 | ) 106 | 107 | return generate_response 108 | 109 | _expected_request_headers = None 110 | 111 | def expects_certain_request_headers(): 112 | def generate_response(path, headers): 113 | global _expected_request_headers 114 | 115 | matching_request_headers = [k for k in headers.keys() if k in _expected_request_headers] 116 | if matching_request_headers != _expected_request_headers: 117 | return dict( 118 | status_code=400, # Bad Request 119 | body='Expected headers %s but got %s.' % ( 120 | _expected_request_headers, 121 | matching_request_headers 122 | ) 123 | ) 124 | else: 125 | return dict(status_code=200) # OK 126 | 127 | return generate_response 128 | 129 | _response_headers_to_send = None 130 | 131 | def send_certain_response_headers(): 132 | def generate_response(path, headers): 133 | global _response_headers_to_send 134 | 135 | return dict(status_code=200, headers=list(_response_headers_to_send.items())) 136 | 137 | return generate_response 138 | 139 | def sometimes_disconnects(): 140 | def generate_response(path, headers): 141 | global _should_disconnect 142 | 143 | if _should_disconnect: 144 | return '__disconnect__' 145 | else: 146 | return dict(status_code=200) 147 | 148 | return generate_response 149 | 150 | def nice_404_page(): 151 | def generate_response(path, headers): 152 | return dict( 153 | status_code=404, 154 | headers=[('Content-Type', 'text/plain')], 155 | body='No such page was found!' 
156 |         )
157 | 
158 |     return generate_response
159 | 
160 | _DEFAULT_SERVER_RESPONSES = { # like a blog
161 |     '/': dict(
162 |         headers=[('Content-Type', 'text/html')],
163 |         body='Default server'
164 |     ),
165 |     '/posts/': dict(
166 |         headers=[('Content-Type', 'text/html')],
167 |         body='Posts'
168 |     ),
169 |     '/posts/image_no_hotlinking.png': forbid_unless_referer_starts_with(_DEFAULT_SERVER_URL, dict(
170 |         headers=[('Content-Type', 'image/png')],
171 |         body=b''
172 |     )),
173 |     '/posts/image_modified_long_ago.png': modified_long_ago(dict(
174 |         headers=[('Content-Type', 'image/png')],
175 |         body=b''
176 |     )),
177 |     '/api/no_weird_headers': no_weird_headers(dict(
178 |         headers=[('Content-Type', 'application/json')],
179 |         body='{}'
180 |     )),
181 |     '/posts/only_on_localhost.html': on_host(_DEFAULT_DOMAIN_AS_DNS, dict(
182 |         headers=[('Content-Type', 'text/html')],
183 |         body='Most pages will not load if the Host header is wrong.'
184 |     )),
185 |     '/posts/image_no_extension': dict(
186 |         headers=[('Content-Type', 'image/png')],
187 |         body=b''
188 |     ),
189 |     '/posts/super_secret.html': dict(
190 |         headers=[
191 |             ('Content-Type', 'text/html'),
192 |             # NOTE: Normally this header would only be sent over an HTTPS connection.
193 |             ('Strict-Transport-Security', 'max-age=31536000')
194 |         ],
195 |         body='Secret!'
196 |     ),
197 |     '/api/generate_weird_headers': dict(
198 |         headers=[
199 |             ('Content-Type', 'application/json'),
200 |             ('X-Weird-Response-Header', 'boom')
201 |         ],
202 |         body='{}'
203 |     ),
204 |     '/posts/redirect_to_social_network.html': dict(
205 |         status_code=302, # Found
206 |         headers=[('Location', _OTHER_SERVER_URL + '/feed/landing_page_from_blog.html')]
207 |     ),
208 |     '/posts/digits.txt': dict(
209 |         headers=[
210 |             ('Content-Type', 'text/plain'),
211 |             ('Content-Encoding', 'gzip')
212 |         ],
213 |         body=gzip.compress(b'0123456789')
214 |     ),
215 |     '/posts/link_to_social_network.html': dict(
216 |         headers=[('Content-Type', 'text/html')],
217 |         body='<a href="%s">Link</a>' %
218 |             (_OTHER_SERVER_URL + '/feed/landing_page_from_blog.html')
219 |     ),
220 |     '/posts/link_to_social_network_with_same_protocol.html': dict(
221 |         headers=[('Content-Type', 'text/html')],
222 |         body='<a href="%s">Link</a>' %
223 |             ('//' + _OTHER_DOMAIN + '/feed/landing_page_from_blog.html')
224 |     ),
225 |     '/posts/link_to_homepage_with_site_relative_url.html': dict(
226 |         headers=[('Content-Type', 'text/html')],
227 |         body='<a href="%s">Link</a>' %
228 |             ('/')
229 |     ),
230 |     '/posts/link_to_neighboring_post_with_relative_url.html': dict(
231 |         headers=[('Content-Type', 'text/html')],
232 |         body='<a href="%s">Link</a>' %
233 |             ('neighboring_post.html')
234 |     ),
235 |     '/api/get_counter': get_counter(),
236 |     '/api/get_counter_only_chrome': forbid_unless_user_agent_is('Chrome', get_counter()),
237 |     '/api/expects_certain_request_headers': expects_certain_request_headers(),
238 |     '/api/send_certain_response_headers': send_certain_response_headers(),
239 |     '/sometimes_disconnects': sometimes_disconnects(),
240 |     '/404.html': nice_404_page(),
241 | }
242 | 
243 | _OTHER_SERVER_RESPONSES = { # like a social network
244 |     '/': dict(
245 |         headers=[('Content-Type', 'text/html')],
246 |         body='Other server'
247 |     ),
248 |     '/feed/': dict(
249 |         headers=[('Content-Type', 'text/html')],
250 |         body='Feed'
251 |     ),
252 |     '/feed/landing_page_from_blog.html': forbid_unless_referer_starts_with(_DEFAULT_SERVER_URL, dict(
253 |         headers=[('Content-Type', 'text/html')],
254 |         body='Thanks for visiting us from fooblog!'
255 | )), 256 | } 257 | 258 | 259 | class _AbstractEndpointTests(TestCase): 260 | has_default_domain = True 261 | 262 | @classmethod 263 | def setUpClass(cls): 264 | cls._proxy_server = _RealProxyServer(_PROXY_PORT, _DEFAULT_DOMAIN if cls.has_default_domain else None) 265 | cls._default_server = _MockOriginServer(_DEFAULT_DOMAIN_PORT, _DEFAULT_SERVER_RESPONSES) 266 | cls._other_server = _MockOriginServer(_OTHER_DOMAIN_PORT, _OTHER_SERVER_RESPONSES) 267 | 268 | @classmethod 269 | def tearDownClass(cls): 270 | cls._proxy_server.close() 271 | cls._default_server.close() 272 | cls._other_server.close() 273 | 274 | # === Utility: HTTP === 275 | 276 | def _get(self, *args, **kwargs): 277 | return self._request('GET', *args, **kwargs) 278 | 279 | def _head(self, *args, **kwargs): 280 | return self._request('HEAD', *args, **kwargs) 281 | 282 | def _post(self, *args, **kwargs): 283 | return self._request('POST', *args, **kwargs) 284 | 285 | def _request(self, method, path, headers={}, *, allow_redirects=False, cache=False): 286 | final_headers = OrderedDict(headers) # clone 287 | if not cache: 288 | final_headers['Cache-Control'] = 'no-cache' 289 | final_headers['X-Pragma'] = 'no-cache' 290 | 291 | urllib3_response = http.request( 292 | method=method, 293 | url=_PROXY_SERVER_URL + path, 294 | headers=final_headers, 295 | redirect=allow_redirects 296 | ) 297 | return _HttpResponse(urllib3_response) 298 | 299 | 300 | class _HttpResponse: 301 | """ 302 | An HTTP response. 303 | 304 | Simulates the API of the "requests" library, since that's the library that 305 | the test suite was originally written with. 306 | """ 307 | def __init__(self, urllib3_response): 308 | self._urllib3_response = urllib3_response 309 | 310 | @property 311 | def status_code(self): 312 | return self._urllib3_response.status 313 | 314 | @property 315 | def headers(self): 316 | return self._urllib3_response.headers 317 | 318 | @property 319 | def text(self): 320 | return self._urllib3_response.data.decode('utf8') 321 | 322 | @property 323 | def content(self): 324 | return self._urllib3_response.data 325 | 326 | 327 | class CoreEndpointTests(_AbstractEndpointTests): 328 | """ 329 | Acceptance tests for the behavior of the core endpoints: 330 | * GET,HEAD / 331 | * GET,HEAD /_/http[s]/__PATH__ 332 | 333 | And supporting endpoints: 334 | * POST,GET /_online 335 | * POST,GET /_offline 336 | """ 337 | 338 | # === Request Formats === 339 | 340 | # GET/HEAD of /__PATH__ when Referer is omitted 341 | # -> http://__DEFAULT_DOMAIN__/__PATH__ 342 | def test_request_of_unqualified_path_without_referer_reinterprets_with_default_domain(self): 343 | for method in ['GET', 'HEAD', 'POST']: 344 | response = self._request(method, '/posts/', allow_redirects=True) 345 | if method == 'POST': 346 | self.assertEqual(405, response.status_code) 347 | else: 348 | self.assertEqual(200, response.status_code) 349 | self.assertEqual('Posts' if method == 'GET' else '', response.text) 350 | 351 | # GET/HEAD of /__PATH__ when Referer is __OTHER_DOMAIN__ 352 | # -> http://__OTHER_DOMAIN__/__PATH__ 353 | def test_request_of_unqualified_path_with_referer_uses_referer_domain(self): 354 | for method in ['GET', 'HEAD', 'POST']: 355 | response = self._request(method, '/', { 356 | 'Referer': format_proxy_url('http', _OTHER_DOMAIN, '/feed/', proxy_info=_PROXY_INFO) 357 | }, allow_redirects=True) 358 | if method == 'POST': 359 | self.assertEqual(405, response.status_code) 360 | else: 361 | self.assertEqual(200, response.status_code) 362 | self.assertEqual('Other server' 
if method == 'GET' else '', response.text) 363 | 364 | # GET/HEAD of /_/http/__OTHER_DOMAIN__/__PATH__ 365 | # -> http://__OTHER_DOMAIN__/__PATH__ 366 | def test_request_of_qualified_http_path_works(self): 367 | for method in ['GET', 'HEAD', 'POST']: 368 | response = self._request(method, format_proxy_path('http', _OTHER_DOMAIN, '/feed/')) 369 | if method == 'POST': 370 | self.assertEqual(405, response.status_code) 371 | else: 372 | self.assertEqual(200, response.status_code) 373 | self.assertEqual('Feed' if method == 'GET' else '', response.text) 374 | 375 | # GET/HEAD of /_/https/__DOMAIN__/__PATH__ 376 | # -> https://__DOMAIN__/__PATH__ 377 | @skip('not yet automated') 378 | def test_request_of_qualified_https_path_works(self): 379 | # TODO: Implement. It's a pain. Maybe the following will help: 380 | # http://code.activestate.com/recipes/442473-simple-http-server-supporting-ssl-secure-communica/ 381 | pass 382 | 383 | # === Request Header Processing: Client -> Proxy -> Server === 384 | 385 | # Allows Request Header: User-Agent, Referer 386 | def test_allows_certain_headers_when_forwarding_request_to_server(self): 387 | response = self._get( 388 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/image_no_hotlinking.png'), 389 | {'Referer': _DEFAULT_SERVER_URL + '/posts/'}) 390 | self.assertEqual(200, response.status_code) 391 | 392 | response = self._get( 393 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/image_no_hotlinking.png'), 394 | {}) 395 | self.assertEqual(403, response.status_code) 396 | 397 | # Blocks Request Header: If-Modified-Since 398 | def test_blocks_certain_headers_when_forwarding_request_to_server(self): 399 | response = self._get( 400 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/image_modified_long_ago.png'), 401 | {}) 402 | self.assertEqual(200, response.status_code) 403 | 404 | response = self._get( 405 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/image_modified_long_ago.png'), 406 | {'If-Modified-Since': 'Sat, 29 Oct 1994 19:43:31 GMT'}) 407 | self.assertEqual(200, response.status_code) # blocked, != 304 408 | 409 | # Blocks Request Header: X-Weird-Request-Header 410 | def test_blocks_unknown_headers_when_forwarding_request_to_server(self): 411 | response = self._get( 412 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/no_weird_headers'), 413 | {}) 414 | self.assertEqual(200, response.status_code) 415 | 416 | response = self._get( 417 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/no_weird_headers'), 418 | {'X-Weird-Request-Header': 'boom'}) 419 | self.assertEqual(200, response.status_code) # blocked, != 400 420 | 421 | # Rewrites Request Header: Host 422 | def test_rewrites_host_header_when_forwarding_request_to_server(self): 423 | response = self._get( 424 | format_proxy_path('http', _DEFAULT_DOMAIN_AS_DNS, '/posts/only_on_localhost.html')) 425 | self.assertEqual(200, response.status_code) 426 | 427 | response = self._get( 428 | format_proxy_path('http', _DEFAULT_DOMAIN_AS_IP, '/posts/only_on_localhost.html')) 429 | self.assertEqual(404, response.status_code) 430 | 431 | # Rewrites Request Header: Referer 432 | def test_rewrites_referer_header_when_forwarding_request_to_server(self): 433 | # ...when coming from http://__PROXY_DOMAIN__/__PATH__ 434 | response = self._get( 435 | format_proxy_path('http', _OTHER_DOMAIN, '/feed/landing_page_from_blog.html'), 436 | {'Referer': _PROXY_SERVER_URL + '/posts/redirect_to_social_network.html'}) 437 | self.assertEqual(200, response.status_code) 438 | 439 | # ...when coming from 
http://__PROXY_DOMAIN__/_/http/__DEFAULT_DOMAIN__/__PATH__ 440 | response = self._get( 441 | format_proxy_path('http', _OTHER_DOMAIN, '/feed/landing_page_from_blog.html'), 442 | {'Referer': format_proxy_url('http', _DEFAULT_DOMAIN, '/posts/redirect_to_social_network.html', proxy_info=_PROXY_INFO)}) 443 | self.assertEqual(200, response.status_code) 444 | 445 | # === Response Header Processing: Client <- Proxy <- Server === 446 | 447 | # Allows Response Header: Content-Type 448 | def test_allows_certain_headers_when_returning_response_from_server(self): 449 | response = self._get( 450 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/image_no_extension')) 451 | self.assertEqual(200, response.status_code) 452 | self.assertEqual('image/png', response.headers['Content-Type']) 453 | 454 | # Blocks Response Header: Strict-Transport-Security 455 | def test_blocks_certain_headers_when_returning_response_from_server(self): 456 | response = self._get( 457 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/super_secret.html')) 458 | self.assertEqual(200, response.status_code) 459 | self.assertNotIn('Strict-Transport-Security', response.headers) 460 | 461 | # Blocks Response Header: X-Weird-Response-Header 462 | def test_blocks_unknown_headers_when_returning_response_from_server(self): 463 | response = self._get( 464 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/generate_weird_headers')) 465 | self.assertEqual(200, response.status_code) 466 | self.assertNotIn('X-Weird-Response-Header', response.headers) 467 | 468 | # Blocks Response Header: X-Status-Code 469 | def test_blocks_internal_headers_when_returning_response_from_server(self): 470 | response = self._get( 471 | format_proxy_path('http', _DEFAULT_DOMAIN, '/')) 472 | self.assertEqual(200, response.status_code) 473 | self.assertNotIn('X-Status-Code', response.headers) 474 | 475 | # Rewrites Response Header: Location 476 | def test_rewrites_location_header_when_returning_response_from_server(self): 477 | response = self._get( 478 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/redirect_to_social_network.html')) 479 | self.assertEqual(302, response.status_code) # Found 480 | self.assertEqual( 481 | format_proxy_url('http', _OTHER_DOMAIN, '/feed/landing_page_from_blog.html', proxy_info=_PROXY_INFO), 482 | response.headers['Location']) 483 | 484 | # Rewrites Response Header: Content-Length (if Content-Encoding is gzip or similar) 485 | def test_rewrites_content_length_header_when_returning_compressed_response_from_server(self): 486 | response = self._get( 487 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/digits.txt')) 488 | self.assertEqual(200, response.status_code) 489 | self.assertEqual(b'0123456789', response.content) 490 | 491 | # NOTE: Presently the proxy never serves compressed responses to the client. 492 | # This may change in the future. 
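        # Because the proxy decompresses the body before serving it,
        # Content-Length must describe the decompressed payload
        # (b'0123456789' -> 10 bytes), not the gzipped body the origin sent.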
493 | self.assertNotIn('Content-Encoding', response.headers) 494 | self.assertEqual('10', response.headers['Content-Length']) 495 | 496 | # === Response Content Processing: Client <- Proxy <- Server === 497 | 498 | # Rewrites Response Content: absolute URLs 499 | def test_rewrites_absolute_urls_in_content_when_returning_response_from_server(self): 500 | response = self._get( 501 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/link_to_social_network.html')) 502 | self.assertEqual(200, response.status_code) 503 | self.assertIn( 504 | format_proxy_url('http', _OTHER_DOMAIN, '/feed/landing_page_from_blog.html', proxy_info=_PROXY_INFO), 505 | response.text) 506 | 507 | # Rewrites Response Content: protocol-relative URLs 508 | def test_rewrites_protocol_relative_urls_in_content_when_returning_response_from_server(self): 509 | response = self._get( 510 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/link_to_social_network_with_same_protocol.html')) 511 | self.assertEqual(200, response.status_code) 512 | self.assertIn( 513 | format_proxy_url('http', _OTHER_DOMAIN, '/feed/landing_page_from_blog.html', proxy_info=_PROXY_INFO), 514 | response.text) 515 | 516 | # Retains Response Content: site-relative URLs 517 | # NOTE: Might rewrite these URLs in the future. 518 | def test_retains_site_relative_urls_in_content_when_returning_response_from_server(self): 519 | response = self._get( 520 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/link_to_homepage_with_site_relative_url.html')) 521 | self.assertEqual(200, response.status_code) 522 | self.assertIn('"/"', response.text) 523 | 524 | # Retains Response Content: relative URLs 525 | # NOTE: Might rewrite these URLs in the future. 526 | def test_retains_relative_urls_in_content_when_returning_response_from_server(self): 527 | response = self._get( 528 | format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/link_to_neighboring_post_with_relative_url.html')) 529 | self.assertEqual(200, response.status_code) 530 | self.assertIn('"neighboring_post.html"', response.text) 531 | 532 | # === Header Order Preservation === 533 | 534 | def test_sends_request_headers_in_same_order_as_client(self): 535 | global _expected_request_headers 536 | 537 | SAFE_REQUEST_HEADERS = [ 538 | h for h in webcrystal._REQUEST_HEADER_WHITELIST 539 | if h not in ['host', 'referer'] 540 | ] 541 | 542 | for i in range(5): 543 | headers = list(SAFE_REQUEST_HEADERS) # clone 544 | if i != 0: 545 | random.shuffle(headers) 546 | 547 | _expected_request_headers = headers # export 548 | 549 | response = self._get( 550 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/expects_certain_request_headers'), 551 | OrderedDict([(k, 'ignoreme') for k in headers])) 552 | self.assertEqual(200, response.status_code, response.text) 553 | 554 | def test_sends_response_headers_in_same_order_as_origin_server(self): 555 | global _response_headers_to_send 556 | 557 | SAFE_RESPONSE_HEADERS = [ 558 | h for h in webcrystal._RESPONSE_HEADER_WHITELIST 559 | if h.startswith('x-') 560 | ] 561 | 562 | for i in range(5): 563 | headers = list(SAFE_RESPONSE_HEADERS) # clone 564 | if i != 0: 565 | random.shuffle(headers) 566 | 567 | _response_headers_to_send = \ 568 | OrderedDict([(k, 'ignoreme') for k in headers]) # export 569 | 570 | response = self._get( 571 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/send_certain_response_headers')) 572 | self.assertEqual(200, response.status_code) 573 | 574 | matching_response_headers = \ 575 | [k for k in response.headers.keys() if k in _response_headers_to_send] 576 
| self.assertEqual(headers, matching_response_headers) 577 | 578 | # === Online vs. Offline === 579 | 580 | def test_returns_archived_response_by_default_if_available(self): 581 | global _default_server_counter 582 | 583 | _default_server_counter = 1 584 | response = self._get( 585 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/get_counter'), 586 | cache=False) 587 | self.assertEqual(200, response.status_code) 588 | self.assertEqual('1', response.text) 589 | 590 | _default_server_counter = 2 591 | response = self._get( 592 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/get_counter'), 593 | cache=True) 594 | self.assertEqual(200, response.status_code) 595 | self.assertEqual('1', response.text) # should be stale 596 | 597 | # [Cache-Control: no-cache] should disable cache on a per-request basis 598 | def test_always_returns_fresh_response_if_cache_disabled(self): 599 | global _default_server_counter 600 | 601 | self.test_returns_archived_response_by_default_if_available() 602 | 603 | _default_server_counter = 3 604 | response = self._get( 605 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/get_counter'), 606 | cache=False) 607 | self.assertEqual(200, response.status_code) 608 | self.assertEqual('3', response.text) # should be fresh 609 | 610 | def test_fetch_of_archived_resource_in_offline_mode_returns_the_resource(self): 611 | for starline_method in ['POST', 'GET']: 612 | response = self._get( 613 | format_proxy_path('http', _DEFAULT_DOMAIN, '/'), 614 | cache=False) 615 | self.assertEqual(200, response.status_code) 616 | self.assertIn('Default server', response.text) 617 | 618 | self._go_offline(method=starline_method) 619 | try: 620 | response = self._get( 621 | format_proxy_path('http', _DEFAULT_DOMAIN, '/'), 622 | cache=True) 623 | self.assertEqual(200, response.status_code) 624 | self.assertIn('Default server', response.text) 625 | finally: 626 | self._go_online(method=starline_method) 627 | 628 | def test_fetch_of_unarchived_resource_in_offline_mode_returns_http_503_with_link(self): 629 | for starline_method in ['POST', 'GET']: 630 | response = self._get( 631 | format_proxy_path('http', _DEFAULT_DOMAIN, '/', 632 | command='_delete'), 633 | cache=False) 634 | self.assertIn(response.status_code, [200, 404]) 635 | 636 | self._go_offline(method=starline_method) 637 | try: 638 | response = self._get( 639 | format_proxy_path('http', _DEFAULT_DOMAIN, '/'), 640 | cache=True) 641 | self.assertEqual(503, response.status_code) 642 | self.assertIn('"http://%s/"' % _DEFAULT_DOMAIN, response.text) 643 | finally: 644 | self._go_online(method=starline_method) 645 | 646 | def test_cannot_go_online_with_invalid_method(self): 647 | self._go_online(method='HEAD') 648 | 649 | def test_cannot_go_offline_with_invalid_method(self): 650 | try: 651 | self._go_offline(method='HEAD') 652 | finally: 653 | self._go_online() 654 | 655 | # === Misc === 656 | 657 | def test_invalid_command_is_rejected(self): 658 | response = self._get('/_bogus/') 659 | self.assertEqual(400, response.status_code) # Bad Request 660 | 661 | def test_fetch_of_invalid_proxy_url_returns_bad_request(self): 662 | response = self._get('/_/bogus_url') 663 | self.assertEqual(400, response.status_code) # Bad Request 664 | 665 | def test_fetch_of_unreachable_origin_server_returns_http_502(self): 666 | response = self._get('/_/http/nosuchsite-really.com/') 667 | self.assertEqual(502, response.status_code) # Bad Gateway 668 | self.assertIn('"http://nosuchsite-really.com/"', response.text) 669 | 670 | def test_head_works(self): 671 | 
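        # A HEAD request should yield the same headers as the corresponding
        # GET request, but with an empty body.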
        response = self._head(format_proxy_path('http', _DEFAULT_DOMAIN, '/'))
672 |         self.assertEqual('text/html', response.headers['Content-Type'])
673 | 
674 |     # === Utility: Commands ===
675 | 
676 |     def _go_online(self, *, method='POST'):
677 |         response = self._request(method, '/_online')
678 |         self.assertEqual(200 if method in ['POST', 'GET'] else 405, response.status_code)
679 | 
680 |     def _go_offline(self, *, method='POST'):
681 |         response = self._request(method, '/_offline')
682 |         self.assertEqual(200 if method in ['POST', 'GET'] else 405, response.status_code)
683 | 
684 | 
685 | class CoreEndpointTests2(_AbstractEndpointTests):
686 |     """
687 |     Subset of the core endpoint tests that check behavior when there is no
688 |     default origin domain.
689 |     """
690 | 
691 |     has_default_domain = False
692 | 
693 |     # === Request Formats ===
694 | 
695 |     # GET/HEAD of /__PATH__ when Referer is omitted
696 |     # -> HTTP 404
697 |     def test_request_of_unqualified_path_without_referer_returns_404_if_no_default_domain(self):
698 |         for method in ['GET', 'HEAD', 'POST']:
699 |             response = self._request(method, '/posts/', allow_redirects=True)
700 |             if method == 'POST':
701 |                 self.assertEqual(405, response.status_code)
702 |             else:
703 |                 self.assertEqual(404, response.status_code)
704 | 
705 | 
706 | class RawEndpointTests(_AbstractEndpointTests):
707 |     """
708 |     Acceptance tests for the raw endpoint:
709 |     * GET,HEAD /_raw/http[s]/__PATH__
710 | 
711 |     This endpoint exists primarily so that scrapers built on top of webcrystal
712 |     can access the raw content of an archive without causing an implicit fetch
713 |     of missing content, as would be the case with the core /_/ endpoint.
714 |     """
715 | 
716 |     def test_request_of_resource_in_archive_returns_original_resource_verbatim(self):
717 |         ORIGINAL_RESOURCE = _DEFAULT_SERVER_RESPONSES['/posts/link_to_social_network.html']
718 | 
719 |         response = self._get(
720 |             format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/link_to_social_network.html'))
721 |         self.assertEqual(200, response.status_code)
722 | 
723 |         response = self._get(
724 |             format_proxy_path('http', _DEFAULT_DOMAIN, '/posts/link_to_social_network.html',
725 |                 command='_raw'))
726 |         self.assertEqual(200, response.status_code)
727 |         self.assertEqual(ORIGINAL_RESOURCE['body'], response.text)
728 |         self.assertEqual(
729 |             OrderedDict(ORIGINAL_RESOURCE['headers']),
730 |             self._remove_automatically_added_headers(
731 |                 OrderedDict(response.headers)))
732 | 
733 |     def test_request_of_resource_not_in_archive_returns_http_503_with_link(self):
734 |         response = self._get(
735 |             format_proxy_path('http', _DEFAULT_DOMAIN, '/not_in_archive',
736 |                 command='_raw'))
737 |         self.assertEqual(503, response.status_code)
738 |         self.assertIn('"http://%s/not_in_archive"' % _DEFAULT_DOMAIN, response.text)
739 | 
740 |     def test_request_of_404_in_archive_returns_404(self):
741 |         response = self._get(
742 |             format_proxy_path('http', _DEFAULT_DOMAIN, '/404.html'))
743 |         self.assertEqual(404, response.status_code)
744 |         self.assertEqual('No such page was found!', response.text)
745 | 
746 |         response = self._get(
747 |             format_proxy_path('http', _DEFAULT_DOMAIN, '/404.html',
748 |                 command='_raw'))
749 |         self.assertEqual(404, response.status_code)
750 |         self.assertEqual('No such page was found!', response.text)
751 | 
752 |     @skip('not yet automated')
753 |     def test_should_not_passthrough_connection_specific_headers(self):
754 |         # In particular the 'Content-Encoding' header, if it was received
755 |         # from the origin server, should not be returned from the
_raw endpoint. 756 | pass 757 | 758 | def test_cannot_use_invalid_method(self): 759 | response = self._request('POST', 760 | format_proxy_path('http', _DEFAULT_DOMAIN, '/', 761 | command='_raw')) 762 | self.assertEqual(405, response.status_code) 763 | 764 | # Removes headers added automatically by http.server (the underlying 765 | # server used by the MockOriginServer). They are hard to remove without 766 | # monkeypatching. So just ignore them. 767 | def _remove_automatically_added_headers(self, headers): 768 | headers = headers.copy() 769 | for hn in ['Server', 'Date', 'Content-Length']: 770 | if hn in headers: 771 | del headers[hn] 772 | return headers 773 | 774 | 775 | class RefreshEndpointTests(_AbstractEndpointTests): 776 | """ 777 | Acceptance tests for the refresh endpoint: 778 | * POST,GET /_refresh/http[s]/__PATH__ 779 | 780 | This endpoint mainly exists to prove that webcrystal persists the original 781 | request headers for a previously fetched URL. 782 | """ 783 | 784 | # === Refresh === 785 | 786 | def test_can_refresh_resource_without_resending_request_headers(self): 787 | for method in ['POST', 'GET']: 788 | global _default_server_counter 789 | 790 | _default_server_counter = 1 791 | response = self._get( 792 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/get_counter_only_chrome'), 793 | {'User-Agent': 'Chrome'}) 794 | self.assertEqual(200, response.status_code) 795 | self.assertEqual('1', response.text) 796 | 797 | _default_server_counter = 2 798 | response = self._request(method, 799 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/get_counter_only_chrome', 800 | command='_refresh')) 801 | self.assertEqual(200, response.status_code) 802 | self.assertEqual('', response.text) 803 | 804 | response = self._get( 805 | format_proxy_path('http', _DEFAULT_DOMAIN, '/api/get_counter_only_chrome'), 806 | cache=True) 807 | self.assertEqual(200, response.status_code) 808 | self.assertEqual('2', response.text) 809 | 810 | def test_cannot_refresh_unarchived_resource(self): 811 | response = self._post( 812 | format_proxy_path('http', _DEFAULT_DOMAIN, '/never_archived', 813 | command='_refresh')) 814 | self.assertEqual(404, response.status_code) 815 | 816 | def test_cannot_refresh_resource_with_invalid_method(self): 817 | response = self._request('HEAD', 818 | format_proxy_path('http', _DEFAULT_DOMAIN, '/', 819 | command='_refresh')) 820 | self.assertEqual(405, response.status_code) 821 | 822 | def test_refresh_of_unreachable_origin_server_returns_http_502(self): 823 | global _should_disconnect 824 | 825 | _should_disconnect = False 826 | response = self._get( 827 | format_proxy_path('http', _DEFAULT_DOMAIN, '/sometimes_disconnects')) 828 | self.assertEqual(200, response.status_code) 829 | 830 | _should_disconnect = True 831 | response = self._post( 832 | format_proxy_path('http', _DEFAULT_DOMAIN, '/sometimes_disconnects', 833 | command='_refresh')) 834 | self.assertEqual(502, response.status_code) # Bad Gateway 835 | 836 | 837 | class ModuleImportTests(TestCase): 838 | """ 839 | Acceptance tests related to the behavior of importing "webcrystal", 840 | particularly as related to dependencies available in the environment. 841 | """ 842 | 843 | def test_missing_urllib3_gives_nice_error_message(self): 844 | with mock.patch.dict('sys.modules', {'urllib3': None}): 845 | del sys.modules['webcrystal'] # unimport 846 | try: 847 | import webcrystal 848 | except ImportError as e: 849 | self.assertIn('webcrystal requires urllib3. 
Try: pip3 install urllib3', str(e)) 850 | else: 851 | self.fail() 852 | 853 | def test_unsupported_python_version_gives_nice_error_message(self): 854 | old_version_info = sys.version_info 855 | try: 856 | sys.version_info = (2, 7) 857 | 858 | del sys.modules['webcrystal'] # unimport 859 | try: 860 | import webcrystal 861 | except ImportError as e: 862 | self.assertIn('webcrystal requires Python 3.4 or later.', str(e)) 863 | else: 864 | self.fail() 865 | finally: 866 | sys.version_info = old_version_info 867 | 868 | def test_imports_when_missing_pyopenssl(self): 869 | with mock.patch.dict('sys.modules', {'urllib3.contrib.pyopenssl': None}): 870 | del sys.modules['webcrystal'] # unimport 871 | import webcrystal 872 | 873 | def test_imports_when_missing_certifi(self): 874 | with mock.patch.dict('sys.modules', {'certifi': None}): 875 | del sys.modules['webcrystal'] # unimport 876 | import webcrystal 877 | 878 | 879 | # ------------------------------------------------------------------------------ 880 | # Real Proxy Server 881 | 882 | class _RealProxyServer: 883 | def __init__(self, port, default_origin_domain): 884 | self._port = port 885 | 886 | archive_dirpath = os.path.join( 887 | tempfile.mkdtemp(prefix='webcrystal_test_archive'), 888 | 'default_origin.wbcr') 889 | 890 | args = ['--quiet', str(port), archive_dirpath,] 891 | if default_origin_domain is not None: 892 | args.append(default_origin_domain) 893 | 894 | self._process = Process(target=webcrystal.main, args=(args,)) 895 | self._process.start() 896 | 897 | wait_until_port_not_open('127.0.0.1', port) 898 | 899 | def close(self): 900 | # Send Control-C to the process to bring it down gracefully 901 | # NOTE: Graceful shutdown is required in order to collect 902 | # code coverage metrics properly. 
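        # (SIGINT surfaces as KeyboardInterrupt inside webcrystal.main(),
        # which then shuts the HTTP server and archive down normally,
        # letting coverage.py's multiprocessing support write its data file.)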
903 | os.kill(self._process.pid, signal.SIGINT) 904 | 905 | wait_until_port_open('127.0.0.1', self._port) 906 | 907 | 908 | # ------------------------------------------------------------------------------ 909 | # Mock Origin Server 910 | 911 | 912 | class _MockOriginServer: 913 | def __init__(self, port, responses): 914 | self._port = port 915 | 916 | def create_request_handler(*args): 917 | nonlocal responses 918 | return _TestServerHttpRequestHandler(*args, responses=responses) 919 | 920 | self._httpd = HTTPServer(('', port), create_request_handler) 921 | 922 | # NOTE: Use a low poll interval so that shutdown() completes quickly 923 | thread = Thread(target=lambda: self._httpd.serve_forever(poll_interval=50/1000)) 924 | thread.start() 925 | 926 | wait_until_port_not_open('127.0.0.1', port) 927 | 928 | def close(self): 929 | self._httpd.shutdown() 930 | self._httpd.socket.close() 931 | 932 | assert is_port_open('127.0.0.1', self._port) 933 | 934 | class _TestServerHttpRequestHandler(BaseHTTPRequestHandler): 935 | def __init__(self, *args, responses): 936 | self._responses = responses 937 | super().__init__(*args) 938 | 939 | def do_HEAD(self): 940 | f = self._send_head() 941 | f.close() 942 | 943 | def do_GET(self): 944 | f = self._send_head() 945 | try: 946 | shutil.copyfileobj(f, self.wfile) 947 | finally: 948 | f.close() 949 | 950 | def _send_head(self): 951 | response = self._responses.get(self.path) 952 | if response is None: 953 | self.send_response(404) # Not Found 954 | self.end_headers() 955 | return BytesIO(b'') 956 | 957 | # Compute response if it is dynamic 958 | if callable(response): 959 | response = response(self.path, self.headers) 960 | 961 | # Disconnect abruptly if requested to 962 | if response == '__disconnect__': 963 | return BytesIO(b'') 964 | 965 | # Send header 966 | self.send_response(response.get('status_code', 200)) 967 | for (k, v) in response.get('headers', []): 968 | self.send_header(k, v) 969 | self.end_headers() 970 | 971 | # Prepare to send body 972 | response_body = response.get('body', b'') 973 | if isinstance(response_body, str): 974 | response_body = response_body.encode('utf8') 975 | return BytesIO(response_body) 976 | 977 | def log_message(self, *args): 978 | pass # operate silently 979 | 980 | 981 | # ------------------------------------------------------------------------------ 982 | # Utility 983 | 984 | def wait_until_port_not_open(hostname, port): 985 | while is_port_open(hostname, port): 986 | time.sleep(20/1000) 987 | 988 | 989 | def wait_until_port_open(hostname, port): 990 | while not is_port_open(hostname, port): 991 | time.sleep(20/1000) 992 | 993 | 994 | def is_port_open(hostname, port): 995 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 996 | try: 997 | result = s.connect_ex((hostname, port)) 998 | if result == 0: 999 | return False 1000 | else: 1001 | return True 1002 | finally: 1003 | s.close() 1004 | 1005 | 1006 | # ------------------------------------------------------------------------------ 1007 | 1008 | if __name__ == '__main__': 1009 | unittest.main() 1010 | -------------------------------------------------------------------------------- /webcrystal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | webcrystal is: 4 | 5 | 1. An HTTP proxy and web service that saves every web page accessed through it to disk. 6 | 2. An on-disk archival format for storing websites. 7 | 8 | webcrystal is intended as a tool for archiving websites. 
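It is also intended to be convenient to write HTTP-based and browser-based web scrapers on top of.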
9 | 10 | See the README for more information. 11 | """ 12 | 13 | import argparse 14 | import atexit 15 | from collections import namedtuple, OrderedDict 16 | import html 17 | from http.server import HTTPServer, BaseHTTPRequestHandler 18 | from io import BytesIO 19 | import json 20 | import os.path 21 | import re 22 | import shutil 23 | from socketserver import ThreadingMixIn 24 | import sys 25 | from threading import Lock 26 | 27 | if not (sys.version_info >= (3, 4)): 28 | raise ImportError('webcrystal requires Python 3.4 or later.') 29 | 30 | try: 31 | import urllib3 32 | except ImportError: 33 | raise ImportError('webcrystal requires urllib3. Try: pip3 install urllib3') 34 | 35 | # Use PyOpenSSL if it is available. 36 | # 37 | # This allows most HTTPS connections to succeed on operating systems 38 | # like OS X 10.11 that ship with such old versions of OpenSSL that most 39 | # HTTPS connections are dropped. 40 | try: 41 | # Requires: pip3 install pyopenssl ndg-httpsclient pyasn1 42 | import urllib3.contrib.pyopenssl 43 | except ImportError: 44 | pass 45 | else: 46 | urllib3.contrib.pyopenssl.inject_into_urllib3() 47 | 48 | # Force HTTPS certificate validation with certifi if it is available. 49 | try: 50 | import certifi 51 | except ImportError: 52 | _pool_manager_kwargs = dict() 53 | else: 54 | _pool_manager_kwargs = dict( 55 | cert_reqs='CERT_REQUIRED', # Force certificate check. 56 | ca_certs=certifi.where(), # Path to the Certifi bundle. 57 | ) 58 | 59 | # ============================================================================== 60 | # Service 61 | 62 | 63 | _http = urllib3.PoolManager(retries=0, **_pool_manager_kwargs) 64 | 65 | 66 | def main(raw_cli_args): 67 | # Parse arguments 68 | parser = argparse.ArgumentParser( 69 | description='An archiving HTTP proxy and web service.', 70 | add_help=False) 71 | parser.add_argument('-h', '--help', action='help', 72 | help='Show this help message and exit.') 73 | parser.add_argument('-q', '--quiet', action='store_true', dest='is_quiet', 74 | help='Suppresses all output.') 75 | parser.add_argument('port', type=int, 76 | help='Port on which to run the HTTP proxy. Suggest 9227 (WBCR).') 77 | parser.add_argument('archive_dirpath', 78 | help='Path to the archive directory. 
Usually has .wbcr extension.') 79 | parser.add_argument('default_origin_domain', nargs='?', type=_domain, 80 | help='Default HTTP domain which the HTTP proxy will redirect to if no URL is specified.') 81 | cli_args = parser.parse_args(raw_cli_args) 82 | 83 | proxy_info = _ProxyInfo(host='127.0.0.1', port=cli_args.port) 84 | 85 | # Open archive 86 | archive = HttpResourceArchive(cli_args.archive_dirpath) 87 | try: 88 | atexit.register(lambda: archive.close()) # last resort 89 | 90 | # ProxyState -- is mutable and threadsafe 91 | proxy_state = { 92 | 'is_online': True 93 | } 94 | 95 | def create_request_handler(*args): 96 | return _ArchivingHTTPRequestHandler(*args, 97 | archive=archive, 98 | proxy_info=proxy_info, 99 | default_origin_domain=cli_args.default_origin_domain, 100 | is_quiet=cli_args.is_quiet, 101 | proxy_state=proxy_state) 102 | 103 | # Run service until user presses ^C 104 | if not cli_args.is_quiet: 105 | print('Listening on %s:%s' % (proxy_info.host, proxy_info.port)) 106 | httpd = _ThreadedHttpServer( 107 | (proxy_info.host, proxy_info.port), 108 | create_request_handler) 109 | try: 110 | httpd.serve_forever() 111 | except KeyboardInterrupt: 112 | pass 113 | finally: 114 | httpd.server_close() 115 | finally: 116 | archive.close() 117 | 118 | 119 | def _domain(domain_descriptor): 120 | m = re.search(r'^(?:(https?)://)?([^/]+)/?$', domain_descriptor) 121 | if m is None: 122 | raise argparse.ArgumentTypeError( 123 | '%r must look like %r or %r' % 124 | (domain_descriptor, 'xkcd.com', 'http://xkcd.com/')) 125 | (protocol, domain) = m.groups() 126 | if protocol == 'https': 127 | raise argparse.ArgumentTypeError( 128 | 'The %r protocol is not supported for the default origin domain. Try %r instead.' % 129 | ('https', 'http')) 130 | return domain 131 | 132 | 133 | _ProxyInfo = namedtuple('_ProxyInfo', ['host', 'port']) 134 | 135 | 136 | class _ThreadedHttpServer(ThreadingMixIn, HTTPServer): 137 | pass 138 | 139 | 140 | class _ArchivingHTTPRequestHandler(BaseHTTPRequestHandler): 141 | """ 142 | HTTP request handler that serves requests from an HttpResourceArchive. 143 | When a resource is requested that isn't in the archive, it will be added 144 | to the archive automatically. 
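    
    For example (illustrative): a GET of /_/http/xkcd.com/ serves the archived
    copy of http://xkcd.com/, first fetching and archiving it from the origin
    server if no copy exists yet.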
145 | """ 146 | 147 | def __init__(self, *args, archive, proxy_info, default_origin_domain, is_quiet, proxy_state): 148 | self._archive = archive 149 | self._proxy_info = proxy_info 150 | self._default_origin_domain = default_origin_domain 151 | self._is_quiet = is_quiet 152 | self._proxy_state = proxy_state 153 | super().__init__(*args) 154 | 155 | def do_HEAD(self): 156 | f = self._send_head(method='HEAD') 157 | f.close() 158 | 159 | def do_GET(self): 160 | f = self._send_head(method='GET') 161 | try: 162 | shutil.copyfileobj(f, self.wfile) 163 | finally: 164 | f.close() 165 | 166 | def do_POST(self): 167 | f = self._send_head(method='POST') 168 | try: 169 | shutil.copyfileobj(f, self.wfile) 170 | finally: 171 | f.close() 172 | 173 | def _send_head(self, *, method): 174 | try: 175 | if self.path.startswith('/_') and not self.path.startswith('/_/'): 176 | return self._send_head_for_special_request(method=method) 177 | else: 178 | return self._send_head_for_regular_request(method=method) 179 | except Exception as e: 180 | # Annotate exception with offending URL and method 181 | raise Exception('Problem while serving %s of %s: %s' % (method, self.path, e)) from e 182 | 183 | def _send_head_for_special_request(self, *, method): 184 | if self.path == '/_online': 185 | if method not in ['POST', 'GET']: 186 | return self._send_head_for_simple_response(405) # Method Not Allowed 187 | 188 | self._proxy_state['is_online'] = True 189 | 190 | self.send_response(200) # OK 191 | self.send_header('Content-Type', 'text/plain') 192 | self.end_headers() 193 | return BytesIO(b'OK') 194 | 195 | elif self.path == '/_offline': 196 | if method not in ['POST', 'GET']: 197 | return self._send_head_for_simple_response(405) # Method Not Allowed 198 | 199 | self._proxy_state['is_online'] = False 200 | 201 | self.send_response(200) # OK 202 | self.send_header('Content-Type', 'text/plain') 203 | self.end_headers() 204 | return BytesIO(b'OK') 205 | 206 | elif self.path.startswith('/_raw/'): 207 | if method not in ['GET', 'HEAD']: 208 | return self._send_head_for_simple_response(405) # Method Not Allowed 209 | 210 | parsed_request_url = _try_parse_client_request_path(self.path, self._default_origin_domain) 211 | assert parsed_request_url is not None 212 | request_url = '%s://%s%s' % ( 213 | parsed_request_url.protocol, 214 | parsed_request_url.domain, 215 | parsed_request_url.path 216 | ) 217 | 218 | resource = self._archive.get(request_url) 219 | if resource is None: 220 | self.send_response(503) # Service Unavailable 221 | self.send_header('Content-Type', 'text/html') 222 | self.end_headers() 223 | 224 | return BytesIO( 225 | (('Resource %s is not archived') % 226 | (html.escape(request_url), html.escape(request_url)) 227 | ).encode('utf8') 228 | ) 229 | else: 230 | return self._send_head_for_resource(resource, filter=False) 231 | 232 | elif self.path.startswith('/_delete/'): 233 | if method not in ['POST', 'GET']: 234 | return self._send_head_for_simple_response(405) # Method Not Allowed 235 | 236 | parsed_request_url = _try_parse_client_request_path(self.path, self._default_origin_domain) 237 | assert parsed_request_url is not None 238 | request_url = '%s://%s%s' % ( 239 | parsed_request_url.protocol, 240 | parsed_request_url.domain, 241 | parsed_request_url.path 242 | ) 243 | 244 | did_exist = self._archive.delete(request_url) 245 | if did_exist: 246 | return self._send_head_for_simple_response(200) # OK 247 | else: 248 | return self._send_head_for_simple_response(404) # Not Found 249 | 250 | elif 
self.path.startswith('/_refresh/'): 251 | if method not in ['POST', 'GET']: 252 | return self._send_head_for_simple_response(405) # Method Not Allowed 253 | 254 | parsed_request_url = _try_parse_client_request_path(self.path, self._default_origin_domain) 255 | assert parsed_request_url is not None 256 | request_url = '%s://%s%s' % ( 257 | parsed_request_url.protocol, 258 | parsed_request_url.domain, 259 | parsed_request_url.path 260 | ) 261 | 262 | request_headers = self._archive.get_request_headers(request_url) 263 | if request_headers is None: 264 | return self._send_head_for_simple_response(404) # Not Found 265 | 266 | try: 267 | resource = self._fetch_from_origin_and_store_in_archive( 268 | request_url, request_headers, 269 | parsed_request_url=parsed_request_url) 270 | except _OriginServerError as e: 271 | return self._send_head_for_origin_server_error(e) 272 | else: 273 | resource.content.close() 274 | 275 | return self._send_head_for_simple_response(200) # OK 276 | 277 | else: 278 | return self._send_head_for_simple_response(400) # Bad Request 279 | 280 | def _send_head_for_regular_request(self, *, method): 281 | if method not in ['GET', 'HEAD']: 282 | return self._send_head_for_simple_response(405) # Method Not Allowed 283 | 284 | canonical_request_headers = {k.lower(): v for (k, v) in self.headers.items()} # cache 285 | 286 | parsed_request_url = _try_parse_client_request_path(self.path, self._default_origin_domain) 287 | if parsed_request_url is None: 288 | return self._send_head_for_simple_response(400) # Bad Request 289 | assert parsed_request_url.command == '_' 290 | 291 | request_referer = canonical_request_headers.get('referer') 292 | parsed_referer = \ 293 | None if request_referer is None \ 294 | else _try_parse_client_referer(request_referer, self._default_origin_domain) 295 | 296 | # Received a request at a site-relative path? 297 | # Redirect to a fully qualified proxy path at the appropriate domain. 298 | if not parsed_request_url.is_proxy: 299 | if parsed_referer is not None and parsed_referer.is_proxy: 300 | # Referer exists and is from the proxy? 301 | # Redirect to the referer domain. 302 | redirect_url = _format_proxy_url( 303 | protocol=parsed_request_url.protocol, 304 | domain=parsed_referer.domain, 305 | path=parsed_request_url.path, 306 | proxy_info=self._proxy_info 307 | ) 308 | is_permanent = True 309 | else: 310 | if parsed_request_url.domain is None: 311 | return self._send_head_for_simple_response(404) # Not Found 312 | 313 | # No referer exists (or it's an unexpected external referer)? 314 | # Redirect to the default origin domain. 315 | redirect_url = _format_proxy_url( 316 | protocol=parsed_request_url.protocol, 317 | domain=parsed_request_url.domain, 318 | path=parsed_request_url.path, 319 | proxy_info=self._proxy_info 320 | ) 321 | is_permanent = False # temporary because the default origin domain can change 322 | 323 | self.send_response(308 if is_permanent else 307) # Permanent Redirect, Temporary Redirect 324 | self.send_header('Location', redirect_url) 325 | self.send_header('Vary', 'Referer') 326 | self.end_headers() 327 | 328 | return BytesIO(b'') 329 | 330 | assert parsed_request_url.domain is not None 331 | request_url = '%s://%s%s' % ( 332 | parsed_request_url.protocol, 333 | parsed_request_url.domain, 334 | parsed_request_url.path 335 | ) 336 | 337 | # If client performs a hard refresh (Command-Shift-R in Chrome), 338 | # ignore any archived response and refetch a fresh resource from the origin server. 
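        # (Illustrative, not from the original source: a client can force such
        # a refetch through a running proxy with an explicit no-cache header,
        # e.g.:
        #     curl -H 'Cache-Control: no-cache' http://127.0.0.1:9227/_/http/xkcd.com/
        # assuming the proxy was started on the suggested port 9227.)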
339 |         request_cache_control = canonical_request_headers.get('cache-control')
340 |         request_pragma = canonical_request_headers.get('pragma')
341 |         should_disable_cache = (
342 |             (request_cache_control is not None and
343 |                 # HACK: fuzzy match
344 |                 'no-cache' in request_cache_control) or
345 |             (request_pragma is not None and
346 |                 # HACK: fuzzy match
347 |                 'no-cache' in request_pragma)
348 |         )
349 | 
350 |         # Try fetch requested resource from archive.
351 |         if should_disable_cache:
352 |             resource = None
353 |         else:
354 |             resource = self._archive.get(request_url)
355 | 
356 |         # If missing fetch the resource from the origin and add it to the archive.
357 |         if resource is None:
358 |             # Fail if in offline mode
359 |             if not self._proxy_state['is_online']:
360 |                 self.send_response(503)  # Service Unavailable
361 |                 self.send_header('Content-Type', 'text/html')
362 |                 self.end_headers()
363 | 
364 |                 return BytesIO(
365 |                     (('<p>Resource <a href="%s">%s</a> is not archived, and this ' +
366 |                       'proxy is in offline mode. <a href="/_online">Go online?</a></p>') %
367 |                         (html.escape(request_url), html.escape(request_url))
368 |                     ).encode('utf8')
369 |                 )
370 | 
371 |             try:
372 |                 resource = self._fetch_from_origin_and_store_in_archive(
373 |                     request_url,
374 |                     self.headers,
375 |                     parsed_request_url=parsed_request_url)
376 |             except _OriginServerError as e:
377 |                 return self._send_head_for_origin_server_error(e)
378 | 
379 |         return self._send_head_for_resource(resource)
380 | 
381 |     def _fetch_from_origin_and_store_in_archive(
382 |             self, request_url, request_headers, *, parsed_request_url):
383 |         request_headers = OrderedDict(request_headers)  # clone
384 | 
385 |         # Set Host request header appropriately
386 |         _del_headers(request_headers, ['Host'])
387 |         request_headers['Host'] = parsed_request_url.domain
388 | 
389 |         # Filter request headers before sending to origin server
390 |         _filter_headers(request_headers, 'request header', is_quiet=self._is_quiet)
391 |         _reformat_absolute_urls_in_headers(
392 |             request_headers,
393 |             proxy_info=self._proxy_info,
394 |             default_origin_domain=self._default_origin_domain)
395 | 
396 |         try:
397 |             response = _http.request(
398 |                 method='GET',
399 |                 url=request_url,
400 |                 headers=request_headers,
401 |                 redirect=False
402 |             )
403 |         except Exception as e:
404 |             raise _OriginServerError(request_url) from e
405 | 
406 |         # NOTE: Not streaming the response at the moment for simplicity.
407 |         # Probably want to use urllib3's stream() later.
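        # (A rough sketch of what streaming could look like, assuming the
        # request above were made with preload_content=False; the archive
        # would also need a streaming write API, which is hypothetical here:
        #
        #     for chunk in response.stream(64 * 1024):
        #         archive_file.write(chunk)   # hypothetical streaming helper
        #     response.release_conn()
        # )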
408 |         response_content_bytes = response.data
409 | 
410 |         response_headers = OrderedDict(response.headers)  # clone
411 |         _del_headers(response_headers, ['Content-Length', 'Content-Encoding'])
412 |         response_headers['Content-Length'] = str(len(response_content_bytes))
413 |         response_headers['X-Status-Code'] = str(response.status)
414 | 
415 |         response_content = BytesIO(response_content_bytes)
416 |         try:
417 |             self._archive.put(request_url, request_headers, HttpResource(
418 |                 headers=response_headers,
419 |                 content=response_content
420 |             ))
421 |         finally:
422 |             response_content.close()
423 | 
424 |         resource = self._archive.get(request_url)
425 |         assert resource is not None
426 | 
427 |         return resource
428 | 
429 |     def _send_head_for_resource(self, resource, *, filter=True):
430 |         status_code = int(resource.headers['X-Status-Code'])
431 |         resource_headers = OrderedDict(resource.headers)  # clone
432 |         resource_content = resource.content
433 | 
434 |         if filter:
435 |             # Filter response headers before sending to client
436 |             _filter_headers(resource_headers, 'response header', is_quiet=self._is_quiet)
437 |             _reformat_absolute_urls_in_headers(
438 |                 resource_headers,
439 |                 proxy_info=self._proxy_info,
440 |                 default_origin_domain=self._default_origin_domain)
441 | 
442 |             # Filter response content before sending to client
443 |             (resource_headers, resource_content) = _reformat_absolute_urls_in_content(
444 |                 resource_headers, resource_content,
445 |                 proxy_info=self._proxy_info)
446 | 
447 |         else:
448 |             # Minimal filtering: Remove only the internal response headers
449 |             for hn in list(resource_headers.keys()):
450 |                 if hn.lower() in _RAW_RESPONSE_HEADER_BLACKLIST:
451 |                     del resource_headers[hn]
452 | 
453 |         # Send headers
454 |         self.send_response(status_code)
455 |         for (key, value) in resource_headers.items():
456 |             self.send_header(key, value)
457 |         self.end_headers()
458 | 
459 |         return resource_content
460 | 
461 |     def _send_head_for_simple_response(self, status_code):
462 |         self.send_response(status_code)
463 |         self.end_headers()
464 |         return BytesIO(b'')
465 | 
466 |     def _send_head_for_origin_server_error(self, e):
467 |         f = e.__cause__
468 | 
469 |         self.send_response(502)  # Bad Gateway
470 |         self.send_header('Content-Type', 'text/html')
471 |         self.end_headers()
472 | 
473 |         return BytesIO(
474 |             (('<p>Error while fetching resource <a href="%s">%s</a>.</p>' +
475 |               '
<p>%s: %s</p>
') % 476 | (html.escape(e.request_url), html.escape(e.request_url), 477 | html.escape(type(f).__name__), html.escape(str(f))) 478 | ).encode('utf8') 479 | ) 480 | 481 | def log_message(self, *args): 482 | if self._is_quiet: 483 | pass # operate silently 484 | else: 485 | super().log_message(*args) 486 | 487 | 488 | def _del_headers(headers, header_names_to_delete): 489 | header_names_to_delete = [hn.lower() for hn in header_names_to_delete] 490 | for key in list(headers.keys()): 491 | if key.lower() in header_names_to_delete: 492 | del headers[key] 493 | 494 | 495 | class _OriginServerError(Exception): 496 | def __init__(self, request_url): 497 | self.request_url = request_url 498 | 499 | 500 | # ------------------------------------------------------------------------------ 501 | # Filter Headers 502 | 503 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 504 | # Filter Header Keys 505 | 506 | _REQUEST_HEADER_WHITELIST = [ 507 | # Request 508 | 'accept', 509 | 'accept-encoding', 510 | 'accept-language', 511 | 'cookie', 512 | 'host', 513 | 'referer', 514 | 'user-agent', 515 | ] 516 | _RESPONSE_HEADER_WHITELIST = [ 517 | # Response 518 | 'access-control-allow-origin', 519 | 'access-control-allow-credentials', 520 | 'age', 521 | 'content-length', 522 | 'content-type', 523 | 'date', 524 | 'etag', 525 | 'expires', 526 | 'last-modified', 527 | 'location', 528 | 'retry-after', 529 | 'server', 530 | 'set-cookie', 531 | 'via', 532 | 'x-content-type-options', 533 | 'x-frame-options', 534 | 'x-runtime', 535 | 'x-served-by', 536 | 'x-xss-protection', 537 | ] 538 | _HEADER_WHITELIST = ( 539 | _REQUEST_HEADER_WHITELIST + 540 | _RESPONSE_HEADER_WHITELIST 541 | ) 542 | 543 | _REQUEST_HEADER_BLACKLIST = [ 544 | # Request 545 | 'cache-control', 546 | 'connection', 547 | 'if-modified-since', 548 | 'if-none-match', 549 | 'pragma', 550 | 'upgrade-insecure-requests', 551 | 'x-pragma', 552 | ] 553 | _RESPONSE_HEADER_BLACKLIST = [ 554 | # Response 555 | 'accept-ranges', 556 | 'cache-control', 557 | 'connection', 558 | 'strict-transport-security', 559 | 'transfer-encoding', 560 | 'vary', 561 | 'x-cache', 562 | 'x-cache-hits', 563 | 'x-request-id', 564 | 'x-served-time', 565 | 'x-timer', 566 | ] 567 | _INTERNAL_RESPONSE_HEADERS = [ 568 | # Internal 569 | 'x-status-code', 570 | ] 571 | _HEADER_BLACKLIST = ( 572 | _REQUEST_HEADER_BLACKLIST + 573 | _RESPONSE_HEADER_BLACKLIST + 574 | _INTERNAL_RESPONSE_HEADERS 575 | ) 576 | 577 | _RAW_RESPONSE_HEADER_BLACKLIST = ( 578 | ['connection', 'transfer-encoding'] + 579 | _INTERNAL_RESPONSE_HEADERS 580 | ) 581 | 582 | # TODO: Should differentiate between request & response headers. 
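# For example (illustrative): given the headers {'Accept': '*/*',
# 'Connection': 'keep-alive', 'X-Custom-Tracker': '1'}, _filter_headers()
# below keeps 'Accept' (whitelisted), silently drops 'Connection'
# (blacklisted), and drops 'X-Custom-Tracker' with a printed notice
# (graylisted).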
583 | def _filter_headers(headers, header_type_title, *, is_quiet): 584 | for k in list(headers.keys()): 585 | k_lower = k.lower() 586 | if k_lower in _HEADER_WHITELIST: 587 | pass 588 | elif k_lower in _HEADER_BLACKLIST: 589 | del headers[k] 590 | else: # graylist 591 | if not is_quiet: 592 | print(' - Removing unrecognized %s: %s' % (header_type_title, k)) 593 | del headers[k] 594 | 595 | 596 | # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 597 | # Filter Header URLs 598 | 599 | def _reformat_absolute_urls_in_headers(headers, *, proxy_info, default_origin_domain): 600 | for k in list(headers.keys()): 601 | if k.lower() == 'location': 602 | parsed_url = _try_parse_absolute_url(headers[k]) 603 | if parsed_url is not None: 604 | headers[k] = _format_proxy_url( 605 | protocol=parsed_url.protocol, 606 | domain=parsed_url.domain, 607 | path=parsed_url.path, 608 | proxy_info=proxy_info, 609 | ) 610 | 611 | elif k.lower() == 'referer': 612 | referer = headers[k] 613 | 614 | parsed_referer = _try_parse_client_referer(referer, default_origin_domain) 615 | if parsed_referer is not None: 616 | headers[k] = '%s://%s%s' % ( 617 | parsed_referer.protocol, 618 | parsed_referer.domain, 619 | parsed_referer.path 620 | ) 621 | 622 | 623 | # ------------------------------------------------------------------------------ 624 | # Filter Content 625 | 626 | _ABSOLUTE_URL_BYTES_IN_HTML_RE = re.compile(rb'([\'"])(https?://.*?)\1') 627 | _PROTOCOL_RELATIVE_URL_BYTES_IN_HTML_RE = re.compile(rb'([\'"])(//.*?)\1') 628 | 629 | def _reformat_absolute_urls_in_content(resource_headers, resource_content, *, proxy_info): 630 | """ 631 | If specified resource is an HTML document, replaces any obvious absolute 632 | URL references with references of the format "/_/http/..." that will be 633 | interpreted by the archiving proxy appropriately. 634 | 635 | Otherwise returns the original content unmodified. 
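    
    For example (illustrative), with the proxy at 127.0.0.1:9227, the
    reference "http://xkcd.com/1/" would be rewritten as
    "http://127.0.0.1:9227/_/http/xkcd.com/1/".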
636 | """ 637 | is_html = False 638 | for (k, v) in resource_headers.items(): 639 | if k.lower() == 'content-type': 640 | is_html = 'text/html' in v # HACK: Loose test 641 | break 642 | 643 | if not is_html: 644 | return (resource_headers, resource_content) 645 | 646 | try: 647 | content_bytes = resource_content.read() 648 | finally: 649 | resource_content.close() 650 | 651 | def reformat_absolute_url_match(match_in_html): 652 | nonlocal proxy_info 653 | 654 | (quote, url) = match_in_html.groups() 655 | 656 | parsed_url = _try_parse_absolute_url_in_bytes(url) 657 | assert parsed_url is not None # inner regex should be subset of outer 658 | 659 | return quote + _format_proxy_url_in_bytes( 660 | protocol=parsed_url.protocol, 661 | domain=parsed_url.domain, 662 | path=parsed_url.path, 663 | proxy_info=proxy_info 664 | ) + quote 665 | 666 | content_bytes = _ABSOLUTE_URL_BYTES_IN_HTML_RE.sub(reformat_absolute_url_match, content_bytes) 667 | 668 | def reformat_protocol_relative_url_match(match_in_html): 669 | nonlocal proxy_info 670 | 671 | (quote, url) = match_in_html.groups() 672 | 673 | parsed_url = _try_parse_protocol_relative_url_in_bytes(url, protocol=b'http') 674 | assert parsed_url is not None # inner regex should be subset of outer 675 | 676 | return quote + _format_proxy_url_in_bytes( 677 | protocol=parsed_url.protocol, 678 | domain=parsed_url.domain, 679 | path=parsed_url.path, 680 | proxy_info=proxy_info 681 | ) + quote 682 | 683 | content_bytes = _PROTOCOL_RELATIVE_URL_BYTES_IN_HTML_RE.sub(reformat_protocol_relative_url_match, content_bytes) 684 | 685 | # Update Content-Length in the headers 686 | assert 'Content-Encoding' not in resource_headers 687 | _del_headers(resource_headers, ['Content-Length']) 688 | resource_headers['Content-Length'] = str(len(content_bytes)) 689 | 690 | return (resource_headers, BytesIO(content_bytes)) 691 | 692 | 693 | # ------------------------------------------------------------------------------ 694 | # Parse URLs 695 | 696 | _ABSOLUTE_REQUEST_URL_RE = re.compile(r'^/(_[^/]*)/(https?)/([^/]+)(/.*)$') 697 | 698 | _ClientRequestUrl = namedtuple('_ClientRequestUrl', 699 | ['protocol', 'domain', 'path', 'is_proxy', 'command']) 700 | 701 | def _try_parse_client_request_path(path, default_origin_domain): 702 | if path.startswith('/_'): 703 | m = _ABSOLUTE_REQUEST_URL_RE.match(path) 704 | if m is None: 705 | return None 706 | (command, protocol, domain, path) = m.groups() 707 | 708 | return _ClientRequestUrl( 709 | protocol=protocol, 710 | domain=domain, 711 | path=path, 712 | is_proxy=True, 713 | command=command 714 | ) 715 | else: 716 | return _ClientRequestUrl( 717 | protocol='http', 718 | domain=default_origin_domain, 719 | path=path, 720 | is_proxy=False, 721 | command='_' 722 | ) 723 | 724 | 725 | _REFERER_LONG_RE = re.compile(r'^https?://[^/]*/_/(https?)/([^/]*)(/.*)?$') 726 | _REFERER_SHORT_RE = re.compile(r'^(https?)://[^/]*(/.*)?$') 727 | 728 | _ClientReferer = namedtuple('_ClientReferer', 729 | ['protocol', 'domain', 'path', 'is_proxy']) 730 | 731 | def _try_parse_client_referer(referer, default_origin_domain): 732 | m = _REFERER_LONG_RE.match(referer) 733 | if m is not None: 734 | (protocol, domain, path) = m.groups() 735 | if path is None: 736 | path = '' 737 | 738 | return _ClientReferer( 739 | protocol=protocol, 740 | domain=domain, 741 | path=path, 742 | is_proxy=True 743 | ) 744 | 745 | m = _REFERER_SHORT_RE.match(referer) 746 | if m is not None: 747 | (protocol, path) = m.groups() 748 | if path is None: 749 | path = '' 750 | 751 | return 
_ClientReferer(
752 |             protocol=protocol,
753 |             domain=default_origin_domain,
754 |             path=path,
755 |             is_proxy=False
756 |         )
757 | 
758 |     return None  # failed to parse header
759 | 
760 | 
761 | _Url = namedtuple('_Url', ['protocol', 'domain', 'path'])
762 | 
763 | 
764 | _ABSOLUTE_URL_RE = re.compile(r'^(https?)://([^/]*)(/.*)?$')
765 | 
766 | def _try_parse_absolute_url(url):
767 |     url_match = _ABSOLUTE_URL_RE.match(url)
768 |     if url_match is None:
769 |         return None
770 | 
771 |     (protocol, domain, path) = url_match.groups()
772 |     if path is None:
773 |         path = ''
774 | 
775 |     return _Url(
776 |         protocol=protocol,
777 |         domain=domain,
778 |         path=path
779 |     )
780 | 
781 | 
782 | _ABSOLUTE_URL_BYTES_RE = re.compile(rb'^(https?)://([^/]*)(/.*)?$')
783 | 
784 | def _try_parse_absolute_url_in_bytes(url):
785 |     url_match = _ABSOLUTE_URL_BYTES_RE.match(url)
786 |     if url_match is None:
787 |         return None
788 | 
789 |     (protocol, domain, path) = url_match.groups()
790 |     if path is None:
791 |         path = b''
792 | 
793 |     return _Url(
794 |         protocol=protocol,
795 |         domain=domain,
796 |         path=path
797 |     )
798 | 
799 | 
800 | _PROTOCOL_RELATIVE_URL_BYTES_RE = re.compile(rb'^//([^/]*)(/.*)?$')
801 | 
802 | def _try_parse_protocol_relative_url_in_bytes(url, *, protocol):
803 |     url_match = _PROTOCOL_RELATIVE_URL_BYTES_RE.match(url)
804 |     if url_match is None:
805 |         return None
806 | 
807 |     (domain, path) = url_match.groups()
808 |     if path is None:
809 |         path = b''
810 | 
811 |     return _Url(
812 |         protocol=protocol,
813 |         domain=domain,
814 |         path=path
815 |     )
816 | 
817 | 
818 | def _format_proxy_path(protocol, domain, path, *, command='_'):
819 |     return '/%s/%s/%s%s' % (
820 |         command, protocol, domain, path)
821 | 
822 | 
823 | def _format_proxy_url(protocol, domain, path, *, proxy_info):
824 |     return 'http://%s:%s%s' % (
825 |         proxy_info.host, proxy_info.port, _format_proxy_path(protocol, domain, path))
826 | 
827 | 
828 | def _format_proxy_url_in_bytes(protocol, domain, path, *, proxy_info):
829 |     (proxy_host, proxy_port) = (proxy_info.host.encode('utf8'), str(proxy_info.port).encode('utf8'))
830 |     # TODO: After upgrading to Python 3.5+, replace the following code with
831 |     #       percent-substitution syntax like: b'/_/%b/%b%b' % (protocol, domain, path)
832 |     return b'http://' + proxy_host + b':' + proxy_port + b'/_/' + protocol + b'/' + domain + path
833 | 
834 | 
835 | # ==============================================================================
836 | # Archive
837 | 
838 | 
839 | HttpResource = namedtuple('HttpResource', ['headers', 'content'])
840 | 
841 | 
842 | class HttpResourceArchive:
843 |     """
844 |     Persistent archive of HTTP resources, including the full content and headers of
845 |     each resource.
846 | 
847 |     This class is threadsafe.
848 |     """
849 | 
850 |     def __init__(self, root_dirpath):
851 |         """
852 |         Opens the existing archive at the specified directory,
853 |         or creates a new archive if there is no such directory.
854 |         """
855 |         self._closed = False
856 |         self._lock = Lock()
857 |         self._root_dirpath = root_dirpath
858 | 
859 |         # Create empty archive if archive does not already exist
860 |         if not os.path.exists(root_dirpath):
861 |             os.mkdir(root_dirpath)
862 |             with self._open_index('w') as f:
863 |                 f.write('')
864 | 
865 |         # Load archive
866 |         with self._open_index('r') as f:
867 |             self._urls = f.read().split('\n')
868 |             if self._urls == ['']:
869 |                 self._urls = []
870 |         # NOTE: It is possible for the archive to contain multiple IDs for the
871 |         #       same URL under rare circumstances. In that case the last ID wins.
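        # (Illustrative: an index.txt containing the two lines
        #     http://xkcd.com/
        #     http://xkcd.com/s/style.css
        # yields resource IDs 0 and 1, stored on disk under the 1-based
        # ordinals 1.* and 2.* -- e.g. 1.request_headers.json,
        # 1.response_headers.json, and 1.response_body.dat -- per the
        # _open_* helpers below. style.css is a made-up example path.)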
872 |         self._resource_id_for_url = {url: i for (i, url) in enumerate(self._urls)}
873 | 
874 |     def get(self, url):
875 |         """
876 |         Gets the HttpResource at the specified url from this archive,
877 |         or None if the specified resource is not in the archive.
878 |         """
879 |         with self._lock:
880 |             resource_id = self._resource_id_for_url.get(url)
881 |             if resource_id is None:
882 |                 return None
883 | 
884 |         with self._open_response_headers(resource_id, 'r') as f:
885 |             headers = json.load(f, object_pairs_hook=OrderedDict)
886 |         f = self._open_response_content(resource_id, 'rb')
887 |         return HttpResource(
888 |             headers=headers,
889 |             content=f,
890 |         )
891 | 
892 |     def get_request_headers(self, url):
893 |         """
894 |         Gets the request headers for the resource at the specified url from this archive,
895 |         or None if the specified resource is not in the archive.
896 |         """
897 |         with self._lock:
898 |             resource_id = self._resource_id_for_url.get(url)
899 |             if resource_id is None:
900 |                 return None
901 | 
902 |         with self._open_request_headers(resource_id, 'r') as f:
903 |             return json.load(f, object_pairs_hook=OrderedDict)
904 | 
905 |     def put(self, url, request_headers, resource):
906 |         """
907 |         Puts the specified HttpResource into this archive, replacing any previous
908 |         resource with the same url.
909 | 
910 |         If two different resources are put into this archive at the same url
911 |         concurrently, the last one put into the archive will eventually win.
912 |         """
913 |         # Reserve resource ID (if new)
914 |         with self._lock:
915 |             resource_id = self._resource_id_for_url.get(url)
916 |             if resource_id is None:
917 |                 resource_id = len(self._urls)
918 |                 self._urls.append('')  # reserve space
919 |                 resource_id_is_new = True
920 |             else:
921 |                 resource_id_is_new = False
922 | 
923 |         # Write resource content
924 |         with self._open_request_headers(resource_id, 'w') as f:
925 |             json.dump(request_headers, f)
926 |         with self._open_response_headers(resource_id, 'w') as f:
927 |             json.dump(resource.headers, f)
928 |         with self._open_response_content(resource_id, 'wb') as f:
929 |             shutil.copyfileobj(resource.content, f)
930 | 
931 |         # Commit resource ID (if new)
932 |         if resource_id_is_new:
933 |             # NOTE: Only commit an entry to self._urls AFTER the resource
934 |             #       content has been written to disk successfully.
935 |             with self._lock:
936 |                 self._urls[resource_id] = url
937 |                 old_resource_id = self._resource_id_for_url.get(url)
938 |                 if old_resource_id is None or old_resource_id < resource_id:
939 |                     self._resource_id_for_url[url] = resource_id
940 | 
941 |     def delete(self, url):
942 |         """
943 |         Deletes the specified resource from this archive if it exists.
944 | 
945 |         Returns whether the specified resource was found and deleted.
946 |         """
947 |         with self._lock:
948 |             resource_id = self._resource_id_for_url.get(url)
949 |             if resource_id is None:
950 |                 return False
951 |             else:
952 |                 self._delete_resource(resource_id)
953 | 
954 |                 self._urls[resource_id] = ''
955 |                 del self._resource_id_for_url[url]
956 |                 return True
957 | 
958 |     def flush(self):
959 |         """
960 |         Flushes all pending changes made to this archive to disk.
961 |         """
962 |         # TODO: Make this operation atomic, even if the write fails in the middle.
963 |         with self._open_index('w') as f:
964 |             f.write('\n'.join(self._urls))
965 | 
966 |     def close(self):
967 |         """
968 |         Closes this archive.
969 | """ 970 | if self._closed: 971 | return # pragma: no cover: unreachable by tests 972 | self.flush() 973 | self._closed = True 974 | 975 | # === Filesystem I/O === 976 | 977 | def _open_index(self, mode='r'): 978 | return open(os.path.join(self._root_dirpath, 'index.txt'), mode, encoding='utf8') 979 | 980 | def _open_request_headers(self, resource_id, mode='r'): 981 | resource_ordinal = resource_id + 1 982 | return open(os.path.join(self._root_dirpath, '%d.request_headers.json' % resource_ordinal), mode, encoding='utf8') 983 | 984 | def _open_response_headers(self, resource_id, mode='r'): 985 | resource_ordinal = resource_id + 1 986 | return open(os.path.join(self._root_dirpath, '%d.response_headers.json' % resource_ordinal), mode, encoding='utf8') 987 | 988 | def _open_response_content(self, resource_id, mode='rb'): 989 | resource_ordinal = resource_id + 1 990 | return open(os.path.join(self._root_dirpath, '%d.response_body.dat' % resource_ordinal), mode) 991 | 992 | def _delete_resource(self, resource_id): 993 | resource_ordinal = resource_id + 1 994 | os.remove(os.path.join(self._root_dirpath, '%d.request_headers.json' % resource_ordinal)) 995 | os.remove(os.path.join(self._root_dirpath, '%d.response_headers.json' % resource_ordinal)) 996 | os.remove(os.path.join(self._root_dirpath, '%d.response_body.dat' % resource_ordinal)) 997 | 998 | 999 | # ------------------------------------------------------------------------------ 1000 | 1001 | if __name__ == '__main__': 1002 | main(sys.argv[1:]) # pragma: no cover: unreachable by tests 1003 | --------------------------------------------------------------------------------