├── .gitignore ├── .travis.yml ├── MANIFEST.in ├── Makefile ├── Readme.md ├── config.ini ├── docs ├── Makefile ├── conf.py ├── config.rst ├── devsetup.rst ├── errors.rst └── index.rst ├── liveweb ├── __init__.py ├── cache.py ├── cli.py ├── config.py ├── configutil.py ├── errors.py ├── file_pool.py ├── filetools.py ├── main.py ├── proxy.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_configutil.py │ ├── test_filepool.py │ ├── test_filetools.py │ ├── test_proxy.py │ ├── test_webapp.py │ └── webtest.py ├── tools │ ├── __init__.py │ ├── wayback.py │ └── wsgiapp.py └── webapp.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | lib/ 3 | include/ 4 | *.egg-info/ 5 | *.pyc 6 | *~ 7 | .coverage 8 | htmlcov/ 9 | docs/_build/ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | install: pip install -r requirements.txt --use-mirrors 6 | script: py.test liveweb/ 7 | 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | PROJECT_NAME=$(shell basename $(PWD)) 3 | 4 | # If VENV_ROOT is defined in the environment, use it to find the VENV 5 | # directory, else consider the current dir as the venv dir. 6 | VENV_ROOT ?= $(shell dirname $(PWD)) 7 | 8 | 9 | # Use the active virtualenv or the one inside the project 10 | VIRTUAL_ENV ?= $(VENV_ROOT)/$(PROJECT_NAME) 11 | 12 | VENV=$(VIRTUAL_ENV) 13 | 14 | # host:port of the liveweb proxy. 
15 | # This is used by the wayback. 16 | LIVEWEB_ADDRESS=localhost:7070 17 | WAYBACK_ADDRESS=:8080 18 | 19 | CONFIG=config.ini 20 | 21 | UWSGI=$(VENV)/bin/uwsgi -H$(VENV) 22 | 23 | .PHONY: docs 24 | 25 | run: 26 | $(VENV)/bin/liveweb-proxy -c $(CONFIG) 27 | 28 | venv: 29 | virtualenv --no-site-packages $(VENV) 30 | $(VENV)/bin/pip install -r requirements.txt 31 | $(VENV)/bin/python setup.py develop 32 | 33 | test: 34 | 35 | $(VENV)/bin/py.test liveweb/ 36 | 37 | wayback: 38 | $(UWSGI) --http ${WAYBACK_ADDRESS} --wsgi liveweb.tools.wayback --pyargv $(LIVEWEB_ADDRESS) 39 | 40 | docs: 41 | cd docs && make html 42 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | Liveweb Proxy for Wayback Machine 2 | ================================= 3 | 4 | [![Build Status](https://secure.travis-ci.org/internetarchive/liveweb.png?branch=master)](http://travis-ci.org/internetarchive/liveweb) 5 | 6 | Liveweb proxy is component of Internet Archive's [wayback machine][] 7 | project. 8 | 9 | [wayback machine]: http://web.archive.org/ 10 | 11 | The liveweb proxy captures the content of a web page in real time, archives it 12 | into a ARC or WARC file and returns the ARC/WARC record back to the wayback 13 | machine to process. The recorded ARC/WARC file becomes part of the wayback 14 | machine in due course of time. 15 | 16 | How to setup 17 | ============ 18 | 19 | * `make venv` 20 | 21 | This setup a new virtual env in the project directory and instals all the dependencies. 22 | 23 | * `make run` 24 | 25 | This starts running the liveweb proxy. 26 | 27 | * `make test` 28 | 29 | Runs all the test cases. 30 | 31 | Documentation 32 | ============= 33 | 34 | Documentation is available at . 
35 | -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [liveweb] 2 | 3 | # User-Agent. Specifies the user-agent header sent by liveweb-proxy. 4 | user-agent = liveweb-proxy/2.0 5 | 6 | 7 | # Default value of timeout to use when individual timeouts are not specified. 8 | # 9 | # default-timeout = 10s 10 | 11 | 12 | # DNS timeout. 13 | # 14 | # dns-timeout = 2s 15 | 16 | 17 | # Max allowed time to receive HTTP status and headers 18 | # 19 | # initial-data_timout = 3s 20 | 21 | 22 | # Specifies the read timeout. This indicates the idle time. If no data 23 | # is received for more than this time, the request will fail. 24 | # 25 | # read-timeout = 1s 26 | 27 | max-request-time = 2m 28 | 29 | # Maximum allowed response size. 30 | max-response-size = 100MB 31 | 32 | # Output directory to write ARC/WRC files 33 | output-directory = records 34 | 35 | # Filename pattern. Uses python string substitution to replace 36 | # keywords timestamp, serial, pid, fqdn and port. 37 | filename-pattern = live-%(timestamp)s-%(serial)05d-%(fqdn)s-%(port)s.arc.gz 38 | 39 | # The limit on the size of file, after which it 40 | filesize-limit = 100MB 41 | 42 | # Number of concurrent writers 43 | # 44 | # num-writers = 1 45 | 46 | archive-type = arc 47 | 48 | # Cache type. 49 | # Supported options are redis and none. 50 | # 51 | cache = redis 52 | 53 | # Redis parameters. Used when cache=redis. 54 | # 55 | # redis-host = localhost 56 | # redis-port = 6379 57 | # redis-db = 0 58 | 59 | # Number of worker processes. 60 | # 61 | # workers = 1 62 | 63 | # Number of threads per process 64 | # 65 | # threads = 10 66 | 67 | # Listen address. 68 | # Set it to empty to listen on all available interfaces. 69 | # 70 | # listen-address = 127.0.0.1 71 | 72 | # Port to bind. 
73 | # 74 | # port = 7070 75 | 76 | 77 | # UWSGI settings 78 | [uwsgi] 79 | 80 | # The liveweb-proxy is run using uwsgi. 81 | # You can specify uwsgi setting here. 82 | # See uwsgi --help for more details. 83 | 84 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " singlehtml to make a single large HTML file" 22 | @echo " pickle to make pickle files" 23 | @echo " json to make JSON files" 24 | @echo " htmlhelp to make HTML files and a HTML help project" 25 | @echo " qthelp to make HTML files and a qthelp project" 26 | @echo " devhelp to make HTML files and a Devhelp project" 27 | @echo " epub to make an epub" 28 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 29 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 30 | @echo " text to make text files" 31 | @echo " man to make manual pages" 32 | @echo " changes to make an overview of all changed/added/deprecated items" 33 | @echo " linkcheck to check all external links for integrity" 34 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 35 | 36 | clean: 37 | -rm -rf $(BUILDDIR)/* 38 | 39 | 
html: 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | @echo 42 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 43 | 44 | dirhtml: 45 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 48 | 49 | singlehtml: 50 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 51 | @echo 52 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 53 | 54 | pickle: 55 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 56 | @echo 57 | @echo "Build finished; now you can process the pickle files." 58 | 59 | json: 60 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 61 | @echo 62 | @echo "Build finished; now you can process the JSON files." 63 | 64 | htmlhelp: 65 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 66 | @echo 67 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 68 | ".hhp project file in $(BUILDDIR)/htmlhelp." 69 | 70 | qthelp: 71 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 72 | @echo 73 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 74 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 75 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/LivewebProxy.qhcp" 76 | @echo "To view the help file:" 77 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/LivewebProxy.qhc" 78 | 79 | devhelp: 80 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 81 | @echo 82 | @echo "Build finished." 83 | @echo "To view the help file:" 84 | @echo "# mkdir -p $$HOME/.local/share/devhelp/LivewebProxy" 85 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/LivewebProxy" 86 | @echo "# devhelp" 87 | 88 | epub: 89 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 90 | @echo 91 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
92 | 93 | latex: 94 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 95 | @echo 96 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 97 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 98 | "(use \`make latexpdf' here to do that automatically)." 99 | 100 | latexpdf: 101 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 102 | @echo "Running LaTeX files through pdflatex..." 103 | make -C $(BUILDDIR)/latex all-pdf 104 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 105 | 106 | text: 107 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 108 | @echo 109 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 110 | 111 | man: 112 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 113 | @echo 114 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 115 | 116 | changes: 117 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 118 | @echo 119 | @echo "The overview file is in $(BUILDDIR)/changes." 120 | 121 | linkcheck: 122 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 123 | @echo 124 | @echo "Link check complete; look for any errors in the above output " \ 125 | "or in $(BUILDDIR)/linkcheck/output.txt." 126 | 127 | doctest: 128 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 129 | @echo "Testing of doctests in the sources finished, look at the " \ 130 | "results in $(BUILDDIR)/doctest/output.txt." 131 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Liveweb Proxy documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Apr 27 14:01:57 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 
7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = [] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'Liveweb Proxy' 44 | copyright = u'2012, Internet Archive' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '2.0-dev' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '2.0-dev' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 
57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 
109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'LivewebProxydoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | # The paper size ('letter' or 'a4'). 173 | #latex_paper_size = 'letter' 174 | 175 | # The font size ('10pt', '11pt' or '12pt'). 176 | #latex_font_size = '10pt' 177 | 178 | # Grouping the document tree into LaTeX files. List of tuples 179 | # (source start file, target name, title, author, documentclass [howto/manual]). 180 | latex_documents = [ 181 | ('index', 'LivewebProxy.tex', u'Liveweb Proxy Documentation', 182 | u'Internet Archive', 'manual'), 183 | ] 184 | 185 | # The name of an image file (relative to this directory) to place at the top of 186 | # the title page. 187 | #latex_logo = None 188 | 189 | # For "manual" documents, if this is true, then toplevel headings are parts, 190 | # not chapters. 191 | #latex_use_parts = False 192 | 193 | # If true, show page references after internal links. 194 | #latex_show_pagerefs = False 195 | 196 | # If true, show URL addresses after external links. 197 | #latex_show_urls = False 198 | 199 | # Additional stuff for the LaTeX preamble. 200 | #latex_preamble = '' 201 | 202 | # Documents to append as an appendix to all manuals. 203 | #latex_appendices = [] 204 | 205 | # If false, no module index is generated. 206 | #latex_domain_indices = True 207 | 208 | 209 | # -- Options for manual page output -------------------------------------------- 210 | 211 | # One entry per manual page. 
List of tuples 212 | # (source start file, name, description, authors, manual section). 213 | man_pages = [ 214 | ('index', 'livewebproxy', u'Liveweb Proxy Documentation', 215 | [u'Internet Archive'], 1) 216 | ] 217 | -------------------------------------------------------------------------------- /docs/config.rst: -------------------------------------------------------------------------------- 1 | .. _config: 2 | 3 | 4 | Liveweb Proxy Configuration 5 | =========================== 6 | 7 | The ``liveweb-proxy`` can be configured using various command-line options and/or a config file. 8 | 9 | Config file can be specified as:: 10 | 11 | $ liveweb-proxy -c liveweb.ini 12 | 13 | or:: 14 | 15 | $ liveweb-proxy --config liveweb.ini 16 | 17 | This section describes the available config settings. For each config setting, there is a command line option with the same name. 18 | 19 | For example, config setting ``archive-format`` is available as command line argument `--archive-format`. 20 | 21 | The config file is specified in INI format. Here is a sample config file. :: 22 | 23 | [liveweb] 24 | 25 | archive-format = arc 26 | 27 | output-directory = /tmp/records 28 | 29 | dns-timeout = 2s 30 | 31 | 32 | Archive Settings 33 | ---------------- 34 | 35 | **archive-format** 36 | 37 | Specifies the archive format. Should be one of ``arc`` or ``warc``. 38 | 39 | The default value is ``arc``. 40 | 41 | .. warning:: 42 | 43 | As of now only ``arc`` is supported. 44 | 45 | 46 | **output-directory** 47 | 48 | Output directory to write ARC/WARC files. Default value is "records". 49 | 50 | 51 | **filename-pattern** 52 | 53 | The pattern of the filename specified as Python string formatting 54 | template. The default value is ``live-%(timestamp)s-%(serial)05d.arc.gz``. 55 | 56 | Available substitutions are ``timestamp``, ``serial``, ``pid``, 57 | ``fqdn`` (fully qualified domain name) and ``port``. 58 | 59 | **filesize-limit** 60 | 61 | The limit on the size of file. 
If a file crosses this size, it 62 | will be closed and a new file will be created to write new records. 63 | 64 | **num-writers** 65 | 66 | The number of concurrent writers. 67 | 68 | The default value is ``1``. 69 | 70 | 71 | Cache Settings 72 | -------------- 73 | 74 | .. _config_cache: 75 | 76 | **cache** 77 | 78 | Type of cache to use. Available options are ``redis``, ``sqlite`` and ``none``. 79 | 80 | The default value is ``none``. 81 | 82 | **redis-host** 83 | 84 | **redis-port** 85 | 86 | **redis-db** 87 | 88 | Redis host, port and db number. Used only when ``cache=redis``. 89 | 90 | **redis-expire-time** 91 | 92 | Expire time to set in redis. Used only when ``cache=redis``. 93 | 94 | The default value is ``1h`` (1 hour). 95 | 96 | **redis-max-record-size** 97 | 98 | Maximum allowed size of a record that can be cached. Used only when ``cache=redis``. 99 | 100 | The default value is ``100KB``. 101 | 102 | **sqlite-db** 103 | 104 | Path to the sqlite database to use. This option is valid only when ``cache=sqlite``. 105 | 106 | The default value is ``liveweb.db``. 107 | 108 | Timeouts and Resource Limits 109 | ---------------------------- 110 | 111 | **default-timeout** 112 | 113 | This is the default timeout value for ``connect-timeout``, ``initial-data-timeout`` and ``read-timeout``. 114 | 115 | The default value is ``10s``. 116 | 117 | .. _config_dns_timeout: 118 | 119 | **dns-timeout** 120 | 121 | Specifies the maximum amount of time a DNS resolution can take. 122 | 123 | Python doesn't support a way to specify DNS timeout. On Linux, the 124 | dns timeout can be specified via the ``RES_OPTIONS`` environment 125 | variable. This environment variable is set at the startup of the 126 | application based on this config setting. 127 | 128 | If unspecified, the DNS timeout is decided by the system default behavior. 129 | 130 | See `resolv.conf man page`_ for more details. 131 | 132 | .. 
_resolv.conf man page: http://manpages.ubuntu.com/manpages/lucid/en/man5/resolv.conf.5.html 133 | 134 | .. _config_connect_timeout: 135 | 136 | **connect-timeout** 137 | 138 | Specifies the connect timeout in seconds. Connections that take 139 | longer to establish will be aborted. 140 | 141 | .. _config_initial_data_timeout: 142 | 143 | **initial-data-timeout** 144 | 145 | Specifies the maximum time allowed before receiving initial data 146 | (HTTP headers) from the remote server. 147 | 148 | .. _config_read_timeout: 149 | 150 | **read-timeout** 151 | 152 | Specifies the read timeout in seconds. This indicates the idle time. If no data is received for more than this time, the request will fail. 153 | 154 | 155 | **max-request-time** 156 | 157 | Specifies the total amout of time a HTTP request can take. If it takes 158 | more than this, the current request will fail. 159 | 160 | The default value is ``2m``. 161 | 162 | **max-response-size** 163 | 164 | Specifies the maximum allowed size of response. 165 | 166 | The default value is ``100MB``. 167 | 168 | Other Settings 169 | -------------- 170 | 171 | .. _config_user_agent: 172 | 173 | **user-agent** 174 | 175 | Specifies the value of the ``User-Agent`` request header. 176 | 177 | The default value is ``ia_archiver(OS-Wayback)``. 178 | 179 | 180 | **http-passthrough** 181 | 182 | This is a boolean parameter, setting it to ``true`` will make it 183 | work like a http proxy with archiving. Useful for testing and 184 | recording personal browsing. 185 | -------------------------------------------------------------------------------- /docs/devsetup.rst: -------------------------------------------------------------------------------- 1 | .. _devsetup: 2 | 3 | Development Setup 4 | ================= 5 | 6 | Setting up 7 | ---------- 8 | 9 | Start with getting the source code from github. :: 10 | 11 | $ git clone git://github.com/internetarchive/liveweb.git 12 | $ cd liveweb 13 | 14 | Setup a virtualenv. 
:: 15 | 16 | $ make venv 17 | 18 | This will create the virtualenv in the current directory. Edit the 19 | ``Makefile`` if you want to setup virtualenv elsewhere. 20 | 21 | Running the application 22 | ----------------------- 23 | 24 | Run the application using:: 25 | 26 | $ make run 27 | 28 | This will start the liveweb proxy at ``localhost:7070``. 29 | 30 | Testing using curl 31 | ------------------ 32 | 33 | Assuming the liveweb proxy is running on `localhost:7070`:: 34 | 35 | $ curl -s -x localhost:7070 http://httpbin.org/get | zcat 36 | http://httpbin.org/get 204.236.238.79 20120427110218 application/json 451 37 | HTTP/1.1 200 OK 38 | Content-Type: application/json 39 | Date: Fri, 27 Apr 2012 11:02:18 GMT 40 | Server: gunicorn/0.13.4 41 | Content-Length: 298 42 | Connection: Close 43 | 44 | { 45 | "url": "http://httpbin.org/get", 46 | "headers": { 47 | "Content-Length": "", 48 | "Accept-Encoding": "identity", 49 | "Connection": "keep-alive", 50 | "User-Agent": "ia_archiver(OS-Wayback)", 51 | "Host": "httpbin.org", 52 | "Content-Type": "" 53 | }, 54 | "args": {}, 55 | "origin": "207.241.237.193" 56 | } 57 | 58 | Running in http-passthough mode 59 | ------------------------------- 60 | 61 | Enable http-passthrough mode by adding the following to the config file. :: 62 | 63 | http_passthough: true 64 | 65 | Make sure caching is disabled. The http-passthough mode doesn't work with caching. 66 | 67 | Run the application and change the browser setting to use application 68 | address (localhost:7070 by default) as http proxy. 69 | 70 | Performance Testing 71 | ------------------- 72 | 73 | Test performance using Apache-Bench:: 74 | 75 | $ ab -X localhost:7070 -c 10 -n 100 http://www.archive.org/ 76 | 77 | The ``-X`` options is to specify the proxy server. 
78 | 79 | -------------------------------------------------------------------------------- /docs/errors.rst: -------------------------------------------------------------------------------- 1 | Error Codes 2 | =========== 3 | 4 | The application writes the errors with following codes when something fails when trying to fetch the given URL. 5 | 6 | 1X - Bad Input 7 | -------------- 8 | 9 | **E10 - Invalid URL** 10 | 11 | The given URL is invalid. For example:: 12 | 13 | http://example.com:bad-port/ 14 | 15 | 2X - DNS errors 16 | --------------- 17 | 18 | **E20 - Invalid Domain** 19 | 20 | The URL has non existant domain. 21 | 22 | **E21 - DNS Timeout** 23 | 24 | The hostname couldn't be resolved within :ref:`config_dns_timeout` seconds. 25 | 26 | 3X - Connection Errors 27 | ---------------------- 28 | 29 | **E30 - Connection Refused** 30 | 31 | Connection refused by the server. 32 | 33 | **E31 - Connect Timeout** 34 | 35 | Connection couldn't be established within :ref:`config_connect_timeout` seconds. 36 | 37 | **E32 - Initial Data Timeout** 38 | 39 | Initial data (HTTP headers) couldn't be obtained within :ref:`config_initial_data_timeout` seconds. 40 | 41 | **E33 - Read Timeout** 42 | 43 | When reading data from the remote server, no data was received for :ref:`config_read_timeout` seconds. 44 | 45 | **E34 - Connection Dropped** 46 | 47 | The remote server dropped the connection before all the data was received. 48 | 49 | **E39 - Unexpected Connection Error** 50 | 51 | Unexpected connection error when receiving data from the remote server. 52 | 53 | 4X - Resource Limits 54 | -------------------- 55 | 56 | **E40 - Response Too Big** 57 | 58 | The response length is bigger than :ref:`config_max_response_size` bytes. 59 | 60 | **E41 - Request Took Too Long** 61 | 62 | The request was not completed within :ref:`config_max_request_time` seconds. 
63 | 64 | In all these cases, the application responds back with status ``200 OK`` 65 | with a record containing status ``502 Bad Gateway``. 66 | 67 | 68 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Liveweb Proxy documentation master file, created by 2 | sphinx-quickstart on Fri Apr 27 14:01:57 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Liveweb Proxy 7 | ============= 8 | 9 | Liveweb proxy is a component of Internet Archive's `wayback machine 10 | project `_. 11 | 12 | The liveweb proxy captures the content of a web page in real time, 13 | archives it into an ARC or WARC file and returns the ARC/WARC record 14 | back to the wayback machine to process. The recorded ARC/WARC file 15 | becomes part of the wayback machine in due course of time. 16 | 17 | .. note:: 18 | 19 | The liveweb project is under active development, so this documentation may not be up-to-date. 20 | 21 | Installation 22 | ------------ 23 | 24 | Liveweb proxy can be installed using `pip `_:: 25 | 26 | $ pip install liveweb 27 | 28 | or, with `easy_install `_ :: 29 | 30 | $ easy_install liveweb 31 | 32 | See `Development Setup `_ if you want to work with source code. 33 | 34 | Running liveweb-proxy 35 | --------------------- 36 | 37 | Liveweb proxy can be run using:: 38 | 39 | $ liveweb-proxy 40 | 41 | To start liveweb-proxy on a different port:: 42 | 43 | $ liveweb-proxy -p 8080 44 | 45 | To load settings from a config file:: 46 | 47 | $ liveweb-proxy -c liveweb.ini 48 | 49 | To see the available command-line options:: 50 | 51 | $ liveweb-proxy --help 52 | 53 | See :ref:`Configuration ` section for the available config settings and command line options. 54 | 55 | Advanced Usage 56 | -------------- 57 | 58 | Under the hood, liveweb proxy uses `uwsgi `_ as the http server. 
59 | 60 | If you want to tweak uwsgi parameters, you can start liveweb as:: 61 | 62 | $ uwsgi --master --single-interpreter --lazy --wsgi liveweb.main --processes 1 --threads 10 --http localhost:7070 63 | 64 | The values of ``--processes``, ``--threads`` and ``--http`` options can be changed as needed and more options can be added too. 65 | 66 | You may have to specify `-H virtualenv_home` if you using a virtualenv. 67 | 68 | Documentation 69 | ------------- 70 | 71 | .. toctree:: 72 | :maxdepth: 3 73 | 74 | devsetup 75 | config 76 | errors 77 | -------------------------------------------------------------------------------- /liveweb/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /liveweb/cache.py: -------------------------------------------------------------------------------- 1 | """Cache for liveweb. 2 | """ 3 | 4 | from collections import namedtuple 5 | import logging 6 | import sqlite3 7 | 8 | import redis 9 | 10 | from .proxy import Record 11 | 12 | class RedisCache: 13 | """Cache based on Redis. 14 | 15 | This caches the whole arc record. 16 | """ 17 | def __init__(self, **params): 18 | """Creates a new instance of redis client. 19 | 20 | :param host: host to connect, defaults to "localhost" 21 | :param port: port to connect, defaults to 6379, the default redis server port 22 | :param db: db number, defaults to 0 23 | :param expire_time: amount of time in seconds after which the entry in the cache should expire, defaults to one hour. 24 | """ 25 | self.expire_time = int(params.pop('expire_time', 3600)) # default timeout 26 | 27 | # max size of record that can be cached. Defaults to 100K. 
28 | self.max_record_size = params.pop('max_record_size', 100*1024) 29 | 30 | self.redis_client = redis.StrictRedis(**params) 31 | 32 | def get(self, url): 33 | data = self.redis_client.get(url) 34 | if data is not None: 35 | logging.info("cache hit - %s", url) 36 | return Record(filename=None, 37 | offset=0, 38 | content_length=len(data), 39 | content_iter=iter([data])) 40 | 41 | def set(self, url, record): 42 | """Puts a new entry in the cache. 43 | 44 | :param url: URL for which the response is being cached 45 | :param record: record to be cached 46 | """ 47 | if record.content_length <= self.max_record_size: 48 | data = record.read_all() 49 | self.redis_client.setex(url, self.expire_time, data) 50 | 51 | def next(self): 52 | """Returns the next-value of the counter. 53 | Used by file_pool to get next sequence. 54 | """ 55 | return self.redis_client.incr("filename-sequence") 56 | 57 | class SqliteCache: 58 | """Cache implementation based on sqlite. 59 | 60 | This stores url, filepath, offset and content_length in the 61 | database. Useful when running in http-passthough mode with 62 | browser. 
    """
    # Table schema: "clen" stores the record's content length in bytes.
    SCHEMA = ("" +
              "CREATE TABLE cache (" +
              " url text unique," +
              " filename text," +
              " offset int," +
              " clen int" +
              ")")

    def __init__(self, database):
        """Creates the cache backed by the given sqlite database file,
        creating the cache table if it doesn't already exist.
        """
        self.database = database
        self.create_table()

    def create_table(self):
        # Idempotent: only creates the table when it is missing.
        if "cache" not in self._get_tables():
            self.query(self.SCHEMA)

    def _get_tables(self):
        """Returns the names of all tables present in the database."""
        q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
        tables = [row[0] for row in self.query(q)]
        return tables

    def query(self, query, args=[], commit=False):
        """Executes a query and returns the fetched rows.

        NOTE(review): a fresh connection is opened and closed on every
        call, so an in-memory database (":memory:") cannot work with
        this class -- each call would see an empty database. Confirm a
        file path is always passed.
        """
        logging.debug("query: %r - %r", query, args)
        conn = sqlite3.connect(self.database)
        cursor = conn.execute(query, args)
        rows = cursor.fetchall()
        if commit:
            conn.commit()
        cursor.close()
        conn.close()
        return rows

    def get(self, url):
        """Looks up the cache. Returns a Record on hit, None on miss."""
        rows = self.query("SELECT filename, offset, clen FROM cache WHERE url=?", [url])
        if rows:
            logging.info("cache hit - %s", url)
            filepath, offset, content_length = rows[0]
            # The Record re-opens the file lazily from filename/offset.
            return Record(filepath, offset=offset, content_length=content_length)
        else:
            logging.info("cache miss - %s", url)

    def set(self, url, record):
        """Puts a new entry in the cache.

        :param url: URL for which the response is being cached
        :param record: record to be cached
        """
        self.query("INSERT INTO cache (url, filename, offset, clen) VALUES (?, ?, ?, ?)",
                   [url, record.filename, record.offset, record.content_length],
                   commit=True)

class NoCache:
    """No-op cache used when caching is disabled."""
    def get(self, url):
        return None

    def set(self, url, record):
        pass

def create(type, config):
    """Factory: creates a cache instance of the given type ("redis",
    "sqlite" or "none"/None) using settings from the config object.
    """
    logging.info("creating cache %s", type)

    if type == 'redis':
        return RedisCache(host=config.redis_host,
                          port=config.redis_port,
                          db=config.redis_db,
                          expire_time=config.redis_expire_time,
                          max_record_size=config.redis_max_record_size)
    elif type == 'sqlite':
        return SqliteCache(config.sqlite_db)
    elif type == 'none' or type == None:
        return NoCache()

else: 131 | raise ValueError("Unknown cache type %r" % type) 132 | -------------------------------------------------------------------------------- /liveweb/cli.py: -------------------------------------------------------------------------------- 1 | """Command-line interface to the Liveweb Proxy. 2 | """ 3 | import sys 4 | import os 5 | from optparse import OptionParser, OptionGroup 6 | from .configutil import Config 7 | 8 | def make_config(): 9 | c = Config("liveweb") 10 | 11 | c.add_option("-c", "--config", 12 | help="specifies the liveweb-proxy config file") 13 | 14 | c.add_option("--archive-format", 15 | type="choice", 16 | choices=["none", "arc", "warc"], 17 | default="arc", 18 | help="specifies the archiving format") 19 | 20 | c.add_option("--http-passthrough", 21 | type="bool", 22 | default="false", 23 | help="enables the http-passthrough mode") 24 | 25 | c.add_option("--user-agent", 26 | default="ia_archiver(OS-Wayback)", 27 | help="the user-agent string used by liveweb-proxy") 28 | 29 | c.add_option("--uid", 30 | help="setuid to the specified user/uid") 31 | 32 | c.add_option("--gid", 33 | help="setgid to the specified group/gid") 34 | 35 | # server options 36 | c.add_option("-l", "--listen", 37 | metavar="IP_ADDRESS", 38 | default="127.0.0.1", 39 | help="the IP-address on which the liveweb-proxy will listen on (default: %default).") 40 | 41 | c.add_option("-p", "--port", 42 | type="int", 43 | default="7070", 44 | help="the port on which the liveweb-proxy will listen on (default: %default).") 45 | 46 | c.add_option("-w", "--workers", 47 | type="int", 48 | default="1", 49 | help="the number of worker processes (default: %default)") 50 | 51 | c.add_option("-t", "--threads", 52 | type="int", 53 | default="10", 54 | help="the number of threads/process (default: %default)") 55 | 56 | # storage options 57 | c.add_option("-o", "--output-directory", 58 | metavar="DIR", 59 | default="records", 60 | help="the directory to store the arc/warc files (default: 
%default)") 61 | 62 | c.add_option("--filename-pattern", 63 | type="string", 64 | default="live-%(timestamp)s-%(serial)05d.arc.gz", 65 | help="specifies the format of the filename to store the arc/warc files.") 66 | 67 | c.add_option("--num-writers", 68 | type="int", 69 | default="1", 70 | help="specifies the number of concurrent writers") 71 | 72 | c.add_option("--filesize-limit", 73 | type="bytes", 74 | default="100MB", 75 | help="specifies the recommended size limit for each file.") 76 | 77 | # timeouts and limits 78 | 79 | c.add_option("--default-timeout", 80 | type="time", 81 | default="10s", 82 | help="the default timeout value to use if a timeout option is not specified."), 83 | 84 | c.add_option("--dns-timeout", 85 | type="time", 86 | help="maximum allowed time for domain name resolution") 87 | 88 | c.add_option("--connect-timeout", 89 | type="time", 90 | help="maximum allowed time for establishing connection") 91 | 92 | c.add_option("--initial-data-timeout", 93 | type="time", 94 | help="maximum wait time to receive status and headers from the remove server") 95 | 96 | c.add_option("--read-timeout", 97 | type="time", 98 | help="the read timeout") 99 | 100 | c.add_option("--max-request-time", 101 | type="time", 102 | default="5m", 103 | help="the total amout of time a HTTP request can take") 104 | 105 | c.add_option("--max-response-size", 106 | type="bytes", 107 | default="100MB", 108 | help="the maximum allowed size of response") 109 | 110 | # cache options 111 | c.add_option("--cache", 112 | type="choice", 113 | choices=["none", "redis", "sqlite"], 114 | default="none", 115 | help="specifies the type of cache to use") 116 | 117 | c.add_option("--redis-host", 118 | type="string", 119 | default="localhost") 120 | 121 | c.add_option("--redis-port", 122 | type="int", 123 | default="6379") 124 | 125 | c.add_option("--redis-db", 126 | type="int", 127 | default="0") 128 | 129 | c.add_option("--redis-expire-time", 130 | type="time", 131 | default="1h") 132 | 133 | 
c.add_option("--redis-max-record-size", 134 | type="bytes", 135 | default="100KB") 136 | 137 | c.add_option("--sqlite-db", 138 | type="string", 139 | default="liveweb.db") 140 | 141 | return c 142 | 143 | def find_python_home(): 144 | # Python will be installed in bin/ or Scripts/ directory. Parent 145 | # of that will be the Python home. 146 | bindir = os.path.abspath(os.path.dirname(sys.executable)) 147 | home = os.path.dirname(bindir) 148 | return home 149 | 150 | def run_uwsgi(config): 151 | python_home = os.getenv("VIRTUAL_ENV") or find_python_home() 152 | bind = "%s:%s" % (config['listen'], config['port']) 153 | 154 | args = ["uwsgi", 155 | "--http", bind, 156 | "-Mi", # master, single-interpreter 157 | "--lazy", 158 | "--home", python_home, 159 | "--wsgi", "liveweb.main", 160 | "--processes", config['workers'], 161 | "--threads", config['threads'], 162 | "--listen", 1024, 163 | ] 164 | 165 | if config.get("config"): 166 | args.append("--pyargv") 167 | args.append("-c " + config['config']) 168 | 169 | if config['uid']: 170 | args.append("--uid", config['uid']) 171 | 172 | if config['gid']: 173 | args.append("--gid", config['gid']) 174 | 175 | dirname = os.path.abspath(os.path.dirname(sys.argv[0])) 176 | uwsgi_path = os.path.join(dirname, "uwsgi") 177 | 178 | os.execvp(uwsgi_path, [str(a) for a in args]) 179 | 180 | def set_dns_timeout(timeout): 181 | os.putenv("RES_OPTIONS", "timeout:%d attempts:1" % timeout) 182 | 183 | def main(): 184 | c = make_config() 185 | 186 | # load configuration from env, config file and command line arguments. 
187 | c.load() 188 | 189 | # update current env with new values so that the exec'ed process can take these settings 190 | c.putenv() 191 | 192 | set_dns_timeout(c.get('dns_timeout') or c.get('default_timeout')) 193 | 194 | # required for generating filenames 195 | os.putenv("LIVEWEB_PORT", str(c.get("port"))) 196 | 197 | run_uwsgi(c.dict()) 198 | 199 | if __name__ == "__main__": 200 | main() 201 | -------------------------------------------------------------------------------- /liveweb/config.py: -------------------------------------------------------------------------------- 1 | """Liveweb configuration. 2 | 3 | This is initialized by calling the load(configfile) function on startup. 4 | """ 5 | import yaml 6 | import os 7 | import logging 8 | from ConfigParser import ConfigParser 9 | 10 | from .cli import make_config 11 | 12 | # The config options and default values are specified in make_config function in cli.py 13 | 14 | def init_defaults(): 15 | global _config 16 | _config = make_config() 17 | globals().update(_config.dict()) 18 | 19 | def load(): 20 | """Loads the configuration from environment, config file and command-line arguments. 
21 | """ 22 | global _config 23 | _config = make_config() 24 | _config.load() 25 | globals().update(_config.dict()) 26 | 27 | extra_headers = {} 28 | 29 | def get_connect_timeout(): 30 | return connect_timeout or default_timeout 31 | 32 | def get_initial_data_timeout(): 33 | return initial_data_timeout or default_timeout 34 | 35 | def get_read_timeout(): 36 | return read_timeout or default_timeout 37 | 38 | def get_dns_timeout(): 39 | return dns_timeout or default_timeout 40 | 41 | # initialize the default configuration 42 | init_defaults() 43 | 44 | # handy function to check for existance of a config parameter 45 | get = globals().get 46 | -------------------------------------------------------------------------------- /liveweb/configutil.py: -------------------------------------------------------------------------------- 1 | """Utility to work with configuration. 2 | 3 | The configuration can be changed in many different ways. The configuration sources in the order of priority are: 4 | 5 | * The command-line arguments 6 | * The config file 7 | * The environment variables 8 | 9 | The optparse module alone is not sufficient to handle this. If the 10 | OptionParser is used with default values, there is no way to know if 11 | the option is specified as command line argument or it is a default 12 | value. 13 | 14 | Also, we need a way to specify time in seconds/minutes/hours and bytes 15 | in KB/MB/GB. These formats should work for all the 3 sources. 16 | 17 | This module provides a framework to address these issues. 18 | """ 19 | import os 20 | import sys 21 | import logging 22 | from ConfigParser import ConfigParser 23 | import optparse 24 | 25 | class Config: 26 | """Entry point to configuration. 27 | 28 | This provides way to set the configuration using environment 29 | variables, config file and command line parameters. 30 | 31 | The configuration is made of many ConfigOptions. 
Each ConfigOption 32 | accounts for one environment variable, one setting in the config 33 | file and one command line variable. 34 | 35 | This class provides tools to read/set environment variable, load 36 | config file and create OptionParser to parse the command-line 37 | arguments. 38 | """ 39 | def __init__(self, name): 40 | self.name = name 41 | self.config_options = [] 42 | 43 | def add_option(self, *args, **kwargs): 44 | """Creates a new ConfigOption created using the specified arguments and adds it to this Config. 45 | """ 46 | option = ConfigOption(*args, **kwargs) 47 | self.config_options.append(option) 48 | 49 | def get(self, name): 50 | return self.dict().get(name) 51 | 52 | def dict(self, dirty=None): 53 | """Returns values of all the config options as a dict. 54 | 55 | If dirty=True is specified, only the values of the modified options are returned. 56 | """ 57 | return dict((c.name, c.value) for c in self.config_options 58 | if dirty is None or c.dirty==dirty) 59 | 60 | def putenv(self): 61 | """Updates this process env with environment variables 62 | indicating current configuration. 63 | 64 | Useful to set config values before exec'ing a new process. 65 | """ 66 | for c in self.config_options: 67 | c.putenv() 68 | 69 | def load(self, env=None, args=None): 70 | """Loads the configuration from environment, config-file and command-line arguments. 71 | """ 72 | self.load_from_env(env) 73 | 74 | p = self.create_optparse_parser() 75 | options, args2 = p.parse_args(args) 76 | 77 | # take config file from command-line or env 78 | # TODO: support an option to provide an alternative name for "config" 79 | config_file = getattr(options, "config", None) or self.get("config") 80 | if config_file: 81 | self.load_from_ini(config_file) 82 | 83 | self.load_from_optparse_options(options) 84 | 85 | def load_from_env(self, env=None): 86 | """Loads the configuration from environment. 
87 | """ 88 | for c in self.config_options: 89 | c.load_from_env(env) 90 | 91 | def load_from_ini(self, filename): 92 | """Loads the configuration from a config file. 93 | """ 94 | p = ConfigParser() 95 | p.read(filename) 96 | 97 | for c in self.config_options: 98 | # using name as used in command line options in the config file 99 | name = c.name.replace("_", "-") 100 | if p.has_option(self.name, name): 101 | c.set(p.get(self.name, name, raw=True)) 102 | 103 | def create_optparse_parser(self): 104 | p = optparse.OptionParser(self.name) 105 | for c in self.config_options: 106 | p.add_option(c.option) 107 | return p 108 | 109 | def load_from_optparse_options(self, options): 110 | options = options.__dict__ 111 | for c in self.config_options: 112 | if options.get(c.name) is not None: 113 | c.set(options[c.name]) 114 | 115 | class ConfigOption: 116 | """Represents one entry in the Configuration. 117 | 118 | This corresponds to one environment variable, one config setting and one command line option. 119 | """ 120 | def __init__(self, *opts, **kw): 121 | self.type = kw.pop("type", "string") 122 | self.default = kw.pop("default", None) 123 | self.help = kw.pop("help", None) 124 | 125 | help = self.help and self.help.replace("%default", str(self.default)) 126 | 127 | if self.type == "bool": 128 | self.option = _Option(*opts, action="store_true", help=help, **kw) 129 | else: 130 | type = self.type 131 | self.option = _Option(*opts, type=type, help=help, **kw) 132 | 133 | self.name = self.option.dest 134 | self.kw = kw 135 | 136 | self.set(self.default) 137 | 138 | @property 139 | def dest(self): 140 | return self.option.dest 141 | 142 | @property 143 | def optname(self): 144 | return self.name.replace("_", "-") 145 | 146 | @property 147 | def envname(self): 148 | """Name of the enviroment variable to specify this. 
149 | """ 150 | return "LIVEWEB_" + self.name.upper() 151 | 152 | @property 153 | def dirty(self): 154 | """True if the value of this item is modified.""" 155 | return self.strvalue != self.default 156 | 157 | def set(self, value): 158 | if value is None: 159 | self.value = None 160 | self.strvalue = value 161 | else: 162 | self.strvalue = str(value) 163 | self.value = self.parse(value) 164 | 165 | def parse(self, value): 166 | if self.type == "bool": 167 | return self.parse_boolean(value) 168 | else: 169 | return self.option.convert_value("--" + self.optname, str(value)) 170 | 171 | def parse_boolean(self, strvalue): 172 | return str(strvalue).lower() in ["true", "1"] 173 | 174 | def load_from_env(self, env=None): 175 | if env is None: 176 | env = os.environ 177 | if self.envname in env: 178 | self.set(env[self.envname]) 179 | 180 | def putenv(self): 181 | if self.dirty or self.envname in os.environ: 182 | os.putenv(self.envname, self.strvalue) 183 | 184 | def add_option(self, option_parser): 185 | option_parser.add_option("--" + self.optname, help=self.help, **kw) 186 | 187 | def parse_time(strvalue): 188 | """Parses time specified in seconds, minutes and hours into seconds. 189 | 190 | Time is specifed in seconds, minutes and hours using suffix s, m 191 | and h respectively. This method parses that info and converts that 192 | with appropriate multipler to convert into seconds. 193 | """ 194 | if not isinstance(strvalue, basestring): 195 | return strvalue 196 | 197 | strvalue = strvalue.replace(" ", "") 198 | scales = { 199 | 's': 1, 200 | 'm': 60, 201 | 'h': 3600 202 | } 203 | 204 | if strvalue[-1] in scales.keys(): 205 | scale = scales[strvalue[-1]] 206 | strvalue = strvalue[:-1] 207 | else: 208 | scale = 1 209 | 210 | t = float(strvalue) * scale 211 | return t 212 | 213 | def parse_bytes(strvalue): 214 | """Parses the bytes specified as KB, MB and GB into number. 
215 | """ 216 | if not isinstance(strvalue, basestring): 217 | return strvalue 218 | 219 | strvalue = strvalue.replace(" ", "") 220 | scales = { 221 | "KB": 1024, 222 | "MB": 1024**2, 223 | "GB": 1024**3 224 | } 225 | if strvalue[-2:] in scales: 226 | scale = scales[strvalue[-2:]] 227 | strvalue = strvalue[:-2] 228 | else: 229 | scale = 1 230 | size = int(strvalue) * scale 231 | return size 232 | 233 | def wrap_checker(f): 234 | def g(option, opt, value): 235 | try: 236 | return f(value) 237 | except ValueError: 238 | what = option.type 239 | raise optparse.OptionValueError( 240 | "option %s: invalid %s value: %r" % (opt, what, value)) 241 | return g 242 | 243 | class _Option(optparse.Option): 244 | """Customized Option class to support time and bytes types. 245 | """ 246 | TYPES = optparse.Option.TYPES + ("time", "bytes") 247 | TYPE_CHECKER = dict(optparse.Option.TYPE_CHECKER, 248 | bytes=wrap_checker(parse_bytes), 249 | time=wrap_checker(parse_time)) 250 | 251 | -------------------------------------------------------------------------------- /liveweb/errors.py: -------------------------------------------------------------------------------- 1 | """Exceptions raised by liveweb internals. 2 | """ 3 | class LivewebException(Exception): 4 | pass 5 | 6 | class BadURL(LivewebException): 7 | """Raised if the given URL was malformed in some way. 8 | """ 9 | pass 10 | 11 | class ConnectionFailure(LivewebException, IOError): 12 | """Raised if a connection to the remote URL couldn't be established or was 13 | interrupted. 14 | """ 15 | pass 16 | 17 | class TimeoutError(LivewebException, IOError): 18 | """Raised if when a connection is timedout. 
19 | """ 20 | pass 21 | 22 | -------------------------------------------------------------------------------- /liveweb/file_pool.py: -------------------------------------------------------------------------------- 1 | """ 2 | File pool implementation 3 | """ 4 | 5 | import datetime 6 | import os 7 | import Queue 8 | import random 9 | import threading 10 | import socket 11 | import itertools 12 | 13 | import logging 14 | logging.basicConfig(level = logging.DEBUG) 15 | 16 | class MemberFile(object): 17 | """ 18 | """ 19 | def __init__(self, name, pool, *largs, **kargs): 20 | self.fp = open(name, *largs, **kargs) 21 | self.pool = pool 22 | 23 | def __enter__(self): 24 | return self 25 | 26 | def __exit__(self, exc_type, exc_value, traceback): 27 | self.pool.return_file(self) 28 | 29 | 30 | def __getattr__(self, attr): 31 | return getattr(self.fp, attr) 32 | 33 | 34 | class FilePool(object): 35 | """ 36 | Implements a pool of files from which a file can be requested. 37 | 38 | """ 39 | def __init__(self, directory, pattern="liveweb-%(timestamp)s-%(serial)05d.arc.gz", max_files=1, max_file_size=100*1024*1024, init_file_func=None): 40 | """ 41 | Creates a pool of files in the given directory with the 42 | specified pattern. 43 | 44 | The number of files is max_files and the maximum size of each 45 | file is max_file_size. 46 | 47 | The `get_file` method returns a new file from the pool 48 | 49 | """ 50 | self.directory = directory 51 | self.pattern = pattern 52 | self.max_files = max_files 53 | self.max_file_size = max_file_size 54 | self.init_file_func = init_file_func 55 | 56 | self.queue = Queue.Queue(self.max_files) 57 | 58 | self.seq_counter = itertools.count() 59 | 60 | # vars required to substitue filename pattern. 
        # vars required to substitute into the filename pattern
        self._port = os.getenv("LIVEWEB_PORT", "0")
        self._host = socket.gethostname()
        self._pid = os.getpid()

        # Adding None to queue indicating that new file needs to be created
        for i in range(self.max_files):
            self.queue.put(None)

    def set_sequence(self, counter):
        """Sets the sequence counter used to generate filename.

        Used to set a distributed persistent counter using redis/database.

        :param counter: An iterable counter
        """
        self.seq_counter = counter

    def _new_file(self):
        """Creates a new pool file in the "partial" subdirectory, named
        by filling this pool's pattern with timestamp/serial/host vars.
        """
        timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S%f")
        pattern_dict = dict(
            timestamp=timestamp,
            timestamp20=timestamp,
            timestamp17=timestamp[:17],
            timestamp14=timestamp[:14],
            serial=self.seq_counter.next(),
            port=self._port,
            host=self._host,
            # NOTE(review): fqdn is filled with gethostname(), which is not
            # always fully qualified -- confirm whether getfqdn() was intended.
            fqdn=self._host,
            pid=self._pid)

        fname = self.pattern%pattern_dict
        partial_dir = os.path.join(self.directory, 'partial')
        absolute_name = os.path.join(partial_dir, fname)

        logging.info("Creating new file %s", absolute_name)

        # open in append mode so an existing partial file is never clobbered
        fp = MemberFile(absolute_name, self, mode = "ab")
        # Initialize the file object like writing file headers etc.
        if self.init_file_func:
            self.init_file_func(fp)
        return fp

    def return_file(self, f):
        """Returns a file to the pool.
Will discard the file and 105 | insert a new one if the file is above max_file_size.""" 106 | logging.debug("Returning %s",f) 107 | file_size = f.tell() 108 | if file_size < self.max_file_size: 109 | logging.debug(" Put it back") 110 | self.queue.put(f) 111 | else: 112 | logging.debug(" Closing and creating a new file") 113 | f.close() 114 | complete_dir = os.path.join(self.directory, 'complete') 115 | basename = os.path.basename(f.name) 116 | complete_name = os.path.join(complete_dir, basename) 117 | os.rename(f.name, complete_name) 118 | self.queue.put(None) 119 | 120 | def get_file(self): 121 | f = self.queue.get() 122 | # f is None when new file needs to be created 123 | if f is None: 124 | f = self._new_file() 125 | logging.debug("Getting %s",f) 126 | return f 127 | 128 | def close(self): 129 | logging.debug("Closing all descriptors. Emptying pool.") 130 | while not self.queue.empty(): 131 | fp = self.queue.get_nowait() 132 | if fp: 133 | fp.close() 134 | -------------------------------------------------------------------------------- /liveweb/filetools.py: -------------------------------------------------------------------------------- 1 | from cStringIO import StringIO 2 | import httplib 3 | import tempfile 4 | import logging 5 | import os 6 | 7 | from . import config 8 | 9 | class SizeLimitExceeded(IOError): pass 10 | 11 | 12 | def spy(fileobj, spyobj = None, max_size = None): 13 | """Returns a new file wrapper the records the contents of a file 14 | as someone is reading from it. 15 | """ 16 | return SpyFile(fileobj, spyobj, max_size) 17 | 18 | class SpyFile: 19 | """File wrapper to record the contents of a file as someone is 20 | reading from it. 21 | 22 | If the "spy" parameter is passed, it will be the stream to which 23 | the read data is written. 

    SpyFile works like a "tee":

        Actual client <--- SpyFile <--- Data Source
                              |
                              v
                             spy
                         (spy object)
    """
    def _check_size(self):
        """Raises SizeLimitExceeded if the SpyFile has seen more data
        than the specified limit"""
        if self.max_size:
            if self.current_size > int(self.max_size):
                raise SizeLimitExceeded("Spy file limit exceeded %d (max size : %d)"%(self.current_size, self.max_size))

    def __init__(self, fileobj, spy = None, max_size = None):
        self.fileobj = fileobj        # the wrapped data source
        self.buf = spy or StringIO()  # stream that receives a copy of all reads
        self.max_size = max_size      # optional cap on total bytes read
        self.current_size = 0         # bytes seen so far

    def read(self, *a, **kw):
        # read from the source, mirror into the spy, enforce the size cap
        text = self.fileobj.read(*a, **kw)
        self.buf.write(text)
        self.current_size += len(text)
        self._check_size()
        return text

    def readline(self, *a, **kw):
        # line-oriented variant of read(); same mirroring and size check
        text = self.fileobj.readline(*a, **kw)
        self.buf.write(text)
        self.current_size += len(text)
        self._check_size()
        return text

    def readlines(self):
        return list(self)

    def __iter__(self):
        # iterate line by line until the source is exhausted
        while True:
            line = self.readline()
            if not line:
                break
            yield line

    def close(self):
        # NOTE: closes only the source; the spy stream is left open
        self.fileobj.close()

    def change_spy(self, fileobj):
        "Changes the file which receives the spied upon data to fileobj"
        self.buf.flush()
        self.buf.close()
        self.buf = fileobj


class SpyHTTPResponse(httplib.HTTPResponse):
    """HTTPResponse that records the raw HTTP payload as it is read."""
    def __init__(self, *a, **kw):
        httplib.HTTPResponse.__init__(self, *a, **kw)
        from . import config
        # NOTE(review): the config (see cli.py) defines "max_response_size",
        # not "max_payload_size" -- this attribute lookup looks like it would
        # raise AttributeError. Confirm which setting is intended.
        self.fp = spy(self.fp, None, config.max_payload_size)


class MemFile:
    """Something like StringIO, but switches to a temp file when the maxsize is crossed.
94 | """ 95 | def __init__(self, maxsize=1024*1024, tmpdir=None, prefix="memfile-", suffix=".tmp"): 96 | self.maxsize = maxsize 97 | 98 | self.tmpdir = tmpdir 99 | self.prefix = prefix 100 | self.suffix = suffix 101 | 102 | self._fileobj = StringIO() 103 | 104 | def in_memory(self): 105 | """Returns True if the file is in memory.""" 106 | return not isinstance(self._fileobj, file) 107 | 108 | def __getattr__(self, name): 109 | return getattr(self._fileobj, name) 110 | 111 | def _open_tmpfile(self): 112 | # The TemporaryFile gets deleted automatically when it is closed or when it is garbage collected. 113 | return tempfile.TemporaryFile(dir=self.tmpdir, prefix=self.prefix, suffix=self.suffix) 114 | 115 | def _switch_to_disk(self): 116 | content = self._fileobj.getvalue() 117 | self._fileobj = self._open_tmpfile() 118 | self._fileobj.write(content) 119 | 120 | def write(self, data): 121 | if self.in_memory() and self.tell() + len(data) > self.maxsize: 122 | self._switch_to_disk() 123 | self._fileobj.write(data) 124 | 125 | def writelines(self, lines): 126 | for line in lines: 127 | self.write(line) 128 | 129 | def close(self): 130 | """Deletes the temp file if created. 131 | """ 132 | if self._fileobj and not self.in_memory(): 133 | logging.info("removing temp file %s", self._fileobj.name) 134 | os.unlink(self._fileobj.name) 135 | 136 | class DummyFilePool: 137 | """Simple implementation of FilePool. 138 | """ 139 | counter = 0 140 | 141 | def get_file(self): 142 | filename = "/tmp/record-%d.arc.gz" % self.counter 143 | while os.path.exists(filename): 144 | self.counter += 1 145 | filename = "/tmp/record-%d.arc.gz" % self.counter 146 | return open(filename, "w") 147 | 148 | def fileiter(file, size, chunk_size=1024*10): 149 | """Returns an iterator over the file for specified size. 150 | 151 | The chunk_size specified the amount of data read in each step. 
152 | """ 153 | completed = 0 154 | while completed < size: 155 | nbytes = min(size-completed, chunk_size) 156 | content = file.read(nbytes) 157 | if not content: 158 | break 159 | yield content 160 | completed += len(content) 161 | 162 | def test(): 163 | import httplib 164 | conn = httplib.HTTPConnection("openlibrary.org") 165 | conn.response_class = SpyHTTPResponse 166 | 167 | conn.request("GET", "/") 168 | res = conn.getresponse() 169 | fp = res.fp 170 | 171 | print fp.buf.getvalue() 172 | 173 | res.read() 174 | print fp.buf.getvalue() 175 | 176 | if __name__ == "__main__": 177 | test() 178 | -------------------------------------------------------------------------------- /liveweb/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | 5 | logging.basicConfig(level=logging.INFO, 6 | format="%(asctime)s %(threadName)18s %(levelname)5s: %(message)s", 7 | datefmt="%Y-%m-%d %H:%M:%S") 8 | 9 | logging.info("starting liveweb-proxy") 10 | 11 | from . import config 12 | 13 | # load config 14 | config.load() 15 | 16 | # Make sure the storage directory exists 17 | partial_dir = os.path.join(config.output_directory, 'partial') 18 | complete_dir = os.path.join(config.output_directory, 'complete') 19 | if not os.path.exists(partial_dir): 20 | os.makedirs(partial_dir) 21 | if not os.path.exists(complete_dir): 22 | os.makedirs(complete_dir) 23 | 24 | from . import webapp 25 | 26 | # Intialize 27 | webapp.setup() 28 | 29 | application = webapp.application 30 | -------------------------------------------------------------------------------- /liveweb/proxy.py: -------------------------------------------------------------------------------- 1 | """The proxy functionality. 
2 | """ 3 | 4 | import datetime 5 | import gzip 6 | import httplib 7 | import logging 8 | import os 9 | import socket 10 | import urllib 11 | from cStringIO import StringIO 12 | import tempfile 13 | import sys 14 | import errno 15 | import time 16 | 17 | from warc import arc 18 | from warc.utils import FilePart 19 | from . import filetools 20 | from . import config 21 | 22 | MEG = 1024 * 1024 23 | 24 | EMPTY_BUFFER = filetools.MemFile() 25 | 26 | # 1x - bad input 27 | ERR_INVALID_URL = 10, "invalid URL" 28 | 29 | # 2x - DNS errors 30 | ERR_INVALID_DOMAIN = 20, "invalid domain" 31 | ERR_DNS_TIMEOUT = 21, "dns timeout" 32 | 33 | # 3x - connction errors 34 | ERR_CONN_REFUSED = 30, "connection refused" 35 | ERR_CONN_TIMEOUT = 31, "connection timedout" 36 | ERR_INITIAL_DATA_TIMEOUT = 32, "initial data timeout" 37 | ERR_READ_TIMEOUT = 33, "read timeout" 38 | ERR_CONN_DROPPED = 34, "connection dropped" 39 | ERR_CONN_MISC = 39, "unexpected connection error" 40 | 41 | # 4x - resource errors 42 | ERR_RESPONSE_TOO_BIG = 40, "response too big" 43 | ERR_REQUEST_TIMEOUT = 41, "request took too long to finish" 44 | 45 | 46 | class ProxyError(Exception): 47 | def __init__(self, error, cause=None, data=None): 48 | self.errcode, self.errmsg = error 49 | 50 | if isinstance(cause, socket.error) and cause.errno: 51 | cause_msg = "%s: %s" % (errno.errorcode.get(cause.errno, cause.errno), cause.strerror) 52 | else: 53 | cause_msg = cause and ("%s: %s" % (cause.__class__.__name__, str(cause))) 54 | 55 | msg = "E%02d: %s" % (self.errcode, self.errmsg) 56 | 57 | if cause_msg: 58 | msg += " (" + cause_msg + ")" 59 | 60 | if data: 61 | msg += " %s" % data 62 | 63 | Exception.__init__(self, msg) 64 | 65 | class Record: 66 | """A class to hold together the filepath, content_length, and iterator over content. 67 | """ 68 | def __init__(self, filename, offset=0, content_length=None, content_iter=None): 69 | """Creates a new record instance. 
70 | 71 | :param filename: Relative or absolute path to the file that has this record. 72 | :param offset: The offset in the file where this record started. 73 | :param content_length: The total length of the record. 74 | :param content_iter: An iterator over content. 75 | """ 76 | self.filename = filename 77 | self.offset = offset 78 | self.content_length = content_length 79 | self.content_iter = content_iter 80 | 81 | if self.content_length is None: 82 | self.content_length = os.stat(filename).st_size 83 | 84 | if self.content_iter is None: 85 | f = open(self.filename, 'rb') 86 | f.seek(self.offset) 87 | self.content_iter = filetools.fileiter(f, self.content_length) 88 | 89 | def read_all(self): 90 | """Reads all the data from content_iter and reinitializes the 91 | content_iter with the data read. 92 | 93 | Since this reads all the data into memory, this should be used 94 | only when content_length is not very big. 95 | """ 96 | data = "".join(self.content_iter) 97 | self.content_iter = iter([data]) 98 | return data 99 | 100 | def __iter__(self): 101 | return iter(self.content_iter) 102 | 103 | def split_type_host(url): 104 | """Returns (type, host, selector) from the url. 105 | """ 106 | type, rest = urllib.splittype(url) 107 | host, selector = urllib.splithost(rest) 108 | return type, host, selector 109 | 110 | 111 | def log_error(err): 112 | code, msg = err 113 | exc_type, exc_value, _ = sys.exc_info() 114 | logging.error("E%02d - %s (%s)", code, msg, str(exc_value)) 115 | 116 | def urlopen(url): 117 | """Works like urllib.urlopen, but returns a ProxyHTTPResponse object instead. 118 | """ 119 | logging.info("urlopen %s", url) 120 | 121 | try: 122 | return _urlopen(url) 123 | except ProxyError, e: 124 | logging.error("%s - %s", str(e), url) 125 | response = ProxyHTTPResponse(url, None, method="GET") 126 | response.error_bad_gateway() 127 | return response 128 | 129 | def _urlopen(url): 130 | """urlopen without the exception handling. 

    Called by urlopen and test cases.
    """
    headers = config.get("extra_headers",{})
    headers['User-Agent'] = config.user_agent

    type, host, selector = split_type_host(url)

    if type.lower() == "https":
        conn = ProxyHTTPSConnection(host, url=url)
    else:
        conn = ProxyHTTPConnection(host, url=url)

    conn.request("GET", selector, headers=headers)
    return conn.getresponse()

class _FakeSocket:
    """Faking a socket with makefile method.

    Used when synthesizing error responses with no real network connection
    (see the except branch of urlopen).
    """
    def __init__(self, fileobj=None):
        self.fileobj = fileobj or StringIO()

    def makefile(self, mode="rb", bufsize=0):
        return self.fileobj

    def getpeername(self):
        # Dummy address; recorded as the remote IP of synthesized responses.
        return ("0.0.0.0", 80)

    def settimeout(self, timeout):
        pass

class SocketWrapper:
    """The socket.socket class doesn't have a way to enforce max-time and max-size limits.

    This extends the socket functionality by adding those constraints.
    """
    def __init__(self, sock, max_time=None, max_size=None):
        # max_time: seconds allowed for the whole request; max_size: total
        # bytes allowed across all recv calls. None disables the check.
        self._sock = sock
        self._max_time = max_time
        self._max_size = max_size

        self._start_time = time.time()
        self._bytes_read = 0

    def __getattr__(self, name):
        # Delegate everything else to the wrapped socket.
        return getattr(self._sock, name)

    def recv(self, bufsize):
        # Receives data, raising ProxyError once either limit is exceeded.
        data = self._sock.recv(bufsize)
        self._bytes_read += len(data)

        # TODO: optimize this
        # Each time.time() call takes about 0.35 ns.
        # For reading headers, this function is called once of each byte.
        # Assuming that the headers is 1000 bytes long, it will add an overhead of 0.35ms.
        # We should optimize this if we care about half-a-milli-second.

        if self._max_time is not None and time.time() - self._start_time > self._max_time:
            raise ProxyError(ERR_REQUEST_TIMEOUT, data={"max_time": self._max_time})

        if self._max_size is not None and self._bytes_read > self._max_size:
            raise ProxyError(ERR_RESPONSE_TOO_BIG, data={"max_size": self._max_size})

        return data

    def makefile(self, mode='r', bufsize=-1):
        # Route file-style reads through self so recv() limits still apply.
        return socket._fileobject(self, mode, bufsize)

class ProxyHTTPResponse(httplib.HTTPResponse):
    """HTTPResponse wrapper to record the HTTP payload.

    Provides utility methods to write ARC and WARC files.
    """
    # Content type recorded when none is available.
    DEFAULT_CONTENT_TYPE = "unk"

    def __init__(self, url, sock, *a, **kw):
        # sock may be None for synthesized error responses.
        self.sock = sock or _FakeSocket()
        httplib.HTTPResponse.__init__(self, self.sock, *a, **kw)

        self.url = url
        self.remoteip = self.sock.getpeername()[0]
        self.content_type = self.DEFAULT_CONTENT_TYPE
        self.buf = EMPTY_BUFFER

        # Length of header data
        self.header_offset = 0

        self.arc_size = None
        self.arc_data = None

    def begin(self):
        # Wrap fp so every byte read is also copied ("spied") into self.buf,
        # preserving the verbatim HTTP payload for archiving.
        self.fp = filetools.SpyFile(self.fp, spy=filetools.MemFile())
        self.buf = self.fp.buf

        try:
            self.sock.settimeout(config.get_initial_data_timeout())
            httplib.HTTPResponse.begin(self)

            ctype = self.getheader("content-type", self.DEFAULT_CONTENT_TYPE)
            self.content_type = self.parse_content_type(ctype)
            # buf now holds exactly the header bytes; remember where they end.
            self.header_offset = self.buf.tell()
        except socket.error, e:
            raise ProxyError(ERR_INITIAL_DATA_TIMEOUT, e, {"initial_data_timeout": config.get_initial_data_timeout()})
        except httplib.HTTPException, e:
            raise ProxyError(ERR_CONN_MISC, e)

        try:
            # This will read the whole payload, taking care of content-length,
            # chunked transfer-encoding etc.. The spy file will record the real
            # HTTP payload.
241 | self.sock.settimeout(config.get_read_timeout()) 242 | self.read() 243 | except httplib.IncompleteRead, e: 244 | raise ProxyError(ERR_CONN_DROPPED, e) 245 | except httplib.HTTPException: 246 | raise ProxyError(ERR_CONN_MISC, e) 247 | except socket.error, e: 248 | raise ProxyError(ERR_READ_TIMEOUT, e, data={"read_timeout": config.get_read_timeout()}) 249 | 250 | def parse_content_type(self, ctype): 251 | # If there are multiple content-type headers, httplib joins them using ", " 252 | # Take the last one in that case 253 | ctype = ctype.split(",")[-1] 254 | 255 | # content-type can have parameters separated by semicolon. 256 | # For example: text/html; charset=UTF-8 257 | ctype = ctype.split(";")[0] 258 | 259 | # strip leading and trailing whitespace 260 | ctype = ctype.strip() 261 | 262 | # remove any whitespace as it may interfere with arc header 263 | ctype = ctype.replace(" ", "") 264 | return ctype 265 | 266 | def error_bad_gateway(self): 267 | """Resets the status code to "502 Bad Gateway" indicating that there was 268 | some network error when trying to accessing the server. 269 | """ 270 | self._error(502, "Bad Gateway") 271 | 272 | def error_bad_url(self): 273 | """Resets the status code to "400 Bad Request" indicating that the URL provided is bad. 
274 | """ 275 | self._error(400, "Bad Request") 276 | 277 | def _error(self, status, reason): 278 | self.version = "HTTP/1.1" 279 | self.status = status 280 | self.reason = reason 281 | self.content_type = self.DEFAULT_CONTENT_TYPE 282 | 283 | # close file 284 | if self.fp: 285 | self.fp.close() 286 | self.fp = None 287 | 288 | self.buf = EMPTY_BUFFER 289 | self.header_offset = 0 290 | 291 | def write_arc(self, pool): 292 | record = self._make_arc_record() 293 | 294 | # if small enough, store in memory 295 | if record.header.length < MEG: 296 | # write ARC record into memory 297 | buf = StringIO() 298 | begin, record_size = self._write_arc_record(record, buf) 299 | 300 | # write the ARC record data in memory into file 301 | with pool.get_file() as f: 302 | logging.info("writing arc record to file %s", f.name) 303 | begin = f.tell() 304 | f.write(buf.getvalue()) 305 | filename = f.name 306 | 307 | return Record(filename, offset=begin, content_length=record_size, content_iter=iter([buf.getvalue()])) 308 | else: 309 | with pool.get_file() as f: 310 | logging.info("writing arc record to file %s", f.name) 311 | filename = f.name 312 | begin, record_size = self._write_arc_record(record, f) 313 | 314 | return Record(filename, offset=begin, content_length=record_size) 315 | 316 | def _write_arc_record(self, record, fileobj): 317 | """Writes the give ARC record into the given fileobj as gzip data and returns the start offset in the file and and record size. 318 | """ 319 | begin = fileobj.tell() 320 | 321 | zfile = gzip.GzipFile(fileobj=fileobj, filename=None, mode="w") 322 | record.write_to(zfile) 323 | zfile.close() 324 | fileobj.flush() 325 | 326 | end = fileobj.tell() 327 | return begin, end-begin 328 | 329 | def _make_arc_record(self): 330 | if self.status == 502: 331 | # Match the response of liveweb 1.0 incase of gateway errors. 
332 | payload = "HTTP 502 Bad Gateway\n\n" 333 | payload_length = len(payload) 334 | content_type = "unk" 335 | remoteip = "0.0.0.0" 336 | else: 337 | # We've finished writing to the buf. The file-pointer will be at 338 | # the end of the file. Calling tell should give the file size. 339 | payload_length = self.buf.tell() 340 | 341 | # move the file pointer to the beginning of the file, so that we can read 342 | self.buf.seek(0) 343 | payload = self.buf 344 | remoteip = self.remoteip 345 | content_type = self.content_type 346 | 347 | headers = dict(url = self.url, 348 | date = self._utcnow(), 349 | content_type = self.content_type, 350 | ip_address = self.remoteip, 351 | length = payload_length) 352 | return arc.ARCRecord(headers = headers, 353 | payload = payload, 354 | version = 1) 355 | 356 | def _utcnow(self): 357 | """Returns datetime.datetime.utcnow(). 358 | 359 | Provided as a method here so that it is easy to monkeypatch for testing. 360 | """ 361 | return datetime.datetime.utcnow() 362 | 363 | def write_warc(self, pool): 364 | raise NotImplementedError() 365 | 366 | def get_arc(self): 367 | """Returns size and fileobj to read arc data. 368 | 369 | This must be called only after calling write_arc method. 370 | """ 371 | if self.arc_data is None: 372 | self.write_arc() 373 | return self.arc_size, self.arc_data 374 | 375 | def get_warc(self): 376 | """Returns size and fileobj to read warc data.""" 377 | raise NotImplementedError() 378 | 379 | def get_payload(self): 380 | """Returns size and fileobj to read HTTP payload. 381 | """ 382 | # go to the end find the filesize 383 | self.buf.seek(0, 2) 384 | size = self.buf.tell() - self.header_offset 385 | 386 | # go to the beginning 387 | self.buf.seek(self.header_offset) 388 | return filetools.fileiter(self.buf, size) 389 | 390 | class ProxyConnectionMixin: 391 | """Mixin to add extra functionality to HTTP/HTTPS connection to handle errors differently. 
392 | """ 393 | _base_connection_class = httplib.HTTPConnection 394 | _proxy_response_class = ProxyHTTPResponse 395 | 396 | def __init__(self, host, url): 397 | try: 398 | self._base_connection_class.__init__(self, host) 399 | except httplib.InvalidURL, e: 400 | raise ProxyError(ERR_INVALID_URL, e) 401 | 402 | self.url = url 403 | self.response_class = lambda *a, **kw: self._proxy_response_class(self.url, *a, **kw) 404 | 405 | # This is used when creating the socket connection 406 | self.timeout = config.get_connect_timeout() 407 | 408 | def connect(self): 409 | try: 410 | self._base_connection_class.connect(self) 411 | self.sock = SocketWrapper(self.sock, config.max_request_time, config.max_response_size) 412 | except socket.gaierror, e: 413 | # -3: Temporary failure in name resolution 414 | # Happens when DNS request is timeout 415 | if e.errno == -3: 416 | raise ProxyError(ERR_DNS_TIMEOUT, e, data={"dns_timeout": config.get_dns_timeout()}) 417 | else: 418 | raise ProxyError(ERR_INVALID_DOMAIN, e) 419 | except socket.timeout, e: 420 | raise ProxyError(ERR_CONN_TIMEOUT, e, data={"conn_timeout": config.get_connect_timeout()}) 421 | except socket.error, e: 422 | msg = e.strerror or "" 423 | if e.errno == errno.ECONNREFUSED: 424 | raise ProxyError(ERR_CONN_REFUSED, e) 425 | else: 426 | raise ProxyError(ERR_CONN_MISC, e) 427 | return self.sock 428 | 429 | def request(self, method, url, body=None, headers={}): 430 | try: 431 | self._base_connection_class.request(self, method, url, body=body, headers=headers) 432 | except socket.error, e: 433 | raise ProxyError(ERR_CONN_MISC, e) 434 | 435 | 436 | class ProxyHTTPConnection(ProxyConnectionMixin, httplib.HTTPConnection): 437 | """HTTPConnection wrapper to add extra hooks to handle errors. 438 | """ 439 | _base_connection_class = httplib.HTTPConnection 440 | 441 | class ProxyHTTPSConnection(ProxyConnectionMixin, httplib.HTTPSConnection): 442 | """HTTPSConnection wrapper to add extra hooks to handle errors. 
443 | """ 444 | _base_connection_class = httplib.HTTPSConnection 445 | -------------------------------------------------------------------------------- /liveweb/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/liveweb/daf121365b959477e3b90ae07bcd72959f5be856/liveweb/tests/__init__.py -------------------------------------------------------------------------------- /liveweb/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import urllib 4 | import subprocess 5 | import time 6 | 7 | def pytest_funcarg__pooldir(request): 8 | "Creates a directory for the pool" 9 | dirname = "/tmp/pool-xxx" 10 | 11 | if os.path.exists(dirname): 12 | shutil.rmtree(dirname) 13 | 14 | os.makedirs(os.path.join(dirname, 'partial')) 15 | os.makedirs(os.path.join(dirname, 'complete')) 16 | 17 | request.addfinalizer(lambda : shutil.rmtree(dirname)) 18 | 19 | return dirname 20 | 21 | 22 | def pytest_funcarg__webtest(request): 23 | dirname = os.path.dirname(__file__) 24 | path = os.path.join(dirname, "webtest.py") 25 | port = 9876 26 | 27 | p = subprocess.Popen(['python', path, str(port)]) 28 | request.addfinalizer(p.kill) 29 | 30 | # Return an object with url and port atrributes 31 | x = lambda: None 32 | x.url = "http://127.0.0.1:%d" % port 33 | x.port = port 34 | 35 | # wait until the server is ready, with max-tries=20 36 | for i in range(20): 37 | print i 38 | try: 39 | urllib.urlopen(x.url + "/") 40 | break 41 | except IOError: 42 | time.sleep(0.1) 43 | 44 | return x 45 | -------------------------------------------------------------------------------- /liveweb/tests/test_configutil.py: -------------------------------------------------------------------------------- 1 | from ..configutil import ConfigOption, Config, parse_bytes, parse_time 2 | 3 | class TestConfigOption: 4 | def test_add_option(self): 5 | opt 
= ConfigOption("--threads", help="Number of threads") 6 | assert opt.dest == "threads" 7 | 8 | def test_help(self): 9 | opt = ConfigOption("--threads", help="Number of threads") 10 | assert opt.option.help == "Number of threads" 11 | 12 | opt = ConfigOption("--threads", default="10", help="Number of threads (default: %default)") 13 | assert opt.option.help == "Number of threads (default: 10)" 14 | 15 | def test_default(self): 16 | c = ConfigOption("--foo") 17 | assert c.value == None 18 | 19 | c = ConfigOption("--foo", default="foo-default") 20 | assert c.value == "foo-default" 21 | 22 | def test_from_env(self): 23 | c = ConfigOption("--foo", default="foo-default") 24 | c.load_from_env({}) 25 | assert c.value == "foo-default" 26 | 27 | c.load_from_env({"LIVEWEB_FOO": "foo-env"}) 28 | assert c.value == "foo-env" 29 | 30 | def test_putenv(self, monkeypatch): 31 | import os 32 | 33 | environ = {} 34 | monkeypatch.setattr(os, "environ", environ) 35 | monkeypatch.setattr(os, "getenv", environ.__getitem__) 36 | monkeypatch.setattr(os, "putenv", environ.__setitem__) 37 | 38 | c = ConfigOption("--foo", default="foo-default") 39 | 40 | c.putenv() 41 | assert environ == {} 42 | 43 | c.set("new-value") 44 | c.putenv() 45 | assert environ == {"LIVEWEB_FOO": "new-value"} 46 | 47 | def test_parse_bytes(): 48 | assert parse_bytes("5") == 5 49 | assert parse_bytes("5KB") == 5 * 1024 50 | assert parse_bytes("5MB") == 5 * 1024 * 1024 51 | assert parse_bytes("5GB") == 5 * 1024 * 1024 * 1024 52 | 53 | def test_parse_time(): 54 | assert parse_time("5") == 5.0 55 | assert parse_time("5s") == 5.0 56 | assert parse_time("5m") == 5.0 * 60 57 | assert parse_time("5h") == 5.0 * 3600 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /liveweb/tests/test_filepool.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | def test_creation(pooldir): 5 | """ 6 | Tests to see if the 
file pool is created properly. 7 | 8 | Creates a pool and then walks through the pool directory to see if 9 | the expected files are there. 10 | """ 11 | from ..file_pool import FilePool 12 | 13 | # Create the pool 14 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 15 | 16 | # call pool.get_file() so that files are initialized 17 | for i in range(pool.max_files): 18 | pool.get_file() 19 | 20 | # Get files in pool directory. 21 | pool_files = set(glob.glob(pooldir + "/partial/*")) 22 | 23 | # Check if this is the same as what we expect 24 | expected_files = set(["%s/partial/test-%05d"%(pooldir,x) for x in range(0,10)]) 25 | 26 | assert expected_files == pool_files 27 | 28 | def test_get_return(pooldir): 29 | """ 30 | Tests to see if get_file return_file work as expected. 31 | 32 | gets files, checks the number of files in the queue, returns them 33 | and checks again. 34 | """ 35 | 36 | from ..file_pool import FilePool 37 | 38 | # Create the pool 39 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 40 | 41 | fps = [] 42 | 43 | assert len(pool.queue.queue) == 10 44 | 45 | for i in range(1, 6): 46 | fps.append(pool.get_file()) 47 | assert len(pool.queue.queue) == 10 - i 48 | 49 | assert len(pool.queue.queue) == 5 50 | 51 | for i in range(1, 6): 52 | pool.return_file(fps.pop()) 53 | assert len(pool.queue.queue) == 5 + i 54 | 55 | assert len(pool.queue.queue) == 10 56 | 57 | def test_max_file_size(pooldir): 58 | """ 59 | Tests to see if the files are closed and released when the maximum 60 | file size is reached and the file is returned. 61 | """ 62 | from ..file_pool import FilePool 63 | 64 | # Create the pool 65 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 66 | 67 | fp = pool.get_file() 68 | fp.write("test" * 100) # Max size has been exceeded. File should 69 | pool.return_file(fp) # get removed from pool when returned. 
70 | 71 | # queue should have all Nones now. 72 | assert list(pool.queue.queue) == [None] * 10 73 | 74 | complete_files = set(glob.glob(pooldir + "/complete/*")) 75 | expected_complete_files = set(("%s/complete/test-%05d"%(pooldir,0),)) 76 | assert expected_complete_files == complete_files 77 | 78 | 79 | def test_close_pool(pooldir): 80 | """ 81 | Makes sure that the pool is emptied when closed. 82 | 83 | """ 84 | from ..file_pool import FilePool 85 | 86 | # Create the pool 87 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 88 | 89 | pool.close() 90 | 91 | assert len(pool.queue.queue) == 0 92 | 93 | 94 | def test_member_file_context(pooldir): 95 | """ 96 | Tests the context manager behaviour of the MemberFile object. 97 | """ 98 | 99 | from ..file_pool import FilePool 100 | 101 | # Create the pool 102 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 103 | 104 | assert len(pool.queue.queue) == 10 105 | 106 | with pool.get_file() as f: 107 | assert len(pool.queue.queue) == 9 108 | name = f.name 109 | f.write("Hello") 110 | 111 | assert len(pool.queue.queue) == 10 112 | pool.close() 113 | 114 | assert open(name).read() == "Hello" 115 | -------------------------------------------------------------------------------- /liveweb/tests/test_filetools.py: -------------------------------------------------------------------------------- 1 | from .. 
import filetools 2 | from cStringIO import StringIO 3 | 4 | class TestMemFile: 5 | def test_nodata(self): 6 | f = filetools.MemFile() 7 | assert f.read() == "" 8 | assert f.readline() == "" 9 | 10 | def test_mem(self): 11 | f = filetools.MemFile(100) 12 | f.write("helloworld") 13 | assert f.tell() == 10 14 | f.seek(0) 15 | assert f.read() == "helloworld" 16 | 17 | def test_readlines(self): 18 | f = filetools.MemFile(100) 19 | f.write("a\nb\nc\n") 20 | f.seek(0) 21 | assert f.readline() == "a\n" 22 | assert f.readline() == "b\n" 23 | assert f.readline() == "c\n" 24 | assert f.readline() == "" 25 | 26 | def test_mem(self): 27 | f = filetools.MemFile(100) 28 | f.write("helloworld" * 10) 29 | assert f.tell() == 100 30 | assert f.in_memory() is True 31 | 32 | f.write("helloworld") 33 | assert f.in_memory() is False 34 | assert f.name is not None 35 | 36 | f.seek(0) 37 | content = f.read() 38 | assert len(content) == 110 39 | assert content == "helloworld" * 11 40 | 41 | 42 | def test_fileiter(): 43 | f = StringIO("helloworld" * 15) 44 | # there are 15 "helloworld"s and we are asking it to read 4 of them. 45 | assert list(filetools.fileiter(f, 40, chunk_size=10)) == ["helloworld"] * 4 46 | 47 | # case where the size is not multiple of chunk_size 48 | assert list(filetools.fileiter(f, 38, chunk_size=10)) == ["helloworld", "helloworld", "helloworld", "hellowor"] 49 | 50 | # what if we ask for more data than we have? 51 | f = StringIO("helloworld" + "helloworld" + "end") 52 | assert list(filetools.fileiter(f, 40, chunk_size=10)) == ["helloworld", "helloworld", "end"] 53 | -------------------------------------------------------------------------------- /liveweb/tests/test_proxy.py: -------------------------------------------------------------------------------- 1 | from .. 
import proxy, config 2 | 3 | from cStringIO import StringIO 4 | import datetime 5 | import subprocess 6 | import os 7 | import urllib 8 | import time 9 | 10 | import pytest 11 | 12 | class TestRecord: 13 | def test_read_all(self): 14 | content = "helloworld" * 100 15 | record = proxy.Record(None, 0, len(content), iter([content])) 16 | 17 | # read_all should return all the content 18 | assert record.read_all() == content 19 | 20 | # after calling read_all, the content should still be available 21 | assert record.read_all() == content 22 | assert record.read_all() == content 23 | 24 | # after read_all, the content_iter should still have the content 25 | assert "".join(record.content_iter) == content 26 | 27 | def test_init(self, tmpdir): 28 | path = tmpdir.join("foo.txt") 29 | path.write("helloworld" * 100) 30 | 31 | record = proxy.Record(path.strpath, 0, 1000, None) 32 | assert record.read_all() == "helloworld" * 100 33 | 34 | record = proxy.Record(path.strpath, 800, 100, None) 35 | assert record.read_all() == "helloworld" * 10 36 | 37 | 38 | def test_split_type_host(): 39 | assert proxy.split_type_host("http://www.archive.org/details/test") == ("http", "www.archive.org", "/details/test") 40 | 41 | 42 | def test_FakeSocket(): 43 | s = proxy._FakeSocket() 44 | assert s.makefile(mode="rb").read() == "" 45 | 46 | s = proxy._FakeSocket(StringIO("helloworld")) 47 | assert s.makefile(mode="rb").read() == "helloworld" 48 | 49 | 50 | SAMPLE_RESPONSE = """\ 51 | HTTP/1.1 200 OK 52 | Content-type: text/plain 53 | Server: Apache/2.2.20 54 | Content-Length: 10 55 | 56 | helloworld""".replace("\n", "\r\n") 57 | 58 | SAMPLE_RESPONSE_CHUNKED = """\ 59 | HTTP/1.1 200 OK 60 | Content-Type: text/plain 61 | Server: Apache/2.2.20 62 | Transfer-Encoding: chunked 63 | 64 | 5 65 | hello 66 | 5 67 | world 68 | 0 69 | """.replace("\n", "\r\n") 70 | 71 | class TestProxyResponse: 72 | def make_response(self, content): 73 | sock = proxy._FakeSocket(StringIO(content)) 74 | response = 
proxy.ProxyHTTPResponse("http://example.com/hello", sock) 75 | response.begin() 76 | return response 77 | 78 | def test_headers(self): 79 | response = self.make_response(SAMPLE_RESPONSE) 80 | assert sorted(response.getheaders()) == [ 81 | ("content-length", "10"), 82 | ("content-type", "text/plain"), 83 | ("server", "Apache/2.2.20"), 84 | ] 85 | 86 | def test_buf(self): 87 | response = self.make_response(SAMPLE_RESPONSE) 88 | assert response.buf.getvalue() == SAMPLE_RESPONSE 89 | 90 | def test_buf_chunked(self): 91 | response = self.make_response(SAMPLE_RESPONSE_CHUNKED) 92 | assert response.buf.getvalue() == SAMPLE_RESPONSE_CHUNKED 93 | 94 | def test_get_payload(self): 95 | response = self.make_response(SAMPLE_RESPONSE) 96 | payload = response.get_payload() 97 | assert "".join(payload) == "helloworld" 98 | 99 | def test_get_payload_chunked(self): 100 | response = self.make_response(SAMPLE_RESPONSE_CHUNKED) 101 | payload = response.get_payload() 102 | assert "".join(payload) == "5\r\nhello\r\n5\r\nworld\r\n0\r\n" 103 | 104 | def test_arc_record(self, monkeypatch): 105 | # monkey patch _utcnow so that the time is deterministic 106 | monkeypatch.setattr(proxy.ProxyHTTPResponse, "_utcnow", lambda self: datetime.datetime(2010, 9, 8, 7, 6, 5)) 107 | 108 | self._test_arc_record(SAMPLE_RESPONSE) 109 | self._test_arc_record(SAMPLE_RESPONSE_CHUNKED) 110 | 111 | def _test_arc_record(self, http_payload): 112 | response = self.make_response(http_payload) 113 | arc = response._make_arc_record() 114 | assert str(arc.header) == "http://example.com/hello 0.0.0.0 20100908070605 text/plain %d" % len(http_payload) 115 | 116 | 117 | class FakeSocket: 118 | def __init__(self, content, delay_per_byte=0): 119 | self.content = content 120 | self.delay_per_byte = delay_per_byte 121 | 122 | def recv(self, size): 123 | data = self.content[:size] 124 | self.content = self.content[size:] 125 | 126 | if self.delay_per_byte: 127 | time.sleep(len(data) * self.delay_per_byte) 128 | 129 | return 
data 130 | 131 | def dummy(self, *a, **kw): 132 | pass 133 | 134 | class TestSocketWrapper: 135 | def test_max_time(self): 136 | _sock = FakeSocket("a" * 1000, delay_per_byte=0.001) 137 | sock = proxy.SocketWrapper(_sock, max_time=0.1) 138 | sock.recv(90) # .09 seconds 139 | 140 | with pytest.raises(proxy.ProxyError) as excinfo: 141 | sock.recv(20) # this should fail 142 | 143 | e = excinfo.value 144 | assert (e.errcode, e.errmsg) == proxy.ERR_REQUEST_TIMEOUT 145 | 146 | def test_max_size(self): 147 | _sock = FakeSocket("a" * 1200) 148 | sock = proxy.SocketWrapper(_sock, max_size=1001) 149 | 150 | # read 1000 bytes 151 | for i in range(10): 152 | sock.recv(100) 153 | 154 | with pytest.raises(proxy.ProxyError) as excinfo: 155 | sock.recv(100) # this should fail 156 | 157 | e = excinfo.value 158 | assert (e.errcode, e.errmsg) == proxy.ERR_RESPONSE_TOO_BIG 159 | 160 | class TestErrors: 161 | def assert_error_code(self, excinfo, error): 162 | e = excinfo.value 163 | if (e.errcode, e.errmsg) != error: 164 | import traceback 165 | traceback.print_exc(e) 166 | assert (e.errcode, e.errmsg) == error 167 | 168 | def verify(self, err, url): 169 | with pytest.raises(proxy.ProxyError) as excinfo: 170 | proxy._urlopen(url) 171 | self.assert_error_code(excinfo, err) 172 | 173 | def test_invalid_url(self): 174 | self.verify(proxy.ERR_INVALID_URL, "http://localhost:foo/") 175 | 176 | def test_invalid_domain(self): 177 | self.verify(proxy.ERR_INVALID_DOMAIN, "http://invalid.com2/") 178 | 179 | def test_conn_refused(self): 180 | # nothing will be running at localhost:1234, so connection will be refused 181 | self.verify(proxy.ERR_CONN_REFUSED, "http://localhost:1234/") 182 | 183 | def test_conn_timeout(self, monkeypatch): 184 | monkeypatch.setattr(config, "connect_timeout", 0.5) 185 | # this random IP seems to be creating timeout 186 | self.verify(proxy.ERR_CONN_TIMEOUT, "http://1.2.3.4/") 187 | 188 | def test_initial_data_timeout(self, monkeypatch, webtest): 189 | # This should not 
fail 190 | proxy._urlopen(webtest.url + "/delay-headers/0.2") 191 | 192 | # But when we set the initial_data_timeout, it should fail 193 | monkeypatch.setattr(config, "initial_data_timeout", 0.1) 194 | self.verify(proxy.ERR_INITIAL_DATA_TIMEOUT, webtest.url + "/delay-headers/0.2") 195 | 196 | def test_read_timeout(self, monkeypatch, webtest): 197 | # This should not fail 198 | proxy._urlopen(webtest.url + "/delay/0.2?repeats=1") 199 | 200 | # But when we set the initial_data_timeout, it should fail 201 | monkeypatch.setattr(config, "read_timeout", 0.1) 202 | self.verify(proxy.ERR_READ_TIMEOUT, webtest.url + "/delay/0.2?repeats=1") 203 | 204 | def test_conn_dropped(self, webtest): 205 | self.verify(proxy.ERR_CONN_DROPPED, webtest.url + "/drop") 206 | 207 | def test_response_too_big(self, monkeypatch, webtest): 208 | monkeypatch.setattr(config, "max_response_size", 1000) 209 | 210 | # This should not fail 211 | proxy._urlopen(webtest.url + "/echo/helloworld?repeats=50") 212 | 213 | with pytest.raises(proxy.ProxyError) as excinfo: 214 | proxy._urlopen(webtest.url + "/echo/helloworld?repeats=100") 215 | 216 | self.assert_error_code(excinfo, proxy.ERR_RESPONSE_TOO_BIG) 217 | 218 | def test_request_took_too_long(self, monkeypatch, webtest): 219 | monkeypatch.setattr(config, "max_request_time", 0.1) 220 | 221 | # This should not fail 222 | proxy._urlopen(webtest.url + "/echo/helloworld?repeats=5&delay=0.01") 223 | 224 | with pytest.raises(proxy.ProxyError) as excinfo: 225 | proxy._urlopen(webtest.url + "/echo/helloworld?repeats=20&delay=0.01") 226 | 227 | self.assert_error_code(excinfo, proxy.ERR_REQUEST_TIMEOUT) 228 | 229 | 230 | def test_webtest(webtest): 231 | assert urllib.urlopen(webtest.url + "/echo/hello").read() == "hello\n" 232 | -------------------------------------------------------------------------------- /liveweb/tests/test_webapp.py: -------------------------------------------------------------------------------- 1 | from ..webapp import application 2 | 3 | 
class Test_application: 4 | def test_parse_request_url(self): 5 | environ = { 6 | 'REQUEST_METHOD': "GET", 7 | 'REQUEST_URI': 'http://www.example.com/foo/bar', 8 | 'PATH_INFO': 'http://www.example.com/foo/bar', 9 | 'HTTP_HOST': 'www.example.com' 10 | } 11 | app = application(environ, None) 12 | app.parse_request() 13 | assert app.url == "http://www.example.com/foo/bar" 14 | 15 | def test_nginx_work_around(self): 16 | # nginx is stripping the http://host in the URL, which is of the form http://host/path 17 | # and passing just /path to the app. 18 | # Test the work-around that reconstructs the full path using the host. 19 | environ = { 20 | 'REQUEST_METHOD': "GET", 21 | 'REQUEST_URI': '/foo/bar', 22 | 'PATH_INFO': '/foo/bar', 23 | 'HTTP_HOST': 'www.example.com' 24 | } 25 | app = application(environ, None) 26 | app.parse_request() 27 | assert app.url == "http://www.example.com/foo/bar" 28 | -------------------------------------------------------------------------------- /liveweb/tests/webtest.py: -------------------------------------------------------------------------------- 1 | """Web app to simulate various error conditions. 
2 | """ 3 | 4 | import sys, os 5 | from os.path import dirname, pardir 6 | 7 | # insert liveweb in sys.path 8 | sys.path.insert(0, os.path.join(dirname(__file__), pardir, pardir)) 9 | 10 | from liveweb.tools.wsgiapp import wsgiapp 11 | import time 12 | 13 | class application(wsgiapp): 14 | urls = [ 15 | ("/", "index"), 16 | ("/echo/(.*)", "echo"), 17 | ("/delay-headers/([0-9\.]+)", "delay_headers"), 18 | ("/delay/([0-9\.]+)", "delay"), 19 | ("/drop", "drop"), 20 | ] 21 | 22 | def GET_index(self): 23 | self.header("Content-Type", "text/plain") 24 | return ["hello, world!\n"] 25 | 26 | def GET_echo(self, name): 27 | self.header("Content-Type", "text/plain") 28 | 29 | i = self.input() 30 | repeats = int(i.get("repeats", 1)) 31 | delay = float(i.get('delay', 0)) 32 | delim = i.get("delim", "\n") 33 | 34 | for i in range(repeats): 35 | yield name + delim 36 | if delay: 37 | time.sleep(delay) 38 | 39 | def GET_drop(self): 40 | self.header("Content-Type", "text/plain") 41 | self.header("Content-Length", "10000") 42 | return ["dropped!"] 43 | 44 | def GET_delay_headers(self, delay): 45 | self.header("Content-Type", "text/plain") 46 | delay = float(delay) 47 | time.sleep(delay) 48 | return ["delayed"] 49 | 50 | def GET_delay(self, delay): 51 | """Emits 10 numbers with delay seconds between each. 
52 | """ 53 | i = self.input() 54 | repeats = int(i.get("repeats", 10)) 55 | delay = float(delay) 56 | for i in range(repeats): 57 | yield str(i) + "\n" 58 | time.sleep(delay) 59 | 60 | if __name__ == "__main__": 61 | import sys 62 | 63 | try: 64 | port = int(sys.argv[1]) 65 | except IndexError: 66 | port = 8080 67 | 68 | from wsgiref.simple_server import make_server 69 | httpd = make_server('127.0.0.1', port, application) 70 | print "http://127.0.0.1:%d/" % port 71 | httpd.serve_forever() 72 | -------------------------------------------------------------------------------- /liveweb/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/liveweb/daf121365b959477e3b90ae07bcd72959f5be856/liveweb/tools/__init__.py -------------------------------------------------------------------------------- /liveweb/tools/wayback.py: -------------------------------------------------------------------------------- 1 | """Really simple implememtation of wayback machine web-interface. 2 | 3 | Written to test the liveweb proxy implementation. 4 | """ 5 | 6 | import sys 7 | import httplib 8 | import gzip 9 | import urlparse 10 | import cgi 11 | from StringIO import StringIO 12 | 13 | from BeautifulSoup import BeautifulSoup 14 | 15 | import warc 16 | from .wsgiapp import wsgiapp 17 | # We expect that liveweb host:port is passed as argument to this script. 18 | liveweb = sys.argv[1] 19 | import re 20 | import logging 21 | 22 | logging.basicConfig(level=logging.DEBUG, ) 23 | logger = logging.getLogger('[wayback]') 24 | 25 | 26 | class application(wsgiapp): 27 | """WSGI application for wayback machine prototype. 
28 | """ 29 | urls = [ 30 | ("/", "index"), 31 | ("/get", "get"), 32 | ("/web/(.*)", "web") 33 | ] 34 | 35 | @property 36 | def home(self): 37 | return "http://" + self.environ['HTTP_HOST'] 38 | 39 | def GET_index(self): 40 | self.header("content-type", "text/html") 41 | return [HEADER] 42 | #yield "

Wayback Machine Prototype

" 43 | 44 | def GET_get(self): 45 | fs = cgi.FieldStorage(environ=self.environ, keep_blank_values=1) 46 | if 'url' in fs: 47 | url = fs['url'].value 48 | self.status = "302 See Other" 49 | self.header("Location", self.home + "/web/" + url) 50 | return [""] 51 | else: 52 | self.status = "302 See Other" 53 | self.header("Location", self.home + "/") 54 | return [""] 55 | 56 | def GET_web(self, url): 57 | qs = self.environ.get("QUERY_STRING", "") 58 | if qs: 59 | url = url + "?" + qs 60 | record = self.fetch_arc_record(url) 61 | 62 | # fake socket to pass to httplib 63 | f = StringIO(record.payload) 64 | f.makefile = lambda *a: f 65 | 66 | response = httplib.HTTPResponse(f) 67 | response.begin() 68 | h = dict(response.getheaders()) 69 | 70 | content_type = h.get("content-type", "text/plain") 71 | self.header("Content-Type", content_type) 72 | 73 | if 'content-length' in h: 74 | self.header('Content-Length', h['content-length']) 75 | 76 | content = response.read() 77 | if content_type.lower().startswith("text/html"): 78 | content = self.rewrite_page(url, content) 79 | self.header('Content-Length', str(len(content))) 80 | elif content_type.lower().startswith("text/css"): 81 | content = self.rewrite_css(url, content) 82 | self.header('Content-Length', str(len(content))) 83 | return [content] 84 | 85 | def rewrite_css(self, base_url, content): 86 | #base_url = base_url.replace(os.path.basename(base_url), '') 87 | for image_url in re.findall('url\(([^\)]+)', content): 88 | image_url = image_url.replace('"', '') 89 | image_url = image_url.replace("'", '') 90 | if not image_url.startswith('http://'): 91 | new_url = urlparse.urljoin(base_url, image_url) 92 | else: 93 | new_url = image_url 94 | new_url = new_url.replace('//', '/') 95 | url2 = urlparse.urljoin(base_url, new_url) 96 | url2 = self.home + "/web/" + url2 97 | logger.debug("rewrote %r => %r" % (image_url, url2)) 98 | content = content.replace(image_url, url2, 1) 99 | return content 100 | 101 | def 
fetch_arc_record(self, url): 102 | """Fetchs the ARC record data from liveweb proxy. 103 | """ 104 | conn = httplib.HTTPConnection(liveweb) 105 | conn.request("GET", url) 106 | content = conn.getresponse().read() 107 | 108 | gz = gzip.GzipFile(fileobj=StringIO(content), mode="rb") 109 | record = warc.ARCRecord.from_string(gz.read(), version=1) 110 | 111 | return record 112 | 113 | def rewrite_page(self, base_url, content): 114 | """Rewrites all the links the the HTML.""" 115 | 116 | soup = BeautifulSoup(content) 117 | for tag in soup.findAll(["a", "link", "img", "script", "form"]): 118 | if tag.has_key('href'): 119 | tag['href'] = self.rewrite_url(base_url, tag['href']) 120 | elif tag.has_key("src"): 121 | tag['src'] = self.rewrite_url(base_url, tag['src']) 122 | elif tag.has_key("action"): 123 | tag['action'] = self.rewrite_url(base_url, tag['action']) 124 | 125 | self.inject_header(base_url, soup) 126 | return str(soup) 127 | 128 | def inject_header(self, base_url, soup): 129 | """Injects wayback machine header into the web page.""" 130 | header_soup = BeautifulSoup(HEADER).find("div") 131 | header_soup.find("input", {"id": "wmtbURL"})['value'] = base_url 132 | soup.find("body").insert(0, header_soup) 133 | 134 | def rewrite_url(self, base_url, url): 135 | if url.strip().lower().startswith("javascript"): 136 | return url 137 | url2 = urlparse.urljoin(base_url, url) 138 | url2 = self.home + "/web/" + url2 139 | logger.debug("rewrote %r => %r" % (url, url2)) 140 | return url2 141 | 142 | 143 | HEADER = """ 144 |
145 |
146 | 147 | 150 |
148 | Wayback Machine 149 | 151 | 152 | 153 | 156 | 158 | 162 |
154 |
155 |
157 | 159 | Close 160 | Help 161 |
163 | 164 | 165 | """ 166 | -------------------------------------------------------------------------------- /liveweb/tools/wsgiapp.py: -------------------------------------------------------------------------------- 1 | """Really simple wsgi framework. 2 | """ 3 | 4 | import re 5 | import traceback 6 | 7 | class wsgiapp: 8 | """Simple WSGI web framework. 9 | 10 | class applicaiton(wsgiapp): 11 | urls = [ 12 | ("/", "index") 13 | ] 14 | def GET_index(self): 15 | self.header("Content-Type", "text/plain") 16 | return "hello, world!" 17 | """ 18 | def __init__(self, environ, start_response): 19 | self.start = start_response 20 | self.environ = environ 21 | 22 | self.status = "200 OK" 23 | self._headers = {} 24 | 25 | def input(self): 26 | tokens = self.environ.get("QUERY_STRING", "").split("&") 27 | print "input", tokens 28 | return dict(kv.split("=") for kv in tokens if "=" in kv) 29 | 30 | def header(self, name, value): 31 | self._headers[name.title()] = value 32 | 33 | def __iter__(self): 34 | try: 35 | x = self.delegate() 36 | self.start(self.status, self._headers.items()) 37 | return iter(x) 38 | except: 39 | headers = {"Content-Type": "text/plain"} 40 | self.start("500 Internal Error", headers.items()) 41 | out = "Internal Error:\n\n" 42 | exc = traceback.format_exc() 43 | return iter([out, exc]) 44 | 45 | def delegate(self): 46 | """Delegates the request to appropriate method. 
47 | """ 48 | path = self.environ['PATH_INFO'] 49 | method = self.environ['REQUEST_METHOD'] 50 | 51 | # Try each pattern and dispatch to the right method 52 | for pattern, name in self.urls: 53 | m = re.match('^' + pattern + '$', path) 54 | if m: 55 | funcname = method.upper() + "_" + name 56 | f = getattr(self, funcname) 57 | return f(*m.groups()) 58 | 59 | # give "404 Not Found" if all the patterns are exhausted 60 | return self.notfound() 61 | 62 | def notfound(self): 63 | self.status = "404 Not Found" 64 | self.headers = {"Content-Type": "text/html"}.items() 65 | return ["Not Found"] 66 | 67 | -------------------------------------------------------------------------------- /liveweb/webapp.py: -------------------------------------------------------------------------------- 1 | """The webapp for arc proxy. 2 | """ 3 | 4 | from cStringIO import StringIO 5 | import gzip 6 | import logging 7 | import socket 8 | import datetime 9 | 10 | from warc.arc import ARCRecord, ARCFile 11 | 12 | from . import proxy 13 | from . import errors 14 | from . import config 15 | from . import file_pool 16 | from . import cache 17 | 18 | pool = None 19 | _cache = None 20 | 21 | def init_arc_file(fileobj): 22 | """Writes the ARC file headers when a new file is created. 23 | """ 24 | zfileobj = gzip.GzipFile(fileobj=fileobj, filename=None, mode="w") 25 | 26 | headers = {} 27 | headers['date'] = datetime.datetime.utcnow() 28 | headers['ip_address'] = socket.gethostbyname(socket.gethostname()) 29 | headers['org'] = "InternetArchive" 30 | 31 | afile = ARCFile(fileobj=zfileobj, filename=fileobj.name, mode='wb', version=1, file_headers=headers) 32 | afile._write_header() 33 | afile.close() 34 | fileobj.flush() 35 | 36 | def setup(): 37 | """This is called from main to initialize the requires globals. 
38 | """ 39 | global pool, _cache 40 | 41 | # Write ARC file header if the archive format is "arc" 42 | if config.archive_format == "arc": 43 | init_file = init_arc_file 44 | else: 45 | init_file = None 46 | 47 | pool = file_pool.FilePool(config.output_directory, 48 | pattern=config.filename_pattern, 49 | max_files=config.num_writers, 50 | max_file_size=config.filesize_limit, 51 | init_file_func=init_file) 52 | _cache = cache.create(type=config.cache, config=config) 53 | 54 | # For redis cache, use redis for keeping track of file number sequence 55 | if config.cache == 'redis': 56 | pool.set_sequence(_cache) 57 | 58 | class application: 59 | """WSGI application for liveweb proxy. 60 | """ 61 | def __init__(self, environ, start_response): 62 | self.environ = environ 63 | self.start_response = start_response 64 | 65 | def parse_request(self): 66 | self.method = self.environ['REQUEST_METHOD'] 67 | if 'REQUEST_URI' in self.environ: # This is for uwsgi 68 | self.url = self.environ['REQUEST_URI'] #TODO: Is this a valid environment variable always? 69 | if 'RAW_URI' in self.environ: # This is for gunicorn 70 | self.url = self.environ['RAW_URI'] #TODO: Is this a valid environment variable always? 71 | 72 | # Allow accessing the proxy using regular URL so that we can use 73 | # tools like ab. 74 | if self.url.startswith("/_web/"): 75 | self.url = self.url[len("/_web/"):] 76 | 77 | # Since this is a proxy, the URL is always of the form http://hostname/path 78 | # nginx is stripping the http://host from the passed URL and just passing the /path here. 79 | # This is a work-around for that issue. 
80 | if self.url.startswith("/"): 81 | self.url = "http://" + self.environ['HTTP_HOST'] + self.url 82 | 83 | def __iter__(self): 84 | try: 85 | self.parse_request() 86 | 87 | record = self.get_record() 88 | if config.http_passthrough: 89 | return self.proxy_response(record) 90 | else: 91 | return self.success(record.content_length, record.content_iter) 92 | except: 93 | logging.error("Internal Error - %s", self.url, exc_info=True) 94 | return self.error("500 Internal Server Error") 95 | 96 | def get_record(self): 97 | """Fetches the Record object from cache or constructs from web. 98 | """ 99 | record = _cache.get(self.url) 100 | if record is None: 101 | http_response = proxy.urlopen(self.url) 102 | record = http_response.write_arc(pool) 103 | _cache.set(self.url, record) 104 | return record 105 | 106 | def proxy_response(self, record): 107 | """Send the response data as it is """ 108 | # TODO: This is very inefficient. Improve. 109 | 110 | # Now we only have the ARC record data. 111 | record_payload = record.read_all() 112 | record_payload = gzip.GzipFile(fileobj=StringIO(record_payload)).read() 113 | arc = ARCRecord.from_string(record_payload, version=1) 114 | 115 | # Create a FakeSocket and read HTTP headers and payload. 
116 | sock = proxy._FakeSocket(StringIO(arc.payload)) 117 | response = proxy.ProxyHTTPResponse(self.url, sock) 118 | response.begin() 119 | 120 | status = "%d %s" % (response.status, response.reason) 121 | headers = response.getheaders() 122 | self.start_response(status, headers) 123 | return response.get_payload() 124 | 125 | def success(self, clen, data): 126 | status = '200 OK' 127 | response_headers = [ 128 | ('Content-type', 'application/x-arc-record'), 129 | ('Content-Length', str(clen)) 130 | ] 131 | self.start_response(status, response_headers) 132 | return iter(data) 133 | 134 | def error(self, status, headers=None): 135 | if headers is None: 136 | headers = [ 137 | ('Content-Type', 'text/plain'), 138 | ('Content-Length', '0'), 139 | ] 140 | self.start_response(status, headers) 141 | return iter([]) 142 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | BeautifulSoup==3.2.1 2 | Genshi==0.6 3 | PyYAML==3.10 4 | hiredis==0.1.1 5 | py==1.4.7 6 | pytest==2.2.3 7 | redis==2.4.12 8 | uWSGI==1.9.14 9 | warc 10 | wsgiref==0.1.2 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup 3 | 4 | requirements = [line.strip() for line in open("requirements.txt")] 5 | 6 | setup( 7 | name="liveweb", 8 | version="2.0.dev", 9 | description="Liveweb proxy", 10 | license='GPL v2', 11 | author="Internet Archive", 12 | author_email="info@archive.org", 13 | url="http://github.com/internetarchive/liveweb", 14 | packages=["liveweb", "liveweb.tools"], 15 | platforms=["any"], 16 | entry_points={ 17 | "console_scripts": [ 18 | "liveweb-proxy=liveweb.cli:main" 19 | ] 20 | }, 21 | install_requires=requirements 22 | ) 23 | 24 | 
--------------------------------------------------------------------------------