├── .gitignore ├── .travis.yml ├── MANIFEST.in ├── Makefile ├── Readme.md ├── config.ini ├── docs ├── Makefile ├── conf.py ├── config.rst ├── devsetup.rst ├── errors.rst └── index.rst ├── liveweb ├── __init__.py ├── cache.py ├── cli.py ├── config.py ├── configutil.py ├── errors.py ├── file_pool.py ├── filetools.py ├── main.py ├── proxy.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_configutil.py │ ├── test_filepool.py │ ├── test_filetools.py │ ├── test_proxy.py │ ├── test_webapp.py │ └── webtest.py ├── tools │ ├── __init__.py │ ├── wayback.py │ └── wsgiapp.py └── webapp.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | lib/ 3 | include/ 4 | *.egg-info/ 5 | *.pyc 6 | *~ 7 | .coverage 8 | htmlcov/ 9 | docs/_build/ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | install: pip install -r requirements.txt --use-mirrors 6 | script: py.test liveweb/ 7 | 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | PROJECT_NAME=$(shell basename $(PWD)) 3 | 4 | # If VENV_ROOT is defined in the environment, use it to find the VENV 5 | # directory, else consider the current dir as the venv dir. 6 | VENV_ROOT ?= $(shell dirname $(PWD)) 7 | 8 | 9 | # Use the active virtualenv or the one inside the project 10 | VIRTUAL_ENV ?= $(VENV_ROOT)/$(PROJECT_NAME) 11 | 12 | VENV=$(VIRTUAL_ENV) 13 | 14 | # host:port of the liveweb proxy. 
15 | # This is used by the wayback. 16 | LIVEWEB_ADDRESS=localhost:7070 17 | WAYBACK_ADDRESS=:8080 18 | 19 | CONFIG=config.ini 20 | 21 | UWSGI=$(VENV)/bin/uwsgi -H$(VENV) 22 | 23 | .PHONY: docs 24 | 25 | run: 26 | $(VENV)/bin/liveweb-proxy -c $(CONFIG) 27 | 28 | venv: 29 | virtualenv --no-site-packages $(VENV) 30 | $(VENV)/bin/pip install -r requirements.txt 31 | $(VENV)/bin/python setup.py develop 32 | 33 | test: 34 | 35 | $(VENV)/bin/py.test liveweb/ 36 | 37 | wayback: 38 | $(UWSGI) --http ${WAYBACK_ADDRESS} --wsgi liveweb.tools.wayback --pyargv $(LIVEWEB_ADDRESS) 39 | 40 | docs: 41 | cd docs && make html 42 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | Liveweb Proxy for Wayback Machine 2 | ================================= 3 | 4 | [![Build Status](https://secure.travis-ci.org/internetarchive/liveweb.png?branch=master)](http://travis-ci.org/internetarchive/liveweb) 5 | 6 | Liveweb proxy is component of Internet Archive's [wayback machine][] 7 | project. 8 | 9 | [wayback machine]: http://web.archive.org/ 10 | 11 | The liveweb proxy captures the content of a web page in real time, archives it 12 | into a ARC or WARC file and returns the ARC/WARC record back to the wayback 13 | machine to process. The recorded ARC/WARC file becomes part of the wayback 14 | machine in due course of time. 15 | 16 | How to setup 17 | ============ 18 | 19 | * `make venv` 20 | 21 | This setup a new virtual env in the project directory and instals all the dependencies. 22 | 23 | * `make run` 24 | 25 | This starts running the liveweb proxy. 26 | 27 | * `make test` 28 | 29 | Runs all the test cases. 30 | 31 | Documentation 32 | ============= 33 | 34 | Documentation is available at . 
35 | -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [liveweb] 2 | 3 | # User-Agent. Specifies the user-agent header sent by liveweb-proxy. 4 | user-agent = liveweb-proxy/2.0 5 | 6 | 7 | # Default value of timeout to use when individual timeouts are not specified. 8 | # 9 | # default-timeout = 10s 10 | 11 | 12 | # DNS timeout. 13 | # 14 | # dns-timeout = 2s 15 | 16 | 17 | # Max allowed time to receive HTTP status and headers 18 | # 19 | # initial-data_timout = 3s 20 | 21 | 22 | # Specifies the read timeout. This indicates the idle time. If no data 23 | # is received for more than this time, the request will fail. 24 | # 25 | # read-timeout = 1s 26 | 27 | max-request-time = 2m 28 | 29 | # Maximum allowed response size. 30 | max-response-size = 100MB 31 | 32 | # Output directory to write ARC/WRC files 33 | output-directory = records 34 | 35 | # Filename pattern. Uses python string substitution to replace 36 | # keywords timestamp, serial, pid, fqdn and port. 37 | filename-pattern = live-%(timestamp)s-%(serial)05d-%(fqdn)s-%(port)s.arc.gz 38 | 39 | # The limit on the size of file, after which it 40 | filesize-limit = 100MB 41 | 42 | # Number of concurrent writers 43 | # 44 | # num-writers = 1 45 | 46 | archive-type = arc 47 | 48 | # Cache type. 49 | # Supported options are redis and none. 50 | # 51 | cache = redis 52 | 53 | # Redis parameters. Used when cache=redis. 54 | # 55 | # redis-host = localhost 56 | # redis-port = 6379 57 | # redis-db = 0 58 | 59 | # Number of worker processes. 60 | # 61 | # workers = 1 62 | 63 | # Number of threads per process 64 | # 65 | # threads = 10 66 | 67 | # Listen address. 68 | # Set it to empty to listen on all available interfaces. 69 | # 70 | # listen-address = 127.0.0.1 71 | 72 | # Port to bind. 
73 | # 74 | # port = 7070 75 | 76 | 77 | # UWSGI settings 78 | [uwsgi] 79 | 80 | # The liveweb-proxy is run using uwsgi. 81 | # You can specify uwsgi setting here. 82 | # See uwsgi --help for more details. 83 | 84 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " singlehtml to make a single large HTML file" 22 | @echo " pickle to make pickle files" 23 | @echo " json to make JSON files" 24 | @echo " htmlhelp to make HTML files and a HTML help project" 25 | @echo " qthelp to make HTML files and a qthelp project" 26 | @echo " devhelp to make HTML files and a Devhelp project" 27 | @echo " epub to make an epub" 28 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 29 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 30 | @echo " text to make text files" 31 | @echo " man to make manual pages" 32 | @echo " changes to make an overview of all changed/added/deprecated items" 33 | @echo " linkcheck to check all external links for integrity" 34 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 35 | 36 | clean: 37 | -rm -rf $(BUILDDIR)/* 38 | 39 | 
html: 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | @echo 42 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 43 | 44 | dirhtml: 45 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 48 | 49 | singlehtml: 50 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 51 | @echo 52 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 53 | 54 | pickle: 55 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 56 | @echo 57 | @echo "Build finished; now you can process the pickle files." 58 | 59 | json: 60 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 61 | @echo 62 | @echo "Build finished; now you can process the JSON files." 63 | 64 | htmlhelp: 65 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 66 | @echo 67 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 68 | ".hhp project file in $(BUILDDIR)/htmlhelp." 69 | 70 | qthelp: 71 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 72 | @echo 73 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 74 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 75 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/LivewebProxy.qhcp" 76 | @echo "To view the help file:" 77 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/LivewebProxy.qhc" 78 | 79 | devhelp: 80 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 81 | @echo 82 | @echo "Build finished." 83 | @echo "To view the help file:" 84 | @echo "# mkdir -p $$HOME/.local/share/devhelp/LivewebProxy" 85 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/LivewebProxy" 86 | @echo "# devhelp" 87 | 88 | epub: 89 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 90 | @echo 91 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
92 | 93 | latex: 94 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 95 | @echo 96 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 97 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 98 | "(use \`make latexpdf' here to do that automatically)." 99 | 100 | latexpdf: 101 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 102 | @echo "Running LaTeX files through pdflatex..." 103 | make -C $(BUILDDIR)/latex all-pdf 104 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 105 | 106 | text: 107 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 108 | @echo 109 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 110 | 111 | man: 112 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 113 | @echo 114 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 115 | 116 | changes: 117 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 118 | @echo 119 | @echo "The overview file is in $(BUILDDIR)/changes." 120 | 121 | linkcheck: 122 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 123 | @echo 124 | @echo "Link check complete; look for any errors in the above output " \ 125 | "or in $(BUILDDIR)/linkcheck/output.txt." 126 | 127 | doctest: 128 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 129 | @echo "Testing of doctests in the sources finished, look at the " \ 130 | "results in $(BUILDDIR)/doctest/output.txt." 131 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Liveweb Proxy documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Apr 27 14:01:57 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 
7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = [] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'Liveweb Proxy' 44 | copyright = u'2012, Internet Archive' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '2.0-dev' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '2.0-dev' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 
57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 
109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_domain_indices = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 150 | #html_show_sourcelink = True 151 | 152 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 153 | #html_show_sphinx = True 154 | 155 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
156 | #html_show_copyright = True 157 | 158 | # If true, an OpenSearch description file will be output, and all pages will 159 | # contain a tag referring to it. The value of this option must be the 160 | # base URL from which the finished HTML is served. 161 | #html_use_opensearch = '' 162 | 163 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 164 | #html_file_suffix = None 165 | 166 | # Output file base name for HTML help builder. 167 | htmlhelp_basename = 'LivewebProxydoc' 168 | 169 | 170 | # -- Options for LaTeX output -------------------------------------------------- 171 | 172 | # The paper size ('letter' or 'a4'). 173 | #latex_paper_size = 'letter' 174 | 175 | # The font size ('10pt', '11pt' or '12pt'). 176 | #latex_font_size = '10pt' 177 | 178 | # Grouping the document tree into LaTeX files. List of tuples 179 | # (source start file, target name, title, author, documentclass [howto/manual]). 180 | latex_documents = [ 181 | ('index', 'LivewebProxy.tex', u'Liveweb Proxy Documentation', 182 | u'Internet Archive', 'manual'), 183 | ] 184 | 185 | # The name of an image file (relative to this directory) to place at the top of 186 | # the title page. 187 | #latex_logo = None 188 | 189 | # For "manual" documents, if this is true, then toplevel headings are parts, 190 | # not chapters. 191 | #latex_use_parts = False 192 | 193 | # If true, show page references after internal links. 194 | #latex_show_pagerefs = False 195 | 196 | # If true, show URL addresses after external links. 197 | #latex_show_urls = False 198 | 199 | # Additional stuff for the LaTeX preamble. 200 | #latex_preamble = '' 201 | 202 | # Documents to append as an appendix to all manuals. 203 | #latex_appendices = [] 204 | 205 | # If false, no module index is generated. 206 | #latex_domain_indices = True 207 | 208 | 209 | # -- Options for manual page output -------------------------------------------- 210 | 211 | # One entry per manual page. 
List of tuples 212 | # (source start file, name, description, authors, manual section). 213 | man_pages = [ 214 | ('index', 'livewebproxy', u'Liveweb Proxy Documentation', 215 | [u'Internet Archive'], 1) 216 | ] 217 | -------------------------------------------------------------------------------- /docs/config.rst: -------------------------------------------------------------------------------- 1 | .. _config: 2 | 3 | 4 | Liveweb Proxy Configuration 5 | =========================== 6 | 7 | The ``liveweb-proxy`` can be configured using various command-line options and/or a config file. 8 | 9 | Config file can be specified as:: 10 | 11 | $ liveweb-proxy -c liveweb.ini 12 | 13 | or:: 14 | 15 | $ liveweb-proxy --config liveweb.ini 16 | 17 | This section describes the available config settings. For each config setting, there is a command line option with the same name. 18 | 19 | For example, config setting ``archive-format`` is available as command line argument `--archive-format`. 20 | 21 | The config file is specified in INI format. Here is a sample config file. :: 22 | 23 | [liveweb] 24 | 25 | archive-format = arc 26 | 27 | output-directory = /tmp/records 28 | 29 | dns-timeout = 2s 30 | 31 | 32 | Archive Settings 33 | ---------------- 34 | 35 | **archive-format** 36 | 37 | Specifies the archive format. Should be one of ``arc`` or ``warc``. 38 | 39 | The default value is ``arc``. 40 | 41 | .. warning:: 42 | 43 | As of now only ``arc`` is supported. 44 | 45 | 46 | **output-directory** 47 | 48 | Output directory to write ARC/WARC files. Default value is "records". 49 | 50 | 51 | **filename-pattern** 52 | 53 | The pattern of the filename specified as Python string formatting 54 | template. The default value is ``live-%(timestamp)s-%(serial)05d.arc.gz``. 55 | 56 | Available substitutions are ``timestamp``, ``serial``, ``pid``, 57 | ``fqdn`` (fully qualified domain name) and ``port``. 58 | 59 | **filesize-limit** 60 | 61 | The limit on the size of file. 
If a file crosses this size, it 62 | will be closed and a new file will be created to write new records. 63 | 64 | **num-writers** 65 | 66 | The number of concurrent writers. 67 | 68 | The default value is ``1``. 69 | 70 | 71 | Cache Settings 72 | -------------- 73 | 74 | .. _config_cache: 75 | 76 | **cache** 77 | 78 | Type of cache to use. Available options are ``redis``, ``sqlite`` and ``none``. 79 | 80 | The default value is ``none``. 81 | 82 | **redis-host** 83 | 84 | **redis-port** 85 | 86 | **redis-db** 87 | 88 | Redis host, port and db number. Used only when ``cache=redis``. 89 | 90 | **redis-expire-time** 91 | 92 | Expire time to set in redis. Used only when ``cache=redis``. 93 | 94 | The default value is ``1h`` (1 hour). 95 | 96 | **redis-max-record-size** 97 | 98 | Maximum allowed size of a record that can be cached. Used only when ``cache=redis``. 99 | 100 | The default value is ``100KB``. 101 | 102 | **sqlite-db** 103 | 104 | Path to the sqlite database to use. This option is valid only when ``cache=sqlite``. 105 | 106 | The default value is ``liveweb.db``. 107 | 108 | Timeouts and Resource Limits 109 | ---------------------------- 110 | 111 | **default-timeout** 112 | 113 | This is the default timeout value for ``connect-timeout``, ``initial-data-timeout`` and ``read-timeout``. 114 | 115 | The default value is ``10s``. 116 | 117 | .. _config_dns_timeout: 118 | 119 | **dns-timeout** 120 | 121 | Specifies the maximum amount of time a DNS resolution can take. 122 | 123 | Python doesn't support a way to specify DNS timeout. On Linux, the 124 | dns timeout can be specified via the ``RES_OPTIONS`` environment 125 | variable. This environment variable is set at the startup of the 126 | application based on this config setting. 127 | 128 | If unspecified, the DNS timeout is decided by the system default behavior. 129 | 130 | See `resolv.conf man page`_ for more details. 131 | 132 | .. 
_resolv.conf man page: http://manpages.ubuntu.com/manpages/lucid/en/man5/resolv.conf.5.html 133 | 134 | .. _config_connect_timeout: 135 | 136 | **connect-timeout** 137 | 138 | Specifies the connect timeout in seconds. Connections that take 139 | longer to establish will be aborted. 140 | 141 | .. _config_initial_data_timeout: 142 | 143 | **initial-data-timeout** 144 | 145 | Specifies the maximum time allowed before receiving initial data 146 | (HTTP headers) from the remote server. 147 | 148 | .. _config_read_timeout: 149 | 150 | **read-timeout** 151 | 152 | Specifies the read timeout in seconds. This indicates the idle time. If no data is received for more than this time, the request will fail. 153 | 154 | 155 | **max-request-time** 156 | 157 | Specifies the total amout of time a HTTP request can take. If it takes 158 | more than this, the current request will fail. 159 | 160 | The default value is ``2m``. 161 | 162 | **max-response-size** 163 | 164 | Specifies the maximum allowed size of response. 165 | 166 | The default value is ``100MB``. 167 | 168 | Other Settings 169 | -------------- 170 | 171 | .. _config_user_agent: 172 | 173 | **user-agent** 174 | 175 | Specifies the value of the ``User-Agent`` request header. 176 | 177 | The default value is ``ia_archiver(OS-Wayback)``. 178 | 179 | 180 | **http-passthrough** 181 | 182 | This is a boolean parameter, setting it to ``true`` will make it 183 | work like a http proxy with archiving. Useful for testing and 184 | recording personal browsing. 185 | -------------------------------------------------------------------------------- /docs/devsetup.rst: -------------------------------------------------------------------------------- 1 | .. _devsetup: 2 | 3 | Development Setup 4 | ================= 5 | 6 | Setting up 7 | ---------- 8 | 9 | Start with getting the source code from github. :: 10 | 11 | $ git clone git://github.com/internetarchive/liveweb.git 12 | $ cd liveweb 13 | 14 | Setup a virtualenv. 
:: 15 | 16 | $ make venv 17 | 18 | This will create the virtualenv in the current directory. Edit the 19 | ``Makefile`` if you want to setup virtualenv elsewhere. 20 | 21 | Running the application 22 | ----------------------- 23 | 24 | Run the application using:: 25 | 26 | $ make run 27 | 28 | This will start the liveweb proxy at ``localhost:7070``. 29 | 30 | Testing using curl 31 | ------------------ 32 | 33 | Assuming the liveweb proxy is running on `localhost:7070`:: 34 | 35 | $ curl -s -x localhost:7070 http://httpbin.org/get | zcat 36 | http://httpbin.org/get 204.236.238.79 20120427110218 application/json 451 37 | HTTP/1.1 200 OK 38 | Content-Type: application/json 39 | Date: Fri, 27 Apr 2012 11:02:18 GMT 40 | Server: gunicorn/0.13.4 41 | Content-Length: 298 42 | Connection: Close 43 | 44 | { 45 | "url": "http://httpbin.org/get", 46 | "headers": { 47 | "Content-Length": "", 48 | "Accept-Encoding": "identity", 49 | "Connection": "keep-alive", 50 | "User-Agent": "ia_archiver(OS-Wayback)", 51 | "Host": "httpbin.org", 52 | "Content-Type": "" 53 | }, 54 | "args": {}, 55 | "origin": "207.241.237.193" 56 | } 57 | 58 | Running in http-passthough mode 59 | ------------------------------- 60 | 61 | Enable http-passthrough mode by adding the following to the config file. :: 62 | 63 | http_passthough: true 64 | 65 | Make sure caching is disabled. The http-passthough mode doesn't work with caching. 66 | 67 | Run the application and change the browser setting to use application 68 | address (localhost:7070 by default) as http proxy. 69 | 70 | Performance Testing 71 | ------------------- 72 | 73 | Test performance using Apache-Bench:: 74 | 75 | $ ab -X localhost:7070 -c 10 -n 100 http://www.archive.org/ 76 | 77 | The ``-X`` options is to specify the proxy server. 
78 | 79 | -------------------------------------------------------------------------------- /docs/errors.rst: -------------------------------------------------------------------------------- 1 | Error Codes 2 | =========== 3 | 4 | The application writes the errors with following codes when something fails when trying to fetch the given URL. 5 | 6 | 1X - Bad Input 7 | -------------- 8 | 9 | **E10 - Invalid URL** 10 | 11 | The given URL is invalid. For example:: 12 | 13 | http://example.com:bad-port/ 14 | 15 | 2X - DNS errors 16 | --------------- 17 | 18 | **E20 - Invalid Domain** 19 | 20 | The URL has non existant domain. 21 | 22 | **E21 - DNS Timeout** 23 | 24 | The hostname couldn't be resolved within :ref:`config_dns_timeout` seconds. 25 | 26 | 3X - Connection Errors 27 | ---------------------- 28 | 29 | **E30 - Connection Refused** 30 | 31 | Connection refused by the server. 32 | 33 | **E31 - Connect Timeout** 34 | 35 | Connection couldn't be established within :ref:`config_connect_timeout` seconds. 36 | 37 | **E32 - Initial Data Timeout** 38 | 39 | Initial data (HTTP headers) couldn't be obtained within :ref:`config_initial_data_timeout` seconds. 40 | 41 | **E33 - Read Timeout** 42 | 43 | When reading data from the remote server, no data was received for :ref:`config_read_timeout` seconds. 44 | 45 | **E34 - Connection Dropped** 46 | 47 | The remote server dropped the connection before all the data was received. 48 | 49 | **E39 - Unexpected Connection Error** 50 | 51 | Unexpected connection error when receiving data from the remote server. 52 | 53 | 4X - Resource Limits 54 | -------------------- 55 | 56 | **E40 - Response Too Big** 57 | 58 | The response length is bigger than :ref:`config_max_response_size` bytes. 59 | 60 | **E41 - Request Took Too Long** 61 | 62 | The request was not completed within :ref:`config_max_request_time` seconds. 
63 | 64 | In all these cases, the application responds back with status ``200 OK`` 65 | with a record containing status ``502 Bad Gateway``. 66 | 67 | 68 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Liveweb Proxy documentation master file, created by 2 | sphinx-quickstart on Fri Apr 27 14:01:57 2012. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Liveweb Proxy 7 | ============= 8 | 9 | Liveweb proxy is a component of Internet Archive's `wayback machine 10 | project `_. 11 | 12 | The liveweb proxy captures the content of a web page in real time, 13 | archives it into an ARC or WARC file and returns the ARC/WARC record 14 | back to the wayback machine to process. The recorded ARC/WARC file 15 | becomes part of the wayback machine in due course of time. 16 | 17 | .. note:: 18 | 19 | The liveweb project is under active development, so this documentation may not be up-to-date. 20 | 21 | Installation 22 | ------------ 23 | 24 | Liveweb proxy can be installed using `pip `_:: 25 | 26 | $ pip install liveweb 27 | 28 | or, with `easy_install `_ :: 29 | 30 | $ easy_install liveweb 31 | 32 | See `Development Setup `_ if you want to work with source code. 33 | 34 | Running liveweb-proxy 35 | --------------------- 36 | 37 | Liveweb proxy can be run using:: 38 | 39 | $ liveweb-proxy 40 | 41 | To start liveweb-proxy on a different port:: 42 | 43 | $ liveweb-proxy -p 8080 44 | 45 | To load settings from a config file:: 46 | 47 | $ liveweb-proxy -c liveweb.ini 48 | 49 | To see the available command-line options:: 50 | 51 | $ liveweb-proxy --help 52 | 53 | See :ref:`Configuration ` section for the available config settings and command line options. 54 | 55 | Advanced Usage 56 | -------------- 57 | 58 | Under the hood, liveweb proxy uses `uwsgi `_ as the http server. 
59 | 60 | If you want to tweak uwsgi parameters, you can start liveweb as:: 61 | 62 | $ uwsgi --master --single-interpreter --lazy --wsgi liveweb.main --processes 1 --threads 10 --http localhost:7070 63 | 64 | The values of ``--processes``, ``--threads`` and ``--http`` options can be changed as needed and more options can be added too. 65 | 66 | You may have to specify `-H virtualenv_home` if you using a virtualenv. 67 | 68 | Documentation 69 | ------------- 70 | 71 | .. toctree:: 72 | :maxdepth: 3 73 | 74 | devsetup 75 | config 76 | errors 77 | -------------------------------------------------------------------------------- /liveweb/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /liveweb/cache.py: -------------------------------------------------------------------------------- 1 | """Cache for liveweb. 2 | """ 3 | 4 | from collections import namedtuple 5 | import logging 6 | import sqlite3 7 | 8 | import redis 9 | 10 | from .proxy import Record 11 | 12 | class RedisCache: 13 | """Cache based on Redis. 14 | 15 | This caches the whole arc record. 16 | """ 17 | def __init__(self, **params): 18 | """Creates a new instance of redis client. 19 | 20 | :param host: host to connect, defaults to "localhost" 21 | :param port: port to connect, defaults to 6379, the default redis server port 22 | :param db: db number, defaults to 0 23 | :param expire_time: amount of time in seconds after which the entry in the cache should expire, defaults to one hour. 24 | """ 25 | self.expire_time = int(params.pop('expire_time', 3600)) # default timeout 26 | 27 | # max size of record that can be cached. Defaults to 100K. 
28 | self.max_record_size = params.pop('max_record_size', 100*1024) 29 | 30 | self.redis_client = redis.StrictRedis(**params) 31 | 32 | def get(self, url): 33 | data = self.redis_client.get(url) 34 | if data is not None: 35 | logging.info("cache hit - %s", url) 36 | return Record(filename=None, 37 | offset=0, 38 | content_length=len(data), 39 | content_iter=iter([data])) 40 | 41 | def set(self, url, record): 42 | """Puts a new entry in the cache. 43 | 44 | :param url: URL for which the response is being cached 45 | :param record: record to be cached 46 | """ 47 | if record.content_length <= self.max_record_size: 48 | data = record.read_all() 49 | self.redis_client.setex(url, self.expire_time, data) 50 | 51 | def next(self): 52 | """Returns the next-value of the counter. 53 | Used by file_pool to get next sequence. 54 | """ 55 | return self.redis_client.incr("filename-sequence") 56 | 57 | class SqliteCache: 58 | """Cache implementation based on sqlite. 59 | 60 | This stores url, filepath, offset and content_length in the 61 | database. Useful when running in http-passthough mode with 62 | browser. 
    """
    # Table schema: "clen" stores the record's content length in bytes.
    SCHEMA = ("" +
              "CREATE TABLE cache (" +
              " url text unique," +
              " filename text," +
              " offset int," +
              " clen int" +
              ")")

    def __init__(self, database):
        """Creates the cache backed by the given sqlite database file,
        creating the cache table if it doesn't already exist.
        """
        self.database = database
        self.create_table()

    def create_table(self):
        # Idempotent: only creates the table when it is missing.
        if "cache" not in self._get_tables():
            self.query(self.SCHEMA)

    def _get_tables(self):
        """Returns the names of all tables present in the database."""
        q = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
        tables = [row[0] for row in self.query(q)]
        return tables

    def query(self, query, args=[], commit=False):
        """Executes a query and returns the fetched rows.

        NOTE(review): a fresh connection is opened and closed on every
        call, so an in-memory database (":memory:") cannot work with
        this class -- each call would see an empty database. Confirm a
        file path is always passed.
        """
        logging.debug("query: %r - %r", query, args)
        conn = sqlite3.connect(self.database)
        cursor = conn.execute(query, args)
        rows = cursor.fetchall()
        if commit:
            conn.commit()
        cursor.close()
        conn.close()
        return rows

    def get(self, url):
        """Looks up the cache. Returns a Record on hit, None on miss."""
        rows = self.query("SELECT filename, offset, clen FROM cache WHERE url=?", [url])
        if rows:
            logging.info("cache hit - %s", url)
            filepath, offset, content_length = rows[0]
            # The Record re-opens the file lazily from filename/offset.
            return Record(filepath, offset=offset, content_length=content_length)
        else:
            logging.info("cache miss - %s", url)

    def set(self, url, record):
        """Puts a new entry in the cache.

        :param url: URL for which the response is being cached
        :param record: record to be cached
        """
        self.query("INSERT INTO cache (url, filename, offset, clen) VALUES (?, ?, ?, ?)",
                   [url, record.filename, record.offset, record.content_length],
                   commit=True)

class NoCache:
    """No-op cache used when caching is disabled."""
    def get(self, url):
        return None

    def set(self, url, record):
        pass

def create(type, config):
    """Factory: creates a cache instance of the given type ("redis",
    "sqlite" or "none"/None) using settings from the config object.
    """
    logging.info("creating cache %s", type)

    if type == 'redis':
        return RedisCache(host=config.redis_host,
                          port=config.redis_port,
                          db=config.redis_db,
                          expire_time=config.redis_expire_time,
                          max_record_size=config.redis_max_record_size)
    elif type == 'sqlite':
        return SqliteCache(config.sqlite_db)
    elif type == 'none' or type == None:
        return NoCache()

else: 131 | raise ValueError("Unknown cache type %r" % type) 132 | -------------------------------------------------------------------------------- /liveweb/cli.py: -------------------------------------------------------------------------------- 1 | """Command-line interface to the Liveweb Proxy. 2 | """ 3 | import sys 4 | import os 5 | from optparse import OptionParser, OptionGroup 6 | from .configutil import Config 7 | 8 | def make_config(): 9 | c = Config("liveweb") 10 | 11 | c.add_option("-c", "--config", 12 | help="specifies the liveweb-proxy config file") 13 | 14 | c.add_option("--archive-format", 15 | type="choice", 16 | choices=["none", "arc", "warc"], 17 | default="arc", 18 | help="specifies the archiving format") 19 | 20 | c.add_option("--http-passthrough", 21 | type="bool", 22 | default="false", 23 | help="enables the http-passthrough mode") 24 | 25 | c.add_option("--user-agent", 26 | default="ia_archiver(OS-Wayback)", 27 | help="the user-agent string used by liveweb-proxy") 28 | 29 | c.add_option("--uid", 30 | help="setuid to the specified user/uid") 31 | 32 | c.add_option("--gid", 33 | help="setgid to the specified group/gid") 34 | 35 | # server options 36 | c.add_option("-l", "--listen", 37 | metavar="IP_ADDRESS", 38 | default="127.0.0.1", 39 | help="the IP-address on which the liveweb-proxy will listen on (default: %default).") 40 | 41 | c.add_option("-p", "--port", 42 | type="int", 43 | default="7070", 44 | help="the port on which the liveweb-proxy will listen on (default: %default).") 45 | 46 | c.add_option("-w", "--workers", 47 | type="int", 48 | default="1", 49 | help="the number of worker processes (default: %default)") 50 | 51 | c.add_option("-t", "--threads", 52 | type="int", 53 | default="10", 54 | help="the number of threads/process (default: %default)") 55 | 56 | # storage options 57 | c.add_option("-o", "--output-directory", 58 | metavar="DIR", 59 | default="records", 60 | help="the directory to store the arc/warc files (default: 
%default)") 61 | 62 | c.add_option("--filename-pattern", 63 | type="string", 64 | default="live-%(timestamp)s-%(serial)05d.arc.gz", 65 | help="specifies the format of the filename to store the arc/warc files.") 66 | 67 | c.add_option("--num-writers", 68 | type="int", 69 | default="1", 70 | help="specifies the number of concurrent writers") 71 | 72 | c.add_option("--filesize-limit", 73 | type="bytes", 74 | default="100MB", 75 | help="specifies the recommended size limit for each file.") 76 | 77 | # timeouts and limits 78 | 79 | c.add_option("--default-timeout", 80 | type="time", 81 | default="10s", 82 | help="the default timeout value to use if a timeout option is not specified."), 83 | 84 | c.add_option("--dns-timeout", 85 | type="time", 86 | help="maximum allowed time for domain name resolution") 87 | 88 | c.add_option("--connect-timeout", 89 | type="time", 90 | help="maximum allowed time for establishing connection") 91 | 92 | c.add_option("--initial-data-timeout", 93 | type="time", 94 | help="maximum wait time to receive status and headers from the remove server") 95 | 96 | c.add_option("--read-timeout", 97 | type="time", 98 | help="the read timeout") 99 | 100 | c.add_option("--max-request-time", 101 | type="time", 102 | default="5m", 103 | help="the total amout of time a HTTP request can take") 104 | 105 | c.add_option("--max-response-size", 106 | type="bytes", 107 | default="100MB", 108 | help="the maximum allowed size of response") 109 | 110 | # cache options 111 | c.add_option("--cache", 112 | type="choice", 113 | choices=["none", "redis", "sqlite"], 114 | default="none", 115 | help="specifies the type of cache to use") 116 | 117 | c.add_option("--redis-host", 118 | type="string", 119 | default="localhost") 120 | 121 | c.add_option("--redis-port", 122 | type="int", 123 | default="6379") 124 | 125 | c.add_option("--redis-db", 126 | type="int", 127 | default="0") 128 | 129 | c.add_option("--redis-expire-time", 130 | type="time", 131 | default="1h") 132 | 133 | 
c.add_option("--redis-max-record-size", 134 | type="bytes", 135 | default="100KB") 136 | 137 | c.add_option("--sqlite-db", 138 | type="string", 139 | default="liveweb.db") 140 | 141 | return c 142 | 143 | def find_python_home(): 144 | # Python will be installed in bin/ or Scripts/ directory. Parent 145 | # of that will be the Python home. 146 | bindir = os.path.abspath(os.path.dirname(sys.executable)) 147 | home = os.path.dirname(bindir) 148 | return home 149 | 150 | def run_uwsgi(config): 151 | python_home = os.getenv("VIRTUAL_ENV") or find_python_home() 152 | bind = "%s:%s" % (config['listen'], config['port']) 153 | 154 | args = ["uwsgi", 155 | "--http", bind, 156 | "-Mi", # master, single-interpreter 157 | "--lazy", 158 | "--home", python_home, 159 | "--wsgi", "liveweb.main", 160 | "--processes", config['workers'], 161 | "--threads", config['threads'], 162 | "--listen", 1024, 163 | ] 164 | 165 | if config.get("config"): 166 | args.append("--pyargv") 167 | args.append("-c " + config['config']) 168 | 169 | if config['uid']: 170 | args.append("--uid", config['uid']) 171 | 172 | if config['gid']: 173 | args.append("--gid", config['gid']) 174 | 175 | dirname = os.path.abspath(os.path.dirname(sys.argv[0])) 176 | uwsgi_path = os.path.join(dirname, "uwsgi") 177 | 178 | os.execvp(uwsgi_path, [str(a) for a in args]) 179 | 180 | def set_dns_timeout(timeout): 181 | os.putenv("RES_OPTIONS", "timeout:%d attempts:1" % timeout) 182 | 183 | def main(): 184 | c = make_config() 185 | 186 | # load configuration from env, config file and command line arguments. 
187 | c.load() 188 | 189 | # update current env with new values so that the exec'ed process can take these settings 190 | c.putenv() 191 | 192 | set_dns_timeout(c.get('dns_timeout') or c.get('default_timeout')) 193 | 194 | # required for generating filenames 195 | os.putenv("LIVEWEB_PORT", str(c.get("port"))) 196 | 197 | run_uwsgi(c.dict()) 198 | 199 | if __name__ == "__main__": 200 | main() 201 | -------------------------------------------------------------------------------- /liveweb/config.py: -------------------------------------------------------------------------------- 1 | """Liveweb configuration. 2 | 3 | This is initialized by calling the load(configfile) function on startup. 4 | """ 5 | import yaml 6 | import os 7 | import logging 8 | from ConfigParser import ConfigParser 9 | 10 | from .cli import make_config 11 | 12 | # The config options and default values are specified in make_config function in cli.py 13 | 14 | def init_defaults(): 15 | global _config 16 | _config = make_config() 17 | globals().update(_config.dict()) 18 | 19 | def load(): 20 | """Loads the configuration from environment, config file and command-line arguments. 
21 | """ 22 | global _config 23 | _config = make_config() 24 | _config.load() 25 | globals().update(_config.dict()) 26 | 27 | extra_headers = {} 28 | 29 | def get_connect_timeout(): 30 | return connect_timeout or default_timeout 31 | 32 | def get_initial_data_timeout(): 33 | return initial_data_timeout or default_timeout 34 | 35 | def get_read_timeout(): 36 | return read_timeout or default_timeout 37 | 38 | def get_dns_timeout(): 39 | return dns_timeout or default_timeout 40 | 41 | # initialize the default configuration 42 | init_defaults() 43 | 44 | # handy function to check for existance of a config parameter 45 | get = globals().get 46 | -------------------------------------------------------------------------------- /liveweb/configutil.py: -------------------------------------------------------------------------------- 1 | """Utility to work with configuration. 2 | 3 | The configuration can be changed in many different ways. The configuration sources in the order of priority are: 4 | 5 | * The command-line arguments 6 | * The config file 7 | * The environment variables 8 | 9 | The optparse module alone is not sufficient to handle this. If the 10 | OptionParser is used with default values, there is no way to know if 11 | the option is specified as command line argument or it is a default 12 | value. 13 | 14 | Also, we need a way to specify time in seconds/minutes/hours and bytes 15 | in KB/MB/GB. These formats should work for all the 3 sources. 16 | 17 | This module provides a framework to address these issues. 18 | """ 19 | import os 20 | import sys 21 | import logging 22 | from ConfigParser import ConfigParser 23 | import optparse 24 | 25 | class Config: 26 | """Entry point to configuration. 27 | 28 | This provides way to set the configuration using environment 29 | variables, config file and command line parameters. 30 | 31 | The configuration is made of many ConfigOptions. 
Each ConfigOption 32 | accounts for one environment variable, one setting in the config 33 | file and one command line variable. 34 | 35 | This class provides tools to read/set environment variable, load 36 | config file and create OptionParser to parse the command-line 37 | arguments. 38 | """ 39 | def __init__(self, name): 40 | self.name = name 41 | self.config_options = [] 42 | 43 | def add_option(self, *args, **kwargs): 44 | """Creates a new ConfigOption created using the specified arguments and adds it to this Config. 45 | """ 46 | option = ConfigOption(*args, **kwargs) 47 | self.config_options.append(option) 48 | 49 | def get(self, name): 50 | return self.dict().get(name) 51 | 52 | def dict(self, dirty=None): 53 | """Returns values of all the config options as a dict. 54 | 55 | If dirty=True is specified, only the values of the modified options are returned. 56 | """ 57 | return dict((c.name, c.value) for c in self.config_options 58 | if dirty is None or c.dirty==dirty) 59 | 60 | def putenv(self): 61 | """Updates this process env with environment variables 62 | indicating current configuration. 63 | 64 | Useful to set config values before exec'ing a new process. 65 | """ 66 | for c in self.config_options: 67 | c.putenv() 68 | 69 | def load(self, env=None, args=None): 70 | """Loads the configuration from environment, config-file and command-line arguments. 71 | """ 72 | self.load_from_env(env) 73 | 74 | p = self.create_optparse_parser() 75 | options, args2 = p.parse_args(args) 76 | 77 | # take config file from command-line or env 78 | # TODO: support an option to provide an alternative name for "config" 79 | config_file = getattr(options, "config", None) or self.get("config") 80 | if config_file: 81 | self.load_from_ini(config_file) 82 | 83 | self.load_from_optparse_options(options) 84 | 85 | def load_from_env(self, env=None): 86 | """Loads the configuration from environment. 
87 | """ 88 | for c in self.config_options: 89 | c.load_from_env(env) 90 | 91 | def load_from_ini(self, filename): 92 | """Loads the configuration from a config file. 93 | """ 94 | p = ConfigParser() 95 | p.read(filename) 96 | 97 | for c in self.config_options: 98 | # using name as used in command line options in the config file 99 | name = c.name.replace("_", "-") 100 | if p.has_option(self.name, name): 101 | c.set(p.get(self.name, name, raw=True)) 102 | 103 | def create_optparse_parser(self): 104 | p = optparse.OptionParser(self.name) 105 | for c in self.config_options: 106 | p.add_option(c.option) 107 | return p 108 | 109 | def load_from_optparse_options(self, options): 110 | options = options.__dict__ 111 | for c in self.config_options: 112 | if options.get(c.name) is not None: 113 | c.set(options[c.name]) 114 | 115 | class ConfigOption: 116 | """Represents one entry in the Configuration. 117 | 118 | This corresponds to one environment variable, one config setting and one command line option. 119 | """ 120 | def __init__(self, *opts, **kw): 121 | self.type = kw.pop("type", "string") 122 | self.default = kw.pop("default", None) 123 | self.help = kw.pop("help", None) 124 | 125 | help = self.help and self.help.replace("%default", str(self.default)) 126 | 127 | if self.type == "bool": 128 | self.option = _Option(*opts, action="store_true", help=help, **kw) 129 | else: 130 | type = self.type 131 | self.option = _Option(*opts, type=type, help=help, **kw) 132 | 133 | self.name = self.option.dest 134 | self.kw = kw 135 | 136 | self.set(self.default) 137 | 138 | @property 139 | def dest(self): 140 | return self.option.dest 141 | 142 | @property 143 | def optname(self): 144 | return self.name.replace("_", "-") 145 | 146 | @property 147 | def envname(self): 148 | """Name of the enviroment variable to specify this. 
149 | """ 150 | return "LIVEWEB_" + self.name.upper() 151 | 152 | @property 153 | def dirty(self): 154 | """True if the value of this item is modified.""" 155 | return self.strvalue != self.default 156 | 157 | def set(self, value): 158 | if value is None: 159 | self.value = None 160 | self.strvalue = value 161 | else: 162 | self.strvalue = str(value) 163 | self.value = self.parse(value) 164 | 165 | def parse(self, value): 166 | if self.type == "bool": 167 | return self.parse_boolean(value) 168 | else: 169 | return self.option.convert_value("--" + self.optname, str(value)) 170 | 171 | def parse_boolean(self, strvalue): 172 | return str(strvalue).lower() in ["true", "1"] 173 | 174 | def load_from_env(self, env=None): 175 | if env is None: 176 | env = os.environ 177 | if self.envname in env: 178 | self.set(env[self.envname]) 179 | 180 | def putenv(self): 181 | if self.dirty or self.envname in os.environ: 182 | os.putenv(self.envname, self.strvalue) 183 | 184 | def add_option(self, option_parser): 185 | option_parser.add_option("--" + self.optname, help=self.help, **kw) 186 | 187 | def parse_time(strvalue): 188 | """Parses time specified in seconds, minutes and hours into seconds. 189 | 190 | Time is specifed in seconds, minutes and hours using suffix s, m 191 | and h respectively. This method parses that info and converts that 192 | with appropriate multipler to convert into seconds. 193 | """ 194 | if not isinstance(strvalue, basestring): 195 | return strvalue 196 | 197 | strvalue = strvalue.replace(" ", "") 198 | scales = { 199 | 's': 1, 200 | 'm': 60, 201 | 'h': 3600 202 | } 203 | 204 | if strvalue[-1] in scales.keys(): 205 | scale = scales[strvalue[-1]] 206 | strvalue = strvalue[:-1] 207 | else: 208 | scale = 1 209 | 210 | t = float(strvalue) * scale 211 | return t 212 | 213 | def parse_bytes(strvalue): 214 | """Parses the bytes specified as KB, MB and GB into number. 
215 | """ 216 | if not isinstance(strvalue, basestring): 217 | return strvalue 218 | 219 | strvalue = strvalue.replace(" ", "") 220 | scales = { 221 | "KB": 1024, 222 | "MB": 1024**2, 223 | "GB": 1024**3 224 | } 225 | if strvalue[-2:] in scales: 226 | scale = scales[strvalue[-2:]] 227 | strvalue = strvalue[:-2] 228 | else: 229 | scale = 1 230 | size = int(strvalue) * scale 231 | return size 232 | 233 | def wrap_checker(f): 234 | def g(option, opt, value): 235 | try: 236 | return f(value) 237 | except ValueError: 238 | what = option.type 239 | raise optparse.OptionValueError( 240 | "option %s: invalid %s value: %r" % (opt, what, value)) 241 | return g 242 | 243 | class _Option(optparse.Option): 244 | """Customized Option class to support time and bytes types. 245 | """ 246 | TYPES = optparse.Option.TYPES + ("time", "bytes") 247 | TYPE_CHECKER = dict(optparse.Option.TYPE_CHECKER, 248 | bytes=wrap_checker(parse_bytes), 249 | time=wrap_checker(parse_time)) 250 | 251 | -------------------------------------------------------------------------------- /liveweb/errors.py: -------------------------------------------------------------------------------- 1 | """Exceptions raised by liveweb internals. 2 | """ 3 | class LivewebException(Exception): 4 | pass 5 | 6 | class BadURL(LivewebException): 7 | """Raised if the given URL was malformed in some way. 8 | """ 9 | pass 10 | 11 | class ConnectionFailure(LivewebException, IOError): 12 | """Raised if a connection to the remote URL couldn't be established or was 13 | interrupted. 14 | """ 15 | pass 16 | 17 | class TimeoutError(LivewebException, IOError): 18 | """Raised if when a connection is timedout. 
19 | """ 20 | pass 21 | 22 | -------------------------------------------------------------------------------- /liveweb/file_pool.py: -------------------------------------------------------------------------------- 1 | """ 2 | File pool implementation 3 | """ 4 | 5 | import datetime 6 | import os 7 | import Queue 8 | import random 9 | import threading 10 | import socket 11 | import itertools 12 | 13 | import logging 14 | logging.basicConfig(level = logging.DEBUG) 15 | 16 | class MemberFile(object): 17 | """ 18 | """ 19 | def __init__(self, name, pool, *largs, **kargs): 20 | self.fp = open(name, *largs, **kargs) 21 | self.pool = pool 22 | 23 | def __enter__(self): 24 | return self 25 | 26 | def __exit__(self, exc_type, exc_value, traceback): 27 | self.pool.return_file(self) 28 | 29 | 30 | def __getattr__(self, attr): 31 | return getattr(self.fp, attr) 32 | 33 | 34 | class FilePool(object): 35 | """ 36 | Implements a pool of files from which a file can be requested. 37 | 38 | """ 39 | def __init__(self, directory, pattern="liveweb-%(timestamp)s-%(serial)05d.arc.gz", max_files=1, max_file_size=100*1024*1024, init_file_func=None): 40 | """ 41 | Creates a pool of files in the given directory with the 42 | specified pattern. 43 | 44 | The number of files is max_files and the maximum size of each 45 | file is max_file_size. 46 | 47 | The `get_file` method returns a new file from the pool 48 | 49 | """ 50 | self.directory = directory 51 | self.pattern = pattern 52 | self.max_files = max_files 53 | self.max_file_size = max_file_size 54 | self.init_file_func = init_file_func 55 | 56 | self.queue = Queue.Queue(self.max_files) 57 | 58 | self.seq_counter = itertools.count() 59 | 60 | # vars required to substitue filename pattern. 
        # vars required to substitute into the filename pattern
        self._port = os.getenv("LIVEWEB_PORT", "0")
        self._host = socket.gethostname()
        self._pid = os.getpid()

        # Adding None to queue indicating that new file needs to be created
        for i in range(self.max_files):
            self.queue.put(None)

    def set_sequence(self, counter):
        """Sets the sequence counter used to generate filename.

        Used to set a distributed persistent counter using redis/database.

        :param counter: An iterable counter
        """
        self.seq_counter = counter

    def _new_file(self):
        """Creates a new pool file in the "partial" subdirectory, named
        by filling this pool's pattern with timestamp/serial/host vars.
        """
        timestamp = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S%f")
        pattern_dict = dict(
            timestamp=timestamp,
            timestamp20=timestamp,
            timestamp17=timestamp[:17],
            timestamp14=timestamp[:14],
            serial=self.seq_counter.next(),
            port=self._port,
            host=self._host,
            # NOTE(review): fqdn is filled with gethostname(), which is not
            # always fully qualified -- confirm whether getfqdn() was intended.
            fqdn=self._host,
            pid=self._pid)

        fname = self.pattern%pattern_dict
        partial_dir = os.path.join(self.directory, 'partial')
        absolute_name = os.path.join(partial_dir, fname)

        logging.info("Creating new file %s", absolute_name)

        # open in append mode so an existing partial file is never clobbered
        fp = MemberFile(absolute_name, self, mode = "ab")
        # Initialize the file object like writing file headers etc.
        if self.init_file_func:
            self.init_file_func(fp)
        return fp

    def return_file(self, f):
        """Returns a file to the pool.
Will discard the file and 105 | insert a new one if the file is above max_file_size.""" 106 | logging.debug("Returning %s",f) 107 | file_size = f.tell() 108 | if file_size < self.max_file_size: 109 | logging.debug(" Put it back") 110 | self.queue.put(f) 111 | else: 112 | logging.debug(" Closing and creating a new file") 113 | f.close() 114 | complete_dir = os.path.join(self.directory, 'complete') 115 | basename = os.path.basename(f.name) 116 | complete_name = os.path.join(complete_dir, basename) 117 | os.rename(f.name, complete_name) 118 | self.queue.put(None) 119 | 120 | def get_file(self): 121 | f = self.queue.get() 122 | # f is None when new file needs to be created 123 | if f is None: 124 | f = self._new_file() 125 | logging.debug("Getting %s",f) 126 | return f 127 | 128 | def close(self): 129 | logging.debug("Closing all descriptors. Emptying pool.") 130 | while not self.queue.empty(): 131 | fp = self.queue.get_nowait() 132 | if fp: 133 | fp.close() 134 | -------------------------------------------------------------------------------- /liveweb/filetools.py: -------------------------------------------------------------------------------- 1 | from cStringIO import StringIO 2 | import httplib 3 | import tempfile 4 | import logging 5 | import os 6 | 7 | from . import config 8 | 9 | class SizeLimitExceeded(IOError): pass 10 | 11 | 12 | def spy(fileobj, spyobj = None, max_size = None): 13 | """Returns a new file wrapper the records the contents of a file 14 | as someone is reading from it. 15 | """ 16 | return SpyFile(fileobj, spyobj, max_size) 17 | 18 | class SpyFile: 19 | """File wrapper to record the contents of a file as someone is 20 | reading from it. 21 | 22 | If the "spy" parameter is passed, it will be the stream to which 23 | the read data is written. 

    SpyFile works like a "tee":

        Actual client <--- SpyFile <--- Data Source
                              |
                              v
                             spy
                         (spy object)
    """
    def _check_size(self):
        """Raises SizeLimitExceeded if the SpyFile has seen more data
        than the specified limit"""
        if self.max_size:
            if self.current_size > int(self.max_size):
                raise SizeLimitExceeded("Spy file limit exceeded %d (max size : %d)"%(self.current_size, self.max_size))

    def __init__(self, fileobj, spy = None, max_size = None):
        self.fileobj = fileobj        # the wrapped data source
        self.buf = spy or StringIO()  # stream that receives a copy of all reads
        self.max_size = max_size      # optional cap on total bytes read
        self.current_size = 0         # bytes seen so far

    def read(self, *a, **kw):
        # read from the source, mirror into the spy, enforce the size cap
        text = self.fileobj.read(*a, **kw)
        self.buf.write(text)
        self.current_size += len(text)
        self._check_size()
        return text

    def readline(self, *a, **kw):
        # line-oriented variant of read(); same mirroring and size check
        text = self.fileobj.readline(*a, **kw)
        self.buf.write(text)
        self.current_size += len(text)
        self._check_size()
        return text

    def readlines(self):
        return list(self)

    def __iter__(self):
        # iterate line by line until the source is exhausted
        while True:
            line = self.readline()
            if not line:
                break
            yield line

    def close(self):
        # NOTE: closes only the source; the spy stream is left open
        self.fileobj.close()

    def change_spy(self, fileobj):
        "Changes the file which receives the spied upon data to fileobj"
        self.buf.flush()
        self.buf.close()
        self.buf = fileobj


class SpyHTTPResponse(httplib.HTTPResponse):
    """HTTPResponse that records the raw HTTP payload as it is read."""
    def __init__(self, *a, **kw):
        httplib.HTTPResponse.__init__(self, *a, **kw)
        from . import config
        # NOTE(review): the config (see cli.py) defines "max_response_size",
        # not "max_payload_size" -- this attribute lookup looks like it would
        # raise AttributeError. Confirm which setting is intended.
        self.fp = spy(self.fp, None, config.max_payload_size)


class MemFile:
    """Something like StringIO, but switches to a temp file when the maxsize is crossed.
94 | """ 95 | def __init__(self, maxsize=1024*1024, tmpdir=None, prefix="memfile-", suffix=".tmp"): 96 | self.maxsize = maxsize 97 | 98 | self.tmpdir = tmpdir 99 | self.prefix = prefix 100 | self.suffix = suffix 101 | 102 | self._fileobj = StringIO() 103 | 104 | def in_memory(self): 105 | """Returns True if the file is in memory.""" 106 | return not isinstance(self._fileobj, file) 107 | 108 | def __getattr__(self, name): 109 | return getattr(self._fileobj, name) 110 | 111 | def _open_tmpfile(self): 112 | # The TemporaryFile gets deleted automatically when it is closed or when it is garbage collected. 113 | return tempfile.TemporaryFile(dir=self.tmpdir, prefix=self.prefix, suffix=self.suffix) 114 | 115 | def _switch_to_disk(self): 116 | content = self._fileobj.getvalue() 117 | self._fileobj = self._open_tmpfile() 118 | self._fileobj.write(content) 119 | 120 | def write(self, data): 121 | if self.in_memory() and self.tell() + len(data) > self.maxsize: 122 | self._switch_to_disk() 123 | self._fileobj.write(data) 124 | 125 | def writelines(self, lines): 126 | for line in lines: 127 | self.write(line) 128 | 129 | def close(self): 130 | """Deletes the temp file if created. 131 | """ 132 | if self._fileobj and not self.in_memory(): 133 | logging.info("removing temp file %s", self._fileobj.name) 134 | os.unlink(self._fileobj.name) 135 | 136 | class DummyFilePool: 137 | """Simple implementation of FilePool. 138 | """ 139 | counter = 0 140 | 141 | def get_file(self): 142 | filename = "/tmp/record-%d.arc.gz" % self.counter 143 | while os.path.exists(filename): 144 | self.counter += 1 145 | filename = "/tmp/record-%d.arc.gz" % self.counter 146 | return open(filename, "w") 147 | 148 | def fileiter(file, size, chunk_size=1024*10): 149 | """Returns an iterator over the file for specified size. 150 | 151 | The chunk_size specified the amount of data read in each step. 
152 | """ 153 | completed = 0 154 | while completed < size: 155 | nbytes = min(size-completed, chunk_size) 156 | content = file.read(nbytes) 157 | if not content: 158 | break 159 | yield content 160 | completed += len(content) 161 | 162 | def test(): 163 | import httplib 164 | conn = httplib.HTTPConnection("openlibrary.org") 165 | conn.response_class = SpyHTTPResponse 166 | 167 | conn.request("GET", "/") 168 | res = conn.getresponse() 169 | fp = res.fp 170 | 171 | print fp.buf.getvalue() 172 | 173 | res.read() 174 | print fp.buf.getvalue() 175 | 176 | if __name__ == "__main__": 177 | test() 178 | -------------------------------------------------------------------------------- /liveweb/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | 5 | logging.basicConfig(level=logging.INFO, 6 | format="%(asctime)s %(threadName)18s %(levelname)5s: %(message)s", 7 | datefmt="%Y-%m-%d %H:%M:%S") 8 | 9 | logging.info("starting liveweb-proxy") 10 | 11 | from . import config 12 | 13 | # load config 14 | config.load() 15 | 16 | # Make sure the storage directory exists 17 | partial_dir = os.path.join(config.output_directory, 'partial') 18 | complete_dir = os.path.join(config.output_directory, 'complete') 19 | if not os.path.exists(partial_dir): 20 | os.makedirs(partial_dir) 21 | if not os.path.exists(complete_dir): 22 | os.makedirs(complete_dir) 23 | 24 | from . import webapp 25 | 26 | # Intialize 27 | webapp.setup() 28 | 29 | application = webapp.application 30 | -------------------------------------------------------------------------------- /liveweb/proxy.py: -------------------------------------------------------------------------------- 1 | """The proxy functionality. 
2 | """ 3 | 4 | import datetime 5 | import gzip 6 | import httplib 7 | import logging 8 | import os 9 | import socket 10 | import urllib 11 | from cStringIO import StringIO 12 | import tempfile 13 | import sys 14 | import errno 15 | import time 16 | 17 | from warc import arc 18 | from warc.utils import FilePart 19 | from . import filetools 20 | from . import config 21 | 22 | MEG = 1024 * 1024 23 | 24 | EMPTY_BUFFER = filetools.MemFile() 25 | 26 | # 1x - bad input 27 | ERR_INVALID_URL = 10, "invalid URL" 28 | 29 | # 2x - DNS errors 30 | ERR_INVALID_DOMAIN = 20, "invalid domain" 31 | ERR_DNS_TIMEOUT = 21, "dns timeout" 32 | 33 | # 3x - connction errors 34 | ERR_CONN_REFUSED = 30, "connection refused" 35 | ERR_CONN_TIMEOUT = 31, "connection timedout" 36 | ERR_INITIAL_DATA_TIMEOUT = 32, "initial data timeout" 37 | ERR_READ_TIMEOUT = 33, "read timeout" 38 | ERR_CONN_DROPPED = 34, "connection dropped" 39 | ERR_CONN_MISC = 39, "unexpected connection error" 40 | 41 | # 4x - resource errors 42 | ERR_RESPONSE_TOO_BIG = 40, "response too big" 43 | ERR_REQUEST_TIMEOUT = 41, "request took too long to finish" 44 | 45 | 46 | class ProxyError(Exception): 47 | def __init__(self, error, cause=None, data=None): 48 | self.errcode, self.errmsg = error 49 | 50 | if isinstance(cause, socket.error) and cause.errno: 51 | cause_msg = "%s: %s" % (errno.errorcode.get(cause.errno, cause.errno), cause.strerror) 52 | else: 53 | cause_msg = cause and ("%s: %s" % (cause.__class__.__name__, str(cause))) 54 | 55 | msg = "E%02d: %s" % (self.errcode, self.errmsg) 56 | 57 | if cause_msg: 58 | msg += " (" + cause_msg + ")" 59 | 60 | if data: 61 | msg += " %s" % data 62 | 63 | Exception.__init__(self, msg) 64 | 65 | class Record: 66 | """A class to hold together the filepath, content_length, and iterator over content. 67 | """ 68 | def __init__(self, filename, offset=0, content_length=None, content_iter=None): 69 | """Creates a new record instance. 
70 | 71 | :param filename: Relative or absolute path to the file that has this record. 72 | :param offset: The offset in the file where this record started. 73 | :param content_length: The total length of the record. 74 | :param content_iter: An iterator over content. 75 | """ 76 | self.filename = filename 77 | self.offset = offset 78 | self.content_length = content_length 79 | self.content_iter = content_iter 80 | 81 | if self.content_length is None: 82 | self.content_length = os.stat(filename).st_size 83 | 84 | if self.content_iter is None: 85 | f = open(self.filename, 'rb') 86 | f.seek(self.offset) 87 | self.content_iter = filetools.fileiter(f, self.content_length) 88 | 89 | def read_all(self): 90 | """Reads all the data from content_iter and reinitializes the 91 | content_iter with the data read. 92 | 93 | Since this reads all the data into memory, this should be used 94 | only when content_length is not very big. 95 | """ 96 | data = "".join(self.content_iter) 97 | self.content_iter = iter([data]) 98 | return data 99 | 100 | def __iter__(self): 101 | return iter(self.content_iter) 102 | 103 | def split_type_host(url): 104 | """Returns (type, host, selector) from the url. 105 | """ 106 | type, rest = urllib.splittype(url) 107 | host, selector = urllib.splithost(rest) 108 | return type, host, selector 109 | 110 | 111 | def log_error(err): 112 | code, msg = err 113 | exc_type, exc_value, _ = sys.exc_info() 114 | logging.error("E%02d - %s (%s)", code, msg, str(exc_value)) 115 | 116 | def urlopen(url): 117 | """Works like urllib.urlopen, but returns a ProxyHTTPResponse object instead. 118 | """ 119 | logging.info("urlopen %s", url) 120 | 121 | try: 122 | return _urlopen(url) 123 | except ProxyError, e: 124 | logging.error("%s - %s", str(e), url) 125 | response = ProxyHTTPResponse(url, None, method="GET") 126 | response.error_bad_gateway() 127 | return response 128 | 129 | def _urlopen(url): 130 | """urlopen without the exception handling. 

    Called by urlopen and test cases.
    """
    headers = config.get("extra_headers",{})
    headers['User-Agent'] = config.user_agent

    type, host, selector = split_type_host(url)

    if type.lower() == "https":
        conn = ProxyHTTPSConnection(host, url=url)
    else:
        conn = ProxyHTTPConnection(host, url=url)

    conn.request("GET", selector, headers=headers)
    return conn.getresponse()

class _FakeSocket:
    """Faking a socket with makefile method.

    Used when synthesizing error responses with no real network connection
    (see the except branch of urlopen).
    """
    def __init__(self, fileobj=None):
        self.fileobj = fileobj or StringIO()

    def makefile(self, mode="rb", bufsize=0):
        return self.fileobj

    def getpeername(self):
        # Dummy address; recorded as the remote IP of synthesized responses.
        return ("0.0.0.0", 80)

    def settimeout(self, timeout):
        pass

class SocketWrapper:
    """The socket.socket class doesn't have a way to enforce max-time and max-size limits.

    This extends the socket functionality by adding those constraints.
    """
    def __init__(self, sock, max_time=None, max_size=None):
        # max_time: seconds allowed for the whole request; max_size: total
        # bytes allowed across all recv calls. None disables the check.
        self._sock = sock
        self._max_time = max_time
        self._max_size = max_size

        self._start_time = time.time()
        self._bytes_read = 0

    def __getattr__(self, name):
        # Delegate everything else to the wrapped socket.
        return getattr(self._sock, name)

    def recv(self, bufsize):
        # Receives data, raising ProxyError once either limit is exceeded.
        data = self._sock.recv(bufsize)
        self._bytes_read += len(data)

        # TODO: optimize this
        # Each time.time() call takes about 0.35 ns.
        # For reading headers, this function is called once of each byte.
        # Assuming that the headers is 1000 bytes long, it will add an overhead of 0.35ms.
        # We should optimize this if we care about half-a-milli-second.

        if self._max_time is not None and time.time() - self._start_time > self._max_time:
            raise ProxyError(ERR_REQUEST_TIMEOUT, data={"max_time": self._max_time})

        if self._max_size is not None and self._bytes_read > self._max_size:
            raise ProxyError(ERR_RESPONSE_TOO_BIG, data={"max_size": self._max_size})

        return data

    def makefile(self, mode='r', bufsize=-1):
        # Route file-style reads through self so recv() limits still apply.
        return socket._fileobject(self, mode, bufsize)

class ProxyHTTPResponse(httplib.HTTPResponse):
    """HTTPResponse wrapper to record the HTTP payload.

    Provides utility methods to write ARC and WARC files.
    """
    # Content type recorded when none is available.
    DEFAULT_CONTENT_TYPE = "unk"

    def __init__(self, url, sock, *a, **kw):
        # sock may be None for synthesized error responses.
        self.sock = sock or _FakeSocket()
        httplib.HTTPResponse.__init__(self, self.sock, *a, **kw)

        self.url = url
        self.remoteip = self.sock.getpeername()[0]
        self.content_type = self.DEFAULT_CONTENT_TYPE
        self.buf = EMPTY_BUFFER

        # Length of header data
        self.header_offset = 0

        self.arc_size = None
        self.arc_data = None

    def begin(self):
        # Wrap fp so every byte read is also copied ("spied") into self.buf,
        # preserving the verbatim HTTP payload for archiving.
        self.fp = filetools.SpyFile(self.fp, spy=filetools.MemFile())
        self.buf = self.fp.buf

        try:
            self.sock.settimeout(config.get_initial_data_timeout())
            httplib.HTTPResponse.begin(self)

            ctype = self.getheader("content-type", self.DEFAULT_CONTENT_TYPE)
            self.content_type = self.parse_content_type(ctype)
            # buf now holds exactly the header bytes; remember where they end.
            self.header_offset = self.buf.tell()
        except socket.error, e:
            raise ProxyError(ERR_INITIAL_DATA_TIMEOUT, e, {"initial_data_timeout": config.get_initial_data_timeout()})
        except httplib.HTTPException, e:
            raise ProxyError(ERR_CONN_MISC, e)

        try:
            # This will read the whole payload, taking care of content-length,
            # chunked transfer-encoding etc.. The spy file will record the real
            # HTTP payload.
241 | self.sock.settimeout(config.get_read_timeout()) 242 | self.read() 243 | except httplib.IncompleteRead, e: 244 | raise ProxyError(ERR_CONN_DROPPED, e) 245 | except httplib.HTTPException: 246 | raise ProxyError(ERR_CONN_MISC, e) 247 | except socket.error, e: 248 | raise ProxyError(ERR_READ_TIMEOUT, e, data={"read_timeout": config.get_read_timeout()}) 249 | 250 | def parse_content_type(self, ctype): 251 | # If there are multiple content-type headers, httplib joins them using ", " 252 | # Take the last one in that case 253 | ctype = ctype.split(",")[-1] 254 | 255 | # content-type can have parameters separated by semicolon. 256 | # For example: text/html; charset=UTF-8 257 | ctype = ctype.split(";")[0] 258 | 259 | # strip leading and trailing whitespace 260 | ctype = ctype.strip() 261 | 262 | # remove any whitespace as it may interfere with arc header 263 | ctype = ctype.replace(" ", "") 264 | return ctype 265 | 266 | def error_bad_gateway(self): 267 | """Resets the status code to "502 Bad Gateway" indicating that there was 268 | some network error when trying to accessing the server. 269 | """ 270 | self._error(502, "Bad Gateway") 271 | 272 | def error_bad_url(self): 273 | """Resets the status code to "400 Bad Request" indicating that the URL provided is bad. 
274 | """ 275 | self._error(400, "Bad Request") 276 | 277 | def _error(self, status, reason): 278 | self.version = "HTTP/1.1" 279 | self.status = status 280 | self.reason = reason 281 | self.content_type = self.DEFAULT_CONTENT_TYPE 282 | 283 | # close file 284 | if self.fp: 285 | self.fp.close() 286 | self.fp = None 287 | 288 | self.buf = EMPTY_BUFFER 289 | self.header_offset = 0 290 | 291 | def write_arc(self, pool): 292 | record = self._make_arc_record() 293 | 294 | # if small enough, store in memory 295 | if record.header.length < MEG: 296 | # write ARC record into memory 297 | buf = StringIO() 298 | begin, record_size = self._write_arc_record(record, buf) 299 | 300 | # write the ARC record data in memory into file 301 | with pool.get_file() as f: 302 | logging.info("writing arc record to file %s", f.name) 303 | begin = f.tell() 304 | f.write(buf.getvalue()) 305 | filename = f.name 306 | 307 | return Record(filename, offset=begin, content_length=record_size, content_iter=iter([buf.getvalue()])) 308 | else: 309 | with pool.get_file() as f: 310 | logging.info("writing arc record to file %s", f.name) 311 | filename = f.name 312 | begin, record_size = self._write_arc_record(record, f) 313 | 314 | return Record(filename, offset=begin, content_length=record_size) 315 | 316 | def _write_arc_record(self, record, fileobj): 317 | """Writes the give ARC record into the given fileobj as gzip data and returns the start offset in the file and and record size. 318 | """ 319 | begin = fileobj.tell() 320 | 321 | zfile = gzip.GzipFile(fileobj=fileobj, filename=None, mode="w") 322 | record.write_to(zfile) 323 | zfile.close() 324 | fileobj.flush() 325 | 326 | end = fileobj.tell() 327 | return begin, end-begin 328 | 329 | def _make_arc_record(self): 330 | if self.status == 502: 331 | # Match the response of liveweb 1.0 incase of gateway errors. 
332 | payload = "HTTP 502 Bad Gateway\n\n" 333 | payload_length = len(payload) 334 | content_type = "unk" 335 | remoteip = "0.0.0.0" 336 | else: 337 | # We've finished writing to the buf. The file-pointer will be at 338 | # the end of the file. Calling tell should give the file size. 339 | payload_length = self.buf.tell() 340 | 341 | # move the file pointer to the beginning of the file, so that we can read 342 | self.buf.seek(0) 343 | payload = self.buf 344 | remoteip = self.remoteip 345 | content_type = self.content_type 346 | 347 | headers = dict(url = self.url, 348 | date = self._utcnow(), 349 | content_type = self.content_type, 350 | ip_address = self.remoteip, 351 | length = payload_length) 352 | return arc.ARCRecord(headers = headers, 353 | payload = payload, 354 | version = 1) 355 | 356 | def _utcnow(self): 357 | """Returns datetime.datetime.utcnow(). 358 | 359 | Provided as a method here so that it is easy to monkeypatch for testing. 360 | """ 361 | return datetime.datetime.utcnow() 362 | 363 | def write_warc(self, pool): 364 | raise NotImplementedError() 365 | 366 | def get_arc(self): 367 | """Returns size and fileobj to read arc data. 368 | 369 | This must be called only after calling write_arc method. 370 | """ 371 | if self.arc_data is None: 372 | self.write_arc() 373 | return self.arc_size, self.arc_data 374 | 375 | def get_warc(self): 376 | """Returns size and fileobj to read warc data.""" 377 | raise NotImplementedError() 378 | 379 | def get_payload(self): 380 | """Returns size and fileobj to read HTTP payload. 381 | """ 382 | # go to the end find the filesize 383 | self.buf.seek(0, 2) 384 | size = self.buf.tell() - self.header_offset 385 | 386 | # go to the beginning 387 | self.buf.seek(self.header_offset) 388 | return filetools.fileiter(self.buf, size) 389 | 390 | class ProxyConnectionMixin: 391 | """Mixin to add extra functionality to HTTP/HTTPS connection to handle errors differently. 
392 | """ 393 | _base_connection_class = httplib.HTTPConnection 394 | _proxy_response_class = ProxyHTTPResponse 395 | 396 | def __init__(self, host, url): 397 | try: 398 | self._base_connection_class.__init__(self, host) 399 | except httplib.InvalidURL, e: 400 | raise ProxyError(ERR_INVALID_URL, e) 401 | 402 | self.url = url 403 | self.response_class = lambda *a, **kw: self._proxy_response_class(self.url, *a, **kw) 404 | 405 | # This is used when creating the socket connection 406 | self.timeout = config.get_connect_timeout() 407 | 408 | def connect(self): 409 | try: 410 | self._base_connection_class.connect(self) 411 | self.sock = SocketWrapper(self.sock, config.max_request_time, config.max_response_size) 412 | except socket.gaierror, e: 413 | # -3: Temporary failure in name resolution 414 | # Happens when DNS request is timeout 415 | if e.errno == -3: 416 | raise ProxyError(ERR_DNS_TIMEOUT, e, data={"dns_timeout": config.get_dns_timeout()}) 417 | else: 418 | raise ProxyError(ERR_INVALID_DOMAIN, e) 419 | except socket.timeout, e: 420 | raise ProxyError(ERR_CONN_TIMEOUT, e, data={"conn_timeout": config.get_connect_timeout()}) 421 | except socket.error, e: 422 | msg = e.strerror or "" 423 | if e.errno == errno.ECONNREFUSED: 424 | raise ProxyError(ERR_CONN_REFUSED, e) 425 | else: 426 | raise ProxyError(ERR_CONN_MISC, e) 427 | return self.sock 428 | 429 | def request(self, method, url, body=None, headers={}): 430 | try: 431 | self._base_connection_class.request(self, method, url, body=body, headers=headers) 432 | except socket.error, e: 433 | raise ProxyError(ERR_CONN_MISC, e) 434 | 435 | 436 | class ProxyHTTPConnection(ProxyConnectionMixin, httplib.HTTPConnection): 437 | """HTTPConnection wrapper to add extra hooks to handle errors. 438 | """ 439 | _base_connection_class = httplib.HTTPConnection 440 | 441 | class ProxyHTTPSConnection(ProxyConnectionMixin, httplib.HTTPSConnection): 442 | """HTTPSConnection wrapper to add extra hooks to handle errors. 
443 | """ 444 | _base_connection_class = httplib.HTTPSConnection 445 | -------------------------------------------------------------------------------- /liveweb/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/liveweb/daf121365b959477e3b90ae07bcd72959f5be856/liveweb/tests/__init__.py -------------------------------------------------------------------------------- /liveweb/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import urllib 4 | import subprocess 5 | import time 6 | 7 | def pytest_funcarg__pooldir(request): 8 | "Creates a directory for the pool" 9 | dirname = "/tmp/pool-xxx" 10 | 11 | if os.path.exists(dirname): 12 | shutil.rmtree(dirname) 13 | 14 | os.makedirs(os.path.join(dirname, 'partial')) 15 | os.makedirs(os.path.join(dirname, 'complete')) 16 | 17 | request.addfinalizer(lambda : shutil.rmtree(dirname)) 18 | 19 | return dirname 20 | 21 | 22 | def pytest_funcarg__webtest(request): 23 | dirname = os.path.dirname(__file__) 24 | path = os.path.join(dirname, "webtest.py") 25 | port = 9876 26 | 27 | p = subprocess.Popen(['python', path, str(port)]) 28 | request.addfinalizer(p.kill) 29 | 30 | # Return an object with url and port atrributes 31 | x = lambda: None 32 | x.url = "http://127.0.0.1:%d" % port 33 | x.port = port 34 | 35 | # wait until the server is ready, with max-tries=20 36 | for i in range(20): 37 | print i 38 | try: 39 | urllib.urlopen(x.url + "/") 40 | break 41 | except IOError: 42 | time.sleep(0.1) 43 | 44 | return x 45 | -------------------------------------------------------------------------------- /liveweb/tests/test_configutil.py: -------------------------------------------------------------------------------- 1 | from ..configutil import ConfigOption, Config, parse_bytes, parse_time 2 | 3 | class TestConfigOption: 4 | def test_add_option(self): 5 | opt 
= ConfigOption("--threads", help="Number of threads") 6 | assert opt.dest == "threads" 7 | 8 | def test_help(self): 9 | opt = ConfigOption("--threads", help="Number of threads") 10 | assert opt.option.help == "Number of threads" 11 | 12 | opt = ConfigOption("--threads", default="10", help="Number of threads (default: %default)") 13 | assert opt.option.help == "Number of threads (default: 10)" 14 | 15 | def test_default(self): 16 | c = ConfigOption("--foo") 17 | assert c.value == None 18 | 19 | c = ConfigOption("--foo", default="foo-default") 20 | assert c.value == "foo-default" 21 | 22 | def test_from_env(self): 23 | c = ConfigOption("--foo", default="foo-default") 24 | c.load_from_env({}) 25 | assert c.value == "foo-default" 26 | 27 | c.load_from_env({"LIVEWEB_FOO": "foo-env"}) 28 | assert c.value == "foo-env" 29 | 30 | def test_putenv(self, monkeypatch): 31 | import os 32 | 33 | environ = {} 34 | monkeypatch.setattr(os, "environ", environ) 35 | monkeypatch.setattr(os, "getenv", environ.__getitem__) 36 | monkeypatch.setattr(os, "putenv", environ.__setitem__) 37 | 38 | c = ConfigOption("--foo", default="foo-default") 39 | 40 | c.putenv() 41 | assert environ == {} 42 | 43 | c.set("new-value") 44 | c.putenv() 45 | assert environ == {"LIVEWEB_FOO": "new-value"} 46 | 47 | def test_parse_bytes(): 48 | assert parse_bytes("5") == 5 49 | assert parse_bytes("5KB") == 5 * 1024 50 | assert parse_bytes("5MB") == 5 * 1024 * 1024 51 | assert parse_bytes("5GB") == 5 * 1024 * 1024 * 1024 52 | 53 | def test_parse_time(): 54 | assert parse_time("5") == 5.0 55 | assert parse_time("5s") == 5.0 56 | assert parse_time("5m") == 5.0 * 60 57 | assert parse_time("5h") == 5.0 * 3600 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /liveweb/tests/test_filepool.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | def test_creation(pooldir): 5 | """ 6 | Tests to see if the 
file pool is created properly. 7 | 8 | Creates a pool and then walks through the pool directory to see if 9 | the expected files are there. 10 | """ 11 | from ..file_pool import FilePool 12 | 13 | # Create the pool 14 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 15 | 16 | # call pool.get_file() so that files are initialized 17 | for i in range(pool.max_files): 18 | pool.get_file() 19 | 20 | # Get files in pool directory. 21 | pool_files = set(glob.glob(pooldir + "/partial/*")) 22 | 23 | # Check if this is the same as what we expect 24 | expected_files = set(["%s/partial/test-%05d"%(pooldir,x) for x in range(0,10)]) 25 | 26 | assert expected_files == pool_files 27 | 28 | def test_get_return(pooldir): 29 | """ 30 | Tests to see if get_file return_file work as expected. 31 | 32 | gets files, checks the number of files in the queue, returns them 33 | and checks again. 34 | """ 35 | 36 | from ..file_pool import FilePool 37 | 38 | # Create the pool 39 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 40 | 41 | fps = [] 42 | 43 | assert len(pool.queue.queue) == 10 44 | 45 | for i in range(1, 6): 46 | fps.append(pool.get_file()) 47 | assert len(pool.queue.queue) == 10 - i 48 | 49 | assert len(pool.queue.queue) == 5 50 | 51 | for i in range(1, 6): 52 | pool.return_file(fps.pop()) 53 | assert len(pool.queue.queue) == 5 + i 54 | 55 | assert len(pool.queue.queue) == 10 56 | 57 | def test_max_file_size(pooldir): 58 | """ 59 | Tests to see if the files are closed and released when the maximum 60 | file size is reached and the file is returned. 61 | """ 62 | from ..file_pool import FilePool 63 | 64 | # Create the pool 65 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 66 | 67 | fp = pool.get_file() 68 | fp.write("test" * 100) # Max size has been exceeded. File should 69 | pool.return_file(fp) # get removed from pool when returned. 
70 | 71 | # queue should have all Nones now. 72 | assert list(pool.queue.queue) == [None] * 10 73 | 74 | complete_files = set(glob.glob(pooldir + "/complete/*")) 75 | expected_complete_files = set(("%s/complete/test-%05d"%(pooldir,0),)) 76 | assert expected_complete_files == complete_files 77 | 78 | 79 | def test_close_pool(pooldir): 80 | """ 81 | Makes sure that the pool is emptied when closed. 82 | 83 | """ 84 | from ..file_pool import FilePool 85 | 86 | # Create the pool 87 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 88 | 89 | pool.close() 90 | 91 | assert len(pool.queue.queue) == 0 92 | 93 | 94 | def test_member_file_context(pooldir): 95 | """ 96 | Tests the context manager behaviour of the MemberFile object. 97 | """ 98 | 99 | from ..file_pool import FilePool 100 | 101 | # Create the pool 102 | pool = FilePool(pooldir, pattern = "test-%(serial)05d", max_files = 10, max_file_size = 10) 103 | 104 | assert len(pool.queue.queue) == 10 105 | 106 | with pool.get_file() as f: 107 | assert len(pool.queue.queue) == 9 108 | name = f.name 109 | f.write("Hello") 110 | 111 | assert len(pool.queue.queue) == 10 112 | pool.close() 113 | 114 | assert open(name).read() == "Hello" 115 | -------------------------------------------------------------------------------- /liveweb/tests/test_filetools.py: -------------------------------------------------------------------------------- 1 | from .. 
import filetools 2 | from cStringIO import StringIO 3 | 4 | class TestMemFile: 5 | def test_nodata(self): 6 | f = filetools.MemFile() 7 | assert f.read() == "" 8 | assert f.readline() == "" 9 | 10 | def test_mem(self): 11 | f = filetools.MemFile(100) 12 | f.write("helloworld") 13 | assert f.tell() == 10 14 | f.seek(0) 15 | assert f.read() == "helloworld" 16 | 17 | def test_readlines(self): 18 | f = filetools.MemFile(100) 19 | f.write("a\nb\nc\n") 20 | f.seek(0) 21 | assert f.readline() == "a\n" 22 | assert f.readline() == "b\n" 23 | assert f.readline() == "c\n" 24 | assert f.readline() == "" 25 | 26 | def test_mem(self): 27 | f = filetools.MemFile(100) 28 | f.write("helloworld" * 10) 29 | assert f.tell() == 100 30 | assert f.in_memory() is True 31 | 32 | f.write("helloworld") 33 | assert f.in_memory() is False 34 | assert f.name is not None 35 | 36 | f.seek(0) 37 | content = f.read() 38 | assert len(content) == 110 39 | assert content == "helloworld" * 11 40 | 41 | 42 | def test_fileiter(): 43 | f = StringIO("helloworld" * 15) 44 | # there are 15 "helloworld"s and we are asking it to read 4 of them. 45 | assert list(filetools.fileiter(f, 40, chunk_size=10)) == ["helloworld"] * 4 46 | 47 | # case where the size is not multiple of chunk_size 48 | assert list(filetools.fileiter(f, 38, chunk_size=10)) == ["helloworld", "helloworld", "helloworld", "hellowor"] 49 | 50 | # what if we ask for more data than we have? 51 | f = StringIO("helloworld" + "helloworld" + "end") 52 | assert list(filetools.fileiter(f, 40, chunk_size=10)) == ["helloworld", "helloworld", "end"] 53 | -------------------------------------------------------------------------------- /liveweb/tests/test_proxy.py: -------------------------------------------------------------------------------- 1 | from .. 
import proxy, config 2 | 3 | from cStringIO import StringIO 4 | import datetime 5 | import subprocess 6 | import os 7 | import urllib 8 | import time 9 | 10 | import pytest 11 | 12 | class TestRecord: 13 | def test_read_all(self): 14 | content = "helloworld" * 100 15 | record = proxy.Record(None, 0, len(content), iter([content])) 16 | 17 | # read_all should return all the content 18 | assert record.read_all() == content 19 | 20 | # after calling read_all, the content should still be available 21 | assert record.read_all() == content 22 | assert record.read_all() == content 23 | 24 | # after read_all, the content_iter should still have the content 25 | assert "".join(record.content_iter) == content 26 | 27 | def test_init(self, tmpdir): 28 | path = tmpdir.join("foo.txt") 29 | path.write("helloworld" * 100) 30 | 31 | record = proxy.Record(path.strpath, 0, 1000, None) 32 | assert record.read_all() == "helloworld" * 100 33 | 34 | record = proxy.Record(path.strpath, 800, 100, None) 35 | assert record.read_all() == "helloworld" * 10 36 | 37 | 38 | def test_split_type_host(): 39 | assert proxy.split_type_host("http://www.archive.org/details/test") == ("http", "www.archive.org", "/details/test") 40 | 41 | 42 | def test_FakeSocket(): 43 | s = proxy._FakeSocket() 44 | assert s.makefile(mode="rb").read() == "" 45 | 46 | s = proxy._FakeSocket(StringIO("helloworld")) 47 | assert s.makefile(mode="rb").read() == "helloworld" 48 | 49 | 50 | SAMPLE_RESPONSE = """\ 51 | HTTP/1.1 200 OK 52 | Content-type: text/plain 53 | Server: Apache/2.2.20 54 | Content-Length: 10 55 | 56 | helloworld""".replace("\n", "\r\n") 57 | 58 | SAMPLE_RESPONSE_CHUNKED = """\ 59 | HTTP/1.1 200 OK 60 | Content-Type: text/plain 61 | Server: Apache/2.2.20 62 | Transfer-Encoding: chunked 63 | 64 | 5 65 | hello 66 | 5 67 | world 68 | 0 69 | """.replace("\n", "\r\n") 70 | 71 | class TestProxyResponse: 72 | def make_response(self, content): 73 | sock = proxy._FakeSocket(StringIO(content)) 74 | response = 
proxy.ProxyHTTPResponse("http://example.com/hello", sock) 75 | response.begin() 76 | return response 77 | 78 | def test_headers(self): 79 | response = self.make_response(SAMPLE_RESPONSE) 80 | assert sorted(response.getheaders()) == [ 81 | ("content-length", "10"), 82 | ("content-type", "text/plain"), 83 | ("server", "Apache/2.2.20"), 84 | ] 85 | 86 | def test_buf(self): 87 | response = self.make_response(SAMPLE_RESPONSE) 88 | assert response.buf.getvalue() == SAMPLE_RESPONSE 89 | 90 | def test_buf_chunked(self): 91 | response = self.make_response(SAMPLE_RESPONSE_CHUNKED) 92 | assert response.buf.getvalue() == SAMPLE_RESPONSE_CHUNKED 93 | 94 | def test_get_payload(self): 95 | response = self.make_response(SAMPLE_RESPONSE) 96 | payload = response.get_payload() 97 | assert "".join(payload) == "helloworld" 98 | 99 | def test_get_payload_chunked(self): 100 | response = self.make_response(SAMPLE_RESPONSE_CHUNKED) 101 | payload = response.get_payload() 102 | assert "".join(payload) == "5\r\nhello\r\n5\r\nworld\r\n0\r\n" 103 | 104 | def test_arc_record(self, monkeypatch): 105 | # monkey patch _utcnow so that the time is deterministic 106 | monkeypatch.setattr(proxy.ProxyHTTPResponse, "_utcnow", lambda self: datetime.datetime(2010, 9, 8, 7, 6, 5)) 107 | 108 | self._test_arc_record(SAMPLE_RESPONSE) 109 | self._test_arc_record(SAMPLE_RESPONSE_CHUNKED) 110 | 111 | def _test_arc_record(self, http_payload): 112 | response = self.make_response(http_payload) 113 | arc = response._make_arc_record() 114 | assert str(arc.header) == "http://example.com/hello 0.0.0.0 20100908070605 text/plain %d" % len(http_payload) 115 | 116 | 117 | class FakeSocket: 118 | def __init__(self, content, delay_per_byte=0): 119 | self.content = content 120 | self.delay_per_byte = delay_per_byte 121 | 122 | def recv(self, size): 123 | data = self.content[:size] 124 | self.content = self.content[size:] 125 | 126 | if self.delay_per_byte: 127 | time.sleep(len(data) * self.delay_per_byte) 128 | 129 | return 
data 130 | 131 | def dummy(self, *a, **kw): 132 | pass 133 | 134 | class TestSocketWrapper: 135 | def test_max_time(self): 136 | _sock = FakeSocket("a" * 1000, delay_per_byte=0.001) 137 | sock = proxy.SocketWrapper(_sock, max_time=0.1) 138 | sock.recv(90) # .09 seconds 139 | 140 | with pytest.raises(proxy.ProxyError) as excinfo: 141 | sock.recv(20) # this should fail 142 | 143 | e = excinfo.value 144 | assert (e.errcode, e.errmsg) == proxy.ERR_REQUEST_TIMEOUT 145 | 146 | def test_max_size(self): 147 | _sock = FakeSocket("a" * 1200) 148 | sock = proxy.SocketWrapper(_sock, max_size=1001) 149 | 150 | # read 1000 bytes 151 | for i in range(10): 152 | sock.recv(100) 153 | 154 | with pytest.raises(proxy.ProxyError) as excinfo: 155 | sock.recv(100) # this should fail 156 | 157 | e = excinfo.value 158 | assert (e.errcode, e.errmsg) == proxy.ERR_RESPONSE_TOO_BIG 159 | 160 | class TestErrors: 161 | def assert_error_code(self, excinfo, error): 162 | e = excinfo.value 163 | if (e.errcode, e.errmsg) != error: 164 | import traceback 165 | traceback.print_exc(e) 166 | assert (e.errcode, e.errmsg) == error 167 | 168 | def verify(self, err, url): 169 | with pytest.raises(proxy.ProxyError) as excinfo: 170 | proxy._urlopen(url) 171 | self.assert_error_code(excinfo, err) 172 | 173 | def test_invalid_url(self): 174 | self.verify(proxy.ERR_INVALID_URL, "http://localhost:foo/") 175 | 176 | def test_invalid_domain(self): 177 | self.verify(proxy.ERR_INVALID_DOMAIN, "http://invalid.com2/") 178 | 179 | def test_conn_refused(self): 180 | # nothing will be running at localhost:1234, so connection will be refused 181 | self.verify(proxy.ERR_CONN_REFUSED, "http://localhost:1234/") 182 | 183 | def test_conn_timeout(self, monkeypatch): 184 | monkeypatch.setattr(config, "connect_timeout", 0.5) 185 | # this random IP seems to be creating timeout 186 | self.verify(proxy.ERR_CONN_TIMEOUT, "http://1.2.3.4/") 187 | 188 | def test_initial_data_timeout(self, monkeypatch, webtest): 189 | # This should not 
fail 190 | proxy._urlopen(webtest.url + "/delay-headers/0.2") 191 | 192 | # But when we set the initial_data_timeout, it should fail 193 | monkeypatch.setattr(config, "initial_data_timeout", 0.1) 194 | self.verify(proxy.ERR_INITIAL_DATA_TIMEOUT, webtest.url + "/delay-headers/0.2") 195 | 196 | def test_read_timeout(self, monkeypatch, webtest): 197 | # This should not fail 198 | proxy._urlopen(webtest.url + "/delay/0.2?repeats=1") 199 | 200 | # But when we set the initial_data_timeout, it should fail 201 | monkeypatch.setattr(config, "read_timeout", 0.1) 202 | self.verify(proxy.ERR_READ_TIMEOUT, webtest.url + "/delay/0.2?repeats=1") 203 | 204 | def test_conn_dropped(self, webtest): 205 | self.verify(proxy.ERR_CONN_DROPPED, webtest.url + "/drop") 206 | 207 | def test_response_too_big(self, monkeypatch, webtest): 208 | monkeypatch.setattr(config, "max_response_size", 1000) 209 | 210 | # This should not fail 211 | proxy._urlopen(webtest.url + "/echo/helloworld?repeats=50") 212 | 213 | with pytest.raises(proxy.ProxyError) as excinfo: 214 | proxy._urlopen(webtest.url + "/echo/helloworld?repeats=100") 215 | 216 | self.assert_error_code(excinfo, proxy.ERR_RESPONSE_TOO_BIG) 217 | 218 | def test_request_took_too_long(self, monkeypatch, webtest): 219 | monkeypatch.setattr(config, "max_request_time", 0.1) 220 | 221 | # This should not fail 222 | proxy._urlopen(webtest.url + "/echo/helloworld?repeats=5&delay=0.01") 223 | 224 | with pytest.raises(proxy.ProxyError) as excinfo: 225 | proxy._urlopen(webtest.url + "/echo/helloworld?repeats=20&delay=0.01") 226 | 227 | self.assert_error_code(excinfo, proxy.ERR_REQUEST_TIMEOUT) 228 | 229 | 230 | def test_webtest(webtest): 231 | assert urllib.urlopen(webtest.url + "/echo/hello").read() == "hello\n" 232 | -------------------------------------------------------------------------------- /liveweb/tests/test_webapp.py: -------------------------------------------------------------------------------- 1 | from ..webapp import application 2 | 3 | 
class Test_application: 4 | def test_parse_request_url(self): 5 | environ = { 6 | 'REQUEST_METHOD': "GET", 7 | 'REQUEST_URI': 'http://www.example.com/foo/bar', 8 | 'PATH_INFO': 'http://www.example.com/foo/bar', 9 | 'HTTP_HOST': 'www.example.com' 10 | } 11 | app = application(environ, None) 12 | app.parse_request() 13 | assert app.url == "http://www.example.com/foo/bar" 14 | 15 | def test_nginx_work_around(self): 16 | # nginx is stripping the http://host in the URL, which is of the form http://host/path 17 | # and passing just /path to the app. 18 | # Test the work-around that reconstructs the full path using the host. 19 | environ = { 20 | 'REQUEST_METHOD': "GET", 21 | 'REQUEST_URI': '/foo/bar', 22 | 'PATH_INFO': '/foo/bar', 23 | 'HTTP_HOST': 'www.example.com' 24 | } 25 | app = application(environ, None) 26 | app.parse_request() 27 | assert app.url == "http://www.example.com/foo/bar" 28 | -------------------------------------------------------------------------------- /liveweb/tests/webtest.py: -------------------------------------------------------------------------------- 1 | """Web app to simulate various error conditions. 
2 | """ 3 | 4 | import sys, os 5 | from os.path import dirname, pardir 6 | 7 | # insert liveweb in sys.path 8 | sys.path.insert(0, os.path.join(dirname(__file__), pardir, pardir)) 9 | 10 | from liveweb.tools.wsgiapp import wsgiapp 11 | import time 12 | 13 | class application(wsgiapp): 14 | urls = [ 15 | ("/", "index"), 16 | ("/echo/(.*)", "echo"), 17 | ("/delay-headers/([0-9\.]+)", "delay_headers"), 18 | ("/delay/([0-9\.]+)", "delay"), 19 | ("/drop", "drop"), 20 | ] 21 | 22 | def GET_index(self): 23 | self.header("Content-Type", "text/plain") 24 | return ["hello, world!\n"] 25 | 26 | def GET_echo(self, name): 27 | self.header("Content-Type", "text/plain") 28 | 29 | i = self.input() 30 | repeats = int(i.get("repeats", 1)) 31 | delay = float(i.get('delay', 0)) 32 | delim = i.get("delim", "\n") 33 | 34 | for i in range(repeats): 35 | yield name + delim 36 | if delay: 37 | time.sleep(delay) 38 | 39 | def GET_drop(self): 40 | self.header("Content-Type", "text/plain") 41 | self.header("Content-Length", "10000") 42 | return ["dropped!"] 43 | 44 | def GET_delay_headers(self, delay): 45 | self.header("Content-Type", "text/plain") 46 | delay = float(delay) 47 | time.sleep(delay) 48 | return ["delayed"] 49 | 50 | def GET_delay(self, delay): 51 | """Emits 10 numbers with delay seconds between each. 
52 | """ 53 | i = self.input() 54 | repeats = int(i.get("repeats", 10)) 55 | delay = float(delay) 56 | for i in range(repeats): 57 | yield str(i) + "\n" 58 | time.sleep(delay) 59 | 60 | if __name__ == "__main__": 61 | import sys 62 | 63 | try: 64 | port = int(sys.argv[1]) 65 | except IndexError: 66 | port = 8080 67 | 68 | from wsgiref.simple_server import make_server 69 | httpd = make_server('127.0.0.1', port, application) 70 | print "http://127.0.0.1:%d/" % port 71 | httpd.serve_forever() 72 | -------------------------------------------------------------------------------- /liveweb/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/liveweb/daf121365b959477e3b90ae07bcd72959f5be856/liveweb/tools/__init__.py -------------------------------------------------------------------------------- /liveweb/tools/wayback.py: -------------------------------------------------------------------------------- 1 | """Really simple implememtation of wayback machine web-interface. 2 | 3 | Written to test the liveweb proxy implementation. 4 | """ 5 | 6 | import sys 7 | import httplib 8 | import gzip 9 | import urlparse 10 | import cgi 11 | from StringIO import StringIO 12 | 13 | from BeautifulSoup import BeautifulSoup 14 | 15 | import warc 16 | from .wsgiapp import wsgiapp 17 | # We expect that liveweb host:port is passed as argument to this script. 18 | liveweb = sys.argv[1] 19 | import re 20 | import logging 21 | 22 | logging.basicConfig(level=logging.DEBUG, ) 23 | logger = logging.getLogger('[wayback]') 24 | 25 | 26 | class application(wsgiapp): 27 | """WSGI application for wayback machine prototype. 
28 | """ 29 | urls = [ 30 | ("/", "index"), 31 | ("/get", "get"), 32 | ("/web/(.*)", "web") 33 | ] 34 | 35 | @property 36 | def home(self): 37 | return "http://" + self.environ['HTTP_HOST'] 38 | 39 | def GET_index(self): 40 | self.header("content-type", "text/html") 41 | return [HEADER] 42 | #yield "

Wayback Machine Prototype

" 43 | 44 | def GET_get(self): 45 | fs = cgi.FieldStorage(environ=self.environ, keep_blank_values=1) 46 | if 'url' in fs: 47 | url = fs['url'].value 48 | self.status = "302 See Other" 49 | self.header("Location", self.home + "/web/" + url) 50 | return [""] 51 | else: 52 | self.status = "302 See Other" 53 | self.header("Location", self.home + "/") 54 | return [""] 55 | 56 | def GET_web(self, url): 57 | qs = self.environ.get("QUERY_STRING", "") 58 | if qs: 59 | url = url + "?" + qs 60 | record = self.fetch_arc_record(url) 61 | 62 | # fake socket to pass to httplib 63 | f = StringIO(record.payload) 64 | f.makefile = lambda *a: f 65 | 66 | response = httplib.HTTPResponse(f) 67 | response.begin() 68 | h = dict(response.getheaders()) 69 | 70 | content_type = h.get("content-type", "text/plain") 71 | self.header("Content-Type", content_type) 72 | 73 | if 'content-length' in h: 74 | self.header('Content-Length', h['content-length']) 75 | 76 | content = response.read() 77 | if content_type.lower().startswith("text/html"): 78 | content = self.rewrite_page(url, content) 79 | self.header('Content-Length', str(len(content))) 80 | elif content_type.lower().startswith("text/css"): 81 | content = self.rewrite_css(url, content) 82 | self.header('Content-Length', str(len(content))) 83 | return [content] 84 | 85 | def rewrite_css(self, base_url, content): 86 | #base_url = base_url.replace(os.path.basename(base_url), '') 87 | for image_url in re.findall('url\(([^\)]+)', content): 88 | image_url = image_url.replace('"', '') 89 | image_url = image_url.replace("'", '') 90 | if not image_url.startswith('http://'): 91 | new_url = urlparse.urljoin(base_url, image_url) 92 | else: 93 | new_url = image_url 94 | new_url = new_url.replace('//', '/') 95 | url2 = urlparse.urljoin(base_url, new_url) 96 | url2 = self.home + "/web/" + url2 97 | logger.debug("rewrote %r => %r" % (image_url, url2)) 98 | content = content.replace(image_url, url2, 1) 99 | return content 100 | 101 | def 
fetch_arc_record(self, url): 102 | """Fetchs the ARC record data from liveweb proxy. 103 | """ 104 | conn = httplib.HTTPConnection(liveweb) 105 | conn.request("GET", url) 106 | content = conn.getresponse().read() 107 | 108 | gz = gzip.GzipFile(fileobj=StringIO(content), mode="rb") 109 | record = warc.ARCRecord.from_string(gz.read(), version=1) 110 | 111 | return record 112 | 113 | def rewrite_page(self, base_url, content): 114 | """Rewrites all the links the the HTML.""" 115 | 116 | soup = BeautifulSoup(content) 117 | for tag in soup.findAll(["a", "link", "img", "script", "form"]): 118 | if tag.has_key('href'): 119 | tag['href'] = self.rewrite_url(base_url, tag['href']) 120 | elif tag.has_key("src"): 121 | tag['src'] = self.rewrite_url(base_url, tag['src']) 122 | elif tag.has_key("action"): 123 | tag['action'] = self.rewrite_url(base_url, tag['action']) 124 | 125 | self.inject_header(base_url, soup) 126 | return str(soup) 127 | 128 | def inject_header(self, base_url, soup): 129 | """Injects wayback machine header into the web page.""" 130 | header_soup = BeautifulSoup(HEADER).find("div") 131 | header_soup.find("input", {"id": "wmtbURL"})['value'] = base_url 132 | soup.find("body").insert(0, header_soup) 133 | 134 | def rewrite_url(self, base_url, url): 135 | if url.strip().lower().startswith("javascript"): 136 | return url 137 | url2 = urlparse.urljoin(base_url, url) 138 | url2 = self.home + "/web/" + url2 139 | logger.debug("rewrote %r => %r" % (url, url2)) 140 | return url2 141 | 142 | 143 | HEADER = """ 144 |
145 |
146 | 147 | 150 |
148 | Wayback Machine 149 | 151 | 152 | 153 | 156 | 158 | 162 |
154 |
155 |
157 | 159 | Close 160 | Help 161 |
163 | 164 | 165 | """ 166 | -------------------------------------------------------------------------------- /liveweb/tools/wsgiapp.py: -------------------------------------------------------------------------------- 1 | """Really simple wsgi framework. 2 | """ 3 | 4 | import re 5 | import traceback 6 | 7 | class wsgiapp: 8 | """Simple WSGI web framework. 9 | 10 | class applicaiton(wsgiapp): 11 | urls = [ 12 | ("/", "index") 13 | ] 14 | def GET_index(self): 15 | self.header("Content-Type", "text/plain") 16 | return "hello, world!" 17 | """ 18 | def __init__(self, environ, start_response): 19 | self.start = start_response 20 | self.environ = environ 21 | 22 | self.status = "200 OK" 23 | self._headers = {} 24 | 25 | def input(self): 26 | tokens = self.environ.get("QUERY_STRING", "").split("&") 27 | print "input", tokens 28 | return dict(kv.split("=") for kv in tokens if "=" in kv) 29 | 30 | def header(self, name, value): 31 | self._headers[name.title()] = value 32 | 33 | def __iter__(self): 34 | try: 35 | x = self.delegate() 36 | self.start(self.status, self._headers.items()) 37 | return iter(x) 38 | except: 39 | headers = {"Content-Type": "text/plain"} 40 | self.start("500 Internal Error", headers.items()) 41 | out = "Internal Error:\n\n" 42 | exc = traceback.format_exc() 43 | return iter([out, exc]) 44 | 45 | def delegate(self): 46 | """Delegates the request to appropriate method. 
47 | """ 48 | path = self.environ['PATH_INFO'] 49 | method = self.environ['REQUEST_METHOD'] 50 | 51 | # Try each pattern and dispatch to the right method 52 | for pattern, name in self.urls: 53 | m = re.match('^' + pattern + '$', path) 54 | if m: 55 | funcname = method.upper() + "_" + name 56 | f = getattr(self, funcname) 57 | return f(*m.groups()) 58 | 59 | # give "404 Not Found" if all the patterns are exhausted 60 | return self.notfound() 61 | 62 | def notfound(self): 63 | self.status = "404 Not Found" 64 | self.headers = {"Content-Type": "text/html"}.items() 65 | return ["Not Found"] 66 | 67 | -------------------------------------------------------------------------------- /liveweb/webapp.py: -------------------------------------------------------------------------------- 1 | """The webapp for arc proxy. 2 | """ 3 | 4 | from cStringIO import StringIO 5 | import gzip 6 | import logging 7 | import socket 8 | import datetime 9 | 10 | from warc.arc import ARCRecord, ARCFile 11 | 12 | from . import proxy 13 | from . import errors 14 | from . import config 15 | from . import file_pool 16 | from . import cache 17 | 18 | pool = None 19 | _cache = None 20 | 21 | def init_arc_file(fileobj): 22 | """Writes the ARC file headers when a new file is created. 23 | """ 24 | zfileobj = gzip.GzipFile(fileobj=fileobj, filename=None, mode="w") 25 | 26 | headers = {} 27 | headers['date'] = datetime.datetime.utcnow() 28 | headers['ip_address'] = socket.gethostbyname(socket.gethostname()) 29 | headers['org'] = "InternetArchive" 30 | 31 | afile = ARCFile(fileobj=zfileobj, filename=fileobj.name, mode='wb', version=1, file_headers=headers) 32 | afile._write_header() 33 | afile.close() 34 | fileobj.flush() 35 | 36 | def setup(): 37 | """This is called from main to initialize the requires globals. 
38 | """ 39 | global pool, _cache 40 | 41 | # Write ARC file header if the archive format is "arc" 42 | if config.archive_format == "arc": 43 | init_file = init_arc_file 44 | else: 45 | init_file = None 46 | 47 | pool = file_pool.FilePool(config.output_directory, 48 | pattern=config.filename_pattern, 49 | max_files=config.num_writers, 50 | max_file_size=config.filesize_limit, 51 | init_file_func=init_file) 52 | _cache = cache.create(type=config.cache, config=config) 53 | 54 | # For redis cache, use redis for keeping track of file number sequence 55 | if config.cache == 'redis': 56 | pool.set_sequence(_cache) 57 | 58 | class application: 59 | """WSGI application for liveweb proxy. 60 | """ 61 | def __init__(self, environ, start_response): 62 | self.environ = environ 63 | self.start_response = start_response 64 | 65 | def parse_request(self): 66 | self.method = self.environ['REQUEST_METHOD'] 67 | if 'REQUEST_URI' in self.environ: # This is for uwsgi 68 | self.url = self.environ['REQUEST_URI'] #TODO: Is this a valid environment variable always? 69 | if 'RAW_URI' in self.environ: # This is for gunicorn 70 | self.url = self.environ['RAW_URI'] #TODO: Is this a valid environment variable always? 71 | 72 | # Allow accessing the proxy using regular URL so that we can use 73 | # tools like ab. 74 | if self.url.startswith("/_web/"): 75 | self.url = self.url[len("/_web/"):] 76 | 77 | # Since this is a proxy, the URL is always of the form http://hostname/path 78 | # nginx is stripping the http://host from the passed URL and just passing the /path here. 79 | # This is a work-around for that issue. 
80 | if self.url.startswith("/"): 81 | self.url = "http://" + self.environ['HTTP_HOST'] + self.url 82 | 83 | def __iter__(self): 84 | try: 85 | self.parse_request() 86 | 87 | record = self.get_record() 88 | if config.http_passthrough: 89 | return self.proxy_response(record) 90 | else: 91 | return self.success(record.content_length, record.content_iter) 92 | except: 93 | logging.error("Internal Error - %s", self.url, exc_info=True) 94 | return self.error("500 Internal Server Error") 95 | 96 | def get_record(self): 97 | """Fetches the Record object from cache or constructs from web. 98 | """ 99 | record = _cache.get(self.url) 100 | if record is None: 101 | http_response = proxy.urlopen(self.url) 102 | record = http_response.write_arc(pool) 103 | _cache.set(self.url, record) 104 | return record 105 | 106 | def proxy_response(self, record): 107 | """Send the response data as it is """ 108 | # TODO: This is very inefficient. Improve. 109 | 110 | # Now we only have the ARC record data. 111 | record_payload = record.read_all() 112 | record_payload = gzip.GzipFile(fileobj=StringIO(record_payload)).read() 113 | arc = ARCRecord.from_string(record_payload, version=1) 114 | 115 | # Create a FakeSocket and read HTTP headers and payload. 
116 | sock = proxy._FakeSocket(StringIO(arc.payload)) 117 | response = proxy.ProxyHTTPResponse(self.url, sock) 118 | response.begin() 119 | 120 | status = "%d %s" % (response.status, response.reason) 121 | headers = response.getheaders() 122 | self.start_response(status, headers) 123 | return response.get_payload() 124 | 125 | def success(self, clen, data): 126 | status = '200 OK' 127 | response_headers = [ 128 | ('Content-type', 'application/x-arc-record'), 129 | ('Content-Length', str(clen)) 130 | ] 131 | self.start_response(status, response_headers) 132 | return iter(data) 133 | 134 | def error(self, status, headers=None): 135 | if headers is None: 136 | headers = [ 137 | ('Content-Type', 'text/plain'), 138 | ('Content-Length', '0'), 139 | ] 140 | self.start_response(status, headers) 141 | return iter([]) 142 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | BeautifulSoup==3.2.1 2 | Genshi==0.6 3 | PyYAML==3.10 4 | hiredis==0.1.1 5 | py==1.4.7 6 | pytest==2.2.3 7 | redis==2.4.12 8 | uWSGI==1.9.14 9 | warc 10 | wsgiref==0.1.2 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup 3 | 4 | requirements = [line.strip() for line in open("requirements.txt")] 5 | 6 | setup( 7 | name="liveweb", 8 | version="2.0.dev", 9 | description="Liveweb proxy", 10 | license='GPL v2', 11 | author="Internet Archive", 12 | author_email="info@archive.org", 13 | url="http://github.com/internetarchive/liveweb", 14 | packages=["liveweb", "liveweb.tools"], 15 | platforms=["any"], 16 | entry_points={ 17 | "console_scripts": [ 18 | "liveweb-proxy=liveweb.cli:main" 19 | ] 20 | }, 21 | install_requires=requirements 22 | ) 23 | 24 | 
--------------------------------------------------------------------------------