├── .github └── workflows │ ├── release.yml │ ├── setup │ └── action.yml │ └── tests.yml ├── .gitignore ├── MANIFEST.in ├── README.rst ├── __init__.py ├── api.rst ├── arch.svg ├── benchmarks ├── requirements.txt └── run-benchmarks.py ├── pyproject.toml ├── setup.cfg ├── setup.py ├── tests ├── Dockerfile ├── conftest.py ├── run-tests.sh ├── run-trough.sh ├── single-threaded-proxy.py ├── test_certauth.py ├── test_dedup.py ├── test_ensure_rethinkdb_tables.py ├── test_warcprox.py └── test_writer.py ├── uv.lock └── warcprox ├── __init__.py ├── bigtable.py ├── certauth.py ├── controller.py ├── crawl_log.py ├── dedup.py ├── main.py ├── mitmproxy.py ├── playback.py ├── ssl_util.py ├── stats.py ├── warc.py ├── warcproxy.py ├── writer.py └── writerthread.py /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '**[0-9]+.[0-9]+.[0-9]+*' 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-24.04 11 | permissions: 12 | id-token: write 13 | steps: 14 | - uses: actions/checkout@v4 15 | 16 | - name: Install uv 17 | uses: astral-sh/setup-uv@v5 18 | 19 | - name: Build package 20 | run: uv build 21 | 22 | - name: Publish package 23 | run: uv publish 24 | if: ${{ !github.event.pull_request }} 25 | -------------------------------------------------------------------------------- /.github/workflows/setup/action.yml: -------------------------------------------------------------------------------- 1 | name: Test setup 2 | 3 | inputs: 4 | python-version: 5 | required: true 6 | type: string 7 | 8 | runs: 9 | using: composite 10 | steps: 11 | - name: Set up rethinkdb 12 | run: | 13 | wget -qO- https://download.rethinkdb.com/repository/raw/pubkey.gpg | sudo gpg --dearmor -o /usr/share/keyrings/rethinkdb-archive-keyrings.gpg 14 | echo "deb [signed-by=/usr/share/keyrings/rethinkdb-archive-keyrings.gpg] https://download.rethinkdb.com/repository/ubuntu-$(lsb_release -cs) $(lsb_release -cs) 
main" | sudo tee /etc/apt/sources.list.d/rethinkdb.list 15 | sudo apt-get update 16 | sudo apt-get install rethinkdb 17 | sudo cp /etc/rethinkdb/default.conf.sample /etc/rethinkdb/instances.d/instance1.conf 18 | sudo /etc/init.d/rethinkdb restart 19 | shell: bash 20 | 21 | - name: Install uv 22 | uses: astral-sh/setup-uv@v5 23 | 24 | - name: Install pip dependencies 25 | run: uv sync --python ${{ inputs.python-version }} 26 | shell: bash 27 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | 13 | jobs: 14 | test: 15 | name: Run tests 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | version: ['3.8', '3.12'] 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - uses: ./.github/workflows/setup 24 | with: 25 | python-version: ${{ matrix.version }} 26 | 27 | - name: Run tests 28 | run: | 29 | uv run py.test --tb=native --verbose tests/test_certauth.py tests/test_dedup.py tests/test_warcprox.py tests/test_writer.py 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .project 2 | .pydevproject 3 | *.pyc 4 | *.pem 5 | *.db 6 | *.diff 7 | *.egg 8 | *.egg-info 9 | *.swp 10 | warcs 11 | build 12 | dist 13 | .tox 14 | out.* 15 | 16 | # runtime files 17 | /warcprox.sqlite 18 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include tests *.py *.sh Dockerfile 2 | -------------------------------------------------------------------------------- /README.rst: 
-------------------------------------------------------------------------------- 1 | Warcprox - WARC writing MITM HTTP/S proxy 2 | ***************************************** 3 | 4 | Warcprox is an HTTP proxy designed for web archiving applications. When used in 5 | parallel with `brozzler `_ it 6 | supports a comprehensive, modern, and distributed archival web capture system. 7 | Warcprox stores its traffic to disk in the `Web ARChive (WARC) file format 8 | `_, 9 | which may then be accessed with web archival replay software like `OpenWayback 10 | `_ and `pywb 11 | `_. It captures encrypted HTTPS traffic by 12 | using the "man-in-the-middle" technique (see the `Man-in-the-middle`_ section 13 | for more info). 14 | 15 | Warcprox was originally based on `pymiproxy 16 | `_ by Nadeem Douba. 17 | 18 | .. contents:: 19 | 20 | Getting started 21 | =============== 22 | Warcprox runs on python 3.4+. 23 | 24 | To install the latest release, run:: 25 | 26 | # apt-get install libffi-dev libssl-dev 27 | pip install warcprox 28 | 29 | You can also install the latest bleeding edge code:: 30 | 31 | pip install git+https://github.com/internetarchive/warcprox.git 32 | 33 | To start warcprox, run:: 34 | 35 | warcprox 36 | 37 | Try ``warcprox --help`` for documentation on command line options. 38 | 39 | Man-in-the-middle 40 | ================= 41 | Normally, HTTP proxies can't read encrypted HTTPS traffic. The browser uses the 42 | HTTP ``CONNECT`` method to establish a tunnel through the proxy, and the proxy 43 | merely routes raw bytes between the client and server. Since the bytes are 44 | encrypted, the proxy can't make sense of the information that it proxies. This 45 | nonsensical encrypted data is not typically useful for web archiving purposes. 46 | 47 | In order to capture HTTPS traffic, warcprox acts as a "man-in-the-middle" 48 | (MITM). 
When it receives a ``CONNECT`` directive from a client, it generates a 49 | public key certificate for the requested site, presents it to the client, and 50 | proceeds to establish an encrypted connection with the client. It then makes a 51 | separate, normal HTTPS connection to the remote site. It decrypts, archives, 52 | and re-encrypts traffic in both directions. 53 | 54 | Configuring a warcprox instance as a browser’s HTTP proxy will result in 55 | security certificate warnings because none of the certificates will be signed 56 | by trusted authorities. However, there is nothing malicious about how warcprox 57 | functions. To use warcprox effectively, the client needs to disable certificate 58 | verification or add the CA certificate generated by warcprox as a trusted 59 | authority. When using the latter, remember to undo this change when finished 60 | using warcprox. 61 | 62 | API 63 | === 64 | The warcprox API may be used to retrieve information from and interact with a 65 | running warcprox instance, including: 66 | 67 | * Retrieving status information via the ``/status`` URL 68 | * Writing WARC records via the ``WARCPROX_WRITE_RECORD`` HTTP method 69 | * Controlling warcprox settings via the ``Warcprox-Meta`` HTTP header 70 | 71 | For warcprox API documentation, see: ``_. 72 | 73 | Deduplication 74 | ============= 75 | Warcprox avoids archiving redundant content by "deduplicating" it. The process 76 | for deduplication works similarly to deduplication by `Heritrix 77 | `_ and other web archiving tools: 78 | 79 | 1. While fetching the URL, calculate the payload content digest (typically a SHA1 80 | checksum value) 81 | 2. Look up the digest in the deduplication database (warcprox currently supports 82 | `sqlite `_ by default, `rethinkdb 83 | `_ with two different schemas, and 84 | `trough `_) 85 | 3. If found, write a warc ``revisit`` record referencing the url and capture time 86 | of the previous capture 87 | 4. If not found, 88 | 89 | a. Write a ``response`` record with the full payload 90 | b. 
Store new entry in deduplication database (can be disabled, see 91 | `Warcprox-Meta HTTP request header `_) 92 | 93 | The deduplication database is partitioned into different "buckets". URLs are 94 | deduplicated only against other captures in the same bucket. If specified, the 95 | ``dedup-buckets`` field of the `Warcprox-Meta HTTP request header 96 | `_ determines the bucket(s). Otherwise, 97 | the default bucket is used. 98 | 99 | Deduplication can be disabled entirely by starting warcprox with the argument 100 | ``--dedup-db-file=/dev/null``. 101 | 102 | Statistics 103 | ========== 104 | Warcprox stores some crawl statistics to sqlite or rethinkdb. These are 105 | consulted for enforcing ``limits`` and ``soft-limits`` (see `Warcprox-Meta 106 | fields `_), and can also be consulted by other 107 | processes outside of warcprox, such as for crawl job reporting. 108 | 109 | Statistics are grouped by "bucket". Every capture is counted as part of the 110 | ``__all__`` bucket. Other buckets can be specified in the ``Warcprox-Meta`` 111 | request header. The fallback bucket in case none is specified is called 112 | ``__unspecified__``. 
113 | 114 | Within each bucket are three sub-buckets: 115 | 116 | * ``new`` - tallies captures for which a complete record (usually a 117 | ``response`` record) was written to a WARC file 118 | * ``revisit`` - tallies captures for which a ``revisit`` record was written to 119 | a WARC file 120 | * ``total`` - includes all URLs processed, even those not written to a WARC 121 | file, and so may be greater than the sum of new and revisit records 122 | 123 | Within each of these sub-buckets, warcprox generates two kinds of statistics: 124 | 125 | * ``urls`` - simple count of URLs 126 | * ``wire_bytes`` - sum of bytes received over the wire from the remote server 127 | for each URL, including HTTP headers 128 | 129 | For historical reasons, the default sqlite store keeps statistics as JSON blobs:: 130 | 131 | sqlite> select * from buckets_of_stats; 132 | bucket stats 133 | --------------- --------------------------------------------------------------------------------------------- 134 | __unspecified__ {"bucket":"__unspecified__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}} 135 | __all__ {"bucket":"__all__","total":{"urls":37,"wire_bytes":1502781},"new":{"urls":15,"wire_bytes":1179906},"revisit":{"urls":22,"wire_bytes":322875}} 136 | 137 | Plugins 138 | ======= 139 | Warcprox supports a limited notion of plugins by way of the ``--plugin`` 140 | command line argument. Plugin classes are loaded from the regular python module 141 | search path. They are instantiated with one argument that contains the values 142 | of all command line arguments, ``warcprox.Options``. Legacy plugins with 143 | constructors that take no arguments are also supported. Plugins should either 144 | have a method ``notify(self, recorded_url, records)`` or should subclass 145 | ``warcprox.BasePostfetchProcessor``. More than one plugin can be configured by 146 | specifying ``--plugin`` multiple times. 
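The ``notify`` flavor of plugin described above can be sketched as follows. The module and class names here are hypothetical, and the sketch assumes warcprox passes its ``warcprox.Options`` instance to the constructor, as described:

```python
# hypothetical module "capture_counter", which would be loaded with:
#   warcprox --plugin capture_counter.CaptureCounter

class CaptureCounter:
    '''Tallies captures; a minimal warcprox plugin sketch.'''

    def __init__(self, options=None):
        # warcprox passes a warcprox.Options holding all command line
        # argument values; a legacy plugin would define __init__(self)
        self.options = options
        self.count = 0

    def notify(self, recorded_url, records):
        # called by the postfetch chain once per capture, with the recorded
        # url and the warc records (if any) that were written for it
        self.count += 1
```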
147 | 148 | See a minimal example `here 149 | `__. 150 | 151 | Architecture 152 | ============ 153 | .. image:: arch.svg 154 | 155 | Warcprox is multithreaded. It has a pool of http proxy threads (100 by default). 156 | When handling a request, a proxy thread records data from the remote server to 157 | an in-memory buffer that spills over to disk if necessary (after 512k by 158 | default), while it streams the data to the proxy client. Once the HTTP 159 | transaction is complete, it puts the recorded URL in a thread-safe queue, to be 160 | picked up by the first processor in the postfetch chain. 161 | 162 | The postfetch chain normally includes processors for loading deduplication 163 | information, writing records to the WARC, saving deduplication information, and 164 | updating statistics. The exact set of processors in the chain depends on 165 | command line arguments; for example, plugins specified with ``--plugin`` are 166 | processors in the postfetch chain. Each postfetch processor has its own thread 167 | or threads. Thus the processors are able to run in parallel, independent of one 168 | another. This design also enables them to process URLs in batch. For example, 169 | the statistics processor gathers statistics for up to 10 seconds or 500 URLs, 170 | whichever comes first, then updates the statistics database with just a few 171 | queries. 172 | 173 | License 174 | ======= 175 | 176 | Warcprox is a derivative work of pymiproxy, which is GPL. Thus warcprox is also 177 | GPL. 178 | 179 | * Copyright (C) 2012 Cygnos Corporation 180 | * Copyright (C) 2013-2018 Internet Archive 181 | 182 | This program is free software; you can redistribute it and/or 183 | modify it under the terms of the GNU General Public License 184 | as published by the Free Software Foundation; either version 2 185 | of the License, or (at your option) any later version. 
186 | 187 | This program is distributed in the hope that it will be useful, 188 | but WITHOUT ANY WARRANTY; without even the implied warranty of 189 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 190 | GNU General Public License for more details. 191 | 192 | You should have received a copy of the GNU General Public License 193 | along with this program; if not, write to the Free Software 194 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 195 | 196 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/warcprox/1cfdcb520648c64f0bb3db4f13e2cecafe7551d3/__init__.py -------------------------------------------------------------------------------- /api.rst: -------------------------------------------------------------------------------- 1 | warcprox API 2 | ************ 3 | 4 | Means of interacting with warcprox over http, aside from simply proxying urls. 5 | 6 | .. contents:: 7 | 8 | ``/status`` url 9 | =============== 10 | 11 | If warcprox is running at localhost:8000, http://localhost:8000/status returns 12 | a json blob with a bunch of status info. 
For example: 13 | 14 | :: 15 | 16 | $ curl -sS http://localhost:8000/status 17 | { 18 | "role": "warcprox", 19 | "version": "2.4b3.dev189", 20 | "host": "ayutla.local", 21 | "address": "127.0.0.1", 22 | "port": 8000, 23 | "pid": 60555, 24 | "threads": 100, 25 | "active_requests": 1, 26 | "unaccepted_requests": 0, 27 | "load": 0.0, 28 | "queued_urls": 0, 29 | "queue_max_size": 500, 30 | "urls_processed": 0, 31 | "warc_bytes_written": 0, 32 | "start_time": "2018-10-30T20:15:19.929861Z", 33 | "rates_1min": { 34 | "actual_elapsed": 61.76024103164673, 35 | "urls_per_sec": 0.0, 36 | "warc_bytes_per_sec": 0.0 37 | }, 38 | "rates_5min": { 39 | "actual_elapsed": 1.7602601051330566, 40 | "urls_per_sec": 0.0, 41 | "warc_bytes_per_sec": 0.0 42 | }, 43 | "rates_15min": { 44 | "actual_elapsed": 1.7602710723876953, 45 | "urls_per_sec": 0.0, 46 | "warc_bytes_per_sec": 0.0 47 | }, 48 | "earliest_still_active_fetch_start": "2018-10-30T20:15:21.691467Z", 49 | "seconds_behind": 0.001758, 50 | "postfetch_chain": [ 51 | { 52 | "processor": "DedupLoader", 53 | "queued_urls": 0 54 | }, 55 | { 56 | "processor": "WarcWriterProcessor", 57 | "queued_urls": 0 58 | }, 59 | { 60 | "processor": "DedupDb", 61 | "queued_urls": 0 62 | }, 63 | { 64 | "processor": "StatsProcessor", 65 | "queued_urls": 0 66 | }, 67 | { 68 | "processor": "RunningStats", 69 | "queued_urls": 0 70 | } 71 | ] 72 | } 73 | 74 | ``WARCPROX_WRITE_RECORD`` http method 75 | ===================================== 76 | 77 | To make warcprox write an arbitrary warc record you can send it a special 78 | request with http method ``WARCPROX_WRITE_RECORD``. The http request must 79 | include the headers ``WARC-Type``, ``Content-Type``, and ``Content-Length``. 80 | Warcprox will use these to populate the warc record. 
For example:: 80 | 81 | $ ncat --crlf 127.0.0.1 8000 <<EOF 82 | > WARCPROX_WRITE_RECORD special://url/some?thing HTTP/1.1 83 | > WARC-Type: resource 84 | > Content-type: text/plain;charset=utf-8 85 | > Content-length: 29 86 | > 87 | > i am a warc record payload! 88 | > EOF 89 | HTTP/1.0 204 OK 90 | Server: BaseHTTP/0.6 Python/3.6.3 91 | Date: Tue, 22 May 2018 19:21:02 GMT 92 | 93 | On success warcprox responds with http status 204. For the request above 94 | warcprox will write a warc record that looks like this:: 95 | 96 | WARC/1.0 97 | WARC-Type: resource 98 | WARC-Record-ID: 99 | WARC-Date: 2018-05-21T23:33:31Z 100 | WARC-Target-URI: special://url/some?thing 101 | WARC-Block-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df 102 | WARC-Payload-Digest: sha1:a282cfe127ab8d51b315ff3d31de18614979d0df 103 | Content-Type: text/plain;charset=utf-8 104 | Content-Length: 29 105 | 106 | i am a warc record payload! 107 | 108 | ``Warcprox-Meta`` http request header 109 | ===================================== 110 | 111 | ``Warcprox-Meta`` is a special http request header that can be used to pass 112 | configuration information and metadata with each proxy request to warcprox. The 113 | value is a json blob. There are several fields understood by warcprox, and 114 | arbitrary additional fields can be included. If warcprox doesn't recognize a 115 | field it simply ignores it. Custom fields may be useful for custom warcprox 116 | plugins (see ``_). 117 | 118 | Warcprox strips the ``warcprox-meta`` header out before sending the request to 119 | the remote server, and does not write it in the warc request record. 120 | 121 | Brozzler knows about ``warcprox-meta``. For information on configuring 122 | it in brozzler, see 123 | https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#warcprox-meta. 124 | ``Warcprox-Meta`` is often a very important part of brozzler job configuration. 
125 | It is the way url and data limits on jobs, seeds, and hosts are implemented, 126 | among other things. 127 | 128 | Warcprox-Meta fields 129 | -------------------- 130 | 131 | ``warc-prefix`` (string) 132 | ~~~~~~~~~~~~~~~~~~~~~~~~ 133 | Specifies a warc filename prefix. Warcprox will write the warc record for this 134 | capture, if any, to a warc named accordingly. 135 | 136 | Example:: 137 | 138 | Warcprox-Meta: {"warc-prefix": "special-warc"} 139 | 140 | ``dedup-buckets`` (string) 141 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 142 | Specifies the deduplication bucket(s). For more information about deduplication 143 | see ``_. 144 | 145 | Examples:: 146 | 147 | Warcprox-Meta: {"dedup-buckets":{"my-dedup-bucket":"rw"}} 148 | 149 | Warcprox-Meta: {"dedup-buckets":{"my-dedup-bucket":"rw", "my-read-only-dedup-bucket": "ro"}} 150 | 151 | ``blocks`` (list) 152 | ~~~~~~~~~~~~~~~~~ 153 | List of url match rules. Url match rules are somewhat described at 154 | https://github.com/internetarchive/brozzler/blob/master/job-conf.rst#scoping 155 | and https://github.com/iipc/urlcanon/blob/e2ab3524e/python/urlcanon/rules.py#L70. 156 | (TODO: write a better doc and link to it) 157 | 158 | Example:: 159 | 160 | Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} 161 | 162 | If any of the rules match the url being requested, warcprox aborts normal 163 | processing and responds with a http ``403``. The http response includes 164 | a ``Warcprox-Meta`` response header with one field, ``blocked-by-rule``, 165 | which reproduces the value of the match rule that resulted in the block. The 166 | presence of the ``warcprox-meta`` response header can be used by the client to 167 | distinguish this type of response from a 403 from the remote site. 
168 | 169 | An example:: 170 | 171 | $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]}' http://example.com/foo 172 | HTTP/1.0 403 Forbidden 173 | Server: BaseHTTP/0.6 Python/3.6.3 174 | Date: Fri, 25 May 2018 22:46:42 GMT 175 | Content-Type: text/plain;charset=utf-8 176 | Connection: close 177 | Content-Length: 111 178 | Warcprox-Meta: {"blocked-by-rule":{"ssurt":"com,example,//http:/"}} 179 | 180 | request rejected by warcprox: blocked by rule found in Warcprox-Meta header: {"ssurt": "com,example,//http:/"} 181 | 182 | You might be wondering why ``blocks`` is necessary. Why would the warcprox 183 | client make a request that it should already know will be blocked by the proxy? 184 | The answer is that the request may be initiated somewhere where it's difficult 185 | to evaluate the block rules. In particular, this circumstance prevails when the 186 | browser controlled by brozzler is requesting images, javascript, css, and so 187 | on, embedded in a page. 188 | 189 | ``compressed_blocks`` (string) 190 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 191 | If the ``blocks`` header is large, it may be useful or necessary to compress it. 192 | ``compressed_blocks`` is a string containing a zlib and base64-encoded 193 | ``blocks`` list. If both ``blocks`` and ``compressed_blocks`` are provided, 194 | warcprox will use the value of ``compressed_blocks``, however this behavior 195 | is not guaranteed. 
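A client might build a ``compressed_blocks`` value with Python's standard library, roughly like this (a sketch of the client side, not part of warcprox itself):

```python
import base64
import json
import zlib

def compress_blocks(blocks):
    # serialize the blocks list to json, zlib-compress it, then base64-encode
    raw = json.dumps(blocks).encode('utf-8')
    return base64.b64encode(zlib.compress(raw)).decode('ascii')

blocks = [{"ssurt": "com,example,//http:/"},
          {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]

# value to send as the Warcprox-Meta request header
warcprox_meta = json.dumps({"compressed_blocks": compress_blocks(blocks)})
```

Warcprox recovers the list by reversing the two steps: base64-decode, then zlib-decompress, then parse the json.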
196 | 197 | Example:: 198 | 199 | Warcprox-Meta: {"compressed_blocks": "eJwVykEKgCAQQNGryKwt90F0kGgxlZSgzuCMFIR3r7b//fkBkVoUBgMbJetvTBy9de5U5cFBs+aBnRKG/D8J44XF91XAGpC6ipaQj58u7iIdIfd88oSbBsrjF6gqtOUFJ5YjwQ=="} 200 | 201 | Is equivalent to:: 202 | 203 | {"blocks": [{"ssurt": "com,example,//http:/"}, {"domain": "malware.us", "substring": "wp-login.php?action=logout"}]} 204 | 205 | ``stats`` (dictionary) 206 | ~~~~~~~~~~~~~~~~~~~~~~ 207 | ``stats`` is a dictionary with only one field understood by warcprox, 208 | ``buckets``. The value of ``buckets`` is a list of strings and/or 209 | dictionaries. A string signifies the name of the bucket; a dictionary is 210 | expected to have at least an item with key ``bucket`` whose value is the name 211 | of the bucket. The other currently recognized key is ``tally-domains``, which 212 | if supplied should be a list of domains. This instructs warcprox to 213 | additionally tally substats of the given bucket by domain. 214 | 215 | See ``_ for more information on statistics kept by 216 | warcprox. 217 | 218 | Examples:: 219 | 220 | Warcprox-Meta: {"stats":{"buckets":["my-stats-bucket","all-the-stats"]}} 221 | Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}} 222 | 223 | Domain stats are stored in the stats table under the key 224 | ``"bucket2:foo.bar.com"`` for the latter example. See the following two 225 | sections for more examples. The ``soft-limits`` section has an example of a 226 | limit on a domain specified in ``tally-domains``. 227 | 228 | ``limits`` (dictionary) 229 | ~~~~~~~~~~~~~~~~~~~~~~~ 230 | Specifies quantitative limits for warcprox to enforce. The structure of the 231 | dictionary is ``{stats_key: numerical_limit, ...}`` where stats key has the 232 | format ``"bucket/sub-bucket/statistic"``. See `README.rst#statistics`_ for 233 | further explanation of what "bucket", "sub-bucket", and "statistic" mean here. 
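Concretely, a client might assemble a stats bucket and a matching limit like this (a sketch; the bucket name is hypothetical):

```python
import json

bucket = "my-crawl-bucket"  # hypothetical bucket name
warcprox_meta = {
    # count captures into this bucket...
    "stats": {"buckets": [bucket]},
    # ...and stop after 100 urls; the key format is
    # "bucket/sub-bucket/statistic"
    "limits": {"%s/total/urls" % bucket: 100},
}
header_value = json.dumps(warcprox_meta)  # value of the Warcprox-Meta header
```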
234 | 235 | If processing a request would result in exceeding a limit, warcprox aborts 236 | normal processing and responds with a http ``420 Reached Limit``. The http 237 | response includes a ``Warcprox-Meta`` response header with the complete set 238 | of statistics for the bucket whose limit has been reached. 239 | 240 | Example:: 241 | 242 | Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}} 243 | 244 | :: 245 | 246 | $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo 247 | HTTP/1.0 420 Reached limit 248 | Server: BaseHTTP/0.6 Python/3.6.3 249 | Date: Fri, 25 May 2018 23:08:32 GMT 250 | Content-Type: text/plain;charset=utf-8 251 | Connection: close 252 | Content-Length: 77 253 | Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-limit":{"test_limits_bucket/total/urls":10}} 254 | 255 | request rejected by warcprox: reached limit test_limits_bucket/total/urls=10 256 | 257 | ``soft-limits`` (dictionary) 258 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 259 | From warcprox's perspective ``soft-limits`` work almost exactly the same way 260 | as ``limits``. The only difference is that when a soft limit is hit, warcprox 261 | responds with an http ``430 Reached soft limit`` instead of an http ``420``. 262 | 263 | Warcprox clients might treat a ``430`` very differently from a ``420``. From 264 | brozzler's perspective, for instance, ``soft-limits`` are very different from 265 | ``limits``. When brozzler receives a ``420`` from warcprox because a ``limit`` 266 | has been reached, this means that crawling for that seed is finished, and 267 | brozzler sets about finalizing the crawl of that seed. 
On the other hand, 268 | brozzler blissfully ignores ``430`` responses, because soft limits only apply 269 | to a particular bucket (like a domain), and don't have any effect on crawling 270 | of urls that don't fall in that bucket. 271 | 272 | Example:: 273 | 274 | Warcprox-Meta: {"stats": {"buckets": [{"bucket": "test_domain_doc_limit_bucket", "tally-domains": ["foo.localhost"]}]}, "soft-limits": {"test_domain_doc_limit_bucket:foo.localhost/total/urls": 10}} 275 | 276 | :: 277 | 278 | $ curl -iksS --proxy localhost:8000 --header 'Warcprox-Meta: {"stats": {"buckets": ["test_limits_bucket"]}, "soft-limits": {"test_limits_bucket/total/urls": 10}}' http://example.com/foo 279 | HTTP/1.0 430 Reached soft limit 280 | Server: BaseHTTP/0.6 Python/3.6.3 281 | Date: Fri, 25 May 2018 23:12:06 GMT 282 | Content-Type: text/plain;charset=utf-8 283 | Connection: close 284 | Content-Length: 82 285 | Warcprox-Meta: {"stats":{"test_limits_bucket":{"bucket":"test_limits_bucket","total":{"urls":10,"wire_bytes":15840},"new":{"urls":0,"wire_bytes":0},"revisit":{"urls":10,"wire_bytes":15840}}},"reached-soft-limit":{"test_limits_bucket/total/urls":10}} 286 | 287 | request rejected by warcprox: reached soft limit test_limits_bucket/total/urls=10 288 | 289 | ``metadata`` (dictionary) 290 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 291 | An arbitrary dictionary. Warcprox mostly ignores this. The one exception is 292 | that if it has a ``seed`` entry and crawl logs are enabled via the 293 | ``--crawl-log-dir`` command line option, the value of ``seed`` is written to 294 | the crawl log as the 11th field on the line, simulating heritrix's "source 295 | tag". 296 | 297 | Example:: 298 | 299 | Warcprox-Meta: {"metadata": {"seed": "http://example.com/seed", "description": "here's some information about this crawl job. blah blah"}} 300 | 301 | ``accept`` (list) 302 | ~~~~~~~~~~~~~~~~~ 303 | Specifies fields that the client would like to receive in the ``Warcprox-Meta`` 304 | response header. 
Only one value is currently understood, 305 | ``capture-metadata``. 306 | 307 | Example:: 308 | 309 | Warcprox-Meta: {"accept": ["capture-metadata"]} 310 | 311 | The response will include a ``Warcprox-Meta`` response header with one field 312 | also called ``capture-metadata``. Currently warcprox reports one piece of 313 | capture metadata, ``timestamp``, which represents the time fetch began for the 314 | resource and matches the ``WARC-Date`` written to the warc record. For 315 | example:: 316 | 317 | Warcprox-Meta: {"capture-metadata":{"timestamp":"2018-05-30T00:22:49Z"}} 318 | 319 | ``Warcprox-Meta`` http response header 320 | ====================================== 321 | In some cases warcprox will add a ``Warcprox-Meta`` header to the http response 322 | that it sends to the client. As with the request header, the value is a json 323 | blob. It is only included if something in the ``warcprox-meta`` request header 324 | calls for it. Those cases are described above in the `Warcprox-Meta http 325 | request header`_ section. 326 | 327 | -------------------------------------------------------------------------------- /benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.12.9 2 | -------------------------------------------------------------------------------- /benchmarks/run-benchmarks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | run-benchmarks.py - some benchmarking code for warcprox 4 | 5 | Copyright (C) 2015-2017 Internet Archive 6 | 7 | This program is free software; you can redistribute it and/or 8 | modify it under the terms of the GNU General Public License 9 | as published by the Free Software Foundation; either version 2 10 | of the License, or (at your option) any later version. 
11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 20 | USA. 21 | ''' 22 | 23 | import aiohttp.web 24 | import asyncio 25 | import ssl 26 | import OpenSSL.crypto 27 | import OpenSSL.SSL 28 | import tempfile 29 | import random 30 | import os 31 | import logging 32 | import sys 33 | import time 34 | import argparse 35 | import hashlib 36 | import datetime 37 | import cryptography.hazmat.backends.openssl 38 | import warcprox 39 | import warcprox.main 40 | import threading 41 | 42 | warcprox.warcproxy.WarcProxyHandler.allow_localhost = True 43 | 44 | # https://medium.com/@generativist/a-simple-streaming-http-server-in-aiohttp-4233dbc173c7 45 | async def do_get(request): 46 | n = int(request.match_info.get('n')) 47 | response = aiohttp.web.StreamResponse( 48 | status=200, reason='OK', headers={ 49 | 'Content-Type': 'text/plain', 'Content-Length': str(n)}) 50 | await response.prepare(request) 51 | for i in range(n // 80): 52 | # some random bytes at the beginning to avoid deduplication 53 | # XXX doesn't work for n < 80 54 | if i == 0: 55 | rando = bytes([random.choice( 56 | b'abcdefghijlkmopqrstuvwxyz') for i in range(30)]) 57 | bs = rando + b'x' * 49 + b'\n' 58 | else: 59 | bs = b'x' * 79 + b'\n' 60 | await response.write(bs) 61 | if n % 80 > 0: 62 | await response.write(b'x' * (n % 80 - 1) + b'\n') 63 | 64 | return response 65 | 66 | def self_signed_cert(): 67 | key = OpenSSL.crypto.PKey() 68 | key.generate_key(OpenSSL.crypto.TYPE_RSA, 2048) 69 | 70 | cert = OpenSSL.crypto.X509() 71 | cert.set_serial_number(random.randint(0, 2 ** 64 - 1)) 72 | 
cert.get_subject().CN = '127.0.0.1' 73 | 74 | cert.set_version(2) 75 | cert.gmtime_adj_notBefore(0) 76 | cert.gmtime_adj_notAfter(10 * 365 * 24 * 60 * 60) 77 | 78 | cert.set_issuer(cert.get_subject()) 79 | cert.set_pubkey(key) 80 | cert.sign(key, 'sha1') 81 | 82 | return key, cert 83 | 84 | def ssl_context(): 85 | sslc = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) 86 | with tempfile.NamedTemporaryFile(delete=False) as certfile: 87 | key, cert = self_signed_cert() 88 | certfile.write( 89 | OpenSSL.crypto.dump_privatekey(OpenSSL.SSL.FILETYPE_PEM, key)) 90 | certfile.write( 91 | OpenSSL.crypto.dump_certificate(OpenSSL.SSL.FILETYPE_PEM, cert)) 92 | sslc.load_cert_chain(certfile.name) 93 | os.remove(certfile.name) 94 | return sslc 95 | 96 | def start_servers(): 97 | app = aiohttp.web.Application() 98 | app.router.add_get('/{n}', do_get) 99 | 100 | loop = asyncio.get_event_loop() 101 | 102 | http = loop.create_server( 103 | app.make_handler(access_log=None), '127.0.0.1', 4080) 104 | loop.run_until_complete(http) 105 | 106 | sslc = ssl_context() 107 | https = loop.create_server( 108 | app.make_handler(access_log=None), '127.0.0.1', 4443, ssl=sslc) 109 | loop.run_until_complete(https) 110 | 111 | async def fetch(session, url, proxy=None): 112 | # logging.info('sending request to %s', url) 113 | n_bytes = 0 114 | async with session.get(url, proxy=proxy) as response: 115 | assert response.status == 200 116 | while True: 117 | chunk = await response.content.read(2**16) 118 | n_bytes += len(chunk) 119 | if not chunk: 120 | break 121 | # logging.info('finished receiving response from %s', url) 122 | return n_bytes 123 | 124 | async def benchmarking_client( 125 | base_url, requests=200, payload_size=100000, proxy=None): 126 | start = time.time() 127 | connector = aiohttp.TCPConnector(ssl=False) 128 | n_urls = 0 129 | n_bytes = 0 130 | url = '%s/%s' % (base_url, payload_size) 131 | outstanding_requests = set() 132 | async with aiohttp.ClientSession(connector=connector) as 
session: 133 | for i in range(requests): 134 | future = asyncio.ensure_future(fetch(session, url, proxy)) 135 | outstanding_requests.add(future) 136 | # logging.info('scheduled future fetch of %s', url) 137 | while True: 138 | done, pending = await asyncio.wait( 139 | outstanding_requests, return_when=asyncio.FIRST_COMPLETED) 140 | for future in done: 141 | outstanding_requests.remove(future) 142 | n_urls += 1 143 | n_bytes += future.result() 144 | if not pending: 145 | return n_urls, n_bytes, time.time() - start 146 | 147 | def build_arg_parser(tmpdir, prog=os.path.basename(sys.argv[0])): 148 | desc = ''' 149 | Warcprox benchmarker. Runs simple http and https servers and uses them to 150 | benchmark warcprox. Runs 4 benchmarks: 151 | 152 | 1. baseline http (no warcprox) 153 | 2. baseline https (no warcprox) 154 | 3. http with warcprox 155 | 4. https with warcprox 156 | 157 | Uses a temporary directory for warcs and other files. Otherwise, most warcprox 158 | options can be specified on the command line. Useful for comparing performance 159 | with different options. 160 | 161 | Benchmarking code uses asyncio/aiohttp and requires python 3.5 or later. 
162 | ''' 163 | arg_parser = warcprox.main._build_arg_parser() 164 | arg_parser.description = desc 165 | 166 | arg_parser.add_argument( 167 | '--requests', dest='requests', type=int, default=200, 168 | help='number of urls to fetch') 169 | arg_parser.add_argument( 170 | '--payload-size', dest='payload_size', type=int, default=100000, 171 | help='size of each response payload, in bytes') 172 | arg_parser.add_argument( 173 | '--skip-baseline', dest='skip_baseline', action='store_true', 174 | help='skip the baseline benchmarks') 175 | 176 | # filter out options that are not configurable for the benchmarks 177 | filtered = [] 178 | for action in arg_parser._action_groups[1]._group_actions: 179 | if action.dest not in ( 180 | 'port', 'address', 'cacert', 'certs_dir', 'directory'): 181 | filtered.append(action) 182 | arg_parser._action_groups[1]._group_actions = filtered 183 | 184 | return arg_parser 185 | 186 | if __name__ == '__main__': 187 | # see https://github.com/pyca/cryptography/issues/2911 188 | cryptography.hazmat.backends.openssl.backend.activate_builtin_random() 189 | 190 | # with tempfile.TemporaryDirectory() as tmpdir: 191 | tmpdir = tempfile.mkdtemp() 192 | if True: 193 | arg_parser = build_arg_parser(tmpdir) 194 | args = arg_parser.parse_args(args=sys.argv[1:]) 195 | 196 | if args.trace: 197 | loglevel = logging.TRACE 198 | elif args.verbose: 199 | loglevel = logging.DEBUG 200 | else: 201 | loglevel = logging.INFO 202 | 203 | logging.basicConfig( 204 | stream=sys.stdout, level=loglevel, format=( 205 | '%(asctime)s %(process)d %(levelname)s %(threadName)s ' 206 | '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) ' 207 | '%(message)s')) 208 | logging.getLogger('warcprox').setLevel(loglevel + 5) 209 | 210 | logging.info('using temp dir %s', tmpdir) 211 | 212 | args.playback_port = None 213 | args.address = '127.0.0.1' 214 | args.port = 0 215 | args.cacert = os.path.join(tmpdir, 'benchmark-warcprox-ca.pem') 216 | args.certs_dir = os.path.join(tmpdir, 
'benchmark-warcprox-ca') 217 | args.directory = os.path.join(tmpdir, 'warcs') 218 | # if args.rethinkdb_servers: 219 | # args.rethinkdb_db = 'benchmarks_{:%Y%m%d%H%M%S}' % ( 220 | # datetime.datetime.utcnow()) 221 | 222 | start_servers() 223 | logging.info( 224 | 'servers running at http://127.0.0.1:4080 and ' 225 | 'https://127.0.0.1:4443') 226 | 227 | loop = asyncio.get_event_loop() 228 | 229 | logging.info('===== baseline benchmark starting (no proxy) =====') 230 | if not args.skip_baseline: 231 | n_urls, n_bytes, elapsed = loop.run_until_complete( 232 | benchmarking_client( 233 | 'http://127.0.0.1:4080', args.requests, 234 | args.payload_size)) 235 | logging.info( 236 | 'http baseline (no proxy): n_urls=%s n_bytes=%s in %.1f ' 237 | 'sec', n_urls, n_bytes, elapsed) 238 | 239 | n_urls, n_bytes, elapsed = loop.run_until_complete( 240 | benchmarking_client( 241 | 'https://127.0.0.1:4443', args.requests, 242 | args.payload_size)) 243 | logging.info( 244 | 'https baseline (no proxy): n_urls=%s n_bytes=%s in %.1f ' 245 | 'sec', n_urls, n_bytes, elapsed) 246 | else: 247 | logging.info('SKIPPED') 248 | logging.info('===== baseline benchmark finished =====') 249 | 250 | options = warcprox.Options(**vars(args)) 251 | warcprox_controller = warcprox.controller.WarcproxController(options) 252 | 253 | warcprox_controller_thread = threading.Thread( 254 | target=warcprox_controller.run_until_shutdown) 255 | warcprox_controller_thread.start() 256 | 257 | proxy = 'http://%s:%s' % ( 258 | warcprox_controller.proxy.server_address[0], 259 | warcprox_controller.proxy.server_address[1]) 260 | logging.info('===== warcprox benchmark starting =====') 261 | n_urls, n_bytes, elapsed = loop.run_until_complete( 262 | benchmarking_client( 263 | 'http://127.0.0.1:4080', args.requests, args.payload_size, 264 | proxy)) 265 | logging.info( 266 | 'http: n_urls=%s n_bytes=%s in %.1f sec', 267 | n_urls, n_bytes, elapsed) 268 | 269 | n_urls, n_bytes, elapsed = loop.run_until_complete( 270 | 
benchmarking_client( 271 | 'https://127.0.0.1:4443', args.requests, args.payload_size, 272 | proxy)) 273 | logging.info( 274 | 'https: n_urls=%s n_bytes=%s in %.1f sec', 275 | n_urls, n_bytes, elapsed) 276 | 277 | start = time.time() 278 | warcprox_controller.stop.set() 279 | warcprox_controller_thread.join() 280 | logging.info( 281 | 'waited %.1f sec for warcprox to finish', time.time() - start) 282 | logging.info('===== warcprox benchmark finished =====') 283 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "warcprox" 3 | authors = [ 4 | { name="Noah Levitt", email="nlevitt@archive.org" }, 5 | ] 6 | maintainers = [ 7 | { name="Vangelis Banos", email="vangelis@archive.org" }, 8 | { name="Adam Miller", email="adam@archive.org" }, 9 | { name="Barbara Miller", email="barbara@archive.org" }, 10 | { name="Alex Dempsey", email="avdempsey@archive.org" }, 11 | ] 12 | description = "WARC writing MITM HTTP/S proxy" 13 | readme = "README.rst" 14 | requires-python = ">=3.8" 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: Apache Software License", 18 | "Operating System :: OS Independent", 19 | ] 20 | dynamic = [ "version", "license", "scripts", "dependencies", "optional-dependencies" ] 21 | 22 | [project.urls] 23 | Homepage = "https://github.com/internetarchive/warcprox" 24 | Issues = "https://github.com/internetarchive/warcprox/issues" 25 | 26 | [build-system] 27 | requires = ["setuptools>=61.0"] 28 | build-backend = "setuptools.build_meta" 29 | 30 | [dependency-groups] 31 | dev = [ 32 | "mock", 33 | "pytest>=8.3.5", 34 | "pyopenssl", 35 | "warcio", 36 | ] 37 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | 
[tool:pytest] 5 | addopts=-v 6 | testpaths=tests 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | setup.py - setuptools installation configuration for warcprox 4 | 5 | Copyright (C) 2013-2025 Internet Archive 6 | 7 | This program is free software; you can redistribute it and/or 8 | modify it under the terms of the GNU General Public License 9 | as published by the Free Software Foundation; either version 2 10 | of the License, or (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 20 | USA. 
21 | ''' 22 | 23 | import sys 24 | import setuptools 25 | 26 | deps = [ 27 | 'warctools>=5.0.0', 28 | 'urlcanon>=0.3.0', 29 | 'doublethink==0.4.9', 30 | 'urllib3>=1.23', 31 | 'requests>=2.0.1', 32 | 'PySocks>=1.6.8', 33 | 'cryptography>=45,<46', 34 | 'idna', 35 | 'PyYAML>=5.1', 36 | 'cachetools', 37 | 'rfc3986>=1.5.0', 38 | # Needed because of rethinkdb 2.4.9; 39 | # can be removed once doublethink upgrades to 2.4.10.post1 40 | 'setuptools>=75.8.0;python_version>="3.12"', 41 | ] 42 | try: 43 | import concurrent.futures 44 | except ImportError: 45 | deps.append('futures') 46 | 47 | setuptools.setup( 48 | name='warcprox', 49 | version='2.7.0', 50 | description='WARC writing MITM HTTP/S proxy', 51 | url='https://github.com/internetarchive/warcprox', 52 | author='Noah Levitt', 53 | author_email='nlevitt@archive.org', 54 | long_description=open('README.rst').read(), 55 | license='GPL', 56 | packages=['warcprox'], 57 | install_requires=deps, 58 | # preferred trough 'trough @ git+https://github.com/internetarchive/trough.git@jammy_focal' 59 | extras_require={'trough': 'trough'}, 60 | setup_requires=['pytest-runner'], 61 | tests_require=['mock', 'pytest', 'warcio', 'pyOpenSSL'], 62 | entry_points={ 63 | 'console_scripts': [ 64 | 'warcprox=warcprox.main:main', 65 | ('warcprox-ensure-rethinkdb-tables=' 66 | 'warcprox.main:ensure_rethinkdb_tables'), 67 | ], 68 | }, 69 | zip_safe=False, 70 | classifiers=[ 71 | 'Development Status :: 5 - Production/Stable', 72 | 'Environment :: Console', 73 | 'License :: OSI Approved :: GNU General Public License (GPL)', 74 | 'Programming Language :: Python :: 3.8', 75 | 'Programming Language :: Python :: 3.9', 76 | 'Programming Language :: Python :: 3.10', 77 | 'Programming Language :: Python :: 3.11', 78 | 'Topic :: Internet :: Proxy Servers', 79 | 'Topic :: Internet :: WWW/HTTP', 80 | 'Topic :: Software Development :: Libraries :: Python Modules', 81 | 'Topic :: System :: Archiving', 82 | ]) 83 | 
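The `try`/`except ImportError` block in setup.py above is a conditional-dependency pattern: probe for a stdlib module at setup time and append a backport to `install_requires` only when the import fails. A minimal sketch of the pattern in isolation (`warctools>=5.0.0` is just the first pin from the deps list above):

```python
# Conditional dependency, as in setup.py above: add the 'futures'
# backport only when the stdlib module is missing (i.e. on Python 2).
deps = ['warctools>=5.0.0']
try:
    import concurrent.futures  # stdlib since Python 3.2
except ImportError:
    deps.append('futures')  # PyPI backport for Python 2

print(deps)
```

On any Python 3 interpreter the import succeeds and `deps` is left unchanged. The declarative equivalent, which avoids running code at build time, is a PEP 508 environment marker such as `'futures; python_version < "3"'` — the same mechanism the deps list already uses for its `setuptools>=75.8.0;python_version>="3.12"` pin.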
-------------------------------------------------------------------------------- /tests/Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # Dockerfile for warcprox tests 3 | # 4 | # Copyright (C) 2015-2017 Internet Archive 5 | # 6 | # This program is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License 8 | # as published by the Free Software Foundation; either version 2 9 | # of the License, or (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program; if not, write to the Free Software 18 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 19 | # USA. 
20 | # 21 | 22 | FROM ubuntu:focal-20220404 23 | MAINTAINER Noah Levitt 24 | 25 | # see https://github.com/stuartpb/rethinkdb-dockerfiles/blob/master/trusty/2.1.3/Dockerfile 26 | # and https://github.com/chali/hadoop-cdh-pseudo-docker/blob/master/Dockerfile 27 | 28 | ENV LANG=C.UTF-8 29 | 30 | RUN apt-get update && apt-get --auto-remove -y dist-upgrade 31 | RUN apt-get install -y ca-certificates curl gnupg wget 32 | 33 | # Add the RethinkDB repository and public key 34 | RUN curl -Ss https://download.rethinkdb.com/repository/raw/pubkey.gpg | apt-key add - 35 | RUN echo "deb https://download.rethinkdb.com/repository/ubuntu-focal focal main" > /etc/apt/sources.list.d/rethinkdb.list \ 36 | && apt-get update && apt-get -y install rethinkdb 37 | 38 | RUN mkdir -vp /etc/service/rethinkdb \ 39 | && echo "#!/bin/bash\nexec rethinkdb --bind 0.0.0.0 --directory /tmp/rethink-data --runuser rethinkdb --rungroup rethinkdb\n" > /etc/service/rethinkdb/run \ 40 | && chmod a+x /etc/service/rethinkdb/run 41 | 42 | RUN apt-get -y install git 43 | RUN apt-get -y install libpython2.7-dev libpython3-dev libffi-dev libssl-dev \ 44 | python-setuptools python3-setuptools 45 | RUN apt-get -y install gcc 46 | 47 | RUN echo '57ff41e99cb01b6a1c2b0999161589b726f0ec8b /tmp/pip-9.0.1.tar.gz' > /tmp/sha1sums.txt 48 | RUN curl -sSL -o /tmp/pip-9.0.1.tar.gz https://pypi.python.org/packages/11/b6/abcb525026a4be042b486df43905d6893fb04f05aac21c32c638e939e447/pip-9.0.1.tar.gz 49 | RUN sha1sum -c /tmp/sha1sums.txt 50 | RUN tar -C /tmp -xf /tmp/pip-9.0.1.tar.gz 51 | RUN cd /tmp/pip-9.0.1 && python3 setup.py install 52 | 53 | RUN pip install virtualenv 54 | 55 | RUN apt-get -y install tor 56 | RUN mkdir -vp /etc/service/tor \ 57 | && echo "#!/bin/sh\nexec tor\n" > /etc/service/tor/run \ 58 | && chmod a+x /etc/service/tor/run 59 | 60 | # hadoop hdfs for trough 61 | 62 | ARG DEBIAN_FRONTEND=noninteractive 63 | ENV TZ=Etc/UTC 64 | RUN apt-get install -y openjdk-8-jdk openssh-server 65 | 66 | # set java home 
67 | ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 68 | 69 | # setup ssh with no passphrase 70 | RUN ssh-keygen -t rsa -f $HOME/.ssh/id_rsa -P "" \ 71 | && cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys 72 | 73 | RUN wget -O /hadoop-2.7.3.tar.gz -q https://archive.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz \ 74 | && tar xfz hadoop-2.7.3.tar.gz \ 75 | && mv /hadoop-2.7.3 /usr/local/hadoop \ 76 | && rm /hadoop-2.7.3.tar.gz 77 | 78 | # hadoop environment variables 79 | ENV HADOOP_HOME=/usr/local/hadoop 80 | ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin 81 | 82 | # hadoop-store 83 | RUN mkdir -p $HADOOP_HOME/hdfs/namenode \ 84 | && mkdir -p $HADOOP_HOME/hdfs/datanode 85 | 86 | # Temporary files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html 87 | COPY config/ /tmp/ 88 | RUN mv /tmp/ssh_config $HOME/.ssh/config \ 89 | && mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh \ 90 | && mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml \ 91 | && mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml \ 92 | && mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml.template \ 93 | && cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml \ 94 | && mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml 95 | 96 | # Add startup script 97 | ADD config/hadoop-services.sh $HADOOP_HOME/hadoop-services.sh 98 | 99 | # set permissions 100 | RUN chmod 744 -R $HADOOP_HOME 101 | 102 | # format namenode 103 | RUN $HADOOP_HOME/bin/hdfs namenode -format 104 | 105 | # run hadoop services 106 | #ENTRYPOINT $HADOOP_HOME/hadoop-services.sh; bash 107 | 108 | RUN apt-get install -y libsqlite3-dev build-essential 109 | 110 | # trough itself 111 | RUN virtualenv -p python3 /opt/trough-ve3 \ 112 | && . 
/opt/trough-ve3/bin/activate \ 113 | && pip install git+https://github.com/nlevitt/snakebite.git@py3 \ 114 | && pip install git+https://github.com/internetarchive/trough.git 115 | 116 | RUN mkdir -vp /etc/service/trough-sync-local \ 117 | && echo "#!/bin/bash\nsource /opt/trough-ve3/bin/activate\nexec sync.py >>/tmp/trough-sync-local.out 2>&1" > /etc/service/trough-sync-local/run \ 118 | && chmod a+x /etc/service/trough-sync-local/run 119 | 120 | RUN mkdir -vp /etc/service/trough-sync-server \ 121 | && echo '#!/bin/bash\nsource /opt/trough-ve3/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec sync.py --server >>/tmp/trough-sync-server.out 2>&1' > /etc/service/trough-sync-server/run \ 122 | && chmod a+x /etc/service/trough-sync-server/run 123 | 124 | RUN mkdir -vp /etc/service/trough-read \ 125 | && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6444 --master --processes=2 --harakiri=3200 --socket-timeout=3200 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/reader.py >>/tmp/trough-read.out 2>&1' > /etc/service/trough-read/run \ 126 | && chmod a+x /etc/service/trough-read/run 127 | 128 | RUN mkdir -vp /etc/service/trough-write \ 129 | && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6222 --master --processes=2 --harakiri=240 --max-requests=50000 --vacuum --die-on-term --wsgi-file $venv/bin/writer.py 
>>/tmp/trough-write.out 2>&1' > /etc/service/trough-write/run \ 130 | && chmod a+x /etc/service/trough-write/run 131 | 132 | RUN mkdir -vp /etc/service/trough-segment-manager-local \ 133 | && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6112 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:local >>/tmp/trough-segment-manager-local.out 2>&1' > /etc/service/trough-segment-manager-local/run \ 134 | && chmod a+x /etc/service/trough-segment-manager-local/run 135 | 136 | RUN mkdir -vp /etc/service/trough-segment-manager-server \ 137 | && echo '#!/bin/bash\nvenv=/opt/trough-ve3\nsource $venv/bin/activate\nsleep 5\npython -c $"import doublethink ; from trough.settings import settings ; rr = doublethink.Rethinker(settings[\"RETHINKDB_HOSTS\"]) ; rr.db(\"trough_configuration\").wait().run()"\nexec uwsgi --venv=$venv --http :6111 --master --processes=2 --harakiri=7200 --http-timeout=7200 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1' > /etc/service/trough-segment-manager-server/run \ 138 | && chmod a+x /etc/service/trough-segment-manager-server/run 139 | 140 | RUN apt-get install -y daemontools daemontools-run 141 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | ''' 2 | tests/conftest.py - command line options for warcprox tests 3 | 4 | Copyright (C) 2015-2017 Internet Archive 5 | 6 | This program is free software; you can redistribute it and/or 7 | modify it under the terms of the GNU General Public License 8 | as 
published by the Free Software Foundation; either version 2 9 | of the License, or (at your option) any later version. 10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with this program; if not, write to the Free Software 18 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 19 | USA. 20 | ''' 21 | 22 | import pytest 23 | 24 | def pytest_addoption(parser): 25 | parser.addoption( 26 | '--rethinkdb-dedup-url', dest='rethinkdb_dedup_url', help=( 27 | 'rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,' 28 | 'db1.foo.org:38015/my_warcprox_db/my_dedup_table')) 29 | parser.addoption( 30 | '--rethinkdb-big-table-url', dest='rethinkdb_big_table_url', help=( 31 | 'rethinkdb big table url (table will be populated with ' 32 | 'various capture information and is suitable for use as ' 33 | 'index for playback), e.g. rethinkdb://db0.foo.org,' 34 | 'db1.foo.org:38015/my_warcprox_db/captures')) 35 | parser.addoption( 36 | '--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url', help=( 37 | '🐷   url pointing to trough configuration rethinkdb database, ' 38 | 'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015' 39 | '/trough_configuration')) 40 | 41 | -------------------------------------------------------------------------------- /tests/run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # tests/run-tests.sh - Runs tests in a docker container. Also runs a temporary 4 | # instance of rethinkdb inside the container. The tests run with rethinkdb 5 | # features enabled, against that instance of rethinkdb, and also run without 6 | # rethinkdb features enabled. With python 3.
7 | # 8 | # Copyright (C) 2015-2017 Internet Archive 9 | # 10 | # This program is free software; you can redistribute it and/or 11 | # modify it under the terms of the GNU General Public License 12 | # as published by the Free Software Foundation; either version 2 13 | # of the License, or (at your option) any later version. 14 | # 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program; if not, write to the Free Software 22 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 23 | # USA. 24 | # 25 | # 😬 26 | # 27 | 28 | set -e 29 | 30 | script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 31 | 32 | docker build -t internetarchive/warcprox-tests $script_dir 33 | 34 | docker run --rm --volume="$script_dir/..:/warcprox" internetarchive/warcprox-tests \ 35 | bash -x -c "cd /tmp && git clone /warcprox && cd /tmp/warcprox \ 36 | && (cd /warcprox && git diff HEAD) | patch -p1 \ 37 | && virtualenv -p python3 /tmp/venv \ 38 | && source /tmp/venv/bin/activate \ 39 | && pip --log-file /tmp/pip.log install . 
pytest mock requests warcio trough \ 40 | && py.test -v tests; \ 41 | svscan /etc/service & \ 42 | sleep 10; \ 43 | py.test -v --rethinkdb-dedup-url=rethinkdb://localhost/test1/dedup tests \ 44 | && py.test -v --rethinkdb-big-table-url=rethinkdb://localhost/test2/captures tests \ 45 | && /usr/local/hadoop/hadoop-services.sh \ 46 | && py.test -v --rethinkdb-trough-db-url=rethinkdb://localhost/trough_configuration tests \ 47 | " 48 | 49 | -------------------------------------------------------------------------------- /tests/run-trough.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # this is used by .travis.yml 4 | # 5 | 6 | set -x 7 | 8 | pip install git+https://github.com/nlevitt/snakebite.git@py3 9 | pip install git+https://github.com/internetarchive/trough.git 10 | 11 | mkdir /etc/trough 12 | 13 | # hello docker user-defined bridge networking 14 | echo ' 15 | HDFS_HOST: hadoop 16 | RETHINKDB_HOSTS: 17 | - rethinkdb 18 | ' > /etc/trough/settings.yml 19 | 20 | sync.py >>/tmp/trough-sync-local.out 2>&1 & 21 | 22 | sleep 5 23 | python -c " 24 | import doublethink 25 | from trough.settings import settings 26 | rr = doublethink.Rethinker(settings['RETHINKDB_HOSTS']) 27 | rr.db('trough_configuration').wait().run()" 28 | 29 | sync.py --server >>/tmp/trough-sync-server.out 2>&1 & 30 | uwsgi --http :6222 --master --processes=2 --harakiri=240 --max-requests=50000 --vacuum --die-on-term --wsgi-file /usr/local/bin/writer.py >>/tmp/trough-write.out 2>&1 & 31 | uwsgi --http :6112 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:local >>/tmp/trough-segment-manager-local.out 2>&1 & 32 | uwsgi --http :6111 --master --processes=2 --harakiri=20 --max-requests=50000 --vacuum --die-on-term --mount /=trough.wsgi.segment_manager:server >>/tmp/trough-segment-manager-server.out 2>&1 & 33 | uwsgi --http :6444 --master --processes=2 --harakiri=3200 
--socket-timeout=3200 --max-requests=50000 --vacuum --die-on-term --wsgi-file /usr/local/bin/reader.py >>/tmp/trough-read.out 2>&1 & 34 | 35 | wait 36 | 37 | -------------------------------------------------------------------------------- /tests/single-threaded-proxy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | tests/single-threaded-proxy.py - single-threaded MITM proxy, useful for 4 | debugging, does not write warcs 5 | 6 | Copyright (C) 2015-2017 Internet Archive 7 | 8 | This program is free software; you can redistribute it and/or 9 | modify it under the terms of the GNU General Public License 10 | as published by the Free Software Foundation; either version 2 11 | of the License, or (at your option) any later version. 12 | 13 | This program is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, write to the Free Software 20 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 21 | USA. 
22 | """ 23 | import warcprox 24 | import logging 25 | import sys 26 | import argparse 27 | import warcprox.certauth 28 | import queue 29 | import socket 30 | import os 31 | 32 | class FakeQueue: 33 | logger = logging.getLogger("FakeQueue") 34 | def __init__(self, maxsize=0): pass 35 | def join(self): pass 36 | def qsize(self): return 0 37 | def empty(self): return True 38 | def full(self): return False 39 | def get(self, block=True, timeout=None): raise queue.Empty 40 | def put_nowait(self, item): return self.put(item, block=False) 41 | def get_nowait(self): return self.get(block=False) 42 | def put(self, recorded_url, block=True, timeout=None): 43 | logging.info("{} {} {} {} {} size={} {}".format( 44 | recorded_url.client_ip, recorded_url.status, recorded_url.method, 45 | recorded_url.url.decode("utf-8"), recorded_url.mimetype, 46 | recorded_url.size, warcprox.digest_str(recorded_url.payload_digest, False).decode('utf-8'))) 47 | 48 | def parse_args(): 49 | prog = os.path.basename(sys.argv[0]) 50 | arg_parser = argparse.ArgumentParser(prog=prog, 51 | description='%s - single threaded mitm http/s proxy, for debugging' % prog, 52 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 53 | arg_parser.add_argument('-p', '--port', dest='port', default='8000', 54 | type=int, help='port to listen on') 55 | arg_parser.add_argument('-b', '--address', dest='address', 56 | default='localhost', help='address to listen on') 57 | arg_parser.add_argument('-c', '--cacert', dest='cacert', 58 | default='./{}-warcprox-ca.pem'.format(socket.gethostname()), 59 | help='CA certificate file; if file does not exist, it will be created') 60 | arg_parser.add_argument('--certs-dir', dest='certs_dir', 61 | default='./{}-warcprox-ca'.format(socket.gethostname()), 62 | help='where to store and load generated certificates') 63 | arg_parser.add_argument('--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', 64 | default=None, help='host:port of tor socks proxy, used only to connect to .onion 
sites') 65 | arg_parser.add_argument('--version', action='version', 66 | version="warcprox {}".format(warcprox.__version__)) 67 | arg_parser.add_argument('-v', '--verbose', dest='verbose', action='store_true') 68 | arg_parser.add_argument('-q', '--quiet', dest='quiet', action='store_true') 69 | 70 | return arg_parser.parse_args(args=sys.argv[1:]) 71 | 72 | def init_logging(args): 73 | if args.verbose: 74 | loglevel = logging.DEBUG 75 | elif args.quiet: 76 | loglevel = logging.WARNING 77 | else: 78 | loglevel = logging.INFO 79 | 80 | logging.basicConfig(stream=sys.stdout, level=loglevel, 81 | format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') 82 | # format='%(asctime)s %(funcName) 21s() %(filename)15s:%(lineno)05d %(message)s') 83 | 84 | def init_proxy(args): 85 | ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64] 86 | ca = warcprox.certauth.CertificateAuthority(args.cacert, args.certs_dir, 87 | ca_name=ca_name) 88 | options = warcprox.Options(**vars(args)) 89 | proxy = warcprox.warcproxy.SingleThreadedWarcProxy(ca, 90 | recorded_url_q=FakeQueue(), options=options) 91 | return proxy 92 | 93 | if __name__ == "__main__": 94 | args = parse_args() 95 | init_logging(args) 96 | proxy = init_proxy(args) 97 | 98 | proxy.serve_forever() 99 | 100 | -------------------------------------------------------------------------------- /tests/test_certauth.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | from warcprox.certauth import main, CertificateAuthority 5 | import tempfile 6 | from OpenSSL import crypto 7 | import datetime 8 | import time 9 | 10 | import pytest 11 | 12 | def setup_module(): 13 | global TEST_CA_DIR 14 | TEST_CA_DIR = tempfile.mkdtemp() 15 | 16 | global TEST_CA_ROOT 17 | TEST_CA_ROOT = os.path.join(TEST_CA_DIR, 'certauth_test_ca.pem') 18 | 19 | def teardown_module(): 20 | 
shutil.rmtree(TEST_CA_DIR) 21 | assert not os.path.isdir(TEST_CA_DIR) 22 | assert not os.path.isfile(TEST_CA_ROOT) 23 | 24 | def test_create_root(): 25 | ret = main([TEST_CA_ROOT, '-c', 'Test Root Cert']) 26 | assert ret == 0 27 | 28 | def test_create_host_cert(): 29 | ret = main([TEST_CA_ROOT, '-d', TEST_CA_DIR, '-n', 'example.com']) 30 | assert ret == 0 31 | certfile = os.path.join(TEST_CA_DIR, 'example.com.pem') 32 | assert os.path.isfile(certfile) 33 | 34 | def test_create_wildcard_host_cert_force_overwrite(): 35 | ret = main([TEST_CA_ROOT, '-d', TEST_CA_DIR, '--hostname', 'example.com', '-w', '-f']) 36 | assert ret == 0 37 | certfile = os.path.join(TEST_CA_DIR, 'example.com.pem') 38 | assert os.path.isfile(certfile) 39 | 40 | def test_explicit_wildcard(): 41 | ca = CertificateAuthority(TEST_CA_ROOT, TEST_CA_DIR, 'Test CA') 42 | filename = ca.get_wildcard_cert('test.example.proxy') 43 | certfile = os.path.join(TEST_CA_DIR, 'example.proxy.pem') 44 | assert filename == certfile 45 | assert os.path.isfile(certfile) 46 | os.remove(certfile) 47 | 48 | def test_create_already_exists(): 49 | ret = main([TEST_CA_ROOT, '-d', TEST_CA_DIR, '-n', 'example.com', '-w']) 50 | assert ret == 1 51 | certfile = os.path.join(TEST_CA_DIR, 'example.com.pem') 52 | assert os.path.isfile(certfile) 53 | # remove now 54 | os.remove(certfile) 55 | 56 | def test_create_root_already_exists(): 57 | ret = main([TEST_CA_ROOT]) 58 | # not created, already exists 59 | assert ret == 1 60 | # remove now 61 | os.remove(TEST_CA_ROOT) 62 | 63 | # We have what might be some time zone issues with this right now 64 | @pytest.mark.xfail 65 | def test_create_root_subdir(): 66 | # create a new cert in a subdirectory 67 | subdir = os.path.join(TEST_CA_DIR, 'subdir') 68 | 69 | ca_file = os.path.join(subdir, 'certauth_test_ca.pem') 70 | 71 | ca = CertificateAuthority(ca_file, subdir, 'Test CA', 72 | cert_not_before=-60 * 60, 73 | cert_not_after=60 * 60 * 24 * 3) 74 | 75 | assert os.path.isdir(subdir) 76 | 
assert os.path.isfile(ca_file) 77 | 78 | buff = ca.get_root_PKCS12() 79 | assert len(buff) > 0 80 | 81 | expected_not_before = datetime.datetime.utcnow() - datetime.timedelta(seconds=60 * 60) 82 | expected_not_after = datetime.datetime.utcnow() + datetime.timedelta(seconds=60 * 60 * 24 * 3) 83 | 84 | cert = crypto.load_pkcs12(buff).get_certificate() 85 | 86 | actual_not_before = datetime.datetime.strptime( 87 | cert.get_notBefore().decode('ascii'), '%Y%m%d%H%M%SZ') 88 | actual_not_after = datetime.datetime.strptime( 89 | cert.get_notAfter().decode('ascii'), '%Y%m%d%H%M%SZ') 90 | 91 | time.mktime(expected_not_before.utctimetuple()) 92 | assert abs(time.mktime(actual_not_before.utctimetuple()) - time.mktime(expected_not_before.utctimetuple())) < 10 93 | assert abs(time.mktime(actual_not_after.utctimetuple()) - time.mktime(expected_not_after.utctimetuple())) < 10 94 | -------------------------------------------------------------------------------- /tests/test_dedup.py: -------------------------------------------------------------------------------- 1 | import mock 2 | from warcprox.dedup import CdxServerDedup 3 | 4 | 5 | def test_cdx_dedup(): 6 | # Mock CDX Server responses to simulate found, not found and errors. 
7 | url = "http://example.com" 8 | # not found case 9 | result = mock.Mock() 10 | result.status = 200 11 | result.data = b'20170101020405 test' 12 | cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") 13 | cdx_server.http_pool.request = mock.MagicMock(return_value=result) 14 | res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", 15 | url=url) 16 | assert res is None 17 | 18 | # found case 19 | result = mock.Mock() 20 | result.status = 200 21 | result.data = b'20170203040503 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' 22 | cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") 23 | cdx_server.http_pool.request = mock.MagicMock(return_value=result) 24 | res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", 25 | url=url) 26 | assert res["date"] == b"2017-02-03T04:05:03Z" 27 | 28 | # invalid CDX result status code 29 | result = mock.Mock() 30 | result.status = 400 31 | result.data = b'20170101020405 B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A' 32 | cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") 33 | cdx_server.http_pool.request = mock.MagicMock(return_value=result) 34 | res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", 35 | url=url) 36 | assert res is None 37 | 38 | # invalid CDX result content 39 | result = mock.Mock() 40 | result.status = 200 41 | result.data = b'InvalidExceptionResult' 42 | cdx_server = CdxServerDedup(cdx_url="dummy-cdx-server-url") 43 | cdx_server.http_pool.request = mock.MagicMock(return_value=result) 44 | res = cdx_server.lookup(digest_key="B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A", 45 | url=url) 46 | assert res is None 47 | -------------------------------------------------------------------------------- /tests/test_ensure_rethinkdb_tables.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | tests/test_ensure_rethinkdb_tables.py - automated tests of 4 | ensure-rethinkdb-tables utility 5 | 6 | Copyright (C) 2017 Internet 
Archive 7 | 8 | This program is free software; you can redistribute it and/or 9 | modify it under the terms of the GNU General Public License 10 | as published by the Free Software Foundation; either version 2 11 | of the License, or (at your option) any later version. 12 | 13 | This program is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, write to the Free Software 20 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 21 | USA. 22 | ''' 23 | 24 | import warcprox.main 25 | import pytest 26 | import socket 27 | import doublethink 28 | import logging 29 | import sys 30 | 31 | logging.basicConfig( 32 | stream=sys.stdout, level=logging.TRACE, 33 | format='%(asctime)s %(process)d %(levelname)s %(threadName)s ' 34 | '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') 35 | 36 | def rethinkdb_is_running(): 37 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 38 | try: 39 | sock.connect(('127.0.0.1', 28015)) 40 | return True 41 | except OSError: 42 | return False 43 | 44 | if_rethinkdb = pytest.mark.skipif( 45 | not rethinkdb_is_running(), 46 | reason='rethinkdb not listening at 127.0.0.1:28015') 47 | 48 | @if_rethinkdb 49 | def test_individual_options(): 50 | rr = doublethink.Rethinker(['127.0.0.1']) 51 | 52 | try: 53 | warcprox.main.ensure_rethinkdb_tables([ 54 | 'warcprox-ensure-rethinkdb-tables', 55 | '--rethinkdb-stats-url=rethinkdb://127.0.0.1/db0/stats']) 56 | assert rr.db('db0').table_list().run() == ['stats'] 57 | finally: 58 | rr.db_drop('db0').run() 59 | 60 | try: 61 | warcprox.main.ensure_rethinkdb_tables([ 62 | 'warcprox-ensure-rethinkdb-tables', 63 | '--rethinkdb-services-url=rethinkdb://127.0.0.1/db1/services']) 64 | assert
rr.db('db1').table_list().run() == ['services'] 65 | finally: 66 | rr.db_drop('db1').run() 67 | 68 | try: 69 | warcprox.main.ensure_rethinkdb_tables([ 70 | 'warcprox-ensure-rethinkdb-tables', 71 | '--rethinkdb-dedup-url=rethinkdb://127.0.0.1/db2/dedup']) 72 | assert rr.db('db2').table_list().run() == ['dedup'] 73 | finally: 74 | rr.db_drop('db2').run() 75 | 76 | try: 77 | warcprox.main.ensure_rethinkdb_tables([ 78 | 'warcprox-ensure-rethinkdb-tables', 79 | '--rethinkdb-big-table-url=rethinkdb://127.0.0.1/db3/captures']) 80 | assert rr.db('db3').table_list().run() == ['captures'] 81 | finally: 82 | rr.db_drop('db3').run() 83 | 84 | try: 85 | warcprox.main.ensure_rethinkdb_tables([ 86 | 'warcprox-ensure-rethinkdb-tables', 87 | '--rethinkdb-trough-db-url=rethinkdb://127.0.0.1/db4']) 88 | assert rr.db('db4').table_list().run() == ['services'] 89 | # ['assignment', 'lock', 'schema', 'services'] 90 | finally: 91 | rr.db_drop('db4').run() 92 | 93 | @if_rethinkdb 94 | def test_combos(): 95 | rr = doublethink.Rethinker(['127.0.0.1']) 96 | 97 | try: 98 | warcprox.main.ensure_rethinkdb_tables([ 99 | 'warcprox-ensure-rethinkdb-tables', 100 | '--rethinkdb-stats-url=rethinkdb://127.0.0.1/db00/stats', 101 | '--rethinkdb-trough-db-url=rethinkdb://127.0.0.1/db01', 102 | ]) 103 | assert rr.db('db00').table_list().run() == ['stats'] 104 | assert rr.db('db01').table_list().run() == ['services'] 105 | # ['assignment', 'lock', 'schema', 'services'] 106 | finally: 107 | rr.db_drop('db00').run() 108 | rr.db_drop('db01').run() 109 | -------------------------------------------------------------------------------- /tests/test_writer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | tests/test_writer.py - warcprox warc writing tests 3 | 4 | Copyright (C) 2017-2019 Internet Archive 5 | 6 | This program is free software; you can redistribute it and/or 7 | modify it under the terms of the GNU General Public License 8 | as published by the Free 
Software Foundation; either version 2 9 | of the License, or (at your option) any later version. 10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with this program; if not, write to the Free Software 18 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 19 | USA. 20 | ''' 21 | 22 | import os 23 | import fcntl 24 | from multiprocessing import Process, Queue 25 | from datetime import datetime, timedelta 26 | import pytest 27 | import re 28 | from warcprox.mitmproxy import ProxyingRecorder 29 | from warcprox.warcproxy import RecordedUrl 30 | from warcprox.writer import WarcWriter 31 | from warcprox import Options 32 | import time 33 | import warcprox 34 | import io 35 | import tempfile 36 | import logging 37 | import hashlib 38 | import queue 39 | import sys 40 | 41 | logging.basicConfig( 42 | stream=sys.stdout, level=logging.TRACE, 43 | format='%(asctime)s %(process)d %(levelname)s %(threadName)s ' 44 | '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') 45 | 46 | def lock_file(q, filename): 47 | """Try to lock the file and put 'OBTAINED LOCK' or 'FAILED TO OBTAIN LOCK' on the queue. 48 | It is necessary to run this method in a different process to test locking. 49 | """ 50 | try: 51 | fi = open(filename, 'ab') 52 | fcntl.lockf(fi, fcntl.LOCK_EX | fcntl.LOCK_NB) 53 | fi.close() 54 | q.put('OBTAINED LOCK') 55 | except OSError: 56 | q.put('FAILED TO OBTAIN LOCK') 57 | 58 | # Failing with a "queue empty" exception; race condition? 59 | @pytest.mark.xfail 60 | def test_warc_writer_locking(tmpdir): 61 | """Test if WarcWriter is locking WARC files.
62 | When we don't have the .open suffix, WarcWriter locks the file and the 63 | external process trying to ``lock_file`` fails (puts 'FAILED TO OBTAIN LOCK'). 64 | """ 65 | recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com') 66 | recorded_url = RecordedUrl( 67 | url='http://example.com', content_type='text/plain', status=200, 68 | client_ip='127.0.0.2', request_data=b'abc', 69 | response_recorder=recorder, remote_ip='127.0.0.3', 70 | timestamp=datetime.utcnow(), payload_digest=hashlib.sha1()) 71 | 72 | dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer'))) 73 | wwriter = WarcWriter(Options( 74 | directory=dirname, no_warc_open_suffix=True)) 75 | wwriter.write_records(recorded_url) 76 | warcs = [fn for fn in os.listdir(dirname) if fn.endswith('.warc')] 77 | assert warcs 78 | target_warc = os.path.join(dirname, warcs[0]) 79 | # launch another process and try to lock WARC file 80 | q = Queue() 81 | p = Process(target=lock_file, args=(q, target_warc)) 82 | p.start() 83 | p.join() 84 | assert q.get(timeout=5) == 'FAILED TO OBTAIN LOCK' 85 | wwriter.close() 86 | 87 | # locking must succeed after writer has closed the WARC file.
88 | p = Process(target=lock_file, args=(q, target_warc)) 89 | p.start() 90 | p.join() 91 | assert q.get(timeout=5) == 'OBTAINED LOCK' 92 | 93 | def wait(callback, timeout): 94 | start = time.time() 95 | while time.time() - start < timeout: 96 | if callback(): 97 | return 98 | time.sleep(0.5) 99 | raise Exception('timed out waiting for %s to return truthy' % callback) 100 | 101 | def test_special_dont_write_prefix(): 102 | with tempfile.TemporaryDirectory() as tmpdir: 103 | logging.debug('cd %s', tmpdir) 104 | os.chdir(tmpdir) 105 | 106 | wwt = warcprox.writerthread.WarcWriterProcessor(Options(prefix='-')) 107 | wwt.inq = queue.Queue(maxsize=1) 108 | wwt.outq = queue.Queue(maxsize=1) 109 | try: 110 | wwt.start() 111 | # not to be written due to default prefix 112 | recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 113 | recorder.read() 114 | wwt.inq.put(RecordedUrl( 115 | url='http://example.com/no', content_type='text/plain', 116 | status=200, client_ip='127.0.0.2', request_data=b'abc', 117 | response_recorder=recorder, remote_ip='127.0.0.3', 118 | timestamp=datetime.utcnow(), 119 | payload_digest=recorder.block_digest)) 120 | # to be written due to warcprox-meta prefix 121 | recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 122 | recorder.read() 123 | wwt.inq.put(RecordedUrl( 124 | url='http://example.com/yes', content_type='text/plain', 125 | status=200, client_ip='127.0.0.2', request_data=b'abc', 126 | response_recorder=recorder, remote_ip='127.0.0.3', 127 | timestamp=datetime.utcnow(), 128 | payload_digest=recorder.block_digest, 129 | warcprox_meta={'warc-prefix': 'normal-warc-prefix'})) 130 | recorded_url = wwt.outq.get(timeout=10) 131 | assert not recorded_url.warc_records 132 | recorded_url = wwt.outq.get(timeout=10) 133 | assert recorded_url.warc_records 134 | assert wwt.outq.empty() 135 | finally: 136 | wwt.stop.set() 137 | wwt.join() 138 | 139 | wwt = warcprox.writerthread.WarcWriterProcessor( 140 | Options(blackout_period=60, 
prefix='foo')) 141 | wwt.inq = queue.Queue(maxsize=1) 142 | wwt.outq = queue.Queue(maxsize=1) 143 | try: 144 | wwt.start() 145 | # to be written due to default prefix 146 | recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 147 | recorder.read() 148 | wwt.inq.put(RecordedUrl( 149 | url='http://example.com/yes', content_type='text/plain', 150 | status=200, client_ip='127.0.0.2', request_data=b'abc', 151 | response_recorder=recorder, remote_ip='127.0.0.3', 152 | timestamp=datetime.utcnow(), 153 | payload_digest=recorder.block_digest)) 154 | # not to be written due to warcprox-meta prefix 155 | recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 156 | recorder.read() 157 | wwt.inq.put(RecordedUrl( 158 | url='http://example.com/no', content_type='text/plain', 159 | status=200, client_ip='127.0.0.2', request_data=b'abc', 160 | response_recorder=recorder, remote_ip='127.0.0.3', 161 | timestamp=datetime.utcnow(), 162 | payload_digest=recorder.block_digest, 163 | warcprox_meta={'warc-prefix': '-'})) 164 | recorded_url = wwt.outq.get(timeout=10) 165 | assert recorded_url.warc_records 166 | recorded_url = wwt.outq.get(timeout=10) 167 | assert not recorded_url.warc_records 168 | assert wwt.outq.empty() 169 | 170 | # test blackout_period option. Write the first revisit record because 171 | # it's outside the blackout_period (60). Do not write the second 172 | # because it's inside the blackout_period.
173 | recorder = ProxyingRecorder(io.BytesIO(b'test1'), None) 174 | recorder.read() 175 | old = datetime.utcnow() - timedelta(0, 3600) 176 | ru = RecordedUrl( 177 | url='http://example.com/dup', 178 | content_type='text/plain', 179 | status=200, client_ip='127.0.0.2', request_data=b'abc', 180 | response_recorder=recorder, remote_ip='127.0.0.3', 181 | timestamp=datetime.utcnow(), 182 | payload_digest=recorder.block_digest) 183 | ru.dedup_info = dict(id=b'1', url=b'http://example.com/dup', 184 | date=old.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')) 185 | wwt.inq.put(ru) 186 | recorded_url = wwt.outq.get(timeout=10) 187 | recorder = ProxyingRecorder(io.BytesIO(b'test2'), None) 188 | recorder.read() 189 | recent = datetime.utcnow() - timedelta(0, 5) 190 | ru = RecordedUrl( 191 | url='http://example.com/dup', content_type='text/plain', 192 | status=200, client_ip='127.0.0.2', request_data=b'abc', 193 | response_recorder=recorder, remote_ip='127.0.0.3', 194 | timestamp=datetime.utcnow(), 195 | payload_digest=recorder.block_digest) 196 | ru.dedup_info = dict(id=b'2', url=b'http://example.com/dup', 197 | date=recent.strftime('%Y-%m-%dT%H:%M:%SZ').encode('utf-8')) 198 | wwt.inq.put(ru) 199 | assert recorded_url.warc_records 200 | recorded_url = wwt.outq.get(timeout=10) 201 | assert not recorded_url.warc_records 202 | assert wwt.outq.empty() 203 | 204 | finally: 205 | wwt.stop.set() 206 | wwt.join() 207 | 208 | def test_do_not_archive(): 209 | with tempfile.TemporaryDirectory() as tmpdir: 210 | logging.debug('cd %s', tmpdir) 211 | os.chdir(tmpdir) 212 | 213 | wwt = warcprox.writerthread.WarcWriterProcessor() 214 | wwt.inq = queue.Queue(maxsize=1) 215 | wwt.outq = queue.Queue(maxsize=1) 216 | try: 217 | wwt.start() 218 | # to be written -- default do_not_archive False 219 | recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 220 | recorder.read() 221 | wwt.inq.put(RecordedUrl( 222 | url='http://example.com/yes', content_type='text/plain', 223 | status=200, 
client_ip='127.0.0.2', request_data=b'abc', 224 | response_recorder=recorder, remote_ip='127.0.0.3', 225 | timestamp=datetime.utcnow(), 226 | payload_digest=recorder.block_digest)) 227 | # not to be written -- do_not_archive set True 228 | recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 229 | recorder.read() 230 | wwt.inq.put(RecordedUrl( 231 | url='http://example.com/no', content_type='text/plain', 232 | status=200, client_ip='127.0.0.2', request_data=b'abc', 233 | response_recorder=recorder, remote_ip='127.0.0.3', 234 | timestamp=datetime.utcnow(), 235 | payload_digest=recorder.block_digest, 236 | warcprox_meta={'warc-prefix': '-'}, 237 | do_not_archive=True)) 238 | recorded_url = wwt.outq.get(timeout=10) 239 | assert recorded_url.warc_records 240 | recorded_url = wwt.outq.get(timeout=10) 241 | assert not recorded_url.warc_records 242 | assert wwt.outq.empty() 243 | finally: 244 | wwt.stop.set() 245 | wwt.join() 246 | 247 | def test_warc_writer_filename(tmpdir): 248 | """Test if WarcWriter is writing WARC files with custom filenames. 
249 | """ 250 | recorder = ProxyingRecorder(None, None, 'sha1', url='http://example.com') 251 | recorded_url = RecordedUrl( 252 | url='http://example.com', content_type='text/plain', status=200, 253 | client_ip='127.0.0.2', request_data=b'abc', 254 | response_recorder=recorder, remote_ip='127.0.0.3', 255 | timestamp=datetime.utcnow(), payload_digest=hashlib.sha1()) 256 | 257 | dirname = os.path.dirname(str(tmpdir.mkdir('test-warc-writer'))) 258 | wwriter = WarcWriter(Options(directory=dirname, prefix='foo', 259 | warc_filename='{timestamp17}_{prefix}_{timestamp14}_{serialno}')) 260 | wwriter.write_records(recorded_url) 261 | warcs = [fn for fn in os.listdir(dirname)] 262 | assert warcs 263 | assert re.search( 264 | r'\d{17}_foo_\d{14}_00000.warc.open', wwriter.path) 265 | 266 | def test_close_for_prefix(tmpdir): 267 | wwp = warcprox.writerthread.WarcWriterProcessor( 268 | Options(directory=str(tmpdir))) 269 | wwp.inq = queue.Queue(maxsize=1) 270 | wwp.outq = queue.Queue(maxsize=1) 271 | 272 | try: 273 | wwp.start() 274 | 275 | # write a record to the default prefix 276 | recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 277 | recorder.read() 278 | wwp.inq.put(RecordedUrl( 279 | url='http://example.com/1', content_type='text/plain', 280 | status=200, client_ip='127.0.0.2', request_data=b'abc', 281 | response_recorder=recorder, remote_ip='127.0.0.3', 282 | timestamp=datetime.utcnow(), 283 | payload_digest=recorder.block_digest)) 284 | time.sleep(0.5) 285 | rurl = wwp.outq.get() # wait for it to finish 286 | 287 | assert rurl.url == b'http://example.com/1' 288 | assert len(tmpdir.listdir()) == 1 289 | assert tmpdir.listdir()[0].basename.startswith('warcprox-') 290 | assert tmpdir.listdir()[0].basename.endswith('-00000.warc.open') 291 | assert tmpdir.listdir()[0].basename == wwp.writer_pool.default_warc_writer.finalname + '.open' 292 | 293 | # request close of default warc 294 | wwp.close_for_prefix() 295 | 296 | # write a record to some other prefix 297 | 
recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 298 | recorder.read() 299 | wwp.inq.put(RecordedUrl( 300 | url='http://example.com/2', content_type='text/plain', 301 | status=200, client_ip='127.0.0.2', request_data=b'abc', 302 | response_recorder=recorder, remote_ip='127.0.0.3', 303 | timestamp=datetime.utcnow(), 304 | payload_digest=recorder.block_digest, 305 | warcprox_meta={'warc-prefix': 'some-prefix'})) 306 | time.sleep(0.5) 307 | rurl = wwp.outq.get() # wait for it to finish 308 | 309 | assert rurl.url == b'http://example.com/2' 310 | assert len(tmpdir.listdir()) == 2 311 | basenames = sorted(f.basename for f in tmpdir.listdir()) 312 | assert basenames[0].startswith('some-prefix-') 313 | assert basenames[0].endswith('-00000.warc.open') 314 | assert basenames[1].startswith('warcprox-') 315 | assert basenames[1].endswith('-00000.warc') 316 | 317 | # request close of warc with prefix 318 | wwp.close_for_prefix('some-prefix') 319 | 320 | # write another record to the default prefix 321 | recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 322 | recorder.read() 323 | wwp.inq.put(RecordedUrl( 324 | url='http://example.com/3', content_type='text/plain', 325 | status=200, client_ip='127.0.0.2', request_data=b'abc', 326 | response_recorder=recorder, remote_ip='127.0.0.3', 327 | timestamp=datetime.utcnow(), 328 | payload_digest=recorder.block_digest)) 329 | time.sleep(0.5) 330 | rurl = wwp.outq.get() # wait for it to finish 331 | 332 | assert rurl.url == b'http://example.com/3' 333 | # now some-prefix warc is closed and a new default prefix warc is open 334 | basenames = sorted(f.basename for f in tmpdir.listdir()) 335 | assert len(basenames) == 3 336 | assert basenames[0].startswith('some-prefix-') 337 | assert basenames[0].endswith('-00000.warc') 338 | assert basenames[1].startswith('warcprox-') 339 | assert basenames[1].endswith('-00000.warc') 340 | assert basenames[2].startswith('warcprox-') 341 | assert 
basenames[2].endswith('-00001.warc.open') 342 | 343 | # write another record with prefix "some-prefix" 344 | recorder = ProxyingRecorder(io.BytesIO(b'some payload'), None) 345 | recorder.read() 346 | wwp.inq.put(RecordedUrl( 347 | url='http://example.com/4', content_type='text/plain', 348 | status=200, client_ip='127.0.0.2', request_data=b'abc', 349 | response_recorder=recorder, remote_ip='127.0.0.3', 350 | timestamp=datetime.utcnow(), 351 | payload_digest=recorder.block_digest, 352 | warcprox_meta={'warc-prefix': 'some-prefix'})) 353 | time.sleep(0.5) 354 | rurl = wwp.outq.get() # wait for it to finish 355 | 356 | assert rurl.url == b'http://example.com/4' 357 | # new some-prefix warc will have a new random token and start over at 358 | # serial 00000 359 | basenames = sorted(f.basename for f in tmpdir.listdir()) 360 | assert len(basenames) == 4 361 | assert basenames[0].startswith('some-prefix-') 362 | assert basenames[1].startswith('some-prefix-') 363 | # order of these two warcs depends on random token so we don't know 364 | # which is which 365 | assert basenames[0][-5:] != basenames[1][-5:] 366 | assert '-00000.' in basenames[0] 367 | assert '-00000.'
in basenames[1] 368 | 369 | assert basenames[2].startswith('warcprox-') 370 | assert basenames[2].endswith('-00000.warc') 371 | assert basenames[3].startswith('warcprox-') 372 | assert basenames[3].endswith('-00001.warc.open') 373 | 374 | finally: 375 | wwp.stop.set() 376 | wwp.join() 377 | -------------------------------------------------------------------------------- /warcprox/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | warcprox/__init__.py - warcprox package main file, contains some utility code 3 | 4 | Copyright (C) 2013-2021 Internet Archive 5 | 6 | This program is free software; you can redistribute it and/or 7 | modify it under the terms of the GNU General Public License 8 | as published by the Free Software Foundation; either version 2 9 | of the License, or (at your option) any later version. 10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with this program; if not, write to the Free Software 18 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 19 | USA. 
20 | """ 21 | 22 | import sys 23 | import datetime 24 | import threading 25 | import time 26 | import logging 27 | from argparse import Namespace as _Namespace 28 | from importlib.metadata import version as _version 29 | import concurrent.futures 30 | import queue 31 | import json 32 | import base64 33 | __version__ = _version('warcprox') 34 | 35 | def digest_str(hash_obj, base32=False): 36 | import base64 37 | return hash_obj.name.encode('utf-8') + b':' + ( 38 | base64.b32encode(hash_obj.digest()) if base32 39 | else hash_obj.hexdigest().encode('ascii')) 40 | 41 | class Options(_Namespace): 42 | def __getattr__(self, name): 43 | try: 44 | return super().__getattribute__(name) 45 | except AttributeError: 46 | return None 47 | 48 | class Jsonner(json.JSONEncoder): 49 | def default(self, o): 50 | if isinstance(o, datetime.datetime): 51 | return o.isoformat() 52 | elif isinstance(o, bytes): 53 | return base64.b64encode(o).decode('ascii') 54 | else: 55 | return json.JSONEncoder.default(self, o) 56 | 57 | # XXX linux-specific 58 | def gettid(): 59 | try: 60 | import ctypes 61 | libc = ctypes.cdll.LoadLibrary('libc.so.6') 62 | SYS_gettid = 186 63 | tid = libc.syscall(SYS_gettid) 64 | return tid 65 | except Exception: 66 | return "n/a" 67 | 68 | class RequestBlockedByRule(Exception): 69 | """ 70 | An exception raised when a request should be blocked to respect a 71 | Warcprox-Meta rule. 72 | """ 73 | def __init__(self, msg): 74 | self.msg = msg 75 | def __str__(self): 76 | return "{}: {}".format(self.__class__.__name__, self.msg) 77 | 78 | class BadRequest(Exception): 79 | ''' 80 | Raised in case of a request deemed unacceptable by warcprox.
81 | ''' 82 | def __init__(self, msg): 83 | self.msg = msg 84 | def __str__(self): 85 | return "{}: {}".format(self.__class__.__name__, self.msg) 86 | 87 | class BasePostfetchProcessor(threading.Thread): 88 | logger = logging.getLogger("warcprox.BasePostfetchProcessor") 89 | 90 | def __init__(self, options=Options(), controller=None, **kwargs): 91 | threading.Thread.__init__(self, name=self.__class__.__name__) 92 | self.options = options 93 | self.controller = controller 94 | 95 | self.stop = threading.Event() 96 | 97 | # these should be set by the caller before thread is started 98 | self.inq = None 99 | self.outq = None 100 | self.profiler = None 101 | 102 | def run(self): 103 | try: 104 | if self.options.profile: 105 | import cProfile 106 | self.profiler = cProfile.Profile() 107 | self.profiler.enable() 108 | self._run() 109 | self.profiler.disable() 110 | else: 111 | self._run() 112 | except: 113 | self.logger.critical( 114 | '%s dying due to uncaught exception', 115 | self.name, exc_info=True) 116 | 117 | def _get_process_put(self): 118 | ''' 119 | Get url(s) from `self.inq`, process url(s), queue to `self.outq`. 120 | 121 | Subclasses must implement this. Implementations may operate on 122 | individual urls, or on batches. 123 | 124 | May raise queue.Empty. 
125 | ''' 126 | raise Exception('not implemented') 127 | 128 | def _run(self): 129 | threading.current_thread().name = '{}(tid={})'.format( 130 | threading.current_thread().name, gettid()) 131 | self.logger.info('%s starting up', self) 132 | self._startup() 133 | while not self.stop.is_set(): 134 | try: 135 | while True: 136 | try: 137 | self._get_process_put() 138 | except queue.Empty: 139 | if self.stop.is_set(): 140 | break 141 | self.logger.info('%s shutting down', self) 142 | self._shutdown() 143 | except Exception as e: 144 | if isinstance(e, OSError) and e.errno == 28: 145 | # OSError: [Errno 28] No space left on device 146 | self.logger.critical( 147 | 'shutting down due to fatal problem: %s: %s', 148 | e.__class__.__name__, e) 149 | self._shutdown() 150 | sys.exit(1) 151 | 152 | self.logger.critical( 153 | '%s will try to continue after unexpected error', 154 | self.name, exc_info=True) 155 | time.sleep(0.5) 156 | 157 | def _startup(self): 158 | pass 159 | 160 | def _shutdown(self): 161 | pass 162 | 163 | class BaseStandardPostfetchProcessor(BasePostfetchProcessor): 164 | def _get_process_put(self): 165 | recorded_url = self.inq.get(block=True, timeout=0.5) 166 | self._process_url(recorded_url) 167 | if self.outq: 168 | self.outq.put(recorded_url) 169 | 170 | def _process_url(self, recorded_url): 171 | raise Exception('not implemented') 172 | 173 | class BaseBatchPostfetchProcessor(BasePostfetchProcessor): 174 | MAX_BATCH_SIZE = 500 175 | MAX_BATCH_SEC = 60 176 | MIN_BATCH_SEC = 30 177 | # these updated batch seconds values have resulted in fewer reported dedup 178 | # errors and otherwise have worked well in qa 179 | 180 | def _get_process_put(self): 181 | batch = [] 182 | start = time.time() 183 | 184 | while True: 185 | try: 186 | batch.append(self.inq.get(block=True, timeout=0.5)) 187 | except queue.Empty: 188 | if self.stop.is_set(): 189 | break 190 | # else maybe keep adding to the batch 191 | 192 | if len(batch) >= self.MAX_BATCH_SIZE: 193 | break # 
full batch 194 | 195 | elapsed = time.time() - start 196 | if elapsed >= self.MAX_BATCH_SEC: 197 | break # been batching for a while 198 | 199 | if (elapsed >= self.MIN_BATCH_SEC and self.outq 200 | and len(self.outq.queue) == 0): 201 | break # next processor is waiting on us 202 | 203 | if not batch: 204 | raise queue.Empty 205 | 206 | self.logger.info( 207 | 'gathered batch of %s in %0.2f sec', 208 | len(batch), time.time() - start) 209 | self._process_batch(batch) 210 | 211 | if self.outq: 212 | for recorded_url in batch: 213 | self.outq.put(recorded_url) 214 | 215 | def _process_batch(self, batch): 216 | raise Exception('not implemented') 217 | 218 | class ListenerPostfetchProcessor(BaseStandardPostfetchProcessor): 219 | def __init__(self, listener, options=Options(), controller=None, **kwargs): 220 | BaseStandardPostfetchProcessor.__init__(self, options, controller, **kwargs) 221 | self.listener = listener 222 | self.name = listener.__class__.__name__ 223 | 224 | def _process_url(self, recorded_url): 225 | return self.listener.notify(recorded_url, recorded_url.warc_records) 226 | 227 | def start(self): 228 | if hasattr(self.listener, 'start'): 229 | self.listener.start() 230 | BaseStandardPostfetchProcessor.start(self) 231 | 232 | def _shutdown(self): 233 | if hasattr(self.listener, 'stop'): 234 | try: 235 | self.listener.stop() 236 | except: 237 | self.logger.error( 238 | '%s raised exception', self.listener.stop, exc_info=True) 239 | 240 | def timestamp17(): 241 | now = datetime.datetime.utcnow() 242 | return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond//1000) 243 | 244 | def timestamp14(): 245 | now = datetime.datetime.utcnow() 246 | return '{:%Y%m%d%H%M%S}'.format(now) 247 | 248 | # monkey-patch log levels TRACE and NOTICE 249 | logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2 250 | def _logger_trace(self, msg, *args, **kwargs): 251 | if self.isEnabledFor(logging.TRACE): 252 | self._log(logging.TRACE, msg, args, **kwargs) 253 | 
logging.Logger.trace = _logger_trace 254 | logging.trace = logging.root.trace 255 | logging.addLevelName(logging.TRACE, 'TRACE') 256 | 257 | logging.NOTICE = (logging.INFO + logging.WARN) // 2 258 | def _logger_notice(self, msg, *args, **kwargs): 259 | if self.isEnabledFor(logging.NOTICE): 260 | self._log(logging.NOTICE, msg, args, **kwargs) 261 | logging.Logger.notice = _logger_notice 262 | logging.notice = logging.root.notice 263 | logging.addLevelName(logging.NOTICE, 'NOTICE') 264 | 265 | import warcprox.controller as controller 266 | import warcprox.playback as playback 267 | import warcprox.dedup as dedup 268 | import warcprox.warcproxy as warcproxy 269 | import warcprox.mitmproxy as mitmproxy 270 | import warcprox.writer as writer 271 | import warcprox.warc as warc 272 | import warcprox.writerthread as writerthread 273 | import warcprox.stats as stats 274 | import warcprox.bigtable as bigtable 275 | import warcprox.crawl_log as crawl_log 276 | -------------------------------------------------------------------------------- /warcprox/bigtable.py: -------------------------------------------------------------------------------- 1 | """ 2 | warcprox/bigtable.py - module for "big" RethinkDB table for deduplication; 3 | the table is "big" in the sense that it is designed to be usable as an index 4 | for playback software outside of warcprox, and contains information not 5 | needed merely for deduplication 6 | 7 | Copyright (C) 2015-2016 Internet Archive 8 | 9 | This program is free software; you can redistribute it and/or 10 | modify it under the terms of the GNU General Public License 11 | as published by the Free Software Foundation; either version 2 12 | of the License, or (at your option) any later version. 13 | 14 | This program is distributed in the hope that it will be useful, 15 | but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | GNU General Public License for more details. 
18 | 19 | You should have received a copy of the GNU General Public License 20 | along with this program; if not, write to the Free Software 21 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 22 | USA. 23 | """ 24 | import logging 25 | import warcprox 26 | import base64 27 | import urlcanon 28 | import os 29 | import hashlib 30 | import threading 31 | import datetime 32 | import doublethink 33 | from rethinkdb import RethinkDB; r = RethinkDB() 34 | from warcprox.dedup import DedupableMixin 35 | 36 | class RethinkCaptures: 37 | """Inserts in batches every 0.5 seconds""" 38 | logger = logging.getLogger("warcprox.bigtable.RethinkCaptures") 39 | 40 | def __init__(self, options=warcprox.Options()): 41 | parsed = doublethink.parse_rethinkdb_url( 42 | options.rethinkdb_big_table_url) 43 | self.rr = doublethink.Rethinker( 44 | servers=parsed.hosts, db=parsed.database) 45 | self.table = parsed.table 46 | self.options = options 47 | self._ensure_db_table() 48 | 49 | self._stop = threading.Event() 50 | self._batch_lock = threading.RLock() 51 | with self._batch_lock: 52 | self._batch = [] 53 | self._timer = None 54 | 55 | def start(self): 56 | """Starts batch insert repeating timer""" 57 | self._insert_batch() 58 | 59 | def _insert_batch(self): 60 | try: 61 | with self._batch_lock: 62 | if len(self._batch) > 0: 63 | result = self.rr.table(self.table).insert( 64 | self._batch, conflict="replace").run() 65 | if (result["inserted"] + result["replaced"] 66 | + result["unchanged"] != len(self._batch)): 67 | raise Exception( 68 | "unexpected result saving batch of %s: %s " 69 | "entries" % (len(self._batch), result)) 70 | if result["replaced"] > 0 or result["unchanged"] > 0: 71 | self.logger.warning( 72 | "inserted=%s replaced=%s unchanged=%s in big " 73 | "captures table (normally replaced=0 and " 74 | "unchanged=0)", result["inserted"], 75 | result["replaced"], result["unchanged"]) 76 | else: 77 | self.logger.debug( 78 | "inserted %s entries to big 
captures table", 79 | len(self._batch)) 80 | self._batch = [] 81 | except BaseException as e: 82 | self.logger.error( 83 | "caught exception trying to save %s entries, they will " 84 | "be included in the next batch", len(self._batch), 85 | exc_info=True) 86 | finally: 87 | if not self._stop.is_set(): 88 | t = threading.Timer(0.5, self._insert_batch) 89 | t.name = "RethinkCaptures-batch-insert-timer-%s" % datetime.datetime.utcnow().isoformat() 90 | t.start() 91 | # ensure self._timer joinable (already started) whenever 92 | # close() happens to be called 93 | self._timer = t 94 | else: 95 | self.logger.info("finished") 96 | 97 | def _ensure_db_table(self): 98 | dbs = self.rr.db_list().run() 99 | if not self.rr.dbname in dbs: 100 | self.logger.info("creating rethinkdb database %r", self.rr.dbname) 101 | self.rr.db_create(self.rr.dbname).run() 102 | tables = self.rr.table_list().run() 103 | if not self.table in tables: 104 | self.logger.info( 105 | "creating rethinkdb table %r in database %r", 106 | self.table, self.rr.dbname) 107 | self.rr.table_create( 108 | self.table, shards=len(self.rr.servers), 109 | replicas=min(3, len(self.rr.servers))).run() 110 | self.rr.table(self.table).index_create( 111 | "abbr_canon_surt_timestamp", 112 | [r.row["abbr_canon_surt"], r.row["timestamp"]]).run() 113 | self.rr.table(self.table).index_create("sha1_warc_type", [ 114 | r.row["sha1base32"], r.row["warc_type"], r.row["bucket"]]).run() 115 | self.rr.table(self.table).index_wait().run() 116 | 117 | def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"): 118 | if algo != "sha1": 119 | raise Exception( 120 | "digest type is %r but big captures table is indexed by " 121 | "sha1" % algo) 122 | sha1base32 = base64.b32encode(raw_digest).decode("utf-8") 123 | results_iter = self.rr.table(self.table).get_all( 124 | [sha1base32, "response", bucket], 125 | index="sha1_warc_type").filter( 126 | r.row["dedup_ok"], default=True).run() 127 | results = list(results_iter) 
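An aside on the digest keys used just above: `find_response_by_digest` looks captures up by the base32 encoding of the raw sha1 payload digest, and `RethinkCapturesDedup.lookup` further down converts a `"sha1:<hex-or-base32>"` dedup key into that form. A minimal standalone sketch of this round trip; the helper name `digest_key_to_sha1base32` is illustrative, not part of warcprox:

```python
import base64
import hashlib

def digest_key_to_sha1base32(digest_key, base32=False):
    """Convert a "sha1:..." dedup key to the base32 form the captures index uses.

    Illustrative helper (not warcprox API): mirrors how the lookup decodes the
    hex (or base32, when --base32 is in effect) digest to raw bytes, which are
    then re-encoded as base32 for the "sha1_warc_type" index.
    """
    k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key
    algo, value = k.split(":", 1)
    assert algo == "sha1"
    raw = (base64.b32decode(value, casefold=True) if base32
           else base64.b16decode(value, casefold=True))
    return base64.b32encode(raw).decode("utf-8")

# a sha1 digest is 20 bytes, so its base32 form is exactly 32 chars, unpadded
payload = b"hello warc"
hex_key = "sha1:" + hashlib.sha1(payload).hexdigest()
b32 = digest_key_to_sha1base32(hex_key)
```
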
128 | if len(results) > 0: 129 | if len(results) > 1: 130 | self.logger.debug( 131 | "expected 0 or 1 but found %r results for " 132 | "sha1base32=%r bucket=%r (will use first result)", 133 | len(results), sha1base32, bucket) 134 | result = results[0] 135 | else: 136 | result = None 137 | self.logger.debug("returning %r for sha1base32=%r bucket=%r", 138 | result, sha1base32, bucket) 139 | return result 140 | 141 | def _assemble_entry(self, recorded_url, records): 142 | if recorded_url.payload_digest: 143 | if recorded_url.payload_digest.name == "sha1": 144 | sha1base32 = base64.b32encode( 145 | recorded_url.payload_digest.digest() 146 | ).decode("utf-8") 147 | else: 148 | self.logger.warning( 149 | "digest type is %r but big captures table is indexed " 150 | "by sha1", 151 | recorded_url.payload_digest.name) 152 | else: 153 | digest = hashlib.new("sha1", records[0].content[1]) 154 | sha1base32 = base64.b32encode(digest.digest()).decode("utf-8") 155 | 156 | if (recorded_url.warcprox_meta 157 | and "dedup-buckets" in recorded_url.warcprox_meta): 158 | for bucket, bucket_mode in recorded_url.warcprox_meta["dedup-buckets"].items(): 159 | if not bucket_mode == 'ro': 160 | # maybe this is the right thing to do here? or should we return an entry for each? or ? 
161 | break 162 | else: 163 | bucket = "__unspecified__" 164 | 165 | canon_surt = urlcanon.semantic(recorded_url.url).surt().decode('ascii') 166 | 167 | entry = { 168 | # id only specified for rethinkdb partitioning 169 | "id": "{} {}".format( 170 | canon_surt[:20], records[0].id.decode("utf-8")[10:-1]), 171 | "abbr_canon_surt": canon_surt[:150], 172 | "canon_surt": canon_surt, 173 | "timestamp": recorded_url.timestamp.replace( 174 | tzinfo=doublethink.UTC), 175 | "url": recorded_url.url.decode("utf-8"), 176 | "offset": records[0].offset, 177 | "filename": os.path.basename(records[0].warc_filename), 178 | "warc_type": records[0].type.decode("utf-8"), 179 | "warc_id": records[0].id.decode("utf-8"), 180 | "sha1base32": sha1base32, 181 | "content_type": recorded_url.mimetype, 182 | "response_code": recorded_url.status, 183 | "http_method": recorded_url.method, 184 | "bucket": bucket, 185 | "record_length": records[0].length, # compressed (or not) length of 186 | # warc record including record 187 | # headers 188 | "wire_bytes": recorded_url.size, # count of bytes transferred over 189 | # the wire, including http headers 190 | # if any 191 | } 192 | 193 | if recorded_url.warcprox_meta: 194 | if "dedup-ok" in recorded_url.warcprox_meta: 195 | entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"] 196 | if "captures-table-extra-fields" in recorded_url.warcprox_meta: 197 | extras = recorded_url.warcprox_meta[ 198 | "captures-table-extra-fields"] 199 | for extra_field in extras: 200 | entry[extra_field] = extras[extra_field] 201 | 202 | return entry 203 | 204 | def notify(self, recorded_url, records): 205 | if records: 206 | entry = self._assemble_entry(recorded_url, records) 207 | with self._batch_lock: 208 | self._batch.append(entry) 209 | 210 | def close(self): 211 | self.stop() 212 | 213 | def stop(self): 214 | self.logger.info("closing rethinkdb captures table") 215 | self._stop.set() 216 | if self._timer: 217 | self._timer.join() 218 | 219 | class 
RethinkCapturesDedup(warcprox.dedup.DedupDb, DedupableMixin): 220 | logger = logging.getLogger("warcprox.dedup.RethinkCapturesDedup") 221 | 222 | def __init__(self, options=warcprox.Options()): 223 | DedupableMixin.__init__(self, options) 224 | self.captures_db = RethinkCaptures(options=options) 225 | self.options = options 226 | 227 | def lookup(self, digest_key, bucket="__unspecified__", url=None): 228 | k = digest_key.decode("utf-8") if isinstance(digest_key, bytes) else digest_key 229 | algo, value_str = k.split(":") 230 | if self.options.base32: 231 | raw_digest = base64.b32decode(value_str, casefold=True) 232 | else: 233 | raw_digest = base64.b16decode(value_str, casefold=True) 234 | entry = self.captures_db.find_response_by_digest(algo, raw_digest, bucket) 235 | if entry: 236 | dedup_info = { 237 | "url": entry["url"].encode("utf-8"), 238 | "date": entry["timestamp"].strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8"), 239 | } 240 | if "warc_id" in entry: 241 | dedup_info["id"] = entry["warc_id"].encode("utf-8") 242 | return dedup_info 243 | else: 244 | return None 245 | 246 | def start(self): 247 | self.captures_db.start() 248 | 249 | def stop(self): 250 | self.captures_db.stop() 251 | 252 | def close(self): 253 | self.captures_db.close() 254 | 255 | def notify(self, recorded_url, records): 256 | self.captures_db.notify(recorded_url, records) 257 | -------------------------------------------------------------------------------- /warcprox/certauth.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from argparse import ArgumentParser 4 | from datetime import datetime, timedelta 5 | import threading 6 | 7 | from cryptography import x509 8 | from cryptography.hazmat.backends import default_backend 9 | from cryptography.hazmat.primitives import hashes, serialization 10 | from cryptography.hazmat.primitives.asymmetric import rsa 11 | from cryptography.x509.oid import NameOID 12 | 13 | # 
================================================================= 14 | # Valid for 3 years from now 15 | # Max validity is 39 months: 16 | # https://casecurity.org/2015/02/19/ssl-certificate-validity-periods-limited-to-39-months-starting-in-april/ 17 | CERT_NOT_AFTER = 3 * 365 * 24 * 60 * 60 18 | 19 | CERTS_DIR = './ca/certs/' 20 | 21 | CERT_NAME = 'certauth sample CA' 22 | 23 | DEF_HASH_FUNC = hashes.SHA256() 24 | 25 | 26 | # ================================================================= 27 | class CertificateAuthority: 28 | """ 29 | Utility class for signing individual certificate 30 | with a root cert. 31 | 32 | Static generate_ca_root() method for creating the root cert 33 | 34 | All certs saved on filesystem. Individual certs are stored 35 | in specified certs_dir and reused if previously created. 36 | """ 37 | 38 | def __init__(self, ca_file, certs_dir, ca_name, 39 | overwrite=False, 40 | cert_not_before=0, 41 | cert_not_after=CERT_NOT_AFTER): 42 | 43 | assert(ca_file) 44 | self.ca_file = ca_file 45 | 46 | assert(certs_dir) 47 | self.certs_dir = certs_dir 48 | 49 | assert(ca_name) 50 | self.ca_name = ca_name 51 | 52 | self._file_created = False 53 | 54 | self.cert_not_before = cert_not_before 55 | self.cert_not_after = cert_not_after 56 | 57 | if not os.path.exists(certs_dir): 58 | os.makedirs(certs_dir) 59 | 60 | # if file doesn't exist or overwrite is true 61 | # create new root cert 62 | if (overwrite or not os.path.isfile(ca_file)): 63 | self.cert, self.key = self.generate_ca_root(ca_file, ca_name) 64 | self._file_created = True 65 | 66 | # read previously created root cert 67 | else: 68 | self.cert, self.key = self.read_pem(ca_file) 69 | 70 | self._lock = threading.Lock() 71 | 72 | def cert_for_host(self, host, overwrite=False, wildcard=False): 73 | with self._lock: 74 | host_filename = os.path.join(self.certs_dir, host) + '.pem' 75 | 76 | if not overwrite and os.path.exists(host_filename): 77 | self._file_created = False 78 | return host_filename 79 
| 80 | self.generate_host_cert(host, self.cert, self.key, host_filename, 81 | wildcard) 82 | 83 | self._file_created = True 84 | return host_filename 85 | 86 | def get_wildcard_cert(self, cert_host): 87 | host_parts = cert_host.split('.', 1) 88 | if len(host_parts) == 2 and '.' in host_parts[1]: 89 | cert_host = host_parts[1] 90 | 91 | certfile = self.cert_for_host(cert_host, 92 | wildcard=True) 93 | 94 | return certfile 95 | 96 | def get_root_PKCS12(self): 97 | return serialization.pkcs12.serialize_key_and_certificates( 98 | name=b"root", 99 | key=self.key, 100 | cert=self.cert, 101 | cas=None, 102 | encryption_algorithm=serialization.NoEncryption() 103 | ) 104 | 105 | def _make_cert(self, certname): 106 | subject = issuer = x509.Name([ 107 | x509.NameAttribute(NameOID.COMMON_NAME, certname), 108 | ]) 109 | cert = x509.CertificateBuilder().subject_name( 110 | subject 111 | ).issuer_name( 112 | issuer 113 | ).public_key( 114 | self.key.public_key() 115 | ).serial_number( 116 | random.randint(0, 2**64 - 1) 117 | ).not_valid_before( 118 | datetime.utcnow() 119 | ).not_valid_after( 120 | datetime.utcnow() + timedelta(seconds=self.cert_not_after) 121 | ).add_extension( 122 | x509.BasicConstraints(ca=True, path_length=0), critical=True, 123 | ).add_extension( 124 | x509.KeyUsage(key_cert_sign=True, crl_sign=True, digital_signature=False, 125 | content_commitment=False, key_encipherment=False, 126 | data_encipherment=False, key_agreement=False, encipher_only=False, 127 | decipher_only=False), critical=True 128 | ).add_extension( 129 | x509.SubjectKeyIdentifier.from_public_key(self.key.public_key()), critical=False 130 | ).sign(self.key, DEF_HASH_FUNC, default_backend()) 131 | return cert 132 | 133 | def generate_ca_root(self, ca_file, ca_name, hash_func=DEF_HASH_FUNC): 134 | # Generate key 135 | key = rsa.generate_private_key( 136 | public_exponent=65537, 137 | key_size=2048, 138 | backend=default_backend() 139 | ) 140 | 141 | # Generate cert 142 | self.key = key 143 | 
cert = self._make_cert(ca_name) 144 | 145 | # Write cert + key 146 | self.write_pem(ca_file, cert, key) 147 | return cert, key 148 | 149 | def generate_host_cert(self, host, root_cert, root_key, host_filename, 150 | wildcard=False, hash_func=DEF_HASH_FUNC): 151 | 152 | host = host.encode('utf-8') 153 | 154 | # Generate CSR 155 | csr = x509.CertificateSigningRequestBuilder().subject_name( 156 | x509.Name([ 157 | x509.NameAttribute(NameOID.COMMON_NAME, host.decode('utf-8')), 158 | ]) 159 | ).sign(self.key, hash_func, default_backend()) 160 | 161 | # Generate Cert 162 | cert_builder = x509.CertificateBuilder().subject_name( 163 | csr.subject 164 | ).issuer_name( 165 | root_cert.subject 166 | ).public_key( 167 | csr.public_key() 168 | ).serial_number( 169 | random.randint(0, 2**64 - 1) 170 | ).not_valid_before( 171 | datetime.utcnow() 172 | ).not_valid_after( 173 | datetime.utcnow() + timedelta(seconds=self.cert_not_after) 174 | ) 175 | 176 | if wildcard: 177 | cert_builder = cert_builder.add_extension( 178 | x509.SubjectAlternativeName([ 179 | x509.DNSName(host.decode('utf-8')), 180 | x509.DNSName('*.' 
+ host.decode('utf-8')), 181 | ]), 182 | critical=False, 183 | ) 184 | 185 | cert = cert_builder.sign(root_key, hash_func, default_backend()) 186 | 187 | # Write cert + key 188 | self.write_pem(host_filename, cert, self.key) 189 | return cert, self.key 190 | 191 | def write_pem(self, filename, cert, key): 192 | with open(filename, 'wb+') as f: 193 | f.write(key.private_bytes( 194 | encoding=serialization.Encoding.PEM, 195 | format=serialization.PrivateFormat.TraditionalOpenSSL, 196 | encryption_algorithm=serialization.NoEncryption() 197 | )) 198 | f.write(cert.public_bytes(serialization.Encoding.PEM)) 199 | 200 | def read_pem(self, filename): 201 | with open(filename, 'rb') as f: 202 | cert = x509.load_pem_x509_certificate(f.read(), default_backend()) 203 | f.seek(0) 204 | key = serialization.load_pem_private_key(f.read(), password=None, backend=default_backend()) 205 | 206 | return cert, key 207 | 208 | 209 | # ================================================================= 210 | def main(args=None): 211 | parser = ArgumentParser(description='Certificate Authority Cert Maker Tools') 212 | 213 | parser.add_argument('root_ca_cert', 214 | help='Path to existing or new root CA file') 215 | 216 | parser.add_argument('-c', '--certname', action='store', default=CERT_NAME, 217 | help='Name for root certificate') 218 | 219 | parser.add_argument('-n', '--hostname', 220 | help='Hostname certificate to create') 221 | 222 | parser.add_argument('-d', '--certs-dir', default=CERTS_DIR, 223 | help='Directory for host certificates') 224 | 225 | parser.add_argument('-f', '--force', action='store_true', 226 | help='Overwrite certificates if they already exist') 227 | 228 | parser.add_argument('-w', '--wildcard_cert', action='store_true', 229 | help='add wildcard SAN to host: *., ') 230 | 231 | r = parser.parse_args(args=args) 232 | 233 | certs_dir = r.certs_dir 234 | wildcard = r.wildcard_cert 235 | 236 | root_cert = r.root_ca_cert 237 | hostname = r.hostname 238 | 239 | if not 
hostname: 240 | overwrite = r.force 241 | else: 242 | overwrite = False 243 | 244 | ca = CertificateAuthority(ca_file=root_cert, 245 | certs_dir=r.certs_dir, 246 | ca_name=r.certname, 247 | overwrite=overwrite) 248 | 249 | # Just creating the root cert 250 | if not hostname: 251 | if ca._file_created: 252 | print('Created new root cert: "' + root_cert + '"') 253 | return 0 254 | else: 255 | print('Root cert "' + root_cert + 256 | '" already exists,' + ' use -f to overwrite') 257 | return 1 258 | 259 | # Sign a certificate for a given host 260 | overwrite = r.force 261 | host_filename = ca.cert_for_host(hostname, 262 | overwrite, wildcard) 263 | 264 | if ca._file_created: 265 | print('Created new cert "' + hostname + 266 | '" signed by root cert ' + 267 | root_cert) 268 | return 0 269 | 270 | else: 271 | print('Cert for "' + hostname + '" already exists,' + 272 | ' use -f to overwrite') 273 | return 1 274 | 275 | 276 | if __name__ == "__main__": #pragma: no cover 277 | main() 278 | -------------------------------------------------------------------------------- /warcprox/crawl_log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | warcprox/crawl_log.py - heritrix-style crawl logger 4 | 5 | Copyright (C) 2017 Internet Archive 6 | 7 | This program is free software; you can redistribute it and/or 8 | modify it under the terms of the GNU General Public License 9 | as published by the Free Software Foundation; either version 2 10 | of the License, or (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 
16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 20 | USA. 21 | ''' 22 | import logging 23 | import datetime 24 | import json 25 | import os 26 | import warcprox 27 | import socket 28 | import rfc3986 29 | from urllib3.exceptions import TimeoutError, HTTPError, NewConnectionError, MaxRetryError 30 | 31 | class CrawlLogger: 32 | def __init__(self, dir_, options=warcprox.Options()): 33 | self.dir = dir_ 34 | self.options = options 35 | self.hostname = socket.gethostname().split('.', 1)[0] 36 | 37 | def start(self): 38 | if not os.path.exists(self.dir): 39 | logging.info('creating directory %r', self.dir) 40 | os.mkdir(self.dir) 41 | 42 | def notify(self, recorded_url, records): 43 | # 2017-08-03T21:45:24.496Z 200 2189 https://autismcouncil.wisconsin.gov/robots.txt P https://autismcouncil.wisconsin.gov/ text/plain #001 20170803214523617+365 sha1:PBS2CEF7B4OSEXZZF3QE2XN2VHYCPNPX https://autismcouncil.wisconsin.gov/ duplicate:digest {"warcFileOffset":942,"contentSize":2495,"warcFilename":"ARCHIVEIT-2159-TEST-JOB319150-20170803214522386-00000.warc.gz"} 44 | now = datetime.datetime.utcnow() 45 | status = self.get_artificial_status(recorded_url) 46 | extra_info = {'contentSize': recorded_url.size,} if recorded_url.size is not None and recorded_url.size > 0 else {} 47 | if hasattr(recorded_url, 'exception') and recorded_url.exception is not None: 48 | extra_info['exception'] = str(recorded_url.exception).replace(" ", "_") 49 | if(hasattr(recorded_url, 'message') and recorded_url.message is not None): 50 | extra_info['exceptionMessage'] = str(recorded_url.message).replace(" ", "_") 51 | if records: 52 | extra_info['warcFilename'] = records[0].warc_filename 53 | extra_info['warcFileOffset'] = records[0].offset 54 | if recorded_url.method != 'GET': 55 | extra_info['method'] = recorded_url.method 56 | if 
recorded_url.response_recorder: 57 | content_length = recorded_url.response_recorder.len - recorded_url.response_recorder.payload_offset 58 | payload_digest = warcprox.digest_str( 59 | recorded_url.payload_digest, 60 | self.options.base32) 61 | elif records is not None and len(records) > 0: 62 | # WARCPROX_WRITE_RECORD request 63 | content_length = int(records[0].get_header(b'Content-Length')) 64 | payload_digest = records[0].get_header(b'WARC-Payload-Digest') 65 | else: 66 | content_length = 0 67 | payload_digest = '-' 68 | logging.info('warcprox_meta %s' , recorded_url.warcprox_meta) 69 | 70 | hop_path = recorded_url.warcprox_meta.get('metadata', {}).get('hop_path') 71 | #URLs are url encoded into plain ascii urls by HTTP spec. Since we're comparing against those, our urls sent over the json blob need to be encoded similarly 72 | brozzled_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('brozzled_url')) 73 | hop_via_url = canonicalize_url(recorded_url.warcprox_meta.get('metadata', {}).get('hop_via_url')) 74 | 75 | if hop_path is None and brozzled_url is None and hop_via_url is None: 76 | #No hop info headers provided 77 | hop_path = "-" 78 | via_url = recorded_url.referer or '-' 79 | else: 80 | if hop_path is None: 81 | hop_path = "-" 82 | if hop_via_url is None: 83 | hop_via_url = "-" 84 | #Prefer referer header. Otherwise use provided via_url 85 | via_url = recorded_url.referer or hop_via_url if hop_path != "-" else "-" 86 | logging.info('brozzled_url:%s recorded_url:%s' , brozzled_url, recorded_url.url) 87 | if brozzled_url != recorded_url.url.decode('ascii') and "brozzled_url" in recorded_url.warcprox_meta.get('metadata', {}).keys(): 88 | #Requested page is not the Brozzled url, thus we are an embed or redirect. 
89 | via_url = brozzled_url 90 | hop_path = "B" if hop_path == "-" else "".join([hop_path,"B"]) 91 | 92 | fields = [ 93 | '{:%Y-%m-%dT%H:%M:%S}.{:03d}Z'.format(now, now.microsecond//1000), 94 | '% 5s' % status, 95 | '% 10s' % content_length, 96 | recorded_url.url, 97 | hop_path, 98 | via_url, 99 | recorded_url.mimetype if recorded_url.mimetype is not None and recorded_url.mimetype.strip() else '-', 100 | '-', 101 | '{:%Y%m%d%H%M%S}{:03d}+{:03d}'.format( 102 | recorded_url.timestamp, 103 | recorded_url.timestamp.microsecond//1000, 104 | recorded_url.duration.microseconds//1000) if (recorded_url.timestamp is not None and recorded_url.duration is not None) else '-', 105 | payload_digest, 106 | recorded_url.warcprox_meta.get('metadata', {}).get('seed', '-'), 107 | 'duplicate:digest' if records and records[0].type == b'revisit' else '-', 108 | json.dumps(extra_info, separators=(',',':')), 109 | ] 110 | for i in range(len(fields)): 111 | # `fields` is a mix of `bytes` and `unicode`, make them all `bytes` 112 | try: 113 | fields[i] = fields[i].encode('utf-8') 114 | except: 115 | pass 116 | line = b' '.join(fields) + b'\n' 117 | prefix = recorded_url.warcprox_meta.get('warc-prefix', 'crawl') 118 | filename = '{}-{}-{}.log'.format( 119 | prefix, self.hostname, self.options.server_port) 120 | crawl_log_path = os.path.join(self.dir, filename) 121 | 122 | with open(crawl_log_path, 'ab') as f: 123 | f.write(line) 124 | 125 | def get_artificial_status(self, recorded_url): 126 | # urllib3 Does not specify DNS errors. We must parse them from the exception string. 127 | # Unfortunately, the errors are reported differently on different systems. 
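The comment above explains the approach taken in `get_artificial_status` below: urllib3 does not expose DNS failures as a distinct exception type, so they have to be recognized by platform-specific substrings in the exception message. A standalone distillation of that substring check; the names here are illustrative, not warcprox API:

```python
# Illustrative sketch (not part of warcprox): classify a connection-error
# string into the Heritrix-style artificial status codes used by the crawl
# log, by matching the platform-specific name-resolution messages that
# urllib3's NewConnectionError carries in its string representation.
DNS_ERROR_SNIPPETS = (
    "[Errno 11001] getaddrinfo failed",                 # Windows
    "[Errno -2] Name or service not known",             # Linux
    "[Errno -3] Temporary failure in name resolution",  # Linux
    "[Errno 8] nodename nor servname ",                 # OS X
)

def artificial_status_for(exception_string):
    """Return '-6' for DNS failures, '-2' for other connection failures."""
    if any(s in exception_string for s in DNS_ERROR_SNIPPETS):
        return '-6'
    return '-2'
```
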
128 | # https://stackoverflow.com/questions/40145631 129 | 130 | if hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (MaxRetryError, )): 131 | return '-8' 132 | elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (NewConnectionError, )): 133 | exception_string=str(recorded_url.exception) 134 | if ("[Errno 11001] getaddrinfo failed" in exception_string or # Windows 135 | "[Errno -2] Name or service not known" in exception_string or # Linux 136 | "[Errno -3] Temporary failure in name resolution" in exception_string or # Linux 137 | "[Errno 8] nodename nor servname " in exception_string): # OS X 138 | return '-6' # DNS Failure 139 | else: 140 | return '-2' # Other Connection Failure 141 | elif hasattr(recorded_url, 'exception') and isinstance(recorded_url.exception, (socket.timeout, TimeoutError, )): 142 | return '-2' # Connection Timeout 143 | elif isinstance(recorded_url, warcprox.warcproxy.FailedUrl): 144 | # synthetic status, used when some other status (such as connection-lost) 145 | # is considered by policy the same as a document-not-found 146 | # Cached failures result in FailedUrl with no Exception 147 | return '-404' 148 | else: 149 | return recorded_url.status 150 | 151 | def canonicalize_url(url): 152 | #URL needs to be split out to separately encode the hostname from the rest of the path. 153 | #hostname will be idna encoded (punycode) 154 | #The rest of the URL will be urlencoded, but browsers only encode "unsafe" and not "reserved" characters, so ignore the reserved chars. 155 | if url is None or url == '-' or url == '': 156 | return url 157 | try: 158 | parsed_url=rfc3986.urlparse(url) 159 | encoded_url=parsed_url.copy_with(host=parsed_url.host.encode('idna')) 160 | return encoded_url.unsplit() 161 | except (TypeError, ValueError, AttributeError) as e: 162 | logging.warning("URL Canonicalization failure. 
Returning raw url: rfc3986 %s - %s", url, e) 163 | return url 164 | 165 | -------------------------------------------------------------------------------- /warcprox/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | warcprox/main.py - entrypoint for warcprox executable, parses command line 4 | arguments, initializes components, starts controller, handles signals 5 | 6 | Copyright (C) 2013-2019 Internet Archive 7 | 8 | This program is free software; you can redistribute it and/or 9 | modify it under the terms of the GNU General Public License 10 | as published by the Free Software Foundation; either version 2 11 | of the License, or (at your option) any later version. 12 | 13 | This program is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License 19 | along with this program; if not, write to the Free Software 20 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 21 | USA. 
22 | ''' 23 | import logging 24 | import logging.config 25 | import sys 26 | import hashlib 27 | import argparse 28 | import os 29 | import socket 30 | import traceback 31 | import signal 32 | import threading 33 | import yaml 34 | import warcprox 35 | import doublethink 36 | import cryptography.hazmat.backends.openssl 37 | 38 | class BetterArgumentDefaultsHelpFormatter( 39 | argparse.ArgumentDefaultsHelpFormatter, 40 | argparse.RawDescriptionHelpFormatter): 41 | ''' 42 | HelpFormatter with these properties: 43 | 44 | - formats option help like argparse.ArgumentDefaultsHelpFormatter except 45 | that it omits the default value for arguments with action='store_const' 46 | - like argparse.RawDescriptionHelpFormatter, does not reformat description 47 | string 48 | ''' 49 | def _get_help_string(self, action): 50 | if isinstance(action, argparse._StoreConstAction): 51 | return action.help 52 | else: 53 | return argparse.ArgumentDefaultsHelpFormatter._get_help_string(self, action) 54 | 55 | def _build_arg_parser(prog='warcprox', show_hidden=False): 56 | if show_hidden: 57 | def suppress(msg): 58 | return msg 59 | else: 60 | def suppress(msg): 61 | return argparse.SUPPRESS 62 | 63 | arg_parser = argparse.ArgumentParser(prog=prog, 64 | description='warcprox - WARC writing MITM HTTP/S proxy', 65 | formatter_class=BetterArgumentDefaultsHelpFormatter) 66 | 67 | hidden = arg_parser.add_argument_group('hidden options') 68 | arg_parser.add_argument( 69 | '--help-hidden', action='help', default=argparse.SUPPRESS, 70 | help='show help message, including help on hidden options, and exit') 71 | 72 | arg_parser.add_argument('-p', '--port', dest='port', default='8000', 73 | type=int, help='port to listen on') 74 | arg_parser.add_argument('-b', '--address', dest='address', 75 | default='localhost', help='address to listen on') 76 | arg_parser.add_argument('-c', '--cacert', dest='cacert', 77 | default='./{}-warcprox-ca.pem'.format(socket.gethostname()), 78 | help='CA certificate file; if 
file does not exist, it will be created') 79 | arg_parser.add_argument('--certs-dir', dest='certs_dir', 80 | default='./{}-warcprox-ca'.format(socket.gethostname()), 81 | help='where to store and load generated certificates') 82 | arg_parser.add_argument('-d', '--dir', dest='directory', 83 | default='./warcs', help='where to write warcs') 84 | arg_parser.add_argument('--subdir-prefix', dest='subdir_prefix', action='store_true', 85 | help='write warcs to --dir subdir equal to the current warc-prefix'), 86 | arg_parser.add_argument('--warc-filename', dest='warc_filename', 87 | default='{prefix}-{timestamp17}-{serialno}-{randomtoken}', 88 | help='define custom WARC filename with variables {prefix}, {timestamp14}, {timestamp17}, {serialno}, {randomtoken}, {hostname}, {shorthostname}, {port}') 89 | arg_parser.add_argument('-z', '--gzip', dest='gzip', action='store_true', 90 | help='write gzip-compressed warc records') 91 | hidden.add_argument( 92 | '--no-warc-open-suffix', dest='no_warc_open_suffix', 93 | default=False, action='store_true', 94 | help=suppress( 95 | 'do not name warc files with suffix ".open" while writing to ' 96 | 'them, but lock them with lockf(3) instead')) 97 | # not mentioned in --help: special value for '-' for --prefix means don't 98 | # archive the capture, unless prefix set in warcprox-meta header 99 | arg_parser.add_argument( 100 | '-n', '--prefix', dest='prefix', default='WARCPROX', 101 | help='default WARC filename prefix') 102 | arg_parser.add_argument( 103 | '-s', '--size', dest='rollover_size', default=1000*1000*1000, 104 | type=int, help='WARC file rollover size threshold in bytes') 105 | arg_parser.add_argument('--rollover-idle-time', 106 | dest='rollover_idle_time', default=None, type=int, 107 | help="WARC file rollover idle time threshold in seconds (so that Friday's last open WARC doesn't sit there all weekend waiting for more data)") 108 | try: 109 | hash_algos = hashlib.algorithms_guaranteed 110 | except AttributeError: 111 |
hash_algos = hashlib.algorithms 112 | arg_parser.add_argument('-g', '--digest-algorithm', dest='digest_algorithm', 113 | default='sha1', help='digest algorithm, one of {}'.format(', '.join(hash_algos))) 114 | arg_parser.add_argument('--base32', dest='base32', action='store_true', 115 | default=False, help='write digests in Base32 instead of hex') 116 | arg_parser.add_argument('--method-filter', metavar='HTTP_METHOD', 117 | action='append', help='only record requests with the given http method(s) (can be used more than once)') 118 | 119 | group = arg_parser.add_mutually_exclusive_group() 120 | group.add_argument( 121 | '--stats-db-file', dest='stats_db_file', 122 | default='./warcprox.sqlite', help=( 123 | 'persistent statistics database file; empty string or ' 124 | '/dev/null disables statistics tracking')) 125 | group.add_argument( 126 | '--rethinkdb-stats-url', dest='rethinkdb_stats_url', help=( 127 | 'rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,' 128 | 'db1.foo.org:38015/my_warcprox_db/my_stats_table')) 129 | 130 | arg_parser.add_argument('-P', '--playback-port', dest='playback_port', 131 | type=int, default=None, help='port to listen on for instant playback') 132 | # arg_parser.add_argument('--playback-index-db-file', dest='playback_index_db_file', 133 | # default='./warcprox-playback-index.db', 134 | # help='playback index database file (only used if --playback-port is specified)') 135 | group = arg_parser.add_mutually_exclusive_group() 136 | group.add_argument('-j', '--dedup-db-file', dest='dedup_db_file', 137 | default='./warcprox.sqlite', help='persistent deduplication database file; empty string or /dev/null disables deduplication') 138 | group.add_argument( 139 | '--rethinkdb-dedup-url', dest='rethinkdb_dedup_url', help=( 140 | 'rethinkdb dedup url, e.g. 
rethinkdb://db0.foo.org,' 141 | 'db1.foo.org:38015/my_warcprox_db/my_dedup_table')) 142 | group.add_argument( 143 | '--rethinkdb-big-table-url', dest='rethinkdb_big_table_url', help=( 144 | 'rethinkdb big table url (table will be populated with ' 145 | 'various capture information and is suitable for use as ' 146 | 'index for playback), e.g. rethinkdb://db0.foo.org,' 147 | 'db1.foo.org:38015/my_warcprox_db/captures')) 148 | group.add_argument( 149 | '--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url', help=( 150 | '🐷   url pointing to trough configuration rethinkdb database, ' 151 | 'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015' 152 | '/trough_configuration')) 153 | group.add_argument('--cdxserver-dedup', dest='cdxserver_dedup', 154 | help='use a CDX Server URL for deduplication; e.g. https://web.archive.org/cdx/search') 155 | arg_parser.add_argument( 156 | '--rethinkdb-services-url', dest='rethinkdb_services_url', help=( 157 | 'rethinkdb service registry table url; if provided, warcprox ' 158 | 'will create and heartbeat entry for itself')) 159 | # optional cookie values to pass to CDX Server; e.g. 
"cookie1=val1;cookie2=val2" 160 | hidden.add_argument( 161 | '--cdxserver-dedup-cookies', dest='cdxserver_dedup_cookies', 162 | help=suppress( 163 | 'value of Cookie header to include in requests to the cdx ' 164 | 'server, when using --cdxserver-dedup')) 165 | hidden.add_argument( 166 | '--cdxserver-dedup-max-threads', dest='cdxserver_dedup_max_threads', 167 | type=int, default=50, help=suppress( 168 | 'maximum number of cdx server dedup threads')) 169 | arg_parser.add_argument('--dedup-min-text-size', dest='dedup_min_text_size', 170 | type=int, default=0, 171 | help=('try to dedup text resources with payload size over this limit in bytes')) 172 | arg_parser.add_argument('--dedup-min-binary-size', dest='dedup_min_binary_size', 173 | type=int, default=0, help=( 174 | 'try to dedup binary resources with payload size over this limit in bytes')) 175 | hidden.add_argument( 176 | '--dedup-only-with-bucket', dest='dedup_only_with_bucket', 177 | action='store_true', default=False, help=suppress( 178 | 'only deduplicate captures if "dedup-bucket" is set in ' 179 | 'the Warcprox-Meta request header')) 180 | arg_parser.add_argument('--blackout-period', dest='blackout_period', 181 | type=int, default=0, 182 | help="skip writing a revisit record if it's too close to the original capture") 183 | hidden.add_argument( 184 | '--queue-size', dest='queue_size', type=int, default=500, 185 | help=suppress( 186 | 'maximum number of urls that can be queued at each ' 187 | 'step of the processing chain (see the section on warcprox ' 188 | 'architecture in README.rst)')) 189 | hidden.add_argument( 190 | '--max-threads', dest='max_threads', type=int, default=100, 191 | help=suppress('maximum number of http worker threads')) 192 | hidden.add_argument( 193 | '--profile', action='store_true', default=False, 194 | help=suppress( 195 | 'turn on performance profiling; summary statistics are dumped ' 196 | 'every 10 minutes and at shutdown')) 197 | arg_parser.add_argument( 198 | '--ssl-context',
dest='ssl_context', default=None, help=( 199 | 'emulate chrome and firefox tls fingerprints')) 200 | arg_parser.add_argument( 201 | '--onion-tor-socks-proxy', dest='onion_tor_socks_proxy', 202 | default=None, help=( 203 | 'host:port of tor socks proxy, used only to connect to ' 204 | '.onion sites')) 205 | arg_parser.add_argument( 206 | '--socks-proxy', dest='socks_proxy', 207 | default=None, help='host:port of socks proxy, used for all traffic if activated') 208 | arg_parser.add_argument( 209 | '--socks-proxy-username', dest='socks_proxy_username', 210 | default=None, help='optional socks proxy username') 211 | arg_parser.add_argument( 212 | '--socks-proxy-password', dest='socks_proxy_password', 213 | default=None, help='optional socks proxy password') 214 | hidden.add_argument( 215 | '--socket-timeout', dest='socket_timeout', type=float, default=60, 216 | help=suppress( 217 | 'socket timeout, used for proxy client connection and for ' 218 | 'connection to remote server')) 219 | # Increasing this value increases memory usage but reduces /tmp disk I/O. 220 | hidden.add_argument( 221 | '--tmp-file-max-memory-size', dest='tmp_file_max_memory_size', 222 | type=int, default=512*1024, help=suppress( 223 | 'size of in-memory buffer for each url being processed ' 224 | '(spills over to temp space on disk if exceeded)')) 225 | arg_parser.add_argument( 226 | '--max-resource-size', dest='max_resource_size', type=int, 227 | default=None, help='maximum resource size limit in bytes') 228 | arg_parser.add_argument( 229 | '--crawl-log-dir', dest='crawl_log_dir', default=None, help=( 230 | 'if specified, write crawl log files in the specified ' 231 | 'directory; one crawl log is written per warc filename ' 232 | 'prefix; crawl log format mimics heritrix')) 233 | arg_parser.add_argument( 234 | '--plugin', metavar='PLUGIN_CLASS', dest='plugins', 235 | action='append', help=( 236 | 'Qualified name of plugin class, e.g. "mypkg.mymod.MyClass". 
' 237 | 'May be used multiple times to register multiple plugins. ' 238 | 'See README.rst for more information.')) 239 | arg_parser.add_argument( 240 | '-q', '--quiet', dest='quiet', action='store_true', 241 | help='less verbose logging') 242 | arg_parser.add_argument( 243 | '-v', '--verbose', dest='verbose', action='store_true', 244 | help='verbose logging') 245 | arg_parser.add_argument( 246 | '--trace', dest='trace', action='store_true', 247 | help='very verbose logging') 248 | arg_parser.add_argument( 249 | '--logging-conf-file', dest='logging_conf_file', default=None, 250 | help=('reads logging configuration from a YAML file')) 251 | arg_parser.add_argument( 252 | '--version', action='version', 253 | version="warcprox {}".format(warcprox.__version__)) 254 | 255 | return arg_parser 256 | 257 | def dump_state(signum=None, frame=None): 258 | ''' 259 | Signal handler, logs stack traces of active threads. 260 | ''' 261 | state_strs = [] 262 | 263 | for th in threading.enumerate(): 264 | try: 265 | state_strs.append(str(th)) 266 | stack = traceback.format_stack(sys._current_frames()[th.ident]) 267 | state_strs.append(''.join(stack)) 268 | except Exception as e: 269 | state_strs.append('' % e) 270 | 271 | logging.warning( 272 | 'dumping state (caught signal %s)\n%s', 273 | signum, '\n'.join(state_strs)) 274 | 275 | def parse_args(argv): 276 | ''' 277 | Parses command line arguments with argparse. 278 | ''' 279 | show_hidden = False 280 | if '--help-hidden' in argv: 281 | show_hidden = True 282 | argv = [argv[0], '--help-hidden'] 283 | arg_parser = _build_arg_parser(os.path.basename(argv[0]), show_hidden) 284 | args = arg_parser.parse_args(args=argv[1:]) 285 | 286 | try: 287 | hashlib.new(args.digest_algorithm) 288 | except Exception as e: 289 | logging.fatal(e) 290 | exit(1) 291 | 292 | return args 293 | 294 | def main(argv=None): 295 | ''' 296 | Main method, entry point of warcprox command. 
297 |     '''
298 |     args = parse_args(argv or sys.argv)
299 | 
300 |     if args.trace:
301 |         loglevel = logging.TRACE
302 |     elif args.verbose:
303 |         loglevel = logging.DEBUG
304 |     elif args.quiet:
305 |         loglevel = logging.NOTICE
306 |     else:
307 |         loglevel = logging.INFO
308 | 
309 |     logging.root.handlers = []
310 |     logging.basicConfig(
311 |         stream=sys.stdout, level=loglevel, format=(
312 |             '%(asctime)s %(process)d %(levelname)s %(threadName)s '
313 |             '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
314 | 
315 |     if args.logging_conf_file:
316 |         with open(args.logging_conf_file) as fd:
317 |             conf = yaml.safe_load(fd)
318 |             logging.config.dictConfig(conf)
319 | 
320 |     # see https://github.com/pyca/cryptography/issues/2911
321 |     cryptography.hazmat.backends.openssl.backend.activate_builtin_random()
322 | 
323 |     options = warcprox.Options(**vars(args))
324 |     controller = warcprox.controller.WarcproxController(options)
325 | 
326 |     signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
327 |     signal.signal(signal.SIGINT, lambda a,b: controller.stop.set())
328 |     try:
329 |         signal.signal(signal.SIGQUIT, dump_state)
330 |     except AttributeError:
331 |         # SIGQUIT does not exist on some platforms (windows)
332 |         pass
333 | 
334 |     try:
335 |         controller.run_until_shutdown()
336 |     except:
337 |         logging.fatal('unhandled exception in controller', exc_info=True)
338 |         sys.exit(1)
339 | 
340 | def ensure_rethinkdb_tables(argv=None):
341 |     '''
342 |     Creates rethinkdb tables if they don't already exist. Warcprox normally
343 |     creates the tables it needs on demand at startup, but if multiple instances
344 |     are starting up at the same time, you can end up with duplicate broken
345 |     tables. So it's a good idea to use this utility at an early step when
346 |     spinning up a cluster.
347 |     '''
348 |     argv = argv or sys.argv
349 |     arg_parser = argparse.ArgumentParser(
350 |         prog=os.path.basename(argv[0]),
351 |         formatter_class=BetterArgumentDefaultsHelpFormatter)
352 |     arg_parser.add_argument(
353 |         '--rethinkdb-stats-url', dest='rethinkdb_stats_url', help=(
354 |             'rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
355 |             'db1.foo.org:38015/my_warcprox_db/my_stats_table'))
356 |     group = arg_parser.add_mutually_exclusive_group()
357 |     group.add_argument(
358 |         '--rethinkdb-dedup-url', dest='rethinkdb_dedup_url', help=(
359 |             'rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
360 |             'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
361 |     group.add_argument(
362 |         '--rethinkdb-big-table-url', dest='rethinkdb_big_table_url', help=(
363 |             'rethinkdb big table url (table will be populated with '
364 |             'various capture information and is suitable for use as '
365 |             'index for playback), e.g. rethinkdb://db0.foo.org,'
366 |             'db1.foo.org:38015/my_warcprox_db/captures'))
367 |     group.add_argument(
368 |         '--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url', help=(
369 |             '🐷   url pointing to trough configuration rethinkdb database, '
370 |             'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
371 |             '/trough_configuration'))
372 |     arg_parser.add_argument(
373 |         '--rethinkdb-services-url', dest='rethinkdb_services_url', help=(
374 |             'rethinkdb service registry table url; if provided, warcprox '
375 |             'will create and heartbeat an entry for itself'))
376 |     arg_parser.add_argument(
377 |         '-q', '--quiet', dest='log_level',
378 |         action='store_const', default=logging.INFO, const=logging.WARN)
379 |     arg_parser.add_argument(
380 |         '-v', '--verbose', dest='log_level',
381 |         action='store_const', default=logging.INFO, const=logging.DEBUG)
382 |     args = arg_parser.parse_args(args=argv[1:])
383 | 
384 |     logging.basicConfig(
385 |         stream=sys.stdout, level=args.log_level, format=(
386 |             '%(asctime)s %(levelname)s %(name)s.%(funcName)s'
387 |             '(%(filename)s:%(lineno)d) %(message)s'))
388 | 
389 |     options = warcprox.Options(**vars(args))
390 | 
391 |     did_something = False
392 |     if args.rethinkdb_services_url:
393 |         parsed = doublethink.parse_rethinkdb_url(
394 |             options.rethinkdb_services_url)
395 |         rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
396 |         svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
397 |         did_something = True
398 |     if args.rethinkdb_stats_url:
399 |         stats_db = warcprox.stats.RethinkStatsProcessor(options=options)
400 |         stats_db._ensure_db_table()
401 |         did_something = True
402 |     if args.rethinkdb_dedup_url:
403 |         dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
404 |         did_something = True
405 |     if args.rethinkdb_big_table_url:
406 |         dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
407 |         did_something = True
408 |     if args.rethinkdb_trough_db_url:
409 |         dedup_db = warcprox.dedup.TroughDedupDb(options)
410 |         logging.warning(
411 |             'trough is responsible for creating most of the rethinkdb '
412 |             'tables that it uses')
413 |         did_something = True
414 | 
415 |     if not did_something:
416 |         logging.error('nothing to do, no --rethinkdb-* options supplied')
417 | 
418 | if __name__ == '__main__':
419 |     main()
420 | 
421 | 
--------------------------------------------------------------------------------
/warcprox/playback.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | warcprox/playback.py - rudimentary support for playback of urls archived by
  3 | warcprox (not much used or maintained)
  4 | 
  5 | Copyright (C) 2013-2017 Internet Archive
  6 | 
  7 | This program is free software; you can redistribute it and/or
  8 | modify it under the terms of the GNU General Public License
  9 | as published by the Free Software Foundation; either version 2
 10 | of the License, or (at your option) any later version.
 11 | 
 12 | This program is distributed in the hope that it will be useful,
 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 15 | GNU General Public License for more details.
 16 | 
 17 | You should have received a copy of the GNU General Public License
 18 | along with this program; if not, write to the Free Software
 19 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
 20 | USA.
 21 | '''
 22 | import http.server as http_server
 23 | import socketserver
 24 | import logging
 25 | import os
 26 | from hanzo import warctools
 27 | import json
 28 | import traceback
 29 | import re
 30 | from warcprox.mitmproxy import MitmProxyHandler
 31 | import warcprox
 32 | import sqlite3
 33 | import threading
 34 | from cachetools import TTLCache
 35 | 
 36 | class PlaybackProxyHandler(MitmProxyHandler):
 37 |     logger = logging.getLogger("warcprox.playback.PlaybackProxyHandler")
 38 | 
 39 |     # @Override
 40 |     def _connect_to_remote_server(self):
 41 |         # don't connect to any remote server!
 42 |         pass
 43 | 
 44 |     # @Override
 45 |     def _proxy_request(self):
 46 |         date, location = self.server.playback_index_db.lookup_latest(self.url)
 47 |         self.logger.debug('lookup_latest returned {}:{}'.format(date, location))
 48 | 
 49 |         status = None
 50 |         if location is not None:
 51 |             try:
 52 |                 status, sz = self._send_response_from_warc(location['f'], location['o'])
 53 |             except:
 54 |                 status = 500
 55 |                 self.logger.error('PlaybackProxyHandler problem playing back {}'.format(self.url), exc_info=1)
 56 |                 payload = '500 Warcprox Error\n\n{}\n'.format(traceback.format_exc()).encode('utf-8')
 57 |                 headers = (b'HTTP/1.1 500 Internal Server Error\r\n'
 58 |                         + b'Content-Type: text/plain;charset=utf-8\r\n'
 59 |                         + b'Content-Length: ' + str(len(payload)).encode('utf-8') + b'\r\n'
 60 |                         + b'\r\n')
 61 |                 self.connection.sendall(headers)
 62 |                 self.connection.sendall(payload)
 63 |                 sz = len(headers) + len(payload)
 64 |         else:
 65 |             status = 404
 66 |             payload = b'404 Not in Archive\n'
 67 |             headers = (b'HTTP/1.1 404 Not Found\r\n'
 68 |                     + b'Content-Type: text/plain;charset=utf-8\r\n'
 69 |                     + b'Content-Length: ' + str(len(payload)).encode('ascii') + b'\r\n'
 70 |                     + b'\r\n')
 71 |             self.connection.sendall(headers)
 72 |             self.connection.sendall(payload)
 73 |             sz = len(headers) + len(payload)
 74 | 
 75 |         self.log_message('%r %s %s %s',
 76 |                 self.requestline, str(status), str(sz),
 77 |                 repr(location) if location else '-')
 78 | 
 79 | 
 80 |     def _open_warc_at_offset(self, warcfilename, offset):
 81 |         self.logger.debug('opening {} at offset {}'.format(warcfilename, offset))
 82 | 
 83 |         warcpath = None
 84 |         for p in (os.path.sep.join([self.server.warcs_dir, warcfilename]),
 85 |                 os.path.sep.join([self.server.warcs_dir, '{}.open'.format(warcfilename)])):
 86 |             if os.path.exists(p):
 87 |                 warcpath = p
 88 | 
 89 |         if warcpath is None:
 90 |             raise Exception('{} not found'.format(warcfilename))
 91 | 
 92 |         return warctools.warc.WarcRecord.open_archive(filename=warcpath, mode='rb', offset=offset)
 93 | 
 94 |     def _send_response(self, headers, payload_fh):
 95 |         status = '-'
 96 |         m = re.match(br'^HTTP/\d\.\d (\d{3})', headers)
 97 |         if m is not None:
 98 |             status = m.group(1)
 99 | 
100 |         self.connection.sendall(headers)
101 |         sz = len(headers)
102 | 
103 |         while True:
104 |             buf = payload_fh.read(8192)
105 |             if buf == b'': break
106 |             self.connection.sendall(buf)
107 |             sz += len(buf)
108 | 
109 |         return status, sz
110 | 
111 | 
112 |     def _send_headers_and_refd_payload(
113 |             self, headers, refers_to_target_uri, refers_to_date, payload_digest):
114 |         location = self.server.playback_index_db.lookup_exact(
115 |                 refers_to_target_uri, refers_to_date, payload_digest)
116 |         self.logger.debug('loading http payload from {}'.format(location))
117 | 
118 |         fh = self._open_warc_at_offset(location['f'], location['o'])
119 |         try:
120 |             for (offset, record, errors) in fh.read_records(limit=1, offsets=True):
121 |                 pass
122 | 
123 |             if not record:
124 |                 raise Exception('failed to read record at offset {} from {}'.format(offset, location['f']))
125 | 
126 |             if errors:
127 |                 raise Exception('warc errors at {}:{} -- {}'.format(location['f'], offset, errors))
128 | 
129 |             if record.type != warctools.WarcRecord.RESPONSE:
130 |                 raise Exception('invalid attempt to retrieve http payload of "{}" record'.format(record.type))
131 | 
132 |             # find end of headers
133 |             while True:
134 |                 line = record.content_file.readline()
135 |                 if line == b'' or re.match(br'^\r?\n$', line):
136 |                     break
137 | 
138 |             return self._send_response(headers, record.content_file)
139 | 
140 |         finally:
141 |             fh.close()
142 | 
143 | 
144 |     def _send_response_from_warc(self, warcfilename, offset):
145 |         fh = self._open_warc_at_offset(warcfilename, offset)
146 |         try:
147 |             for (offset, record, errors) in fh.read_records(limit=1, offsets=True):
148 |                 pass
149 | 
150 |             if not record:
151 |                 raise Exception('failed to read record at offset {} from {}'.format(offset, warcfilename))
152 | 
153 |             if errors:
154 |                 raise Exception('warc errors at {}:{} -- {}'.format(warcfilename, offset, errors))
155 | 
156 |             if record.type == warctools.WarcRecord.RESPONSE:
157 |                 headers_buf = bytearray()
158 |                 while True:
159 |                     line = record.content_file.readline()
160 |                     headers_buf.extend(line)
161 |                     if line == b'' or re.match(br'^\r?\n$', line):
162 |                         break
163 | 
164 |                 return self._send_response(headers_buf, record.content_file)
165 | 
166 |             elif record.type == warctools.WarcRecord.REVISIT:
167 |                 # response consists of http headers from revisit record and
168 |                 # payload from the referenced record
169 |                 warc_profile = record.get_header(warctools.WarcRecord.PROFILE)
170 |                 if warc_profile != warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST:
171 |                     raise Exception('unknown revisit record profile {}'.format(warc_profile))
172 | 
173 |                 refers_to_target_uri = record.get_header(
174 |                         warctools.WarcRecord.REFERS_TO_TARGET_URI).decode(
175 |                                 'latin1')
176 |                 refers_to_date = record.get_header(
177 |                         warctools.WarcRecord.REFERS_TO_DATE).decode('latin1')
178 |                 payload_digest = record.get_header(
179 |                         warctools.WarcRecord.PAYLOAD_DIGEST).decode('latin1')
180 |                 self.logger.debug(
181 |                         'revisit record references %s:%s capture of %s',
182 |                         refers_to_date, payload_digest, refers_to_target_uri)
183 |                 return self._send_headers_and_refd_payload(
184 |                         record.content[1], refers_to_target_uri, refers_to_date,
185 |                         payload_digest)
186 | 
187 |             else:
188 |                 # send it back raw, whatever it is
189 |                 headers_buf = bytearray()
190 |                 headers_buf.extend(b'HTTP/1.0 200 OK\r\n')
191 |                 headers_buf.extend(b'content-length: ' + record.get_header(b'content-length') + b'\r\n')
192 |                 headers_buf.extend(b'content-type: ' + record.get_header(b'content-type') + b'\r\n')
193 |                 headers_buf.extend(b'\r\n')
194 |                 return self._send_response(headers_buf, record.content_file)
195 | 
196 |         finally:
197 |             fh.close()
198 | 
199 |         raise Exception('should not reach this point')
200 | 
201 | 
202 | class PlaybackProxy(socketserver.ThreadingMixIn, http_server.HTTPServer):
203 |     logger = logging.getLogger("warcprox.playback.PlaybackProxy")
204 | 
205 |     def __init__(self, ca=None, playback_index_db=None, options=warcprox.Options()):
206 |         server_address = (options.address or 'localhost', options.playback_port if options.playback_port is not None else 8001)
207 |         http_server.HTTPServer.__init__(self, server_address, PlaybackProxyHandler, bind_and_activate=True)
208 |         self.ca = ca
209 |         self.playback_index_db = playback_index_db
210 |         self.warcs_dir = options.directory
211 |         self.options = options
212 |         self.bad_hostnames_ports = TTLCache(maxsize=1024, ttl=60)
213 |         self.bad_hostnames_ports_lock = threading.RLock()
214 | 
215 |     def server_activate(self):
216 |         http_server.HTTPServer.server_activate(self)
217 |         self.logger.info('PlaybackProxy listening on {}:{}'.format(self.server_address[0], self.server_address[1]))
218 | 
219 |     def server_close(self):
220 |         self.logger.info('PlaybackProxy shutting down')
221 |         http_server.HTTPServer.server_close(self)
222 | 
223 | 
224 | class PlaybackIndexDb:
225 |     logger = logging.getLogger("warcprox.playback.PlaybackIndexDb")
226 | 
227 |     def __init__(self, file='./warcprox.sqlite', options=warcprox.Options()):
228 |         self.file = file
229 |         self._lock = threading.RLock()
230 | 
231 |         if os.path.exists(self.file):
232 |             self.logger.info(
233 |                     'opening existing playback index database %s', self.file)
234 |         else:
235 |             self.logger.info(
236 |                     'creating new playback index database %s', self.file)
237 | 
238 |         conn = sqlite3.connect(self.file)
239 |         conn.execute(
240 |                 'create table if not exists playback ('
241 |                 '    url varchar(4000) primary key,'
242 |                 '    value varchar(4000)'
243 |                 ');')
244 |         conn.commit()
245 |         conn.close()
246 | 
247 |     def close(self):
248 |         pass
249 | 
250 |     def sync(self):
251 |         pass
252 | 
253 |     def notify(self, recorded_url, records):
254 |         if records:
255 |             self.save(records[0].warc_filename, records, records[0].offset)
256 | 
257 |     def save(self, warcfile, recordset, offset):
258 |         response_record = recordset[0]
259 |         # XXX canonicalize url?
260 |         url = response_record.get_header(warctools.WarcRecord.URL).decode('latin1')
261 |         date_str = response_record.get_header(warctools.WarcRecord.DATE).decode('latin1')
262 |         payload_digest_str = response_record.get_header(warctools.WarcRecord.PAYLOAD_DIGEST).decode('latin1')
263 | 
264 |         # there could be two visits of same url in the same second, and WARC-Date is
265 |         # prescribed as YYYY-MM-DDThh:mm:ssZ, so we have to handle it :-\
266 | 
267 |         # url:{date1:[record1={'f':warcfile,'o':response_offset,'q':request_offset,'d':payload_digest},record2,...],date2:[{...}],...}
268 | 
269 |         with self._lock:
270 |             conn = sqlite3.connect(self.file)
271 |             cursor = conn.execute(
272 |                     'select value from playback where url = ?', (url,))
273 |             result_tuple = cursor.fetchone()
274 |             if result_tuple:
275 |                 py_value = json.loads(result_tuple[0])
276 |             else:
277 |                 py_value = {}
278 | 
279 |             if date_str in py_value:
280 |                 py_value[date_str].append(
281 |                         {'f': warcfile, 'o': offset, 'd': payload_digest_str})
282 |             else:
283 |                 py_value[date_str] = [
284 |                         {'f': warcfile, 'o': offset, 'd': payload_digest_str}]
285 | 
286 |             json_value = json.dumps(py_value, separators=(',',':'))
287 | 
288 |             conn.execute(
289 |                     'insert or replace into playback (url, value) '
290 |                     'values (?, ?)', (url, json_value))
291 |             conn.commit()
292 |             conn.close()
293 | 
294 |         self.logger.debug('playback index saved: {}:{}'.format(url, json_value))
295 | 
296 |     def lookup_latest(self, url):
297 |         conn = sqlite3.connect(self.file)
298 |         cursor = conn.execute(
299 |                 'select value from playback where url = ?', (url,))
300 |         result_tuple = cursor.fetchone()
301 |         conn.close()
302 | 
303 |         if not result_tuple:
304 |             return None, None
305 | 
306 |         json_value = result_tuple[0]
307 |         self.logger.debug('%r:%r', url, json_value)
308 |         py_value = json.loads(json_value)
309 | 
310 |         latest_date = max(py_value)
311 |         result = py_value[latest_date][0]
312 |         result['d'] = result['d'].encode('ascii')
313 |         return latest_date, result
314 | 
315 |     # in python3 params are bytes
316 |     def lookup_exact(self, url, warc_date, payload_digest):
317 |         conn = sqlite3.connect(self.file)
318 |         cursor = conn.execute(
319 |                 'select value from playback where url = ?', (url,))
320 |         result_tuple = cursor.fetchone()
321 |         conn.close()
322 | 
323 |         if not result_tuple:
324 |             return None
325 | 
326 |         json_value = result_tuple[0]
327 |         self.logger.debug('%r:%r', url, json_value)
328 |         py_value = json.loads(json_value)
329 | 
330 |         if warc_date in py_value:
331 |             for record in py_value[warc_date]:
332 |                 if record['d'] == payload_digest:
333 |                     self.logger.debug(
334 |                             "found exact match for (%r,%r,%r)",
335 |                             warc_date, payload_digest, url)
336 |                     record['d'] = record['d'].encode('ascii')
337 |                     return record
338 |         else:
339 |             self.logger.info(
340 |                     "match not found for (%r,%r,%r)", warc_date, payload_digest, url)
341 |             return None
342 | 
--------------------------------------------------------------------------------
/warcprox/ssl_util.py:
--------------------------------------------------------------------------------
  1 | import ssl
  2 | from urllib3.util.ssl_ import create_urllib3_context
  3 | 
  4 | def create_chrome_ssl_context():
  5 |     """Create a custom SSL context imitating Chrome.
  6 |     Chrome typically uses these cipher suites (as of Chrome 120+)
  7 |     """
  8 |     context = create_urllib3_context()
  9 |     context.set_ciphers(
 10 |         "TLS_AES_128_GCM_SHA256:"
 11 |         "TLS_AES_256_GCM_SHA384:"
 12 |         "TLS_CHACHA20_POLY1305_SHA256:"
 13 |         "ECDHE-ECDSA-AES128-GCM-SHA256:"
 14 |         "ECDHE-RSA-AES128-GCM-SHA256:"
 15 |         "ECDHE-ECDSA-AES256-GCM-SHA384:"
 16 |         "ECDHE-RSA-AES256-GCM-SHA384:"
 17 |         "ECDHE-ECDSA-CHACHA20-POLY1305:"
 18 |         "ECDHE-RSA-CHACHA20-POLY1305:"
 19 |         "ECDHE-RSA-AES128-SHA:"
 20 |         "ECDHE-RSA-AES256-SHA:"
 21 |         "AES128-GCM-SHA256:"
 22 |         "AES256-GCM-SHA384:"
 23 |         "AES128-SHA:"
 24 |         "AES256-SHA"
 25 |     )
 26 | 
 27 |     # Set TLS versions (Chrome supports 1.2 and 1.3)
 28 |     context.options |= ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1
 29 | 
 30 |     # Chrome's preferred elliptic curves
 31 |     if hasattr(context, 'set_ecdh_curve'):
 32 |         context.set_ecdh_curve("prime256v1")
 33 | 
 34 |     return context
 35 | 
 36 | 
 37 | def create_firefox_ssl_context():
 38 |     """Create a custom SSL context imitating Firefox.
 39 |     Firefox (as of recent versions) uses these cipher suites
 40 |     """
 41 |     context = create_urllib3_context()
 42 |     context.set_ciphers(
 43 |         "TLS_AES_128_GCM_SHA256:"
 44 |         "TLS_AES_256_GCM_SHA384:"
 45 |         "TLS_CHACHA20_POLY1305_SHA256:"
 46 |         "ECDHE-ECDSA-AES128-GCM-SHA256:"
 47 |         "ECDHE-RSA-AES128-GCM-SHA256:"
 48 |         "ECDHE-ECDSA-AES256-GCM-SHA384:"
 49 |         "ECDHE-RSA-AES256-GCM-SHA384:"
 50 |         "ECDHE-ECDSA-CHACHA20-POLY1305:"
 51 |         "ECDHE-RSA-CHACHA20-POLY1305"
 52 |     )
 53 | 
 54 |     # Set TLS versions (Firefox supports TLS 1.2 and 1.3)
 55 |     context.options |= ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1
 56 | 
 57 |     # Firefox preferred elliptic curves. None is available in Python so we stick to the defaults.
 58 |     # if hasattr(context, "set_ecdh_curve"):
 59 |     #     context.set_ecdh_curve("X25519:secp256r1:secp384r1")
 60 | 
 61 |     return context
 62 | 
--------------------------------------------------------------------------------
/warcprox/stats.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | warcprox/stats.py - keeps statistics on what has been archived
  3 | 
  4 | Copyright (C) 2013-2017 Internet Archive
  5 | 
  6 | This program is free software; you can redistribute it and/or
  7 | modify it under the terms of the GNU General Public License
  8 | as published by the Free Software Foundation; either version 2
  9 | of the License, or (at your option) any later version.
 10 | 
 11 | This program is distributed in the hope that it will be useful,
 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 14 | GNU General Public License for more details.
 15 | 
 16 | You should have received a copy of the GNU General Public License
 17 | along with this program; if not, write to the Free Software
 18 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
 19 | USA.
 20 | '''
 21 | from hanzo import warctools
 22 | import collections
 23 | import doublethink
 24 | import json
 25 | import logging
 26 | import os
 27 | from rethinkdb import RethinkDB; r = RethinkDB()
 28 | import sqlite3
 29 | import threading
 30 | import time
 31 | import urlcanon
 32 | import warcprox
 33 | 
 34 | def _empty_bucket(bucket):
 35 |     return {
 36 |         "bucket": bucket,
 37 |         "total": {
 38 |             "urls": 0,
 39 |             "wire_bytes": 0,
 40 |         },
 41 |         "new": {
 42 |             "urls": 0,
 43 |             "wire_bytes": 0,
 44 |         },
 45 |         "revisit": {
 46 |             "urls": 0,
 47 |             "wire_bytes": 0,
 48 |         },
 49 |     }
 50 | 
 51 | def unravel_buckets(url, warcprox_meta):
 52 |     '''
 53 |     Unravels bucket definitions in Warcprox-Meta header. Each bucket
 54 |     definition can either be a string, which signifies the name of the
 55 |     bucket, or a dict. If a dict it is expected to have at least an item
 56 |     with key 'bucket' whose value is the name of the bucket. The other
 57 |     currently recognized item is 'tally-domains', which if supplied should
 58 |     be a list of domains. This instructs warcprox to additionally tally
 59 |     substats of the given bucket by domain. Host stats are stored in the
 60 |     stats table under the key '{parent-bucket}:{domain(normalized)}'.
 61 | 
 62 |     Returns:
 63 |         list of strings
 64 | 
 65 |     Example Warcprox-Meta header (a real one will likely have other
 66 |     sections besides 'stats'):
 67 | 
 68 |     Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}
 69 | 
 70 |     In this case the return value would be
 71 |     ["bucket1","bucket2","bucket2:foo.bar.com","bucket2:192.168.10.20"]
 72 |     '''
 73 |     buckets = ["__all__"]
 74 |     if (warcprox_meta and "stats" in warcprox_meta
 75 |             and "buckets" in warcprox_meta["stats"]):
 76 |         for bucket in warcprox_meta["stats"]["buckets"]:
 77 |             if isinstance(bucket, dict):
 78 |                 if 'bucket' not in bucket:
 79 |                     logging.warning(
 80 |                             'ignoring invalid stats bucket in '
 81 |                             'warcprox-meta header %s', bucket)
 82 |                     continue
 83 |                 buckets.append(bucket['bucket'])
 84 |                 if bucket.get('tally-domains'):
 85 |                     canon_url = urlcanon.semantic(url)
 86 |                     for domain in bucket['tally-domains']:
 87 |                         domain = urlcanon.normalize_host(domain).decode('ascii')
 88 |                         if urlcanon.url_matches_domain(canon_url, domain):
 89 |                             buckets.append(
 90 |                                     '{}:{}'.format(bucket['bucket'], domain))
 91 |             else:
 92 |                 buckets.append(bucket)
 93 |     else:
 94 |         buckets.append("__unspecified__")
 95 | 
 96 |     return buckets
 97 | 
 98 | class StatsProcessor(warcprox.BaseBatchPostfetchProcessor):
 99 |     logger = logging.getLogger("warcprox.stats.StatsProcessor")
100 | 
101 |     def _startup(self):
102 |         if os.path.exists(self.options.stats_db_file):
103 |             self.logger.info(
104 |                     'opening existing stats database %s',
105 |                     self.options.stats_db_file)
106 |         else:
107 |             self.logger.info(
108 |                     'creating new stats database %s',
109 |                     self.options.stats_db_file)
110 | 
111 |         conn = sqlite3.connect(self.options.stats_db_file)
112 |         conn.execute(
113 |                 'create table if not exists buckets_of_stats ('
114 |                 '    bucket varchar(300) primary key,'
115 |                 '    stats varchar(4000)'
116 |                 ');')
117 |         conn.commit()
118 |         conn.close()
119 | 
120 |         self.logger.info(
121 |                 'created table buckets_of_stats in %s',
122 |                 self.options.stats_db_file)
123 | 
124 |     def _process_batch(self, batch):
125 |         batch_buckets = self._tally_batch(batch)
126 |         self._update_db(batch_buckets)
127 |         logging.trace('updated stats from batch of %s', len(batch))
128 | 
129 |     def _update_db(self, batch_buckets):
130 |         conn = sqlite3.connect(self.options.stats_db_file)
131 |         for bucket in batch_buckets:
132 |             bucket_stats = batch_buckets[bucket]
133 | 
134 |             cursor = conn.execute(
135 |                     'select stats from buckets_of_stats where bucket=?',
136 |                     (bucket,))
137 |             result_tuple = cursor.fetchone()
138 |             cursor.close()
139 | 
140 |             if result_tuple:
141 |                 old_bucket_stats = json.loads(result_tuple[0])
142 | 
143 |                 bucket_stats['total']['urls'] += old_bucket_stats['total']['urls']
144 |                 bucket_stats['total']['wire_bytes'] += old_bucket_stats['total']['wire_bytes']
145 |                 bucket_stats['revisit']['urls'] += old_bucket_stats['revisit']['urls']
146 |                 bucket_stats['revisit']['wire_bytes'] += old_bucket_stats['revisit']['wire_bytes']
147 |                 bucket_stats['new']['urls'] += old_bucket_stats['new']['urls']
148 |                 bucket_stats['new']['wire_bytes'] += old_bucket_stats['new']['wire_bytes']
149 | 
150 |             json_value = json.dumps(bucket_stats, separators=(',',':'))
151 |             conn.execute(
152 |                     'insert or replace into buckets_of_stats '
153 |                     '(bucket, stats) values (?, ?)', (bucket, json_value))
154 |             conn.commit()
155 |         conn.close()
156 | 
157 |     def _tally_batch(self, batch):
158 |         batch_buckets = {}
159 |         for recorded_url in batch:
160 |             if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
161 |                 continue
162 |             for bucket in self.buckets(recorded_url):
163 |                 bucket_stats = batch_buckets.get(bucket)
164 |                 if not bucket_stats:
165 |                     bucket_stats = _empty_bucket(bucket)
166 |                     batch_buckets[bucket] = bucket_stats
167 | 
168 |                 bucket_stats["total"]["urls"] += 1
169 |                 bucket_stats["total"]["wire_bytes"] += recorded_url.size
170 | 
171 |                 if recorded_url.warc_records:
172 |                     if recorded_url.warc_records[0].type == b'revisit':
173 |                         bucket_stats["revisit"]["urls"] += 1
174 |                         bucket_stats["revisit"]["wire_bytes"] += recorded_url.size
175 |                     else:
176 |                         bucket_stats["new"]["urls"] += 1
177 |                         bucket_stats["new"]["wire_bytes"] += recorded_url.size
178 |         return batch_buckets
179 | 
180 |     def value(self, bucket0="__all__", bucket1=None, bucket2=None):
181 |         conn = sqlite3.connect(self.options.stats_db_file)
182 |         cursor = conn.execute(
183 |                 'select stats from buckets_of_stats where bucket = ?',
184 |                 (bucket0,))
185 |         result_tuple = cursor.fetchone()
186 |         conn.close()
187 |         if result_tuple:
188 |             bucket0_stats = json.loads(result_tuple[0])
189 |             if bucket1:
190 |                 if bucket2:
191 |                     return bucket0_stats[bucket1][bucket2]
192 |                 else:
193 |                     return bucket0_stats[bucket1]
194 |             else:
195 |                 return bucket0_stats
196 |         else:
197 |             return None
198 | 
199 |     def buckets(self, recorded_url):
200 |         return unravel_buckets(recorded_url.url, recorded_url.warcprox_meta)
201 | 
202 | class RethinkStatsProcessor(StatsProcessor):
203 |     logger = logging.getLogger("warcprox.stats.RethinkStatsProcessor")
204 | 
205 |     def __init__(self, options=warcprox.Options()):
206 |         StatsProcessor.__init__(self, options)
207 | 
208 |         parsed = doublethink.parse_rethinkdb_url(options.rethinkdb_stats_url)
209 |         self.rr = doublethink.Rethinker(
210 |                 servers=parsed.hosts, db=parsed.database)
211 |         self.table = parsed.table
212 |         self.replicas = min(3, len(self.rr.servers))
213 | 
214 |     def _startup(self):
215 |         self._ensure_db_table()
216 | 
217 |     def _ensure_db_table(self):
218 |         dbs = self.rr.db_list().run()
219 |         if self.rr.dbname not in dbs:
220 |             self.logger.info(
221 |                     "creating rethinkdb database %r", self.rr.dbname)
222 |             self.rr.db_create(self.rr.dbname).run()
223 |         tables = self.rr.table_list().run()
224 |         if self.table not in tables:
225 |             self.logger.info(
226 |                     "creating rethinkdb table %r in database %r shards=%r "
227 |                     "replicas=%r", self.table, self.rr.dbname, 1,
228 |                     self.replicas)
229 |             self.rr.table_create(
230 |                     self.table, primary_key="bucket", shards=1,
231 |                     replicas=self.replicas).run()
232 | 
233 |     def _update_db(self, batch_buckets):
234 |         # XXX can all the buckets be done in one query?
235 |         for bucket in batch_buckets:
236 |             result = self._bucket_batch_update_reql(
237 |                     bucket, batch_buckets[bucket]).run()
238 |             if (not result['inserted'] and not result['replaced']
239 |                     or sorted(result.values()) != [0,0,0,0,0,1]):
240 |                 self.logger.error(
241 |                         'unexpected result {} updating stats {}'.format(
242 |                             result, batch_buckets[bucket]))
243 | 
244 |     def _bucket_batch_update_reql(self, bucket, new):
245 |         return self.rr.table(self.table).get(bucket).replace(
246 |                 lambda old: r.branch(
247 |                     old.eq(None), new, old.merge({
248 |                         'total': {
249 |                             'urls': old['total']['urls'].add(new['total']['urls']),
250 |                             'wire_bytes': old['total']['wire_bytes'].add(
251 |                                 new['total']['wire_bytes']),
252 |                         },
253 |                         'new': {
254 |                             'urls': old['new']['urls'].add(new['new']['urls']),
255 |                             'wire_bytes': old['new']['wire_bytes'].add(
256 |                                 new['new']['wire_bytes']),
257 |                         },
258 |                         'revisit': {
259 |                             'urls': old['revisit']['urls'].add(
260 |                                 new['revisit']['urls']),
261 |                             'wire_bytes': old['revisit']['wire_bytes'].add(
262 |                                 new['revisit']['wire_bytes']),
263 |                         },
264 |                     })))
265 | 
266 |     def value(self, bucket0="__all__", bucket1=None, bucket2=None):
267 |         bucket0_stats = self.rr.table(self.table).get(bucket0).run()
268 |         self.logger.debug(
269 |                 'stats db lookup of bucket=%s returned %s',
270 |                 bucket0, bucket0_stats)
271 |         if bucket0_stats:
272 |             if bucket1:
273 |                 if bucket2:
274 |                     return bucket0_stats[bucket1][bucket2]
275 |                 else:
276 |                     return bucket0_stats[bucket1]
277 |         return bucket0_stats
278 | 
279 | class RunningStats:
280 |     '''
281 |     In-memory stats for measuring overall warcprox performance.
282 |     '''
283 |     def __init__(self):
284 |         self.urls = 0
285 |         self.warc_bytes = 0
286 |         self._lock = threading.RLock()
287 |         self.first_snap_time = time.time()
288 |         # snapshot every minute since the beginning of time
289 |         self.minute_snaps = [(self.first_snap_time, 0, 0)]
290 |         # snapshot every 10 seconds for the last 2 minutes (fill with zeroes)
291 |         self.ten_sec_snaps = collections.deque()
292 |         for i in range(0, 13):
293 |             self.ten_sec_snaps.append(
294 |                     (self.first_snap_time - 120 + i * 10, 0, 0))
295 | 
296 |     def notify(self, recorded_url, records):
297 |         if isinstance(recorded_url, warcprox.warcproxy.FailedUrl):
298 |             return
299 |         with self._lock:
300 |             self.urls += 1
301 |             if records:
302 |                 self.warc_bytes += records[-1].offset + records[-1].length - records[0].offset
303 | 
304 |     def snap(self):
305 |         now = time.time()
306 |         last_snap_time = self.minute_snaps[-1][0]
307 |         need_minute_snap = (now - self.first_snap_time) // 60 > (self.minute_snaps[-1][0] - self.first_snap_time) // 60
308 |         need_ten_sec_snap = (now - self.ten_sec_snaps[0][0]) // 10 > (self.ten_sec_snaps[-1][0] - self.ten_sec_snaps[0][0]) // 10
309 |         if need_minute_snap:
310 |             self.minute_snaps.append((now, self.urls, self.warc_bytes))
311 |         if need_ten_sec_snap:
312 |             self.ten_sec_snaps.popleft()
313 |             self.ten_sec_snaps.append((now, self.urls, self.warc_bytes))
314 | 
315 |     def _closest_ten_sec_snap(self, t):
316 |         # it's a deque so iterating over it is faster than indexed lookup
317 |         closest_snap = (0, 0, 0)
318 |         for snap in self.ten_sec_snaps:
319 |             if abs(t - snap[0]) < abs(t - closest_snap[0]):
320 |                 closest_snap = snap
321 |         return closest_snap
322 | 
323 |     def _closest_minute_snap(self, t):
324 |         minutes_ago = int((time.time() - t) / 60)
325 |         # jump to approximately where we expect the closest snap
326 |         i = max(0, len(self.minute_snaps) - minutes_ago)
327 |         # move back to the last one earlier than `t`
328 |         while self.minute_snaps[i][0] > t and i > 0:
329 |             i -= 1
330 |         closest_snap = self.minute_snaps[i]
331 |         # move forward until we start getting farther away from `t`
332 |         while i < len(self.minute_snaps):
333 |             if abs(t - self.minute_snaps[i][0]) <= abs(t - closest_snap[0]):
334 |                 closest_snap = self.minute_snaps[i]
335 |             else:
336 |                 break
337 |             i += 1
338 |         return closest_snap
339 | 
340 |     def current_rates(self, time_period_minutes):
341 |         assert time_period_minutes > 0
342 |         with self._lock:
343 |             now = time.time()
344 |             urls = self.urls
345 |             warc_bytes = self.warc_bytes
346 | 
347 |         t = now - time_period_minutes * 60
348 |         if time_period_minutes <= 2:
349 |             start_snap = self._closest_ten_sec_snap(t)
350 |         else:
351 |             start_snap = self._closest_minute_snap(t)
352 | 
353 |         elapsed = now - start_snap[0]
354 |         logging.trace(
355 |                 'elapsed=%0.1fs urls=%s warc_bytes=%s', elapsed,
356 |                 urls - start_snap[1], warc_bytes - start_snap[2])
357 |         return elapsed, (urls - start_snap[1]) / elapsed, (warc_bytes - start_snap[2]) / elapsed
358 | 
--------------------------------------------------------------------------------
/warcprox/warc.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | warcprox/warc.py - assembles warc records
  3 | 
  4 | Copyright (C) 2013-2018 Internet Archive
  5 | 
  6 | This program is free software; you can redistribute it and/or
  7 | modify it under the terms of the GNU General Public License
  8 | as published by the Free Software Foundation; either version 2
  9 | of the License, or (at your option) any later version.
 10 | 
 11 | This program is distributed in the hope that it will be useful,
 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with this program; if not, write to the Free Software 18 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 19 | USA. 20 | ''' 21 | 22 | import logging 23 | import warcprox 24 | import hashlib 25 | import socket 26 | import hanzo.httptools 27 | from hanzo import warctools 28 | import datetime 29 | 30 | class WarcRecordBuilder: 31 | logger = logging.getLogger("warcprox.warc.WarcRecordBuilder") 32 | 33 | def __init__(self, digest_algorithm="sha1", base32=False): 34 | self.digest_algorithm = digest_algorithm 35 | self.base32 = base32 36 | 37 | def format_warc_date(self, dt): 38 | return dt.strftime('%Y-%m-%dT%H:%M:%SZ').encode('ascii') 39 | 40 | def _build_response_principal_record(self, recorded_url, warc_date): 41 | """Builds response or revisit record, whichever is appropriate.""" 42 | if hasattr(recorded_url, "dedup_info") and recorded_url.dedup_info: 43 | # revisit record 44 | recorded_url.response_recorder.tempfile.seek(0) 45 | if recorded_url.response_recorder.payload_offset is not None: 46 | response_header_block = recorded_url.response_recorder.tempfile.read(recorded_url.response_recorder.payload_offset) 47 | else: 48 | response_header_block = recorded_url.response_recorder.tempfile.read() 49 | 50 | return self.build_warc_record( 51 | url=recorded_url.url, warc_date=warc_date, 52 | data=response_header_block, 53 | warc_type=warctools.WarcRecord.REVISIT, 54 | refers_to=recorded_url.dedup_info.get('id'), 55 | refers_to_target_uri=recorded_url.dedup_info['url'], 56 | refers_to_date=recorded_url.dedup_info['date'], 57 | payload_digest=warcprox.digest_str( 58 | recorded_url.payload_digest, self.base32), 59 | profile=warctools.WarcRecord.PROFILE_IDENTICAL_PAYLOAD_DIGEST, 60 | content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, 61 | remote_ip=recorded_url.remote_ip) 62 | else: 63 | # response 
record 64 | return self.build_warc_record( 65 | url=recorded_url.url, warc_date=warc_date, 66 | recorder=recorded_url.response_recorder, 67 | warc_type=warctools.WarcRecord.RESPONSE, 68 | content_type=hanzo.httptools.ResponseMessage.CONTENT_TYPE, 69 | remote_ip=recorded_url.remote_ip, 70 | payload_digest=warcprox.digest_str( 71 | recorded_url.payload_digest, self.base32), 72 | truncated=recorded_url.truncated) 73 | 74 | def build_warc_records(self, recorded_url): 75 | """Returns a tuple of hanzo.warctools.warc.WarcRecord (principal_record, ...)""" 76 | warc_date = self.format_warc_date(recorded_url.timestamp) 77 | 78 | if recorded_url.response_recorder: 79 | principal_record = self._build_response_principal_record(recorded_url, warc_date) 80 | request_record = self.build_warc_record(url=recorded_url.url, 81 | warc_date=warc_date, data=recorded_url.request_data, 82 | warc_type=warctools.WarcRecord.REQUEST, 83 | content_type=hanzo.httptools.RequestMessage.CONTENT_TYPE, 84 | concurrent_to=principal_record.id) 85 | return principal_record, request_record 86 | else: 87 | principal_record = self.build_warc_record( 88 | url=recorded_url.url, 89 | warc_date=warc_date, data=recorded_url.request_data, 90 | warc_type=recorded_url.custom_type, 91 | content_type=recorded_url.content_type.encode("latin1"), 92 | payload_digest=warcprox.digest_str( 93 | recorded_url.payload_digest, self.base32), 94 | content_length=recorded_url.size) 95 | return (principal_record,) 96 | 97 | def build_warc_record(self, url, warc_date=None, recorder=None, data=None, 98 | concurrent_to=None, warc_type=None, content_type=None, remote_ip=None, 99 | profile=None, refers_to=None, refers_to_target_uri=None, 100 | refers_to_date=None, payload_digest=None, truncated=None, 101 | content_length=None): 102 | 103 | if warc_date is None: 104 | warc_date = self.format_warc_date(datetime.datetime.utcnow()) 105 | 106 | record_id = warctools.WarcRecord.random_warc_uuid() 107 | 108 | headers = [] 109 | if warc_type 
is not None: 110 | headers.append((warctools.WarcRecord.TYPE, warc_type)) 111 | headers.append((warctools.WarcRecord.ID, record_id)) 112 | headers.append((warctools.WarcRecord.DATE, warc_date)) 113 | headers.append((warctools.WarcRecord.URL, url)) 114 | if remote_ip is not None: 115 | headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip)) 116 | if profile is not None: 117 | headers.append((warctools.WarcRecord.PROFILE, profile)) 118 | if refers_to is not None: 119 | headers.append((warctools.WarcRecord.REFERS_TO, refers_to)) 120 | if refers_to_target_uri is not None: 121 | headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri)) 122 | if refers_to_date is not None: 123 | headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date)) 124 | if concurrent_to is not None: 125 | headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to)) 126 | if content_type is not None: 127 | headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type)) 128 | # truncated value may be 'length' or 'time' 129 | if truncated is not None: 130 | headers.append((b'WARC-Truncated', truncated)) 131 | if content_length is not None: 132 | headers.append(( 133 | warctools.WarcRecord.CONTENT_LENGTH, 134 | str(content_length).encode('latin1'))) 135 | 136 | if recorder is not None: 137 | if payload_digest is not None: 138 | headers.append( 139 | (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) 140 | if content_length is None: 141 | headers.append(( 142 | warctools.WarcRecord.CONTENT_LENGTH, 143 | str(len(recorder)).encode('latin1'))) 144 | headers.append((warctools.WarcRecord.BLOCK_DIGEST, 145 | warcprox.digest_str(recorder.block_digest, self.base32))) 146 | recorder.tempfile.seek(0) 147 | record = warctools.WarcRecord( 148 | headers=headers, content_file=recorder.tempfile) 149 | else: 150 | if content_length is None: 151 | headers.append(( 152 | warctools.WarcRecord.CONTENT_LENGTH, 153 | str(len(data)).encode('latin1'))) 154 | 155 | 
block_digest = None 156 | if not hasattr(data, 'read'): 157 | block_digest = warcprox.digest_str( 158 | hashlib.new(self.digest_algorithm, data), self.base32) 159 | 160 | if not content_type.lower().startswith(b'application/http'): 161 | # no http headers, so block digest == payload digest 162 | if payload_digest and not block_digest: 163 | block_digest = payload_digest 164 | elif block_digest and not payload_digest: 165 | payload_digest = block_digest 166 | 167 | if block_digest: 168 | headers.append( 169 | (warctools.WarcRecord.BLOCK_DIGEST, block_digest)) 170 | if payload_digest: 171 | headers.append( 172 | (warctools.WarcRecord.PAYLOAD_DIGEST, payload_digest)) 173 | 174 | if hasattr(data, 'read'): 175 | record = warctools.WarcRecord( 176 | headers=headers, content_file=data) 177 | else: 178 | # in-memory content is passed as a (content_type, data) tuple 179 | record = warctools.WarcRecord( 180 | headers=headers, content=(content_type, data)) 181 | 182 | return record 183 | 184 | def _local_address(self): 185 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 186 | s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable 187 | output = s.getsockname()[0] 188 | s.close() 189 | return output 190 | 191 | def build_warcinfo_record(self, filename): 192 | warc_record_date = self.format_warc_date(datetime.datetime.utcnow()) 193 | record_id = warctools.WarcRecord.random_warc_uuid() 194 | 195 | headers = [] 196 | headers.append((warctools.WarcRecord.ID, record_id)) 197 | headers.append((warctools.WarcRecord.TYPE, warctools.WarcRecord.WARCINFO)) 198 | headers.append((warctools.WarcRecord.FILENAME, filename.encode('latin1'))) 199 | headers.append((warctools.WarcRecord.DATE, warc_record_date)) 200 | 201 | warcinfo_fields = [] 202 | warcinfo_fields.append(b'software: warcprox ' + warcprox.__version__.encode('latin1')) 203 | hostname = socket.gethostname() 204 | warcinfo_fields.append('hostname: {}'.format(hostname).encode('latin1')) 205 | warcinfo_fields.append(('ip: %s' %
self._local_address()).encode('latin1')) 206 | warcinfo_fields.append(b'format: WARC File Format 1.0') 207 | # warcinfo_fields.append('robots: ignore') 208 | # warcinfo_fields.append('description: {0}'.format(self.description)) 209 | # warcinfo_fields.append('isPartOf: {0}'.format(self.is_part_of)) 210 | data = b'\r\n'.join(warcinfo_fields) + b'\r\n' 211 | 212 | record = warctools.WarcRecord(headers=headers, content=(b'application/warc-fields', data)) 213 | 214 | return record 215 | 216 | -------------------------------------------------------------------------------- /warcprox/writer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | warcprox/writer.py - warc writer, manages and writes records to warc files 3 | 4 | Copyright (C) 2013-2019 Internet Archive 5 | 6 | This program is free software; you can redistribute it and/or 7 | modify it under the terms of the GNU General Public License 8 | as published by the Free Software Foundation; either version 2 9 | of the License, or (at your option) any later version. 10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License 17 | along with this program; if not, write to the Free Software 18 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 19 | USA. 20 | ''' 21 | import logging 22 | from hanzo import warctools 23 | import fcntl 24 | import time 25 | import warcprox 26 | import os 27 | import socket 28 | import random 29 | 30 | class WarcWriter: 31 | ''' 32 | A writer for one warc prefix, which rolls over to new warc file, 33 | incrementing serial number, when size limit is hit. Should only be used 34 | from one thread. 
35 | ''' 36 | logger = logging.getLogger('warcprox.writer.WarcWriter') 37 | 38 | def __init__(self, options=warcprox.Options()): 39 | self.options = options 40 | 41 | self.gzip = options.gzip or False 42 | self.record_builder = warcprox.warc.WarcRecordBuilder( 43 | digest_algorithm=options.digest_algorithm or 'sha1', 44 | base32=options.base32) 45 | 46 | self.f = None 47 | self.path = None 48 | self.finalname = None 49 | 50 | self.prefix = options.prefix or 'warcprox' 51 | self.port = options.port or 8000 52 | self.open_suffix = '' if options.no_warc_open_suffix else '.open' 53 | self.rollover_size = options.rollover_size or 1000000000 54 | self.rollover_idle_time = options.rollover_idle_time or None 55 | if options.subdir_prefix and options.prefix: 56 | self.directory = os.path.sep.join([options.directory, options.prefix]) or './warcs' 57 | else: 58 | self.directory = options.directory or './warcs' 59 | self.filename_template = options.warc_filename or \ 60 | '{prefix}-{timestamp17}-{randomtoken}-{serialno}' 61 | self.last_activity = time.time() 62 | self.serial = 0 63 | self.randomtoken = ''.join( 64 | random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8)) 65 | 66 | # the default filename template above matches heritrix3's 67 | def filename(self, serial): 68 | """WARC filename is configurable with CLI parameter --warc-filename. 69 | Default: '{prefix}-{timestamp17}-{randomtoken}-{serialno}' 70 | Available variables are: prefix, timestamp14, timestamp17, serialno, 71 | randomtoken, hostname, shorthostname, port. 72 | Extension ``.warc`` or ``.warc.gz`` is appended automatically.
73 | """ 74 | hostname = socket.getfqdn() 75 | shorthostname = hostname.split('.')[0] 76 | fname = self.filename_template.format( 77 | prefix=self.prefix, timestamp14=warcprox.timestamp14(), 78 | timestamp17=warcprox.timestamp17(), 79 | serialno='{:05d}'.format(serial), 80 | randomtoken=self.randomtoken, hostname=hostname, 81 | shorthostname=shorthostname, port=self.port) 82 | if self.gzip: 83 | fname = fname + '.warc.gz' 84 | else: 85 | fname = fname + '.warc' 86 | return fname 87 | 88 | def open(self, serial): 89 | ''' 90 | Opens a new warc file with filename prefix `self.prefix` and serial 91 | number `self.serial` and assigns file handle to `self.f`. 92 | ''' 93 | if not os.path.exists(self.directory): 94 | self.logger.info( 95 | "warc destination directory %s doesn't exist, creating it", 96 | self.directory) 97 | os.mkdir(self.directory) 98 | 99 | self.finalname = self.filename(serial) 100 | self.logger.trace('opening %s', self.finalname) 101 | self.path = os.path.sep.join( 102 | [self.directory, self.finalname + self.open_suffix]) 103 | 104 | self.f = open(self.path, 'wb') 105 | # if no '.open' suffix is used for WARC, acquire an exclusive 106 | # file lock. 107 | if self.open_suffix == '': 108 | try: 109 | fcntl.lockf(self.f, fcntl.LOCK_EX | fcntl.LOCK_NB) 110 | except OSError as exc: 111 | self.logger.error( 112 | 'could not lock file %s (%s)', self.path, exc) 113 | return self.f 114 | 115 | def ensure_open(self): 116 | ''' 117 | Ensures `self.f` is ready to write the next warc record. 118 | 119 | If warc is not open, opens one, and writes the warcinfo record. 
120 | ''' 121 | if not self.f: 122 | serial = self.serial 123 | self.serial += 1 124 | self.open(serial) 125 | warcinfo = self.record_builder.build_warcinfo_record(self.finalname) 126 | self.logger.debug('warcinfo.headers=%s', warcinfo.headers) 127 | warcinfo.write_to(self.f, gzip=self.gzip) 128 | 129 | def write_records(self, recorded_url): 130 | ''' 131 | Returns tuple of records written, which are instances of 132 | `hanzo.warctools.warc.WarcRecord`, decorated with `warc_filename` and 133 | `offset` attributes. 134 | ''' 135 | records = self.record_builder.build_warc_records(recorded_url) 136 | 137 | self.ensure_open() 138 | total_warc_file_size = None 139 | for record in records: 140 | offset = self.f.tell() 141 | record.write_to(self.f, gzip=self.gzip) 142 | record.offset = offset 143 | offset2 = self.f.tell() 144 | record.length = offset2 - offset 145 | total_warc_file_size = offset2 146 | record.warc_filename = self.finalname 147 | self.logger.trace( 148 | 'wrote warc record: warc_type=%s content_length=%s ' 149 | 'digest=%s offset=%d warc=%s url=%s', record.type, 150 | record.get_header(warctools.WarcRecord.CONTENT_LENGTH), 151 | record.get_header(b'WARC-Payload-Digest'), record.offset, 152 | self.path, record.get_header(warctools.WarcRecord.URL)) 153 | self.f.flush() 154 | self.last_activity = time.time() 155 | # Closes current warc if size limit has been reached. 156 | self.maybe_size_rollover(total_warc_file_size) 157 | return records 158 | 159 | def close(self): 160 | ''' 161 | Closes out the active warc. 162 | 163 | The next call to `write_records()` will write to a new warc file with 164 | the serial number incremented.
165 | ''' 166 | if self.path: 167 | self.logger.trace('closing %s', self.finalname) 168 | if self.open_suffix == '': 169 | try: 170 | fcntl.lockf(self.f, fcntl.LOCK_UN) 171 | except Exception as exc: 172 | self.logger.error( 173 | 'could not unlock file %s (%s)', self.path, exc) 174 | try: 175 | self.f.close() 176 | finalpath = os.path.sep.join( 177 | [self.directory, self.finalname]) 178 | os.rename(self.path, finalpath) 179 | except Exception as exc: 180 | self.logger.error( 181 | 'could not close and rename file %s (%s)', self.path, exc) 182 | self.path = None 183 | self.f = None 184 | 185 | def maybe_idle_rollover(self): 186 | if (self.path and self.rollover_idle_time 187 | and self.rollover_idle_time > 0 188 | and time.time() - self.last_activity > self.rollover_idle_time): 189 | self.logger.info( 190 | 'rolling over %s after %0.1f seconds idle', 191 | self.finalname, time.time() - self.last_activity) 192 | self.close() 193 | 194 | def maybe_size_rollover(self, total_warc_file_size): 195 | if total_warc_file_size and total_warc_file_size > self.rollover_size: 196 | self.logger.info( 197 | 'rolling over %s because it has reached %s bytes in size', 198 | self.finalname, total_warc_file_size) 199 | self.close() 200 | 201 | class WarcWriterPool: 202 | ''' 203 | A `WarcWriter` per warc prefix. Should only be used from one thread. 
204 | ''' 205 | logger = logging.getLogger("warcprox.writer.WarcWriterPool") 206 | 207 | def __init__(self, options=warcprox.Options()): 208 | self.default_warc_writer = WarcWriter(options) 209 | self.warc_writers = {} # {prefix:WarcWriter} 210 | self.options = options 211 | self._last_maybe = time.time() 212 | 213 | # chooses writer for filename specified by warcprox_meta["warc-prefix"] if set 214 | def _writer(self, recorded_url): 215 | w = self.default_warc_writer 216 | if recorded_url.warcprox_meta and "warc-prefix" in recorded_url.warcprox_meta: 217 | # self.logger.info("recorded_url.warcprox_meta={} for {}".format(recorded_url.warcprox_meta, recorded_url.url)) 218 | options = warcprox.Options(**vars(self.options)) 219 | options.prefix = recorded_url.warcprox_meta["warc-prefix"] 220 | if options.prefix not in self.warc_writers: 221 | self.warc_writers[options.prefix] = WarcWriter(options) 222 | w = self.warc_writers[options.prefix] 223 | return w 224 | 225 | def write_records(self, recorded_url): 226 | ''' 227 | Returns tuple of records written, which are instances of 228 | `hanzo.warctools.warc.WarcRecord`, decorated with `warc_filename` and 229 | `offset` attributes. 230 | ''' 231 | return self._writer(recorded_url).write_records(recorded_url) 232 | 233 | def maybe_idle_rollover(self): 234 | if time.time() - self._last_maybe > 20: 235 | self.default_warc_writer.maybe_idle_rollover() 236 | for w in self.warc_writers.values(): 237 | w.maybe_idle_rollover() 238 | self._last_maybe = time.time() 239 | 240 | def close_writers(self): 241 | self.default_warc_writer.close() 242 | for prefix, writer in list(self.warc_writers.items()): 243 | del self.warc_writers[prefix] 244 | writer.close() 245 | 246 | def close_for_prefix(self, prefix=None): 247 | ''' 248 | Close warc writer for the given warc prefix, or the default prefix if 249 | `prefix` is `None`.
250 | ''' 251 | if prefix and prefix in self.warc_writers: 252 | writer = self.warc_writers[prefix] 253 | del self.warc_writers[prefix] 254 | writer.close() 255 | else: 256 | self.default_warc_writer.close() 257 | 258 | -------------------------------------------------------------------------------- /warcprox/writerthread.py: -------------------------------------------------------------------------------- 1 | """ 2 | warcprox/writerthread.py - warc writer thread, reads from the recorded url 3 | queue, writes warc records, runs final tasks after warc records are written 4 | 5 | Copyright (C) 2013-2019 Internet Archive 6 | 7 | This program is free software; you can redistribute it and/or 8 | modify it under the terms of the GNU General Public License 9 | as published by the Free Software Foundation; either version 2 10 | of the License, or (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program; if not, write to the Free Software 19 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, 20 | USA. 
21 | """ 22 | import logging 23 | from datetime import datetime 24 | import queue 25 | import warcprox 26 | 27 | class WarcWriterProcessor(warcprox.BaseStandardPostfetchProcessor): 28 | logger = logging.getLogger("warcprox.writerthread.WarcWriterProcessor") 29 | 30 | _ALWAYS_ACCEPT = {'WARCPROX_WRITE_RECORD'} 31 | 32 | def __init__(self, options=warcprox.Options()): 33 | warcprox.BaseStandardPostfetchProcessor.__init__(self, options=options) 34 | self.writer_pool = warcprox.writer.WarcWriterPool(options) 35 | self.method_filter = {method.upper() for method in self.options.method_filter or []} 36 | self.blackout_period = options.blackout_period or 0 37 | self.close_prefix_reqs = queue.Queue() 38 | 39 | def _get_process_put(self): 40 | while True: 41 | try: 42 | prefix = self.close_prefix_reqs.get_nowait() 43 | self.writer_pool.close_for_prefix(prefix) 44 | except queue.Empty: 45 | break 46 | self.writer_pool.maybe_idle_rollover() 47 | super()._get_process_put() 48 | 49 | def close_for_prefix(self, prefix=None): 50 | ''' 51 | Request close of warc writer for the given warc prefix, or the default 52 | prefix if `prefix` is `None`. 53 | 54 | This API exists so that some code from outside of warcprox proper (in a 55 | third-party plugin for example) can close open warcs promptly when it 56 | knows they are finished. 
57 | ''' 58 | self.close_prefix_reqs.put(prefix) 59 | 60 | def _process_url(self, recorded_url): 61 | if isinstance(recorded_url, warcprox.warcproxy.FailedUrl): 62 | return 63 | try: 64 | records = [] 65 | if self._should_archive(recorded_url): 66 | records = self.writer_pool.write_records(recorded_url) 67 | recorded_url.warc_records = records 68 | self._log(recorded_url, records) 69 | # try to release resources in a timely fashion 70 | if recorded_url.response_recorder and recorded_url.response_recorder.tempfile: 71 | recorded_url.response_recorder.tempfile.close() 72 | except Exception: 73 | self.logger.error( 74 | 'caught exception processing %s', recorded_url.url, 75 | exc_info=True) 76 | 77 | def _filter_accepts(self, recorded_url): 78 | if not self.method_filter: 79 | return True 80 | meth = recorded_url.method.upper() 81 | return meth in self._ALWAYS_ACCEPT or meth in self.method_filter 82 | 83 | # XXX optimize handling of urls not to be archived throughout warcprox 84 | def _should_archive(self, recorded_url): 85 | prefix = (recorded_url.warcprox_meta['warc-prefix'] 86 | if recorded_url.warcprox_meta 87 | and 'warc-prefix' in recorded_url.warcprox_meta 88 | else self.options.prefix) 89 | # special warc name prefix '-' means "don't archive" 90 | return (prefix != '-' and not recorded_url.do_not_archive 91 | and self._filter_accepts(recorded_url) 92 | and not self._in_blackout(recorded_url)) 93 | 94 | def _in_blackout(self, recorded_url): 95 | """If --blackout-period=N (sec) is set, check if duplicate record 96 | datetime is close to the original. If yes, we don't write it to WARC. 97 | The aim is to avoid having unnecessary `revisit` records.
98 | Returns a boolean. 99 | """ 100 | if self.blackout_period and hasattr(recorded_url, "dedup_info") and \ 101 | recorded_url.dedup_info: 102 | dedup_date = recorded_url.dedup_info.get('date') 103 | if dedup_date and recorded_url.dedup_info.get('url') == recorded_url.url: 104 | try: 105 | dt = datetime.strptime(dedup_date.decode('utf-8'), 106 | '%Y-%m-%dT%H:%M:%SZ') 107 | return (datetime.utcnow() - dt).total_seconds() <= self.blackout_period 108 | except ValueError: 109 | return False 110 | return False 111 | 112 | def _log(self, recorded_url, records): 113 | # 2015-07-17T22:32:23.672Z 1 58 dns:www.dhss.delaware.gov P http://www.dhss.delaware.gov/dhss/ text/dns #045 20150717223214881+316 sha1:63UTPB7GTWIHAGIK3WWL76E57BBTJGAK http://www.dhss.delaware.gov/dhss/ - {"warcFileOffset":2964,"warcFilename":"ARCHIVEIT-1303-WEEKLY-JOB165158-20150717223222113-00000.warc.gz"} 114 | try: 115 | payload_digest = records[0].get_header(b'WARC-Payload-Digest').decode('utf-8') 116 | except (IndexError, AttributeError): 117 | payload_digest = '-' 118 | type_ = records[0].type.decode('utf-8') if records else '-' 119 | filename = records[0].warc_filename if records else '-' 120 | offset = records[0].offset if records else '-' 121 | self.logger.info( 122 | '%s %s %s %s %s size=%s %s %s %s offset=%s', 123 | recorded_url.client_ip, recorded_url.status, 124 | recorded_url.method, recorded_url.url.decode('utf-8'), 125 | recorded_url.mimetype, recorded_url.size, payload_digest, 126 | type_, filename, offset) 127 | 128 | def _shutdown(self): 129 | self.writer_pool.close_writers() 130 | 131 | --------------------------------------------------------------------------------
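The snapshot-based rate computation in `RunningStats` (warcprox/stats.py above) can be illustrated with a pared-down, standalone sketch. `MiniRunningStats` here is a hypothetical simplification, not part of warcprox: it keeps a single snapshot list instead of the minute/ten-second split, but measures rates the same way, against the snapshot closest to the start of the requested window:

```python
import time

class MiniRunningStats:
    # hypothetical simplification of warcprox.stats.RunningStats
    def __init__(self, now=None):
        now = time.time() if now is None else now
        self.urls = 0
        self.warc_bytes = 0
        self.snaps = [(now, 0, 0)]  # (time, urls, warc_bytes)

    def notify(self, n_urls, n_bytes):
        self.urls += n_urls
        self.warc_bytes += n_bytes

    def snap(self, now=None):
        now = time.time() if now is None else now
        self.snaps.append((now, self.urls, self.warc_bytes))

    def current_rates(self, window_secs, now=None):
        now = time.time() if now is None else now
        t = now - window_secs
        # measure against the snapshot closest to the window start
        start = min(self.snaps, key=lambda s: abs(t - s[0]))
        elapsed = now - start[0]
        return (elapsed,
                (self.urls - start[1]) / elapsed,
                (self.warc_bytes - start[2]) / elapsed)

stats = MiniRunningStats(now=0.0)
stats.notify(10, 1000)
stats.snap(now=60.0)
stats.notify(30, 3000)
stats.snap(now=120.0)
elapsed, url_rate, byte_rate = stats.current_rates(60, now=120.0)
# closest snap to t=60 is (60.0, 10, 1000):
# elapsed=60.0, url_rate=(40-10)/60=0.5, byte_rate=(4000-1000)/60=50.0
```

Measuring against a periodic snapshot rather than the process start is what makes the reported rate reflect recent activity instead of the lifetime average.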
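The `WarcWriter.filename()` docstring in warcprox/writer.py above lists the available template variables. The following standalone sketch shows how the default template expands; `timestamp17` and `make_filename` are local reimplementations for illustration, not warcprox's own functions:

```python
import random
from datetime import datetime, timezone

# default template from warcprox/writer.py
TEMPLATE = '{prefix}-{timestamp17}-{randomtoken}-{serialno}'

def timestamp17():
    # 17 digits: YYYYmmddHHMMSS plus milliseconds, e.g. 20150717223222113
    now = datetime.now(timezone.utc)
    return '{:%Y%m%d%H%M%S}{:03d}'.format(now, now.microsecond // 1000)

def make_filename(prefix='warcprox', serial=0, gzip=True):
    # 8-char token of lowercase letters and digits, like WarcWriter.randomtoken
    randomtoken = ''.join(
        random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
    fname = TEMPLATE.format(
        prefix=prefix, timestamp17=timestamp17(),
        randomtoken=randomtoken, serialno='{:05d}'.format(serial))
    return fname + ('.warc.gz' if gzip else '.warc')

name = make_filename(prefix='test', serial=3)
# e.g. 'test-20250101120000123-k3f0qz7a-00003.warc.gz'
```

The zero-padded five-digit serial number is what lets `WarcWriter` roll over to a new file (via `close()` then `ensure_open()`) while keeping filenames sortable.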
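The blackout-period check in `WarcWriterProcessor._in_blackout` (warcprox/writerthread.py above) boils down to a date comparison: skip writing a revisit record when the original capture of the same url is recent. This standalone sketch mirrors that logic without the `recorded_url` plumbing; `in_blackout` is a local helper for illustration:

```python
from datetime import datetime, timedelta

def in_blackout(dedup_date, blackout_period):
    # dedup_date: bytes like b'2015-07-17T22:32:23Z' (WARC-Date of the
    # original capture); True means the original is recent enough that
    # the revisit record should be skipped
    try:
        dt = datetime.strptime(
            dedup_date.decode('utf-8'), '%Y-%m-%dT%H:%M:%SZ')
    except ValueError:
        return False
    return (datetime.utcnow() - dt).total_seconds() <= blackout_period

recent = (datetime.utcnow() - timedelta(seconds=10)).strftime(
    '%Y-%m-%dT%H:%M:%SZ').encode('ascii')
old = b'2015-07-17T22:32:23Z'
# in_blackout(recent, 60) is True; in_blackout(old, 60) is False
```

As in the original, an unparseable date fails open (returns False), so a malformed dedup record never suppresses a write.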