├── .gitignore ├── .hgignore ├── .hgtags ├── .travis.yml ├── LICENSE ├── README.md ├── hanzo ├── arc2warc.py ├── httptools │ ├── __init__.py │ ├── messaging.py │ ├── semantics.py │ └── tests │ │ ├── __init__.py │ │ └── parse_test.py ├── warc2warc.py ├── warcdump.py ├── warcextract.py ├── warcfilter.py ├── warcindex.py ├── warclinks.py ├── warcpayload.py ├── warctools │ ├── __init__.py │ ├── arc.py │ ├── archive_detect.py │ ├── log.py │ ├── mixed.py │ ├── record.py │ ├── s3.py │ ├── stream.py │ ├── tests │ │ ├── __init__.py │ │ └── test_warctools.py │ └── warc.py └── warcvalid.py ├── make-deb.sh ├── pylint.rc ├── pyproject.toml ├── tox.ini ├── uv.lock └── warcunpack_ia.py /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /dist 3 | __pycache__ 4 | /warctools.egg-info 5 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | *.swp 3 | *.log 4 | *.pyc 5 | *.pyo 6 | *.warc 7 | *.gz 8 | login.txt 9 | .DS_Store 10 | build/* 11 | dist/* 12 | hanzo_warc_tools.egg-info/* 13 | *~ 14 | *.orig 15 | debian/* 16 | *.deb 17 | test-reports/* 18 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | 58d7d99406b04e7c36bfba1c91e2b06f558c22ee hanzo-4.0-rc0 2 | 764a52f90a951a8c4acc9c9f60f5d8321662d418 hanzo-4.0-rc1 3 | 94b65646332e5e86f3d274f66e38ce26cc30ccad hanzo-4.0 4 | 092e8d0615ecc5ace8b067edbeacd5e3b12c9be0 hanzo-4.1-rc0 5 | 8f64ab5556344065cd68e0cf8265af87e6b9d0cf hanzo-4.1-rc1 6 | 8ceff9fcde584ec577048dbd9a13743d31dfc74f hanzo-4.1-rc2 7 | f54be58d0d8b3aa47b3f935a732a7b5752f0e92e hanzo-4.1-rc4 8 | 0a1d728557b8d29b15b3796f83b6a9dc7f25abff build_success-2012-09-14T15-24-42.616660024 9 | 741fe327f233f936cd65c6e2c415cd01f9fc9871 
build_success-2012-09-14T16-25-56.483325901 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - 2.7 5 | - 3.2 6 | - 3.3 7 | - 3.4 8 | - 3.5 9 | - nightly 10 | - pypy 11 | - pypy3 12 | 13 | matrix: 14 | allow_failures: 15 | - python: 3.5 16 | - python: nightly 17 | 18 | script: python setup.py test 19 | 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2012 Hanzo Archives Ltd 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of this software and associated documentation files (the "Software"), 5 | to deal in the Software without restriction, including without limitation 6 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | and/or sell copies of the Software, and to permit persons to whom the 8 | Software is furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included 11 | in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 15 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 18 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Warctools 2 | ========= 3 | 4 | WARC (Web ARChive) file tools for python 2/3 based on the 5 | [WARC 1.0 spec](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/) 6 | and compatible with the Internet Archive's 7 | [ARC File Format](https://archive.org/web/researcher/ArcFileFormat.php) 8 | originally developed by Hanzo Archives. 9 | 10 | 11 | Install 12 | ------- 13 | 14 | ``` 15 | pip install warctools 16 | ``` 17 | 18 | 19 | Python Usage 20 | ------------ 21 | 22 | ``` 23 | from hanzo import warctools 24 | ``` 25 | 26 | 27 | Python Examples 28 | --------------- 29 | 30 | Write a WARC file: 31 | 32 | ``` 33 | import os 34 | 35 | from hanzo import warctools 36 | 37 | 38 | def write(): 39 | headers = [ 40 | (b'WARC-Type', b'warcinfo'), 41 | (b'WARC-Date', b'2019-11-19T23:08:51.182451Z'), 42 | (b'WARC-Filename', b'CRAWL-20191119230851-00000-hostname.warc.gz'), 43 | (b'WARC-Record-ID', b'') 44 | ] 45 | content_type = b'application/warc-fields' 46 | content = 'This\nis\nonly\na\ntest\n'.encode() 47 | fname = 'test.warc.gz' 48 | 49 | mode = 'ab' 50 | if not os.path.exists(fname): 51 | mode = 'wb' 52 | 53 | with open(fname, mode) as _fh: 54 | content = (content_type, content) 55 | record = warctools.WarcRecord(headers=headers, content=content) 56 | record.write_to(_fh, gzip="record") 57 | ``` 58 | 59 | 60 | Command-line Usage 61 | ------------------ 62 | 63 | ### warcvalid 64 | 65 | Returns 0 if the arguments are all valid W/ARC files, non-zero on 66 | error. 
67 | 68 | ``` 69 | [warctools] $ warcvalid -h 70 | Usage: warcvalid [options] warc warc warc 71 | 72 | Options: 73 | -h, --help show this help message and exit 74 | -l LIMIT, --limit=LIMIT 75 | -I INPUT_FORMAT, --input=INPUT_FORMAT 76 | -L LOG_LEVEL, --log-level=LOG_LEVEL 77 | ``` 78 | 79 | ### warcdump 80 | 81 | Writes human readable summary of warcfiles. Autodetects input format 82 | when filenames are passed, i.e recordgzip vs plaintext, WARC vs 83 | ARC. Assumes uncompressed warc on stdin if no args. 84 | 85 | ``` 86 | [warctools] $ warcdump -h 87 | Usage: warcdump [options] warc warc warc 88 | 89 | Options: 90 | -h, --help show this help message and exit 91 | -l LIMIT, --limit=LIMIT 92 | -I INPUT_FORMAT, --input=INPUT_FORMAT 93 | -L LOG_LEVEL, --log-level=LOG_LEVEL 94 | ``` 95 | 96 | ### warcfilter 97 | 98 | Searches all headers for regex pattern. Autodetects and stdin like 99 | warcdump. Prints out a WARC format by default. Use -i to invert 100 | search. Use -U to constrain to url. Use -T to constrain to record 101 | type. Use -C to constrain to content-type. 102 | 103 | ``` 104 | $ warcfilter -h 105 | Usage: warcfilter [options] pattern warc warc warc 106 | 107 | Options: 108 | -h, --help show this help message and exit 109 | -l LIMIT, --limit=LIMIT 110 | limit (ignored) 111 | -I INPUT_FORMAT, --input=INPUT_FORMAT 112 | input format (ignored) 113 | -i, --invert invert match 114 | -U, --url match on url 115 | -T, --type match on (warc) record type 116 | -C, --content-type match on (warc) record content type 117 | -H, --http-content-type 118 | match on http payload content type 119 | -D, --warc-date match on WARC-Date header 120 | -L LOG_LEVEL, --log-level=LOG_LEVEL 121 | log level(ignored) 122 | ``` 123 | 124 | ### warc2warc 125 | 126 | Autodetects compression on file args. Assumes uncompressed stdin if 127 | none. Use -Z to write compressed output, i.e warc2warc -Z input > 128 | input.gz. Should ignore buggy records in input. 
129 | 130 | ``` 131 | [warctools] $ warc2warc -h 132 | Usage: warc2warc [options] url (url ...) 133 | 134 | Options: 135 | -h, --help show this help message and exit 136 | -o OUTPUT, --output=OUTPUT 137 | output warc file 138 | -l LIMIT, --limit=LIMIT 139 | -I INPUT_FORMAT, --input=INPUT_FORMAT 140 | (ignored) 141 | -Z, --gzip compress output, record by record 142 | -D, --decode_http decode http messages (strip chunks, gzip) 143 | -L LOG_LEVEL, --log-level=LOG_LEVEL 144 | --wget-chunk-fix skip transfer-encoding headers in http records, when 145 | decoding them (-D) 146 | ``` 147 | 148 | ### arc2warc 149 | 150 | Creates a crappy WARC file from arc files on input. A handful of 151 | headers are preserved. Use -Z to write compressed output, i.e arc2warc 152 | -Z input.arc > input.warc.gz 153 | 154 | ``` 155 | [warctools] $ arc2warc -h 156 | Usage: arc2warc [options] arc (arc ...) 157 | 158 | Options: 159 | -h, --help show this help message and exit 160 | -o OUTPUT, --output=OUTPUT 161 | output warc file 162 | -l LIMIT, --limit=LIMIT 163 | -Z, --gzip compress 164 | -L LOG_LEVEL, --log-level=LOG_LEVEL 165 | --description=DESCRIPTION 166 | --operator=OPERATOR 167 | --publisher=PUBLISHER 168 | --audience=AUDIENCE 169 | --resource=RESOURCE 170 | --response=RESPONSE 171 | ``` 172 | 173 | ### warcindex 174 | 175 | DEPRECATED, use `CDX-writer` branch. 176 | 177 | ``` 178 | #WARC-filename offset warc-type warc-subject-uri warc-record-id content-type content-length 179 | warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=request 193 180 | warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=response 3279474 181 | ``` 182 | 183 | 184 | Notes 185 | ----- 186 | 187 | 1. arc2warc uses the conversion rules from the earlier arc2warc.c as a 188 | starter for converting the headers 189 | 2. 
I haven't profiled the code yet (and don't plan to until it falls 190 | over) 191 | 3. Warcvalid barely skirts some of the iso standard, missing things: 192 | * strict whitespace 193 | * required headers check 194 | * mime quoted printable header encoding 195 | * treating headers as utf8 196 | 197 | 198 | ToDo 199 | ---- 200 | 201 | 1. Lots more testing 202 | 2. Support pre-1.0 WARC files 203 | 3. Add more documentation 204 | 4. Support more commandline options for output and filenames 205 | 5. S3 urls 206 | 207 | 208 | Credits 209 | ------- 210 | 211 | Originally developed by "tef" `thomas.figg@hanzoarchives.com`. 212 | 213 | 214 | @internetarchive 215 | -------------------------------------------------------------------------------- /hanzo/arc2warc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """arc2warc - convert one arc to a new warc""" 3 | 4 | from __future__ import print_function 5 | 6 | import os 7 | import sys 8 | import hashlib 9 | import uuid 10 | 11 | import sys 12 | import os.path 13 | import datetime 14 | import socket 15 | 16 | from optparse import OptionParser 17 | 18 | from .warctools import ArcRecord,WarcRecord, MixedRecord, expand_files 19 | from .warctools.warc import warc_datetime_str 20 | 21 | from .httptools import ResponseMessage, RequestMessage 22 | 23 | parser = OptionParser(usage="%prog [options] arc (arc ...)") 24 | 25 | parser.add_option("-o", "--output", dest="output", 26 | help="output warc file") 27 | parser.add_option("-l", "--limit", dest="limit") 28 | parser.add_option("-Z", "--gzip", dest="gzip", action="store_true", help="compress") 29 | parser.add_option("-L", "--log-level", dest="log_level") 30 | parser.add_option("--description", dest="description") 31 | parser.add_option("--operator", dest="operator") 32 | parser.add_option("--publisher", dest="publisher") 33 | parser.add_option("--audience", dest="audience") 34 | parser.add_option("--resource", 
dest="resource", action="append") 35 | parser.add_option("--response", dest="response", action="append") 36 | 37 | parser.set_defaults( 38 | output_directory=None, limit=None, log_level="info", gzip=False, 39 | description="", operator="", publisher="", audience="", 40 | resource = [], response=[], 41 | 42 | ) 43 | 44 | def is_http_response(content): 45 | message = ResponseMessage(RequestMessage()) 46 | remainder = message.feed(content) 47 | message.close() 48 | return message.complete() and not remainder 49 | 50 | 51 | class ArcTransformer(object): 52 | def __init__(self, output_filename=None, warcinfo_fields=b'software: hanzo.arc2warc\r\n', resources=(), responses=()): 53 | self.warcinfo_id = None 54 | self.output_filename = output_filename 55 | self.version = b"WARC/1.0" 56 | self.warcinfo_fields = warcinfo_fields 57 | self.resources = resources 58 | self.responses = responses 59 | 60 | @staticmethod 61 | def make_warc_uuid(text): 62 | return (""%uuid.UUID(hashlib.sha1(text).hexdigest()[0:32])).encode('ascii') 63 | 64 | def convert(self, record): 65 | 66 | if record.type == b'filedesc': 67 | return self.convert_filedesc(record) 68 | else: 69 | return self.convert_record(record) 70 | 71 | def convert_filedesc(self, record): 72 | # todo - filedesc might have missing url? 
73 | warcinfo_date = warc_datetime_str(datetime.datetime.now()) 74 | warcinfo_id = self.make_warc_uuid(record.url+warcinfo_date) 75 | 76 | warcinfo_headers = [ 77 | (WarcRecord.TYPE, WarcRecord.WARCINFO), 78 | (WarcRecord.ID, warcinfo_id), 79 | (WarcRecord.DATE, warcinfo_date), 80 | ] 81 | 82 | if self.output_filename: 83 | warcinfo_headers.append((WarcRecord.FILENAME, self.output_filename)) 84 | 85 | warcinfo_content = (b'application/warc-fields', self.warcinfo_fields) 86 | 87 | inforecord = WarcRecord(headers=warcinfo_headers, content=warcinfo_content, version=self.version) 88 | 89 | if record.date: 90 | if len(record.date) >= 14: 91 | warcmeta_date = datetime.datetime.strptime(record.date[:14].decode('ascii'),'%Y%m%d%H%M%S') 92 | else: 93 | warcmeta_date = datetime.datetime.strptime(record.date[:8].decode('ascii'),'%Y%m%d') 94 | 95 | warcmeta_date = warc_datetime_str(warcmeta_date) 96 | else: 97 | warcmeta_date = warcinfo_date 98 | 99 | 100 | warcmeta_id = self.make_warc_uuid(record.url+record.date+b"-meta") 101 | warcmeta_url = record.url 102 | if warcmeta_url.startswith(b'filedesc://'): 103 | warcmeta_url = warcmeta_url[11:] 104 | warcmeta_headers = [ 105 | (WarcRecord.TYPE, WarcRecord.METADATA), 106 | (WarcRecord.CONCURRENT_TO, warcinfo_id), 107 | (WarcRecord.ID, warcmeta_id), 108 | (WarcRecord.URL, warcmeta_url), 109 | (WarcRecord.DATE, warcmeta_date), 110 | (WarcRecord.WARCINFO_ID, warcinfo_id), 111 | ] 112 | warcmeta_content =(b'application/arc', record.raw()) 113 | 114 | metarecord = WarcRecord(headers=warcmeta_headers, content=warcmeta_content, version=self.version) 115 | 116 | self.warcinfo_id = warcinfo_id 117 | 118 | return inforecord, metarecord 119 | 120 | def convert_record(self, record): 121 | 122 | warc_id = self.make_warc_uuid(record.url+record.date) 123 | headers = [ 124 | (WarcRecord.ID, warc_id), 125 | (WarcRecord.URL,record.url), 126 | (WarcRecord.WARCINFO_ID, self.warcinfo_id), 127 | ] 128 | 129 | if record.date: 130 | try: 131 | date = 
datetime.datetime.strptime(record.date.decode('ascii'),'%Y%m%d%H%M%S') 132 | except ValueError: 133 | date = datetime.datetime.strptime(record.date.decode('ascii'),'%Y%m%d') 134 | 135 | else: 136 | date = datetime.datetime.now() 137 | 138 | ip = record.get_header(ArcRecord.IP) 139 | if ip: 140 | ip = ip.strip() 141 | if ip != b"0.0.0.0": 142 | headers.append((WarcRecord.IP_ADDRESS, ip)) 143 | 144 | 145 | headers.append((WarcRecord.DATE, warc_datetime_str(date))) 146 | 147 | content_type, content = record.content 148 | 149 | if not content_type.strip(): 150 | content_type = b'application/octet-stream' 151 | 152 | url = record.url.lower() 153 | 154 | 155 | if any(url.startswith(p) for p in self.resources): 156 | record_type = WarcRecord.RESOURCE 157 | elif any(url.startswith(p) for p in self.responses): 158 | record_type = WarcRecord.RESPONSE 159 | elif url.startswith(b'http'): 160 | if is_http_response(content): 161 | content_type=b"application/http;msgtype=response" 162 | record_type = WarcRecord.RESPONSE 163 | else: 164 | record_type = WarcRecord.RESOURCE 165 | elif url.startswith(b'dns'): 166 | if content_type.startswith(b'text/dns') and str(content.decode('ascii', 'ignore')) == content: 167 | record_type = WarcRecord.RESOURCE 168 | else: 169 | record_type = WarcRecord.RESPONSE 170 | else: 171 | # unknown protocol 172 | record_type = WarcRecord.RESPONSE 173 | 174 | headers.append((WarcRecord.TYPE, record_type)) 175 | 176 | warcrecord = WarcRecord(headers=headers, content=(content_type, content), version=self.version) 177 | 178 | return warcrecord, 179 | 180 | def warcinfo_fields(description="", operator="", publisher="", audience=""): 181 | return "\r\n".join([ 182 | "software: hanzo.arc2warc", 183 | "hostname: %s"%socket.gethostname(), 184 | "description: %s"%description, 185 | "operator: %s"%operator, 186 | "publisher: %s"%publisher, 187 | "audience: %s"%audience, 188 | ]).encode('utf-8') 189 | 190 | ## todo 191 | """ 192 | move arctransformer into mixed.py 193 
| move output file into arc2warc loop 194 | 195 | """ 196 | def main(argv): 197 | (options, input_files) = parser.parse_args(args=argv[1:]) 198 | 199 | try: # python3 200 | out = sys.stdout.buffer 201 | except AttributeError: # python2 202 | out = sys.stdout 203 | 204 | if options.output: 205 | out = open(options.output, 'ab') 206 | if options.output.endswith('.gz'): 207 | options.gzip = True 208 | if len(input_files) < 1: 209 | parser.error("no imput warc file(s)") 210 | 211 | warcinfo = warcinfo_fields( 212 | description = options.description, 213 | operator = options.operator, 214 | publisher = options.publisher, 215 | audience = options.audience, 216 | ) 217 | arc = ArcTransformer(options.output, warcinfo, options.resource, options.response) 218 | for name in expand_files(input_files): 219 | fh = MixedRecord.open_archive(filename=name, gzip="auto") 220 | try: 221 | for record in fh: 222 | if isinstance(record, WarcRecord): 223 | print(' WARC', record.url, file=sys.stderr) 224 | warcs = [record] 225 | else: 226 | print('ARC ', record.url, file=sys.stderr) 227 | warcs = arc.convert(record) 228 | 229 | for warcrecord in warcs: 230 | warcrecord.write_to(out, gzip=options.gzip) 231 | finally: 232 | fh.close() 233 | 234 | return 0 235 | 236 | def run(): 237 | sys.exit(main(sys.argv)) 238 | 239 | 240 | if __name__ == '__main__': 241 | run() 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /hanzo/httptools/__init__.py: -------------------------------------------------------------------------------- 1 | from hanzo.httptools.messaging import RequestMessage, ResponseMessage, HTTP09Response 2 | 3 | 4 | __all__ = [ 5 | "RequestMessage", 6 | "ResponseMessage", 7 | "HTTP09Response", 8 | ] 9 | -------------------------------------------------------------------------------- /hanzo/httptools/messaging.py: -------------------------------------------------------------------------------- 1 | """A set of stream oriented parsers 
for http requests and responses, inline 2 | with the current draft recommendations from the http working group. 3 | 4 | http://tools.ietf.org/html/draft-ietf-httpbis-p1-messaging-17 5 | 6 | Unlike other libraries, this is for clients, servers and proxies. 7 | 8 | Missing: 9 | comma parsing/header folding 10 | 11 | """ 12 | from gzip import GzipFile 13 | import re 14 | import zlib 15 | from io import BytesIO 16 | 17 | 18 | class ParseError(Exception): 19 | """Baseclass for all http parsing errors""" 20 | pass 21 | 22 | from hanzo.httptools.semantics import Codes, Methods 23 | 24 | NEWLINES = (b'\r\n', b'\n') 25 | 26 | 27 | class BrokenChunks(Exception): 28 | pass 29 | 30 | class HTTPMessage(object): 31 | """A stream based parser for http like messages""" 32 | 33 | CONTENT_TYPE = b"application/http" 34 | 35 | def __init__(self, header, buf=None, offset=0): 36 | self.buffer = buf if buf is not None else bytearray() 37 | self.offset = offset 38 | self.header = header 39 | self.body_chunks = [] 40 | self.mode = 'start' 41 | self.body_reader = None 42 | 43 | @property 44 | def url(self): 45 | return self.header.url 46 | 47 | @property 48 | def scheme(self): 49 | return self.header.scheme 50 | 51 | @property 52 | def method(self): 53 | return self.header.method 54 | 55 | @property 56 | def host(self): 57 | return self.header.host 58 | 59 | @property 60 | def port(self): 61 | return self.header.port 62 | 63 | def feed_fd(self, fd): 64 | while True: 65 | length, terminator = self.feed_predict() 66 | if length == 0: 67 | return '' 68 | elif terminator == '\r\n': 69 | text = fd.readLine() 70 | elif length < 0: 71 | text = fd.read() 72 | elif length > 0: 73 | text = fd.read(length) 74 | unread = self.feed(text) 75 | if unread: 76 | return unread 77 | 78 | def feed_predict(self): 79 | """returns size, terminator request for input. size is 0 means end. 
""" 80 | if self.mode == 'start': 81 | return None, '\r\n' 82 | elif self.mode == 'headers': 83 | return None, '\r\n' 84 | elif self.mode == 'body': 85 | if self.body_reader is not None: 86 | return self.body_reader.feed_predict() 87 | else: 88 | # connection close 89 | return -1, None 90 | if self.mode == 'end': 91 | return 0, None 92 | if self.mode == 'incomplete': 93 | return 0, None 94 | 95 | def feed(self, text): 96 | """Push more text from the input stream into the parser.""" 97 | if text and self.mode == 'start': 98 | text = self.feed_start(text) 99 | 100 | if text and self.mode == 'headers': 101 | text = self.feed_headers(text) 102 | if self.mode == 'body': 103 | if not self.header.has_body(): 104 | self.mode = 'end' 105 | else: 106 | if self.header.body_is_chunked(): 107 | self.body_reader = ChunkReader() 108 | else: 109 | length = self.header.body_length() 110 | if length is not None: 111 | encoding = self.header.encoding 112 | 113 | if encoding and encoding.endswith(b'gzip'): 114 | self.body_reader = ZipLengthReader(length, 115 | text) 116 | else: 117 | self.body_reader = LengthReader(length) 118 | length = self.body_reader.remaining 119 | self.body_chunks = [(self.offset, length)] 120 | if length == 0: 121 | self.mode = 'end' 122 | else: 123 | self.body_chunks = [(self.offset, 0)] 124 | self.body_reader = None 125 | 126 | if text and self.mode == 'body': 127 | if self.body_reader is not None: 128 | try: 129 | text = self.body_reader.feed(self, text) 130 | except BrokenChunks: 131 | self.body_reader = None 132 | self.body_chunks = [(self.offset, 0)] 133 | if self.body_reader is None: 134 | ((offset, length),) = self.body_chunks 135 | self.buffer.extend(text) 136 | self.offset = len(self.buffer) 137 | self.body_chunks = ((offset, length + len(text)),) 138 | text = '' 139 | 140 | return text 141 | 142 | def close(self): 143 | """Mark the end of the input stream and finish parsing.""" 144 | if (self.body_reader is None and self.mode == 'body'): 145 | 
self.mode = 'end' 146 | 147 | elif self.mode != 'end': 148 | if self.body_chunks: 149 | # check for incomplete in body_chunks 150 | offset, length = self.body_chunks.pop() 151 | position = len(self.buffer) 152 | length = min(length, position - offset) 153 | self.body_chunks.append((offset, length)) 154 | self.mode = 'incomplete' 155 | 156 | def headers_complete(self): 157 | """Check whether the input stream has finished supplying headers.""" 158 | return self.mode in ('end', 'body') 159 | 160 | def complete(self): 161 | """Checks whether the input stream is at the end, i.e. if the parser 162 | is expecting no more input.""" 163 | 164 | return self.mode == 'end' 165 | 166 | def feed_line(self, text): 167 | """Feed text into the buffer, returning the first line found (if found 168 | yet)""" 169 | self.buffer.extend(text) 170 | pos = self.buffer.find(b'\n', self.offset) 171 | if pos > -1: 172 | pos += 1 173 | text = bytes(self.buffer[pos:]) 174 | del self.buffer[pos:] 175 | line = bytes(self.buffer[self.offset:]) 176 | self.offset = len(self.buffer) 177 | else: 178 | line = None 179 | text = b'' 180 | return line, text 181 | 182 | def feed_length(self, text, remaining): 183 | """Feed (at most remaining bytes) text to buffer, returning 184 | leftovers.""" 185 | body, text = text[:remaining], text[remaining:] 186 | remaining -= len(body) 187 | self.buffer.extend(body) 188 | self.offset = len(self.buffer) 189 | return remaining, text 190 | 191 | def feed_start(self, text): 192 | """Feed text to the parser while it is in the 'start' state.""" 193 | line, text = self.feed_line(text) 194 | if line is not None: 195 | if line not in NEWLINES: 196 | self.header.set_start_line(line) 197 | self.mode = 'headers' 198 | 199 | return text 200 | 201 | def feed_headers(self, text): 202 | """Feed text to the parser while it is in the 'headers' 203 | state.""" 204 | while text: 205 | line, text = self.feed_line(text) 206 | if line is not None: 207 | self.header.add_header_line(line) 208 
| if line in NEWLINES: 209 | self.mode = 'body' 210 | break 211 | 212 | return text 213 | 214 | def get_message(self): 215 | """Returns the contents of the input buffer.""" 216 | return bytes(self.buffer) 217 | 218 | def get_decoded_message(self): 219 | """Return the input stream reconstructed from the parsed 220 | data.""" 221 | buf = bytearray() 222 | self.write_decoded_message(buf) 223 | return bytes(buf) 224 | 225 | def write_message(self, buf): 226 | #TODO: No idea what this does, looks broken 227 | self.header.write(buf) 228 | buf.extend(b'\r\n') 229 | self.write_body(buf) 230 | 231 | def write_decoded_message(self, buf): 232 | """Writes the parsed data to the buffer passed.""" 233 | self.header.write_decoded(buf) 234 | if self.header.has_body(): 235 | length = sum(l for o, l in self.body_chunks) 236 | buf.extend(b'Content-Length: ' + str(length).encode('ascii') + b'\r\n') 237 | body = self.get_body() 238 | if self.header.encoding and body: 239 | try: 240 | body = zlib.decompress(body) 241 | except zlib.error: 242 | try: 243 | body = zlib.decompress(body, 16 + zlib.MAX_WBITS) 244 | except zlib.error: 245 | encoding_header = b"Content-Encoding: " + self.header.encoding + b"\r\n" 246 | buf.extend(encoding_header) 247 | buf.extend(b'\r\n') 248 | try: 249 | buf.extend(body) 250 | except Exception as e: 251 | raise Exception('buf={} body={} e={}'.format(repr(buf), repr(body), e)) 252 | 253 | def get_body(self): 254 | """Returns the body of the HTTP message.""" 255 | buf = bytearray() 256 | self.write_body(buf) 257 | return bytes(buf) 258 | 259 | def write_body(self, buf): 260 | """Writes the body of the HTTP message to the passed 261 | buffer.""" 262 | for offset, length in self.body_chunks: 263 | buf.extend(self.buffer[offset:offset + length]) 264 | 265 | 266 | class ChunkReader(object): 267 | """Reads the body of a HTTP message with chunked encoding.""" 268 | 269 | def __init__(self): 270 | self.mode = "start" 271 | self.start = True 272 | self.remaining = 0 273 
| 274 | def feed_predict(self): 275 | if self.mode == 'start': 276 | return None, '\r\n' 277 | elif self.mode == 'chunk': 278 | if self.remaining == 0: 279 | return None, '\r\n' 280 | else: 281 | return self.remaining, None 282 | elif self.mode == 'trailer': 283 | return None, '\r\n' 284 | elif self.mode == 'end': 285 | return 0, None 286 | 287 | def feed_start(self, parser, text): 288 | """Feed text into the ChunkReader when the mode is 'start'.""" 289 | pos = len(parser.buffer) 290 | line, text = parser.feed_line(text) 291 | offset = len(parser.buffer) 292 | 293 | if line is not None: 294 | try: 295 | chunk = int(line.split(b';', 1)[0], 16) 296 | except ValueError: 297 | # ugh, this means the chunk is probably not a chunk 298 | if self.start: 299 | # undo, stip text from buffer 300 | del parser.buffer[pos:] 301 | parser.offset = len(parser.buffer) 302 | raise BrokenChunks() 303 | else: 304 | raise 305 | 306 | parser.body_chunks.append((offset, chunk)) 307 | self.remaining = chunk 308 | if chunk == 0: 309 | self.mode = 'trailer' 310 | else: 311 | self.mode = 'chunk' 312 | self.start = False 313 | return text 314 | 315 | def feed_chunk(self, parser, text): 316 | """Feed text into the ChunkReader when the mode is 'chunk'.""" 317 | if self.remaining > 0: 318 | self.remaining, text = parser.feed_length(text, self.remaining) 319 | if self.remaining == 0: 320 | end_of_chunk, text = parser.feed_line(text) 321 | if end_of_chunk: 322 | self.mode = 'start' 323 | 324 | return text 325 | 326 | def feed_trailer(self, parser, text): 327 | """Feed text into the ChunkReader when the mode is 328 | 'trailer'.""" 329 | line, text = parser.feed_line(text) 330 | if line is not None: 331 | parser.header.add_trailer_line(line) 332 | if line in NEWLINES: 333 | self.mode = 'end' 334 | 335 | return text 336 | 337 | def feed(self, parser, text): 338 | """Feed text into the ChunkReader.""" 339 | while text: 340 | if self.mode == 'start': 341 | text = self.feed_start(parser, text) 342 | 343 | 
if text and self.mode == 'chunk': 344 | text = self.feed_chunk(parser, text) 345 | 346 | if text and self.mode == 'trailer': 347 | text = self.feed_trailer(parser, text) 348 | 349 | if self.mode == 'end': 350 | parser.mode = 'end' 351 | break 352 | 353 | return text 354 | 355 | 356 | class LengthReader(object): 357 | 358 | def __init__(self, length): 359 | self.remaining = length 360 | 361 | def feed_predict(self): 362 | return self.remaining, None 363 | 364 | def feed(self, parser, text): 365 | if self.remaining > 0: 366 | self.remaining, text = parser.feed_length(text, self.remaining) 367 | if self.remaining <= 0: 368 | parser.mode = 'end' 369 | return text 370 | 371 | 372 | class ZipLengthReader(LengthReader): 373 | """ 374 | Tries to read the body as gzip according to length. In case that fails, it 375 | disregards the Content-Length and reads it normally. 376 | """ 377 | def __init__(self, length, text): 378 | # TODO test if this works with gzipped responses in WARC 379 | try: 380 | self._file = GzipFile(fileobj=BytesIO(text[:length]), mode='rb') 381 | self._text = self._file.read() 382 | super(ZipLengthReader, self).__init__(len(self._text)) 383 | except IOError: 384 | self._file = None 385 | super(ZipLengthReader, self).__init__(len(text)) 386 | 387 | def __del__(self): 388 | if self._file: 389 | self._file.close() 390 | 391 | def feed(self, parser, text): 392 | """Parse the body according to remaining length""" 393 | if self.remaining > 0: 394 | if self._file: 395 | text = self._text 396 | self.remaining, text = parser.feed_length(text, self.remaining) 397 | if self.remaining <= 0: 398 | parser.mode = 'end' 399 | return text 400 | 401 | 402 | class HTTPHeader(object): 403 | STRIP_HEADERS = [n.lower() for n in (b'Content-Length', 404 | b'Transfer-Encoding', b'Content-Encoding', 405 | b'TE', b'Expect', b'Trailer')] 406 | 407 | def __init__(self, ignore_headers): 408 | self.headers = [] 409 | self.keep_alive = False 410 | self.mode = 'close' 411 | 
self.content_length = None 412 | self.encoding = None 413 | self.trailers = [] 414 | self.expect_continue = False 415 | self.ignore_headers = set(x.lower() for x in ignore_headers) 416 | 417 | def has_body(self): 418 | pass 419 | 420 | def set_start_line(self, line): 421 | pass 422 | 423 | def write_decoded(self, buf): 424 | self.write_decoded_start(buf) 425 | strip_headers = self.STRIP_HEADERS if self.has_body() else () 426 | self.write_headers(buf, strip_headers) 427 | 428 | def write_decoded_start(self, buf): 429 | pass 430 | 431 | def write_headers(self, buf, strip_headers=()): 432 | for k, v in self.headers: 433 | if k.lower() not in strip_headers: 434 | buf.extend(k + b': ' + v + b'\r\n') 435 | for k, v in self.trailers: 436 | if k.lower() not in strip_headers: 437 | buf.extend(k + b': ' + v + b'\r\n') 438 | 439 | def add_trailer_line(self, line): 440 | if line.startswith(b' ') or line.startswith(b'\t'): 441 | k, v = self.trailers.pop() 442 | line = line.strip() 443 | v = v + b' ' + line 444 | self.trailers.append((k, v)) 445 | elif line in NEWLINES: 446 | pass 447 | else: 448 | name, value = line.split(b':', 1) 449 | name = name.strip() 450 | value = value.strip() 451 | self.trailers.append((name, value)) 452 | 453 | def add_header(self, name, value): 454 | self.headers.append((name, value)) 455 | 456 | def add_header_line(self, line): 457 | if line.startswith(b' ') or line.startswith(b'\t'): 458 | k, v = self.headers.pop() 459 | line = line.strip() 460 | v = v + b' ' + line 461 | self.add_header(k, v) 462 | 463 | elif line in NEWLINES: 464 | for name, value in self.headers: 465 | name = name.lower() 466 | value = value.lower() 467 | 468 | # todo handle multiple instances 469 | # of these headers 470 | if name in self.ignore_headers: 471 | #print >> sys.stderr, 'ignore', name 472 | pass 473 | elif name == b'expect': 474 | if b'100-continue' in value: 475 | self.expect_continue = True 476 | elif name == b'content-length': 477 | if self.mode == 'close': 478 | 
self.content_length = int(value) 479 | self.mode = 'length' 480 | 481 | elif name == b'transfer-encoding': 482 | if b'chunked' in value: 483 | self.mode = 'chunked' 484 | 485 | elif name == b'content-encoding': 486 | self.encoding = value 487 | 488 | elif name == b'connection': 489 | if b'keep-alive' in value: 490 | self.keep_alive = True 491 | elif b'close' in value: 492 | self.keep_alive = False 493 | 494 | else: 495 | name, value = line.split(b':', 1) 496 | name = name.strip() 497 | value = value.strip() 498 | self.add_header(name, value) 499 | 500 | def body_is_chunked(self): 501 | return self.mode == 'chunked' 502 | 503 | def body_length(self): 504 | if self.mode == 'length': 505 | return self.content_length 506 | 507 | url_rx = re.compile( 508 | b'(?Phttps?)://(?P(?P[^:/]+)(?::(?P\d+))?)' 509 | b'(?P.*)', 510 | re.I) 511 | 512 | 513 | class RequestHeader(HTTPHeader): 514 | 515 | def __init__(self, ignore_headers=()): 516 | HTTPHeader.__init__(self, ignore_headers=ignore_headers) 517 | self.method = '' 518 | self.target_uri = '' 519 | self.version = '' 520 | self.host = '' 521 | self.scheme = 'http' 522 | self.port = 80 523 | self.host = '' 524 | 525 | @property 526 | def url(self): 527 | if (self.scheme == 'http' and self.port == 80)\ 528 | or (self.scheme == 'https' and self.port == 80): 529 | return "%s://%s%s"%(self.scheme, self.host, self.target_uri) 530 | else: 531 | return "%s://%s:%s%s"%(self.scheme, self.host, self.port, self.target_uri) 532 | 533 | 534 | def add_header(self, name, value): 535 | 536 | if name.lower() == b'host': 537 | if b':' in value: 538 | self.host, self.port = value.split(b':',1) 539 | else: 540 | self.host = value 541 | 542 | return HTTPHeader.add_header(self, name, value) 543 | 544 | def set_start_line(self, line): 545 | self.method, self.target_uri, self.version = \ 546 | line.rstrip().split(b' ', 2) 547 | 548 | if self.method.upper() == b"CONNECT": 549 | # target_uri = host:port 550 | self.host, self.port = 
self.target_uri.split(b':') 551 | else: 552 | match = url_rx.match(self.target_uri) 553 | if match: 554 | #self.add_header('Host', match.group('authority')) 555 | self.target_uri = match.group('path') 556 | self.host = match.group('host') 557 | port = match.group('port') 558 | self.port = int(port) if port else 80 559 | 560 | self.scheme = match.group('scheme') 561 | if not self.target_uri: 562 | if self.method.upper() == 'OPTIONS': 563 | self.target_uri = '*' 564 | else: 565 | self.target_uri = '/' 566 | 567 | if self.version == 'HTTP/1.0': 568 | self.keep_alive = False 569 | 570 | def has_body(self): 571 | return self.mode in ('chunked', 'length') 572 | 573 | def write_decoded_start(self, buf): 574 | buf.extend(self.method + b' ' + self.target_uri + b' ' + self.version + b'\r\n') 575 | 576 | 577 | class ResponseHeader(HTTPHeader): 578 | 579 | def __init__(self, request=None, ignore_headers=()): 580 | HTTPHeader.__init__(self, ignore_headers=ignore_headers) 581 | self.request = request 582 | self.version = b"HTTP/1.1" 583 | self.code = 0 584 | self.phrase = "Empty Response" 585 | 586 | @property 587 | def method(self): 588 | return self.request.method 589 | 590 | @property 591 | def url(self): 592 | return self.request.url 593 | 594 | @property 595 | def host(self): 596 | return self.request.host 597 | 598 | @property 599 | def port(self): 600 | return self.request.port 601 | 602 | @property 603 | def scheme(self): 604 | return self.request.scheme 605 | 606 | def set_start_line(self, line): 607 | parts = line.rstrip().split(b' ', 2) 608 | self.version, self.code = parts[:2] 609 | self.phrase = parts[2] if len(parts) >= 3 else b"" 610 | 611 | self.code = int(self.code) 612 | if self.version == b'HTTP/1.0': 613 | self.keep_alive = False 614 | 615 | def has_body(self): 616 | if self.request and self.request.method in Methods.no_body: 617 | return False 618 | elif self.code in Codes.no_body: 619 | return False 620 | 621 | return True 622 | 623 | def 
write_decoded_start(self, buf): 624 | buf.extend(self.version + b' ' + str(self.code).encode('ascii') + b' ' + self.phrase + b'\r\n') 625 | 626 | 627 | class RequestMessage(HTTPMessage): 628 | CONTENT_TYPE = HTTPMessage.CONTENT_TYPE + b";msgtype=request" 629 | 630 | def __init__(self, ignore_headers=()): 631 | HTTPMessage.__init__(self, 632 | RequestHeader(ignore_headers=ignore_headers)) 633 | 634 | 635 | class ResponseMessage(HTTPMessage): 636 | CONTENT_TYPE = HTTPMessage.CONTENT_TYPE + b";msgtype=response" 637 | 638 | def __init__(self, request, ignore_headers=()): 639 | self.interim = [] 640 | HTTPMessage.__init__(self, 641 | ResponseHeader(request.header, 642 | ignore_headers=ignore_headers)) 643 | 644 | def got_continue(self): 645 | return bool(self.interim) 646 | 647 | @property 648 | def code(self): 649 | return self.header.code 650 | 651 | def feed(self, text): 652 | text = HTTPMessage.feed(self, text) 653 | if self.complete() and self.header.code == Codes.Continue: 654 | self.interim.append(self.header) 655 | self.header = ResponseHeader(self.header.request) 656 | self.body_chunks = [] 657 | self.mode = 'start' 658 | self.body_reader = None 659 | text = HTTPMessage.feed(self, text) 660 | return text 661 | 662 | def as_http09(self): 663 | return HTTP09Response(self) 664 | 665 | class HTTP09ResponseHeader(HTTPHeader): 666 | def __init__(self, request=None, ignore_headers=()): 667 | HTTPHeader.__init__(self, ignore_headers=ignore_headers) 668 | self.request = request 669 | self.version = "HTTP/0.9" 670 | self.code = 200 671 | self.phrase = "" 672 | 673 | @property 674 | def method(self): 675 | return self.request.method 676 | 677 | @property 678 | def url(self): 679 | return self.request.url 680 | 681 | @property 682 | def host(self): 683 | return self.request.host 684 | 685 | @property 686 | def port(self): 687 | return self.request.port 688 | 689 | @property 690 | def scheme(self): 691 | return self.request.scheme 692 | 693 | def has_body(self): 694 | 
return True 695 | 696 | class HTTP09Response(HTTPMessage): 697 | CONTENT_TYPE = "%s;msgtype=response;version=0.9" % HTTPMessage.CONTENT_TYPE 698 | def __init__(self, response): 699 | header= HTTP09ResponseHeader(response.header.request) 700 | HTTPMessage.__init__(self, header, buf=response.buffer, offset=response.offset) 701 | self.mode = 'body' 702 | 703 | @property 704 | def code(self): 705 | return self.header.code 706 | 707 | def feed_predict(self): 708 | """returns size, terminator request for input. size is 0 means end. """ 709 | return -1, None 710 | 711 | def feed(self, text): 712 | """Push more text from the input stream into the parser.""" 713 | self.buffer.extend(text) 714 | return '' 715 | 716 | def close(self): 717 | """Mark the end of the input stream and finish parsing.""" 718 | self.mode = 'end' 719 | 720 | def get_message(self): 721 | """Returns the contents of the input buffer.""" 722 | return bytes(self.buffer) 723 | 724 | def get_decoded_message(self): 725 | """Return the input stream reconstructed from the parsed 726 | data.""" 727 | return bytes(self.buffer) 728 | 729 | def write_decoded_message(self, buf): 730 | """Writes the parsed data to the buffer passed.""" 731 | buf.extend(self.buffer) 732 | 733 | def get_body(self): 734 | """Returns the body of the HTTP message.""" 735 | return bytes(self.buffer) 736 | 737 | def write_body(self, buf): 738 | buf.extend(self.buffer) 739 | 740 | 741 | -------------------------------------------------------------------------------- /hanzo/httptools/semantics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Semantics as based upon 3 | http://tools.ietf.org/html/draft-ietf-httpbis-p2-semantics-17 4 | """ 5 | 6 | class Methods(object): 7 | GET = b'GET' 8 | PUT = b'PUT' 9 | HEAD = b'HEAD' 10 | DELETE = b'DELETE' 11 | POST = b'POST' 12 | OPTIONS = b'OPTIONS' 13 | TRACE = b'TRACE' 14 | PATCH = b'PATCH' 15 | CONNECT = b'CONNECT' 16 | safe = (GET, HEAD, OPTIONS, 
TRACE,) 17 | idempotent = (PUT, DELETE,) 18 | no_body = (HEAD,) 19 | cacheable = (GET,) 20 | 21 | 22 | def range_collection(func): 23 | """Returns an object (x) that responds to foo in x,""" 24 | 25 | class Range(object): 26 | def __contains__(self, item): 27 | return func(item) 28 | 29 | return Range() 30 | 31 | 32 | class Codes(object): 33 | #pylint: disable-msg=e0213 34 | Continue = 100 35 | switching_protocols = 101 36 | 37 | @range_collection 38 | def informational(code): 39 | return 100 <= code < 200 40 | 41 | ok = 200 42 | created = 201 43 | accepted = 202 44 | non_authorative_content = 203 45 | no_content = 204 46 | reset_content = 205 47 | partial_content = 206 48 | 49 | @range_collection 50 | def successful(code): 51 | return 200 <= code < 300 52 | 53 | 54 | moved_permanently = 301 55 | found = 302 56 | see_other = 303 57 | not_modified = 304 58 | use_proxy = 305 59 | obsolete_switch_proxy = 306 60 | temporary_redirect = 307 61 | 62 | @range_collection 63 | def redirection(code): 64 | return 300 <= code < 400 65 | 66 | 67 | bad_request = 400 68 | unauthorized = 401 69 | payment_required = 402 70 | forbidden = 403 71 | not_found = 404 72 | method_not_allowed = 405 73 | not_acceptable = 406 74 | proxy_authentication_required = 407 75 | request_timeout = 408 76 | conflict = 409 77 | gone = 410 78 | length_required = 411 79 | precondition_failed = 412 80 | request_representation_too_large = 413 81 | uri_too_long = 414 82 | unsupported_media_type = 415 83 | requested_range_not_satisfiable =415 84 | expectation_failed = 417 85 | upgrade_required = 426 86 | 87 | @range_collection 88 | def client_error(code): 89 | return 400 <= code < 500 90 | 91 | 92 | internal_server_error = 501 93 | not_implemented = 501 94 | bad_gateway = 502 95 | service_unavailable = 503 96 | gateway_timeout = 504 97 | http_version_not_supported = 505 98 | @range_collection 99 | def server_error(code): 100 | return 500 <= code < 600 101 | 102 | @range_collection 103 | def no_body(code): 104 
| return (100 <= code < 200) or (code == 204) or (code == 304) 105 | 106 | -------------------------------------------------------------------------------- /hanzo/httptools/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/warctools/21db132fd3e4b4042cd011d9dc3fb30276a5a0b6/hanzo/httptools/tests/__init__.py -------------------------------------------------------------------------------- /hanzo/httptools/tests/parse_test.py: -------------------------------------------------------------------------------- 1 | """Tests for http parsing.""" 2 | import unittest 3 | 4 | # want unittest2 for python2.6 5 | try: 6 | unittest.TestCase.assertIsNone 7 | except AttributeError: 8 | import unittest2 9 | unittest = unittest2 10 | 11 | from hanzo.httptools.messaging import \ 12 | RequestMessage, \ 13 | ResponseMessage 14 | 15 | get_request_lines = [ 16 | b"GET / HTTP/1.1", 17 | b"Host: example.org", 18 | b"", 19 | b"", 20 | ] 21 | get_request = b"\r\n".join(get_request_lines) 22 | get_response_lines = [ 23 | b"HTTP/1.1 200 OK", 24 | b"Host: example.org", 25 | b"Content-Length: 5", 26 | b"", 27 | b"tests", 28 | ] 29 | get_response = b"\r\n".join(get_response_lines) 30 | 31 | 32 | class GetChar(unittest.TestCase): 33 | """Test basic GET request parsing. 
Single byte at a time.""" 34 | 35 | def runTest(self): 36 | """Attempts to parse the contents of get_request and 37 | get_response.""" 38 | p = RequestMessage() 39 | for t in get_request: 40 | if isinstance(t, int): t = bytes([t]) # python3 41 | text = p.feed(t) 42 | self.assertEqual(text, b'') 43 | 44 | self.assertTrue(p.headers_complete()) 45 | self.assertTrue(p.complete()) 46 | 47 | self.assertEqual(get_request, p.get_decoded_message()) 48 | 49 | p = ResponseMessage(p) 50 | for char in get_response: 51 | if isinstance(char, int): char = bytes([char]) # python3 52 | text = p.feed(char) 53 | self.assertEqual(text, b'') 54 | 55 | self.assertTrue(p.headers_complete()) 56 | self.assertTrue(p.complete()) 57 | self.assertEqual(get_response, p.get_decoded_message()) 58 | self.assertEqual(b"tests", p.get_body()) 59 | 60 | 61 | class GetLines(unittest.TestCase): 62 | """Test basic GET request parsing. Single line at a time.""" 63 | 64 | def runTest(self): 65 | """Attempts to parse get_request_lines, i.e. 
get_request line 66 | at a time.""" 67 | 68 | p = RequestMessage() 69 | for line in get_request_lines[:-1]: 70 | text = p.feed(line) 71 | self.assertEqual(text, b"") 72 | text = p.feed(b"\r\n") 73 | self.assertEqual(text, b"") 74 | text = p.feed(get_request_lines[-1]) 75 | self.assertEqual(text, b"") 76 | 77 | self.assertTrue(p.headers_complete()) 78 | self.assertTrue(p.complete()) 79 | 80 | self.assertEqual(get_request, p.get_decoded_message()) 81 | 82 | p = ResponseMessage(p) 83 | for line in get_response_lines[:-1]: 84 | text = p.feed(line) 85 | self.assertEqual(text, b"") 86 | text = p.feed(b"\r\n") 87 | self.assertEqual(text, b"") 88 | text = p.feed(get_response_lines[-1]) 89 | 90 | self.assertEqual(text, b"") 91 | 92 | self.assertTrue(p.headers_complete()) 93 | self.assertTrue(p.complete()) 94 | 95 | self.assertEqual(get_response, p.get_decoded_message()) 96 | 97 | self.assertEqual(p.code, 200) 98 | self.assertEqual(p.header.version, b"HTTP/1.1") 99 | self.assertEqual(p.header.phrase, b"OK") 100 | 101 | 102 | head_request = b"\r\n".join([ 103 | b"HEAD / HTTP/1.1", 104 | b"Host: example.org", 105 | b"", 106 | b"", 107 | ]) 108 | head_response = b"\r\n".join([ 109 | b"HTTP/1.1 200 OK", 110 | b"Host: example.org", 111 | b"Content-Length: 5", 112 | b"", 113 | b"", 114 | ]) 115 | 116 | 117 | class HeadTest(unittest.TestCase): 118 | """Tests parsing of HEAD requests and responses.""" 119 | 120 | def runTest(self): 121 | """Constructs a RequestMessage and ResponseMessage and uses them to 122 | parse HEAD messages.""" 123 | p = RequestMessage() 124 | text = p.feed(head_request) 125 | 126 | self.assertEqual(text, b'') 127 | self.assertTrue(p.complete()) 128 | self.assertEqual(head_request, p.get_decoded_message()) 129 | 130 | p = ResponseMessage(p) 131 | text = p.feed(head_response) 132 | 133 | self.assertEqual(text, b'') 134 | self.assertTrue(p.complete()) 135 | self.assertEqual(head_response, p.get_decoded_message()) 136 | self.assertEqual(p.code, 200) 137 | 
self.assertEqual(p.header.version, b"HTTP/1.1") 138 | self.assertEqual(p.header.phrase, b"OK") 139 | 140 | 141 | class PostTestChunked(unittest.TestCase): 142 | """Tests the parser with a POST request with chunked encoding.""" 143 | post_request = b"\r\n".join([ 144 | b"POST / HTTP/1.1", 145 | b"Host: example.org", 146 | b"Transfer-Encoding: chunked", 147 | b"", 148 | b"8", 149 | b"abcdefgh", 150 | b"0", 151 | b"", 152 | b"", 153 | ]) 154 | post_response = b"\r\n".join([ 155 | b"HTTP/1.1 100 Continue", 156 | b"Host: example.org", 157 | b"", 158 | b"HTTP/1.0 204 No Content", 159 | b"Date: now!", 160 | b"", 161 | b"", 162 | ]) 163 | 164 | def runTest(self): 165 | """Tests parsing of POST requests and responses.""" 166 | p = RequestMessage() 167 | text = p.feed(self.post_request) 168 | 169 | self.assertEqual(text, b'') 170 | self.assertTrue(p.complete()) 171 | 172 | p = ResponseMessage(p) 173 | text = p.feed(self.post_response) 174 | 175 | self.assertEqual(text, b'') 176 | self.assertTrue(p.complete()) 177 | self.assertEqual(p.code, 204) 178 | self.assertEqual(p.header.version, b"HTTP/1.0") 179 | self.assertEqual(p.header.phrase, b"No Content") 180 | 181 | 182 | class PostTestChunkedEmpty(unittest.TestCase): 183 | """Tests the parser with a POST request with chunked encoding and 184 | an empty body.""" 185 | post_request = b"\r\n".join([ 186 | b"POST / HTTP/1.1", 187 | b"Host: example.org", 188 | b"Transfer-Encoding: chunked", 189 | b"", 190 | b"0", 191 | b"", 192 | b"", 193 | ]) 194 | post_response = b"\r\n".join([ 195 | b"HTTP/1.1 100 Continue", 196 | b"Host: example.org", 197 | b"", 198 | b"HTTP/1.0 204 No Content", 199 | b"Date: now!", 200 | b"", 201 | b"", 202 | ]) 203 | 204 | def runTest(self): 205 | """Tests parsing of POST requests and responses.""" 206 | p = RequestMessage() 207 | text = p.feed(self.post_request) 208 | 209 | self.assertEqual(text, b'') 210 | self.assertTrue(p.complete()) 211 | 212 | p = ResponseMessage(p) 213 | text = 
p.feed(self.post_response) 214 | 215 | self.assertEqual(text, b'') 216 | self.assertTrue(p.complete()) 217 | self.assertEqual(p.code, 204) 218 | self.assertEqual(p.header.version, b"HTTP/1.0") 219 | self.assertEqual(p.header.phrase, b"No Content") 220 | 221 | 222 | class TestTwoPartStatus(unittest.TestCase): 223 | """This is a request taken from the wild that broke the crawler. The main 224 | part being tested is the status line without a message.""" 225 | 226 | request = b"\r\n".join([ 227 | b"GET / HTTP/1.1", 228 | b"Host: example.org", # Name changed to protect the guilty 229 | b"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 230 | b"Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.3", 231 | b"Accept-Encoding: gzip,deflate,sdch", 232 | b"Accept-Language: en-US,en;q=0.8", 233 | b"Connection: keep-alive", 234 | b"Host: example.org", 235 | b"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7", 236 | b"", 237 | b"", 238 | ]) 239 | response = b"\r\n".join([ 240 | b"HTTP/1.1 404", 241 | b"Cache-Control: no-cache", 242 | b"Content-Length: 0", 243 | b"Content-Type:image/gif", 244 | b"Pragma:no-cache", 245 | b"nnCoection: close", 246 | b"", 247 | b"", 248 | ]) 249 | 250 | def runTest(self): 251 | """Tests parsing of a broken response.""" 252 | p = RequestMessage() 253 | text = p.feed(self.request) 254 | 255 | self.assertEqual(text, b'') 256 | self.assertTrue(p.complete()) 257 | 258 | p = ResponseMessage(p) 259 | text = p.feed(self.response) 260 | 261 | self.assertEqual(text, b'') 262 | self.assertTrue(p.complete()) 263 | self.assertEqual(p.code, 404) 264 | self.assertEqual(p.header.version, b"HTTP/1.1") 265 | 266 | 267 | class TestPseudoGzipped(unittest.TestCase): 268 | """Test parsing of a response with Content-Encoding:gzip declared, but 269 | without the payload actually being gzipped (see #14)""" 270 | post_response = b"\r\n".join([ 271 | b"HTTP/1.1 200 OK", 272 | 
b"Host: example.org", 273 | b"Content-Encoding: gzip", 274 | b"Content-Length: 7", 275 | b"", 276 | b"text", 277 | b"" 278 | ]) 279 | 280 | def runTest(self): 281 | """Tests parsing the response.""" 282 | request = RequestMessage() 283 | response = ResponseMessage(request) 284 | text = response.feed(self.post_response) 285 | 286 | self.assertEqual(text, b'') 287 | self.assertTrue(response.complete()) 288 | self.assertEqual(response.code, 200) 289 | self.assertEqual(response.header.version, b"HTTP/1.1") 290 | 291 | 292 | class TestGzipped(unittest.TestCase): 293 | """Test parsing of a response with Content-Encoding:gzip declared 294 | and an actually gzipped payload (see #14)""" 295 | post_response = b"\r\n".join([ 296 | b"HTTP/1.1 200 OK", 297 | b"Host: example.org", 298 | b"Content-Encoding: gzip", 299 | b"Content-Length: 30", 300 | b"", 301 | (b"\x1f\x8b\x08\x08G\xb2\xc5V\x00\x03test\x00+I\xad(\xe1\x02\x00'" 302 | b"\xda\xec7\x05\x00\x00\x00") 303 | ]) 304 | 305 | def runTest(self): 306 | """Tests parsing of the response.""" 307 | request = RequestMessage() 308 | response = ResponseMessage(request) 309 | text = response.feed(self.post_response) 310 | 311 | self.assertEqual(text, b'') 312 | self.assertTrue(response.complete()) 313 | self.assertEqual(response.code, 200) 314 | self.assertEqual(response.header.version, b"HTTP/1.1") 315 | 316 | 317 | if __name__ == '__main__': 318 | unittest.main() 319 | -------------------------------------------------------------------------------- /hanzo/warc2warc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warc2warc - convert one warc to another, can be used to re-compress things""" 3 | 4 | from __future__ import print_function 5 | 6 | import os 7 | import sys 8 | 9 | import sys 10 | import os.path 11 | 12 | from optparse import OptionParser 13 | 14 | from .warctools import WarcRecord, expand_files 15 | from .httptools import RequestMessage, ResponseMessage 
16 | 17 | parser = OptionParser(usage="%prog [options] url (url ...)") 18 | 19 | parser.add_option("-o", "--output", dest="output", 20 | help="output warc file") 21 | parser.add_option("-l", "--limit", dest="limit") 22 | parser.add_option("-I", "--input", dest="input_format", help="(ignored)") 23 | parser.add_option("-Z", "--gzip", dest="gzip", action="store_true", help="compress output, record by record") 24 | parser.add_option("-D", "--decode_http", dest="decode_http", action="store_true", help="decode http messages (strip chunks, gzip)") 25 | parser.add_option("-L", "--log-level", dest="log_level") 26 | parser.add_option("--wget-chunk-fix", dest="wget_workaround", action="store_true", help="skip transfer-encoding headers in http records, when decoding them (-D)") 27 | 28 | parser.set_defaults(output_directory=None, limit=None, log_level="info", gzip=False, decode_http=False, wget_workaround=False) 29 | 30 | 31 | WGET_IGNORE_HEADERS = ['Transfer-Encoding'] 32 | 33 | def process(record, out, options): 34 | ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else () 35 | if options.decode_http: 36 | if record.type == WarcRecord.RESPONSE: 37 | content_type, content = record.content 38 | message = None 39 | if content_type == ResponseMessage.CONTENT_TYPE: 40 | # technically, a http request needs to know the request to be parsed 41 | # because responses to head requests don't have a body. 
42 | # we assume we don't store 'head' responses, and plough on 43 | message = ResponseMessage(RequestMessage(), ignore_headers=ignore_headers) 44 | if content_type == RequestMessage.CONTENT_TYPE: 45 | message = RequestMessage(ignore_headers=ignore_headers) 46 | 47 | if message: 48 | leftover = message.feed(content) 49 | message.close() 50 | if not leftover and message.complete(): 51 | content = message.get_decoded_message() 52 | record.content = content_type, content 53 | else: 54 | error = [] 55 | if leftover: 56 | error.append("%d bytes unparsed"%len(leftover)) 57 | if not message.complete(): 58 | error.append("incomplete message (at %s, %s)"%(message.mode, message.header.mode)) 59 | print('errors decoding http in record', record.id, ",".join(error), file=sys.stderr) 60 | 61 | record.write_to(out, gzip=options.gzip) 62 | 63 | def main(argv): 64 | (options, input_files) = parser.parse_args(args=argv[1:]) 65 | 66 | try: # python3 67 | out = sys.stdout.buffer 68 | except AttributeError: # python2 69 | out = sys.stdout 70 | 71 | if len(input_files) < 1: 72 | fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None) 73 | 74 | for record in fh: 75 | process(record, out, options) 76 | else: 77 | for name in expand_files(input_files): 78 | fh = WarcRecord.open_archive(name, gzip="auto") 79 | for record in fh: 80 | process(record, out, options) 81 | 82 | fh.close() 83 | 84 | 85 | 86 | return 0 87 | 88 | def run(): 89 | sys.exit(main(sys.argv)) 90 | 91 | 92 | if __name__ == '__main__': 93 | run() 94 | 95 | 96 | -------------------------------------------------------------------------------- /hanzo/warcdump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warcdump - dump warcs in a slightly more humane format""" 3 | 4 | from __future__ import print_function 5 | 6 | import os 7 | import sys 8 | 9 | import sys 10 | import os.path 11 | 12 | from optparse import OptionParser 13 | 14 | from .warctools 
import WarcRecord, expand_files 15 | 16 | parser = OptionParser(usage="%prog [options] warc warc warc") 17 | 18 | parser.add_option("-l", "--limit", dest="limit") 19 | parser.add_option("-I", "--input", dest="input_format") 20 | parser.add_option("-L", "--log-level", dest="log_level") 21 | 22 | parser.set_defaults(output_directory=None, limit=None, log_level="info") 23 | 24 | def main(argv): 25 | (options, input_files) = parser.parse_args(args=argv[1:]) 26 | 27 | out = sys.stdout 28 | if len(input_files) < 1: 29 | dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None), name="-",offsets=False) 30 | 31 | else: 32 | for name in expand_files(input_files): 33 | fh = WarcRecord.open_archive(name, gzip="auto") 34 | dump_archive(fh,name) 35 | 36 | fh.close() 37 | 38 | 39 | return 0 40 | 41 | def dump_archive(fh, name, offsets=True): 42 | for (offset, record, errors) in fh.read_records(limit=None, offsets=offsets): 43 | if record: 44 | print("archive record at %s:%s"%(name,offset)) 45 | record.dump(content=True) 46 | elif errors: 47 | print("warc errors at %s:%d"%(name, offset if offset else 0)) 48 | for e in errors: 49 | print('\t', e) 50 | else: 51 | print() 52 | print('note: no errors encountered in tail of file') 53 | 54 | def run(): 55 | sys.exit(main(sys.argv)) 56 | 57 | 58 | if __name__ == '__main__': 59 | run() 60 | 61 | 62 | -------------------------------------------------------------------------------- /hanzo/warcextract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warcextract - dump warc record context to standard out""" 3 | 4 | from __future__ import print_function 5 | 6 | import os 7 | import sys 8 | 9 | import sys 10 | import os.path 11 | 12 | from optparse import OptionParser 13 | from contextlib import closing 14 | 15 | from .warctools import WarcRecord 16 | 17 | parser = OptionParser(usage="%prog [options] warc offset") 18 | 19 | #parser.add_option("-l", "--limit", 
dest="limit") 20 | parser.add_option("-I", "--input", dest="input_format") 21 | parser.add_option("-L", "--log-level", dest="log_level") 22 | 23 | parser.set_defaults(output_directory=None, limit=None, log_level="info") 24 | 25 | def main(argv): 26 | (options, args) = parser.parse_args(args=argv[1:]) 27 | 28 | try: # python3 29 | out = sys.stdout.buffer 30 | except AttributeError: # python2 31 | out = sys.stdout 32 | 33 | if len(args) < 1: 34 | # dump the first record on stdin 35 | with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh: 36 | dump_record(fh, out) 37 | 38 | else: 39 | # dump a record from the filename, with optional offset 40 | filename = args[0] 41 | if len(args) > 1: 42 | offset = int(args[1]) 43 | else: 44 | offset = 0 45 | 46 | with closing(WarcRecord.open_archive(filename=filename, gzip="auto")) as fh: 47 | fh.seek(offset) 48 | dump_record(fh, out) 49 | 50 | 51 | return 0 52 | 53 | def dump_record(fh, out): 54 | for (offset, record, errors) in fh.read_records(limit=1, offsets=False): 55 | if record: 56 | out.write(record.content[1]) 57 | elif errors: 58 | print("warc errors at %s:%d"%(name, offset if offset else 0), file=sys.stderr) 59 | for e in errors: 60 | print('\t', e) 61 | break # only use one (I'm terrible) 62 | 63 | 64 | def run(): 65 | sys.exit(main(sys.argv)) 66 | 67 | 68 | if __name__ == '__main__': 69 | run() 70 | 71 | 72 | -------------------------------------------------------------------------------- /hanzo/warcfilter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warcfilter - prints warcs in that match regexp, by default searches all headers""" 3 | 4 | import os 5 | import sys 6 | 7 | import re 8 | 9 | from optparse import OptionParser 10 | 11 | from .warctools import WarcRecord, expand_files 12 | from .httptools import RequestMessage, ResponseMessage 13 | 14 | parser = OptionParser(usage="%prog [options] pattern warc warc warc") 15 | 16 
| parser.add_option("-l", "--limit", dest="limit", help="limit (ignored)") 17 | parser.add_option("-I", "--input", dest="input_format", help="input format (ignored)") 18 | parser.add_option("-i", "--invert", dest="invert",action="store_true", help="invert match") 19 | parser.add_option("-U", "--url", dest="url",action="store_true", help="match on url") 20 | parser.add_option("-T", "--type", dest="type",action="store_true", help="match on (warc) record type") 21 | parser.add_option("-C", "--content-type", dest="content_type",action="store_true", help="match on (warc) record content type") 22 | parser.add_option("-H", "--http-content-type", dest="http_content_type",action="store_true", help="match on http payload content type") 23 | parser.add_option("-D", "--warc-date", dest="warc_date",action="store_true", help="match on WARC-Date header") 24 | parser.add_option("-L", "--log-level", dest="log_level", help="log level(ignored)") 25 | 26 | parser.set_defaults(output_directory=None, limit=None, log_level="info", invert=False, url=None, content_type=None, type=None) 27 | 28 | def parse_http_response(record): 29 | message = ResponseMessage(RequestMessage()) 30 | remainder = message.feed(record.content[1]) 31 | message.close() 32 | if remainder or not message.complete(): 33 | if remainder: 34 | logging.warning('trailing data in http response for %s'% record.url) 35 | if not message.complete(): 36 | logging.warning('truncated http response for %s'%record.url) 37 | 38 | header = message.header 39 | 40 | mime_type = [v for k,v in header.headers if k.lower() == b'content-type'] 41 | if mime_type: 42 | mime_type = mime_type[0].split(b';')[0] 43 | else: 44 | mime_type = None 45 | 46 | return header.code, mime_type, message 47 | 48 | def main(argv): 49 | (options, input_files) = parser.parse_args(args=argv[1:]) 50 | 51 | try: # python3 52 | out = sys.stdout.buffer 53 | except AttributeError: # python2 54 | out = sys.stdout 55 | 56 | if len(input_files) < 1: 57 | parser.error("no 
pattern") 58 | 59 | 60 | pattern, input_files = input_files[0].encode(), input_files[1:] 61 | 62 | 63 | invert = options.invert 64 | pattern = re.compile(pattern) 65 | if not input_files: 66 | fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None) 67 | filter_archive(fh, options, pattern, out) 68 | else: 69 | for name in expand_files(input_files): 70 | fh = WarcRecord.open_archive(name, gzip="auto") 71 | filter_archive(fh, options, pattern,out) 72 | fh.close() 73 | 74 | 75 | 76 | return 0 77 | 78 | def filter_archive(fh, options, pattern, out): 79 | invert = options.invert 80 | for record in fh: 81 | if options.url: 82 | if bool(record.url and pattern.search(record.url)) ^ invert : 83 | record.write_to(out) 84 | 85 | elif options.type: 86 | if bool(record.type and pattern.search(record.type)) ^ invert: 87 | record.write_to(out) 88 | 89 | elif options.content_type: 90 | if bool(record.content_type and pattern.search(record.content_type)) ^ invert: 91 | record.write_to(out) 92 | 93 | elif options.http_content_type: 94 | if record.type == WarcRecord.RESPONSE and record.content_type.startswith(b'application/http'): 95 | code, content_type, message = parse_http_response(record) 96 | 97 | if bool(content_type and pattern.search(content_type)) ^ invert: 98 | record.write_to(out) 99 | 100 | elif options.warc_date: 101 | if bool(record.date and pattern.search(record.date)) ^ invert: 102 | record.write_to(out) 103 | 104 | else: 105 | found = False 106 | for name, value in record.headers: 107 | if pattern.search(value): 108 | found = True 109 | break 110 | 111 | content_type, content = record.content 112 | if not found: 113 | found = bool(pattern.search(content)) 114 | 115 | 116 | if found ^ invert: 117 | record.write_to(out) 118 | 119 | 120 | def run(): 121 | sys.exit(main(sys.argv)) 122 | 123 | 124 | if __name__ == '__main__': 125 | run() 126 | 127 | 128 | -------------------------------------------------------------------------------- /hanzo/warcindex.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warcindex - dump warc index""" 3 | 4 | import os 5 | import sys 6 | 7 | import sys 8 | import os.path 9 | 10 | from optparse import OptionParser 11 | 12 | from .warctools import WarcRecord, expand_files 13 | 14 | parser = OptionParser(usage="%prog [options] warc warc warc") 15 | 16 | parser.add_option("-l", "--limit", dest="limit") 17 | parser.add_option("-O", "--output-format", dest="output_format", help="output format (ignored)") 18 | parser.add_option("-o", "--output", dest="output_format", help="output file (ignored)") 19 | 20 | parser.add_option("-L", "--log-level", dest="log_level") 21 | 22 | parser.set_defaults(output=None, limit=None, log_level="info") 23 | 24 | def main(argv): 25 | (options, input_files) = parser.parse_args(args=argv[1:]) 26 | 27 | try: # python3 28 | out = sys.stdout.buffer 29 | except AttributeError: # python2 30 | out = sys.stdout 31 | 32 | if len(input_files) < 1: 33 | parser.error("no imput warc file(s)") 34 | 35 | out.write(b'#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length\n') 36 | for name in expand_files(input_files): 37 | fh = WarcRecord.open_archive(name, gzip="auto") 38 | 39 | try: 40 | for (offset, record, errors) in fh.read_records(limit=None): 41 | if record: 42 | fields = [name.encode('utf-8'), 43 | str(offset).encode('utf-8'), 44 | record.type or b'-', 45 | record.url or b'-', 46 | record.id or b'-', 47 | record.content_type or b'-', 48 | str(record.content_length).encode('utf-8')] 49 | out.write(b' '.join(fields) + b'\n') 50 | elif errors: 51 | pass 52 | # ignore 53 | else: 54 | pass 55 | # no errors at tail 56 | 57 | finally: 58 | fh.close() 59 | 60 | return 0 61 | 62 | 63 | def run(): 64 | sys.exit(main(sys.argv)) 65 | 66 | 67 | if __name__ == '__main__': 68 | run() 69 | 70 | 71 | -------------------------------------------------------------------------------- 
def parse_http_response(record):
    """Parse the HTTP response held in a warc record's body.

    Returns (status_code, mime_type, message) where mime_type is the
    Content-Type value stripped of parameters, or None when absent.
    Trailing or truncated data is logged but does not abort parsing.
    """
    message = ResponseMessage(RequestMessage())
    leftover = message.feed(record.content[1])
    message.close()

    if leftover:
        logging.warning('trailing data in http response for %s' % record.url)
    if not message.complete():
        logging.warning('truncated http response for %s' % record.url)

    header = message.header

    # first content-type header wins; drop parameters such as charset
    mime_type = next(
        (v for k, v in header.headers if k.lower() == 'content-type'), None)
    if mime_type is not None:
        mime_type = mime_type.split(';')[0]

    return header.code, mime_type, message
# fallback link extractor helpers

def attr_extractor(*names):
    """Return a function that pulls the values of the given attribute
    names out of an (attr, value) list, skipping empty values."""
    def _extractor(attrs):
        return [value for key, value in attrs if key in names and value]
    return _extractor


def meta_extractor(attrs):
    """Extract url=... targets from a <meta> tag's content attribute,
    e.g. <meta http-equiv="refresh" content="0;url=http://example.com/">.

    Returns a list of the extracted urls (possibly empty).
    """
    content = [value for key, value in attrs if key == "content" and value]
    urls = []
    for value in content:
        for pair in value.split(";"):
            # BUG FIX: was pair.split("=", 2), which truncated urls
            # containing '=' (e.g. query strings) at the second '='.
            # Split once so bits[1] keeps everything after the first '='.
            bits = pair.split("=", 1)
            if len(bits) > 1 and bits[0].lower() == "url":
                urls.append(bits[1].strip())
    return urls
def base_extractor(self, attrs):
    """LinkParser method: honor a <base href=...> tag by updating
    self.base to the last non-empty href seen; contributes no links."""
    for key, value in attrs:
        if key == "href" and value:
            # later href values win, matching "last one counts"
            self.base = value
    return ()

def handle_starttag(self, tag, attrs):
    """LinkParser method: on each opening tag, run the extractor
    registered for that tag (if any) and collect the links it yields."""
    extract = self.tag_extractor.get(tag, None)
    if extract is None:
        return
    self.links.extend(extract(attrs))
def main(argv):
    """Entry point for warclinks: print every outbound link found in the
    html responses of the given warc files. Returns 0 on success, and a
    negative count of failed archives otherwise."""
    options, warcs = parser.parse_args(args=argv[1:])
    logging.basicConfig(level=LEVELS[options.log_level])

    if not warcs:
        parser.error("missing warcs(s)")

    exit_code = 0

    for warc in expand_files(warcs):
        try:
            with closing(WarcRecord.open_archive(filename=warc, gzip="auto")) as fh:
                for link in extract_links_from_warcfh(fh):
                    print(link)
        except Exception as e:
            # a broken archive lowers the exit code but does not stop the run
            logging.error(str(e))
            exit_code -= 1

    return exit_code
def dump_payload_from_stream(fh):
    """Read the next record from fh and dump its payload to stdout.

    For WARC response records wrapping an HTTP response, the HTTP
    headers are parsed (and skipped) so only the message body is
    written; any other record's content is dumped verbatim.
    """
    try:  # python3
        out = sys.stdout.buffer
    except AttributeError:  # python2
        out = sys.stdout

    for (offset, record, errors) in fh.read_records(limit=1, offsets=False):
        if record:
            if (record.type == WarcRecord.RESPONSE
                    and record.content_type.startswith(b'application/http')):
                # parse the embedded http headers so only the body is dumped
                f = FileHTTPResponse(record.content_file)
                f.begin()
            else:
                f = record.content_file

            buf = f.read(8192)
            while buf != b'':
                out.write(buf)
                buf = f.read(8192)

        elif errors:
            # BUG FIX: the original formatted "%s:%d" % (name, ...) with an
            # undefined variable `name`, raising NameError on the error
            # path; no filename is available here, so report the offset.
            # Also send the error detail lines to stderr with the header
            # (they previously went to stdout, mixing with payload data).
            print("warc errors at offset %d" % (offset if offset else 0),
                  file=sys.stderr)
            for e in errors:
                print('\t', e, file=sys.stderr)
def expand_files(files):
    """Yield filenames from files, expanding any s3: prefix into the
    individual keys it matches via a bucket listing."""
    for name in files:
        if name.startswith('s3:'):
            # an s3 "filename" is really a prefix; enumerate matching keys
            for key in list_files(name):
                yield key
        else:
            yield name
class ArcRecord(ArchiveRecord):
    """Represents a record in an arc file."""
    # BUG FIX: the docstring above was originally placed *after* the
    # TRAILER assignment, making it a no-op string expression instead of
    # the class docstring (__doc__ was None).

    TRAILER = b'\n'  # an ARC record is trailed by single unix newline

    def __init__(self, headers=None, content=None, errors=None):
        ArchiveRecord.__init__(self, headers, content, errors)

    @property
    def type(self):
        # every plain ARC record reports itself as a response; the
        # leading file header record overrides this (see ArcRecordHeader)
        return b"response"

    def _write_to(self, out, nl):
        #TODO: empty method? ARC serialization is not implemented
        pass

    @classmethod
    def make_parser(cls):
        """Constructs a parser for arc records."""
        return ArcParser()
def parse(self, stream, offset, line=None):
    """Parses a stream as an arc archive and returns an Arc record along
    with the offset in the stream of the end of the record."""
    record = None
    content_type = None
    content_length = None
    if line is None:
        line = stream.readline()

    # skip blank separator lines; EOF (empty string) ends parsing
    while not line.rstrip():
        if not line:
            return (None, (), offset)
        line = stream.readline()

    if line.startswith(b'filedesc:'):
        # the leading filedesc record declares, inside its own body, the
        # header column names used by every later record in this file
        raw_headers = []
        raw_headers.append(line)
        # read headers named in body of record
        # to assign names to header, to read body of record
        arc_version_line = stream.readline()
        raw_headers.append(arc_version_line)
        arc_names_line = stream.readline()
        raw_headers.append(arc_names_line)

        arc_version = arc_version_line.strip()

        # configure parser instance: the order and number of headers
        # differ between arc v1 and v2
        self.version = arc_version.split()[0]
        self.headers = arc_names_line.strip().split()

        # now we have read header field in record body
        # we can extract the headers from the current record,
        # and read the length field

        # which is in a different place with v1 and v2

        # read headers
        arc_headers = self.parse_header_list(line)

        # extract content, ignoring header lines parsed already
        content_type, content_length, errors = \
            self.get_content_headers(arc_headers)

        # the version/names lines were already consumed out of the body,
        # so shrink the remaining content length accordingly
        content_length = content_length \
            - len(arc_version_line) \
            - len(arc_names_line)

        record = ArcRecordHeader(headers=arc_headers,
                                 version=arc_version,
                                 errors=errors,
                                 raw_headers=raw_headers)
    else:
        if not self.headers:
            # a plain record before any filedesc: we cannot name its fields
            raise Exception('missing filedesc')
        headers = self.parse_header_list(line)
        content_type, content_length, errors = \
            self.get_content_headers(headers)

        record = ArcRecord(headers = headers, errors=errors)

    line = None

    # hand the stream itself to the record; bytes_to_eoc bounds payload
    # reads to the declared content length
    record.content_file = stream
    record.content_file.bytes_to_eoc = content_length

    return (record, (), offset)
def is_gzip_file(file_handle):
    """Return True when the next bytes in file_handle are the two-byte
    gzip magic number.

    Peeks at most two bytes and seeks back by however many were read,
    leaving the file position unchanged.
    """
    magic = file_handle.read(2)
    file_handle.seek(-len(magic), 1)
    return magic == b'\x1f\x8b'
class MixedParser(ArchiveParser):
    """Parser for streams that may interleave warc and arc records,
    dispatching per record on the leading line."""

    def __init__(self):
        self.arc = ArcParser()
        self.warc = WarcParser()

    def parse(self, stream, offset=None, line=None):
        """Read the next record, delegating to the warc parser when the
        first non-blank line starts with b'WARC', otherwise to the arc
        parser. Returns (record, errors, offset); (None, (), offset) at
        end of stream."""
        current = stream.readline() if line is None else line

        while current:
            if current.startswith(b'WARC'):
                return self.warc.parse(stream, offset, line=current)
            if current not in (b'\n', b'\r\n', b'\r'):
                # anything non-blank that is not a warc header is arc
                return self.arc.parse(stream, offset, line=current)
            # skip blank padding between records
            current = stream.readline()

        return None, (), offset
def add_headers(**kwargs):
    """a useful helper for defining header names in record formats"""

    def _add_headers(cls):
        # attach each NAME -> header-bytes pair as a class attribute and
        # record the declared names on _HEADERS
        for attr_name, header in kwargs.items():
            setattr(cls, attr_name, header)
        cls._HEADERS = list(kwargs)
        return cls

    return _add_headers


class ArchiveParser(object):
    """ methods parse, and trim """
    pass
@property
def content_file(self):
    """
    File handle for streaming the payload.

    If the record has been read from a RecordStream, content_file wraps the
    same underlying file handle as the RecordStream itself. This has
    important implications. Results are undefined if you try to read from
    content_file after reading the next record from RecordStream; and
    closing content_file will close the RecordStream, and vice versa.
    But if you avoid these caveats, content_file takes care to bound itself
    within the content-length specified in the warc record, so that reading
    to the end of content_file will bring you only to the end of the
    record's payload.

    When creating a record for writing and supplying content_file, the
    record can only be written once, since writing the record entails
    reading content_file and advancing the file position. Subsequent
    attempts to write using content_file will throw an exception.
    """
    return self._content_file

@content_file.setter
def content_file(self, fh):
    self._content_file = fh
    # a freshly assigned handle is valid until consumed; assigning None
    # marks it invalid so later writes fail loudly
    self._content_file_valid = fh is not None

@property
def content(self):
    """A tuple (content_type, content). When first referenced, content[0]
    is populated from the Content-Type header, and content[1] by reading
    self.content_file."""
    if self._content is None:
        content_type = self.get_header(self.CONTENT_TYPE)
        try:
            content = self.content_file.read()
            self._content = (content_type, content)
        finally:
            # drop the stream reference even on failure so a stale,
            # half-read handle is never reused
            self.content_file = None

    return self._content
Otherwise, 108 | return the value of the Content-Type header.""" 109 | if self._content is None: 110 | content_type = self.get_header(self.CONTENT_TYPE) 111 | if content_type is not None: 112 | return content_type 113 | 114 | return self.content[0] 115 | 116 | @property 117 | def content_length(self): 118 | """If self.content tuple was supplied, or has already been snarfed, or 119 | we don't have a Content-Length header, return len(self.content[1]). 120 | Otherwise, return the value of the Content-Length header.""" 121 | if self._content is None: 122 | content_length = self.get_header(self.CONTENT_LENGTH) 123 | if content_length is not None: 124 | return int(content_length) 125 | 126 | return len(self.content[1]) 127 | 128 | @property 129 | def url(self): 130 | return self.get_header(self.URL) 131 | 132 | def get_header(self, name): 133 | """Returns value of first header found matching name, case 134 | insensitively.""" 135 | for k, v in self.headers: 136 | if name.lower() == k.lower(): 137 | return v 138 | 139 | def set_header(self, name, value): 140 | self.headers = [(k, v) for (k, v) in self.headers if k != name] 141 | self.headers.append((name, value)) 142 | 143 | def dump(self, content=True): 144 | print('Headers:') 145 | for (h, v) in self.headers: 146 | print('\t%s:%s' % (h.decode('latin1'), v.decode('latin1'))) 147 | if content and self.content: 148 | print('Content Headers:') 149 | content_type, content_body = self.content 150 | print('\t' + self.CONTENT_TYPE.decode('latin1'), ':', content_type.decode('latin1')) 151 | print('\t' + self.CONTENT_LENGTH.decode('latin1'), ':', len(content_body)) 152 | print('Content:') 153 | ln = min(1024, len(content_body)) 154 | abbr_strp_content = strip.sub(lambda x: ('\\x%00X' % ord(x.group())).encode('ascii'), content_body[:ln]) 155 | print('\t' + abbr_strp_content.decode('ascii')) 156 | print('\t...') 157 | print() 158 | else: 159 | print('Content: none') 160 | print() 161 | print() 162 | if self.errors: 163 | 
def write_to(self, out, newline=b'\x0D\x0A', gzip=False):
    """Serialize this record to out; with gzip=True the record is
    wrapped in its own gzip member. newline is the format's line
    terminator (CRLF by default, as warc requires)."""
    if self.content_file is not None:
        if not self._content_file_valid:
            # content_file is a live stream and can only be consumed once
            raise Exception('cannot write record because content_file has already been used')

    if gzip:
        # NOTE(review): the hasattr(out, 'mode') test appears to
        # distinguish real files (whose mode GzipFile can inherit) from
        # other writables, which need an explicit append mode - confirm.
        if hasattr(out, 'mode'):
            out = GzipFile(fileobj=out)
        else:
            out = GzipFile(fileobj=out, mode='ab')

    self._write_to(out, newline)

    if gzip:
        # closing the GzipFile finalizes this member; the wrapped
        # stream itself stays open for further records
        out.flush()
        out.close()

    if self.content_file is not None:
        # mark the stream consumed so a second write raises above
        self._content_file_valid = False

def _write_to(self, out, newline):
    # must be overridden by the concrete record class (warc/arc)
    raise AssertionError('this is bad')

### class methods for parsing
@classmethod
def open_archive(cls, filename=None, file_handle=None,
                 mode="rb", gzip="auto", offset=None, length=None):
    """Generically open an archive - magic autodetect"""
    if cls is ArchiveRecord:
        cls = None  # means guess
    return open_record_stream(cls, filename, file_handle, mode, gzip, offset, length)
def open_url(url, offset=None, length=None):
    """Fetch s3://bucket/key into an in-memory buffer and return it
    positioned at the start; offset/length select a byte range via an
    HTTP Range request."""
    p = urlparse(url)
    bucket_name = p.netloc
    key = p.path[1:]
    conn = S3Connection()
    bucket = conn.get_bucket(bucket_name)
    k = Key(bucket)
    k.key = key
    if offset is not None and length is not None:
        headers = {'Range': 'bytes=%d-%d' % (offset, offset + length)}
    elif offset is not None:
        headers = {'Range': 'bytes=%d-' % offset}
    else:
        headers = {}

    s = StringIO()
    # NOTE(review): StringIO is a text buffer on python3 but s3 archive
    # data is bytes - this looks like it should be io.BytesIO; confirm
    # against the python version this path actually runs on.
    k.get_contents_to_file(s, headers=headers)
    s.seek(0)
    return s

def list_files(prefix):
    """Yield an s3://bucket/key url for every key under the given
    s3://bucket/prefix, paging through the listing with a marker until
    the result set is no longer truncated."""
    p = urlparse(prefix)
    bucket_name = p.netloc
    prefix = p.path[1:]

    conn = S3Connection()

    bucket = conn.get_bucket(bucket_name)
    complete = False
    marker = ''

    while not complete:
        rs = bucket.get_all_keys(prefix=prefix, marker=marker, delimiter='')
        for k in rs:
            yield 's3://%s/%s' % (bucket_name, k.key)
            # resume the next page after the last key seen
            marker = k.key

        complete = not rs.is_truncated
def open_record_stream(record_class=None, filename=None, file_handle=None,
                       mode="rb", gzip="auto", offset=None, length=None):
    """Open a (w)arc archive as a RecordStream.

    Can take a filename or a file_handle. Normally called indirectly
    from a record class, i.e. WarcRecord.open_archive. If record_class
    is None, the record type is guessed from the stream contents, and
    gzip="auto" sniffs per-record gzip compression.
    """
    if file_handle is None:
        if filename.startswith('s3://'):
            from . import s3
            # s3 honors offset/length itself via a Range request
            file_handle = s3.open_url(filename, offset=offset, length=length)
        else:
            file_handle = open(filename, mode=mode)
            if offset is not None:
                file_handle.seek(offset)

    # IDIOM FIX: use `is None` rather than `== None` for None checks
    if record_class is None:
        record_class = guess_record_type(file_handle)

    if record_class is None:
        # BUG FIX: the original message said "Failed to guess compression"
        # but what failed here is detecting the record type
        raise Exception('Failed to guess record type')

    record_parser = record_class.make_parser()

    if gzip == 'auto':
        if (filename and filename.endswith('.gz')) or is_gzip_file(file_handle):
            gzip = 'record'
            #debug('autodetect: record gzip')
        else:
            # assume uncompressed file
            #debug('autodetected: uncompressed file')
            gzip = None

    if gzip == 'record':
        return GzipRecordStream(file_handle, record_parser)
    elif gzip == 'file':
        return GzipFileStream(file_handle, record_parser)
    else:
        return RecordStream(file_handle, record_parser)
def seek(self, offset, pos=0):
    """Same as a seek on a file"""
    self.fh.seek(offset, pos)

def read_records(self, limit=1, offsets=True):
    """Yield a tuple of (offset, record, errors) where
    Offset is either a number or None.
    Record is an object and errors is an empty list
    or record is none and errors is a list"""
    nrecords = 0
    while limit is None or nrecords < limit:
        offset, record, errors = self._read_record(offsets)
        nrecords += 1
        yield (offset, record, errors)
        # a falsy record marks end-of-stream (or an unparseable tail)
        if not record:
            break

def __iter__(self):
    # iterate records only; decoding errors become exceptions here,
    # unlike read_records which reports them per tuple
    while True:
        _, record, errors = self._read_record(offsets=False)
        if record:
            yield record
        elif errors:
            error_str = ",".join(str(error) for error in errors)
            raise Exception("Errors while decoding %s" % error_str)
        else:
            break

def _read_record(self, offsets):
    """overridden by sub-classes to read individual records"""
    if self.bytes_to_eoc is not None:
        self._skip_to_eoc()  # skip to end of previous record
        self.bytes_to_eoc = None

    # handle any sort of valid or invalid record terminator
    while True:
        offset = self.fh.tell() if offsets else None
        line = self.fh.readline()
        if not re.match(br'^[\r\n]+$', line):
            break

    # the parser receives this stream itself, so the returned record's
    # content_file wraps the same underlying handle
    record, errors, offset = self.record_parser.parse(self, offset, line)
    return offset, record, errors

def write(self, record):
    """Writes an archive record to the stream"""
    record.write_to(self)

def close(self):
    """Close the underlying file handle."""
    self.fh.close()
Exception('expected {} bytes but only read {}'.format(read_size, len(buf))) 121 | 122 | def _read(self, count=None): 123 | """Raw read, will read into next record if caller isn't careful""" 124 | if count is not None: 125 | result = self.fh.read(count) 126 | else: 127 | result = self.fh.read() 128 | 129 | if self.bytes_to_eoc is not None: 130 | self.bytes_to_eoc -= len(result) 131 | 132 | return result 133 | 134 | def read(self, count=None): 135 | """Safe read for reading content, will not read past the end of the 136 | payload, assuming self.bytes_to_eoc is set. The record's trailing 137 | bytes, \\r\\n\\r\\n for warcs or \\n for arcs, will remain when this 138 | method returns "". 139 | """ 140 | if self.bytes_to_eoc is not None and count is not None: 141 | read_size = min(count, self.bytes_to_eoc) 142 | elif self.bytes_to_eoc is not None: 143 | read_size = self.bytes_to_eoc 144 | elif count is not None: 145 | read_size = count 146 | else: 147 | read_size = None 148 | 149 | return self._read(read_size) 150 | 151 | # XXX dumb implementation to support python3 http.client 152 | def readinto(self, b): 153 | tmp = self.read(count=len(b)) 154 | b[:len(tmp)] = tmp 155 | return len(tmp) 156 | 157 | def readline(self, maxlen=None): 158 | """Safe readline for reading content, will not read past the end of the 159 | payload, assuming self.bytes_to_eoc is set. The record's trailing 160 | bytes, \\r\\n\\r\\n for valid warcs or \\n for valid arcs, will remain 161 | when this method returns "". 
162 | """ 163 | if self.bytes_to_eoc is not None and maxlen is not None: 164 | lim = min(maxlen, self.bytes_to_eoc) 165 | elif self.bytes_to_eoc is not None: 166 | lim = self.bytes_to_eoc 167 | elif maxlen is not None: 168 | lim = maxlen 169 | else: 170 | lim = None 171 | 172 | if lim is not None: 173 | result = self.fh.readline(lim) 174 | else: 175 | result = self.fh.readline() 176 | 177 | if self.bytes_to_eoc is not None: 178 | self.bytes_to_eoc -= len(result) 179 | return result 180 | 181 | CHUNK_SIZE = 8192 # the size to read in, make this bigger things go faster. 182 | 183 | class GeeZipFile(gzip.GzipFile): 184 | """Extends gzip.GzipFile to remember self.member_offset, the raw file 185 | offset of the current gzip member.""" 186 | 187 | def __init__(self, filename=None, mode=None, 188 | compresslevel=9, fileobj=None, mtime=None): 189 | # ignore mtime for python 2.6 190 | gzip.GzipFile.__init__(self, filename=filename, mode=mode, compresslevel=compresslevel, fileobj=fileobj) 191 | self.member_offset = None 192 | 193 | # hook in to the place we seem to be able to reliably get the raw gzip 194 | # member offset 195 | def _read(self, size=1024): 196 | if self._new_member: 197 | try: 198 | # works for python3.2 199 | self.member_offset = self.fileobj.tell() - self.fileobj._length + (self.fileobj._read or 0) 200 | except AttributeError: 201 | # works for python2.7 202 | self.member_offset = self.fileobj.tell() 203 | 204 | return gzip.GzipFile._read(self, size) 205 | 206 | class GzipRecordStream(RecordStream): 207 | """A stream to read/write concatted file made up of gzipped 208 | archive records""" 209 | def __init__(self, file_handle, record_parser): 210 | RecordStream.__init__(self, GeeZipFile(fileobj=file_handle), record_parser) 211 | self.raw_fh = file_handle 212 | 213 | def _read_record(self, offsets): 214 | if self.bytes_to_eoc is not None: 215 | self._skip_to_eoc() # skip to end of previous record 216 | self.bytes_to_eoc = None 217 | 218 | # handle any sort 
of valid or invalid record terminator 219 | while True: 220 | line = self.fh.readline() 221 | if not re.match(br'^[\r\n]+$', line): 222 | break 223 | 224 | record, errors, _offset = \ 225 | self.record_parser.parse(self, offset=None, line=line) 226 | 227 | offset = self.fh.member_offset 228 | 229 | return offset, record, errors 230 | 231 | def seek(self, offset, pos=0): 232 | """Same as a seek on a file""" 233 | self.raw_fh.seek(offset, pos) 234 | # trick to avoid closing and recreating GzipFile, does it always work? 235 | self.fh._new_member = True 236 | 237 | class GzipFileStream(RecordStream): 238 | """A stream to read/write gzipped file made up of all archive records""" 239 | def __init__(self, file_handle, record): 240 | RecordStream.__init__(self, gzip.GzipFile(fileobj=file_handle), record) 241 | 242 | def _read_record(self, offsets): 243 | # no useful offsets in a gzipped file 244 | if self.bytes_to_eoc is not None: 245 | self._skip_to_eoc() # skip to end of previous record 246 | self.bytes_to_eoc = None 247 | 248 | # handle any sort of valid or invalid record terminator 249 | while True: 250 | line = self.fh.readline() 251 | if not re.match(br'^[\r\n]+$', line): 252 | break 253 | 254 | record, errors, _offset = \ 255 | self.record_parser.parse(self, offset=None, line=line) 256 | 257 | return offset, record, errors 258 | 259 | -------------------------------------------------------------------------------- /hanzo/warctools/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/warctools/21db132fd3e4b4042cd011d9dc3fb30276a5a0b6/hanzo/warctools/tests/__init__.py -------------------------------------------------------------------------------- /hanzo/warctools/tests/test_warctools.py: -------------------------------------------------------------------------------- 1 | # vim: set sw=4 et: 2 | 3 | import unittest 4 | 5 | # want unittest2 for python2.6 6 | try: 7 | 
unittest.TestCase.assertIsNone 8 | except AttributeError: 9 | import unittest2 10 | unittest = unittest2 11 | 12 | import tempfile 13 | import gzip 14 | from hanzo import warctools, httptools 15 | 16 | try: 17 | from io import BytesIO 18 | except ImportError: 19 | from StringIO import StringIO 20 | BytesIO = StringIO 21 | 22 | class ArcRecordTerminatorTest(unittest.TestCase): 23 | REC1_CONTENT = (b'1 0 InternetArchive\n' 24 | + b'URL IP-address Archive-date Content-type Archive-length\n' 25 | + b'Here is some funky arc header content!\n') 26 | RECORD1 = b'filedesc://ArcRecordTerminatorTest.arc 0.0.0.0 20131113000000 text/plain ' + str(len(REC1_CONTENT)).encode('ascii') + b'\n' + REC1_CONTENT 27 | 28 | REC2_CONTENT = (b'HTTP/1.1 200 OK\r\n' 29 | + b'Content-Type: text/plain\r\n' 30 | + b'Content-Length: 12\r\n' 31 | + b'\r\n' 32 | + b'01234567890\r\n') 33 | RECORD2 = b'http://example.org/ 192.168.1.1 20131113000000 text/plain ' + str(len(REC2_CONTENT)).encode('ascii') + b'\n' + REC2_CONTENT 34 | 35 | REC1_GZ = b"\x1f\x8b\x08\x00\xbf\xa9\x99R\x02\xff=NK\x0e\x820\x14\xdc\xf7\x14\xcf\x03\xf0\xa9\xc4\x8d;\xe3F\x12\x17\x86\xe0\x01\x9av\x90Fh\xc9\xeb\xd3\xc8\xedE4\xce\xec\xe6\x97\xe9\xfc\x00\x87d\xf7Eq`\xdb\xc0Fv-x\xf4\xc1H\xe4\x16Ir\xc3\x96\xca|%mK]i\xad\xabr\x05\t^RL\x83\xf1\x81\xb4\xde)M%\xd5A\xc0\x01\xb2\xac\xf5\xfe\tum\xceT_2\xe3\x1c#%\xfa\xc9\x993\x02:\xc6%\x1c$\x93y\xc2\xdf\x19\x10n\xd2\xab\x13\x18\xe4\x13\xa58\x82\xbaG\xb8\xcf\xf49\xd2\xc380\xd9os\xa3\xd4\x1b\xa0\xa9\x1c5\xc1\x00\x00\x00" 36 | REC2_GZ = b"\x1f\x8b\x08\x00\xbf\xa9\x99R\x02\xffM\xca1\x0e\xc20\x0c@\xd1\xddR\xee\xe0\x0b\x10\xdb\t\xb4iV\x16$\x90`\xc8\x05:X-RI#\xe4\xa1\xdc\x1e\t\x06\xf8\xeb\x7f\xb3Y\xcbD\xba\x8d\x8f\xb6\xa8_\x9f\x13\xa1\x0c\xc1K\x97\xbcx\xc1\xc0\x12E$\xf2'4\xdd\x8c\xda2\xde+\xf6\tN\xa5\xdc\xe8\xab\x18\xafg\x07\xc7\xb5\x9aV\xdb\x95W\xd3\xfc\x87\x7f\xe7\xa2u\xb29\xa3\x04\x07\x0eXB\xdc\x1f\xba>\r\xec\x00\xde#Pz\x9d\x8c\x00\x00\x00" 37 | 38 | def _arc_gz(self, terminator=b'\r\n\r\n'): 39 | 
return BytesIO(self.REC1_GZ + self.REC2_GZ) 40 | 41 | def _arc(self, terminator): 42 | s = self.RECORD1 + terminator + self.RECORD2 + terminator 43 | f = BytesIO(s) 44 | return f 45 | 46 | def _test_terminator(self, terminator): 47 | # print('testing warc with record terminator {}'.format(repr(terminator))) 48 | fin = self._arc(terminator) 49 | try: 50 | self._run_checks(fin, terminator, False) 51 | finally: 52 | fin.close() 53 | 54 | fin = self._arc_gz(terminator) 55 | try: 56 | self._run_checks(fin, terminator, True) 57 | finally: 58 | fin.close() 59 | 60 | def _run_checks(self, fin, terminator, gzipped): 61 | fh = warctools.ArchiveRecord.open_archive(file_handle=fin) 62 | try: 63 | i = 0 64 | for (offset, record, errors) in fh.read_records(limit=None, offsets=True): 65 | if i == 0: 66 | self.assertEqual(offset, 0) 67 | self.assertEqual(type(record), warctools.arc.ArcRecordHeader) 68 | self.assertEqual(record.type, b'filedesc') 69 | self.assertEqual(record.content_type, b'text/plain') 70 | # content_length != len(record.content[1]) here because 71 | # ArcParser reads and parses part of the "content" of the 72 | # arc header record 73 | self.assertEqual(record.content_length, 115) 74 | self.assertEqual(record.content[1], b'Here is some funky arc header content!\n') 75 | elif i == 1: 76 | if not gzipped: 77 | self.assertEqual(offset, len(self.RECORD1) + len(terminator)) 78 | else: 79 | self.assertEqual(offset, len(self.REC1_GZ)) 80 | self.assertEqual(type(record), warctools.arc.ArcRecord) 81 | self.assertEqual(record.type, b'response') 82 | self.assertEqual(record.content_type, b'text/plain') 83 | self.assertEqual(record.content_length, 78) 84 | self.assertEqual(record.content[1], b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 12\r\n\r\n01234567890\r\n') 85 | elif i == 2: 86 | if not gzipped: 87 | self.assertEqual(offset, len(self.RECORD1) + len(self.RECORD2) + 2 * len(terminator)) 88 | else: 89 | self.assertLess(offset, len(self.RECORD1) + 
len(self.RECORD2) + 2 * len(terminator)) 90 | self.assertIsNone(record) 91 | else: 92 | self.fail('this line should not be reached') 93 | 94 | i += 1 95 | finally: 96 | fh.close() 97 | 98 | def runTest(self): 99 | # anything works as long as it contains only \r and \n and ends with \n 100 | self._test_terminator(b'\n') # the good one 101 | self._test_terminator(b'\r\n\r\n') 102 | self._test_terminator(b'\r\n') 103 | self._test_terminator(b'\n\r\n') 104 | self._test_terminator(b'\n\n\r\n') 105 | self._test_terminator(b'\r\n\n') 106 | self._test_terminator(b'\r\n\r\n\r\n') 107 | self._test_terminator(b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n') 108 | self._test_terminator(b'\n\n') 109 | self._test_terminator(b'\n\n\n') 110 | self._test_terminator(b'\n\n\n\n') 111 | self._test_terminator(b'\r\n\n\r\n\n') 112 | self._test_terminator(b'\r\r\r\r\r\r\n') 113 | self._test_terminator(b'\r\r\r\r\r\r\n\n') 114 | self._test_terminator(b'\r\r\r\r\r\r\n\n\n') 115 | 116 | class WarcRecordTerminatorTest(unittest.TestCase): 117 | RECORD1 = (b'WARC/1.0\r\n' 118 | + b'WARC-Record-ID: \r\n' 119 | + b'WARC-Type: warcinfo\r\n' 120 | + b'Content-Type: application/warc-fields\r\n' 121 | + b'Content-Length: 30\r\n' 122 | + b'\r\n' 123 | + b'format: WARC File Format 1.0\r\n') 124 | 125 | RECORD2 = (b'WARC/1.0\r\n' 126 | + b'WARC-Type: response\r\n' 127 | + b'WARC-Record-ID: \r\n' 128 | + b'WARC-Target-URI: http://example.org/\r\n' 129 | + b'Content-Type: application/http;msgtype=response\r\n' 130 | + b'Content-Length: 78\r\n' 131 | + b'\r\n' 132 | + b'HTTP/1.1 200 OK\r\n' 133 | + b'Content-Type: text/plain\r\n' 134 | + b'Content-Length: 12\r\n' 135 | + b'\r\n' 136 | + b'01234567890\r\n') 137 | 138 | RECORD1_GZ = b'\x1f\x8b\x08\x00\xce\xae\x99R\x02\xff\x0bw\x0cr\xd67\xd43\xe0\xe5\n\x07\xb2t\x83R\x93\xf3\x8bRt=]\xac\x14lJ\x8b\xf2\xacJK3S\xac\x0c\xa0@\x17\x0b\x01\x03vP\x03B*\x0bR\xad\x14\xca\x13\x8b\x923\xf3\xd2\xf2y\xb9\x9c\xf3\xf3JR\xf3J\xa0\xe2\x89\x05\x059\x99\xc9\x89%\x99\xf9y\xfa 
5\xbai\x99\xa99)\xc5\x08e>\xa9y\xe9%\x19V\n\xc6@\x07\xf1r\xa5\xe5\x17\xe5&\x96X)\x80LVp\xcb\xccIUp\x03\x8b(\x80\x1d\x0c\x82\x00\x04h\xbe\xd2\xbf\x00\x00\x00' 139 | RECORD2_GZ = b'\x1f\x8b\x08\x00\xce\xae\x99R\x02\xffm\x8f\xc9\n\xc20\x10\x86\xef\x81\xbcC^\xa0MR\x97j\\@\xeaAQPJ\xa5\xe7\xa0C-\xd4$\xa4S\xd0\xb7\xb7\x85\x16A\xfd\x0f\xc3\xac\xdf\xcc\xe4\x9b4\xe12\x14\x94\xe4\xad\x17d/\x07\x8ay\xa8\x9d55\xf4\xc9\x14\xae\xd6\xdf\x82\xfdV\xb1e\xe3\x8dj\x9a\xf2\xa6D\xaf\xe0\x8f\xe9%\xd7\x03U\xfb\x020\xb8\xa4{\xc5\xee\x88Nq\x0eO\xfdp\x15\x84\xd6\x17\x9c\x92\xc4\x1a\x04\x83\xfdz\xed\\U^5\x96\xd6\xf0\xae}\xf1\xa8\x0bl+\xab\xcf]\xc3\xc0\x11L\x81w\xc5\xe2\x19%\x94\xec\xb2\xec\xdc>#Y$\x04;\x1d\xbe\xb9\x08O\xe4\xae\xd2\xa5\xf9\x05\xc8\xa8\x03\x08\x19\x8d\xc6\x93i<\x9b\x8b.\xa4\xe4\rV`\x1c`\x1f\x01\x00\x00' 140 | 141 | def _warc_gz(self, terminator=b'\r\n\r\n'): 142 | return BytesIO(self.RECORD1_GZ + self.RECORD2_GZ) 143 | 144 | def _warc(self, terminator): 145 | s = self.RECORD1 + terminator + self.RECORD2 + terminator 146 | f = BytesIO(s) 147 | return f 148 | 149 | def _test_terminator(self, terminator): 150 | # print('testing warc with record terminator {}'.format(repr(terminator))) 151 | fin = self._warc(terminator) 152 | try: 153 | self._run_checks(fin, terminator, False) 154 | finally: 155 | fin.close() 156 | 157 | fin = self._warc_gz(terminator) 158 | try: 159 | self._run_checks(fin, terminator, True) 160 | finally: 161 | fin.close() 162 | 163 | def _run_checks(self, fin, terminator, gzipped): 164 | fh = warctools.ArchiveRecord.open_archive(file_handle=fin) 165 | try: 166 | i = 0 167 | for (offset, record, errors) in fh.read_records(limit=None, offsets=True): 168 | if i == 0: 169 | self.assertEqual(offset, 0) 170 | self.assertEqual(type(record), warctools.warc.WarcRecord) 171 | self.assertEqual(record.type, b'warcinfo') 172 | self.assertEqual(record.content_type, b'application/warc-fields') 173 | self.assertEqual(record.content_length, 30) 174 | 
self.assertEqual(record.content[1], b'format: WARC File Format 1.0\r\n') 175 | elif i == 1: 176 | if not gzipped: 177 | self.assertEqual(offset, len(self.RECORD1) + len(terminator)) 178 | else: 179 | self.assertEqual(offset, len(self.RECORD1_GZ)) 180 | self.assertEqual(type(record), warctools.warc.WarcRecord) 181 | self.assertEqual(record.type, b'response') 182 | self.assertEqual(record.content_type, b'application/http;msgtype=response') 183 | self.assertEqual(record.content_length, 78) 184 | self.assertEqual(record.content[1], b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 12\r\n\r\n01234567890\r\n') 185 | elif i == 2: 186 | if not gzipped: 187 | self.assertEqual(offset, len(self.RECORD1) + len(self.RECORD2) + 2 * len(terminator)) 188 | else: 189 | self.assertLess(offset, len(self.RECORD1) + len(self.RECORD2) + 2 * len(terminator)) 190 | self.assertIsNone(record) 191 | else: 192 | self.fail('this line should not be reached') 193 | 194 | i += 1 195 | finally: 196 | fh.close() 197 | 198 | def runTest(self): 199 | # anything works as long as it contains only \r and \n and ends with \n 200 | self._test_terminator(b'\r\n\r\n') # the good one 201 | self._test_terminator(b'\r\n') 202 | self._test_terminator(b'\n\r\n') 203 | self._test_terminator(b'\n\n\r\n') 204 | self._test_terminator(b'\r\n\n') 205 | self._test_terminator(b'\r\n\r\n\r\n') 206 | self._test_terminator(b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n') 207 | self._test_terminator(b'\n') 208 | self._test_terminator(b'\n\n') 209 | self._test_terminator(b'\n\n\n') 210 | self._test_terminator(b'\n\n\n\n') 211 | self._test_terminator(b'\r\n\n\r\n\n') 212 | self._test_terminator(b'\r\r\r\r\r\r\n') 213 | self._test_terminator(b'\r\r\r\r\r\r\n\n') 214 | self._test_terminator(b'\r\r\r\r\r\r\n\n\n') 215 | 216 | 217 | class WarcWritingTest(unittest.TestCase): 218 | 219 | # XXX should this a part of the library? 
220 | def build_warc_record(self, url, warc_date=None, content_buffer=None, 221 | content_file=None, content_length=None, concurrent_to=None, 222 | warc_type=None, content_type=None, remote_ip=None, profile=None, 223 | refers_to=None, refers_to_target_uri=None, refers_to_date=None, 224 | record_id=None, block_digest=None, payload_digest=None): 225 | 226 | if warc_date is None: 227 | warc_date = warctools.warc.warc_datetime_str(datetime.now()) 228 | 229 | if record_id is None: 230 | record_id = warctools.WarcRecord.random_warc_uuid() 231 | 232 | headers = [] 233 | if warc_type is not None: 234 | headers.append((warctools.WarcRecord.TYPE, warc_type)) 235 | headers.append((warctools.WarcRecord.ID, record_id)) 236 | headers.append((warctools.WarcRecord.DATE, warc_date)) 237 | headers.append((warctools.WarcRecord.URL, url)) 238 | if remote_ip is not None: 239 | headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip)) 240 | if profile is not None: 241 | headers.append((warctools.WarcRecord.PROFILE, profile)) 242 | if refers_to is not None: 243 | headers.append((warctools.WarcRecord.REFERS_TO, refers_to)) 244 | if refers_to_target_uri is not None: 245 | headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri)) 246 | if refers_to_date is not None: 247 | headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date)) 248 | if concurrent_to is not None: 249 | headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to)) 250 | if content_type is not None: 251 | headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type)) 252 | if content_length is not None: 253 | headers.append((warctools.WarcRecord.CONTENT_LENGTH, content_length)) 254 | if block_digest is not None: 255 | headers.append((warctools.WarcRecord.BLOCK_DIGEST, block_digest)) 256 | if payload_digest is not None: 257 | headers.append((warctools.WarcRecord.BLOCK_DIGEST, payload_digest)) 258 | 259 | if content_file is not None: 260 | assert content_buffer is None 261 | 
assert content_length is not None 262 | record = warctools.WarcRecord(headers=headers, content_file=content_file) 263 | else: 264 | assert content_buffer is not None 265 | content_tuple = (content_type, content_buffer) 266 | record = warctools.WarcRecord(headers=headers, content=content_tuple) 267 | 268 | return record 269 | 270 | def build_record_using_tuple(self): 271 | content_buffer = b'Luke, I am your payload' 272 | record = self.build_warc_record(url=b'http://example.org/', 273 | content_buffer=content_buffer, 274 | record_id=b'', 275 | warc_date=b'2013-11-15T00:00:00Z', 276 | warc_type=warctools.WarcRecord.RESPONSE, 277 | content_type=httptools.RequestMessage.CONTENT_TYPE) 278 | return record 279 | 280 | def build_record_using_stream(self): 281 | content_buffer = b'Shmuke, I gam four snayglob' 282 | fh = BytesIO(content_buffer) 283 | record = self.build_warc_record(url=b'http://example.org/', 284 | content_file=fh, content_length=str(len(content_buffer)).encode('ascii'), 285 | record_id=b'', 286 | warc_date=b'2013-11-15T00:00:00Z', 287 | warc_type=warctools.WarcRecord.RESPONSE, 288 | content_type=httptools.RequestMessage.CONTENT_TYPE) 289 | return record 290 | 291 | 292 | def test_write_using_tuple(self): 293 | record = self.build_record_using_tuple() 294 | 295 | f = BytesIO() 296 | record.write_to(f) 297 | self.assertEqual(f.getvalue(), 298 | b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') 299 | f.close() 300 | 301 | # should work again if we do it again 302 | f = BytesIO() 303 | record.write_to(f) 304 | self.assertEqual(f.getvalue(), 305 | b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your 
payload\r\n\r\n') 306 | f.close() 307 | 308 | 309 | def test_write_using_tuple_gz(self): 310 | record = self.build_record_using_tuple() 311 | 312 | f = BytesIO() 313 | record.write_to(f, gzip=True) 314 | f.seek(0) 315 | g = gzip.GzipFile(fileobj=f, mode='rb') 316 | self.assertEqual(g.read(), b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') 317 | g.close() 318 | f.close() 319 | 320 | # should work again if we do it again 321 | f = BytesIO() 322 | record.write_to(f, gzip=True) 323 | f.seek(0) 324 | g = gzip.GzipFile(fileobj=f, mode='rb') 325 | self.assertEqual(g.read(), b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') 326 | g.close() 327 | f.close() 328 | 329 | 330 | def test_write_using_stream(self): 331 | record = self.build_record_using_stream() 332 | 333 | f = BytesIO() 334 | record.write_to(f) 335 | self.assertEqual(f.getvalue(), 336 | b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 27\r\n\r\nShmuke, I gam four snayglob\r\n\r\n') 337 | f.close() 338 | 339 | # throws exception because record.content_file position has advanced 340 | f = BytesIO() 341 | with self.assertRaises(Exception): 342 | record.write_to(f) 343 | f.close() 344 | 345 | 346 | def test_write_using_stream_gz(self): 347 | record = self.build_record_using_stream() 348 | 349 | f = BytesIO() 350 | record.write_to(f, gzip=True) 351 | f.seek(0) 352 | g = gzip.GzipFile(fileobj=f, mode='rb') 353 | self.assertEqual(g.read(), b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: 
\r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 27\r\n\r\nShmuke, I gam four snayglob\r\n\r\n') 354 | g.close() 355 | f.close() 356 | 357 | # throws exception because record.content_file position has advanced 358 | f = BytesIO() 359 | with self.assertRaises(Exception): 360 | record.write_to(f, gzip=True) 361 | f.close() 362 | 363 | 364 | if __name__ == '__main__': 365 | unittest.main() 366 | -------------------------------------------------------------------------------- /hanzo/warctools/warc.py: -------------------------------------------------------------------------------- 1 | """An object to represent warc records, using the abstract record in 2 | record.py""" 3 | 4 | import re 5 | import hashlib 6 | from hanzo.warctools.record import ArchiveRecord, ArchiveParser 7 | from hanzo.warctools.archive_detect import register_record_type 8 | import uuid 9 | 10 | bad_lines = 5 # when to give up looking for the version stamp 11 | 12 | 13 | @ArchiveRecord.HEADERS( 14 | DATE=b'WARC-Date', 15 | TYPE=b'WARC-Type', 16 | ID=b'WARC-Record-ID', 17 | CONCURRENT_TO=b'WARC-Concurrent-To', 18 | REFERS_TO=b'WARC-Refers-To', 19 | REFERS_TO_TARGET_URI=b'WARC-Refers-To-Target-URI', 20 | REFERS_TO_DATE=b'WARC-Refers-To-Date', 21 | CONTENT_LENGTH=b'Content-Length', 22 | CONTENT_TYPE=b'Content-Type', 23 | URL=b'WARC-Target-URI', 24 | BLOCK_DIGEST=b'WARC-Block-Digest', 25 | PAYLOAD_DIGEST=b'WARC-Payload-Digest', 26 | IP_ADDRESS=b'WARC-IP-Address', 27 | FILENAME=b'WARC-Filename', 28 | WARCINFO_ID=b'WARC-Warcinfo-ID', 29 | PROFILE=b'WARC-Profile' 30 | ) 31 | class WarcRecord(ArchiveRecord): 32 | 33 | # Pylint is very bad at decorators, E1101 is the message that says 34 | # a member variable does not exist 35 | 36 | # pylint: disable-msg=E1101 37 | 38 | VERSION = b"WARC/1.0" 39 | VERSION18 = b"WARC/0.18" 40 | VERSION17 = b"WARC/0.17" 41 | RESPONSE = b"response" 42 | RESOURCE = b"resource" 43 | 
REQUEST = b"request" 44 | REVISIT = b"revisit" 45 | METADATA = b"metadata" 46 | CONVERSION = b"conversion" 47 | WARCINFO = b"warcinfo" 48 | 49 | PROFILE_IDENTICAL_PAYLOAD_DIGEST = b"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest" 50 | 51 | TRAILER = b'\r\n\r\n' 52 | 53 | def __init__(self, version=VERSION, headers=None, content=None, 54 | errors=None, content_file=None): 55 | """ 56 | WarcRecord constructor. 57 | 58 | Either content or content_file must be provided, but not both. If 59 | content, which is a tuple (content_type, content_buffer), is provided, 60 | when writing the warc record, any Content-Type and Content-Length that 61 | appear in the supplied headers are ignored, and the values content[0] 62 | and len(content[1]), respectively, are used. 63 | 64 | When reading, the caller can stream content_file or use content, which is 65 | lazily filled using content_file, and after which content_file is 66 | unavailable. 67 | """ 68 | ArchiveRecord.__init__(self, headers, content, errors) 69 | self.version = version 70 | self.content_file = content_file 71 | 72 | @property 73 | def id(self): 74 | return self.get_header(self.ID) 75 | 76 | def _write_to(self, out, nl): 77 | """WARC Format: 78 | VERSION NL 79 | (Key: Value NL)* 80 | NL 81 | CONTENT NL 82 | NL 83 | 84 | don't write multi line headers 85 | """ 86 | out.write(self.version) 87 | out.write(nl) 88 | for k, v in self.headers: 89 | if self.content_file is not None or k not in (self.CONTENT_TYPE, self.CONTENT_LENGTH): 90 | out.write(k) 91 | out.write(b": ") 92 | out.write(v) 93 | out.write(nl) 94 | 95 | if self.content_file is not None: 96 | out.write(nl) # end of header blank nl 97 | while True: 98 | buf = self.content_file.read(8192) 99 | if buf == b'': break 100 | out.write(buf) 101 | else: 102 | # if content tuple is provided, set Content-Type and 103 | # Content-Length based on the values in the tuple 104 | content_type, content_buffer = self.content 105 | 106 | if content_type: 107 | 
out.write(self.CONTENT_TYPE) 108 | out.write(b": ") 109 | out.write(content_type) 110 | out.write(nl) 111 | if content_buffer is None: 112 | content_buffer = b"" 113 | 114 | content_length = len(content_buffer) 115 | out.write(self.CONTENT_LENGTH) 116 | out.write(b": ") 117 | out.write(str(content_length).encode('ascii')) 118 | out.write(nl) 119 | 120 | out.write(nl) # end of header blank nl 121 | if content_buffer: 122 | out.write(content_buffer) 123 | 124 | # end of record nl nl 125 | out.write(nl) 126 | out.write(nl) 127 | out.flush() 128 | 129 | def repair(self): 130 | pass 131 | 132 | def validate(self): 133 | return self.errors 134 | 135 | @classmethod 136 | def make_parser(self): 137 | return WarcParser() 138 | 139 | def block_digest(self, content_buffer): 140 | block_hash = hashlib.sha256() 141 | block_hash.update(content_buffer) 142 | 143 | digest = "sha256:%s" % block_hash.hexdigest() 144 | return digest 145 | 146 | @staticmethod 147 | def warc_uuid(text): 148 | return "".format(uuid.UUID(hashlib.sha1(text).hexdigest()[0:32])).encode('ascii') 149 | 150 | @staticmethod 151 | def random_warc_uuid(): 152 | return "".format(uuid.uuid4()).encode('ascii') 153 | 154 | 155 | def rx(pat): 156 | """Helper to compile regexps with IGNORECASE option set.""" 157 | return re.compile(pat, flags=re.IGNORECASE) 158 | 159 | version_rx = rx(br'^(?P.*?)(?P\s*WARC/(?P.*?))' 160 | b'(?P\r\n|\r|\n)\\Z') 161 | # a header is key: value plus any following lines with leading whitespace 162 | header_rx = rx(br'^(?P.*?):\s?(?P.*?)' b'(?P\r\n|\r|\n)\\Z') 163 | value_rx = rx(br'^\s+(?P.+?)' b'(?P\r\n|\r|\n)\\Z') 164 | nl_rx = rx(b'^(?P\r\n|\r|\n\\Z)') 165 | length_rx = rx(b'^' + WarcRecord.CONTENT_LENGTH + b'$' ) # pylint: disable-msg=E1101 166 | type_rx = rx(b'^' + WarcRecord.CONTENT_TYPE + b'$') # pylint: disable-msg=E1101 167 | 168 | required_headers = set(( 169 | WarcRecord.TYPE.lower(), # pylint: disable-msg=E1101 170 | WarcRecord.ID.lower(), # pylint: disable-msg=E1101 171 | 
WarcRecord.CONTENT_LENGTH.lower(), # pylint: disable-msg=E1101 172 | WarcRecord.DATE.lower(), # pylint: disable-msg=E1101 173 | )) 174 | 175 | 176 | class WarcParser(ArchiveParser): 177 | KNOWN_VERSIONS = set((b'1.0', b'0.17', b'0.18')) 178 | 179 | def parse(self, stream, offset, line=None): 180 | """Reads a warc record from the stream, returns a tuple 181 | (record, errors). Either records is null or errors is 182 | null. Any record-specific errors are contained in the record - 183 | errors is only used when *nothing* could be parsed""" 184 | # pylint: disable-msg=E1101 185 | errors = [] 186 | version = None 187 | # find WARC/.* 188 | if line is None: 189 | line = stream.readline() 190 | 191 | while line: 192 | match = version_rx.match(line) 193 | 194 | if match: 195 | version = match.group('version') 196 | if offset is not None: 197 | offset += len(match.group('prefix')) 198 | break 199 | else: 200 | if offset is not None: 201 | offset += len(line) 202 | if not nl_rx.match(line): 203 | errors.append(('ignored line', line)) 204 | if len(errors) > bad_lines: 205 | errors.append(('too many errors, giving up hope',)) 206 | return (None, errors, offset) 207 | line = stream.readline() 208 | if not line: 209 | if version: 210 | errors.append(('warc version but no headers', version)) 211 | return (None, errors, offset) 212 | if line: 213 | content_length = 0 214 | content_type = None 215 | 216 | record = WarcRecord(errors=errors, version=version) 217 | 218 | if match.group('nl') != b'\x0d\x0a': 219 | record.error('incorrect newline in version', match.group('nl')) 220 | 221 | if match.group('number') not in self.KNOWN_VERSIONS: 222 | record.error('version field is not known (%s)' 223 | % (",".join(self.KNOWN_VERSIONS)), 224 | match.group('number')) 225 | 226 | prefix = match.group('prefix') 227 | 228 | if prefix: 229 | record.error('bad prefix on WARC version header', prefix) 230 | 231 | #Read headers 232 | line = stream.readline() 233 | while line and not 
nl_rx.match(line): 234 | 235 | #print 'header', repr(line) 236 | match = header_rx.match(line) 237 | if match: 238 | if match.group('nl') != b'\x0d\x0a': 239 | record.error('incorrect newline in header', 240 | match.group('nl')) 241 | name = match.group('name').strip() 242 | value = [match.group('value').strip()] 243 | #print 'match',name, value 244 | 245 | line = stream.readline() 246 | match = value_rx.match(line) 247 | while match: 248 | #print 'follow', repr(line) 249 | if match.group('nl') != b'\x0d\x0a': 250 | record.error('incorrect newline in follow header', 251 | line, match.group('nl')) 252 | value.append(match.group('value').strip()) 253 | line = stream.readline() 254 | match = value_rx.match(line) 255 | 256 | value = b" ".join(value) 257 | 258 | record.headers.append((name, value)) 259 | 260 | if type_rx.match(name): 261 | if value: 262 | content_type = value 263 | else: 264 | record.error('invalid header', name, value) 265 | elif length_rx.match(name): 266 | try: 267 | #print name, value 268 | content_length = int(value) 269 | #print content_length 270 | except ValueError: 271 | record.error('invalid header', name, value) 272 | 273 | # have read blank line following headers 274 | 275 | record.content_file = stream 276 | record.content_file.bytes_to_eoc = content_length 277 | 278 | # check mandatory headers 279 | # WARC-Type WARC-Date WARC-Record-ID Content-Length 280 | 281 | return (record, (), offset) 282 | 283 | 284 | blank_rx = rx(br'^$') 285 | register_record_type(version_rx, WarcRecord) 286 | register_record_type(blank_rx, WarcRecord) 287 | 288 | 289 | def make_response(id, date, url, content, request_id): 290 | # pylint: disable-msg=E1101 291 | headers = [ 292 | (WarcRecord.TYPE, WarcRecord.RESPONSE), 293 | (WarcRecord.ID, id), 294 | (WarcRecord.DATE, date), 295 | (WarcRecord.URL, url), 296 | 297 | ] 298 | if request_id: 299 | headers.append((WarcRecord.CONCURRENT_TO, request_id)) 300 | 301 | record = WarcRecord(headers=headers, content=content) 
302 | 303 | return record 304 | 305 | 306 | def make_request(request_id, date, url, content, response_id): 307 | # pylint: disable-msg=E1101 308 | headers = [ 309 | (WarcRecord.TYPE, WarcRecord.REQUEST), 310 | (WarcRecord.ID, request_id), 311 | (WarcRecord.DATE, date), 312 | (WarcRecord.URL, url), 313 | 314 | ] 315 | if response_id: 316 | headers.append((WarcRecord.CONCURRENT_TO, response_id)) 317 | 318 | record = WarcRecord(headers=headers, content=content) 319 | 320 | return record 321 | 322 | 323 | def make_metadata(meta_id, date, content, concurrent_to=None, url=None): 324 | # pylint: disable-msg=E1101 325 | headers = [ 326 | (WarcRecord.TYPE, WarcRecord.METADATA), 327 | (WarcRecord.ID, meta_id), 328 | (WarcRecord.DATE, date), 329 | 330 | ] 331 | if concurrent_to: 332 | headers.append((WarcRecord.CONCURRENT_TO, concurrent_to)) 333 | 334 | if url: 335 | headers.append((WarcRecord.URL, url)) 336 | 337 | record = WarcRecord(headers=headers, content=content) 338 | 339 | return record 340 | 341 | 342 | def make_conversion(conv_id, date, content, refers_to=None, url=None): 343 | # pylint: disable-msg=E1101 344 | headers = [ 345 | (WarcRecord.TYPE, WarcRecord.CONVERSION), 346 | (WarcRecord.ID, conv_id), 347 | (WarcRecord.DATE, date), 348 | 349 | ] 350 | if refers_to: 351 | headers.append((WarcRecord.REFERS_TO, refers_to)) 352 | 353 | if url: 354 | headers.append((WarcRecord.URL, url)) 355 | 356 | record = WarcRecord(headers=headers, content=content) 357 | 358 | return record 359 | 360 | 361 | def warc_datetime_str(d): 362 | s = d.isoformat() 363 | if '.' 
def main(argv):
    """Validate each WARC file named on the command line.

    For every input file, read records until the first parse error or
    validation failure; report problems on stderr. Returns 0 when every
    file is clean, -1 otherwise.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    if len(input_files) < 1:
        parser.error("no input warc file(s)")

    correct = True
    try:
        for name in expand_files(input_files):
            fh = WarcRecord.open_archive(name, gzip="auto")
            try:
                for (offset, record, errors) in fh.read_records(limit=None):
                    if errors:
                        print("warc errors at %s:%d" % (name, offset),
                              file=sys.stderr)
                        print(errors, file=sys.stderr)
                        correct = False
                        break
                    elif record is not None:
                        # validate() returns a list of errors (ugly name);
                        # call it once and reuse the result.
                        validation_errors = record.validate()
                        if validation_errors:
                            print("warc errors at %s:%d" % (name, offset),
                                  file=sys.stderr)
                            print(validation_errors, file=sys.stderr)
                            correct = False
                            break
            finally:
                # close every archive as we finish with it, not just the
                # last one opened before an exception
                fh.close()
    except Exception as e:
        print("Exception: %s" % (str(e)), file=sys.stderr)
        correct = False

    if correct:
        return 0
    else:
        return -1  # failure code


def run():
    """Console-script entry point."""
    sys.exit(main(sys.argv))
#!/bin/bash -e
# Build a Debian package (hanzo-warc-tools) from the working copy.
# Needs: mercurial (hg), a "version" file, python/setuptools, fakeroot,
# dpkg-deb and lintian on PATH.

unset CDPATH

# start from a clean staging directory
if [ -d debian ]; then
    rm -rf debian
fi

mkdir debian

VERSION="$(cat version)"

# version file must look like MAJOR.MINOR
if ! (echo "$VERSION" | egrep -q '^[0-9]+\.[0-9]+$'); then
    echo "Invalid version number $VERSION" 1>&2
    exit 1
fi

# derive the package revision from the mercurial branch:
# default branch builds are tagged "-tip"; a branch named after the
# version is a release build
if [ "$(hg branch)" = 'default' ]; then
    REVISION="$(hg id -n)"
    VERSION="${VERSION}-tip"
elif [ "$(hg branch)" = "$VERSION" ]; then
    REVISION="$(hg id -n)"
fi


mkdir -p debian/DEBIAN
# NOTE(review): "cat < debian/DEBIAN/control" followed by a body and a
# bare EOF looks like a garbled here-doc (presumably
# "cat <<EOF > debian/DEBIAN/control") -- confirm against version
# control; as written this would read the file rather than write it.
cat < debian/DEBIAN/control
Package: hanzo-warc-tools
Version: ${VERSION}-${REVISION}
Maintainer: Stephen Jones
Section: admin
Priority: optional
Architecture: all
Depends: python (>= 2.7)
Description: Suite of tools and libraries for manipulating warc files.
 Provides commands for listing the contents of warc files and libraries for
 manipulating warc files and http.
EOF

# install the python package into the staging tree
python setup.py install -q --no-compile --root "$PWD/debian" --install-layout=deb

# mandatory Debian documentation: copyright and changelogs
mkdir -p debian/usr/share/doc/hanzo-warc-tools
echo "Copyright Hanzo Archives $(date +%Y)" > debian/usr/share/doc/hanzo-warc-tools/copyright
cp README debian/usr/share/doc/hanzo-warc-tools/
hg log --style=changelog | gzip -9 > debian/usr/share/doc/hanzo-warc-tools/changelog.gz

# NOTE(review): same apparent here-doc garbling as above; the target is
# a .gz file, so the original probably piped through gzip -- verify.
cat < debian/usr/share/doc/hanzo-warc-tools/changelog.Debian.gz
hanzo-warc-tools ($VERSION) Hanzo;

  * Made debian style package

 -- Stephen Jones  $(date +'%a, %d %h %Y %T %z')
EOF

# post-install hook: byte-compile the package if pycompile is available
# NOTE(review): same apparent here-doc garbling as above -- verify.
cat < debian/DEBIAN/postinst
#!/bin/bash -e

if which pycompile >/dev/null 2>&1; then
    pycompile -p hanzo-warc-tools
fi
EOF

pushd debian

# installed console scripts lose their .py suffix and become executable
find usr/bin -type f -name '*.py' | (
    while read SCRIPT; do
        mv "$SCRIPT" "${SCRIPT%.py}"
        chmod 755 "${SCRIPT%.py}"
    done
)
# checksum everything except the DEBIAN control directory itself
md5sum $(find . -path ./DEBIAN -prune -o -type f -print) > DEBIAN/md5sums

# normalise permissions to Debian policy defaults
find usr/lib -type f -exec chmod 644 '{}' ';'
find usr/share -type f -exec chmod 644 '{}' ';'
find DEBIAN -type f -exec chmod 644 '{}' ';'
find . -type d -exec chmod 755 '{}' ';'

chmod 755 DEBIAN/postinst

popd

fakeroot dpkg-deb --build debian .

lintian "hanzo-warc-tools_${VERSION}-${REVISION}_all.deb"

# optional first argument: a writable directory to move the .deb into
if [ -n "$1" ] && [ -d "$1" ] && [ -w "$1" ]; then
    mv "hanzo-warc-tools_${VERSION}-${REVISION}_all.deb" "$1"
fi
It is itself both a raw checker and an astng checker in order 5 | # to: 6 | # * handle message activation / deactivation at the module level 7 | # * handle some basic but necessary stats'data (number of classes, methods...) 8 | # 9 | [MASTER] 10 | 11 | # Specify a configuration file. 12 | #rcfile= 13 | 14 | # Python code to execute, usually for sys.path manipulation such as 15 | # pygtk.require(). 16 | #init-hook= 17 | 18 | # Profiled execution. 19 | profile=no 20 | 21 | # Add to the black list. It should be a base name, not a 22 | # path. You may set this option multiple times. 23 | ignore=CVS 24 | 25 | # Pickle collected data for later comparisons. 26 | persistent=yes 27 | 28 | # Set the cache size for astng objects. 29 | cache-size=500 30 | 31 | # List of plugins (as comma separated values of python modules names) to load, 32 | # usually to register additional checkers. 33 | load-plugins= 34 | 35 | 36 | [MESSAGES CONTROL] 37 | 38 | # Enable only checker(s) with the given id(s). This option conflicts with the 39 | # disable-checker option 40 | #enable-checker= 41 | 42 | # Enable all checker(s) except those with the given id(s). This option 43 | # conflicts with the enable-checker option 44 | #disable-checker= 45 | 46 | # Enable all messages in the listed categories (IRCWEF). 47 | #enable-msg-cat= 48 | 49 | # Disable all messages in the listed categories (IRCWEF). 50 | disable-msg-cat=I 51 | 52 | # Enable the message(s) with the given id(s). 53 | #enable-msg= 54 | 55 | # Disable the message(s) with the given id(s). 56 | disable-msg=W0703, C0103, R0904, R0903, W0142 57 | 58 | 59 | [REPORTS] 60 | 61 | # Set the output format. Available formats are text, parseable, colorized, msvs 62 | # (visual studio) and html 63 | output-format=text 64 | 65 | # Include message's id in output 66 | include-ids=no 67 | 68 | # Put messages in a separate file for each module / package specified on the 69 | # command line instead of printing them on stdout. 
Reports (if any) will be 70 | # written in a file name "pylint_global.[txt|html]". 71 | files-output=no 72 | 73 | # Tells whether to display a full report or only the messages 74 | reports=yes 75 | 76 | # Python expression which should return a note less than 10 (10 is the highest 77 | # note). You have access to the variables errors warning, statement which 78 | # respectively contain the number of errors / warnings messages and the total 79 | # number of statements analyzed. This is used by the global evaluation report 80 | # (R0004). 81 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 82 | 83 | # Add a comment according to your evaluation note. This is used by the global 84 | # evaluation report (R0004). 85 | comment=no 86 | 87 | # Enable the report(s) with the given id(s). 88 | #enable-report= 89 | 90 | # Disable the report(s) with the given id(s). 91 | #disable-report= 92 | 93 | 94 | # checks for : 95 | # * doc strings 96 | # * modules / classes / functions / methods / arguments / variables name 97 | # * number of arguments, local variables, branches, returns and statements in 98 | # functions, methods 99 | # * required module attributes 100 | # * dangerous default values as arguments 101 | # * redefinition of function / method / class 102 | # * uses of the global statement 103 | # 104 | [BASIC] 105 | 106 | # Required attributes for module, separated by a comma 107 | required-attributes= 108 | 109 | # Regular expression which should only match functions or classes name which do 110 | # not require a docstring 111 | no-docstring-rgx=__.*__ 112 | 113 | # Regular expression which should only match correct module names 114 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 115 | 116 | # Regular expression which should only match correct module level names 117 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 118 | 119 | # Regular expression which should only match correct class names 120 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 121 | 
122 | # Regular expression which should only match correct function names 123 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 124 | 125 | # Regular expression which should only match correct method names 126 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 127 | 128 | # Regular expression which should only match correct instance attribute names 129 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 130 | 131 | # Regular expression which should only match correct argument names 132 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 133 | 134 | # Regular expression which should only match correct variable names 135 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 136 | 137 | # Regular expression which should only match correct list comprehension / 138 | # generator expression variable names 139 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 140 | 141 | # Good variable names which should always be accepted, separated by a comma 142 | good-names=i,j,k,ex,Run,_ 143 | 144 | # Bad variable names which should always be refused, separated by a comma 145 | bad-names=foo,bar,baz,toto,tutu,tata 146 | 147 | # List of builtins function names that should not be used, separated by a comma 148 | bad-functions=map,filter,apply,input 149 | 150 | 151 | # try to find bugs in the code using type inference 152 | # 153 | [TYPECHECK] 154 | 155 | # Tells whether missing members accessed in mixin class should be ignored. A 156 | # mixin class is detected if its name ends with "mixin" (case insensitive). 157 | ignore-mixin-members=yes 158 | 159 | # List of classes names for which member attributes should not be checked 160 | # (useful for classes with attributes dynamically set). 161 | ignored-classes=SQLObject 162 | 163 | # When zope mode is activated, add a predefined set of Zope acquired attributes 164 | # to generated-members. 165 | zope=no 166 | 167 | # List of members which are set dynamically and missed by pylint inference 168 | # system, and so shouldn't trigger E0201 when accessed. 
169 | generated-members=REQUEST,acl_users,aq_parent 170 | 171 | 172 | # checks for 173 | # * unused variables / imports 174 | # * undefined variables 175 | # * redefinition of variable from builtins or from an outer scope 176 | # * use of variable before assignment 177 | # 178 | [VARIABLES] 179 | 180 | # Tells whether we should check for unused import in __init__ files. 181 | init-import=no 182 | 183 | # A regular expression matching names used for dummy variables (i.e. not used). 184 | dummy-variables-rgx=_|dummy 185 | 186 | # List of additional names supposed to be defined in builtins. Remember that 187 | # you should avoid to define new builtins when possible. 188 | additional-builtins= 189 | 190 | 191 | # checks for sign of poor/misdesign: 192 | # * number of methods, attributes, local variables... 193 | # * size, complexity of functions, methods 194 | # 195 | [DESIGN] 196 | 197 | # Maximum number of arguments for function / method 198 | max-args=5 199 | 200 | # Maximum number of locals for function / method body 201 | max-locals=15 202 | 203 | # Maximum number of return / yield for function / method body 204 | max-returns=6 205 | 206 | # Maximum number of branch for function / method body 207 | max-branchs=12 208 | 209 | # Maximum number of statements in function / method body 210 | max-statements=50 211 | 212 | # Maximum number of parents for a class (see R0901). 213 | max-parents=7 214 | 215 | # Maximum number of attributes for a class (see R0902). 216 | max-attributes=7 217 | 218 | # Minimum number of public methods for a class (see R0903). 219 | min-public-methods=2 220 | 221 | # Maximum number of public methods for a class (see R0904). 
222 | max-public-methods=20 223 | 224 | 225 | # checks for : 226 | # * methods without self as first argument 227 | # * overridden methods signature 228 | # * access only to existent members via self 229 | # * attributes not defined in the __init__ method 230 | # * supported interfaces implementation 231 | # * unreachable code 232 | # 233 | [CLASSES] 234 | 235 | # List of interface methods to ignore, separated by a comma. This is used for 236 | # instance to not check methods defines in Zope's Interface base class. 237 | ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by 238 | 239 | # List of method names used to declare (i.e. assign) instance attributes. 240 | defining-attr-methods=__init__,__new__,setUp 241 | 242 | 243 | # checks for 244 | # * external modules dependencies 245 | # * relative / wildcard imports 246 | # * cyclic imports 247 | # * uses of deprecated modules 248 | # 249 | [IMPORTS] 250 | 251 | # Deprecated modules which should not be used, separated by a comma 252 | deprecated-modules=regsub,string,TERMIOS,Bastion,rexec 253 | 254 | # Create a graph of every (i.e. internal and external) dependencies in the 255 | # given file (report R0402 must not be disabled) 256 | import-graph= 257 | 258 | # Create a graph of external dependencies in the given file (report R0402 must 259 | # not be disabled) 260 | ext-import-graph= 261 | 262 | # Create a graph of internal dependencies in the given file (report R0402 must 263 | # not be disabled) 264 | int-import-graph= 265 | 266 | 267 | # checks for: 268 | # * warning notes in the code like FIXME, XXX 269 | # * PEP 263: source code with non ascii character but no encoding declaration 270 | # 271 | [MISCELLANEOUS] 272 | 273 | # List of note tags to take in consideration, separated by a comma. 
274 | notes=FIXME,XXX,TODO 275 | 276 | 277 | # checks for similarities and duplicated code. This computation may be 278 | # memory / CPU intensive, so you should disable it if you experiments some 279 | # problems. 280 | # 281 | [SIMILARITIES] 282 | 283 | # Minimum lines number of a similarity. 284 | min-similarity-lines=4 285 | 286 | # Ignore comments when computing similarities. 287 | ignore-comments=yes 288 | 289 | # Ignore docstrings when computing similarities. 290 | ignore-docstrings=yes 291 | 292 | 293 | # checks for : 294 | # * unauthorized constructions 295 | # * strict indentation 296 | # * line length 297 | # * use of <> instead of != 298 | # 299 | [FORMAT] 300 | 301 | # Maximum number of characters on a single line. 302 | max-line-length=80 303 | 304 | # Maximum number of lines in a module 305 | max-module-lines=1000 306 | 307 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 308 | # tab). 309 | indent-string=' ' 310 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "warctools" 3 | version = "5.0.0" 4 | authors = [ 5 | { name="Thomas Figg", email="tef@warctools.twentygototen.org" }, 6 | ] 7 | maintainers = [ 8 | { name="Internet Archive", email="info@archive.org" }, 9 | ] 10 | description = "Command line tools and libraries for handling and manipulating WARC files (and HTTP contents)" 11 | readme = "README.md" 12 | requires-python = ">=3.5" 13 | classifiers = [ 14 | "Operating System :: OS Independent", 15 | "Programming Language :: Python :: 3.5", 16 | "Topic :: System :: Archiving", 17 | ] 18 | license = "MIT" 19 | license-files = ["LICENSE"] 20 | 21 | [project.scripts] 22 | warcdump = "hanzo.warcdump:run" 23 | arc2warc = "hanzo.arc2warc:run" 24 | warcextract = "hanzo.warcextract:run" 25 | warcfilter = "hanzo.warcfilter:run" 26 | warcindex = "hanzo.warcindex:run" 27 | 
warclinks = "hanzo.warclinks:run" 28 | warcvalid = "hanzo.warcvalid:run" 29 | warc2warc = "hanzo.warc2warc:run" 30 | warcpayload = "hanzo.warcpayload:run" 31 | 32 | [dependency-groups] 33 | dev = [ 34 | "nose", 35 | ] 36 | 37 | [build-system] 38 | requires = ["setuptools>=61.0"] 39 | build-backend = "setuptools.build_meta" 40 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py27, py32, py33, pypy 8 | 9 | [testenv] 10 | commands = {envpython} setup.py test 11 | 12 | -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | revision = 2 3 | requires-python = ">=3.5" 4 | 5 | [[package]] 6 | name = "nose" 7 | version = "1.3.7" 8 | source = { registry = "https://pypi.org/simple" } 9 | sdist = { url = "https://files.pythonhosted.org/packages/58/a5/0dc93c3ec33f4e281849523a5a913fa1eea9a3068acfa754d44d88107a44/nose-1.3.7.tar.gz", hash = "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98", size = 280488, upload-time = "2015-06-02T09:12:32.961Z" } 10 | wheels = [ 11 | { url = "https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl", hash = "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", size = 154731, upload-time = "2015-06-02T09:12:40.57Z" }, 12 | ] 13 | 14 | [[package]] 15 | name = "warctools" 16 | version = "5.0.0" 17 | source = { editable = "." 
def log_headers(log_file):
    """Write the tab-separated column-name header line for the index log."""
    print('>>warc_file\twarc_id\twarc_type\twarc_content_length\twarc_uri_date\twarc_subject_uri\turi_content_type\toutfile\twayback_uri', file=log_file)

def log_entry(log_file, input_file, record, content_type, output_file, wayback_uri):
    """Append one tab-separated index line describing an unpacked record."""
    log = (input_file, record.id, record.type, record.content_length,
           record.date, record.url, content_type, output_file, wayback_uri)
    print("\t".join(str(s) for s in log), file=log_file)

def main(argv):
    """Unpack the response bodies of the given WARCs into a directory tree.

    With no file arguments a single (uncompressed) archive is read from
    stdin. An index log of everything extracted is written either to the
    --log file, or per input file next to the output, or to stdout.
    Returns 0; per-file failures are reported on stderr.
    """
    (options, args) = parser.parse_args(args=argv[1:])

    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir = options.output
    else:
        output_dir = os.getcwd()

    collisions = 0

    if len(args) < 1:
        # read one archive from stdin.  Open the log in text mode ('w',
        # not 'wb'): log_headers/log_entry print str, which a binary
        # file rejects on Python 3.
        log_file = sys.stdout if not options.log_file else open(options.log_file, 'w')
        try:
            log_headers(log_file)

            with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
                collisions += unpack_records('', fh, output_dir, options.default_name, log_file, options.wayback)
        finally:
            if log_file is not sys.stdout:
                log_file.close()
    else:
        for filename in args:
            # default: one index file per input, beside the output
            log_name = os.path.join(output_dir, os.path.basename(filename) + '.index.txt') if not options.log_file else options.log_file
            log_file = open(log_name, 'w')  # text mode, see above
            try:
                log_headers(log_file)
                with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                    collisions += unpack_records(filename, fh, output_dir, options.default_name, log_file, options.wayback)
            except Exception as e:
                # best effort: keep going with the remaining inputs
                print("exception in handling", filename, e, file=sys.stderr)
            finally:
                # close every per-file log, not just the last one
                log_file.close()
    if collisions:
        print(collisions, "filenames that collided", file=sys.stderr)

    return 0
def parse_warcinfo(record):
    """Best-effort parse of a warcinfo record body into a dict.

    Every non-blank line of the record content is treated as a
    'name: value' pair, split on the first colon only; surrounding
    whitespace inside name and value is kept as-is.  Malformed lines
    are reported on stderr and skipped, and any failure reading the
    record at all yields whatever was collected so far.
    """
    fields = {}
    try:
        body = record.content[1]
        for raw_line in body.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                name, rest = stripped.split(':', 1)
            except ValueError:
                # no colon on the line
                print('malformed warcinfo line', stripped, file=sys.stderr)
            else:
                fields[name] = rest
    except Exception as e:
        print('exception reading warcinfo record', e, file=sys.stderr)
    return fields
def output_file(output_dir, url, mime_type, default_name):
    """Map a record URL to a safe output path under *output_dir*.

    The '://' scheme separator is folded into the path, every character
    outside letters/digits/'_-/.' becomes '_', and the directory part is
    created on disk if needed (truncated to 200 chars).  The base name is
    truncated so name+extension fits in 45 chars.  Returns
    (realpath, collision) where collision is True when a random suffix
    had to be appended to avoid overwriting an existing file.
    """
    clean_url = "".join((c if c.isalpha() or c.isdigit() or c in '_-/.' else '_')
                        for c in url.replace('://', '/', 1))

    parts = clean_url.split('/')
    directories, filename = parts[:-1], parts[-1]

    path = [output_dir]
    for d in directories:
        if d:
            path.append(d)

    if filename:
        name, ext = os.path.splitext(filename)
    else:
        name, ext = default_name, ''

    if mime_type:
        # guess_type() returns a (type, encoding) tuple; compare its type
        # element.  (The previous code compared the whole tuple against the
        # mime_type string, which is never equal, so the URL's own
        # extension was always clobbered.)
        guessed_type = mimetypes.guess_type(url)[0]
        # preserve variant file extensions, rather than clobber with the
        # default extension for the mime type
        if not ext or guessed_type != mime_type:
            mime_ext = mimetypes.guess_extension(mime_type)
            if mime_ext:
                ext = mime_ext
    elif not ext:
        ext = '.html'  # no mime type, no extension

    directory = os.path.normpath(os.path.join(*path))
    directory = directory[:200]

    if not os.path.exists(directory):
        os.makedirs(directory)

    filename = name[:45 - len(ext)] + ext

    fullname = os.path.join(directory, filename)

    collision = False

    while os.path.exists(fullname):
        collision = True
        u = str(uuid.uuid4())[:8]

        filename = name[:45 - len(ext)] + '_R' + u + ext

        fullname = os.path.join(directory, filename)

    return os.path.realpath(os.path.normpath(fullname)), collision