├── .gitignore ├── .hgignore ├── .hgtags ├── .travis.yml ├── LICENSE ├── README.md ├── hanzo ├── arc2warc.py ├── httptools │ ├── __init__.py │ ├── messaging.py │ ├── semantics.py │ └── tests │ │ ├── __init__.py │ │ └── parse_test.py ├── warc2warc.py ├── warcdump.py ├── warcextract.py ├── warcfilter.py ├── warcindex.py ├── warclinks.py ├── warcpayload.py ├── warctools │ ├── __init__.py │ ├── arc.py │ ├── archive_detect.py │ ├── log.py │ ├── mixed.py │ ├── record.py │ ├── s3.py │ ├── stream.py │ ├── tests │ │ ├── __init__.py │ │ └── test_warctools.py │ └── warc.py └── warcvalid.py ├── make-deb.sh ├── pylint.rc ├── pyproject.toml ├── tox.ini ├── uv.lock └── warcunpack_ia.py /.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /dist 3 | __pycache__ 4 | /warctools.egg-info 5 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | *.swp 3 | *.log 4 | *.pyc 5 | *.pyo 6 | *.warc 7 | *.gz 8 | login.txt 9 | .DS_Store 10 | build/* 11 | dist/* 12 | hanzo_warc_tools.egg-info/* 13 | *~ 14 | *.orig 15 | debian/* 16 | *.deb 17 | test-reports/* 18 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | 58d7d99406b04e7c36bfba1c91e2b06f558c22ee hanzo-4.0-rc0 2 | 764a52f90a951a8c4acc9c9f60f5d8321662d418 hanzo-4.0-rc1 3 | 94b65646332e5e86f3d274f66e38ce26cc30ccad hanzo-4.0 4 | 092e8d0615ecc5ace8b067edbeacd5e3b12c9be0 hanzo-4.1-rc0 5 | 8f64ab5556344065cd68e0cf8265af87e6b9d0cf hanzo-4.1-rc1 6 | 8ceff9fcde584ec577048dbd9a13743d31dfc74f hanzo-4.1-rc2 7 | f54be58d0d8b3aa47b3f935a732a7b5752f0e92e hanzo-4.1-rc4 8 | 0a1d728557b8d29b15b3796f83b6a9dc7f25abff build_success-2012-09-14T15-24-42.616660024 9 | 741fe327f233f936cd65c6e2c415cd01f9fc9871 
build_success-2012-09-14T16-25-56.483325901 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - 2.7 5 | - 3.2 6 | - 3.3 7 | - 3.4 8 | - 3.5 9 | - nightly 10 | - pypy 11 | - pypy3 12 | 13 | matrix: 14 | allow_failures: 15 | - python: 3.5 16 | - python: nightly 17 | 18 | script: python setup.py test 19 | 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2012 Hanzo Archives Ltd 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a 4 | copy of this software and associated documentation files (the "Software"), 5 | to deal in the Software without restriction, including without limitation 6 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 | and/or sell copies of the Software, and to permit persons to whom the 8 | Software is furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included 11 | in all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 14 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 15 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 18 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Warctools 2 | ========= 3 | 4 | WARC (Web ARChive) file tools for python 2/3 based on the 5 | [WARC 1.0 spec](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.0/) 6 | and compatible with the Internet Archive's 7 | [ARC File Format](https://archive.org/web/researcher/ArcFileFormat.php) 8 | originally developed by Hanzo Archives. 9 | 10 | 11 | Install 12 | ------- 13 | 14 | ``` 15 | pip install warctools 16 | ``` 17 | 18 | 19 | Python Usage 20 | ------------ 21 | 22 | ``` 23 | from hanzo import warctools 24 | ``` 25 | 26 | 27 | Python Examples 28 | --------------- 29 | 30 | Write a WARC file: 31 | 32 | ``` 33 | import os 34 | 35 | from hanzo import warctools 36 | 37 | 38 | def write(): 39 | headers = [ 40 | (b'WARC-Type', b'warcinfo'), 41 | (b'WARC-Date', b'2019-11-19T23:08:51.182451Z'), 42 | (b'WARC-Filename', b'CRAWL-20191119230851-00000-hostname.warc.gz'), 43 | (b'WARC-Record-ID', b'') 44 | ] 45 | content_type = b'application/warc-fields' 46 | content = 'This\nis\nonly\na\ntest\n'.encode() 47 | fname = 'test.warc.gz' 48 | 49 | mode = 'ab' 50 | if not os.path.exists(fname): 51 | mode = 'wb' 52 | 53 | with open(fname, mode) as _fh: 54 | content = (content_type, content) 55 | record = warctools.WarcRecord(headers=headers, content=content) 56 | record.write_to(_fh, gzip="record") 57 | ``` 58 | 59 | 60 | Command-line Usage 61 | ------------------ 62 | 63 | ### warcvalid 64 | 65 | Returns 0 if the arguments are all valid W/ARC files, non-zero on 66 | error. 
67 | 68 | ``` 69 | [warctools] $ warcvalid -h 70 | Usage: warcvalid [options] warc warc warc 71 | 72 | Options: 73 | -h, --help show this help message and exit 74 | -l LIMIT, --limit=LIMIT 75 | -I INPUT_FORMAT, --input=INPUT_FORMAT 76 | -L LOG_LEVEL, --log-level=LOG_LEVEL 77 | ``` 78 | 79 | ### warcdump 80 | 81 | Writes human readable summary of warcfiles. Autodetects input format 82 | when filenames are passed, i.e recordgzip vs plaintext, WARC vs 83 | ARC. Assumes uncompressed warc on stdin if no args. 84 | 85 | ``` 86 | [warctools] $ warcdump -h 87 | Usage: warcdump [options] warc warc warc 88 | 89 | Options: 90 | -h, --help show this help message and exit 91 | -l LIMIT, --limit=LIMIT 92 | -I INPUT_FORMAT, --input=INPUT_FORMAT 93 | -L LOG_LEVEL, --log-level=LOG_LEVEL 94 | ``` 95 | 96 | ### warcfilter 97 | 98 | Searches all headers for regex pattern. Autodetects and stdin like 99 | warcdump. Prints out a WARC format by default. Use -i to invert 100 | search. Use -U to constrain to url. Use -T to constrain to record 101 | type. Use -C to constrain to content-type. 102 | 103 | ``` 104 | $ warcfilter -h 105 | Usage: warcfilter [options] pattern warc warc warc 106 | 107 | Options: 108 | -h, --help show this help message and exit 109 | -l LIMIT, --limit=LIMIT 110 | limit (ignored) 111 | -I INPUT_FORMAT, --input=INPUT_FORMAT 112 | input format (ignored) 113 | -i, --invert invert match 114 | -U, --url match on url 115 | -T, --type match on (warc) record type 116 | -C, --content-type match on (warc) record content type 117 | -H, --http-content-type 118 | match on http payload content type 119 | -D, --warc-date match on WARC-Date header 120 | -L LOG_LEVEL, --log-level=LOG_LEVEL 121 | log level(ignored) 122 | ``` 123 | 124 | ### warc2warc 125 | 126 | Autodetects compression on file args. Assumes uncompressed stdin if 127 | none. Use -Z to write compressed output, i.e warc2warc -Z input > 128 | input.gz. Should ignore buggy records in input. 
129 | 130 | ``` 131 | [warctools] $ warc2warc -h 132 | Usage: warc2warc [options] url (url ...) 133 | 134 | Options: 135 | -h, --help show this help message and exit 136 | -o OUTPUT, --output=OUTPUT 137 | output warc file 138 | -l LIMIT, --limit=LIMIT 139 | -I INPUT_FORMAT, --input=INPUT_FORMAT 140 | (ignored) 141 | -Z, --gzip compress output, record by record 142 | -D, --decode_http decode http messages (strip chunks, gzip) 143 | -L LOG_LEVEL, --log-level=LOG_LEVEL 144 | --wget-chunk-fix skip transfer-encoding headers in http records, when 145 | decoding them (-D) 146 | ``` 147 | 148 | ### arc2warc 149 | 150 | Creates a crappy WARC file from arc files on input. A handful of 151 | headers are preserved. Use -Z to write compressed output, i.e arc2warc 152 | -Z input.arc > input.warc.gz 153 | 154 | ``` 155 | [warctools] $ arc2warc -h 156 | Usage: arc2warc [options] arc (arc ...) 157 | 158 | Options: 159 | -h, --help show this help message and exit 160 | -o OUTPUT, --output=OUTPUT 161 | output warc file 162 | -l LIMIT, --limit=LIMIT 163 | -Z, --gzip compress 164 | -L LOG_LEVEL, --log-level=LOG_LEVEL 165 | --description=DESCRIPTION 166 | --operator=OPERATOR 167 | --publisher=PUBLISHER 168 | --audience=AUDIENCE 169 | --resource=RESOURCE 170 | --response=RESPONSE 171 | ``` 172 | 173 | ### warcindex 174 | 175 | DEPRECATED, use `CDX-writer` branch. 176 | 177 | ``` 178 | #WARC-filename offset warc-type warc-subject-uri warc-record-id content-type content-length 179 | warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=request 193 180 | warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=response 3279474 181 | ``` 182 | 183 | 184 | Notes 185 | ----- 186 | 187 | 1. arc2warc uses the conversion rules from the earlier arc2warc.c as a 188 | starter for converting the headers 189 | 2. 
I haven't profiled the code yet (and don't plan to until it falls 190 | over) 191 | 3. Warcvalid barely skirts some of the iso standard, missing things: 192 | * strict whitespace 193 | * required headers check 194 | * mime quoted printable header encoding 195 | * treating headers as utf8 196 | 197 | 198 | ToDo 199 | ---- 200 | 201 | 1. Lots more testing 202 | 2. Support pre-1.0 WARC files 203 | 3. Add more documentation 204 | 4. Support more commandline options for output and filenames 205 | 5. S3 urls 206 | 207 | 208 | Credits 209 | ------- 210 | 211 | Originally developed by "tef" `thomas.figg@hanzoarchives.com`. 212 | 213 | 214 | @internetarchive 215 | -------------------------------------------------------------------------------- /hanzo/arc2warc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """arc2warc - convert one arc to a new warc""" 3 | 4 | from __future__ import print_function 5 | 6 | import os 7 | import sys 8 | import hashlib 9 | import uuid 10 | 11 | import sys 12 | import os.path 13 | import datetime 14 | import socket 15 | 16 | from optparse import OptionParser 17 | 18 | from .warctools import ArcRecord,WarcRecord, MixedRecord, expand_files 19 | from .warctools.warc import warc_datetime_str 20 | 21 | from .httptools import ResponseMessage, RequestMessage 22 | 23 | parser = OptionParser(usage="%prog [options] arc (arc ...)") 24 | 25 | parser.add_option("-o", "--output", dest="output", 26 | help="output warc file") 27 | parser.add_option("-l", "--limit", dest="limit") 28 | parser.add_option("-Z", "--gzip", dest="gzip", action="store_true", help="compress") 29 | parser.add_option("-L", "--log-level", dest="log_level") 30 | parser.add_option("--description", dest="description") 31 | parser.add_option("--operator", dest="operator") 32 | parser.add_option("--publisher", dest="publisher") 33 | parser.add_option("--audience", dest="audience") 34 | parser.add_option("--resource", 
dest="resource", action="append") 35 | parser.add_option("--response", dest="response", action="append") 36 | 37 | parser.set_defaults( 38 | output_directory=None, limit=None, log_level="info", gzip=False, 39 | description="", operator="", publisher="", audience="", 40 | resource = [], response=[], 41 | 42 | ) 43 | 44 | def is_http_response(content): 45 | message = ResponseMessage(RequestMessage()) 46 | remainder = message.feed(content) 47 | message.close() 48 | return message.complete() and not remainder 49 | 50 | 51 | class ArcTransformer(object): 52 | def __init__(self, output_filename=None, warcinfo_fields=b'software: hanzo.arc2warc\r\n', resources=(), responses=()): 53 | self.warcinfo_id = None 54 | self.output_filename = output_filename 55 | self.version = b"WARC/1.0" 56 | self.warcinfo_fields = warcinfo_fields 57 | self.resources = resources 58 | self.responses = responses 59 | 60 | @staticmethod 61 | def make_warc_uuid(text): 62 | return (""%uuid.UUID(hashlib.sha1(text).hexdigest()[0:32])).encode('ascii') 63 | 64 | def convert(self, record): 65 | 66 | if record.type == b'filedesc': 67 | return self.convert_filedesc(record) 68 | else: 69 | return self.convert_record(record) 70 | 71 | def convert_filedesc(self, record): 72 | # todo - filedesc might have missing url? 
73 | warcinfo_date = warc_datetime_str(datetime.datetime.now()) 74 | warcinfo_id = self.make_warc_uuid(record.url+warcinfo_date) 75 | 76 | warcinfo_headers = [ 77 | (WarcRecord.TYPE, WarcRecord.WARCINFO), 78 | (WarcRecord.ID, warcinfo_id), 79 | (WarcRecord.DATE, warcinfo_date), 80 | ] 81 | 82 | if self.output_filename: 83 | warcinfo_headers.append((WarcRecord.FILENAME, self.output_filename)) 84 | 85 | warcinfo_content = (b'application/warc-fields', self.warcinfo_fields) 86 | 87 | inforecord = WarcRecord(headers=warcinfo_headers, content=warcinfo_content, version=self.version) 88 | 89 | if record.date: 90 | if len(record.date) >= 14: 91 | warcmeta_date = datetime.datetime.strptime(record.date[:14].decode('ascii'),'%Y%m%d%H%M%S') 92 | else: 93 | warcmeta_date = datetime.datetime.strptime(record.date[:8].decode('ascii'),'%Y%m%d') 94 | 95 | warcmeta_date = warc_datetime_str(warcmeta_date) 96 | else: 97 | warcmeta_date = warcinfo_date 98 | 99 | 100 | warcmeta_id = self.make_warc_uuid(record.url+record.date+b"-meta") 101 | warcmeta_url = record.url 102 | if warcmeta_url.startswith(b'filedesc://'): 103 | warcmeta_url = warcmeta_url[11:] 104 | warcmeta_headers = [ 105 | (WarcRecord.TYPE, WarcRecord.METADATA), 106 | (WarcRecord.CONCURRENT_TO, warcinfo_id), 107 | (WarcRecord.ID, warcmeta_id), 108 | (WarcRecord.URL, warcmeta_url), 109 | (WarcRecord.DATE, warcmeta_date), 110 | (WarcRecord.WARCINFO_ID, warcinfo_id), 111 | ] 112 | warcmeta_content =(b'application/arc', record.raw()) 113 | 114 | metarecord = WarcRecord(headers=warcmeta_headers, content=warcmeta_content, version=self.version) 115 | 116 | self.warcinfo_id = warcinfo_id 117 | 118 | return inforecord, metarecord 119 | 120 | def convert_record(self, record): 121 | 122 | warc_id = self.make_warc_uuid(record.url+record.date) 123 | headers = [ 124 | (WarcRecord.ID, warc_id), 125 | (WarcRecord.URL,record.url), 126 | (WarcRecord.WARCINFO_ID, self.warcinfo_id), 127 | ] 128 | 129 | if record.date: 130 | try: 131 | date = 
datetime.datetime.strptime(record.date.decode('ascii'),'%Y%m%d%H%M%S') 132 | except ValueError: 133 | date = datetime.datetime.strptime(record.date.decode('ascii'),'%Y%m%d') 134 | 135 | else: 136 | date = datetime.datetime.now() 137 | 138 | ip = record.get_header(ArcRecord.IP) 139 | if ip: 140 | ip = ip.strip() 141 | if ip != b"0.0.0.0": 142 | headers.append((WarcRecord.IP_ADDRESS, ip)) 143 | 144 | 145 | headers.append((WarcRecord.DATE, warc_datetime_str(date))) 146 | 147 | content_type, content = record.content 148 | 149 | if not content_type.strip(): 150 | content_type = b'application/octet-stream' 151 | 152 | url = record.url.lower() 153 | 154 | 155 | if any(url.startswith(p) for p in self.resources): 156 | record_type = WarcRecord.RESOURCE 157 | elif any(url.startswith(p) for p in self.responses): 158 | record_type = WarcRecord.RESPONSE 159 | elif url.startswith(b'http'): 160 | if is_http_response(content): 161 | content_type=b"application/http;msgtype=response" 162 | record_type = WarcRecord.RESPONSE 163 | else: 164 | record_type = WarcRecord.RESOURCE 165 | elif url.startswith(b'dns'): 166 | if content_type.startswith(b'text/dns') and str(content.decode('ascii', 'ignore')) == content: 167 | record_type = WarcRecord.RESOURCE 168 | else: 169 | record_type = WarcRecord.RESPONSE 170 | else: 171 | # unknown protocol 172 | record_type = WarcRecord.RESPONSE 173 | 174 | headers.append((WarcRecord.TYPE, record_type)) 175 | 176 | warcrecord = WarcRecord(headers=headers, content=(content_type, content), version=self.version) 177 | 178 | return warcrecord, 179 | 180 | def warcinfo_fields(description="", operator="", publisher="", audience=""): 181 | return "\r\n".join([ 182 | "software: hanzo.arc2warc", 183 | "hostname: %s"%socket.gethostname(), 184 | "description: %s"%description, 185 | "operator: %s"%operator, 186 | "publisher: %s"%publisher, 187 | "audience: %s"%audience, 188 | ]).encode('utf-8') 189 | 190 | ## todo 191 | """ 192 | move arctransformer into mixed.py 193 
| move output file into arc2warc loop 194 | 195 | """ 196 | def main(argv): 197 | (options, input_files) = parser.parse_args(args=argv[1:]) 198 | 199 | try: # python3 200 | out = sys.stdout.buffer 201 | except AttributeError: # python2 202 | out = sys.stdout 203 | 204 | if options.output: 205 | out = open(options.output, 'ab') 206 | if options.output.endswith('.gz'): 207 | options.gzip = True 208 | if len(input_files) < 1: 209 | parser.error("no imput warc file(s)") 210 | 211 | warcinfo = warcinfo_fields( 212 | description = options.description, 213 | operator = options.operator, 214 | publisher = options.publisher, 215 | audience = options.audience, 216 | ) 217 | arc = ArcTransformer(options.output, warcinfo, options.resource, options.response) 218 | for name in expand_files(input_files): 219 | fh = MixedRecord.open_archive(filename=name, gzip="auto") 220 | try: 221 | for record in fh: 222 | if isinstance(record, WarcRecord): 223 | print(' WARC', record.url, file=sys.stderr) 224 | warcs = [record] 225 | else: 226 | print('ARC ', record.url, file=sys.stderr) 227 | warcs = arc.convert(record) 228 | 229 | for warcrecord in warcs: 230 | warcrecord.write_to(out, gzip=options.gzip) 231 | finally: 232 | fh.close() 233 | 234 | return 0 235 | 236 | def run(): 237 | sys.exit(main(sys.argv)) 238 | 239 | 240 | if __name__ == '__main__': 241 | run() 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /hanzo/httptools/__init__.py: -------------------------------------------------------------------------------- 1 | from hanzo.httptools.messaging import RequestMessage, ResponseMessage, HTTP09Response 2 | 3 | 4 | __all__ = [ 5 | "RequestMessage", 6 | "ResponseMessage", 7 | "HTTP09Response", 8 | ] 9 | -------------------------------------------------------------------------------- /hanzo/httptools/messaging.py: -------------------------------------------------------------------------------- 1 | """A set of stream oriented parsers 
for http requests and responses, inline 2 | with the current draft recommendations from the http working group. 3 | 4 | http://tools.ietf.org/html/draft-ietf-httpbis-p1-messaging-17 5 | 6 | Unlike other libraries, this is for clients, servers and proxies. 7 | 8 | Missing: 9 | comma parsing/header folding 10 | 11 | """ 12 | from gzip import GzipFile 13 | import re 14 | import zlib 15 | from io import BytesIO 16 | 17 | 18 | class ParseError(Exception): 19 | """Baseclass for all http parsing errors""" 20 | pass 21 | 22 | from hanzo.httptools.semantics import Codes, Methods 23 | 24 | NEWLINES = (b'\r\n', b'\n') 25 | 26 | 27 | class BrokenChunks(Exception): 28 | pass 29 | 30 | class HTTPMessage(object): 31 | """A stream based parser for http like messages""" 32 | 33 | CONTENT_TYPE = b"application/http" 34 | 35 | def __init__(self, header, buf=None, offset=0): 36 | self.buffer = buf if buf is not None else bytearray() 37 | self.offset = offset 38 | self.header = header 39 | self.body_chunks = [] 40 | self.mode = 'start' 41 | self.body_reader = None 42 | 43 | @property 44 | def url(self): 45 | return self.header.url 46 | 47 | @property 48 | def scheme(self): 49 | return self.header.scheme 50 | 51 | @property 52 | def method(self): 53 | return self.header.method 54 | 55 | @property 56 | def host(self): 57 | return self.header.host 58 | 59 | @property 60 | def port(self): 61 | return self.header.port 62 | 63 | def feed_fd(self, fd): 64 | while True: 65 | length, terminator = self.feed_predict() 66 | if length == 0: 67 | return '' 68 | elif terminator == '\r\n': 69 | text = fd.readLine() 70 | elif length < 0: 71 | text = fd.read() 72 | elif length > 0: 73 | text = fd.read(length) 74 | unread = self.feed(text) 75 | if unread: 76 | return unread 77 | 78 | def feed_predict(self): 79 | """returns size, terminator request for input. size is 0 means end. 
""" 80 | if self.mode == 'start': 81 | return None, '\r\n' 82 | elif self.mode == 'headers': 83 | return None, '\r\n' 84 | elif self.mode == 'body': 85 | if self.body_reader is not None: 86 | return self.body_reader.feed_predict() 87 | else: 88 | # connection close 89 | return -1, None 90 | if self.mode == 'end': 91 | return 0, None 92 | if self.mode == 'incomplete': 93 | return 0, None 94 | 95 | def feed(self, text): 96 | """Push more text from the input stream into the parser.""" 97 | if text and self.mode == 'start': 98 | text = self.feed_start(text) 99 | 100 | if text and self.mode == 'headers': 101 | text = self.feed_headers(text) 102 | if self.mode == 'body': 103 | if not self.header.has_body(): 104 | self.mode = 'end' 105 | else: 106 | if self.header.body_is_chunked(): 107 | self.body_reader = ChunkReader() 108 | else: 109 | length = self.header.body_length() 110 | if length is not None: 111 | encoding = self.header.encoding 112 | 113 | if encoding and encoding.endswith(b'gzip'): 114 | self.body_reader = ZipLengthReader(length, 115 | text) 116 | else: 117 | self.body_reader = LengthReader(length) 118 | length = self.body_reader.remaining 119 | self.body_chunks = [(self.offset, length)] 120 | if length == 0: 121 | self.mode = 'end' 122 | else: 123 | self.body_chunks = [(self.offset, 0)] 124 | self.body_reader = None 125 | 126 | if text and self.mode == 'body': 127 | if self.body_reader is not None: 128 | try: 129 | text = self.body_reader.feed(self, text) 130 | except BrokenChunks: 131 | self.body_reader = None 132 | self.body_chunks = [(self.offset, 0)] 133 | if self.body_reader is None: 134 | ((offset, length),) = self.body_chunks 135 | self.buffer.extend(text) 136 | self.offset = len(self.buffer) 137 | self.body_chunks = ((offset, length + len(text)),) 138 | text = '' 139 | 140 | return text 141 | 142 | def close(self): 143 | """Mark the end of the input stream and finish parsing.""" 144 | if (self.body_reader is None and self.mode == 'body'): 145 | 
self.mode = 'end' 146 | 147 | elif self.mode != 'end': 148 | if self.body_chunks: 149 | # check for incomplete in body_chunks 150 | offset, length = self.body_chunks.pop() 151 | position = len(self.buffer) 152 | length = min(length, position - offset) 153 | self.body_chunks.append((offset, length)) 154 | self.mode = 'incomplete' 155 | 156 | def headers_complete(self): 157 | """Check whether the input stream has finished supplying headers.""" 158 | return self.mode in ('end', 'body') 159 | 160 | def complete(self): 161 | """Checks whether the input stream is at the end, i.e. if the parser 162 | is expecting no more input.""" 163 | 164 | return self.mode == 'end' 165 | 166 | def feed_line(self, text): 167 | """Feed text into the buffer, returning the first line found (if found 168 | yet)""" 169 | self.buffer.extend(text) 170 | pos = self.buffer.find(b'\n', self.offset) 171 | if pos > -1: 172 | pos += 1 173 | text = bytes(self.buffer[pos:]) 174 | del self.buffer[pos:] 175 | line = bytes(self.buffer[self.offset:]) 176 | self.offset = len(self.buffer) 177 | else: 178 | line = None 179 | text = b'' 180 | return line, text 181 | 182 | def feed_length(self, text, remaining): 183 | """Feed (at most remaining bytes) text to buffer, returning 184 | leftovers.""" 185 | body, text = text[:remaining], text[remaining:] 186 | remaining -= len(body) 187 | self.buffer.extend(body) 188 | self.offset = len(self.buffer) 189 | return remaining, text 190 | 191 | def feed_start(self, text): 192 | """Feed text to the parser while it is in the 'start' state.""" 193 | line, text = self.feed_line(text) 194 | if line is not None: 195 | if line not in NEWLINES: 196 | self.header.set_start_line(line) 197 | self.mode = 'headers' 198 | 199 | return text 200 | 201 | def feed_headers(self, text): 202 | """Feed text to the parser while it is in the 'headers' 203 | state.""" 204 | while text: 205 | line, text = self.feed_line(text) 206 | if line is not None: 207 | self.header.add_header_line(line) 208 
| if line in NEWLINES: 209 | self.mode = 'body' 210 | break 211 | 212 | return text 213 | 214 | def get_message(self): 215 | """Returns the contents of the input buffer.""" 216 | return bytes(self.buffer) 217 | 218 | def get_decoded_message(self): 219 | """Return the input stream reconstructed from the parsed 220 | data.""" 221 | buf = bytearray() 222 | self.write_decoded_message(buf) 223 | return bytes(buf) 224 | 225 | def write_message(self, buf): 226 | #TODO: No idea what this does, looks broken 227 | self.header.write(buf) 228 | buf.extend(b'\r\n') 229 | self.write_body(buf) 230 | 231 | def write_decoded_message(self, buf): 232 | """Writes the parsed data to the buffer passed.""" 233 | self.header.write_decoded(buf) 234 | if self.header.has_body(): 235 | length = sum(l for o, l in self.body_chunks) 236 | buf.extend(b'Content-Length: ' + str(length).encode('ascii') + b'\r\n') 237 | body = self.get_body() 238 | if self.header.encoding and body: 239 | try: 240 | body = zlib.decompress(body) 241 | except zlib.error: 242 | try: 243 | body = zlib.decompress(body, 16 + zlib.MAX_WBITS) 244 | except zlib.error: 245 | encoding_header = b"Content-Encoding: " + self.header.encoding + b"\r\n" 246 | buf.extend(encoding_header) 247 | buf.extend(b'\r\n') 248 | try: 249 | buf.extend(body) 250 | except Exception as e: 251 | raise Exception('buf={} body={} e={}'.format(repr(buf), repr(body), e)) 252 | 253 | def get_body(self): 254 | """Returns the body of the HTTP message.""" 255 | buf = bytearray() 256 | self.write_body(buf) 257 | return bytes(buf) 258 | 259 | def write_body(self, buf): 260 | """Writes the body of the HTTP message to the passed 261 | buffer.""" 262 | for offset, length in self.body_chunks: 263 | buf.extend(self.buffer[offset:offset + length]) 264 | 265 | 266 | class ChunkReader(object): 267 | """Reads the body of a HTTP message with chunked encoding.""" 268 | 269 | def __init__(self): 270 | self.mode = "start" 271 | self.start = True 272 | self.remaining = 0 273 
| 274 | def feed_predict(self): 275 | if self.mode == 'start': 276 | return None, '\r\n' 277 | elif self.mode == 'chunk': 278 | if self.remaining == 0: 279 | return None, '\r\n' 280 | else: 281 | return self.remaining, None 282 | elif self.mode == 'trailer': 283 | return None, '\r\n' 284 | elif self.mode == 'end': 285 | return 0, None 286 | 287 | def feed_start(self, parser, text): 288 | """Feed text into the ChunkReader when the mode is 'start'.""" 289 | pos = len(parser.buffer) 290 | line, text = parser.feed_line(text) 291 | offset = len(parser.buffer) 292 | 293 | if line is not None: 294 | try: 295 | chunk = int(line.split(b';', 1)[0], 16) 296 | except ValueError: 297 | # ugh, this means the chunk is probably not a chunk 298 | if self.start: 299 | # undo, stip text from buffer 300 | del parser.buffer[pos:] 301 | parser.offset = len(parser.buffer) 302 | raise BrokenChunks() 303 | else: 304 | raise 305 | 306 | parser.body_chunks.append((offset, chunk)) 307 | self.remaining = chunk 308 | if chunk == 0: 309 | self.mode = 'trailer' 310 | else: 311 | self.mode = 'chunk' 312 | self.start = False 313 | return text 314 | 315 | def feed_chunk(self, parser, text): 316 | """Feed text into the ChunkReader when the mode is 'chunk'.""" 317 | if self.remaining > 0: 318 | self.remaining, text = parser.feed_length(text, self.remaining) 319 | if self.remaining == 0: 320 | end_of_chunk, text = parser.feed_line(text) 321 | if end_of_chunk: 322 | self.mode = 'start' 323 | 324 | return text 325 | 326 | def feed_trailer(self, parser, text): 327 | """Feed text into the ChunkReader when the mode is 328 | 'trailer'.""" 329 | line, text = parser.feed_line(text) 330 | if line is not None: 331 | parser.header.add_trailer_line(line) 332 | if line in NEWLINES: 333 | self.mode = 'end' 334 | 335 | return text 336 | 337 | def feed(self, parser, text): 338 | """Feed text into the ChunkReader.""" 339 | while text: 340 | if self.mode == 'start': 341 | text = self.feed_start(parser, text) 342 | 343 | 
if text and self.mode == 'chunk': 344 | text = self.feed_chunk(parser, text) 345 | 346 | if text and self.mode == 'trailer': 347 | text = self.feed_trailer(parser, text) 348 | 349 | if self.mode == 'end': 350 | parser.mode = 'end' 351 | break 352 | 353 | return text 354 | 355 | 356 | class LengthReader(object): 357 | 358 | def __init__(self, length): 359 | self.remaining = length 360 | 361 | def feed_predict(self): 362 | return self.remaining, None 363 | 364 | def feed(self, parser, text): 365 | if self.remaining > 0: 366 | self.remaining, text = parser.feed_length(text, self.remaining) 367 | if self.remaining <= 0: 368 | parser.mode = 'end' 369 | return text 370 | 371 | 372 | class ZipLengthReader(LengthReader): 373 | """ 374 | Tries to read the body as gzip according to length. In case that fails, it 375 | disregards the Content-Length and reads it normally. 376 | """ 377 | def __init__(self, length, text): 378 | # TODO test if this works with gzipped responses in WARC 379 | try: 380 | self._file = GzipFile(fileobj=BytesIO(text[:length]), mode='rb') 381 | self._text = self._file.read() 382 | super(ZipLengthReader, self).__init__(len(self._text)) 383 | except IOError: 384 | self._file = None 385 | super(ZipLengthReader, self).__init__(len(text)) 386 | 387 | def __del__(self): 388 | if self._file: 389 | self._file.close() 390 | 391 | def feed(self, parser, text): 392 | """Parse the body according to remaining length""" 393 | if self.remaining > 0: 394 | if self._file: 395 | text = self._text 396 | self.remaining, text = parser.feed_length(text, self.remaining) 397 | if self.remaining <= 0: 398 | parser.mode = 'end' 399 | return text 400 | 401 | 402 | class HTTPHeader(object): 403 | STRIP_HEADERS = [n.lower() for n in (b'Content-Length', 404 | b'Transfer-Encoding', b'Content-Encoding', 405 | b'TE', b'Expect', b'Trailer')] 406 | 407 | def __init__(self, ignore_headers): 408 | self.headers = [] 409 | self.keep_alive = False 410 | self.mode = 'close' 411 | 
self.content_length = None 412 | self.encoding = None 413 | self.trailers = [] 414 | self.expect_continue = False 415 | self.ignore_headers = set(x.lower() for x in ignore_headers) 416 | 417 | def has_body(self): 418 | pass 419 | 420 | def set_start_line(self, line): 421 | pass 422 | 423 | def write_decoded(self, buf): 424 | self.write_decoded_start(buf) 425 | strip_headers = self.STRIP_HEADERS if self.has_body() else () 426 | self.write_headers(buf, strip_headers) 427 | 428 | def write_decoded_start(self, buf): 429 | pass 430 | 431 | def write_headers(self, buf, strip_headers=()): 432 | for k, v in self.headers: 433 | if k.lower() not in strip_headers: 434 | buf.extend(k + b': ' + v + b'\r\n') 435 | for k, v in self.trailers: 436 | if k.lower() not in strip_headers: 437 | buf.extend(k + b': ' + v + b'\r\n') 438 | 439 | def add_trailer_line(self, line): 440 | if line.startswith(b' ') or line.startswith(b'\t'): 441 | k, v = self.trailers.pop() 442 | line = line.strip() 443 | v = v + b' ' + line 444 | self.trailers.append((k, v)) 445 | elif line in NEWLINES: 446 | pass 447 | else: 448 | name, value = line.split(b':', 1) 449 | name = name.strip() 450 | value = value.strip() 451 | self.trailers.append((name, value)) 452 | 453 | def add_header(self, name, value): 454 | self.headers.append((name, value)) 455 | 456 | def add_header_line(self, line): 457 | if line.startswith(b' ') or line.startswith(b'\t'): 458 | k, v = self.headers.pop() 459 | line = line.strip() 460 | v = v + b' ' + line 461 | self.add_header(k, v) 462 | 463 | elif line in NEWLINES: 464 | for name, value in self.headers: 465 | name = name.lower() 466 | value = value.lower() 467 | 468 | # todo handle multiple instances 469 | # of these headers 470 | if name in self.ignore_headers: 471 | #print >> sys.stderr, 'ignore', name 472 | pass 473 | elif name == b'expect': 474 | if b'100-continue' in value: 475 | self.expect_continue = True 476 | elif name == b'content-length': 477 | if self.mode == 'close': 478 | 
self.content_length = int(value) 479 | self.mode = 'length' 480 | 481 | elif name == b'transfer-encoding': 482 | if b'chunked' in value: 483 | self.mode = 'chunked' 484 | 485 | elif name == b'content-encoding': 486 | self.encoding = value 487 | 488 | elif name == b'connection': 489 | if b'keep-alive' in value: 490 | self.keep_alive = True 491 | elif b'close' in value: 492 | self.keep_alive = False 493 | 494 | else: 495 | name, value = line.split(b':', 1) 496 | name = name.strip() 497 | value = value.strip() 498 | self.add_header(name, value) 499 | 500 | def body_is_chunked(self): 501 | return self.mode == 'chunked' 502 | 503 | def body_length(self): 504 | if self.mode == 'length': 505 | return self.content_length 506 | 507 | url_rx = re.compile( 508 | b'(?Phttps?)://(?P(?P[^:/]+)(?::(?P\d+))?)' 509 | b'(?P.*)', 510 | re.I) 511 | 512 | 513 | class RequestHeader(HTTPHeader): 514 | 515 | def __init__(self, ignore_headers=()): 516 | HTTPHeader.__init__(self, ignore_headers=ignore_headers) 517 | self.method = '' 518 | self.target_uri = '' 519 | self.version = '' 520 | self.host = '' 521 | self.scheme = 'http' 522 | self.port = 80 523 | self.host = '' 524 | 525 | @property 526 | def url(self): 527 | if (self.scheme == 'http' and self.port == 80)\ 528 | or (self.scheme == 'https' and self.port == 80): 529 | return "%s://%s%s"%(self.scheme, self.host, self.target_uri) 530 | else: 531 | return "%s://%s:%s%s"%(self.scheme, self.host, self.port, self.target_uri) 532 | 533 | 534 | def add_header(self, name, value): 535 | 536 | if name.lower() == b'host': 537 | if b':' in value: 538 | self.host, self.port = value.split(b':',1) 539 | else: 540 | self.host = value 541 | 542 | return HTTPHeader.add_header(self, name, value) 543 | 544 | def set_start_line(self, line): 545 | self.method, self.target_uri, self.version = \ 546 | line.rstrip().split(b' ', 2) 547 | 548 | if self.method.upper() == b"CONNECT": 549 | # target_uri = host:port 550 | self.host, self.port = 
self.target_uri.split(b':') 551 | else: 552 | match = url_rx.match(self.target_uri) 553 | if match: 554 | #self.add_header('Host', match.group('authority')) 555 | self.target_uri = match.group('path') 556 | self.host = match.group('host') 557 | port = match.group('port') 558 | self.port = int(port) if port else 80 559 | 560 | self.scheme = match.group('scheme') 561 | if not self.target_uri: 562 | if self.method.upper() == 'OPTIONS': 563 | self.target_uri = '*' 564 | else: 565 | self.target_uri = '/' 566 | 567 | if self.version == 'HTTP/1.0': 568 | self.keep_alive = False 569 | 570 | def has_body(self): 571 | return self.mode in ('chunked', 'length') 572 | 573 | def write_decoded_start(self, buf): 574 | buf.extend(self.method + b' ' + self.target_uri + b' ' + self.version + b'\r\n') 575 | 576 | 577 | class ResponseHeader(HTTPHeader): 578 | 579 | def __init__(self, request=None, ignore_headers=()): 580 | HTTPHeader.__init__(self, ignore_headers=ignore_headers) 581 | self.request = request 582 | self.version = b"HTTP/1.1" 583 | self.code = 0 584 | self.phrase = "Empty Response" 585 | 586 | @property 587 | def method(self): 588 | return self.request.method 589 | 590 | @property 591 | def url(self): 592 | return self.request.url 593 | 594 | @property 595 | def host(self): 596 | return self.request.host 597 | 598 | @property 599 | def port(self): 600 | return self.request.port 601 | 602 | @property 603 | def scheme(self): 604 | return self.request.scheme 605 | 606 | def set_start_line(self, line): 607 | parts = line.rstrip().split(b' ', 2) 608 | self.version, self.code = parts[:2] 609 | self.phrase = parts[2] if len(parts) >= 3 else b"" 610 | 611 | self.code = int(self.code) 612 | if self.version == b'HTTP/1.0': 613 | self.keep_alive = False 614 | 615 | def has_body(self): 616 | if self.request and self.request.method in Methods.no_body: 617 | return False 618 | elif self.code in Codes.no_body: 619 | return False 620 | 621 | return True 622 | 623 | def 
write_decoded_start(self, buf): 624 | buf.extend(self.version + b' ' + str(self.code).encode('ascii') + b' ' + self.phrase + b'\r\n') 625 | 626 | 627 | class RequestMessage(HTTPMessage): 628 | CONTENT_TYPE = HTTPMessage.CONTENT_TYPE + b";msgtype=request" 629 | 630 | def __init__(self, ignore_headers=()): 631 | HTTPMessage.__init__(self, 632 | RequestHeader(ignore_headers=ignore_headers)) 633 | 634 | 635 | class ResponseMessage(HTTPMessage): 636 | CONTENT_TYPE = HTTPMessage.CONTENT_TYPE + b";msgtype=response" 637 | 638 | def __init__(self, request, ignore_headers=()): 639 | self.interim = [] 640 | HTTPMessage.__init__(self, 641 | ResponseHeader(request.header, 642 | ignore_headers=ignore_headers)) 643 | 644 | def got_continue(self): 645 | return bool(self.interim) 646 | 647 | @property 648 | def code(self): 649 | return self.header.code 650 | 651 | def feed(self, text): 652 | text = HTTPMessage.feed(self, text) 653 | if self.complete() and self.header.code == Codes.Continue: 654 | self.interim.append(self.header) 655 | self.header = ResponseHeader(self.header.request) 656 | self.body_chunks = [] 657 | self.mode = 'start' 658 | self.body_reader = None 659 | text = HTTPMessage.feed(self, text) 660 | return text 661 | 662 | def as_http09(self): 663 | return HTTP09Response(self) 664 | 665 | class HTTP09ResponseHeader(HTTPHeader): 666 | def __init__(self, request=None, ignore_headers=()): 667 | HTTPHeader.__init__(self, ignore_headers=ignore_headers) 668 | self.request = request 669 | self.version = "HTTP/0.9" 670 | self.code = 200 671 | self.phrase = "" 672 | 673 | @property 674 | def method(self): 675 | return self.request.method 676 | 677 | @property 678 | def url(self): 679 | return self.request.url 680 | 681 | @property 682 | def host(self): 683 | return self.request.host 684 | 685 | @property 686 | def port(self): 687 | return self.request.port 688 | 689 | @property 690 | def scheme(self): 691 | return self.request.scheme 692 | 693 | def has_body(self): 694 | 
return True 695 | 696 | class HTTP09Response(HTTPMessage): 697 | CONTENT_TYPE = "%s;msgtype=response;version=0.9" % HTTPMessage.CONTENT_TYPE 698 | def __init__(self, response): 699 | header= HTTP09ResponseHeader(response.header.request) 700 | HTTPMessage.__init__(self, header, buf=response.buffer, offset=response.offset) 701 | self.mode = 'body' 702 | 703 | @property 704 | def code(self): 705 | return self.header.code 706 | 707 | def feed_predict(self): 708 | """returns size, terminator request for input. size is 0 means end. """ 709 | return -1, None 710 | 711 | def feed(self, text): 712 | """Push more text from the input stream into the parser.""" 713 | self.buffer.extend(text) 714 | return '' 715 | 716 | def close(self): 717 | """Mark the end of the input stream and finish parsing.""" 718 | self.mode = 'end' 719 | 720 | def get_message(self): 721 | """Returns the contents of the input buffer.""" 722 | return bytes(self.buffer) 723 | 724 | def get_decoded_message(self): 725 | """Return the input stream reconstructed from the parsed 726 | data.""" 727 | return bytes(self.buffer) 728 | 729 | def write_decoded_message(self, buf): 730 | """Writes the parsed data to the buffer passed.""" 731 | buf.extend(self.buffer) 732 | 733 | def get_body(self): 734 | """Returns the body of the HTTP message.""" 735 | return bytes(self.buffer) 736 | 737 | def write_body(self, buf): 738 | buf.extend(self.buffer) 739 | 740 | 741 | -------------------------------------------------------------------------------- /hanzo/httptools/semantics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Semantics as based upon 3 | http://tools.ietf.org/html/draft-ietf-httpbis-p2-semantics-17 4 | """ 5 | 6 | class Methods(object): 7 | GET = b'GET' 8 | PUT = b'PUT' 9 | HEAD = b'HEAD' 10 | DELETE = b'DELETE' 11 | POST = b'POST' 12 | OPTIONS = b'OPTIONS' 13 | TRACE = b'TRACE' 14 | PATCH = b'PATCH' 15 | CONNECT = b'CONNECT' 16 | safe = (GET, HEAD, OPTIONS, 
TRACE,) 17 | idempotent = (PUT, DELETE,) 18 | no_body = (HEAD,) 19 | cacheable = (GET,) 20 | 21 | 22 | def range_collection(func): 23 | """Returns an object (x) that responds to foo in x,""" 24 | 25 | class Range(object): 26 | def __contains__(self, item): 27 | return func(item) 28 | 29 | return Range() 30 | 31 | 32 | class Codes(object): 33 | #pylint: disable-msg=e0213 34 | Continue = 100 35 | switching_protocols = 101 36 | 37 | @range_collection 38 | def informational(code): 39 | return 100 <= code < 200 40 | 41 | ok = 200 42 | created = 201 43 | accepted = 202 44 | non_authorative_content = 203 45 | no_content = 204 46 | reset_content = 205 47 | partial_content = 206 48 | 49 | @range_collection 50 | def successful(code): 51 | return 200 <= code < 300 52 | 53 | 54 | moved_permanently = 301 55 | found = 302 56 | see_other = 303 57 | not_modified = 304 58 | use_proxy = 305 59 | obsolete_switch_proxy = 306 60 | temporary_redirect = 307 61 | 62 | @range_collection 63 | def redirection(code): 64 | return 300 <= code < 400 65 | 66 | 67 | bad_request = 400 68 | unauthorized = 401 69 | payment_required = 402 70 | forbidden = 403 71 | not_found = 404 72 | method_not_allowed = 405 73 | not_acceptable = 406 74 | proxy_authentication_required = 407 75 | request_timeout = 408 76 | conflict = 409 77 | gone = 410 78 | length_required = 411 79 | precondition_failed = 412 80 | request_representation_too_large = 413 81 | uri_too_long = 414 82 | unsupported_media_type = 415 83 | requested_range_not_satisfiable =415 84 | expectation_failed = 417 85 | upgrade_required = 426 86 | 87 | @range_collection 88 | def client_error(code): 89 | return 400 <= code < 500 90 | 91 | 92 | internal_server_error = 501 93 | not_implemented = 501 94 | bad_gateway = 502 95 | service_unavailable = 503 96 | gateway_timeout = 504 97 | http_version_not_supported = 505 98 | @range_collection 99 | def server_error(code): 100 | return 500 <= code < 600 101 | 102 | @range_collection 103 | def no_body(code): 104 
| return (100 <= code < 200) or (code == 204) or (code == 304) 105 | 106 | -------------------------------------------------------------------------------- /hanzo/httptools/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/warctools/21db132fd3e4b4042cd011d9dc3fb30276a5a0b6/hanzo/httptools/tests/__init__.py -------------------------------------------------------------------------------- /hanzo/httptools/tests/parse_test.py: -------------------------------------------------------------------------------- 1 | """Tests for http parsing.""" 2 | import unittest 3 | 4 | # want unittest2 for python2.6 5 | try: 6 | unittest.TestCase.assertIsNone 7 | except AttributeError: 8 | import unittest2 9 | unittest = unittest2 10 | 11 | from hanzo.httptools.messaging import \ 12 | RequestMessage, \ 13 | ResponseMessage 14 | 15 | get_request_lines = [ 16 | b"GET / HTTP/1.1", 17 | b"Host: example.org", 18 | b"", 19 | b"", 20 | ] 21 | get_request = b"\r\n".join(get_request_lines) 22 | get_response_lines = [ 23 | b"HTTP/1.1 200 OK", 24 | b"Host: example.org", 25 | b"Content-Length: 5", 26 | b"", 27 | b"tests", 28 | ] 29 | get_response = b"\r\n".join(get_response_lines) 30 | 31 | 32 | class GetChar(unittest.TestCase): 33 | """Test basic GET request parsing. 
Single byte at a time.""" 34 | 35 | def runTest(self): 36 | """Attempts to parse the contents of get_request and 37 | get_response.""" 38 | p = RequestMessage() 39 | for t in get_request: 40 | if isinstance(t, int): t = bytes([t]) # python3 41 | text = p.feed(t) 42 | self.assertEqual(text, b'') 43 | 44 | self.assertTrue(p.headers_complete()) 45 | self.assertTrue(p.complete()) 46 | 47 | self.assertEqual(get_request, p.get_decoded_message()) 48 | 49 | p = ResponseMessage(p) 50 | for char in get_response: 51 | if isinstance(char, int): char = bytes([char]) # python3 52 | text = p.feed(char) 53 | self.assertEqual(text, b'') 54 | 55 | self.assertTrue(p.headers_complete()) 56 | self.assertTrue(p.complete()) 57 | self.assertEqual(get_response, p.get_decoded_message()) 58 | self.assertEqual(b"tests", p.get_body()) 59 | 60 | 61 | class GetLines(unittest.TestCase): 62 | """Test basic GET request parsing. Single line at a time.""" 63 | 64 | def runTest(self): 65 | """Attempts to parse get_request_lines, i.e. 
get_request line 66 | at a time.""" 67 | 68 | p = RequestMessage() 69 | for line in get_request_lines[:-1]: 70 | text = p.feed(line) 71 | self.assertEqual(text, b"") 72 | text = p.feed(b"\r\n") 73 | self.assertEqual(text, b"") 74 | text = p.feed(get_request_lines[-1]) 75 | self.assertEqual(text, b"") 76 | 77 | self.assertTrue(p.headers_complete()) 78 | self.assertTrue(p.complete()) 79 | 80 | self.assertEqual(get_request, p.get_decoded_message()) 81 | 82 | p = ResponseMessage(p) 83 | for line in get_response_lines[:-1]: 84 | text = p.feed(line) 85 | self.assertEqual(text, b"") 86 | text = p.feed(b"\r\n") 87 | self.assertEqual(text, b"") 88 | text = p.feed(get_response_lines[-1]) 89 | 90 | self.assertEqual(text, b"") 91 | 92 | self.assertTrue(p.headers_complete()) 93 | self.assertTrue(p.complete()) 94 | 95 | self.assertEqual(get_response, p.get_decoded_message()) 96 | 97 | self.assertEqual(p.code, 200) 98 | self.assertEqual(p.header.version, b"HTTP/1.1") 99 | self.assertEqual(p.header.phrase, b"OK") 100 | 101 | 102 | head_request = b"\r\n".join([ 103 | b"HEAD / HTTP/1.1", 104 | b"Host: example.org", 105 | b"", 106 | b"", 107 | ]) 108 | head_response = b"\r\n".join([ 109 | b"HTTP/1.1 200 OK", 110 | b"Host: example.org", 111 | b"Content-Length: 5", 112 | b"", 113 | b"", 114 | ]) 115 | 116 | 117 | class HeadTest(unittest.TestCase): 118 | """Tests parsing of HEAD requests and responses.""" 119 | 120 | def runTest(self): 121 | """Constructs a RequestMessage and ResponseMessage and uses them to 122 | parse HEAD messages.""" 123 | p = RequestMessage() 124 | text = p.feed(head_request) 125 | 126 | self.assertEqual(text, b'') 127 | self.assertTrue(p.complete()) 128 | self.assertEqual(head_request, p.get_decoded_message()) 129 | 130 | p = ResponseMessage(p) 131 | text = p.feed(head_response) 132 | 133 | self.assertEqual(text, b'') 134 | self.assertTrue(p.complete()) 135 | self.assertEqual(head_response, p.get_decoded_message()) 136 | self.assertEqual(p.code, 200) 137 | 
self.assertEqual(p.header.version, b"HTTP/1.1") 138 | self.assertEqual(p.header.phrase, b"OK") 139 | 140 | 141 | class PostTestChunked(unittest.TestCase): 142 | """Tests the parser with a POST request with chunked encoding.""" 143 | post_request = b"\r\n".join([ 144 | b"POST / HTTP/1.1", 145 | b"Host: example.org", 146 | b"Transfer-Encoding: chunked", 147 | b"", 148 | b"8", 149 | b"abcdefgh", 150 | b"0", 151 | b"", 152 | b"", 153 | ]) 154 | post_response = b"\r\n".join([ 155 | b"HTTP/1.1 100 Continue", 156 | b"Host: example.org", 157 | b"", 158 | b"HTTP/1.0 204 No Content", 159 | b"Date: now!", 160 | b"", 161 | b"", 162 | ]) 163 | 164 | def runTest(self): 165 | """Tests parsing of POST requests and responses.""" 166 | p = RequestMessage() 167 | text = p.feed(self.post_request) 168 | 169 | self.assertEqual(text, b'') 170 | self.assertTrue(p.complete()) 171 | 172 | p = ResponseMessage(p) 173 | text = p.feed(self.post_response) 174 | 175 | self.assertEqual(text, b'') 176 | self.assertTrue(p.complete()) 177 | self.assertEqual(p.code, 204) 178 | self.assertEqual(p.header.version, b"HTTP/1.0") 179 | self.assertEqual(p.header.phrase, b"No Content") 180 | 181 | 182 | class PostTestChunkedEmpty(unittest.TestCase): 183 | """Tests the parser with a POST request with chunked encoding and 184 | an empty body.""" 185 | post_request = b"\r\n".join([ 186 | b"POST / HTTP/1.1", 187 | b"Host: example.org", 188 | b"Transfer-Encoding: chunked", 189 | b"", 190 | b"0", 191 | b"", 192 | b"", 193 | ]) 194 | post_response = b"\r\n".join([ 195 | b"HTTP/1.1 100 Continue", 196 | b"Host: example.org", 197 | b"", 198 | b"HTTP/1.0 204 No Content", 199 | b"Date: now!", 200 | b"", 201 | b"", 202 | ]) 203 | 204 | def runTest(self): 205 | """Tests parsing of POST requests and responses.""" 206 | p = RequestMessage() 207 | text = p.feed(self.post_request) 208 | 209 | self.assertEqual(text, b'') 210 | self.assertTrue(p.complete()) 211 | 212 | p = ResponseMessage(p) 213 | text = 
p.feed(self.post_response) 214 | 215 | self.assertEqual(text, b'') 216 | self.assertTrue(p.complete()) 217 | self.assertEqual(p.code, 204) 218 | self.assertEqual(p.header.version, b"HTTP/1.0") 219 | self.assertEqual(p.header.phrase, b"No Content") 220 | 221 | 222 | class TestTwoPartStatus(unittest.TestCase): 223 | """This is a request taken from the wild that broke the crawler. The main 224 | part being tested is the status line without a message.""" 225 | 226 | request = b"\r\n".join([ 227 | b"GET / HTTP/1.1", 228 | b"Host: example.org", # Name changed to protect the guilty 229 | b"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 230 | b"Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.3", 231 | b"Accept-Encoding: gzip,deflate,sdch", 232 | b"Accept-Language: en-US,en;q=0.8", 233 | b"Connection: keep-alive", 234 | b"Host: example.org", 235 | b"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7", 236 | b"", 237 | b"", 238 | ]) 239 | response = b"\r\n".join([ 240 | b"HTTP/1.1 404", 241 | b"Cache-Control: no-cache", 242 | b"Content-Length: 0", 243 | b"Content-Type:image/gif", 244 | b"Pragma:no-cache", 245 | b"nnCoection: close", 246 | b"", 247 | b"", 248 | ]) 249 | 250 | def runTest(self): 251 | """Tests parsing of a broken response.""" 252 | p = RequestMessage() 253 | text = p.feed(self.request) 254 | 255 | self.assertEqual(text, b'') 256 | self.assertTrue(p.complete()) 257 | 258 | p = ResponseMessage(p) 259 | text = p.feed(self.response) 260 | 261 | self.assertEqual(text, b'') 262 | self.assertTrue(p.complete()) 263 | self.assertEqual(p.code, 404) 264 | self.assertEqual(p.header.version, b"HTTP/1.1") 265 | 266 | 267 | class TestPseudoGzipped(unittest.TestCase): 268 | """Test parsing of a response with Content-Encoding:gzip declared, but 269 | without the payload actually being gzipped (see #14)""" 270 | post_response = b"\r\n".join([ 271 | b"HTTP/1.1 200 OK", 272 | 
b"Host: example.org", 273 | b"Content-Encoding: gzip", 274 | b"Content-Length: 7", 275 | b"", 276 | b"text", 277 | b"" 278 | ]) 279 | 280 | def runTest(self): 281 | """Tests parsing the response.""" 282 | request = RequestMessage() 283 | response = ResponseMessage(request) 284 | text = response.feed(self.post_response) 285 | 286 | self.assertEqual(text, b'') 287 | self.assertTrue(response.complete()) 288 | self.assertEqual(response.code, 200) 289 | self.assertEqual(response.header.version, b"HTTP/1.1") 290 | 291 | 292 | class TestGzipped(unittest.TestCase): 293 | """Test parsing of a response with Content-Encoding:gzip declared 294 | and an actually gzipped payload (see #14)""" 295 | post_response = b"\r\n".join([ 296 | b"HTTP/1.1 200 OK", 297 | b"Host: example.org", 298 | b"Content-Encoding: gzip", 299 | b"Content-Length: 30", 300 | b"", 301 | (b"\x1f\x8b\x08\x08G\xb2\xc5V\x00\x03test\x00+I\xad(\xe1\x02\x00'" 302 | b"\xda\xec7\x05\x00\x00\x00") 303 | ]) 304 | 305 | def runTest(self): 306 | """Tests parsing of the response.""" 307 | request = RequestMessage() 308 | response = ResponseMessage(request) 309 | text = response.feed(self.post_response) 310 | 311 | self.assertEqual(text, b'') 312 | self.assertTrue(response.complete()) 313 | self.assertEqual(response.code, 200) 314 | self.assertEqual(response.header.version, b"HTTP/1.1") 315 | 316 | 317 | if __name__ == '__main__': 318 | unittest.main() 319 | -------------------------------------------------------------------------------- /hanzo/warc2warc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warc2warc - convert one warc to another, can be used to re-compress things""" 3 | 4 | from __future__ import print_function 5 | 6 | import os 7 | import sys 8 | 9 | import sys 10 | import os.path 11 | 12 | from optparse import OptionParser 13 | 14 | from .warctools import WarcRecord, expand_files 15 | from .httptools import RequestMessage, ResponseMessage 
16 | 17 | parser = OptionParser(usage="%prog [options] url (url ...)") 18 | 19 | parser.add_option("-o", "--output", dest="output", 20 | help="output warc file") 21 | parser.add_option("-l", "--limit", dest="limit") 22 | parser.add_option("-I", "--input", dest="input_format", help="(ignored)") 23 | parser.add_option("-Z", "--gzip", dest="gzip", action="store_true", help="compress output, record by record") 24 | parser.add_option("-D", "--decode_http", dest="decode_http", action="store_true", help="decode http messages (strip chunks, gzip)") 25 | parser.add_option("-L", "--log-level", dest="log_level") 26 | parser.add_option("--wget-chunk-fix", dest="wget_workaround", action="store_true", help="skip transfer-encoding headers in http records, when decoding them (-D)") 27 | 28 | parser.set_defaults(output_directory=None, limit=None, log_level="info", gzip=False, decode_http=False, wget_workaround=False) 29 | 30 | 31 | WGET_IGNORE_HEADERS = ['Transfer-Encoding'] 32 | 33 | def process(record, out, options): 34 | ignore_headers = WGET_IGNORE_HEADERS if options.wget_workaround else () 35 | if options.decode_http: 36 | if record.type == WarcRecord.RESPONSE: 37 | content_type, content = record.content 38 | message = None 39 | if content_type == ResponseMessage.CONTENT_TYPE: 40 | # technically, a http request needs to know the request to be parsed 41 | # because responses to head requests don't have a body. 
42 | # we assume we don't store 'head' responses, and plough on 43 | message = ResponseMessage(RequestMessage(), ignore_headers=ignore_headers) 44 | if content_type == RequestMessage.CONTENT_TYPE: 45 | message = RequestMessage(ignore_headers=ignore_headers) 46 | 47 | if message: 48 | leftover = message.feed(content) 49 | message.close() 50 | if not leftover and message.complete(): 51 | content = message.get_decoded_message() 52 | record.content = content_type, content 53 | else: 54 | error = [] 55 | if leftover: 56 | error.append("%d bytes unparsed"%len(leftover)) 57 | if not message.complete(): 58 | error.append("incomplete message (at %s, %s)"%(message.mode, message.header.mode)) 59 | print('errors decoding http in record', record.id, ",".join(error), file=sys.stderr) 60 | 61 | record.write_to(out, gzip=options.gzip) 62 | 63 | def main(argv): 64 | (options, input_files) = parser.parse_args(args=argv[1:]) 65 | 66 | try: # python3 67 | out = sys.stdout.buffer 68 | except AttributeError: # python2 69 | out = sys.stdout 70 | 71 | if len(input_files) < 1: 72 | fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None) 73 | 74 | for record in fh: 75 | process(record, out, options) 76 | else: 77 | for name in expand_files(input_files): 78 | fh = WarcRecord.open_archive(name, gzip="auto") 79 | for record in fh: 80 | process(record, out, options) 81 | 82 | fh.close() 83 | 84 | 85 | 86 | return 0 87 | 88 | def run(): 89 | sys.exit(main(sys.argv)) 90 | 91 | 92 | if __name__ == '__main__': 93 | run() 94 | 95 | 96 | -------------------------------------------------------------------------------- /hanzo/warcdump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warcdump - dump warcs in a slightly more humane format""" 3 | 4 | from __future__ import print_function 5 | 6 | import os 7 | import sys 8 | 9 | import sys 10 | import os.path 11 | 12 | from optparse import OptionParser 13 | 14 | from .warctools 
import WarcRecord, expand_files 15 | 16 | parser = OptionParser(usage="%prog [options] warc warc warc") 17 | 18 | parser.add_option("-l", "--limit", dest="limit") 19 | parser.add_option("-I", "--input", dest="input_format") 20 | parser.add_option("-L", "--log-level", dest="log_level") 21 | 22 | parser.set_defaults(output_directory=None, limit=None, log_level="info") 23 | 24 | def main(argv): 25 | (options, input_files) = parser.parse_args(args=argv[1:]) 26 | 27 | out = sys.stdout 28 | if len(input_files) < 1: 29 | dump_archive(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None), name="-",offsets=False) 30 | 31 | else: 32 | for name in expand_files(input_files): 33 | fh = WarcRecord.open_archive(name, gzip="auto") 34 | dump_archive(fh,name) 35 | 36 | fh.close() 37 | 38 | 39 | return 0 40 | 41 | def dump_archive(fh, name, offsets=True): 42 | for (offset, record, errors) in fh.read_records(limit=None, offsets=offsets): 43 | if record: 44 | print("archive record at %s:%s"%(name,offset)) 45 | record.dump(content=True) 46 | elif errors: 47 | print("warc errors at %s:%d"%(name, offset if offset else 0)) 48 | for e in errors: 49 | print('\t', e) 50 | else: 51 | print() 52 | print('note: no errors encountered in tail of file') 53 | 54 | def run(): 55 | sys.exit(main(sys.argv)) 56 | 57 | 58 | if __name__ == '__main__': 59 | run() 60 | 61 | 62 | -------------------------------------------------------------------------------- /hanzo/warcextract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warcextract - dump warc record context to standard out""" 3 | 4 | from __future__ import print_function 5 | 6 | import os 7 | import sys 8 | 9 | import sys 10 | import os.path 11 | 12 | from optparse import OptionParser 13 | from contextlib import closing 14 | 15 | from .warctools import WarcRecord 16 | 17 | parser = OptionParser(usage="%prog [options] warc offset") 18 | 19 | #parser.add_option("-l", "--limit", 
dest="limit") 20 | parser.add_option("-I", "--input", dest="input_format") 21 | parser.add_option("-L", "--log-level", dest="log_level") 22 | 23 | parser.set_defaults(output_directory=None, limit=None, log_level="info") 24 | 25 | def main(argv): 26 | (options, args) = parser.parse_args(args=argv[1:]) 27 | 28 | try: # python3 29 | out = sys.stdout.buffer 30 | except AttributeError: # python2 31 | out = sys.stdout 32 | 33 | if len(args) < 1: 34 | # dump the first record on stdin 35 | with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh: 36 | dump_record(fh, out) 37 | 38 | else: 39 | # dump a record from the filename, with optional offset 40 | filename = args[0] 41 | if len(args) > 1: 42 | offset = int(args[1]) 43 | else: 44 | offset = 0 45 | 46 | with closing(WarcRecord.open_archive(filename=filename, gzip="auto")) as fh: 47 | fh.seek(offset) 48 | dump_record(fh, out) 49 | 50 | 51 | return 0 52 | 53 | def dump_record(fh, out): 54 | for (offset, record, errors) in fh.read_records(limit=1, offsets=False): 55 | if record: 56 | out.write(record.content[1]) 57 | elif errors: 58 | print("warc errors at %s:%d"%(name, offset if offset else 0), file=sys.stderr) 59 | for e in errors: 60 | print('\t', e) 61 | break # only use one (I'm terrible) 62 | 63 | 64 | def run(): 65 | sys.exit(main(sys.argv)) 66 | 67 | 68 | if __name__ == '__main__': 69 | run() 70 | 71 | 72 | -------------------------------------------------------------------------------- /hanzo/warcfilter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warcfilter - prints warcs in that match regexp, by default searches all headers""" 3 | 4 | import os 5 | import sys 6 | 7 | import re 8 | 9 | from optparse import OptionParser 10 | 11 | from .warctools import WarcRecord, expand_files 12 | from .httptools import RequestMessage, ResponseMessage 13 | 14 | parser = OptionParser(usage="%prog [options] pattern warc warc warc") 15 | 16 
| parser.add_option("-l", "--limit", dest="limit", help="limit (ignored)") 17 | parser.add_option("-I", "--input", dest="input_format", help="input format (ignored)") 18 | parser.add_option("-i", "--invert", dest="invert",action="store_true", help="invert match") 19 | parser.add_option("-U", "--url", dest="url",action="store_true", help="match on url") 20 | parser.add_option("-T", "--type", dest="type",action="store_true", help="match on (warc) record type") 21 | parser.add_option("-C", "--content-type", dest="content_type",action="store_true", help="match on (warc) record content type") 22 | parser.add_option("-H", "--http-content-type", dest="http_content_type",action="store_true", help="match on http payload content type") 23 | parser.add_option("-D", "--warc-date", dest="warc_date",action="store_true", help="match on WARC-Date header") 24 | parser.add_option("-L", "--log-level", dest="log_level", help="log level(ignored)") 25 | 26 | parser.set_defaults(output_directory=None, limit=None, log_level="info", invert=False, url=None, content_type=None, type=None) 27 | 28 | def parse_http_response(record): 29 | message = ResponseMessage(RequestMessage()) 30 | remainder = message.feed(record.content[1]) 31 | message.close() 32 | if remainder or not message.complete(): 33 | if remainder: 34 | logging.warning('trailing data in http response for %s'% record.url) 35 | if not message.complete(): 36 | logging.warning('truncated http response for %s'%record.url) 37 | 38 | header = message.header 39 | 40 | mime_type = [v for k,v in header.headers if k.lower() == b'content-type'] 41 | if mime_type: 42 | mime_type = mime_type[0].split(b';')[0] 43 | else: 44 | mime_type = None 45 | 46 | return header.code, mime_type, message 47 | 48 | def main(argv): 49 | (options, input_files) = parser.parse_args(args=argv[1:]) 50 | 51 | try: # python3 52 | out = sys.stdout.buffer 53 | except AttributeError: # python2 54 | out = sys.stdout 55 | 56 | if len(input_files) < 1: 57 | parser.error("no 
pattern") 58 | 59 | 60 | pattern, input_files = input_files[0].encode(), input_files[1:] 61 | 62 | 63 | invert = options.invert 64 | pattern = re.compile(pattern) 65 | if not input_files: 66 | fh = WarcRecord.open_archive(file_handle=sys.stdin, gzip=None) 67 | filter_archive(fh, options, pattern, out) 68 | else: 69 | for name in expand_files(input_files): 70 | fh = WarcRecord.open_archive(name, gzip="auto") 71 | filter_archive(fh, options, pattern,out) 72 | fh.close() 73 | 74 | 75 | 76 | return 0 77 | 78 | def filter_archive(fh, options, pattern, out): 79 | invert = options.invert 80 | for record in fh: 81 | if options.url: 82 | if bool(record.url and pattern.search(record.url)) ^ invert : 83 | record.write_to(out) 84 | 85 | elif options.type: 86 | if bool(record.type and pattern.search(record.type)) ^ invert: 87 | record.write_to(out) 88 | 89 | elif options.content_type: 90 | if bool(record.content_type and pattern.search(record.content_type)) ^ invert: 91 | record.write_to(out) 92 | 93 | elif options.http_content_type: 94 | if record.type == WarcRecord.RESPONSE and record.content_type.startswith(b'application/http'): 95 | code, content_type, message = parse_http_response(record) 96 | 97 | if bool(content_type and pattern.search(content_type)) ^ invert: 98 | record.write_to(out) 99 | 100 | elif options.warc_date: 101 | if bool(record.date and pattern.search(record.date)) ^ invert: 102 | record.write_to(out) 103 | 104 | else: 105 | found = False 106 | for name, value in record.headers: 107 | if pattern.search(value): 108 | found = True 109 | break 110 | 111 | content_type, content = record.content 112 | if not found: 113 | found = bool(pattern.search(content)) 114 | 115 | 116 | if found ^ invert: 117 | record.write_to(out) 118 | 119 | 120 | def run(): 121 | sys.exit(main(sys.argv)) 122 | 123 | 124 | if __name__ == '__main__': 125 | run() 126 | 127 | 128 | -------------------------------------------------------------------------------- /hanzo/warcindex.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """warcindex - dump warc index""" 3 | 4 | import os 5 | import sys 6 | 7 | import sys 8 | import os.path 9 | 10 | from optparse import OptionParser 11 | 12 | from .warctools import WarcRecord, expand_files 13 | 14 | parser = OptionParser(usage="%prog [options] warc warc warc") 15 | 16 | parser.add_option("-l", "--limit", dest="limit") 17 | parser.add_option("-O", "--output-format", dest="output_format", help="output format (ignored)") 18 | parser.add_option("-o", "--output", dest="output_format", help="output file (ignored)") 19 | 20 | parser.add_option("-L", "--log-level", dest="log_level") 21 | 22 | parser.set_defaults(output=None, limit=None, log_level="info") 23 | 24 | def main(argv): 25 | (options, input_files) = parser.parse_args(args=argv[1:]) 26 | 27 | try: # python3 28 | out = sys.stdout.buffer 29 | except AttributeError: # python2 30 | out = sys.stdout 31 | 32 | if len(input_files) < 1: 33 | parser.error("no imput warc file(s)") 34 | 35 | out.write(b'#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length\n') 36 | for name in expand_files(input_files): 37 | fh = WarcRecord.open_archive(name, gzip="auto") 38 | 39 | try: 40 | for (offset, record, errors) in fh.read_records(limit=None): 41 | if record: 42 | fields = [name.encode('utf-8'), 43 | str(offset).encode('utf-8'), 44 | record.type or b'-', 45 | record.url or b'-', 46 | record.id or b'-', 47 | record.content_type or b'-', 48 | str(record.content_length).encode('utf-8')] 49 | out.write(b' '.join(fields) + b'\n') 50 | elif errors: 51 | pass 52 | # ignore 53 | else: 54 | pass 55 | # no errors at tail 56 | 57 | finally: 58 | fh.close() 59 | 60 | return 0 61 | 62 | 63 | def run(): 64 | sys.exit(main(sys.argv)) 65 | 66 | 67 | if __name__ == '__main__': 68 | run() 69 | 70 | 71 | -------------------------------------------------------------------------------- 
def parse_http_response(record):
    """Parse the HTTP response held in a warc record's body.

    Returns (status_code, mime_type, message) where mime_type is the
    Content-Type value stripped of parameters, or None when absent.
    Trailing or truncated data is logged but does not abort parsing.
    """
    message = ResponseMessage(RequestMessage())
    leftover = message.feed(record.content[1])
    message.close()

    if leftover:
        logging.warning('trailing data in http response for %s' % record.url)
    if not message.complete():
        logging.warning('truncated http response for %s' % record.url)

    header = message.header

    # first content-type header wins; drop parameters such as charset
    mime_type = next(
        (v for k, v in header.headers if k.lower() == 'content-type'), None)
    if mime_type is not None:
        mime_type = mime_type.split(';')[0]

    return header.code, mime_type, message
# fallback link extractor helpers

def attr_extractor(*names):
    """Return a function that pulls the values of the given attribute
    names out of an (attr, value) list, skipping empty values."""
    def _extractor(attrs):
        return [value for key, value in attrs if key in names and value]
    return _extractor


def meta_extractor(attrs):
    """Extract url=... targets from a <meta> tag's content attribute,
    e.g. <meta http-equiv="refresh" content="0;url=http://example.com/">.

    Returns a list of the extracted urls (possibly empty).
    """
    content = [value for key, value in attrs if key == "content" and value]
    urls = []
    for value in content:
        for pair in value.split(";"):
            # BUG FIX: was pair.split("=", 2), which truncated urls
            # containing '=' (e.g. query strings) at the second '='.
            # Split once so bits[1] keeps everything after the first '='.
            bits = pair.split("=", 1)
            if len(bits) > 1 and bits[0].lower() == "url":
                urls.append(bits[1].strip())
    return urls
def base_extractor(self, attrs):
    """LinkParser method: honor a <base href=...> tag by updating
    self.base to the last non-empty href seen; contributes no links."""
    for key, value in attrs:
        if key == "href" and value:
            # later href values win, matching "last one counts"
            self.base = value
    return ()

def handle_starttag(self, tag, attrs):
    """LinkParser method: on each opening tag, run the extractor
    registered for that tag (if any) and collect the links it yields."""
    extract = self.tag_extractor.get(tag, None)
    if extract is None:
        return
    self.links.extend(extract(attrs))
def main(argv):
    """Entry point for warclinks: print every outbound link found in the
    html responses of the given warc files. Returns 0 on success, and a
    negative count of failed archives otherwise."""
    options, warcs = parser.parse_args(args=argv[1:])
    logging.basicConfig(level=LEVELS[options.log_level])

    if not warcs:
        parser.error("missing warcs(s)")

    exit_code = 0

    for warc in expand_files(warcs):
        try:
            with closing(WarcRecord.open_archive(filename=warc, gzip="auto")) as fh:
                for link in extract_links_from_warcfh(fh):
                    print(link)
        except Exception as e:
            # a broken archive lowers the exit code but does not stop the run
            logging.error(str(e))
            exit_code -= 1

    return exit_code
def dump_payload_from_stream(fh):
    """Read the next record from fh and dump its payload to stdout.

    For WARC response records wrapping an HTTP response, the HTTP
    headers are parsed (and skipped) so only the message body is
    written; any other record's content is dumped verbatim.
    """
    try:  # python3
        out = sys.stdout.buffer
    except AttributeError:  # python2
        out = sys.stdout

    for (offset, record, errors) in fh.read_records(limit=1, offsets=False):
        if record:
            if (record.type == WarcRecord.RESPONSE
                    and record.content_type.startswith(b'application/http')):
                # parse the embedded http headers so only the body is dumped
                f = FileHTTPResponse(record.content_file)
                f.begin()
            else:
                f = record.content_file

            buf = f.read(8192)
            while buf != b'':
                out.write(buf)
                buf = f.read(8192)

        elif errors:
            # BUG FIX: the original formatted "%s:%d" % (name, ...) with an
            # undefined variable `name`, raising NameError on the error
            # path; no filename is available here, so report the offset.
            # Also send the error detail lines to stderr with the header
            # (they previously went to stdout, mixing with payload data).
            print("warc errors at offset %d" % (offset if offset else 0),
                  file=sys.stderr)
            for e in errors:
                print('\t', e, file=sys.stderr)
def expand_files(files):
    """Yield filenames from files, expanding any s3: prefix into the
    individual keys it matches via a bucket listing."""
    for name in files:
        if name.startswith('s3:'):
            # an s3 "filename" is really a prefix; enumerate matching keys
            for key in list_files(name):
                yield key
        else:
            yield name
class ArcRecord(ArchiveRecord):
    """Represents a record in an arc file."""
    # BUG FIX: the docstring above was originally placed *after* the
    # TRAILER assignment, making it a no-op string expression instead of
    # the class docstring (__doc__ was None).

    TRAILER = b'\n'  # an ARC record is trailed by single unix newline

    def __init__(self, headers=None, content=None, errors=None):
        ArchiveRecord.__init__(self, headers, content, errors)

    @property
    def type(self):
        # every plain ARC record reports itself as a response; the
        # leading file header record overrides this (see ArcRecordHeader)
        return b"response"

    def _write_to(self, out, nl):
        #TODO: empty method? ARC serialization is not implemented
        pass

    @classmethod
    def make_parser(cls):
        """Constructs a parser for arc records."""
        return ArcParser()
def parse(self, stream, offset, line=None):
    """Parses a stream as an arc archive and returns an Arc record along
    with the offset in the stream of the end of the record."""
    record = None
    content_type = None
    content_length = None
    if line is None:
        line = stream.readline()

    # skip blank separator lines; EOF (empty string) ends parsing
    while not line.rstrip():
        if not line:
            return (None, (), offset)
        line = stream.readline()

    if line.startswith(b'filedesc:'):
        # the leading filedesc record declares, inside its own body, the
        # header column names used by every later record in this file
        raw_headers = []
        raw_headers.append(line)
        # read headers named in body of record
        # to assign names to header, to read body of record
        arc_version_line = stream.readline()
        raw_headers.append(arc_version_line)
        arc_names_line = stream.readline()
        raw_headers.append(arc_names_line)

        arc_version = arc_version_line.strip()

        # configure parser instance: the order and number of headers
        # differ between arc v1 and v2
        self.version = arc_version.split()[0]
        self.headers = arc_names_line.strip().split()

        # now we have read header field in record body
        # we can extract the headers from the current record,
        # and read the length field

        # which is in a different place with v1 and v2

        # read headers
        arc_headers = self.parse_header_list(line)

        # extract content, ignoring header lines parsed already
        content_type, content_length, errors = \
            self.get_content_headers(arc_headers)

        # the version/names lines were already consumed out of the body,
        # so shrink the remaining content length accordingly
        content_length = content_length \
            - len(arc_version_line) \
            - len(arc_names_line)

        record = ArcRecordHeader(headers=arc_headers,
                                 version=arc_version,
                                 errors=errors,
                                 raw_headers=raw_headers)
    else:
        if not self.headers:
            # a plain record before any filedesc: we cannot name its fields
            raise Exception('missing filedesc')
        headers = self.parse_header_list(line)
        content_type, content_length, errors = \
            self.get_content_headers(headers)

        record = ArcRecord(headers = headers, errors=errors)

    line = None

    # hand the stream itself to the record; bytes_to_eoc bounds payload
    # reads to the declared content length
    record.content_file = stream
    record.content_file.bytes_to_eoc = content_length

    return (record, (), offset)
def is_gzip_file(file_handle):
    """Return True when the next bytes in file_handle are the two-byte
    gzip magic number.

    Peeks at most two bytes and seeks back by however many were read,
    leaving the file position unchanged.
    """
    magic = file_handle.read(2)
    file_handle.seek(-len(magic), 1)
    return magic == b'\x1f\x8b'
class MixedParser(ArchiveParser):
    """Parser for streams that may interleave warc and arc records,
    dispatching per record on the leading line."""

    def __init__(self):
        self.arc = ArcParser()
        self.warc = WarcParser()

    def parse(self, stream, offset=None, line=None):
        """Read the next record, delegating to the warc parser when the
        first non-blank line starts with b'WARC', otherwise to the arc
        parser. Returns (record, errors, offset); (None, (), offset) at
        end of stream."""
        current = stream.readline() if line is None else line

        while current:
            if current.startswith(b'WARC'):
                return self.warc.parse(stream, offset, line=current)
            if current not in (b'\n', b'\r\n', b'\r'):
                # anything non-blank that is not a warc header is arc
                return self.arc.parse(stream, offset, line=current)
            # skip blank padding between records
            current = stream.readline()

        return None, (), offset
def add_headers(**kwargs):
    """a useful helper for defining header names in record formats"""

    def _add_headers(cls):
        # attach each NAME -> header-bytes pair as a class attribute and
        # record the declared names on _HEADERS
        for attr_name, header in kwargs.items():
            setattr(cls, attr_name, header)
        cls._HEADERS = list(kwargs)
        return cls

    return _add_headers


class ArchiveParser(object):
    """ methods parse, and trim """
    pass
@property
def content_file(self):
    """
    File handle for streaming the payload.

    If the record has been read from a RecordStream, content_file wraps the
    same underlying file handle as the RecordStream itself. This has
    important implications. Results are undefined if you try to read from
    content_file after reading the next record from RecordStream; and
    closing content_file will close the RecordStream, and vice versa.
    But if you avoid these caveats, content_file takes care to bound itself
    within the content-length specified in the warc record, so that reading
    to the end of content_file will bring you only to the end of the
    record's payload.

    When creating a record for writing and supplying content_file, the
    record can only be written once, since writing the record entails
    reading content_file and advancing the file position. Subsequent
    attempts to write using content_file will throw an exception.
    """
    return self._content_file

@content_file.setter
def content_file(self, fh):
    self._content_file = fh
    # a freshly assigned handle is valid until consumed; assigning None
    # marks it invalid so later writes fail loudly
    self._content_file_valid = fh is not None

@property
def content(self):
    """A tuple (content_type, content). When first referenced, content[0]
    is populated from the Content-Type header, and content[1] by reading
    self.content_file."""
    if self._content is None:
        content_type = self.get_header(self.CONTENT_TYPE)
        try:
            content = self.content_file.read()
            self._content = (content_type, content)
        finally:
            # drop the stream reference even on failure so a stale,
            # half-read handle is never reused
            self.content_file = None

    return self._content
Otherwise, 108 | return the value of the Content-Type header.""" 109 | if self._content is None: 110 | content_type = self.get_header(self.CONTENT_TYPE) 111 | if content_type is not None: 112 | return content_type 113 | 114 | return self.content[0] 115 | 116 | @property 117 | def content_length(self): 118 | """If self.content tuple was supplied, or has already been snarfed, or 119 | we don't have a Content-Length header, return len(self.content[1]). 120 | Otherwise, return the value of the Content-Length header.""" 121 | if self._content is None: 122 | content_length = self.get_header(self.CONTENT_LENGTH) 123 | if content_length is not None: 124 | return int(content_length) 125 | 126 | return len(self.content[1]) 127 | 128 | @property 129 | def url(self): 130 | return self.get_header(self.URL) 131 | 132 | def get_header(self, name): 133 | """Returns value of first header found matching name, case 134 | insensitively.""" 135 | for k, v in self.headers: 136 | if name.lower() == k.lower(): 137 | return v 138 | 139 | def set_header(self, name, value): 140 | self.headers = [(k, v) for (k, v) in self.headers if k != name] 141 | self.headers.append((name, value)) 142 | 143 | def dump(self, content=True): 144 | print('Headers:') 145 | for (h, v) in self.headers: 146 | print('\t%s:%s' % (h.decode('latin1'), v.decode('latin1'))) 147 | if content and self.content: 148 | print('Content Headers:') 149 | content_type, content_body = self.content 150 | print('\t' + self.CONTENT_TYPE.decode('latin1'), ':', content_type.decode('latin1')) 151 | print('\t' + self.CONTENT_LENGTH.decode('latin1'), ':', len(content_body)) 152 | print('Content:') 153 | ln = min(1024, len(content_body)) 154 | abbr_strp_content = strip.sub(lambda x: ('\\x%00X' % ord(x.group())).encode('ascii'), content_body[:ln]) 155 | print('\t' + abbr_strp_content.decode('ascii')) 156 | print('\t...') 157 | print() 158 | else: 159 | print('Content: none') 160 | print() 161 | print() 162 | if self.errors: 163 | 
def write_to(self, out, newline=b'\x0D\x0A', gzip=False):
    """Serialize this record to out; with gzip=True the record is
    wrapped in its own gzip member. newline is the format's line
    terminator (CRLF by default, as warc requires)."""
    if self.content_file is not None:
        if not self._content_file_valid:
            # content_file is a live stream and can only be consumed once
            raise Exception('cannot write record because content_file has already been used')

    if gzip:
        # NOTE(review): the hasattr(out, 'mode') test appears to
        # distinguish real files (whose mode GzipFile can inherit) from
        # other writables, which need an explicit append mode - confirm.
        if hasattr(out, 'mode'):
            out = GzipFile(fileobj=out)
        else:
            out = GzipFile(fileobj=out, mode='ab')

    self._write_to(out, newline)

    if gzip:
        # closing the GzipFile finalizes this member; the wrapped
        # stream itself stays open for further records
        out.flush()
        out.close()

    if self.content_file is not None:
        # mark the stream consumed so a second write raises above
        self._content_file_valid = False

def _write_to(self, out, newline):
    # must be overridden by the concrete record class (warc/arc)
    raise AssertionError('this is bad')

### class methods for parsing
@classmethod
def open_archive(cls, filename=None, file_handle=None,
                 mode="rb", gzip="auto", offset=None, length=None):
    """Generically open an archive - magic autodetect"""
    if cls is ArchiveRecord:
        cls = None  # means guess
    return open_record_stream(cls, filename, file_handle, mode, gzip, offset, length)
def open_url(url, offset=None, length=None):
    """Fetch s3://bucket/key into an in-memory buffer and return it
    positioned at the start; offset/length select a byte range via an
    HTTP Range request."""
    p = urlparse(url)
    bucket_name = p.netloc
    key = p.path[1:]
    conn = S3Connection()
    bucket = conn.get_bucket(bucket_name)
    k = Key(bucket)
    k.key = key
    if offset is not None and length is not None:
        headers = {'Range': 'bytes=%d-%d' % (offset, offset + length)}
    elif offset is not None:
        headers = {'Range': 'bytes=%d-' % offset}
    else:
        headers = {}

    s = StringIO()
    # NOTE(review): StringIO is a text buffer on python3 but s3 archive
    # data is bytes - this looks like it should be io.BytesIO; confirm
    # against the python version this path actually runs on.
    k.get_contents_to_file(s, headers=headers)
    s.seek(0)
    return s

def list_files(prefix):
    """Yield an s3://bucket/key url for every key under the given
    s3://bucket/prefix, paging through the listing with a marker until
    the result set is no longer truncated."""
    p = urlparse(prefix)
    bucket_name = p.netloc
    prefix = p.path[1:]

    conn = S3Connection()

    bucket = conn.get_bucket(bucket_name)
    complete = False
    marker = ''

    while not complete:
        rs = bucket.get_all_keys(prefix=prefix, marker=marker, delimiter='')
        for k in rs:
            yield 's3://%s/%s' % (bucket_name, k.key)
            # resume the next page after the last key seen
            marker = k.key

        complete = not rs.is_truncated
def open_record_stream(record_class=None, filename=None, file_handle=None,
                       mode="rb", gzip="auto", offset=None, length=None):
    """Open a (w)arc archive as a RecordStream.

    Can take a filename or a file_handle. Normally called indirectly
    from a record class, i.e. WarcRecord.open_archive. If record_class
    is None, the record type is guessed from the stream contents, and
    gzip="auto" sniffs per-record gzip compression.
    """
    if file_handle is None:
        if filename.startswith('s3://'):
            from . import s3
            # s3 honors offset/length itself via a Range request
            file_handle = s3.open_url(filename, offset=offset, length=length)
        else:
            file_handle = open(filename, mode=mode)
            if offset is not None:
                file_handle.seek(offset)

    # IDIOM FIX: use `is None` rather than `== None` for None checks
    if record_class is None:
        record_class = guess_record_type(file_handle)

    if record_class is None:
        # BUG FIX: the original message said "Failed to guess compression"
        # but what failed here is detecting the record type
        raise Exception('Failed to guess record type')

    record_parser = record_class.make_parser()

    if gzip == 'auto':
        if (filename and filename.endswith('.gz')) or is_gzip_file(file_handle):
            gzip = 'record'
            #debug('autodetect: record gzip')
        else:
            # assume uncompressed file
            #debug('autodetected: uncompressed file')
            gzip = None

    if gzip == 'record':
        return GzipRecordStream(file_handle, record_parser)
    elif gzip == 'file':
        return GzipFileStream(file_handle, record_parser)
    else:
        return RecordStream(file_handle, record_parser)
def seek(self, offset, pos=0):
    """Same as a seek on a file"""
    self.fh.seek(offset, pos)

def read_records(self, limit=1, offsets=True):
    """Yield a tuple of (offset, record, errors) where
    Offset is either a number or None.
    Record is an object and errors is an empty list
    or record is none and errors is a list"""
    nrecords = 0
    while limit is None or nrecords < limit:
        offset, record, errors = self._read_record(offsets)
        nrecords += 1
        yield (offset, record, errors)
        # a falsy record marks end-of-stream (or an unparseable tail)
        if not record:
            break

def __iter__(self):
    # iterate records only; decoding errors become exceptions here,
    # unlike read_records which reports them per tuple
    while True:
        _, record, errors = self._read_record(offsets=False)
        if record:
            yield record
        elif errors:
            error_str = ",".join(str(error) for error in errors)
            raise Exception("Errors while decoding %s" % error_str)
        else:
            break

def _read_record(self, offsets):
    """overridden by sub-classes to read individual records"""
    if self.bytes_to_eoc is not None:
        self._skip_to_eoc()  # skip to end of previous record
        self.bytes_to_eoc = None

    # handle any sort of valid or invalid record terminator
    while True:
        offset = self.fh.tell() if offsets else None
        line = self.fh.readline()
        if not re.match(br'^[\r\n]+$', line):
            break

    # the parser receives this stream itself, so the returned record's
    # content_file wraps the same underlying handle
    record, errors, offset = self.record_parser.parse(self, offset, line)
    return offset, record, errors

def write(self, record):
    """Writes an archive record to the stream"""
    record.write_to(self)

def close(self):
    """Close the underlying file handle."""
    self.fh.close()
Exception('expected {} bytes but only read {}'.format(read_size, len(buf))) 121 | 122 | def _read(self, count=None): 123 | """Raw read, will read into next record if caller isn't careful""" 124 | if count is not None: 125 | result = self.fh.read(count) 126 | else: 127 | result = self.fh.read() 128 | 129 | if self.bytes_to_eoc is not None: 130 | self.bytes_to_eoc -= len(result) 131 | 132 | return result 133 | 134 | def read(self, count=None): 135 | """Safe read for reading content, will not read past the end of the 136 | payload, assuming self.bytes_to_eoc is set. The record's trailing 137 | bytes, \\r\\n\\r\\n for warcs or \\n for arcs, will remain when this 138 | method returns "". 139 | """ 140 | if self.bytes_to_eoc is not None and count is not None: 141 | read_size = min(count, self.bytes_to_eoc) 142 | elif self.bytes_to_eoc is not None: 143 | read_size = self.bytes_to_eoc 144 | elif count is not None: 145 | read_size = count 146 | else: 147 | read_size = None 148 | 149 | return self._read(read_size) 150 | 151 | # XXX dumb implementation to support python3 http.client 152 | def readinto(self, b): 153 | tmp = self.read(count=len(b)) 154 | b[:len(tmp)] = tmp 155 | return len(tmp) 156 | 157 | def readline(self, maxlen=None): 158 | """Safe readline for reading content, will not read past the end of the 159 | payload, assuming self.bytes_to_eoc is set. The record's trailing 160 | bytes, \\r\\n\\r\\n for valid warcs or \\n for valid arcs, will remain 161 | when this method returns "". 
162 | """ 163 | if self.bytes_to_eoc is not None and maxlen is not None: 164 | lim = min(maxlen, self.bytes_to_eoc) 165 | elif self.bytes_to_eoc is not None: 166 | lim = self.bytes_to_eoc 167 | elif maxlen is not None: 168 | lim = maxlen 169 | else: 170 | lim = None 171 | 172 | if lim is not None: 173 | result = self.fh.readline(lim) 174 | else: 175 | result = self.fh.readline() 176 | 177 | if self.bytes_to_eoc is not None: 178 | self.bytes_to_eoc -= len(result) 179 | return result 180 | 181 | CHUNK_SIZE = 8192 # the size to read in, make this bigger things go faster. 182 | 183 | class GeeZipFile(gzip.GzipFile): 184 | """Extends gzip.GzipFile to remember self.member_offset, the raw file 185 | offset of the current gzip member.""" 186 | 187 | def __init__(self, filename=None, mode=None, 188 | compresslevel=9, fileobj=None, mtime=None): 189 | # ignore mtime for python 2.6 190 | gzip.GzipFile.__init__(self, filename=filename, mode=mode, compresslevel=compresslevel, fileobj=fileobj) 191 | self.member_offset = None 192 | 193 | # hook in to the place we seem to be able to reliably get the raw gzip 194 | # member offset 195 | def _read(self, size=1024): 196 | if self._new_member: 197 | try: 198 | # works for python3.2 199 | self.member_offset = self.fileobj.tell() - self.fileobj._length + (self.fileobj._read or 0) 200 | except AttributeError: 201 | # works for python2.7 202 | self.member_offset = self.fileobj.tell() 203 | 204 | return gzip.GzipFile._read(self, size) 205 | 206 | class GzipRecordStream(RecordStream): 207 | """A stream to read/write concatted file made up of gzipped 208 | archive records""" 209 | def __init__(self, file_handle, record_parser): 210 | RecordStream.__init__(self, GeeZipFile(fileobj=file_handle), record_parser) 211 | self.raw_fh = file_handle 212 | 213 | def _read_record(self, offsets): 214 | if self.bytes_to_eoc is not None: 215 | self._skip_to_eoc() # skip to end of previous record 216 | self.bytes_to_eoc = None 217 | 218 | # handle any sort 
of valid or invalid record terminator 219 | while True: 220 | line = self.fh.readline() 221 | if not re.match(br'^[\r\n]+$', line): 222 | break 223 | 224 | record, errors, _offset = \ 225 | self.record_parser.parse(self, offset=None, line=line) 226 | 227 | offset = self.fh.member_offset 228 | 229 | return offset, record, errors 230 | 231 | def seek(self, offset, pos=0): 232 | """Same as a seek on a file""" 233 | self.raw_fh.seek(offset, pos) 234 | # trick to avoid closing and recreating GzipFile, does it always work? 235 | self.fh._new_member = True 236 | 237 | class GzipFileStream(RecordStream): 238 | """A stream to read/write gzipped file made up of all archive records""" 239 | def __init__(self, file_handle, record): 240 | RecordStream.__init__(self, gzip.GzipFile(fileobj=file_handle), record) 241 | 242 | def _read_record(self, offsets): 243 | # no useful offsets in a gzipped file 244 | if self.bytes_to_eoc is not None: 245 | self._skip_to_eoc() # skip to end of previous record 246 | self.bytes_to_eoc = None 247 | 248 | # handle any sort of valid or invalid record terminator 249 | while True: 250 | line = self.fh.readline() 251 | if not re.match(br'^[\r\n]+$', line): 252 | break 253 | 254 | record, errors, _offset = \ 255 | self.record_parser.parse(self, offset=None, line=line) 256 | 257 | return offset, record, errors 258 | 259 | -------------------------------------------------------------------------------- /hanzo/warctools/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/warctools/21db132fd3e4b4042cd011d9dc3fb30276a5a0b6/hanzo/warctools/tests/__init__.py -------------------------------------------------------------------------------- /hanzo/warctools/tests/test_warctools.py: -------------------------------------------------------------------------------- 1 | # vim: set sw=4 et: 2 | 3 | import unittest 4 | 5 | # want unittest2 for python2.6 6 | try: 7 | 
unittest.TestCase.assertIsNone 8 | except AttributeError: 9 | import unittest2 10 | unittest = unittest2 11 | 12 | import tempfile 13 | import gzip 14 | from hanzo import warctools, httptools 15 | 16 | try: 17 | from io import BytesIO 18 | except ImportError: 19 | from StringIO import StringIO 20 | BytesIO = StringIO 21 | 22 | class ArcRecordTerminatorTest(unittest.TestCase): 23 | REC1_CONTENT = (b'1 0 InternetArchive\n' 24 | + b'URL IP-address Archive-date Content-type Archive-length\n' 25 | + b'Here is some funky arc header content!\n') 26 | RECORD1 = b'filedesc://ArcRecordTerminatorTest.arc 0.0.0.0 20131113000000 text/plain ' + str(len(REC1_CONTENT)).encode('ascii') + b'\n' + REC1_CONTENT 27 | 28 | REC2_CONTENT = (b'HTTP/1.1 200 OK\r\n' 29 | + b'Content-Type: text/plain\r\n' 30 | + b'Content-Length: 12\r\n' 31 | + b'\r\n' 32 | + b'01234567890\r\n') 33 | RECORD2 = b'http://example.org/ 192.168.1.1 20131113000000 text/plain ' + str(len(REC2_CONTENT)).encode('ascii') + b'\n' + REC2_CONTENT 34 | 35 | REC1_GZ = b"\x1f\x8b\x08\x00\xbf\xa9\x99R\x02\xff=NK\x0e\x820\x14\xdc\xf7\x14\xcf\x03\xf0\xa9\xc4\x8d;\xe3F\x12\x17\x86\xe0\x01\x9av\x90Fh\xc9\xeb\xd3\xc8\xedE4\xce\xec\xe6\x97\xe9\xfc\x00\x87d\xf7Eq`\xdb\xc0Fv-x\xf4\xc1H\xe4\x16Ir\xc3\x96\xca|%mK]i\xad\xabr\x05\t^RL\x83\xf1\x81\xb4\xde)M%\xd5A\xc0\x01\xb2\xac\xf5\xfe\tum\xceT_2\xe3\x1c#%\xfa\xc9\x993\x02:\xc6%\x1c$\x93y\xc2\xdf\x19\x10n\xd2\xab\x13\x18\xe4\x13\xa58\x82\xbaG\xb8\xcf\xf49\xd2\xc380\xd9os\xa3\xd4\x1b\xa0\xa9\x1c5\xc1\x00\x00\x00" 36 | REC2_GZ = b"\x1f\x8b\x08\x00\xbf\xa9\x99R\x02\xffM\xca1\x0e\xc20\x0c@\xd1\xddR\xee\xe0\x0b\x10\xdb\t\xb4iV\x16$\x90`\xc8\x05:X-RI#\xe4\xa1\xdc\x1e\t\x06\xf8\xeb\x7f\xb3Y\xcbD\xba\x8d\x8f\xb6\xa8_\x9f\x13\xa1\x0c\xc1K\x97\xbcx\xc1\xc0\x12E$\xf2'4\xdd\x8c\xda2\xde+\xf6\tN\xa5\xdc\xe8\xab\x18\xafg\x07\xc7\xb5\x9aV\xdb\x95W\xd3\xfc\x87\x7f\xe7\xa2u\xb29\xa3\x04\x07\x0eXB\xdc\x1f\xba>\r\xec\x00\xde#Pz\x9d\x8c\x00\x00\x00" 37 | 38 | def _arc_gz(self, terminator=b'\r\n\r\n'): 39 | 
return BytesIO(self.REC1_GZ + self.REC2_GZ) 40 | 41 | def _arc(self, terminator): 42 | s = self.RECORD1 + terminator + self.RECORD2 + terminator 43 | f = BytesIO(s) 44 | return f 45 | 46 | def _test_terminator(self, terminator): 47 | # print('testing warc with record terminator {}'.format(repr(terminator))) 48 | fin = self._arc(terminator) 49 | try: 50 | self._run_checks(fin, terminator, False) 51 | finally: 52 | fin.close() 53 | 54 | fin = self._arc_gz(terminator) 55 | try: 56 | self._run_checks(fin, terminator, True) 57 | finally: 58 | fin.close() 59 | 60 | def _run_checks(self, fin, terminator, gzipped): 61 | fh = warctools.ArchiveRecord.open_archive(file_handle=fin) 62 | try: 63 | i = 0 64 | for (offset, record, errors) in fh.read_records(limit=None, offsets=True): 65 | if i == 0: 66 | self.assertEqual(offset, 0) 67 | self.assertEqual(type(record), warctools.arc.ArcRecordHeader) 68 | self.assertEqual(record.type, b'filedesc') 69 | self.assertEqual(record.content_type, b'text/plain') 70 | # content_length != len(record.content[1]) here because 71 | # ArcParser reads and parses part of the "content" of the 72 | # arc header record 73 | self.assertEqual(record.content_length, 115) 74 | self.assertEqual(record.content[1], b'Here is some funky arc header content!\n') 75 | elif i == 1: 76 | if not gzipped: 77 | self.assertEqual(offset, len(self.RECORD1) + len(terminator)) 78 | else: 79 | self.assertEqual(offset, len(self.REC1_GZ)) 80 | self.assertEqual(type(record), warctools.arc.ArcRecord) 81 | self.assertEqual(record.type, b'response') 82 | self.assertEqual(record.content_type, b'text/plain') 83 | self.assertEqual(record.content_length, 78) 84 | self.assertEqual(record.content[1], b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 12\r\n\r\n01234567890\r\n') 85 | elif i == 2: 86 | if not gzipped: 87 | self.assertEqual(offset, len(self.RECORD1) + len(self.RECORD2) + 2 * len(terminator)) 88 | else: 89 | self.assertLess(offset, len(self.RECORD1) + 
len(self.RECORD2) + 2 * len(terminator)) 90 | self.assertIsNone(record) 91 | else: 92 | self.fail('this line should not be reached') 93 | 94 | i += 1 95 | finally: 96 | fh.close() 97 | 98 | def runTest(self): 99 | # anything works as long as it contains only \r and \n and ends with \n 100 | self._test_terminator(b'\n') # the good one 101 | self._test_terminator(b'\r\n\r\n') 102 | self._test_terminator(b'\r\n') 103 | self._test_terminator(b'\n\r\n') 104 | self._test_terminator(b'\n\n\r\n') 105 | self._test_terminator(b'\r\n\n') 106 | self._test_terminator(b'\r\n\r\n\r\n') 107 | self._test_terminator(b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n') 108 | self._test_terminator(b'\n\n') 109 | self._test_terminator(b'\n\n\n') 110 | self._test_terminator(b'\n\n\n\n') 111 | self._test_terminator(b'\r\n\n\r\n\n') 112 | self._test_terminator(b'\r\r\r\r\r\r\n') 113 | self._test_terminator(b'\r\r\r\r\r\r\n\n') 114 | self._test_terminator(b'\r\r\r\r\r\r\n\n\n') 115 | 116 | class WarcRecordTerminatorTest(unittest.TestCase): 117 | RECORD1 = (b'WARC/1.0\r\n' 118 | + b'WARC-Record-ID: \r\n' 119 | + b'WARC-Type: warcinfo\r\n' 120 | + b'Content-Type: application/warc-fields\r\n' 121 | + b'Content-Length: 30\r\n' 122 | + b'\r\n' 123 | + b'format: WARC File Format 1.0\r\n') 124 | 125 | RECORD2 = (b'WARC/1.0\r\n' 126 | + b'WARC-Type: response\r\n' 127 | + b'WARC-Record-ID: \r\n' 128 | + b'WARC-Target-URI: http://example.org/\r\n' 129 | + b'Content-Type: application/http;msgtype=response\r\n' 130 | + b'Content-Length: 78\r\n' 131 | + b'\r\n' 132 | + b'HTTP/1.1 200 OK\r\n' 133 | + b'Content-Type: text/plain\r\n' 134 | + b'Content-Length: 12\r\n' 135 | + b'\r\n' 136 | + b'01234567890\r\n') 137 | 138 | RECORD1_GZ = b'\x1f\x8b\x08\x00\xce\xae\x99R\x02\xff\x0bw\x0cr\xd67\xd43\xe0\xe5\n\x07\xb2t\x83R\x93\xf3\x8bRt=]\xac\x14lJ\x8b\xf2\xacJK3S\xac\x0c\xa0@\x17\x0b\x01\x03vP\x03B*\x0bR\xad\x14\xca\x13\x8b\x923\xf3\xd2\xf2y\xb9\x9c\xf3\xf3JR\xf3J\xa0\xe2\x89\x05\x059\x99\xc9\x89%\x99\xf9y\xfa 
5\xbai\x99\xa99)\xc5\x08e>\xa9y\xe9%\x19V\n\xc6@\x07\xf1r\xa5\xe5\x17\xe5&\x96X)\x80LVp\xcb\xccIUp\x03\x8b(\x80\x1d\x0c\x82\x00\x04h\xbe\xd2\xbf\x00\x00\x00' 139 | RECORD2_GZ = b'\x1f\x8b\x08\x00\xce\xae\x99R\x02\xffm\x8f\xc9\n\xc20\x10\x86\xef\x81\xbcC^\xa0MR\x97j\\@\xeaAQPJ\xa5\xe7\xa0C-\xd4$\xa4S\xd0\xb7\xb7\x85\x16A\xfd\x0f\xc3\xac\xdf\xcc\xe4\x9b4\xe12\x14\x94\xe4\xad\x17d/\x07\x8ay\xa8\x9d55\xf4\xc9\x14\xae\xd6\xdf\x82\xfdV\xb1e\xe3\x8dj\x9a\xf2\xa6D\xaf\xe0\x8f\xe9%\xd7\x03U\xfb\x020\xb8\xa4{\xc5\xee\x88Nq\x0eO\xfdp\x15\x84\xd6\x17\x9c\x92\xc4\x1a\x04\x83\xfdz\xed\\U^5\x96\xd6\xf0\xae}\xf1\xa8\x0bl+\xab\xcf]\xc3\xc0\x11L\x81w\xc5\xe2\x19%\x94\xec\xb2\xec\xdc>#Y$\x04;\x1d\xbe\xb9\x08O\xe4\xae\xd2\xa5\xf9\x05\xc8\xa8\x03\x08\x19\x8d\xc6\x93i<\x9b\x8b.\xa4\xe4\rV`\x1c`\x1f\x01\x00\x00' 140 | 141 | def _warc_gz(self, terminator=b'\r\n\r\n'): 142 | return BytesIO(self.RECORD1_GZ + self.RECORD2_GZ) 143 | 144 | def _warc(self, terminator): 145 | s = self.RECORD1 + terminator + self.RECORD2 + terminator 146 | f = BytesIO(s) 147 | return f 148 | 149 | def _test_terminator(self, terminator): 150 | # print('testing warc with record terminator {}'.format(repr(terminator))) 151 | fin = self._warc(terminator) 152 | try: 153 | self._run_checks(fin, terminator, False) 154 | finally: 155 | fin.close() 156 | 157 | fin = self._warc_gz(terminator) 158 | try: 159 | self._run_checks(fin, terminator, True) 160 | finally: 161 | fin.close() 162 | 163 | def _run_checks(self, fin, terminator, gzipped): 164 | fh = warctools.ArchiveRecord.open_archive(file_handle=fin) 165 | try: 166 | i = 0 167 | for (offset, record, errors) in fh.read_records(limit=None, offsets=True): 168 | if i == 0: 169 | self.assertEqual(offset, 0) 170 | self.assertEqual(type(record), warctools.warc.WarcRecord) 171 | self.assertEqual(record.type, b'warcinfo') 172 | self.assertEqual(record.content_type, b'application/warc-fields') 173 | self.assertEqual(record.content_length, 30) 174 | 
self.assertEqual(record.content[1], b'format: WARC File Format 1.0\r\n') 175 | elif i == 1: 176 | if not gzipped: 177 | self.assertEqual(offset, len(self.RECORD1) + len(terminator)) 178 | else: 179 | self.assertEqual(offset, len(self.RECORD1_GZ)) 180 | self.assertEqual(type(record), warctools.warc.WarcRecord) 181 | self.assertEqual(record.type, b'response') 182 | self.assertEqual(record.content_type, b'application/http;msgtype=response') 183 | self.assertEqual(record.content_length, 78) 184 | self.assertEqual(record.content[1], b'HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 12\r\n\r\n01234567890\r\n') 185 | elif i == 2: 186 | if not gzipped: 187 | self.assertEqual(offset, len(self.RECORD1) + len(self.RECORD2) + 2 * len(terminator)) 188 | else: 189 | self.assertLess(offset, len(self.RECORD1) + len(self.RECORD2) + 2 * len(terminator)) 190 | self.assertIsNone(record) 191 | else: 192 | self.fail('this line should not be reached') 193 | 194 | i += 1 195 | finally: 196 | fh.close() 197 | 198 | def runTest(self): 199 | # anything works as long as it contains only \r and \n and ends with \n 200 | self._test_terminator(b'\r\n\r\n') # the good one 201 | self._test_terminator(b'\r\n') 202 | self._test_terminator(b'\n\r\n') 203 | self._test_terminator(b'\n\n\r\n') 204 | self._test_terminator(b'\r\n\n') 205 | self._test_terminator(b'\r\n\r\n\r\n') 206 | self._test_terminator(b'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n') 207 | self._test_terminator(b'\n') 208 | self._test_terminator(b'\n\n') 209 | self._test_terminator(b'\n\n\n') 210 | self._test_terminator(b'\n\n\n\n') 211 | self._test_terminator(b'\r\n\n\r\n\n') 212 | self._test_terminator(b'\r\r\r\r\r\r\n') 213 | self._test_terminator(b'\r\r\r\r\r\r\n\n') 214 | self._test_terminator(b'\r\r\r\r\r\r\n\n\n') 215 | 216 | 217 | class WarcWritingTest(unittest.TestCase): 218 | 219 | # XXX should this a part of the library? 
220 | def build_warc_record(self, url, warc_date=None, content_buffer=None, 221 | content_file=None, content_length=None, concurrent_to=None, 222 | warc_type=None, content_type=None, remote_ip=None, profile=None, 223 | refers_to=None, refers_to_target_uri=None, refers_to_date=None, 224 | record_id=None, block_digest=None, payload_digest=None): 225 | 226 | if warc_date is None: 227 | warc_date = warctools.warc.warc_datetime_str(datetime.now()) 228 | 229 | if record_id is None: 230 | record_id = warctools.WarcRecord.random_warc_uuid() 231 | 232 | headers = [] 233 | if warc_type is not None: 234 | headers.append((warctools.WarcRecord.TYPE, warc_type)) 235 | headers.append((warctools.WarcRecord.ID, record_id)) 236 | headers.append((warctools.WarcRecord.DATE, warc_date)) 237 | headers.append((warctools.WarcRecord.URL, url)) 238 | if remote_ip is not None: 239 | headers.append((warctools.WarcRecord.IP_ADDRESS, remote_ip)) 240 | if profile is not None: 241 | headers.append((warctools.WarcRecord.PROFILE, profile)) 242 | if refers_to is not None: 243 | headers.append((warctools.WarcRecord.REFERS_TO, refers_to)) 244 | if refers_to_target_uri is not None: 245 | headers.append((warctools.WarcRecord.REFERS_TO_TARGET_URI, refers_to_target_uri)) 246 | if refers_to_date is not None: 247 | headers.append((warctools.WarcRecord.REFERS_TO_DATE, refers_to_date)) 248 | if concurrent_to is not None: 249 | headers.append((warctools.WarcRecord.CONCURRENT_TO, concurrent_to)) 250 | if content_type is not None: 251 | headers.append((warctools.WarcRecord.CONTENT_TYPE, content_type)) 252 | if content_length is not None: 253 | headers.append((warctools.WarcRecord.CONTENT_LENGTH, content_length)) 254 | if block_digest is not None: 255 | headers.append((warctools.WarcRecord.BLOCK_DIGEST, block_digest)) 256 | if payload_digest is not None: 257 | headers.append((warctools.WarcRecord.BLOCK_DIGEST, payload_digest)) 258 | 259 | if content_file is not None: 260 | assert content_buffer is None 261 | 
assert content_length is not None 262 | record = warctools.WarcRecord(headers=headers, content_file=content_file) 263 | else: 264 | assert content_buffer is not None 265 | content_tuple = (content_type, content_buffer) 266 | record = warctools.WarcRecord(headers=headers, content=content_tuple) 267 | 268 | return record 269 | 270 | def build_record_using_tuple(self): 271 | content_buffer = b'Luke, I am your payload' 272 | record = self.build_warc_record(url=b'http://example.org/', 273 | content_buffer=content_buffer, 274 | record_id=b'', 275 | warc_date=b'2013-11-15T00:00:00Z', 276 | warc_type=warctools.WarcRecord.RESPONSE, 277 | content_type=httptools.RequestMessage.CONTENT_TYPE) 278 | return record 279 | 280 | def build_record_using_stream(self): 281 | content_buffer = b'Shmuke, I gam four snayglob' 282 | fh = BytesIO(content_buffer) 283 | record = self.build_warc_record(url=b'http://example.org/', 284 | content_file=fh, content_length=str(len(content_buffer)).encode('ascii'), 285 | record_id=b'', 286 | warc_date=b'2013-11-15T00:00:00Z', 287 | warc_type=warctools.WarcRecord.RESPONSE, 288 | content_type=httptools.RequestMessage.CONTENT_TYPE) 289 | return record 290 | 291 | 292 | def test_write_using_tuple(self): 293 | record = self.build_record_using_tuple() 294 | 295 | f = BytesIO() 296 | record.write_to(f) 297 | self.assertEqual(f.getvalue(), 298 | b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') 299 | f.close() 300 | 301 | # should work again if we do it again 302 | f = BytesIO() 303 | record.write_to(f) 304 | self.assertEqual(f.getvalue(), 305 | b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your 
payload\r\n\r\n') 306 | f.close() 307 | 308 | 309 | def test_write_using_tuple_gz(self): 310 | record = self.build_record_using_tuple() 311 | 312 | f = BytesIO() 313 | record.write_to(f, gzip=True) 314 | f.seek(0) 315 | g = gzip.GzipFile(fileobj=f, mode='rb') 316 | self.assertEqual(g.read(), b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') 317 | g.close() 318 | f.close() 319 | 320 | # should work again if we do it again 321 | f = BytesIO() 322 | record.write_to(f, gzip=True) 323 | f.seek(0) 324 | g = gzip.GzipFile(fileobj=f, mode='rb') 325 | self.assertEqual(g.read(), b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 23\r\n\r\nLuke, I am your payload\r\n\r\n') 326 | g.close() 327 | f.close() 328 | 329 | 330 | def test_write_using_stream(self): 331 | record = self.build_record_using_stream() 332 | 333 | f = BytesIO() 334 | record.write_to(f) 335 | self.assertEqual(f.getvalue(), 336 | b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: \r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 27\r\n\r\nShmuke, I gam four snayglob\r\n\r\n') 337 | f.close() 338 | 339 | # throws exception because record.content_file position has advanced 340 | f = BytesIO() 341 | with self.assertRaises(Exception): 342 | record.write_to(f) 343 | f.close() 344 | 345 | 346 | def test_write_using_stream_gz(self): 347 | record = self.build_record_using_stream() 348 | 349 | f = BytesIO() 350 | record.write_to(f, gzip=True) 351 | f.seek(0) 352 | g = gzip.GzipFile(fileobj=f, mode='rb') 353 | self.assertEqual(g.read(), b'WARC/1.0\r\nWARC-Type: response\r\nWARC-Record-ID: 
\r\nWARC-Date: 2013-11-15T00:00:00Z\r\nWARC-Target-URI: http://example.org/\r\nContent-Type: application/http;msgtype=request\r\nContent-Length: 27\r\n\r\nShmuke, I gam four snayglob\r\n\r\n') 354 | g.close() 355 | f.close() 356 | 357 | # throws exception because record.content_file position has advanced 358 | f = BytesIO() 359 | with self.assertRaises(Exception): 360 | record.write_to(f, gzip=True) 361 | f.close() 362 | 363 | 364 | if __name__ == '__main__': 365 | unittest.main() 366 | -------------------------------------------------------------------------------- /hanzo/warctools/warc.py: -------------------------------------------------------------------------------- 1 | """An object to represent warc records, using the abstract record in 2 | record.py""" 3 | 4 | import re 5 | import hashlib 6 | from hanzo.warctools.record import ArchiveRecord, ArchiveParser 7 | from hanzo.warctools.archive_detect import register_record_type 8 | import uuid 9 | 10 | bad_lines = 5 # when to give up looking for the version stamp 11 | 12 | 13 | @ArchiveRecord.HEADERS( 14 | DATE=b'WARC-Date', 15 | TYPE=b'WARC-Type', 16 | ID=b'WARC-Record-ID', 17 | CONCURRENT_TO=b'WARC-Concurrent-To', 18 | REFERS_TO=b'WARC-Refers-To', 19 | REFERS_TO_TARGET_URI=b'WARC-Refers-To-Target-URI', 20 | REFERS_TO_DATE=b'WARC-Refers-To-Date', 21 | CONTENT_LENGTH=b'Content-Length', 22 | CONTENT_TYPE=b'Content-Type', 23 | URL=b'WARC-Target-URI', 24 | BLOCK_DIGEST=b'WARC-Block-Digest', 25 | PAYLOAD_DIGEST=b'WARC-Payload-Digest', 26 | IP_ADDRESS=b'WARC-IP-Address', 27 | FILENAME=b'WARC-Filename', 28 | WARCINFO_ID=b'WARC-Warcinfo-ID', 29 | PROFILE=b'WARC-Profile' 30 | ) 31 | class WarcRecord(ArchiveRecord): 32 | 33 | # Pylint is very bad at decorators, E1101 is the message that says 34 | # a member variable does not exist 35 | 36 | # pylint: disable-msg=E1101 37 | 38 | VERSION = b"WARC/1.0" 39 | VERSION18 = b"WARC/0.18" 40 | VERSION17 = b"WARC/0.17" 41 | RESPONSE = b"response" 42 | RESOURCE = b"resource" 43 | 
REQUEST = b"request" 44 | REVISIT = b"revisit" 45 | METADATA = b"metadata" 46 | CONVERSION = b"conversion" 47 | WARCINFO = b"warcinfo" 48 | 49 | PROFILE_IDENTICAL_PAYLOAD_DIGEST = b"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest" 50 | 51 | TRAILER = b'\r\n\r\n' 52 | 53 | def __init__(self, version=VERSION, headers=None, content=None, 54 | errors=None, content_file=None): 55 | """ 56 | WarcRecord constructor. 57 | 58 | Either content or content_file must be provided, but not both. If 59 | content, which is a tuple (content_type, content_buffer), is provided, 60 | when writing the warc record, any Content-Type and Content-Length that 61 | appear in the supplied headers are ignored, and the values content[0] 62 | and len(content[1]), respectively, are used. 63 | 64 | When reading, the caller can stream content_file or use content, which is 65 | lazily filled using content_file, and after which content_file is 66 | unavailable. 67 | """ 68 | ArchiveRecord.__init__(self, headers, content, errors) 69 | self.version = version 70 | self.content_file = content_file 71 | 72 | @property 73 | def id(self): 74 | return self.get_header(self.ID) 75 | 76 | def _write_to(self, out, nl): 77 | """WARC Format: 78 | VERSION NL 79 | (Key: Value NL)* 80 | NL 81 | CONTENT NL 82 | NL 83 | 84 | don't write multi line headers 85 | """ 86 | out.write(self.version) 87 | out.write(nl) 88 | for k, v in self.headers: 89 | if self.content_file is not None or k not in (self.CONTENT_TYPE, self.CONTENT_LENGTH): 90 | out.write(k) 91 | out.write(b": ") 92 | out.write(v) 93 | out.write(nl) 94 | 95 | if self.content_file is not None: 96 | out.write(nl) # end of header blank nl 97 | while True: 98 | buf = self.content_file.read(8192) 99 | if buf == b'': break 100 | out.write(buf) 101 | else: 102 | # if content tuple is provided, set Content-Type and 103 | # Content-Length based on the values in the tuple 104 | content_type, content_buffer = self.content 105 | 106 | if content_type: 107 | 
out.write(self.CONTENT_TYPE) 108 | out.write(b": ") 109 | out.write(content_type) 110 | out.write(nl) 111 | if content_buffer is None: 112 | content_buffer = b"" 113 | 114 | content_length = len(content_buffer) 115 | out.write(self.CONTENT_LENGTH) 116 | out.write(b": ") 117 | out.write(str(content_length).encode('ascii')) 118 | out.write(nl) 119 | 120 | out.write(nl) # end of header blank nl 121 | if content_buffer: 122 | out.write(content_buffer) 123 | 124 | # end of record nl nl 125 | out.write(nl) 126 | out.write(nl) 127 | out.flush() 128 | 129 | def repair(self): 130 | pass 131 | 132 | def validate(self): 133 | return self.errors 134 | 135 | @classmethod 136 | def make_parser(self): 137 | return WarcParser() 138 | 139 | def block_digest(self, content_buffer): 140 | block_hash = hashlib.sha256() 141 | block_hash.update(content_buffer) 142 | 143 | digest = "sha256:%s" % block_hash.hexdigest() 144 | return digest 145 | 146 | @staticmethod 147 | def warc_uuid(text): 148 | return "".format(uuid.UUID(hashlib.sha1(text).hexdigest()[0:32])).encode('ascii') 149 | 150 | @staticmethod 151 | def random_warc_uuid(): 152 | return "".format(uuid.uuid4()).encode('ascii') 153 | 154 | 155 | def rx(pat): 156 | """Helper to compile regexps with IGNORECASE option set.""" 157 | return re.compile(pat, flags=re.IGNORECASE) 158 | 159 | version_rx = rx(br'^(?P.*?)(?P\s*WARC/(?P.*?))' 160 | b'(?P\r\n|\r|\n)\\Z') 161 | # a header is key: value plus any following lines with leading whitespace 162 | header_rx = rx(br'^(?P.*?):\s?(?P.*?)' b'(?P\r\n|\r|\n)\\Z') 163 | value_rx = rx(br'^\s+(?P.+?)' b'(?P\r\n|\r|\n)\\Z') 164 | nl_rx = rx(b'^(?P\r\n|\r|\n\\Z)') 165 | length_rx = rx(b'^' + WarcRecord.CONTENT_LENGTH + b'$' ) # pylint: disable-msg=E1101 166 | type_rx = rx(b'^' + WarcRecord.CONTENT_TYPE + b'$') # pylint: disable-msg=E1101 167 | 168 | required_headers = set(( 169 | WarcRecord.TYPE.lower(), # pylint: disable-msg=E1101 170 | WarcRecord.ID.lower(), # pylint: disable-msg=E1101 171 | 
WarcRecord.CONTENT_LENGTH.lower(), # pylint: disable-msg=E1101 172 | WarcRecord.DATE.lower(), # pylint: disable-msg=E1101 173 | )) 174 | 175 | 176 | class WarcParser(ArchiveParser): 177 | KNOWN_VERSIONS = set((b'1.0', b'0.17', b'0.18')) 178 | 179 | def parse(self, stream, offset, line=None): 180 | """Reads a warc record from the stream, returns a tuple 181 | (record, errors). Either records is null or errors is 182 | null. Any record-specific errors are contained in the record - 183 | errors is only used when *nothing* could be parsed""" 184 | # pylint: disable-msg=E1101 185 | errors = [] 186 | version = None 187 | # find WARC/.* 188 | if line is None: 189 | line = stream.readline() 190 | 191 | while line: 192 | match = version_rx.match(line) 193 | 194 | if match: 195 | version = match.group('version') 196 | if offset is not None: 197 | offset += len(match.group('prefix')) 198 | break 199 | else: 200 | if offset is not None: 201 | offset += len(line) 202 | if not nl_rx.match(line): 203 | errors.append(('ignored line', line)) 204 | if len(errors) > bad_lines: 205 | errors.append(('too many errors, giving up hope',)) 206 | return (None, errors, offset) 207 | line = stream.readline() 208 | if not line: 209 | if version: 210 | errors.append(('warc version but no headers', version)) 211 | return (None, errors, offset) 212 | if line: 213 | content_length = 0 214 | content_type = None 215 | 216 | record = WarcRecord(errors=errors, version=version) 217 | 218 | if match.group('nl') != b'\x0d\x0a': 219 | record.error('incorrect newline in version', match.group('nl')) 220 | 221 | if match.group('number') not in self.KNOWN_VERSIONS: 222 | record.error('version field is not known (%s)' 223 | % (",".join(self.KNOWN_VERSIONS)), 224 | match.group('number')) 225 | 226 | prefix = match.group('prefix') 227 | 228 | if prefix: 229 | record.error('bad prefix on WARC version header', prefix) 230 | 231 | #Read headers 232 | line = stream.readline() 233 | while line and not 
nl_rx.match(line): 234 | 235 | #print 'header', repr(line) 236 | match = header_rx.match(line) 237 | if match: 238 | if match.group('nl') != b'\x0d\x0a': 239 | record.error('incorrect newline in header', 240 | match.group('nl')) 241 | name = match.group('name').strip() 242 | value = [match.group('value').strip()] 243 | #print 'match',name, value 244 | 245 | line = stream.readline() 246 | match = value_rx.match(line) 247 | while match: 248 | #print 'follow', repr(line) 249 | if match.group('nl') != b'\x0d\x0a': 250 | record.error('incorrect newline in follow header', 251 | line, match.group('nl')) 252 | value.append(match.group('value').strip()) 253 | line = stream.readline() 254 | match = value_rx.match(line) 255 | 256 | value = b" ".join(value) 257 | 258 | record.headers.append((name, value)) 259 | 260 | if type_rx.match(name): 261 | if value: 262 | content_type = value 263 | else: 264 | record.error('invalid header', name, value) 265 | elif length_rx.match(name): 266 | try: 267 | #print name, value 268 | content_length = int(value) 269 | #print content_length 270 | except ValueError: 271 | record.error('invalid header', name, value) 272 | 273 | # have read blank line following headers 274 | 275 | record.content_file = stream 276 | record.content_file.bytes_to_eoc = content_length 277 | 278 | # check mandatory headers 279 | # WARC-Type WARC-Date WARC-Record-ID Content-Length 280 | 281 | return (record, (), offset) 282 | 283 | 284 | blank_rx = rx(br'^$') 285 | register_record_type(version_rx, WarcRecord) 286 | register_record_type(blank_rx, WarcRecord) 287 | 288 | 289 | def make_response(id, date, url, content, request_id): 290 | # pylint: disable-msg=E1101 291 | headers = [ 292 | (WarcRecord.TYPE, WarcRecord.RESPONSE), 293 | (WarcRecord.ID, id), 294 | (WarcRecord.DATE, date), 295 | (WarcRecord.URL, url), 296 | 297 | ] 298 | if request_id: 299 | headers.append((WarcRecord.CONCURRENT_TO, request_id)) 300 | 301 | record = WarcRecord(headers=headers, content=content) 
302 | 303 | return record 304 | 305 | 306 | def make_request(request_id, date, url, content, response_id): 307 | # pylint: disable-msg=E1101 308 | headers = [ 309 | (WarcRecord.TYPE, WarcRecord.REQUEST), 310 | (WarcRecord.ID, request_id), 311 | (WarcRecord.DATE, date), 312 | (WarcRecord.URL, url), 313 | 314 | ] 315 | if response_id: 316 | headers.append((WarcRecord.CONCURRENT_TO, response_id)) 317 | 318 | record = WarcRecord(headers=headers, content=content) 319 | 320 | return record 321 | 322 | 323 | def make_metadata(meta_id, date, content, concurrent_to=None, url=None): 324 | # pylint: disable-msg=E1101 325 | headers = [ 326 | (WarcRecord.TYPE, WarcRecord.METADATA), 327 | (WarcRecord.ID, meta_id), 328 | (WarcRecord.DATE, date), 329 | 330 | ] 331 | if concurrent_to: 332 | headers.append((WarcRecord.CONCURRENT_TO, concurrent_to)) 333 | 334 | if url: 335 | headers.append((WarcRecord.URL, url)) 336 | 337 | record = WarcRecord(headers=headers, content=content) 338 | 339 | return record 340 | 341 | 342 | def make_conversion(conv_id, date, content, refers_to=None, url=None): 343 | # pylint: disable-msg=E1101 344 | headers = [ 345 | (WarcRecord.TYPE, WarcRecord.CONVERSION), 346 | (WarcRecord.ID, conv_id), 347 | (WarcRecord.DATE, date), 348 | 349 | ] 350 | if refers_to: 351 | headers.append((WarcRecord.REFERS_TO, refers_to)) 352 | 353 | if url: 354 | headers.append((WarcRecord.URL, url)) 355 | 356 | record = WarcRecord(headers=headers, content=content) 357 | 358 | return record 359 | 360 | 361 | def warc_datetime_str(d): 362 | s = d.isoformat() 363 | if '.' 
def main(argv):
    """Validate each WARC file named on the command line.

    For every input file, read records until the first parse error or
    validation failure; report problems on stderr. Returns 0 when every
    file is clean, -1 otherwise.
    """
    (options, input_files) = parser.parse_args(args=argv[1:])

    if len(input_files) < 1:
        parser.error("no input warc file(s)")

    correct = True
    try:
        for name in expand_files(input_files):
            fh = WarcRecord.open_archive(name, gzip="auto")
            try:
                for (offset, record, errors) in fh.read_records(limit=None):
                    if errors:
                        print("warc errors at %s:%d" % (name, offset),
                              file=sys.stderr)
                        print(errors, file=sys.stderr)
                        correct = False
                        break
                    elif record is not None:
                        # validate() returns a list of errors (ugly name);
                        # call it once and reuse the result.
                        validation_errors = record.validate()
                        if validation_errors:
                            print("warc errors at %s:%d" % (name, offset),
                                  file=sys.stderr)
                            print(validation_errors, file=sys.stderr)
                            correct = False
                            break
            finally:
                # close every archive as we finish with it, not just the
                # last one opened before an exception
                fh.close()
    except Exception as e:
        print("Exception: %s" % (str(e)), file=sys.stderr)
        correct = False

    if correct:
        return 0
    else:
        return -1  # failure code


def run():
    """Console-script entry point."""
    sys.exit(main(sys.argv))
#!/bin/bash -e
# Build a Debian package (hanzo-warc-tools) from the working copy.
# Needs: mercurial (hg), a "version" file, python/setuptools, fakeroot,
# dpkg-deb and lintian on PATH.

unset CDPATH

# start from a clean staging directory
if [ -d debian ]; then
    rm -rf debian
fi

mkdir debian

VERSION="$(cat version)"

# version file must look like MAJOR.MINOR
if ! (echo "$VERSION" | egrep -q '^[0-9]+\.[0-9]+$'); then
    echo "Invalid version number $VERSION" 1>&2
    exit 1
fi

# derive the package revision from the mercurial branch:
# default branch builds are tagged "-tip"; a branch named after the
# version is a release build
if [ "$(hg branch)" = 'default' ]; then
    REVISION="$(hg id -n)"
    VERSION="${VERSION}-tip"
elif [ "$(hg branch)" = "$VERSION" ]; then
    REVISION="$(hg id -n)"
fi


mkdir -p debian/DEBIAN
# NOTE(review): "cat < debian/DEBIAN/control" followed by a body and a
# bare EOF looks like a garbled here-doc (presumably
# "cat <<EOF > debian/DEBIAN/control") -- confirm against version
# control; as written this would read the file rather than write it.
cat < debian/DEBIAN/control
Package: hanzo-warc-tools
Version: ${VERSION}-${REVISION}
Maintainer: Stephen Jones
Section: admin
Priority: optional
Architecture: all
Depends: python (>= 2.7)
Description: Suite of tools and libraries for manipulating warc files.
 Provides commands for listing the contents of warc files and libraries for
 manipulating warc files and http.
EOF

# install the python package into the staging tree
python setup.py install -q --no-compile --root "$PWD/debian" --install-layout=deb

# mandatory Debian documentation: copyright and changelogs
mkdir -p debian/usr/share/doc/hanzo-warc-tools
echo "Copyright Hanzo Archives $(date +%Y)" > debian/usr/share/doc/hanzo-warc-tools/copyright
cp README debian/usr/share/doc/hanzo-warc-tools/
hg log --style=changelog | gzip -9 > debian/usr/share/doc/hanzo-warc-tools/changelog.gz

# NOTE(review): same apparent here-doc garbling as above; the target is
# a .gz file, so the original probably piped through gzip -- verify.
cat < debian/usr/share/doc/hanzo-warc-tools/changelog.Debian.gz
hanzo-warc-tools ($VERSION) Hanzo;

  * Made debian style package

 -- Stephen Jones  $(date +'%a, %d %h %Y %T %z')
EOF

# post-install hook: byte-compile the package if pycompile is available
# NOTE(review): same apparent here-doc garbling as above -- verify.
cat < debian/DEBIAN/postinst
#!/bin/bash -e

if which pycompile >/dev/null 2>&1; then
    pycompile -p hanzo-warc-tools
fi
EOF

pushd debian

# installed console scripts lose their .py suffix and become executable
find usr/bin -type f -name '*.py' | (
    while read SCRIPT; do
        mv "$SCRIPT" "${SCRIPT%.py}"
        chmod 755 "${SCRIPT%.py}"
    done
)
# checksum everything except the DEBIAN control directory itself
md5sum $(find . -path ./DEBIAN -prune -o -type f -print) > DEBIAN/md5sums

# normalise permissions to Debian policy defaults
find usr/lib -type f -exec chmod 644 '{}' ';'
find usr/share -type f -exec chmod 644 '{}' ';'
find DEBIAN -type f -exec chmod 644 '{}' ';'
find . -type d -exec chmod 755 '{}' ';'

chmod 755 DEBIAN/postinst

popd

fakeroot dpkg-deb --build debian .

lintian "hanzo-warc-tools_${VERSION}-${REVISION}_all.deb"

# optional first argument: a writable directory to move the .deb into
if [ -n "$1" ] && [ -d "$1" ] && [ -w "$1" ]; then
    mv "hanzo-warc-tools_${VERSION}-${REVISION}_all.deb" "$1"
fi
It is itself both a raw checker and an astng checker in order 5 | # to: 6 | # * handle message activation / deactivation at the module level 7 | # * handle some basic but necessary stats'data (number of classes, methods...) 8 | # 9 | [MASTER] 10 | 11 | # Specify a configuration file. 12 | #rcfile= 13 | 14 | # Python code to execute, usually for sys.path manipulation such as 15 | # pygtk.require(). 16 | #init-hook= 17 | 18 | # Profiled execution. 19 | profile=no 20 | 21 | # Add to the black list. It should be a base name, not a 22 | # path. You may set this option multiple times. 23 | ignore=CVS 24 | 25 | # Pickle collected data for later comparisons. 26 | persistent=yes 27 | 28 | # Set the cache size for astng objects. 29 | cache-size=500 30 | 31 | # List of plugins (as comma separated values of python modules names) to load, 32 | # usually to register additional checkers. 33 | load-plugins= 34 | 35 | 36 | [MESSAGES CONTROL] 37 | 38 | # Enable only checker(s) with the given id(s). This option conflicts with the 39 | # disable-checker option 40 | #enable-checker= 41 | 42 | # Enable all checker(s) except those with the given id(s). This option 43 | # conflicts with the enable-checker option 44 | #disable-checker= 45 | 46 | # Enable all messages in the listed categories (IRCWEF). 47 | #enable-msg-cat= 48 | 49 | # Disable all messages in the listed categories (IRCWEF). 50 | disable-msg-cat=I 51 | 52 | # Enable the message(s) with the given id(s). 53 | #enable-msg= 54 | 55 | # Disable the message(s) with the given id(s). 56 | disable-msg=W0703, C0103, R0904, R0903, W0142 57 | 58 | 59 | [REPORTS] 60 | 61 | # Set the output format. Available formats are text, parseable, colorized, msvs 62 | # (visual studio) and html 63 | output-format=text 64 | 65 | # Include message's id in output 66 | include-ids=no 67 | 68 | # Put messages in a separate file for each module / package specified on the 69 | # command line instead of printing them on stdout. 
Reports (if any) will be 70 | # written in a file name "pylint_global.[txt|html]". 71 | files-output=no 72 | 73 | # Tells whether to display a full report or only the messages 74 | reports=yes 75 | 76 | # Python expression which should return a note less than 10 (10 is the highest 77 | # note). You have access to the variables errors warning, statement which 78 | # respectively contain the number of errors / warnings messages and the total 79 | # number of statements analyzed. This is used by the global evaluation report 80 | # (R0004). 81 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 82 | 83 | # Add a comment according to your evaluation note. This is used by the global 84 | # evaluation report (R0004). 85 | comment=no 86 | 87 | # Enable the report(s) with the given id(s). 88 | #enable-report= 89 | 90 | # Disable the report(s) with the given id(s). 91 | #disable-report= 92 | 93 | 94 | # checks for : 95 | # * doc strings 96 | # * modules / classes / functions / methods / arguments / variables name 97 | # * number of arguments, local variables, branches, returns and statements in 98 | # functions, methods 99 | # * required module attributes 100 | # * dangerous default values as arguments 101 | # * redefinition of function / method / class 102 | # * uses of the global statement 103 | # 104 | [BASIC] 105 | 106 | # Required attributes for module, separated by a comma 107 | required-attributes= 108 | 109 | # Regular expression which should only match functions or classes name which do 110 | # not require a docstring 111 | no-docstring-rgx=__.*__ 112 | 113 | # Regular expression which should only match correct module names 114 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 115 | 116 | # Regular expression which should only match correct module level names 117 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 118 | 119 | # Regular expression which should only match correct class names 120 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 121 | 
122 | # Regular expression which should only match correct function names 123 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 124 | 125 | # Regular expression which should only match correct method names 126 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 127 | 128 | # Regular expression which should only match correct instance attribute names 129 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 130 | 131 | # Regular expression which should only match correct argument names 132 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 133 | 134 | # Regular expression which should only match correct variable names 135 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 136 | 137 | # Regular expression which should only match correct list comprehension / 138 | # generator expression variable names 139 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 140 | 141 | # Good variable names which should always be accepted, separated by a comma 142 | good-names=i,j,k,ex,Run,_ 143 | 144 | # Bad variable names which should always be refused, separated by a comma 145 | bad-names=foo,bar,baz,toto,tutu,tata 146 | 147 | # List of builtins function names that should not be used, separated by a comma 148 | bad-functions=map,filter,apply,input 149 | 150 | 151 | # try to find bugs in the code using type inference 152 | # 153 | [TYPECHECK] 154 | 155 | # Tells whether missing members accessed in mixin class should be ignored. A 156 | # mixin class is detected if its name ends with "mixin" (case insensitive). 157 | ignore-mixin-members=yes 158 | 159 | # List of classes names for which member attributes should not be checked 160 | # (useful for classes with attributes dynamically set). 161 | ignored-classes=SQLObject 162 | 163 | # When zope mode is activated, add a predefined set of Zope acquired attributes 164 | # to generated-members. 165 | zope=no 166 | 167 | # List of members which are set dynamically and missed by pylint inference 168 | # system, and so shouldn't trigger E0201 when accessed. 
169 | generated-members=REQUEST,acl_users,aq_parent 170 | 171 | 172 | # checks for 173 | # * unused variables / imports 174 | # * undefined variables 175 | # * redefinition of variable from builtins or from an outer scope 176 | # * use of variable before assignment 177 | # 178 | [VARIABLES] 179 | 180 | # Tells whether we should check for unused import in __init__ files. 181 | init-import=no 182 | 183 | # A regular expression matching names used for dummy variables (i.e. not used). 184 | dummy-variables-rgx=_|dummy 185 | 186 | # List of additional names supposed to be defined in builtins. Remember that 187 | # you should avoid to define new builtins when possible. 188 | additional-builtins= 189 | 190 | 191 | # checks for sign of poor/misdesign: 192 | # * number of methods, attributes, local variables... 193 | # * size, complexity of functions, methods 194 | # 195 | [DESIGN] 196 | 197 | # Maximum number of arguments for function / method 198 | max-args=5 199 | 200 | # Maximum number of locals for function / method body 201 | max-locals=15 202 | 203 | # Maximum number of return / yield for function / method body 204 | max-returns=6 205 | 206 | # Maximum number of branch for function / method body 207 | max-branchs=12 208 | 209 | # Maximum number of statements in function / method body 210 | max-statements=50 211 | 212 | # Maximum number of parents for a class (see R0901). 213 | max-parents=7 214 | 215 | # Maximum number of attributes for a class (see R0902). 216 | max-attributes=7 217 | 218 | # Minimum number of public methods for a class (see R0903). 219 | min-public-methods=2 220 | 221 | # Maximum number of public methods for a class (see R0904). 
222 | max-public-methods=20 223 | 224 | 225 | # checks for : 226 | # * methods without self as first argument 227 | # * overridden methods signature 228 | # * access only to existent members via self 229 | # * attributes not defined in the __init__ method 230 | # * supported interfaces implementation 231 | # * unreachable code 232 | # 233 | [CLASSES] 234 | 235 | # List of interface methods to ignore, separated by a comma. This is used for 236 | # instance to not check methods defines in Zope's Interface base class. 237 | ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by 238 | 239 | # List of method names used to declare (i.e. assign) instance attributes. 240 | defining-attr-methods=__init__,__new__,setUp 241 | 242 | 243 | # checks for 244 | # * external modules dependencies 245 | # * relative / wildcard imports 246 | # * cyclic imports 247 | # * uses of deprecated modules 248 | # 249 | [IMPORTS] 250 | 251 | # Deprecated modules which should not be used, separated by a comma 252 | deprecated-modules=regsub,string,TERMIOS,Bastion,rexec 253 | 254 | # Create a graph of every (i.e. internal and external) dependencies in the 255 | # given file (report R0402 must not be disabled) 256 | import-graph= 257 | 258 | # Create a graph of external dependencies in the given file (report R0402 must 259 | # not be disabled) 260 | ext-import-graph= 261 | 262 | # Create a graph of internal dependencies in the given file (report R0402 must 263 | # not be disabled) 264 | int-import-graph= 265 | 266 | 267 | # checks for: 268 | # * warning notes in the code like FIXME, XXX 269 | # * PEP 263: source code with non ascii character but no encoding declaration 270 | # 271 | [MISCELLANEOUS] 272 | 273 | # List of note tags to take in consideration, separated by a comma. 
274 | notes=FIXME,XXX,TODO 275 | 276 | 277 | # checks for similarities and duplicated code. This computation may be 278 | # memory / CPU intensive, so you should disable it if you experiments some 279 | # problems. 280 | # 281 | [SIMILARITIES] 282 | 283 | # Minimum lines number of a similarity. 284 | min-similarity-lines=4 285 | 286 | # Ignore comments when computing similarities. 287 | ignore-comments=yes 288 | 289 | # Ignore docstrings when computing similarities. 290 | ignore-docstrings=yes 291 | 292 | 293 | # checks for : 294 | # * unauthorized constructions 295 | # * strict indentation 296 | # * line length 297 | # * use of <> instead of != 298 | # 299 | [FORMAT] 300 | 301 | # Maximum number of characters on a single line. 302 | max-line-length=80 303 | 304 | # Maximum number of lines in a module 305 | max-module-lines=1000 306 | 307 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 308 | # tab). 309 | indent-string=' ' 310 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "warctools" 3 | version = "5.0.0" 4 | authors = [ 5 | { name="Thomas Figg", email="tef@warctools.twentygototen.org" }, 6 | ] 7 | maintainers = [ 8 | { name="Internet Archive", email="info@archive.org" }, 9 | ] 10 | description = "Command line tools and libraries for handling and manipulating WARC files (and HTTP contents)" 11 | readme = "README.md" 12 | requires-python = ">=3.5" 13 | classifiers = [ 14 | "Operating System :: OS Independent", 15 | "Programming Language :: Python :: 3.5", 16 | "Topic :: System :: Archiving", 17 | ] 18 | license = "MIT" 19 | license-files = ["LICENSE"] 20 | 21 | [project.scripts] 22 | warcdump = "hanzo.warcdump:run" 23 | arc2warc = "hanzo.arc2warc:run" 24 | warcextract = "hanzo.warcextract:run" 25 | warcfilter = "hanzo.warcfilter:run" 26 | warcindex = "hanzo.warcindex:run" 27 | 
warclinks = "hanzo.warclinks:run" 28 | warcvalid = "hanzo.warcvalid:run" 29 | warc2warc = "hanzo.warc2warc:run" 30 | warcpayload = "hanzo.warcpayload:run" 31 | 32 | [dependency-groups] 33 | dev = [ 34 | "nose", 35 | ] 36 | 37 | [build-system] 38 | requires = ["setuptools>=61.0"] 39 | build-backend = "setuptools.build_meta" 40 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py27, py32, py33, pypy 8 | 9 | [testenv] 10 | commands = {envpython} setup.py test 11 | 12 | -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | revision = 2 3 | requires-python = ">=3.5" 4 | 5 | [[package]] 6 | name = "nose" 7 | version = "1.3.7" 8 | source = { registry = "https://pypi.org/simple" } 9 | sdist = { url = "https://files.pythonhosted.org/packages/58/a5/0dc93c3ec33f4e281849523a5a913fa1eea9a3068acfa754d44d88107a44/nose-1.3.7.tar.gz", hash = "sha256:f1bffef9cbc82628f6e7d7b40d7e255aefaa1adb6a1b1d26c69a8b79e6208a98", size = 280488, upload-time = "2015-06-02T09:12:32.961Z" } 10 | wheels = [ 11 | { url = "https://files.pythonhosted.org/packages/15/d8/dd071918c040f50fa1cf80da16423af51ff8ce4a0f2399b7bf8de45ac3d9/nose-1.3.7-py3-none-any.whl", hash = "sha256:9ff7c6cc443f8c51994b34a667bbcf45afd6d945be7477b52e97516fd17c53ac", size = 154731, upload-time = "2015-06-02T09:12:40.57Z" }, 12 | ] 13 | 14 | [[package]] 15 | name = "warctools" 16 | version = "5.0.0" 17 | source = { editable = "." 
def log_headers(log_file):
    """Write the tab-separated column-name header line for the index log."""
    print('>>warc_file\twarc_id\twarc_type\twarc_content_length\twarc_uri_date\twarc_subject_uri\turi_content_type\toutfile\twayback_uri', file=log_file)

def log_entry(log_file, input_file, record, content_type, output_file, wayback_uri):
    """Append one tab-separated index line describing an unpacked record."""
    log = (input_file, record.id, record.type, record.content_length,
           record.date, record.url, content_type, output_file, wayback_uri)
    print("\t".join(str(s) for s in log), file=log_file)

def main(argv):
    """Unpack the response bodies of the given WARCs into a directory tree.

    With no file arguments a single (uncompressed) archive is read from
    stdin. An index log of everything extracted is written either to the
    --log file, or per input file next to the output, or to stdout.
    Returns 0; per-file failures are reported on stderr.
    """
    (options, args) = parser.parse_args(args=argv[1:])

    if options.output:
        if not os.path.exists(options.output):
            os.makedirs(options.output)
        output_dir = options.output
    else:
        output_dir = os.getcwd()

    collisions = 0

    if len(args) < 1:
        # read one archive from stdin.  Open the log in text mode ('w',
        # not 'wb'): log_headers/log_entry print str, which a binary
        # file rejects on Python 3.
        log_file = sys.stdout if not options.log_file else open(options.log_file, 'w')
        try:
            log_headers(log_file)

            with closing(WarcRecord.open_archive(file_handle=sys.stdin, gzip=None)) as fh:
                collisions += unpack_records('', fh, output_dir, options.default_name, log_file, options.wayback)
        finally:
            if log_file is not sys.stdout:
                log_file.close()
    else:
        for filename in args:
            # default: one index file per input, beside the output
            log_name = os.path.join(output_dir, os.path.basename(filename) + '.index.txt') if not options.log_file else options.log_file
            log_file = open(log_name, 'w')  # text mode, see above
            try:
                log_headers(log_file)
                with closing(ArchiveRecord.open_archive(filename=filename, gzip="auto")) as fh:
                    collisions += unpack_records(filename, fh, output_dir, options.default_name, log_file, options.wayback)
            except Exception as e:
                # best effort: keep going with the remaining inputs
                print("exception in handling", filename, e, file=sys.stderr)
            finally:
                # close every per-file log, not just the last one
                log_file.close()
    if collisions:
        print(collisions, "filenames that collided", file=sys.stderr)

    return 0
def parse_warcinfo(record):
    """Best-effort parse of a warcinfo record body into a dict.

    Every non-blank line of the record content is treated as a
    'name: value' pair, split on the first colon only; surrounding
    whitespace inside name and value is kept as-is.  Malformed lines
    are reported on stderr and skipped, and any failure reading the
    record at all yields whatever was collected so far.
    """
    fields = {}
    try:
        body = record.content[1]
        for raw_line in body.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                name, rest = stripped.split(':', 1)
            except ValueError:
                # no colon on the line
                print('malformed warcinfo line', stripped, file=sys.stderr)
            else:
                fields[name] = rest
    except Exception as e:
        print('exception reading warcinfo record', e, file=sys.stderr)
    return fields
def output_file(output_dir, url, mime_type, default_name):
    """Map a record URL to a safe output path under *output_dir*.

    The '://' scheme separator is folded into the path, every character
    outside letters/digits/'_-/.' becomes '_', and the directory part is
    created on disk if needed (truncated to 200 chars).  The base name is
    truncated so name+extension fits in 45 chars.  Returns
    (realpath, collision) where collision is True when a random suffix
    had to be appended to avoid overwriting an existing file.
    """
    clean_url = "".join((c if c.isalpha() or c.isdigit() or c in '_-/.' else '_')
                        for c in url.replace('://', '/', 1))

    parts = clean_url.split('/')
    directories, filename = parts[:-1], parts[-1]

    path = [output_dir]
    for d in directories:
        if d:
            path.append(d)

    if filename:
        name, ext = os.path.splitext(filename)
    else:
        name, ext = default_name, ''

    if mime_type:
        # guess_type() returns a (type, encoding) tuple; compare its type
        # element.  (The previous code compared the whole tuple against the
        # mime_type string, which is never equal, so the URL's own
        # extension was always clobbered.)
        guessed_type = mimetypes.guess_type(url)[0]
        # preserve variant file extensions, rather than clobber with the
        # default extension for the mime type
        if not ext or guessed_type != mime_type:
            mime_ext = mimetypes.guess_extension(mime_type)
            if mime_ext:
                ext = mime_ext
    elif not ext:
        ext = '.html'  # no mime type, no extension

    directory = os.path.normpath(os.path.join(*path))
    directory = directory[:200]

    if not os.path.exists(directory):
        os.makedirs(directory)

    filename = name[:45 - len(ext)] + ext

    fullname = os.path.join(directory, filename)

    collision = False

    while os.path.exists(fullname):
        collision = True
        u = str(uuid.uuid4())[:8]

        filename = name[:45 - len(ext)] + '_R' + u + ext

        fullname = os.path.join(directory, filename)

    return os.path.realpath(os.path.normpath(fullname)), collision