#!/usr/bin/env python3
# The MIT License (MIT)
#
# Copyright (c) 2014 Steffen Fritz - amp-off.com
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
"""
html2warc creates warc files from local web resources

A simple script to convert offline data into a warc file.

Usage:
    python html2warc.py $TARGET_URI $SOURCE_DIR $TARGET_WARC
"""
# pylint: disable=line-too-long,invalid-name

__date__ = '2023/05/11'
__version__ = '0.7.1'
__status__ = 'Testing'
__license__ = 'The MIT License (MIT)'
__copyright__ = 'Copyright (c) 2014-2023 Steffen Fritz'
__author__ = 'steffen fritz'
__maintainer__ = 'steffen fritz'
__contributor__ = 'dragan espenschied'
__contact__ = 'steffen AT fritz.wtf'


import datetime
import mimetypes
import os
import re
import shutil
import sys
import uuid

# possible names for an index file, listed as compiled regexp,
# in order of preference
index_filenames = [
    re.compile(r'(index)\.(html?|asp|php)', re.I),  # most common
    re.compile(r'welcome\.html?', re.I)  # AOL server
]

# written as content type when mimetypes cannot guess one from the file name
_DEFAULT_MIME_TYPE = "application/octet-stream"

# every WARC record is followed by two CRLF
_RECORD_TERMINATOR = b"\r\n\r\n"


def _encode_lines(lines):
    """
    joins lines with CRLF, appends the closing empty line and encodes the
    result, so that len() of the return value is the number of bytes written
    :param lines: header or field lines without line endings
    :return: the encoded lines
    :rtype: bytes
    """
    return ("\r\n".join(lines) + "\r\n\r\n").encode("utf-8")


def _new_record_id():
    """
    :return: a fresh value for the WARC-Record-ID field
    :rtype: string
    """
    return "<urn:uuid:{}>".format(uuid.uuid4())


def _pick_index_file(files):
    """
    selects the single file of a directory that is also served under the
    directory URL itself
    :param files: file names of one directory
    :return: the preferred index file name or None if there is none
    """
    for pattern in index_filenames:
        # sorted, so the choice does not depend on the listing order
        for name in sorted(files):
            if pattern.fullmatch(name):
                return name
    return None


def _directory_url(rooturl, source_dir, directory):
    """
    maps a directory below source_dir to its URL below rooturl
    :param rooturl: arbitrary URL, with or without trailing slash
    :param source_dir: source directory
    :param directory: source_dir or one of its sub directories
    :return: URL of the directory, always ending with a slash
    """
    base = rooturl if rooturl.endswith("/") else rooturl + "/"
    relative = os.path.relpath(directory, source_dir)
    if relative == os.curdir:
        return base
    # URLs always use forward slashes, whatever the local separator is
    return base + "/".join(relative.split(os.sep)) + "/"


def _write_response_record(warc, target_uri, source_file, mime_type, createdate):
    """
    appends one response record for source_file to the open warc file
    :param warc: output warc file, opened for writing in binary mode
    :param target_uri: value of the WARC-Target-URI field
    :param source_file: path of the file that becomes the payload
    :param mime_type: content type of the payload
    :param createdate: value of the date fields
    """
    file_size = os.path.getsize(source_file)

    print("{}\t[{}]\t{}b".format(target_uri, mime_type, file_size))

    # network protocol information, made up because the data never
    # travelled over HTTP
    http_header = _encode_lines([
        "HTTP/1.1 200 OK",
        "DATE: " + str(createdate),
        "Accept-Ranges: bytes",
        "Connection: close",
        "Content-Type: " + mime_type,
        "Content-Length: " + str(file_size),
    ])

    # the record block is the HTTP header plus the payload; measuring the
    # encoded header keeps Content-Length exact for every size and type
    warc_header = _encode_lines([
        "WARC/1.0",
        "WARC-Type: response",
        "WARC-Target-URI: " + target_uri,
        "WARC-Record-ID: " + _new_record_id(),
        "WARC-Date: " + str(createdate),
        "Content-Type: application/http;msgtype=response",
        "WARC-Identified-Payload-Type: " + mime_type,
        "Content-Length: " + str(len(http_header) + file_size),
    ])

    warc.write(warc_header)
    warc.write(http_header)
    with open(source_file, "rb") as payload:
        shutil.copyfileobj(payload, warc)
    warc.write(_RECORD_TERMINATOR)


def source_to_warc(source_dir, targetwarc, createdate, rooturl):
    """
    appends one response record per file below source_dir to targetwarc;
    the preferred index file of each directory gets a second record under
    the URL of the directory itself
    :param source_dir: source directory
    :param targetwarc: output warc file
    :param createdate: bag creation date
    :param rooturl: arbitrary URL
    :type: string objects
    """
    target_path = os.path.realpath(targetwarc)

    with open(targetwarc, "ab") as warc:
        for rootdir, dirs, files in os.walk(source_dir):
            dirs.sort()  # walk the tree in a reproducible order
            directory_url = _directory_url(rooturl, source_dir, rootdir)
            index_file = _pick_index_file(files)

            for file_ in sorted(files):
                source_file = os.path.join(rootdir, file_)

                # a warc written into the source tree must not be copied
                # into itself while it is growing
                if os.path.realpath(source_file) == target_path:
                    continue

                url_names = [file_]
                if file_ == index_file:
                    url_names.append('')  # only one index per directory!

                mime_type = mimetypes.guess_type(source_file)[0] or _DEFAULT_MIME_TYPE

                for url_name in url_names:
                    _write_response_record(warc, directory_url + url_name,
                                           source_file, mime_type, createdate)


def write_init_record(targetwarc, createdate):
    """
    this function writes the warcinfo record; an existing file of the same
    name is overwritten
    :param targetwarc: the output file
    :param createdate: value of the WARC-Date field
    :ptype string
    :return
    """
    # the block keeps its closing empty line, which is part of the length
    block = _encode_lines([
        "software: html2warc https://github.com/steffenfritz/html2warc",
        "format: WARC File Format 1.0",
        "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf",
        "description: warc file created from offline data",
    ])

    header = _encode_lines([
        "WARC/1.0",
        "WARC-Type: warcinfo",
        "WARC-Date: " + str(createdate),
        "WARC-Filename: " + targetwarc,
        "WARC-Record-ID: " + _new_record_id(),
        "Content-Type: application/warc-fields",
        "Content-Length: " + str(len(block)),
    ])

    with open(targetwarc, "wb") as fd:
        fd.write(header)
        fd.write(block)
        fd.write(_RECORD_TERMINATOR)


def help_message():
    """
    prints a usage message if html2warc is not executed with 3 arguments
    """
    print("\nUSAGE: html2warc.py ROOTURL SOURCEDIR TARGETWARC\n")


def main():
    """
    reads ROOTURL, SOURCEDIR and TARGETWARC from the command line and
    writes the warc file
    :return exit code: 0 on success, 1 on I/O errors, 2 on wrong usage
    :rtype int
    """
    if len(sys.argv) != 4:
        help_message()
        return 2

    rooturl, sourcedir, targetwarc = sys.argv[1:4]
    if not rooturl.endswith("/"):
        rooturl += "/"
    # do not turn "site.warc" into "site.warc.warc"
    if not targetwarc.endswith(".warc"):
        targetwarc += ".warc"

    if not os.path.isdir(sourcedir):
        print("source directory not found: " + sourcedir, file=sys.stderr)
        return 1

    # WARC-Date has to be UTC in the form YYYY-MM-DDThh:mm:ssZ
    createdate = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    try:
        write_init_record(targetwarc, createdate)
        source_to_warc(sourcedir, targetwarc, createdate, rooturl)
    except IOError as err:
        print(str(err), file=sys.stderr)
        return 1

    return 0


if __name__ == '__main__':
    sys.exit(main())