├── .pylintrc
├── README.md
├── html2warc.py
└── license.txt
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MESSAGES CONTROL]
2 | disable = line-too-long,
3 | invalid-name
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # html2warc
2 | A simple script that converts offline data into a WARC file.
3 |
4 | # Usage
5 | python html2warc.py $TARGET_URI $SOURCE_DIR $TARGET_WARC
6 |
--------------------------------------------------------------------------------
/html2warc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | html2warc creates warc files from local web resources
4 | """
5 |
6 | __date__ = '2023/05/11'
7 | __version__ = '0.7.1'
8 | __status__ = 'Testing'
9 | __license__ = 'The MIT License (MIT)'
10 | __copyright__ = 'Copyright (c) 2014-2023 Steffen Fritz'
11 | __author__ = 'steffen fritz'
12 | __maintainer__ = 'steffen fritz'
13 | __contributor__ = 'dragan espenschied'
14 | __contact__ = 'steffen AT fritz.wtf'
15 |
16 |
17 | import os
18 | import sys
19 | import uuid
20 | import datetime
21 | import mimetypes
22 | import re
23 |
# possible names for an index file, listed as compiled regexp,
# in order of preference
index_filenames = [
    re.compile(r'(index)\.(html?|asp|php)', re.I),  # most common
    re.compile(r'welcome\.html?', re.I)  # AOL server
]

# number of bytes read per step when copying a payload into the warc file
_COPY_CHUNK_SIZE = 64 * 1024


def _pick_index_file(files):
    """
    return the single file name of a directory that acts as its index

    Patterns are tried in order of preference; among several files matching
    the same pattern the alphabetically first one wins, so the choice does
    not depend on the directory listing order.

    :param files: file names of one directory
    :return the chosen file name or None if the directory has no index
    """
    for pattern in index_filenames:
        for name in sorted(files):
            if pattern.fullmatch(name):
                return name
    return None


def _header_block(first_line, fields):
    """
    serialize a start line and (name, value) pairs as CRLF separated header
    lines, terminated by an empty line, encoded as UTF-8 bytes
    """
    lines = [first_line]
    lines.extend("{}: {}".format(name, value) for name, value in fields)
    return ("\r\n".join(lines) + "\r\n\r\n").encode("utf-8")


def source_to_warc(source_dir, targetwarc, createdate, rooturl):
    """
    append one WARC response record per file below source_dir to targetwarc

    The index file of each directory (see index_filenames) is additionally
    recorded under the URI of the directory itself.

    :param source_dir: source directory
    :param targetwarc: output warc file, records are appended
    :param createdate: bag creation date, written verbatim into the records
    :param rooturl: arbitrary URL used as prefix for all target URIs
    :type: string objects
    """
    base_url = rooturl if rooturl.endswith('/') else rooturl + '/'
    target_real = os.path.realpath(targetwarc)

    for rootdir, _, files in os.walk(source_dir):
        # path of this directory relative to source_dir, as URI segments
        rel_dir = os.path.relpath(rootdir, source_dir)
        dir_parts = [] if rel_dir == os.curdir else rel_dir.split(os.sep)

        index_file = _pick_index_file(files)  # only one index per directory!

        for file_ in sorted(files):
            source_file_ = os.path.join(rootdir, file_)

            # never archive the output file into itself: it grows while it
            # is being read, so copying it would not terminate
            if os.path.realpath(source_file_) == target_real:
                continue

            # guess_type yields None for unknown types; fall back to the
            # generic binary type instead of writing the string "None"
            mime_type_ = mimetypes.guess_type(source_file_)[0] or 'application/octet-stream'
            file_size_ = os.path.getsize(source_file_)

            uri_names = [file_]
            if file_ == index_file:
                uri_names.append('')  # alias: the directory URI itself

            for uri_name in uri_names:
                source_file_uri = base_url + '/'.join(dir_parts + [uri_name])

                print("{}\t[{}]\t{}b".format(source_file_uri, mime_type_, file_size_))

                # network protocol information
                http_head = _header_block("HTTP/1.1 200 OK", [
                    ("DATE", createdate),
                    ("Accept-Ranges", "bytes"),
                    ("Connection", "close"),
                    ("Content-Type", mime_type_),
                    ("Content-Length", file_size_),
                ])

                # the record block is the http header plus the payload, so
                # its length is computed per record from the real bytes
                warc_head = _header_block("WARC/1.0", [
                    ("WARC-Type", "response"),
                    ("WARC-Target-URI", source_file_uri),
                    ("WARC-Record-ID", "<" + uuid.uuid4().urn + ">"),
                    ("WARC-Date", createdate),
                    ("Content-Type", "application/http;msgtype=response"),
                    ("WARC-Identified-Payload-Type", mime_type_),
                    ("Content-Length", len(http_head) + file_size_),
                ])

                # binary mode keeps the payload and the CRLF markers
                # untouched on every platform
                with open(targetwarc, "ab") as fw, open(source_file_, "rb") as fd:
                    fw.write(warc_head)
                    fw.write(http_head)
                    for chunk in iter(lambda: fd.read(_COPY_CHUNK_SIZE), b""):
                        fw.write(chunk)
                    fw.write(b"\r\n\r\n")  # end of record
101 |
def write_init_record(targetwarc, createdate):
    """
    this function writes the warcinfo record

    The target file is created or truncated; the record is written in
    binary mode so that every line ends in CRLF on all platforms and the
    Content-Length matches the bytes actually written.

    :param targetwarc: the output file
    :param createdate: creation date, written verbatim as WARC-Date
    :ptype string
    :return None
    """
    info_fields = [
        "software: html2warc https://github.com/steffenfritz/html2warc",
        "format: WARC File Format 1.0",
        "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf",
        "description: warc file created from offline data",
    ]
    # the record block: one CRLF terminated line per warcinfo field
    block = "".join(field + "\r\n" for field in info_fields).encode("utf-8")

    header_lines = [
        "WARC/1.0",
        "WARC-Type: warcinfo",
        "WARC-Date: " + createdate,
        "WARC-Filename: " + targetwarc,
        "WARC-Record-ID: <" + uuid.uuid4().urn + ">",
        "Content-Type: application/warc-fields",
        # length of the encoded block, not of the untranslated text
        "Content-Length: " + str(len(block)),
    ]
    header = ("\r\n".join(header_lines) + "\r\n\r\n").encode("utf-8")

    with open(targetwarc, "wb") as fd:
        fd.write(header)
        fd.write(block)
        fd.write(b"\r\n\r\n")  # end of record
131 |
132 |
def help_message():
    """
    show how to call html2warc; used when it is not started with 3 arguments
    """
    usage = "\nUSAGE: html2warc.py ROOTURL SOURCEDIR TARGETWARC\n"
    print(usage)
138 |
139 |
def main():
    """
    command line entry point: html2warc.py ROOTURL SOURCEDIR TARGETWARC

    Exits with status 2 on wrong usage and with status 1 if the source
    directory does not exist or the warc file cannot be written.

    :return exit code (0 on success)
    :rtype int
    """
    if len(sys.argv) != 4:
        help_message()
        sys.exit(2)  # wrong usage is an error, not a success

    rooturl, sourcedir, target = sys.argv[1:]
    if not rooturl.endswith("/"):
        rooturl += "/"
    targetwarc = target + ".warc"

    # os.walk silently yields nothing for a missing directory, which would
    # leave a warc file holding only the warcinfo record
    if not os.path.isdir(sourcedir):
        print("source directory not found: " + sourcedir, file=sys.stderr)
        sys.exit(1)

    # WARC-Date has to be a UTC timestamp of the form YYYY-MM-DDThh:mm:ssZ
    createdate = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    try:
        write_init_record(targetwarc, createdate)
        source_to_warc(sourcedir, targetwarc, createdate, rooturl)
    except OSError as err:
        print(str(err), file=sys.stderr)
        sys.exit(1)

    return 0
164 |
165 |
# run as a script: hand the return value of main() to the shell as exit status
if __name__ == '__main__':
    sys.exit(main())
168 |
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014 Steffen Fritz - amp-off.com
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------