├── LICENSE ├── README.md ├── requirements.txt ├── setup.py └── unwarcit ├── __init__.py ├── main.py ├── unwarcit.py └── util.py /LICENSE: -------------------------------------------------------------------------------- 1 | Version 2.0, January 2004 2 | http://www.apache.org/licenses/ 3 | 4 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 5 | 6 | 1. Definitions. 7 | 8 | "License" shall mean the terms and conditions for use, reproduction, 9 | and distribution as defined by Sections 1 through 9 of this document. 10 | 11 | "Licensor" shall mean the copyright owner or entity authorized by 12 | the copyright owner that is granting the License. 13 | 14 | "Legal Entity" shall mean the union of the acting entity and all 15 | other entities that control, are controlled by, or are under common 16 | control with that entity. For the purposes of this definition, 17 | "control" means (i) the power, direct or indirect, to cause the 18 | direction or management of such entity, whether by contract or 19 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity 23 | exercising permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, 26 | including but not limited to software source code, documentation 27 | source, and configuration files. 28 | 29 | "Object" form shall mean any form resulting from mechanical 30 | transformation or translation of a Source form, including but 31 | not limited to compiled object code, generated documentation, 32 | and conversions to other media types. 33 | 34 | "Work" shall mean the work of authorship, whether in Source or 35 | Object form, made available under the License, as indicated by a 36 | copyright notice that is included in or attached to the work 37 | (an example is provided in the Appendix below). 38 | 39 | "Derivative Works" shall mean any work, whether in Source or Object 40 | form, that is based on (or derived from) the Work and for which the 41 | editorial revisions, annotations, elaborations, or other modifications 42 | represent, as a whole, an original work of authorship. For the purposes 43 | of this License, Derivative Works shall not include works that remain 44 | separable from, or merely link (or bind by name) to the interfaces of, 45 | the Work and Derivative Works thereof. 46 | 47 | "Contribution" shall mean any work of authorship, including 48 | the original version of the Work and any modifications or additions 49 | to that Work or Derivative Works thereof, that is intentionally 50 | submitted to Licensor for inclusion in the Work by the copyright owner 51 | or by an individual or Legal Entity authorized to submit on behalf of 52 | the copyright owner. For the purposes of this definition, "submitted" 53 | means any form of electronic, verbal, or written communication sent 54 | to the Licensor or its representatives, including but not limited to 55 | communication on electronic mailing lists, source code control systems, 56 | and issue tracking systems that are managed by, or on behalf of, the 57 | Licensor for the purpose of discussing and improving the Work, but 58 | excluding communication that is conspicuously marked or otherwise 59 | designated in writing by the copyright owner as "Not a Contribution." 60 | 61 | "Contributor" shall mean Licensor and any individual or Legal Entity 62 | on behalf of whom a Contribution has been received by Licensor and 63 | subsequently incorporated within the Work. 64 | 65 | 2. Grant of Copyright License. Subject to the terms and conditions of 66 | this License, each Contributor hereby grants to You a perpetual, 67 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 68 | copyright license to reproduce, prepare Derivative Works of, 69 | publicly display, publicly perform, sublicense, and distribute the 70 | Work and such Derivative Works in Source or Object form. 71 | 72 | 3. Grant of Patent License. Subject to the terms and conditions of 73 | this License, each Contributor hereby grants to You a perpetual, 74 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 75 | (except as stated in this section) patent license to make, have made, 76 | use, offer to sell, sell, import, and otherwise transfer the Work, 77 | where such license applies only to those patent claims licensable 78 | by such Contributor that are necessarily infringed by their 79 | Contribution(s) alone or by combination of their Contribution(s) 80 | with the Work to which such Contribution(s) was submitted. If You 81 | institute patent litigation against any entity (including a 82 | cross-claim or counterclaim in a lawsuit) alleging that the Work 83 | or a Contribution incorporated within the Work constitutes direct 84 | or contributory patent infringement, then any patent licenses 85 | granted to You under this License for that Work shall terminate 86 | as of the date such litigation is filed. 87 | 88 | 4. Redistribution. You may reproduce and distribute copies of the 89 | Work or Derivative Works thereof in any medium, with or without 90 | modifications, and in Source or Object form, provided that You 91 | meet the following conditions: 92 | 93 | (a) You must give any other recipients of the Work or 94 | Derivative Works a copy of this License; and 95 | 96 | (b) You must cause any modified files to carry prominent notices 97 | stating that You changed the files; and 98 | 99 | (c) You must retain, in the Source form of any Derivative Works 100 | that You distribute, all copyright, patent, trademark, and 101 | attribution notices from the Source form of the Work, 102 | excluding those notices that do not pertain to any part of 103 | the Derivative Works; and 104 | 105 | (d) If the Work includes a "NOTICE" text file as part of its 106 | distribution, then any Derivative Works that You distribute must 107 | include a readable copy of the attribution notices contained 108 | within such NOTICE file, excluding those notices that do not 109 | pertain to any part of the Derivative Works, in at least one 110 | of the following places: within a NOTICE text file distributed 111 | as part of the Derivative Works; within the Source form or 112 | documentation, if provided along with the Derivative Works; or, 113 | within a display generated by the Derivative Works, if and 114 | wherever such third-party notices normally appear. The contents 115 | of the NOTICE file are for informational purposes only and 116 | do not modify the License. You may add Your own attribution 117 | notices within Derivative Works that You distribute, alongside 118 | or as an addendum to the NOTICE text from the Work, provided 119 | that such additional attribution notices cannot be construed 120 | as modifying the License. 121 | 122 | You may add Your own copyright statement to Your modifications and 123 | may provide additional or different license terms and conditions 124 | for use, reproduction, or distribution of Your modifications, or 125 | for any such Derivative Works as a whole, provided Your use, 126 | reproduction, and distribution of the Work otherwise complies with 127 | the conditions stated in this License. 128 | 129 | 5. Submission of Contributions. Unless You explicitly state otherwise, 130 | any Contribution intentionally submitted for inclusion in the Work 131 | by You to the Licensor shall be under the terms and conditions of 132 | this License, without any additional terms or conditions. 133 | Notwithstanding the above, nothing herein shall supersede or modify 134 | the terms of any separate license agreement you may have executed 135 | with Licensor regarding such Contributions. 136 | 137 | 6. Trademarks. This License does not grant permission to use the trade 138 | names, trademarks, service marks, or product names of the Licensor, 139 | except as required for reasonable and customary use in describing the 140 | origin of the Work and reproducing the content of the NOTICE file. 141 | 142 | 7. Disclaimer of Warranty. Unless required by applicable law or 143 | agreed to in writing, Licensor provides the Work (and each 144 | Contributor provides its Contributions) on an "AS IS" BASIS, 145 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 146 | implied, including, without limitation, any warranties or conditions 147 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 148 | PARTICULAR PURPOSE. You are solely responsible for determining the 149 | appropriateness of using or redistributing the Work and assume any 150 | risks associated with Your exercise of permissions under this License. 151 | 152 | 8. Limitation of Liability. In no event and under no legal theory, 153 | whether in tort (including negligence), contract, or otherwise, 154 | unless required by applicable law (such as deliberate and grossly 155 | negligent acts) or agreed to in writing, shall any Contributor be 156 | liable to You for damages, including any direct, indirect, special, 157 | incidental, or consequential damages of any character arising as a 158 | result of this License or out of the use or inability to use the 159 | Work (including but not limited to damages for loss of goodwill, 160 | work stoppage, computer failure or malfunction, or any and all 161 | other commercial damages or losses), even if such Contributor 162 | has been advised of the possibility of such damages. 163 | 164 | 9. Accepting Warranty or Additional Liability. While redistributing 165 | the Work or Derivative Works thereof, You may choose to offer, 166 | and charge a fee for, acceptance of support, warranty, indemnity, 167 | or other liability obligations and/or rights consistent with this 168 | License. However, in accepting such obligations, You may act only 169 | on Your own behalf and on Your sole responsibility, not on behalf 170 | of any other Contributor, and only if You agree to indemnify, 171 | defend, and hold each Contributor harmless for any liability 172 | incurred by, or claims asserted against, such Contributor by reason 173 | of your accepting any such warranty or additional liability. 174 | 175 | END OF TERMS AND CONDITIONS 176 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | UNWARCIT: WARC (and WACZ) Unzipping Library 2 | ======================================== 3 | 4 | Background 5 | ---------- 6 | 7 | This library provides a command line interface to unzip warc and wacz files. 8 | 9 | Builds off of the [warcio library]() to read and validate warc files and the [py-wacz library]() to validate wacz files. 10 | 11 | Both libraries are provided by 12 | [Webrecorder](https://github.com/webrecorder/webrecorder) 13 | 14 | Setup 15 | ---------- 16 | Install by cloning the repo and then running: ``python3 setup.py install`` 17 | 18 | You can now run the tool like so: 19 | `` unwarcit metro_capture2.wacz data.warc --output myfolder `` 20 | 21 | You can pass a single file or a list of files, either warc or wacz, separated by spaces to unwarcit by placing them after the unwarcit command. 22 | `` unwarcit warcfile1.warc warcfile2.warc waczfile.wacz`` 23 | 24 | Configuration Options 25 | ---------- 26 | 27 |
28 | Unwarcit currently accepts the following parameters: 29 | 30 | ``` 31 | --help Show help [str] 32 | --version Show version number [int] 33 | --output The folder to output the results to [str] 34 | ``` 35 |
36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wacz 2 | uuid 3 | warcio 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | from setuptools.command.test import test as TestCommand 5 | 6 | __version__ = "0.0.0" 7 | 8 | 9 | class PyTest(TestCommand): 10 | def finalize_options(self): 11 | TestCommand.finalize_options(self) 12 | 13 | def run_tests(self): 14 | import pytest 15 | import sys 16 | import os 17 | 18 | errcode = pytest.main( 19 | ["--doctest-modules", "./unwarcit", "--cov", "unwarcit", "-v", "test/"] 20 | ) 21 | sys.exit(errcode) 22 | 23 | 24 | setup( 25 | name="unwarcit", 26 | version=__version__, 27 | author="Emma Dickson", 28 | author_email="emma.jk.dickson@gmail.com", 29 | packages=find_packages(), 30 | license="Apache 2.0", 31 | url="https://github.com/emmadickson/unwarcit", 32 | description="Unzip and Access Files in Web Archives (WARC) and WACZ Files", 33 | entry_points=""" 34 | [console_scripts] 35 | unwarcit = unwarcit.main:main 36 | """, 37 | ) 38 | -------------------------------------------------------------------------------- /unwarcit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/emmadickson/unwarcit/962758a1ed85813aae4b5e13a65b129b57b559b7/unwarcit/__init__.py -------------------------------------------------------------------------------- /unwarcit/main.py: -------------------------------------------------------------------------------- 1 | from unwarcit.unwarcit import Unwarcit 2 | from unwarcit.util import get_version 3 | import sys, datetime 4 | from argparse import ArgumentParser, RawTextHelpFormatter 5 | 6 | 7 | # ============================================================================ 8 | def main(args=None): 9 | if sys.version_info < (2, 7): 10 | print( 11 | "Unwarcit requires python >= 2.7, you are running {0}".format( 12 | sys.version.split(" ")[0] 13 | ) 14 | ) 15 | return 1 16 | 17 | parser = ArgumentParser( 18 | description="Unzip Warc and Wacz files into individual components" 19 | ) 20 | 21 | parser.add_argument("--version", action="version", version=get_version()) 22 | 23 | parser.add_argument("inputs", nargs="+", help="""Paths of files to be unpacked.""") 24 | 25 | parser.add_argument( 26 | "--output", 27 | help="""Path where the results will be placed. Default is 'output'.""", 28 | default=f"output_{datetime.datetime.now().strftime('%Y-%m-%d:%s')}", 29 | ) 30 | 31 | arguments = parser.parse_args(args=args) 32 | 33 | return Unwarcit(arguments.inputs, arguments.output).unzip() 34 | 35 | 36 | # ============================================================================ 37 | if __name__ == "__main__": 38 | res = main() 39 | sys.exit(res) 40 | -------------------------------------------------------------------------------- /unwarcit/unwarcit.py: -------------------------------------------------------------------------------- 1 | import os, gzip, json, zipfile, pathlib, uuid 2 | from wacz.validate import Validation, OUTDATED_WACZ 3 | from unwarcit.util import get_version, generate_datapackage, is_gz_file, write_out_file 4 | from os.path import isfile, join, exists 5 | from warcio.archiveiterator import ArchiveIterator 6 | from warcio.checker import Checker 7 | from wacz.util import WACZ_VERSION 8 | from warcio.exceptions import ArchiveLoadFailed 9 | 10 | # ============================================================================ 11 | class Unwarcit: 12 | def __init__(self, inputs, output): 13 | self.inputs = inputs 14 | self.output = output 15 | self.verbose = False # Hack because the imported library wants a self.verbose option 16 | self.unwarc = {} 17 | 18 | def unzip(self): 19 | """ 20 | unzips the passed list of warcs and wacz files 21 | """ 22 | valid_files = self.validate_passed_files(self.inputs) 23 | print("All files successfully validated") 24 | if valid_files == 1: 25 | return print("File failed to be validated") 26 | 27 | for key in self.inputs: 28 | self.unwarc[key] = {} 29 | self.unwarc = self.identify_file_formats(key, self.unwarc) 30 | current_file = self.unwarc[key] 31 | 32 | if current_file["format"] == "wacz": 33 | print(f"\nAnalyzing file {key}") 34 | warc_file = self.unzip_wacz(key, self.output) 35 | file_path = f"{self.output}/{key}/unpacked_wacz/archive/{warc_file}" 36 | 37 | else: 38 | warc_file = current_file 39 | print(f"\nAnalyzing file {key}") 40 | file_path = key 41 | 42 | self.unwrap_warc(key, file_path, self.unwarc) 43 | 44 | print("\nGenerating Datapackage.json file") 45 | datapackage = generate_datapackage(self.unwarc, key, self.output) 46 | datapackage_file = open(f"{self.output}/{key}/datapackage.json", "w") 47 | datapackage_file.write(datapackage) 48 | 49 | def identify_file_formats(self, file, unwarc): 50 | """ 51 | Updates the self.unwarcit object file format and zipped status 52 | 53 | Looks at the passed file and creates it's entry in self.unwarcit with the format 54 | 55 | unwarc[file] = { 56 | "format": str, 57 | "file_is_zipped": boolean, 58 | "zipped_extension": str, 59 | } 60 | 61 | Parameters 62 | ---------- 63 | file : str 64 | Description of arg1 65 | unwarc : dict 66 | Description of arg2 67 | 68 | Returns 69 | ------- 70 | boolean 71 | Returns True 72 | 73 | """ 74 | zip_format = file.split(".")[-1] 75 | format = file.split(".")[1] 76 | if zip_format == format: 77 | file_is_zipped = False 78 | zipped_extension = None 79 | else: 80 | file_is_zipped = True 81 | zipped_extension = zip_format 82 | unwarc[file] = { 83 | "format": format, 84 | "file_is_zipped": file_is_zipped, 85 | "zipped_extension": zipped_extension, 86 | } 87 | return unwarc 88 | 89 | def unzip_wacz(self, filepath, output): 90 | """ 91 | unzips a wacz file allowing us to access original warc 92 | 93 | Parameters 94 | ---------- 95 | filepath : str 96 | filepath to be opened and checked. 97 | output : str 98 | output folder to place unpacked wacz 99 | Returns 100 | ------- 101 | str 102 | path of warc file to analyze 103 | 104 | """ 105 | print(f"Unzipping wacz file {filepath}") 106 | with zipfile.ZipFile(filepath, "r") as zip_ref: 107 | pathlib.Path(f"{output}/{filepath}/unpacked_wacz/").mkdir( 108 | parents=True, exist_ok=True 109 | ) 110 | zip_ref.extractall(f"{output}/{filepath}/unpacked_wacz/") 111 | 112 | warc_files = [ 113 | f for f in os.listdir(f"{output}/{filepath}/unpacked_wacz/archive") 114 | ] 115 | if len(warc_files) != 1: 116 | print( 117 | "More than one warc detected in this wacz file, combining automatically" 118 | ) 119 | # TODO: combine 120 | 121 | return warc_files[0] 122 | 123 | def unwrap_warc(self, file, file_path, unwarc): 124 | """ 125 | reads a warc file and the conents of it's targets then writes them out to file and unwarc object 126 | 127 | Parameters 128 | ---------- 129 | filepath : str 130 | filepath to be opened and checked. 131 | output : str 132 | output folder to place unpacked wacz 133 | Returns 134 | ------- 135 | str 136 | path of warc file to analyze 137 | 138 | """ 139 | print(f"Beginning to unwrap warc file {file}") 140 | is_gz = is_gz_file(file_path) 141 | if is_gz: 142 | opened_warc = gzip.open(file_path, "rb") 143 | else: 144 | opened_warc = open(file_path, "rb") 145 | with opened_warc as stream: 146 | unwarc[file]["found_files"] = [] 147 | for record in ArchiveIterator(stream): 148 | if record.rec_type == "response": 149 | name = record.rec_headers.get_header("WARC-Target-URI") 150 | file_name = name.split("/")[-1].split("?")[0].split("@")[0] 151 | content = record.content_stream().read() 152 | content_length = len(content) 153 | file_uuid = record.rec_headers.get_header("WARC-Record-ID") 154 | file_uuid = file_uuid.split(":")[-1][0:-1] 155 | 156 | if (content_length != 0): 157 | 158 | if (file_name == '' and content_length > 0): 159 | file_name = str(uuid.uuid4()) 160 | #print(f"\nA file with no name but some content has been detected, it will be stored as an 'unrecognized' type with the name {file_name}") 161 | 162 | file_name = (file_name[-30:]) if len(file_name) > 250 else file_name 163 | 164 | fetch_type = file_name.split(".") 165 | if len(fetch_type) > 1: 166 | fetch_type = fetch_type[-1] 167 | elif file_name in ['css', 'html', 'jpg', 'js', 'json', 'php', 'png', 'svg']: 168 | fetch_type = file_name 169 | else: 170 | fetch_type = 'unrecognized' 171 | 172 | 173 | unwarc[file]["found_files"].append( 174 | { 175 | "url": name, 176 | "file_name": file_name, 177 | "detected_type": fetch_type, 178 | "content": content, 179 | } 180 | ) 181 | 182 | write_out_file(self.output, file, fetch_type, file_name, content, file_uuid) 183 | 184 | def validate_wacz(self, file): 185 | """ 186 | validate a passed wacz 187 | 188 | Parameters 189 | ---------- 190 | file : str 191 | file to be opened and checked. 192 | 193 | Returns 194 | ------- 195 | int 196 | 1 if it fails 0 if its successful 197 | 198 | """ 199 | print(f"Wacz file detected {file}, attempting validation") 200 | validate = Validation(file) 201 | version = validate.version 202 | validation_tests = [] 203 | 204 | if version == OUTDATED_WACZ: 205 | print("Validation Succeeded the passed Wacz is outdate but valid") 206 | return 0 207 | 208 | elif version == WACZ_VERSION: 209 | validation_tests += [ 210 | validate.check_required_contents, 211 | validate.frictionless_validate, 212 | validate.check_file_paths, 213 | validate.check_file_hashes, 214 | ] 215 | else: 216 | print("Validation Failed the passed Wacz is invalid") 217 | return 1 218 | 219 | for func in validation_tests: 220 | success = func() 221 | if success is False: 222 | print("Validation Failed the passed Wacz is invalid") 223 | return 1 224 | 225 | def validate_warc(self, file): 226 | """ 227 | validate a passed warc 228 | 229 | Parameters 230 | ---------- 231 | file : str 232 | file to be opened and checked. 233 | 234 | Returns 235 | ------- 236 | int 237 | 1 if it fails 0 if its successful 238 | 239 | """ 240 | print(f"Warc file detected {file}, attempting validation") 241 | 242 | try: 243 | Checker.process_one(self, file) 244 | except ArchiveLoadFailed as e: 245 | logging.ingo(filename) 246 | print(" saw exception ArchiveLoadFailed: " + str(e).rstrip()) 247 | print(" skipping rest of file") 248 | return 1 249 | return 0 250 | 251 | def validate_passed_files(self, files): 252 | """ 253 | validates the passed list of files. Checks if they exist and then applies either warc or wacz validation. 254 | 255 | Parameters 256 | ---------- 257 | files : list 258 | list of files to be opened and checked. 259 | 260 | Returns 261 | ------- 262 | int 263 | 1 if it fails 0 if its successful 264 | 265 | """ 266 | all_files_are_valid = 0 267 | 268 | for file in files: 269 | file_exists = exists(file) 270 | print(f"file {file} exists {file_exists}") 271 | 272 | if file_exists: 273 | format = file.split(".")[1] 274 | if format == "wacz": 275 | valid_wacz = self.validate_wacz(file) 276 | all_files_are_valid = 1 if valid_wacz == 1 else 0 277 | elif format == "warc": 278 | valid_warc = self.validate_warc(file) 279 | all_files_are_valid = 1 if valid_warc == 1 else 0 280 | else: 281 | all_files_are_valid = 1 282 | return all_files_are_valid 283 | -------------------------------------------------------------------------------- /unwarcit/util.py: -------------------------------------------------------------------------------- 1 | import pathlib, datetime, json, pkg_resources 2 | from wacz.util import WACZ_VERSION, support_hash_file, now 3 | from os.path import exists 4 | 5 | def get_version(): 6 | """ 7 | Returns the current version of unwarcit 8 | 9 | Parameters 10 | ---------- 11 | None 12 | 13 | Returns 14 | ------- 15 | str 16 | version of unwarcit being used 17 | 18 | """ 19 | return "%(prog)s " + pkg_resources.get_distribution("unwarcit").version 20 | 21 | 22 | def generate_datapackage(unwarc, key, output, hash_type="md5"): 23 | """ 24 | Generates a datapackage.json file 25 | 26 | The datapackage is Frictionless compliant 27 | 28 | Parameters 29 | ---------- 30 | unwarc : dict 31 | object storing info about the files we're recovering 32 | key : str 33 | the key for the file we need to process in the unwarc dict 34 | output : str 35 | The output folder for the files 36 | hash_type : The hash algorithm used to hash the file contents 37 | 38 | Returns 39 | ------- 40 | json 41 | content of data_package file 42 | 43 | """ 44 | unwarc_record = unwarc[key] 45 | package_dict = {} 46 | 47 | package_dict["profile"] = "data-package" 48 | package_dict["resources"] = [] 49 | for i in range(0, len(unwarc_record["found_files"])): 50 | file = unwarc_record["found_files"][i] 51 | package_dict["resources"].append({}) 52 | package_dict["resources"][i]["name"] = file["file_name"].lower() 53 | package_dict["resources"][i]["detected_type"] = file["detected_type"] 54 | package_dict["resources"][i][ 55 | "stored_path" 56 | ] = f"{output}/{key}/{file['detected_type']}/{file['file_name']}" 57 | package_dict["resources"][i]["url"] = file["url"] 58 | content = file["content"] 59 | package_dict["resources"][i]["hash"] = support_hash_file(hash_type, content) 60 | package_dict["resources"][i]["bytes"] = len(content) 61 | 62 | package_dict["created"] = datetime.datetime.utcnow().strftime( 63 | "%Y-%m-%dT%H:%M:%SZ" 64 | ) 65 | 66 | package_dict["wacz_version"] = WACZ_VERSION 67 | 68 | package_dict["software"] = "unwarcit " + get_version() 69 | 70 | return json.dumps(package_dict, indent=2) 71 | 72 | 73 | def is_gz_file(filepath): 74 | """ 75 | Identifies if a file is .gz zipped or not 76 | 77 | Parameters 78 | ---------- 79 | filepath : str 80 | filepath to be opened and checked. 81 | 82 | Returns 83 | ------- 84 | boolean 85 | True if the file is .gz zipped, otherwise False 86 | 87 | """ 88 | with open(filepath, "rb") as test_f: 89 | return test_f.read(2) == b"\x1f\x8b" 90 | 91 | 92 | def write_out_file(output, original_file, file_path, file_name, content, file_uuid): 93 | """ 94 | Writes the passed information to the given file path 95 | Parameters 96 | ---------- 97 | key : str 98 | The name of the original warc or wacz file 99 | file_path : str 100 | The output folder where we will place the file 101 | file_name : str 102 | individual file name 103 | content : str 104 | content to be written to file 105 | file_uuid: str 106 | uuid to be added to file name if it already exists 107 | Returns 108 | ------- 109 | None 110 | 111 | """ 112 | path = f"{output}/{original_file}/downloaded_files/{file_path}" 113 | pathlib.Path(path).mkdir(parents=True, exist_ok=True) 114 | if (exists(f"{path}/{file_name}")): 115 | file_name = file_uuid + file_name 116 | f = open(f"{path}/{file_name}", "wb") 117 | print(f"Writing out {file_name} to {path}/{file_name}") 118 | f.write(content) 119 | return 120 | --------------------------------------------------------------------------------