├── tests ├── __init__.py ├── fixtures │ ├── pages │ │ ├── invalid.txt │ │ ├── invalid.jsonl │ │ ├── pages.jsonl │ │ └── extraPages.jsonl │ ├── .gitignore │ ├── example-iana.warc │ ├── valid_example_1.wacz │ ├── valid_example_2.wacz │ ├── example-collection.warc │ ├── example-resource.warc.gz │ ├── invalid_example_1.wacz │ ├── invalid_signed_example_1.wacz │ ├── valid_signed_example_1.wacz │ ├── example-warcinfo-metadata.warc │ ├── example-collection-with-lists.warc │ └── logs │ │ ├── wr-crawl.log │ │ └── wr-specs-crawl.log ├── test_verify_signed.py ├── test_util.py ├── test_create_wacz_hash_in_page.py ├── test_wacz_indexer_functions.py ├── test_create_wacz.py ├── test_validate_wacz.py ├── test_create_wacz_indexing.py └── test_optional_flags_wacz.py ├── wacz ├── __init__.py ├── __main__.py ├── util.py ├── validate.py ├── main.py └── waczindexer.py ├── requirements.txt ├── codecov.yml ├── .coveragerc ├── .gitignore ├── .github └── workflows │ ├── publish_pypi.yaml │ └── ci.yaml ├── setup.py ├── LICENSE ├── CHANGES.md └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wacz/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/fixtures/pages/invalid.txt: -------------------------------------------------------------------------------- 1 | Not a JSONL file 2 | -------------------------------------------------------------------------------- /wacz/__main__.py: -------------------------------------------------------------------------------- 1 | from wacz.main import main 2 | 3 | main() 4 | -------------------------------------------------------------------------------- /tests/fixtures/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore 4 | !*.warc 5 | !*.warc.gz 6 | !*.wacz 7 | !*.jsonl 8 | !pages/* 9 | -------------------------------------------------------------------------------- /tests/fixtures/example-iana.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-iana.warc -------------------------------------------------------------------------------- /tests/fixtures/valid_example_1.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/valid_example_1.wacz -------------------------------------------------------------------------------- /tests/fixtures/valid_example_2.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/valid_example_2.wacz -------------------------------------------------------------------------------- /tests/fixtures/example-collection.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-collection.warc -------------------------------------------------------------------------------- /tests/fixtures/example-resource.warc.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-resource.warc.gz -------------------------------------------------------------------------------- /tests/fixtures/invalid_example_1.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/invalid_example_1.wacz -------------------------------------------------------------------------------- /tests/fixtures/invalid_signed_example_1.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/invalid_signed_example_1.wacz -------------------------------------------------------------------------------- /tests/fixtures/valid_signed_example_1.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/valid_signed_example_1.wacz -------------------------------------------------------------------------------- /tests/fixtures/example-warcinfo-metadata.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-warcinfo-metadata.warc -------------------------------------------------------------------------------- /tests/fixtures/example-collection-with-lists.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-collection-with-lists.warc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | frictionless>=3.23.4 2 | shortuuid>=1.0.1 3 | cdxj-indexer>=1.4.4 4 | boilerpy3>=1.0.2 5 | pytest-cov>=2.10.1 6 | PyYAML>=5.3.1 7 | black>=20.8b1 8 | jsonlines>=3.0.0 9 | click>=8.0.0 10 | typer==0.11.1 -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | # basic 6 | target: 80% 7 | threshold: 2% 8 | base: auto 9 | 10 | ignore: 11 | - "tests/*" 12 | - "wacz/__init__.py" 13 | - "wacz/__main__.py" 14 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = wacz 3 | branch = True 4 | omit = 5 | */test/* 6 | */tests/* 7 | *.html 8 | *.js 9 | *.css 10 | 11 | [report] 12 | exclude_lines = 13 | pragma: no cover 14 | if __name__ == .__main__.: 15 | def __repr__ 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /tests/fixtures/pages/invalid.jsonl: -------------------------------------------------------------------------------- 1 | {id": "extra-pages", "title": "Extra Pages"} 2 | {"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": null, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} 3 | -------------------------------------------------------------------------------- /tests/fixtures/pages/pages.jsonl:
-------------------------------------------------------------------------------- 1 | {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"} 2 | {"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"} 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | .eggs 13 | *.cache 14 | parts 15 | bin 16 | var 17 | sdist 18 | develop-eggs 19 | .installed.cfg 20 | lib 21 | lib64 22 | __pycache__ 23 | 24 | # Installer logs 25 | pip-log.txt 26 | 27 | # Unit test / coverage reports 28 | .coverage 29 | .tox 30 | nosetests.xml 31 | 32 | # Translations 33 | *.mo 34 | 35 | .DS_Store 36 | **/.DS_Store 37 | 38 | # Mr Developer 39 | .mr.developer.cfg 40 | .project 41 | .pydevproject 42 | 43 | Pipfile* 44 | -------------------------------------------------------------------------------- /tests/fixtures/pages/extraPages.jsonl: -------------------------------------------------------------------------------- 1 | {"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"} 2 | {"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"} 3 | {"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} 4 | {"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} 5 | -------------------------------------------------------------------------------- /.github/workflows/publish_pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | pypi-release: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.9] 12 | 13 | steps: 14 | - name: checkout 15 | uses: actions/checkout@v1 16 | 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | 22 | - name: Install dependencies 23 | run: python -m pip install --upgrade pip wheel twine 24 | 25 | - name: Build Dist 26 | run: python setup.py sdist bdist_wheel 27 | 28 | - name: Publish package to PyPI 29 | uses: pypa/gh-action-pypi-publish@master 30 | with: 31 | user: __token__ 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 | 34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # vim: set sw=4 et: 3 | from setuptools import setup, find_packages 4 | 5 | __version__ = "0.5.0" 6 | 7 | def load_requirements(filename): 8 | with open(filename, "rt") as fh: 9 | return fh.read().rstrip().split("\n") 10 | 11 |
def long_description(): 12 | with open("README.md") as f: 13 | return f.read() 14 | 15 | setup( 16 | name="wacz", 17 | version=__version__, 18 | author="Ilya Kreymer, Emma Dickson", 19 | author_email="info@webrecorder.net", 20 | license="Apache 2.0", 21 | packages=find_packages(exclude=["tests"]), 22 | url="https://github.com/webrecorder/py-wacz", 23 | description="WACZ Format Tools", 24 | long_description=long_description(), 25 | long_description_content_type="text/markdown", 26 | install_requires=load_requirements("requirements.txt"), 27 | extras_require={"signing": ["authsign>=0.5.1", "requests"]}, 28 | zip_safe=True, 29 | setup_requires=["pytest-runner"], 30 | entry_points=""" 31 | [console_scripts] 32 | wacz = wacz.main:main 33 | """, 34 | ) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Webrecorder 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | unit-tests: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | max-parallel: 3 9 | matrix: 10 | python-version: ['3.8', '3.9', '3.10'] 11 | 12 | steps: 13 | - name: checkout 14 | uses: actions/checkout@v2 15 | 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install --upgrade -r requirements.txt 25 | python setup.py -q install 26 | pip install -e .[signing] 27 | 28 | - name: Style Check 29 | run: | 30 | black --check tests/* 31 | black --check wacz/* 32 | 33 | - name: Test with pytest 34 | run: | 35 | set -e 36 | pytest --cov-config=.coveragerc 37 | pytest -v --cov=wacz --cov-report=xml 38 | 39 | - name: Upload coverage to Codecov 40 | uses: codecov/codecov-action@v1 41 | with: 42 | verbose: true 43 | -------------------------------------------------------------------------------- /tests/test_verify_signed.py: -------------------------------------------------------------------------------- 1 | import unittest, os 2 | from wacz.main import main 3 | 4 | 5 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 6 | 7 | 8 | class TestVerifySigned(unittest.TestCase): 9 | def test_wacz_valid_and_verify_sig(self): 10 | self.assertEqual( 11 | main( 12 | [ 13 | "validate", 14 | "--verify-auth", 15 | "-f", 16 | os.path.join(TEST_DIR, "valid_signed_example_1.wacz"), 17 | ] 18 | ), 19 | 0, 20 | ) 21 | 22 | def test_wacz_valid_and_not_valid_sig(self): 23 | self.assertEqual( 24 | main( 25 | [ 26 | "validate", 27 | "--verify-auth", 28 | "-f", 29 | os.path.join(TEST_DIR, "invalid_signed_example_1.wacz"), 30 | ] 31 | ), 32 | 1, 33 | ) 34 | 35 | def test_wacz_valid_not_signed(self): 36 | self.assertEqual( 37 | main( 38 | [ 39 | "validate", 40 | "--verify-auth", 41 | "-f", 42 | os.path.join(TEST_DIR, "valid_example_1.wacz"), 43 | ] 44 | ), 45 | 1, 46 | ) 47 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import os 4 | import zipfile, json, gzip, hashlib 5 | from io import BytesIO 6 | 7 | from wacz.util import hash_stream, validateJSON 8 | 9 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 10 | 11 | 12 | class TestUtilFunctions(unittest.TestCase): 13 | def test_util_hash(self): 14 | """When invoking the util hash method a hash should be returned""" 15 | test_hash = "sha256:%s" % hashlib.sha256("test".encode("utf-8")).hexdigest() 16 | bytes_, hash_ = hash_stream("sha256", BytesIO("test".encode("utf-8"))) 17 | self.assertEqual(bytes_, 4) 18 | self.assertEqual(hash_, test_hash) 19 | 20 | test_hash = "md5:%s" % hashlib.md5("test".encode("utf-8")).hexdigest() 21 | bytes_, hash_ = hash_stream("md5", BytesIO("test".encode("utf-8"))) 22 | self.assertEqual(bytes_, 4) 23 | self.assertEqual(hash_, test_hash) 24 | 25 | def test_util_validate_json_succeed(self): 26 | """validate json method should succeed with valid json""" 27 | self.assertTrue(validateJSON('{"test": "test"}')) 28 | 29 | def test_util_validate_json_fail(self): 30 | 
"""validate json method should fail with valid json""" 31 | self.assertFalse(validateJSON('test": "test"}')) 32 | 33 | 34 | if __name__ == "__main__": 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # 0.4.8 2 | 3 | - Add -l/--log-directory option to add logs directory to WACZ 4 | 5 | # 0.4.7 6 | 7 | - include request cookie in cdxj via 'req.http:cookie' field (#27) 8 | - fix Click dependency version 9 | 10 | # 0.4.6 11 | 12 | - wacz zip write: ensure zip file is fully closed on exit (fixes #20 13 | - ci: add ci for py3.10 14 | - wacz create: support --url, --detect-pages and --split-seeds to write detect pages to extraPages.jsonl, specified seed to pages.jsonl 15 | - text extract: don't raise exception, keep parsed text 16 | 17 | # 0.4.5 18 | - Pages: also ignore pages with invalid utf-8 encoding 19 | 20 | # 0.4.4 21 | 22 | - Pages: read pages line by line in case of large pages file 23 | 24 | # 0.4.3 25 | 26 | - Pages: Better page parsing fix, more lenient on page parsing errors: print error and continue, ignoring invalid page 27 | 28 | # 0.4.2 29 | 30 | - Pages: Fix parsing of page URLs that contain extra ':' 31 | 32 | # 0.4.1 33 | 34 | - More efficient hash computation 35 | 36 | # 0.4.0 37 | 38 | - Add support for signing and verification! 39 | 40 | # 0.3.1 41 | 42 | - Ensure passed in pages are check via both http and https URLs 43 | - Update to cdxj-indexer 1.4.1, supporting improved indexing of JSON POST requests 44 | 45 | # 0.3.0 46 | 47 | - Add `name` field to `resources` for better compatibility with frictionless spec. 48 | 49 | # wacz 0.3.0b1 50 | 51 | Improved compatibility with frictionless data spec 52 | 53 | - Top-level `title`, `description`, `created`, `software` fields and optional `mainPageURL` and `mainPageTS` fields. 
54 | - Include full WARC record digests in `recordDigest` field in CDX, `digest` in IDX 55 | - Support for `pages/extraPages.jsonl` passed in via --extra-pages/-e flag 56 | -------------------------------------------------------------------------------- /tests/test_create_wacz_hash_in_page.py: -------------------------------------------------------------------------------- 1 | import unittest, os, zipfile, sys, gzip, json, tempfile 2 | from wacz.main import main, now 3 | from unittest.mock import patch 4 | from wacz.util import hash_stream 5 | from frictionless import validate, Report 6 | 7 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 8 | 9 | 10 | class TestWaczFor(unittest.TestCase): 11 | @classmethod 12 | @patch("wacz.main.now") 13 | def setUpClass(self, mock_now): 14 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 15 | self.tmpdir = tempfile.TemporaryDirectory() 16 | with open(os.path.join(self.tmpdir.name, "test-pages.jsonl"), "wt") as fh: 17 | fh.write('{"format": "json-pages-1.0", "id": "pages", "title": "Pages"}\n') 18 | fh.write( 19 | '{"id": "abcdef", "url": "https://www.example.com/#hashtag", "title": "Example", "loadState": 4}\n' 20 | ) 21 | 22 | main( 23 | [ 24 | "create", 25 | "-f", 26 | os.path.join(TEST_DIR, "example-collection.warc"), 27 | "-p", 28 | os.path.join(self.tmpdir.name, "test-pages.jsonl"), 29 | "-o", 30 | os.path.join(self.tmpdir.name, "example-custom-page.wacz"), 31 | ] 32 | ) 33 | 34 | def test_hash(self): 35 | with zipfile.ZipFile( 36 | os.path.join(self.tmpdir.name, "example-custom-page.wacz"), "r" 37 | ) as zip_ref: 38 | zip_ref.extract( 39 | "pages/pages.jsonl", 40 | os.path.join(self.tmpdir.name, "extract-custom-page"), 41 | ) 42 | zip_ref.close() 43 | 44 | with open( 45 | os.path.join( 46 | self.tmpdir.name, "extract-custom-page", "pages", "pages.jsonl" 47 | ), 48 | "rt", 49 | ) as f: 50 | content = f.read() 51 | 52 | assert ( 53 | content 54 | == """\ 55 | {"format": "json-pages-1.0", "id": "pages", "title": "Pages"} 56 | {"id": "abcdef", "url": "https://www.example.com/#hashtag", "title": "Example", "loadState": 4, "ts": "2020-10-07T21:22:36Z"} 57 | """ 58 | ) 59 | -------------------------------------------------------------------------------- /tests/fixtures/logs/wr-crawl.log: -------------------------------------------------------------------------------- 1 | {"logLevel":"info","timestamp":"2023-02-23T23:44:39.665Z","context":"general","message":"Page context being used with 1 worker","details":{}} 2 | {"logLevel":"info","timestamp":"2023-02-23T23:44:39.666Z","context":"general","message":"Set netIdleWait to 15 seconds","details":{}} 3 | {"logLevel":"info","timestamp":"2023-02-23T23:44:39.666Z","context":"general","message":"Seeds","details":[{"url":"https://webrecorder.net/","include":[],"exclude":[],"scopeType":"page","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]} 4 | {"logLevel":"info","timestamp":"2023-02-23T23:44:40.016Z","context":"state","message":"Storing state in memory","details":{}} 5 | {"logLevel":"info","timestamp":"2023-02-23T23:44:40.473Z","context":"general","message":"Text Extraction: Disabled","details":{}} 6 | {"logLevel":"info","timestamp":"2023-02-23T23:44:40.590Z","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":0,"total":1,"pending":1,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://webrecorder.net/\",\"seedId\":0,\"depth\":0,\"started\":\"2023-02-23T23:44:40.517Z\"}"]}} 7 | 
{"logLevel":"error","timestamp":"2023-02-23T23:44:43.279Z","context":"general","message":"Invalid Seed \"mailto:info@webrecorder.net\" - URL must start with http:// or https://","details":{}} 8 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.286Z","context":"behavior","message":"Behaviors started","details":{"behaviorTimeout":90,"page":"https://webrecorder.net/"}} 9 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.287Z","context":"behavior","message":"Run Script Started","details":{"url":"https://webrecorder.net/","page":"https://webrecorder.net/"}} 10 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.291Z","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://webrecorder.net/"}} 11 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.291Z","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!","page":"https://webrecorder.net/"}} 12 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.293Z","context":"behavior","message":"Run Script Finished","details":{"url":"https://webrecorder.net/","page":"https://webrecorder.net/"}} 13 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.293Z","context":"behavior","message":"Behaviors finished","details":{"finished":1,"page":"https://webrecorder.net/"}} 14 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.293Z","context":"pageStatus","message":"Page finished","details":{"page":"https://webrecorder.net/"}} 15 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.358Z","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":1,"total":1,"pending":0,"limit":{"max":0,"hit":false},"pendingPages":[]}} 16 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.358Z","context":"general","message":"Waiting to ensure pending data is written to WARCs...","details":{}} 17 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.364Z","context":"general","message":"Crawl status: done","details":{}} 18 | -------------------------------------------------------------------------------- /tests/test_wacz_indexer_functions.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import os 4 | import zipfile, json, gzip 5 | from wacz.main import main, now 6 | from wacz.waczindexer import WACZIndexer 7 | 8 | PAGE_INDEX = "pages/pages.jsonl" 9 | 10 | 11 | def match_detected_pages(self, detected_pages, passed_pages_url, passed_pages_ts): 12 | for page in detected_pages: 13 | page = detected_pages[page] 14 | url = page["url"] 15 | ts = page["timestamp"] 16 | if passed_pages_url == url and passed_pages_ts == None: 17 | return page 18 | if passed_pages_url == url and passed_pages_ts == ts: 19 | return page 20 | return 0 21 | 22 | 23 | class TestWaczIndexerFunctions(unittest.TestCase): 24 | def test_match_detected_page_invalid(self): 25 | """When passed invalid urls and invalid timestamps the function should return 0""" 26 | detected_pages = { 27 | "20201007212236/http://www.example.com/": { 28 | "url": "http://www.example.com/", 29 | "timestamp": "20201007212236", 30 | "title": "Example Domain", 31 | "rec": "fbt5hqmtseanlxzt", 32 | "id": "1db0ef709a", 33 | "text": "Example Domain\nThis domain is for use in illustrative examples in documents. 
You may use this\n domain in literature without prior coordination or asking for permission.\n", 34 | } 35 | } 36 | self.assertEqual( 37 | match_detected_pages(self, detected_pages, "fake_url", None), 0 38 | ) 39 | self.assertEqual( 40 | match_detected_pages(self, detected_pages, "fake_url", "fake-ts"), 41 | 0, 42 | ) 43 | 44 | def test_match_detected_page_valid(self): 45 | """When passed valid urls and valid timestamps the function should return the page""" 46 | detected_pages = { 47 | "20201007212236/http://www.example.com/": { 48 | "url": "http://www.example.com/", 49 | "timestamp": "20201007212236", 50 | "title": "Example Domain", 51 | "rec": "fbt5hqmtseanlxzt", 52 | "id": "1db0ef709a", 53 | "text": "Example Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\n", 54 | } 55 | } 56 | self.assertEqual( 57 | match_detected_pages(self, detected_pages, "http://www.example.com/", None), 58 | { 59 | "url": "http://www.example.com/", 60 | "timestamp": "20201007212236", 61 | "title": "Example Domain", 62 | "rec": "fbt5hqmtseanlxzt", 63 | "id": "1db0ef709a", 64 | "text": "Example Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\n", 65 | }, 66 | ) 67 | self.assertEqual( 68 | match_detected_pages( 69 | self, detected_pages, "http://www.example.com/", "20201007212236" 70 | ), 71 | { 72 | "url": "http://www.example.com/", 73 | "timestamp": "20201007212236", 74 | "title": "Example Domain", 75 | "rec": "fbt5hqmtseanlxzt", 76 | "id": "1db0ef709a", 77 | "text": "Example Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\n", 78 | }, 79 | ) 80 | 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /wacz/util.py: -------------------------------------------------------------------------------- 1 | import hashlib, datetime, json, os 2 | from warcio.timeutils import iso_date_to_timestamp 3 | import pkg_resources 4 | 5 | WACZ_VERSION = "1.1.1" 6 | 7 | 8 | BUFF_SIZE = 1024 * 64 9 | 10 | 11 | def check_http_and_https(url, ts, pages_dict): 12 | """Checks for http and https versions of the passed url 13 | in the pages dict 14 | :param url: the url to check; :param ts: the timestamp; :param pages_dict: the pages dict the user passed 15 | :returns: the matching key if a match was found, otherwise an empty string 16 | :rtype: str 17 | """ 18 | parts = url.split(":", 1) 19 | if len(parts) < 2: 20 | return "" 21 | 22 | url_body = parts[1] 23 | checks = [ 24 | f"http:{url_body}", 25 | f"https:{url_body}", 26 | f"{ts}/http:{url_body}", 27 | f"{ts}/https:{url_body}", 28 | ] 29 | 30 | for check in checks: 31 | if check in pages_dict: 32 | return check 33 | 34 | return "" 35 | 36 | 37 | def get_py_wacz_version(): 38 | """Get version of the py-wacz package""" 39 | return pkg_resources.get_distribution("wacz").version 40 | 41 | 42 | def hash_stream(hash_type, stream): 43 | """Hashes the stream with given hash_type hasher""" 44 | try: 45 | hasher = hashlib.new(hash_type) 46 | except: 47 | return 0, "" 48 | 49 | size = 0 50 | 51 | while True: 52 | buff = stream.read(BUFF_SIZE) 53 | size += len(buff) 54 | hasher.update(buff) 55 | if not buff: 56 | break 57 | 58 | return size, hash_type + ":" + hasher.hexdigest() 59 | 60 | 61 | def hash_file(type_, filename): 62 | with
open(filename, "rb") as fh: 63 | size_, hash_ = hash_stream(type_, fh) 64 | 65 | return hash_ 66 | 67 | 68 | def construct_passed_pages_dict(passed_pages_list): 69 | """Creates a dictionary of the passed pages, keyed by url (or 'ts/url' when a ts is present), with the title and text as values when they have been passed""" 70 | passed_pages_dict = {} 71 | 72 | for page_data in passed_pages_list: 73 | # Skip invalid page data 74 | try: 75 | page_dict = json.loads(page_data) 76 | except: 77 | print("Warning: Skipping invalid page {0}".format(page_data)) 78 | continue 79 | 80 | # Skip the file's header if it's been set 81 | if "format" not in page_dict: 82 | url = page_dict.get("url", "") 83 | 84 | # Set the default key to the url without the hashtag, so pages match 85 | # URLs without the hashtag, while keeping the hashtag in the page list 86 | key = url.split("#", 1)[0] 87 | 88 | # If timestamp is present overwrite the key to be 'ts/url' 89 | if "ts" in page_dict: 90 | key = iso_date_to_timestamp(page_dict.pop("ts")) + "/" + url 91 | 92 | # Add the key to the dictionary with remaining data 93 | passed_pages_dict[key] = page_dict 94 | 95 | return passed_pages_dict 96 | 97 | 98 | def now(): 99 | """Returns the current time""" 100 | return tuple(datetime.datetime.utcnow().timetuple()[:6]) 101 | 102 | 103 | def validateJSON(jsonData): 104 | """Attempts to validate a string as json""" 105 | try: 106 | json.loads(jsonData) 107 | except ValueError: 108 | return False 109 | return True 110 | 111 | 112 | def validate_pages_jsonl_file(json_file_path): 113 | """Attempt to validate a pages.jsonl file""" 114 | filename = os.path.basename(json_file_path) 115 | if not filename.lower().endswith(".jsonl"): 116 | return False 117 | 118 | line_index = 0 119 | 120 | with open(json_file_path, "r") as jsonl_file: 121 | for line in jsonl_file: 122 | try: 123 | data = json.loads(line) 124 | if line_index == 0: 125 | data["format"] 126 | data["id"] 127 | else: 128 | data["url"] 129 | data["ts"] 130 | line_index += 1 131 | except json.JSONDecodeError: 132 | print(f"File {filename} is invalid JSONL") 133 | return False 134 | except KeyError: 135 | print(f"File {filename} missing required fields") 136 | return False 137 | 138 | return True 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## py-wacz 2 | 3 | The **py-wacz** repository contains a Python module and command line utility 4 | for working with web archive data using the [WACZ] format specification. Web 5 | Archive Collection Zipped (WACZ) allows web archives to be shared and 6 | distributed by providing a predictable way of packaging up web archive data and 7 | metadata as a ZIP file. The **wacz** command line utility supports converting 8 | any [WARC] files into WACZ files, and optionally generating full-text search 9 | indices of pages. 10 | 11 | ## Install 12 | 13 | Use pip to install the module and a command line utility: 14 | 15 | ``` 16 | pip install wacz 17 | ``` 18 | 19 | Once installed you can use the **wacz** command line utility to *create* and *validate* WACZ files. 20 | 21 | ## Create 22 | 23 | To create a WACZ package you can point **wacz** at a WARC file and tell it 24 | where to write the WACZ with the `-o` option: 25 | 26 | ``` 27 | wacz create -o myfile.wacz <path/to/WARC> 28 | ``` 29 | 30 | The resulting `myfile.wacz` should be loadable via [ReplayWeb.page].
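The create command can also be invoked programmatically, which is how this repository's own test suite drives it (see tests/test_create_wacz.py). A minimal sketch; the WARC and WACZ file names here are placeholders:

```
from wacz.main import main

# Equivalent to: wacz create -f my-crawl.warc -o myfile.wacz
# main() returns 0 on success and a non-zero code on error.
exit_code = main(["create", "-f", "my-crawl.warc", "-o", "myfile.wacz"])
```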
31 | 32 | **wacz** accepts the following options for customizing how the WACZ file is assembled. 33 | 34 | ### -f --file 35 | 36 | Explicitly declare the file being passed to the create function. 37 | 38 | ``` 39 | wacz create -f tests/fixtures/example-collection.warc 40 | ``` 41 | 42 | ### -o --output 43 | 44 | Explicitly declare the name of the wacz being created. 45 | 46 | ``` 47 | wacz create tests/fixtures/example-collection.warc -o mywacz.wacz 48 | ``` 49 | 50 | ### -t --text 51 | 52 | Generates a pages.jsonl page index with a full-text index; must be run in conjunction with --detect-pages and has no effect if run alone. 53 | 54 | ``` 55 | wacz create tests/fixtures/example-collection.warc -t 56 | ``` 57 | 58 | ### --detect-pages 59 | 60 | Generates a pages.jsonl page index without a full-text index. 61 | 62 | ``` 63 | wacz create tests/fixtures/example-collection.warc --detect-pages 64 | ``` 65 | 66 | ### -p --pages 67 | 68 | Overrides the pages index generation with the passed jsonl pages. 69 | 70 | ``` 71 | wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl 72 | ``` 73 | 74 | ### -e --extra-pages 75 | 76 | Overrides the extra pages index generation with the passed extra jsonl pages. 77 | 78 | ``` 79 | wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl -e extra_pages.jsonl 80 | ``` 81 | 82 | ### -c --copy-pages 83 | 84 | Overrides the behavior of the --pages and --extra-pages options to copy existing pages.jsonl and/or extraPages.jsonl files as-is directly into the WACZ rather than attempting to match each page to a WARC record. The files are still parsed for basic correctness. 85 | 86 | ``` 87 | wacz create tests/fixtures/example-collection.warc --pages pages/pages.jsonl --extra-pages pages/extraPages.jsonl --copy-pages 88 | ``` 89 | 90 | ### -t --text 91 | 92 | You can add a full-text index by including the --text flag. 93 | 94 | ``` 95 | wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl --text 96 | ``` 97 | 98 | ### -l --log-directory 99 | 100 | Adds log files in the specified directory to the WACZ. 101 | 102 | ``` 103 | wacz create tests/fixtures/example-collection.warc -l tests/fixtures/logs 104 | ``` 105 | 106 | ### --ts 107 | 108 | Overrides the ts metadata value in the datapackage.json file. 109 | 110 | ``` 111 | wacz create tests/fixtures/example-collection.warc --ts TIMESTAMP 112 | ``` 113 | 114 | ### --url 115 | 116 | Overrides the url metadata value in the datapackage.json file. 117 | 118 | ``` 119 | wacz create tests/fixtures/example-collection.warc --url URL 120 | ``` 121 | 122 | ### --title 123 | 124 | Overrides the title metadata value in the datapackage.json file. 125 | 126 | ``` 127 | wacz create tests/fixtures/example-collection.warc --title TITLE 128 | ``` 129 | 130 | ### --desc 131 | 132 | Overrides the desc metadata value in the datapackage.json file. 133 | 134 | ``` 135 | wacz create tests/fixtures/example-collection.warc --desc DESC 136 | ``` 137 | 138 | ### --hash-type 139 | 140 | Allows the user to specify the hash type used (sha256 or md5). 141 | 142 | ``` 143 | wacz create tests/fixtures/example-collection.warc --hash-type md5 144 | ``` 145 | 146 | ### --signing-url 147 | 148 | An optional URL for a [WACZ signing server](https://github.com/webrecorder/authsign) which will be used to add a signature to the new WACZ. 149 | 150 | This URL should point to an authsign `/sign` API endpoint. 151 | 152 | See the section on `--verify-auth` for more info on signing and verification.
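For example, a WACZ can be created and signed in one step; the signing server URL and token below are placeholders for your own authsign deployment (`--signing-token` is described in the next section):

```
wacz create tests/fixtures/example-collection.warc -o signed.wacz --signing-url https://authsign.example.com/sign --signing-token YOUR_TOKEN
```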
153 | 154 | ### --signing-token 155 | 156 | An optional, secret token passed to the signing server to allow access. See `authsign` for more details. 157 | 158 | 159 | ## Validate 160 | 161 | You can also validate an existing WACZ file by running: 162 | 163 | ``` 164 | wacz validate myfile.wacz 165 | ``` 166 | 167 | ### -f --file 168 | 169 | Explicitly declare the file being passed to the validate function. 170 | 171 | ``` 172 | wacz validate -f tests/fixtures/valid_example_1.wacz 173 | ``` 174 | 175 | ### --verify-auth 176 | 177 | New in 0.4.0, this option additionally verifies that the WACZ is signed, using [authsign](https://github.com/webrecorder/authsign). 178 | 179 | The verification can be done locally, or via a remote signing/verification server. 180 | 181 | To use a remote server, add `--verifier-url`, which should be a URL pointing to the authsign `/verify` endpoint. 182 | 183 | To verify locally, `authsign` must be installed, which can be done by running `pip install wacz[signing]`. 184 | 185 | See the [WACZ Authentication Spec](https://github.com/webrecorder/wacz-auth-spec) for details on WACZ authentication. 186 | 187 | This feature and the specification are still in development (alpha-quality) and are subject to change. 188 | 189 | 190 | 191 | ## Testing 192 | 193 | If you are developing wacz you can run the unit tests with [pytest]: 194 | 195 | ``` 196 | pytest tests 197 | ``` 198 | 199 | [WACZ]: https://github.com/webrecorder/wacz-format 200 | [WARC]: https://en.wikipedia.org/wiki/Web_ARChive 201 | [ReplayWeb.page]: https://replayweb.page 202 | [pytest]: https://docs.pytest.org/ 203 | -------------------------------------------------------------------------------- /tests/test_create_wacz.py: -------------------------------------------------------------------------------- 1 | import unittest, os, zipfile, sys, gzip, json, tempfile 2 | from wacz.main import main, now 3 | from unittest.mock import patch 4 | from wacz.util import hash_file 5 | from frictionless import validate, Report 6 | 7 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 8 | 9 | 10 | class TestWaczFormat(unittest.TestCase): 11 | def find_resource(self, resource_list, filename): 12 | for file in resource_list: 13 | if filename in file["path"]: 14 | return file 15 | 16 | @classmethod 17 | @patch("wacz.main.now") 18 | def setUpClass(self, mock_now): 19 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 20 | self.tmpdir = tempfile.TemporaryDirectory() 21 | main( 22 | [ 23 | "create", 24 | "-f", 25 | os.path.join(TEST_DIR, "example-collection.warc"), 26 | "-o", 27 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), 28 | "-l", 29 | os.path.join(TEST_DIR, "logs"), 30 | ] 31 | ) 32 | with zipfile.ZipFile( 33 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), "r" 34 | ) as zip_ref: 35 | zip_ref.extractall(os.path.join(self.tmpdir.name, "unzipped_wacz_1")) 36 | zip_ref.close() 37 | 38 | self.wacz_file = os.path.join(self.tmpdir.name, "valid_example_1.wacz") 39 | self.warc_file = os.path.join(TEST_DIR, "example-collection.warc") 40 | 41 | self.wacz_archive_warc = os.path.join( 42 | self.tmpdir.name, 43 | "unzipped_wacz_1/archive/example-collection.warc", 44 | ) 45 | self.wacz_index_cdx = os.path.join( 46 | self.tmpdir.name, 47 | "unzipped_wacz_1/indexes/index.cdx.gz", 48 | ) 49 | self.wacz_index_idx = os.path.join( 50 | self.tmpdir.name, 51 | "unzipped_wacz_1/indexes/index.idx", 52 | ) 53 | self.wacz_json = os.path.join( 54 | self.tmpdir.name, 55 | "unzipped_wacz_1/datapackage.json", 56 | ) 57 |
self.wacz_log = os.path.join( 58 | self.tmpdir.name, "unzipped_wacz_1/logs/wr-specs-crawl.log" 59 | ) 60 | self.wacz_second_log = os.path.join( 61 | self.tmpdir.name, "unzipped_wacz_1/logs/wr-crawl.log" 62 | ) 63 | 64 | def test_components(self): 65 | """Check that the basic components of a wacz file exist""" 66 | self.assertTrue( 67 | "example-collection.warc" 68 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/archive")) 69 | ) 70 | self.assertTrue( 71 | "index.cdx.gz" 72 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/indexes")) 73 | ) 74 | self.assertTrue( 75 | "index.idx" 76 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/indexes")) 77 | ) 78 | self.assertTrue( 79 | "pages.jsonl" 80 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/pages")) 81 | ) 82 | self.assertTrue( 83 | "datapackage.json" 84 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/")) 85 | ) 86 | 87 | def test_archive_structure(self): 88 | """Check that the hash of the original warc file matches that of the warc file in the archive folder""" 89 | original_warc = hash_file("sha256", self.warc_file) 90 | 91 | archive_warc = hash_file("sha256", self.wacz_archive_warc) 92 | 93 | self.assertEqual(original_warc, archive_warc) 94 | 95 | def test_idx_structure(self): 96 | """Check that the idx file has the expected content""" 97 | with open(self.wacz_index_idx, "rb") as f: 98 | content = f.read() 99 | f.close() 100 | 101 | # doing a startswith because compressed gzip block may be different depending on platform, so sha256 is platform dependent 102 | # just checking that the hash is set 103 | self.assertTrue( 104 | content.startswith( 105 | b'!meta 0 {"format": "cdxj-gzip-1.0", "filename": "index.cdx.gz"}\ncom,example)/ 20201007212236 {"offset": 0, "length": 256, "digest": "sha256:', 106 | ) 107 | ) 108 | 109 | def test_cdx_structure(self): 110 | """Check that the cdx file has the expected content""" 111 | content = "" 112 | with gzip.open(self.wacz_index_cdx, "rb") as f: 113 | for line in f: 114 | content = content + line.decode() 115 | f.close() 116 | self.assertEqual( 117 | content, 118 | 'com,example)/ 20201007212236 {"url": "http://www.example.com/", "mime": "text/html", "status": "200", "digest": "sha1:WJM2KPM4GF3QK2BISVUH2ASX64NOUY7L", "length": "1293", "offset": "845", "filename": "example-collection.warc", "recordDigest": "sha256:f78838ace891c96f7a6299e9e085b55a5aba8950a6d77f0f2e9ffe90f63255f2"}\n', 119 | ) 120 | 121 | def test_logs(self): 122 | with open(self.wacz_log, "rb") as f: 123 | content = f.read() 124 | f.close() 125 | 126 | with open(self.wacz_second_log, "rb") as f: 127 | second_content = f.read() 128 | f.close() 129 | 130 | self.assertTrue( 131 | content.startswith( 132 | b'{"logLevel":"info","timestamp":"2023-02-23T20:29:36.908Z","context":"general","message":"Seeds","details":[{"url":"https://specs.webrecorder.net/","include":[{}],"exclude":[],"scopeType":"prefix","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]}\n', 133 | ) 134 | ) 135 | self.assertTrue( 136 | second_content.startswith( 137 | b'{"logLevel":"info","timestamp":"2023-02-23T23:44:39.665Z","context":"general","message":"Page context being used with 1 worker","details":{}}\n' 138 | ) 139 | ) 140 | 141 | def test_data_package_structure(self): 142 | """Check that the package_descriptor is valid""" 143 | f = open(self.wacz_json, "rb") 144 | json_parse = json.loads(f.read()) 145 | # Make sure it's recording the correct number of resources 146 | 
self.assertEqual(len(json_parse["resources"]), 6) 147 | 148 | # Check that the correct hash was recorded for a warc 149 | original_warc = hash_file("sha256", self.warc_file) 150 | 151 | warc_resource = self.find_resource( 152 | json_parse["resources"], "example-collection.warc" 153 | ) 154 | self.assertEqual(original_warc, warc_resource["hash"]) 155 | 156 | # Check that the correct hash was recorded for the index.idx 157 | original_wacz_index_idx = hash_file("sha256", self.wacz_index_idx) 158 | idx_resource = self.find_resource(json_parse["resources"], "idx") 159 | self.assertEqual(original_wacz_index_idx, idx_resource["hash"]) 160 | 161 | # Check that the correct hash was recorded for the index.cdx.gz 162 | original_wacz_index_cdx = hash_file("sha256", self.wacz_index_cdx) 163 | cdx_resource = self.find_resource(json_parse["resources"], "cdx") 164 | self.assertEqual(original_wacz_index_cdx, cdx_resource["hash"]) 165 | 166 | # Check that the correct hash was recorded for the log files 167 | original_wacz_log = hash_file("sha256", self.wacz_log) 168 | log_resource = self.find_resource(json_parse["resources"], "wr-specs-crawl.log") 169 | self.assertEqual(original_wacz_log, log_resource["hash"]) 170 | 171 | second_wacz_log = hash_file("sha256", self.wacz_second_log) 172 | log_resource = self.find_resource(json_parse["resources"], "wr-crawl.log") 173 | self.assertEqual(second_wacz_log, log_resource["hash"]) 174 | 175 | # Use frictionless validation 176 | valid = validate(self.wacz_json) 177 | self.assertTrue(valid.valid) 178 | 179 | 180 | if __name__ == "__main__": 181 | unittest.main() 182 | -------------------------------------------------------------------------------- /tests/test_validate_wacz.py: -------------------------------------------------------------------------------- 1 | import unittest, os, zipfile, sys, gzip, json, tempfile 2 | from wacz.main import main 3 | from frictionless import validate 4 | from wacz.validate import Validation 5 | from unittest.mock import patch 6 | 7 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 8 | 9 | 10 | class TestWaczFormat(unittest.TestCase): 11 | @classmethod 12 | @patch("wacz.main.now") 13 | def setUpClass(self, mock_now): 14 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 15 | self.tmpdir = tempfile.TemporaryDirectory() 16 | main( 17 | [ 18 | "create", 19 | "-f", 20 | os.path.join(TEST_DIR, "example-collection.warc"), 21 | "-o", 22 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), 23 | ] 24 | ) 25 | with zipfile.ZipFile( 26 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), "r" 27 | ) as zip_ref: 28 | zip_ref.extractall(os.path.join(self.tmpdir.name, "unzipped_wacz_1")) 29 | zip_ref.close() 30 | 31 | self.validation_class_valid_1 = Validation( 32 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 33 | ) 34 | self.validation_class_invalid = Validation( 35 | os.path.join(TEST_DIR, "invalid_example_1.wacz") 36 | ) 37 | 38 | def test_overall_command(self): 39 | self.assertEqual( 40 | main( 41 | [ 42 | "validate", 43 | "-f", 44 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), 45 | ] 46 | ), 47 | 0, 48 | ) 49 | 50 | def test_check_indexes_valid(self): 51 | self.assertTrue(self.validation_class_valid_1.check_indexes()) 52 | 53 | def test_check_compression_valid(self): 54 | self.assertTrue(self.validation_class_valid_1.check_compression()) 55 | 56 | def test_frictionless_validate_valid_wacz(self): 57 | """Check that the frictionless validation feature identifies a valid wacz data 
package as valid""" 58 | # Use frictionless validation 59 | valid_1 = self.validation_class_valid_1.frictionless_validate() 60 | self.assertTrue(valid_1) 61 | 62 | def test_frictionless_validate_invalid_wacz(self): 63 | """Check that the frictionless validation feature identifies an invalid wacz data package as invalid""" 64 | # Use frictionless validation 65 | valid = self.validation_class_invalid.frictionless_validate() 66 | self.assertFalse(valid) 67 | 68 | def test_filepaths_invalid_wacz(self): 69 | """Correctly fail on a wacz with invalid files""" 70 | valid = self.validation_class_invalid.check_file_paths() 71 | self.assertFalse(valid) 72 | 73 | def test_filepaths_valid_wacz(self): 74 | """Correctly succeed on a wacz with valid files""" 75 | valid_1 = self.validation_class_valid_1.check_file_paths() 76 | self.assertTrue(valid_1) 77 | 78 | def test_hashes_valid_wacz(self): 79 | """Correctly succeed on a wacz with matching hashes""" 80 | valid_1 = self.validation_class_valid_1.check_file_hashes() 81 | self.assertTrue(valid_1) 82 | 83 | def test_hashes_invalid_wacz(self): 84 | """Correctly fail on a wacz with nonmatching hashes""" 85 | valid = self.validation_class_invalid.check_file_hashes() 86 | self.assertFalse(valid) 87 | 88 | def test_ability_to_detect_hash_md5(self): 89 | """Correctly identify the hash type of a file as md5""" 90 | tmpdir = tempfile.TemporaryDirectory() 91 | main( 92 | [ 93 | "create", 94 | "-f", 95 | os.path.join(TEST_DIR, "example-collection.warc"), 96 | "-o", 97 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 98 | "--hash-type", 99 | "md5", 100 | ] 101 | ) 102 | with zipfile.ZipFile( 103 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 104 | ) as zip_ref: 105 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 106 | zip_ref.close() 107 | 108 | validation_class = Validation( 109 | os.path.join(tmpdir.name, "valid_example_1.wacz") 110 | ) 111 | valid = validation_class.detect_hash_type() 112 | self.assertEqual(valid, 0) 113 | valid = validation_class.hash_type 114 | self.assertEqual(valid, "md5") 115 | 116 | def test_ability_to_detect_hash_sha256(self): 117 | """Correctly validate hashes and identify the type when the flag is set to sha256""" 118 | tmpdir = tempfile.TemporaryDirectory() 119 | main( 120 | [ 121 | "create", 122 | "-f", 123 | os.path.join(TEST_DIR, "example-collection.warc"), 124 | "-o", 125 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 126 | "--hash-type", 127 | "sha256", 128 | ] 129 | ) 130 | with zipfile.ZipFile( 131 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 132 | ) as zip_ref: 133 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 134 | zip_ref.close() 135 | 136 | validation_class = Validation( 137 | os.path.join(tmpdir.name, "valid_example_1.wacz") 138 | ) 139 | valid = validation_class.detect_hash_type() 140 | self.assertEqual(valid, 0) 141 | valid = validation_class.hash_type 142 | self.assertEqual(valid, "sha256") 143 | 144 | def test_ability_to_detect_hash_sha256_flag(self): 145 | """Correctly validate hashes and identify the type when the hash-type flag is set to sha256""" 146 | tmpdir = tempfile.TemporaryDirectory() 147 | main( 148 | [ 149 | "create", 150 | "-f", 151 | os.path.join(TEST_DIR, "example-collection.warc"), 152 | "-o", 153 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 154 | "--hash-type", 155 | "sha256", 156 | ] 157 | ) 158 | with zipfile.ZipFile( 159 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 160 | ) as zip_ref: 161 | zip_ref.extractall(os.path.join(tmpdir.name,
"unzipped_wacz_1")) 162 | zip_ref.close() 163 | 164 | validation_class = Validation( 165 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 166 | ) 167 | valid = validation_class.detect_hash_type() 168 | self.assertEqual(valid, 0) 169 | valid = validation_class.hash_type 170 | self.assertEqual(valid, "sha256") 171 | 172 | def test_invalid_wacz_missing_datapackage(self): 173 | """Correctly validate hashes and identify the type when no flag is set""" 174 | tmpdir = tempfile.TemporaryDirectory() 175 | main( 176 | [ 177 | "create", 178 | "-f", 179 | os.path.join(TEST_DIR, "example-collection.warc"), 180 | "-o", 181 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 182 | ] 183 | ) 184 | with zipfile.ZipFile( 185 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 186 | ) as zip_ref: 187 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 188 | zip_ref.close() 189 | 190 | os.remove(os.path.join(tmpdir.name, "unzipped_wacz_1/datapackage.json")) 191 | validation_class = Validation( 192 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 193 | ) 194 | 195 | valid = validation_class.check_required_contents() 196 | self.assertEqual(valid, 0) 197 | 198 | def test_invalid_wacz_missing_index(self): 199 | """Correctly fail on a wacz with no index""" 200 | tmpdir = tempfile.TemporaryDirectory() 201 | main( 202 | [ 203 | "create", 204 | "-f", 205 | os.path.join(TEST_DIR, "example-collection.warc"), 206 | "-o", 207 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 208 | ] 209 | ) 210 | with zipfile.ZipFile( 211 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 212 | ) as zip_ref: 213 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 214 | zip_ref.close() 215 | 216 | os.remove(os.path.join(tmpdir.name, "unzipped_wacz_1/indexes/index.cdx.gz")) 217 | validation_class = Validation( 218 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 219 | ) 220 | valid = validation_class.check_required_contents() 221 | self.assertEqual(valid, 0) 222 | 223 | def test_invalid_wacz_missing_warc(self): 224 | """Correctly fail on a wacz with no warc file""" 225 | tmpdir = tempfile.TemporaryDirectory() 226 | main( 227 | [ 228 | "create", 229 | "-f", 230 | os.path.join(TEST_DIR, "example-collection.warc"), 231 | "-o", 232 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 233 | ] 234 | ) 235 | with zipfile.ZipFile( 236 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 237 | ) as zip_ref: 238 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 239 | zip_ref.close() 240 | 241 | os.remove( 242 | os.path.join(tmpdir.name, "unzipped_wacz_1/archive/example-collection.warc") 243 | ) 244 | validation_class = Validation( 245 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 246 | ) 247 | valid = validation_class.check_required_contents() 248 | self.assertEqual(valid, 0) 249 | 250 | def test_invalid_wacz_missing_pages(self): 251 | """Correctly fail on a wacz with no pages file""" 252 | tmpdir = tempfile.TemporaryDirectory() 253 | main( 254 | [ 255 | "create", 256 | "-f", 257 | os.path.join(TEST_DIR, "example-collection.warc"), 258 | "-o", 259 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 260 | ] 261 | ) 262 | with zipfile.ZipFile( 263 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 264 | ) as zip_ref: 265 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 266 | zip_ref.close() 267 | 268 | os.remove(os.path.join(tmpdir.name, "unzipped_wacz_1/pages/pages.jsonl")) 269 | validation_class = Validation( 270 | 
os.path.join(self.tmpdir.name, "valid_example_1.wacz") 271 | ) 272 | valid = validation_class.check_required_contents() 273 | self.assertEqual(valid, 0) 274 | 275 | 276 | if __name__ == "__main__": 277 | unittest.main() 278 | -------------------------------------------------------------------------------- /tests/test_create_wacz_indexing.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import os 4 | from wacz.main import main, now 5 | from wacz.util import check_http_and_https 6 | 7 | import zipfile 8 | 9 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 10 | 11 | 12 | class TestWaczIndexing(unittest.TestCase): 13 | def test_check_http_and_https_changed(self): 14 | pages_dict = {"https://www.example.org/": "1db0ef709a"} 15 | check_url = "http://www.example.org/" 16 | match = check_http_and_https(check_url, "", pages_dict) 17 | self.assertEqual(match, "https://www.example.org/") 18 | 19 | def test_check_http_and_https_not_found(self): 20 | pages_dict = {"https://www.example.org/": "1db0ef709a"} 21 | check_url = "http://fake" 22 | match = check_http_and_https(check_url, "", pages_dict) 23 | self.assertEqual(match, "") 24 | 25 | def test_warc_with_other_metadata(self): 26 | with tempfile.TemporaryDirectory() as tmpdir: 27 | self.assertEqual( 28 | main( 29 | [ 30 | "create", 31 | "-f", 32 | os.path.join(TEST_DIR, "example-warcinfo-metadata.warc"), 33 | "-o", 34 | os.path.join(tmpdir, "example-warcinfo-metadata.wacz"), 35 | ] 36 | ), 37 | 0, 38 | ) 39 | 40 | self.assertEqual( 41 | main( 42 | [ 43 | "validate", 44 | "-f", 45 | os.path.join(tmpdir, "example-warcinfo-metadata.wacz"), 46 | ] 47 | ), 48 | 0, 49 | ) 50 | 51 | def test_warc_with_extra_lists(self): 52 | with tempfile.TemporaryDirectory() as tmpdir: 53 | self.assertEqual( 54 | main( 55 | [ 56 | "create", 57 | "-f", 58 | os.path.join(TEST_DIR, "example-collection-with-lists.warc"), 59 | "-o", 60 | os.path.join(tmpdir, "example-collection-with-lists.wacz"), 61 | ] 62 | ), 63 | 0, 64 | ) 65 | 66 | self.assertEqual( 67 | main( 68 | [ 69 | "validate", 70 | "-f", 71 | os.path.join(tmpdir, "example-collection-with-lists.wacz"), 72 | ] 73 | ), 74 | 0, 75 | ) 76 | 77 | with zipfile.ZipFile( 78 | os.path.join(tmpdir, "example-collection-with-lists.wacz") 79 | ) as zf: 80 | filelist = sorted(zf.namelist()) 81 | 82 | # verify pages file added for each list 83 | self.assertEqual( 84 | filelist, 85 | [ 86 | "archive/example-collection-with-lists.warc", 87 | "datapackage-digest.json", 88 | "datapackage.json", 89 | "indexes/index.cdx.gz", 90 | "indexes/index.idx", 91 | "pages/example.jsonl", 92 | "pages/iana.jsonl", 93 | "pages/pages.jsonl", 94 | ], 95 | ) 96 | 97 | def test_warc_with_extra_pages(self): 98 | with tempfile.TemporaryDirectory() as tmpdir: 99 | with open(os.path.join(tmpdir, "test-extra-pages.jsonl"), "wt") as fh: 100 | fh.write( 101 | """\ 102 | {"url": "https://www.iana.org/about"} 103 | {"url": "https://www.iana.org/protocols"}\ 104 | """ 105 | ) 106 | 107 | self.assertEqual( 108 | main( 109 | [ 110 | "create", 111 | "-f", 112 | os.path.join(TEST_DIR, "example-iana.warc"), 113 | "-o", 114 | os.path.join(tmpdir, "test-extra-pages.wacz"), 115 | "-e", 116 | os.path.join(tmpdir, "test-extra-pages.jsonl"), 117 | "--detect-pages", 118 | ] 119 | ), 120 | 0, 121 | ) 122 | 123 | self.assertEqual( 124 | main( 125 | [ 126 | "validate", 127 | "-f", 128 | os.path.join(tmpdir, "test-extra-pages.wacz"), 129 | ] 130 | ), 131 | 0, 132 | ) 
133 | 134 | with zipfile.ZipFile(os.path.join(tmpdir, "test-extra-pages.wacz")) as zf: 135 | filelist = sorted(zf.namelist()) 136 | 137 | # verify pages file added for each list 138 | self.assertEqual( 139 | filelist, 140 | [ 141 | "archive/example-iana.warc", 142 | "datapackage-digest.json", 143 | "datapackage.json", 144 | "indexes/index.cdx.gz", 145 | "indexes/index.idx", 146 | "pages/extraPages.jsonl", 147 | "pages/pages.jsonl", 148 | ], 149 | ) 150 | 151 | def test_warc_with_detect_pages_split_seeds(self): 152 | with tempfile.TemporaryDirectory() as tmpdir: 153 | self.assertEqual( 154 | main( 155 | [ 156 | "create", 157 | "-f", 158 | os.path.join(TEST_DIR, "example-iana.warc"), 159 | "-o", 160 | os.path.join(tmpdir, "test-detect-extra-pages.wacz"), 161 | "--detect-pages", 162 | "--split-seeds", 163 | "--url", 164 | "https://example.com/", 165 | ] 166 | ), 167 | 0, 168 | ) 169 | 170 | self.assertEqual( 171 | main( 172 | [ 173 | "validate", 174 | "-f", 175 | os.path.join(tmpdir, "test-detect-extra-pages.wacz"), 176 | ] 177 | ), 178 | 0, 179 | ) 180 | 181 | with zipfile.ZipFile( 182 | os.path.join(tmpdir, "test-detect-extra-pages.wacz") 183 | ) as zf: 184 | filelist = sorted(zf.namelist()) 185 | 186 | # verify pages file added for each list 187 | self.assertEqual( 188 | filelist, 189 | [ 190 | "archive/example-iana.warc", 191 | "datapackage-digest.json", 192 | "datapackage.json", 193 | "indexes/index.cdx.gz", 194 | "indexes/index.idx", 195 | "pages/extraPages.jsonl", 196 | "pages/pages.jsonl", 197 | ], 198 | ) 199 | 200 | with zf.open("pages/pages.jsonl", "r") as fh: 201 | data = fh.read() 202 | 203 | self.assertTrue(b"https://example.com/" in data) 204 | 205 | self.assertTrue(len(data.strip().split(b"\n")) == 2) 206 | 207 | with zf.open("pages/extraPages.jsonl", "r") as fh: 208 | data = fh.read() 209 | 210 | self.assertTrue(len(data.strip().split(b"\n")) == 7) 211 | 212 | def test_warc_with_extra_pages_via_seeds(self): 213 | with tempfile.TemporaryDirectory() as tmpdir: 214 | with open(os.path.join(tmpdir, "pages.jsonl"), "wt") as fh: 215 | fh.write( 216 | """\ 217 | {"url": "https://example.com/", "seed": true} 218 | {"url": "https://www.iana.org/about"} 219 | {"url": "https://www.iana.org/protocols"}\ 220 | """ 221 | ) 222 | 223 | self.assertEqual( 224 | main( 225 | [ 226 | "create", 227 | "-f", 228 | os.path.join(TEST_DIR, "example-iana.warc"), 229 | "-o", 230 | os.path.join(tmpdir, "test-extra-pages.wacz"), 231 | "-p", 232 | os.path.join(tmpdir, "pages.jsonl"), 233 | "--split-seeds", 234 | ] 235 | ), 236 | 0, 237 | ) 238 | 239 | self.assertEqual( 240 | main( 241 | [ 242 | "validate", 243 | "-f", 244 | os.path.join(tmpdir, "test-extra-pages.wacz"), 245 | ] 246 | ), 247 | 0, 248 | ) 249 | 250 | with zipfile.ZipFile(os.path.join(tmpdir, "test-extra-pages.wacz")) as zf: 251 | filelist = sorted(zf.namelist()) 252 | 253 | # verify pages file added for each list 254 | self.assertEqual( 255 | filelist, 256 | [ 257 | "archive/example-iana.warc", 258 | "datapackage-digest.json", 259 | "datapackage.json", 260 | "indexes/index.cdx.gz", 261 | "indexes/index.idx", 262 | "pages/extraPages.jsonl", 263 | "pages/pages.jsonl", 264 | ], 265 | ) 266 | 267 | with zf.open("pages/extraPages.jsonl", "r") as fh: 268 | data = fh.read() 269 | self.assertTrue(b"https://www.iana.org/about" in data) 270 | self.assertTrue(b"https://www.iana.org/protocols" in data) 271 | 272 | with zf.open("pages/pages.jsonl", "r") as fh: 273 | data = fh.read() 274 | self.assertTrue(b"https://example.com/" in data) 275 | 276 | def 
test_warc_resource_record(self):
277 | with tempfile.TemporaryDirectory() as tmpdir:
278 | self.assertEqual(
279 | main(
280 | [
281 | "create",
282 | "-f",
283 | os.path.join(TEST_DIR, "example-resource.warc.gz"),
284 | "-o",
285 | os.path.join(tmpdir, "example-resource.wacz"),
286 | "--url",
287 | "https://example.com/",
288 | ]
289 | ),
290 | 0,
291 | )
292 | 
293 | self.assertEqual(
294 | main(
295 | [
296 | "validate",
297 | "-f",
298 | os.path.join(tmpdir, "example-resource.wacz"),
299 | ]
300 | ),
301 | 0,
302 | )
303 | 
304 | with zipfile.ZipFile(os.path.join(tmpdir, "example-resource.wacz")) as zf:
305 | filelist = sorted(zf.namelist())
306 | 
307 | # verify the expected files are present in the WACZ
308 | self.assertEqual(
309 | filelist,
310 | [
311 | "archive/example-resource.warc.gz",
312 | "datapackage-digest.json",
313 | "datapackage.json",
314 | "indexes/index.cdx.gz",
315 | "indexes/index.idx",
316 | "pages/pages.jsonl",
317 | ],
318 | )
319 | 
-------------------------------------------------------------------------------- /wacz/validate.py: --------------------------------------------------------------------------------
1 | import tempfile, os, zipfile, json, pathlib, pkg_resources, gzip
2 | from frictionless import validate
3 | from wacz.util import hash_stream, now
4 | from wacz.waczindexer import WACZIndexer
5 | from io import BytesIO, StringIO, TextIOWrapper
6 | import glob
7 | import datetime
8 | import logging
9 | import requests
10 | 
11 | OUTDATED_WACZ = "0.1.0"
12 | 
13 | 
14 | class Validation(object):
15 | def __init__(self, filename, verify_auth=False, verifier_url=None):
16 | self.dir = tempfile.TemporaryDirectory()
17 | self.wacz = filename
18 | with zipfile.ZipFile(filename, "r") as zip_ref:
19 | zip_ref.extractall(self.dir.name)
20 | zip_ref.close()
21 | self.detect_version()
22 | self.detect_hash_type()
23 | 
24 | self.verify_auth = verify_auth
25 | self.verifier_url = verifier_url
26 | 
27 | def check_required_contents(self):
28 | """Checks the general components of the wacz and notifies users of what's missing"""
29 | if not os.path.exists(os.path.join(self.dir.name, "datapackage.json")):
30 | print("Datapackage is missing from your wacz file")
31 | return 1
32 | if (
33 | not glob.glob(os.path.join(self.dir.name, "archive/*.warc"))
34 | and not glob.glob(os.path.join(self.dir.name, "archive/*.warc.gz"))
35 | ):
36 | print(
37 | "A warc file is missing from your archive folder; you must have a .warc or .warc.gz file in your archive folder"
38 | )
39 | return 1
40 | if (
41 | not glob.glob(os.path.join(self.dir.name, "indexes/index.cdx.gz"))
42 | and not glob.glob(os.path.join(self.dir.name, "indexes/index.cdx"))
43 | and not glob.glob(os.path.join(self.dir.name, "indexes/index.idx"))
44 | ):
45 | print(
46 | "An index file is missing from your indexes folder; you must have an index.cdx.gz, index.cdx or index.idx in your indexes folder"
47 | )
48 | return 1
49 | if not glob.glob(os.path.join(self.dir.name, "pages/pages.jsonl")):
50 | print(
51 | "A pages.jsonl file is missing from your pages folder; you must have a pages.jsonl file in your pages folder"
52 | )
53 | return 1
54 | 
55 | return 0
56 | 
57 | def detect_hash_type(self):
58 | self.hash_type = None
59 | # we know the datapackage exists at this point because we're running it after the version check
60 | self.datapackage_path = os.path.join(self.dir.name, "datapackage.json")
61 | self.datapackage = json.loads(open(self.datapackage_path, "rb").read())
62 | try:
63 | 
self.hash_type = self.datapackage["resources"][0]["hash"].split(":")[0]
64 | return 0
65 | except Exception:
66 | print(
67 | "\nHashing type could not be detected; the wacz file may have no resources"
68 | )
69 | return 1
70 | 
71 | def detect_version(self):
72 | self.version = None
73 | if os.path.exists(os.path.join(self.dir.name, "datapackage.json")):
74 | self.data_folder = os.listdir(self.dir.name)
75 | self.datapackage_path = os.path.join(self.dir.name, "datapackage.json")
76 | self.datapackage = json.loads(open(self.datapackage_path, "rb").read())
77 | 
78 | try:
79 | self.version = self.datapackage["wacz_version"]
80 | except Exception:
81 | print("\nVersion missing from datapackage.json, invalid wacz file")
82 | return
83 | 
84 | print("\nVersion detected as %s" % self.version)
85 | elif os.path.exists(os.path.join(self.dir.name, "webarchive.yaml")):
86 | self.version = OUTDATED_WACZ
87 | self.webarchive_yaml = os.path.join(self.dir.name, "webarchive.yaml")
88 | print(
89 | "\nWACZ version detected as 0.1.0. This is an outdated version of WACZ."
90 | )
91 | else:
92 | print("\nVersion could not be detected, invalid wacz file")
93 | 
94 | def frictionless_validate(self):
95 | """Uses the frictionless data package to validate the datapackage.json file"""
96 | report = validate(self.datapackage_path)
97 | if report.valid:
98 | return True
99 | print(
100 | "\nFrictionless has detected that this is an invalid package with errors %s"
101 | % report.errors
102 | )
103 | return False
104 | 
105 | def check_file_paths(self):
106 | """Uses the datapackage to check that all the files listed exist in the data folder or that the wacz contains a webarchive.yaml file"""
107 | if self.version != OUTDATED_WACZ:
108 | package_files = [item["path"] for item in self.datapackage["resources"]]
109 | for filepath in pathlib.Path(self.dir.name).glob("**/*.*"):
110 | filename = os.path.basename(filepath)
111 | if (
112 | filename != "datapackage.json"
113 | and filename != "datapackage-digest.json"
114 | ):
115 | file = str(filepath).split("/")[-2:]
116 | file = "/".join(file)
117 | if file not in package_files:
118 | print("file %s is not listed in the datapackage" % file)
119 | return False
120 | return True
121 | 
122 | def check_compression(self):
123 | """WARCs and the compressed cdx.gz should be in the ZIP with 'store' compression (not deflate); indexes and page lists can be compressed"""
124 | with zipfile.ZipFile(self.wacz) as zf:
125 | for info in zf.infolist():
126 | name = info.filename
127 | 
128 | # the cdx.gz index, if present, must be stored without ZIP compression
129 | if name == "indexes/index.cdx.gz" and info.compress_type != 0:
130 | return False
131 | 
132 | # WARCs in the archive folder must be stored without ZIP compression
133 | if name.startswith("archive/") and ".warc" in name:
134 | if info.compress_type != 0:
135 | return False
136 | 
137 | return True
138 | 
139 | def check_indexes(self):
140 | """Re-indexes the existing WARC; the result should match the index in the wacz"""
141 | if os.path.exists(os.path.join(self.dir.name, "indexes/index.cdx.gz")):
142 | cdx = None
143 | for resource in self.datapackage["resources"]:
144 | if resource["path"] == "indexes/index.cdx.gz":
145 | cdx = resource["hash"]
146 | if cdx is None:
147 | return False
148 | archive_folder = os.listdir(os.path.join(self.dir.name, "archive"))
149 | for item in archive_folder:
150 | if ".warc" in item:
151 | warc = item
152 | wacz_file = tempfile.NamedTemporaryFile(delete=False)
153 | wacz = 
zipfile.ZipFile(wacz_file.name, "w")
154 | data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())
155 | index_buff = BytesIO()
156 | text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)
157 | wacz_indexer = None
158 | with wacz.open(data_file, "w") as data:
159 | wacz_indexer = WACZIndexer(
160 | text_wrap,
161 | {},
162 | sort=True,
163 | compress=data,
164 | fields="referrer",
165 | data_out_name="index.cdx.gz",
166 | records="all",
167 | main_url="",
168 | detect_pages="",
169 | )
170 | 
171 | wacz_indexer.process_all()
172 | wacz.close()
173 | tmpdir = tempfile.TemporaryDirectory()
174 | with zipfile.ZipFile(self.wacz, "r") as zip_ref:
175 | zip_ref.extractall(tmpdir.name)
176 | zip_ref.close()
177 | 
178 | with open(os.path.join(tmpdir.name, "indexes/index.cdx.gz"), "rb") as fd:
179 | size, hash_ = hash_stream(self.hash_type, fd)
180 | # compare the hash recorded in the datapackage against the freshly computed one
181 | 
182 | return cdx == hash_
183 | 
184 | def check_file_hashes(self):
185 | """Uses the datapackage to check that the hashes of all files in the data folder match those in the datapackage"""
186 | for filepath in pathlib.Path(self.dir.name).glob("**/*.*"):
187 | filename = os.path.basename(filepath)
188 | if filename != "datapackage.json" and filename != "datapackage-digest.json":
189 | with open(filepath, "rb") as fh:
190 | size, hash_ = hash_stream(self.hash_type, fh)
191 | 
192 | path = str(filepath).split("/")[-2:]
193 | path = "/".join(path)
194 | res = None
195 | for item in self.datapackage["resources"]:
196 | if item["path"] == path:
197 | res = item
198 | if res is None or res["hash"] != hash_:
199 | print(
200 | "\nfile %s's hash does not match the hash listed in the datapackage"
201 | % path
202 | )
203 | return False
204 | return True
205 | 
206 | def check_data_package_hash_and_sig(self):
207 | data_digest_filename = os.path.join(self.dir.name, "datapackage-digest.json")
208 | if not os.path.exists(data_digest_filename):
209 | return True
210 | 
211 | with open(data_digest_filename) as fh:
212 | data_digest = json.loads(fh.read())
213 | 
214 | with open(os.path.join(self.dir.name, "datapackage.json"), "rb") as fh:
215 | size, hash_ = hash_stream(self.hash_type, fh)
216 | 
217 | if hash_ != data_digest["hash"]:
218 | print("datapackage.json hash does not match datapackage-digest.json")
219 | return False
220 | 
221 | signed_data = data_digest.get("signedData")
222 | if not signed_data:
223 | return True
224 | 
225 | try:
226 | if self.datapackage.get("created") != signed_data.get("created"):
227 | print("signed timestamp != created timestamp")
228 | return False
229 | 
230 | if not self.verify_auth:
231 | print(
232 | "Note: WACZ has a signature, but auth verification was skipped; run with --verify-auth to also verify it"
233 | )
234 | return True
235 | 
236 | if self.verifier_url:
237 | res = requests.post(self.verifier_url, json=signed_data)
238 | success = res.status_code == 200
239 | msg = self.verifier_url
240 | else:
241 | try:
242 | from authsign.verifier import Verifier
243 | except ImportError:
244 | print(
245 | "authsign package not found, cannot verify signature. 
Try installing with 'pip install wacz[signing]'"
246 | )
247 | return False
248 | 
249 | logging.basicConfig(
250 | format="%(asctime)s: [%(levelname)s]: %(message)s",
251 | level=logging.INFO,
252 | )
253 | 
254 | verifier = Verifier()
255 | success = verifier(signed_data)
256 | msg = "direct check"
257 | 
258 | if success:
259 | print("Successfully verified signature via: " + msg)
260 | return True
261 | else:
262 | print("Signature not verified via: " + msg)
263 | return False
264 | 
265 | except Exception as e:
266 | import traceback
267 | 
268 | traceback.print_exc()
269 | print("Validation failed due to error", e)
270 | return False
271 | 
272 | return True
273 | 
274 | def parse_date(self, string):
275 | if not string:
276 | return None
277 | 
278 | return datetime.datetime.strptime(string, "%Y-%m-%dT%H:%M:%SZ")
279 | 
-------------------------------------------------------------------------------- /wacz/main.py: --------------------------------------------------------------------------------
1 | from argparse import ArgumentParser, RawTextHelpFormatter
2 | from io import BytesIO, StringIO, TextIOWrapper
3 | import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources, shortuuid
4 | from wacz.waczindexer import WACZIndexer
5 | from wacz.util import now, WACZ_VERSION, construct_passed_pages_dict
6 | from wacz.validate import Validation, OUTDATED_WACZ
7 | from wacz.util import validateJSON, get_py_wacz_version, validate_pages_jsonl_file
8 | from warcio.timeutils import iso_date_to_timestamp
9 | 
10 | """
11 | WACZ Generator
12 | """
13 | 
14 | PAGE_INDEX = "pages/pages.jsonl"
15 | EXTRA_PAGES_INDEX = "pages/extraPages.jsonl"
16 | 
17 | PAGE_INDEX_TEMPLATE = "pages/{0}.jsonl"
18 | 
19 | # setting to size matching archiveweb.page defaults
20 | DEFAULT_NUM_LINES = 1024
21 | 
22 | 
23 | def main(args=None):
24 | parser = ArgumentParser(
25 | description="WACZ creator", formatter_class=RawTextHelpFormatter
26 | )
27 | 
28 | parser.add_argument("-V", "--version", action="version", version=get_version())
29 | 
30 | subparsers = parser.add_subparsers(dest="cmd")
31 | subparsers.required = True
32 | 
33 | create = subparsers.add_parser("create", help="create wacz file")
34 | create.add_argument("inputs", nargs="+")
35 | create.add_argument("-f", "--file", action="store_true")
36 | 
37 | create.add_argument("-o", "--output", default="archive.wacz")
38 | 
39 | create.add_argument("-e", "--extra-pages")
40 | 
41 | create.add_argument(
42 | "-t",
43 | "--text",
44 | help="Generates pages.jsonl with a full-text index. Must be used together with --detect-pages or it will have no effect",
45 | action="store_true",
46 | )
47 | 
48 | create.add_argument(
49 | "-p",
50 | "--pages",
51 | help="Overrides the pages generation with the passed jsonl pages",
52 | action="store",
53 | )
54 | 
55 | create.add_argument(
56 | "-d",
57 | "--detect-pages",
58 | help="Generates pages.jsonl without a text index",
59 | action="store_true",
60 | )
61 | 
62 | create.add_argument(
63 | "-c",
64 | "--copy-pages",
65 | help="Overrides the pages/extra-pages options by copying files to WACZ without parsing",
66 | action="store_true",
67 | )
68 | 
69 | create.add_argument(
70 | "--hash-type",
71 | choices=["sha256", "md5"],
72 | help="Allows the user to specify the hash type used. 
Currently we allow sha256 and md5",
73 | )
74 | 
75 | create.add_argument(
76 | "-l",
77 | "--log-directory",
78 | help="Adds log files in specified directory to WACZ",
79 | action="store",
80 | )
81 | 
82 | create.add_argument("--split-seeds", action="store_true")
83 | 
84 | create.add_argument("--ts")
85 | create.add_argument("--url")
86 | create.add_argument("--date")
87 | create.add_argument("--title")
88 | create.add_argument("--desc")
89 | 
90 | create.add_argument(
91 | "--signing-url",
92 | help="URL of signing server to obtain signature for datapackage-digest.json",
93 | )
94 | create.add_argument("--signing-token", help="Auth token for signing URL")
95 | 
96 | create.set_defaults(func=create_wacz)
97 | 
98 | validate = subparsers.add_parser("validate", help="validate a wacz file")
99 | validate.add_argument("-f", "--file", required=True)
100 | validate.set_defaults(func=validate_wacz)
101 | 
102 | validate.add_argument(
103 | "--verify-auth",
104 | action="store_true",
105 | help="If set, will attempt to validate authenticity of the WACZ, either directly or via remote server if --verifier-url is also set",
106 | )
107 | 
108 | validate.add_argument(
109 | "--verifier-url",
110 | help="URL of verify server to verify the signature, if any, in datapackage-digest.json",
111 | )
112 | 
113 | cmd = parser.parse_args(args=args)
114 | 
115 | if cmd.cmd == "create" and cmd.ts is not None and cmd.url is None:
116 | parser.error("--url must be specified when --ts is passed")
117 | 
118 | if cmd.cmd == "create" and cmd.detect_pages and cmd.pages is not None:
119 | parser.error(
120 | "--pages and --detect-pages can't be set at the same time; they cancel each other out."
121 | )
122 | 
123 | value = cmd.func(cmd)
124 | return value
125 | 
126 | 
127 | def get_version():
128 | return "%(prog)s " + get_py_wacz_version() + " -- WACZ File Format: " + WACZ_VERSION
129 | 
130 | 
131 | def validate_wacz(res):
132 | validate = Validation(
133 | res.file, verify_auth=res.verify_auth, verifier_url=res.verifier_url
134 | )
135 | version = validate.version
136 | validation_tests = []
137 | 
138 | if version == OUTDATED_WACZ:
139 | print("Validation succeeded, the passed WACZ is outdated but valid")
140 | return 0
141 | 
142 | elif version == WACZ_VERSION:
143 | validation_tests += [
144 | validate.check_required_contents,
145 | validate.frictionless_validate,
146 | validate.check_file_paths,
147 | validate.check_file_hashes,
148 | validate.check_data_package_hash_and_sig,
149 | ]
150 | else:
151 | print("Validation failed, the passed WACZ is invalid")
152 | return 1
153 | 
154 | for func in validation_tests:
155 | success = func()
156 | if success is False:
157 | print("Validation failed, the passed WACZ is invalid")
158 | return 1
159 | 
160 | print("Validation succeeded, the passed WACZ is valid")
161 | return 0
162 | 
163 | 
164 | def create_wacz(res):
165 | wacz = zipfile.ZipFile(res.output, "w")
166 | 
167 | # write index
168 | data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())
169 | 
170 | index_file = zipfile.ZipInfo("indexes/index.idx", now())
171 | index_file.compress_type = zipfile.ZIP_DEFLATED
172 | 
173 | index_buff = BytesIO()
174 | 
175 | text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)
176 | 
177 | wacz_indexer = None
178 | 
179 | passed_pages_dict = {}
180 | 
181 | # Handle pages
182 | if res.pages is not None:
183 | if res.copy_pages:
184 | print("Copying passed pages.jsonl file to WACZ")
185 | 
186 | if not validate_pages_jsonl_file(res.pages):
187 | print("Unable to create WACZ 
without valid pages.jsonl file, quitting") 188 | wacz.close() 189 | return 1 190 | 191 | with open(res.pages, "rb") as fh: 192 | pages_jsonl = zipfile.ZipInfo("pages/pages.jsonl", now()) 193 | with wacz.open(pages_jsonl, "w") as pages_file: 194 | shutil.copyfileobj(fh, pages_file) 195 | 196 | else: 197 | print("Validating passed pages.jsonl file") 198 | passed_content = [] 199 | with open(res.pages, "rb") as fh: 200 | for line in fh: 201 | if not line: 202 | continue 203 | 204 | try: 205 | line = line.decode("utf-8") 206 | passed_content.append(line) 207 | except: 208 | print("Page data not utf-8 encoded, skipping", line) 209 | 210 | # Create a dict of the passed pages that will be used in the construction of the index 211 | passed_pages_dict = construct_passed_pages_dict(passed_content) 212 | 213 | if res.extra_pages: 214 | if res.copy_pages: 215 | print("Copying passed extraPages.jsonl file to WACZ") 216 | if validate_pages_jsonl_file(res.extra_pages): 217 | extra_pages_jsonl = zipfile.ZipInfo("pages/extraPages.jsonl", now()) 218 | with open(res.extra_pages, "rb") as fh: 219 | with wacz.open(extra_pages_jsonl, "w") as extra_pages_file: 220 | shutil.copyfileobj(fh, extra_pages_file) 221 | else: 222 | print("Ignoring invalid extraPages.jsonl file") 223 | else: 224 | print("Validating extra pages file") 225 | extra_page_data = [] 226 | with open(res.extra_pages) as fh: 227 | data = fh.read() 228 | for page_str in data.strip().split("\n"): 229 | page_json = validateJSON(page_str) 230 | 231 | if not page_json: 232 | print("Warning: Ignoring invalid extra page\n %s" % page_str) 233 | continue 234 | 235 | extra_page_data.append(page_str.encode("utf-8")) 236 | 237 | extra_pages_file = zipfile.ZipInfo(EXTRA_PAGES_INDEX, now()) 238 | with wacz.open(extra_pages_file, "w") as efh: 239 | efh.write(b"\n".join(extra_page_data)) 240 | 241 | print("Reading and Indexing All WARCs") 242 | with wacz.open(data_file, "w") as data: 243 | wacz_indexer = WACZIndexer( 244 | text_wrap, 245 | res.inputs, 246 | sort=True, 247 | post_append=True, 248 | compress=data, 249 | lines=DEFAULT_NUM_LINES, 250 | digest_records=True, 251 | fields="referrer,req.http:cookie", 252 | data_out_name="index.cdx.gz", 253 | hash_type=res.hash_type, 254 | main_url=res.url, 255 | main_ts=res.ts, 256 | detect_pages=res.detect_pages, 257 | passed_pages_dict=passed_pages_dict, 258 | extract_text=res.text, 259 | signing_url=res.signing_url, 260 | signing_token=res.signing_token, 261 | split_seeds=res.split_seeds, 262 | ) 263 | 264 | wacz_indexer.process_all() 265 | 266 | index_buff.seek(0) 267 | 268 | with wacz.open(index_file, "w") as index: 269 | shutil.copyfileobj(index_buff, index) 270 | 271 | # write archives 272 | print("Writing archives...") 273 | for _input in res.inputs: 274 | archive_file = zipfile.ZipInfo.from_file( 275 | _input, "archive/" + os.path.basename(_input) 276 | ) 277 | with wacz.open(archive_file, "w") as out_fh: 278 | with open(_input, "rb") as in_fh: 279 | shutil.copyfileobj(in_fh, out_fh) 280 | path = "archive/" + os.path.basename(_input) 281 | 282 | if wacz_indexer.passed_pages_dict != None: 283 | for key in wacz_indexer.passed_pages_dict: 284 | print( 285 | "Invalid passed page. 
We were unable to find a match for %s" % str(key) 286 | ) 287 | 288 | if res.log_directory: 289 | print("Writing logs...") 290 | log_dir = os.path.abspath(res.log_directory) 291 | for log_file in os.listdir(log_dir): 292 | log_path = os.path.join(log_dir, log_file) 293 | log_wacz_file = zipfile.ZipInfo.from_file( 294 | log_path, "logs/{}".format(log_file) 295 | ) 296 | with wacz.open(log_wacz_file, "w") as out_fh: 297 | with open(log_path, "rb") as in_fh: 298 | shutil.copyfileobj(in_fh, out_fh) 299 | path = "logs/{}".format(log_file) 300 | 301 | if len(wacz_indexer.pages) > 0 and res.pages == None and not res.copy_pages: 302 | print("Generating page index...") 303 | # generate pages/text 304 | wacz_indexer.write_page_list( 305 | wacz, 306 | PAGE_INDEX, 307 | wacz_indexer.serialize_json_pages( 308 | wacz_indexer.pages.values(), 309 | id="pages", 310 | title="Pages", 311 | has_text=wacz_indexer.has_text, 312 | ), 313 | ) 314 | 315 | if len(wacz_indexer.pages) > 0 and res.pages != None and not res.copy_pages: 316 | print("Generating page index from passed pages...") 317 | # Initially set the default value of the header id and title 318 | id_value = "pages" 319 | title_value = "Pages" 320 | 321 | # If the user has provided a title or an id in a header of their file we will use those instead of our default. 322 | try: 323 | header = json.loads(passed_content[0]) 324 | except: 325 | print("Warning: Ignoring invalid page header: " + passed_content[0]) 326 | header = {} 327 | 328 | if "format" in header: 329 | print("Header detected in the passed pages.jsonl file") 330 | if "id" in header: 331 | id_value = header["id"] 332 | if "title" in header: 333 | title_value = header["title"] 334 | 335 | wacz_indexer.write_page_list( 336 | wacz, 337 | PAGE_INDEX, 338 | wacz_indexer.serialize_json_pages( 339 | wacz_indexer.pages.values(), 340 | id=id_value, 341 | title=title_value, 342 | has_text=wacz_indexer.has_text, 343 | ), 344 | ) 345 | 346 | if len(wacz_indexer.extra_pages) > 0 and not res.copy_pages: 347 | wacz_indexer.write_page_list( 348 | wacz, 349 | EXTRA_PAGES_INDEX, 350 | wacz_indexer.serialize_json_pages( 351 | wacz_indexer.extra_pages.values(), 352 | id="extra-pages", 353 | title="Extra Pages", 354 | has_text=wacz_indexer.has_text, 355 | ), 356 | ) 357 | 358 | if len(wacz_indexer.extra_page_lists) > 0 and not res.copy_pages: 359 | print("Generating extra page lists...") 360 | 361 | for name, pagelist in wacz_indexer.extra_page_lists.items(): 362 | if name == "pages": 363 | name = shortuuid.uuid() 364 | filename = PAGE_INDEX_TEMPLATE.format(name) 365 | 366 | wacz_indexer.write_page_list(wacz, filename, pagelist) 367 | 368 | # generate datapackage 369 | print("Generating datapackage.json") 370 | 371 | datapackage = wacz_indexer.generate_datapackage(res, wacz) 372 | datapackage_file = zipfile.ZipInfo("datapackage.json", now()) 373 | datapackage_file.compress_type = zipfile.ZIP_DEFLATED 374 | datapackage_bytes = datapackage.encode("utf-8") 375 | wacz.writestr(datapackage_file, datapackage_bytes) 376 | 377 | print("Generating datapackage-digest.json") 378 | datapackage_digest_file = zipfile.ZipInfo("datapackage-digest.json", now()) 379 | datapackage_digest_file.compress_type = zipfile.ZIP_DEFLATED 380 | wacz.writestr( 381 | datapackage_digest_file, 382 | wacz_indexer.generate_datapackage_digest(datapackage_bytes), 383 | ) 384 | 385 | wacz.close() 386 | 387 | return 0 388 | 389 | 390 | if __name__ == "__main__": 391 | main() 392 | 
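Usage note: the main() entry point above can be driven programmatically as well as from the command line, which is exactly how the tests in this repository invoke it. A minimal sketch, assuming py-wacz is installed; the input WARC and output filenames here are hypothetical, not fixtures shipped with the repo:

from wacz.main import main

# create a WACZ from a WARC, detecting pages and extracting full text;
# main() returns 0 on success and 1 on failure
rc = main(
    [
        "create",
        "-f",
        "my-crawl.warc.gz",  # hypothetical input WARC
        "-o",
        "my-crawl.wacz",
        "--detect-pages",
        "--text",
    ]
)
assert rc == 0

# validate the resulting WACZ (add --verify-auth to also check any signature)
assert main(["validate", "-f", "my-crawl.wacz"]) == 0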
-------------------------------------------------------------------------------- /wacz/waczindexer.py: --------------------------------------------------------------------------------
1 | import json, shortuuid
2 | from urllib.parse import quote, urlsplit, urlunsplit
3 | import os, gzip, glob, zipfile, traceback
4 | from cdxj_indexer.main import CDXJIndexer
5 | from warcio.warcwriter import BufferWARCWriter
6 | from warcio.timeutils import iso_date_to_timestamp, timestamp_to_iso_date
7 | from boilerpy3 import extractors
8 | from wacz.util import (
9 | hash_stream,
10 | now,
11 | WACZ_VERSION,
12 | get_py_wacz_version,
13 | check_http_and_https,
14 | )
15 | 
16 | import datetime
17 | import hashlib
18 | import requests
19 | 
20 | HTML_MIME_TYPES = ("text/html", "application/xhtml", "application/xhtml+xml")
21 | 
22 | # Add warcinfo as a default record for indexing to simplify filtering logic
23 | CDXJIndexer.DEFAULT_RECORDS.append("warcinfo")
24 | 
25 | 
26 | # ============================================================================
27 | class WACZIndexer(CDXJIndexer):
28 | def __init__(self, *args, **kwargs):
29 | super().__init__(*args, **kwargs)
30 | self.pages = {}
31 | self.extra_pages = {}
32 | self.extra_page_lists = {}
33 | self.title = ""
34 | self.desc = ""
35 | self.has_text = False
36 | self.main_url = kwargs.pop("main_url", "")
37 | self.main_ts = kwargs.pop("main_ts", "")
38 | self.main_page_entry = None
39 | self.main_page_id = None
40 | self.hash_type = kwargs.pop("hash_type", "")
41 | 
42 | self.signing_url = kwargs.pop("signing_url", "")
43 | self.signing_token = kwargs.pop("signing_token", "")
44 | 
45 | self._created = None
46 | 
47 | # If the user has specified a hash type, use that; otherwise default to sha256
48 | if not self.hash_type:
49 | self.hash_type = "sha256"
50 | 
51 | self.passed_pages_dict = kwargs.pop("passed_pages_dict", {})
52 | self.split_seeds = kwargs.pop("split_seeds", False)
53 | 
54 | if self.main_url:
55 | self.main_url_flag = False
56 | self.main_ts_flag = False
57 | # if url is missing a path segment, ensure it is set to '/'
58 | try:
59 | parts = list(urlsplit(self.main_url))
60 | if not parts[2]:
61 | parts[2] = "/"
62 | self.main_url = urlunsplit(parts)
63 | except ValueError:
64 | pass
65 | 
66 | self.detect_pages = kwargs.get("detect_pages")
67 | self.detect_referrer_check = True
68 | self.extract_text = kwargs.get("extract_text")
69 | if self.extract_text and not self.detect_pages:
70 | print(
71 | "Warning: You've passed the --text flag without the --detect-pages flag. No pages.jsonl file will be generated. You must enable the --detect-pages and --text flags together in order to get a pages.jsonl file with full text."
72 | ) 73 | self.referrers = set() 74 | 75 | def process_index_entry(self, it, record, *args): 76 | type_ = record.rec_type 77 | if type_ == "warcinfo": 78 | self.parse_warcinfo(record) 79 | 80 | elif self.filter_record(record): 81 | if type_ in ("response", "resource", "revisit"): 82 | self.check_pages_and_text(record) 83 | 84 | super().process_index_entry(it, record, *args) 85 | 86 | def process_all(self): 87 | super().process_all() 88 | 89 | if self.detect_pages: 90 | if self.detect_referrer_check: 91 | to_delete = [ 92 | id_ 93 | for id_, value in self.pages.items() 94 | if value["url"] not in self.referrers 95 | ] 96 | for delete in to_delete: 97 | del self.pages[delete] 98 | 99 | if self.passed_pages_dict == {}: 100 | print("Num Pages Detected: {0}".format(len(self.pages))) 101 | 102 | if self.split_seeds and self.main_page_entry: 103 | self.extra_pages = self.pages 104 | self.pages = {self.main_page_id: self.main_page_entry} 105 | 106 | if ( 107 | hasattr(self, "main_url_flag") 108 | and hasattr(self, "main_ts_flag") 109 | and self.main_url_flag == False 110 | and self.main_ts_flag == False 111 | ): 112 | raise ValueError( 113 | "ts %s not found in index with %s" % (self.main_ts, self.main_url) 114 | ) 115 | 116 | if hasattr(self, "main_url_flag") and self.main_url_flag == False: 117 | raise ValueError("Url %s not found in index" % (self.main_url)) 118 | 119 | def _do_write(self, urlkey, ts, index, out): 120 | if self.detect_pages: 121 | self.detect_page(ts, index) 122 | 123 | super()._do_write(urlkey, ts, index, out) 124 | 125 | def detect_page(self, ts, index): 126 | referrer = index.get("referrer") 127 | if referrer: 128 | self.referrers.add(referrer) 129 | 130 | def _read_record(self, record): 131 | if hasattr(record, "buffered_stream"): 132 | content = record.buffered_stream.read() 133 | else: 134 | content = record.content_stream().read() 135 | 136 | return content 137 | 138 | def parse_warcinfo(self, record): 139 | """Parse WARC information. 
140 | :param record: WARC information 141 | :returns: WARC information or None 142 | :rtype: dict or None 143 | """ 144 | warcinfo = {} 145 | warcinfo_buff = self._read_record(record) 146 | warcinfo_buff = warcinfo_buff.decode("utf-8") 147 | metadata = None 148 | for line in warcinfo_buff.rstrip().split("\n"): 149 | parts = line.split(":", 1) 150 | if parts[0] == "json-metadata": 151 | metadata = json.loads(parts[1]) 152 | elif len(parts) == 2: 153 | warcinfo[parts[0]] = parts[1].strip() 154 | 155 | if not metadata or "type" not in metadata: 156 | return 157 | 158 | if metadata["type"] == "collection": 159 | self.title = metadata.get("title", "") 160 | self.desc = metadata.get("desc", "") 161 | lists = metadata.get("lists") 162 | if lists: 163 | self.extract_page_lists(lists) 164 | 165 | # Don't add the record to the self.pages if were evaluating passed in pages 166 | elif metadata["type"] == "recording" and self.passed_pages_dict == {}: 167 | pages = metadata.get("pages", []) 168 | for page in pages: 169 | id_ = page["timestamp"] + "/" + page["url"] 170 | self.pages[id_] = page 171 | 172 | self.detect_referrer_check = False 173 | 174 | def extract_page_lists(self, lists): 175 | for pagelist in lists: 176 | pagelist_header = {} 177 | # unique id for this page list, will also be the filename 178 | if "slug" in pagelist: 179 | uid = pagelist["slug"] 180 | else: 181 | uid = shortuuid.uuid() 182 | 183 | text_list = list( 184 | self.serialize_json_pages( 185 | pages=pagelist["bookmarks"], 186 | id=uid, 187 | title=pagelist.get("title"), 188 | desc=pagelist.get("desc"), 189 | ) 190 | ) 191 | 192 | self.extra_page_lists[uid] = text_list 193 | 194 | def check_pages_and_text(self, record): 195 | url = record.rec_headers.get("WARC-Target-URI") 196 | date = record.rec_headers.get("WARC-Date") 197 | ts = iso_date_to_timestamp(date) 198 | id_ = ts + "/" + url 199 | matched_id = "" 200 | # Check for both a matching url/ts and url entry 201 | 202 | # if id_ in self.passed_pages_dict: 203 | # matched_id = id_ 204 | 205 | matched_id = check_http_and_https(url, ts, self.passed_pages_dict) 206 | # If we find a match build a record 207 | if matched_id: 208 | page_data = self.passed_pages_dict[matched_id] 209 | page_data["timestamp"] = ts 210 | if "url" not in page_data: 211 | page_data["url"] = url 212 | if "title" not in page_data: 213 | page_data["title"] = url 214 | 215 | if self.split_seeds and not page_data.get("seed"): 216 | self.extra_pages[matched_id] = page_data 217 | else: 218 | self.pages[matched_id] = page_data 219 | 220 | # Delete the entry from our pages_dict so we can't match it again 221 | del self.passed_pages_dict[matched_id] 222 | 223 | if ( 224 | self.main_url 225 | and self.main_url == url 226 | and self.main_ts 227 | and self.main_ts == ts 228 | ): 229 | self.main_ts_flag = True 230 | self.main_url_flag = True 231 | print("Found Main Url: {0}".format(url)) 232 | print("Found Main ts: {0}".format(ts)) 233 | # If were not relying on passed in pages we want to add all records to the self.pages object 234 | if self.passed_pages_dict == {}: 235 | self.main_page_entry = { 236 | "timestamp": ts, 237 | "url": url, 238 | "title": url, 239 | "seed": True, 240 | } 241 | self.main_page_id = id_ 242 | self.pages[id_] = self.main_page_entry 243 | if self.main_url and self.main_url == url and self.main_ts == None: 244 | self.main_url_flag = True 245 | print("Found Main Url: {0}".format(url)) 246 | if id_ not in self.pages: 247 | self.main_page_entry = { 248 | "timestamp": ts, 249 | "url": url, 250 | 
"title": url, 251 | "seed": True, 252 | } 253 | self.main_page_id = id_ 254 | self.pages[id_] = self.main_page_entry 255 | 256 | mime = self.get_record_mime_type(record) 257 | 258 | if mime not in HTML_MIME_TYPES: 259 | return 260 | 261 | if record.http_headers and record.http_headers.get_statuscode().startswith("3"): 262 | return 263 | 264 | if id_ not in self.pages: 265 | if self.detect_pages: 266 | self.pages[id_] = {"timestamp": ts, "url": url, "title": url} 267 | else: 268 | return 269 | 270 | # if not extracting text, then finish here 271 | if not self.extract_text: 272 | return 273 | 274 | content = self._read_record(record) 275 | if not content: 276 | return 277 | 278 | try: 279 | extractor = extractors.ArticleExtractor(raise_on_failure=False) 280 | 281 | content = content.decode("utf-8") 282 | 283 | doc = extractor.get_doc(content) 284 | 285 | curr_page = self.pages[id_] 286 | 287 | if doc.content: 288 | self.pages[id_]["text"] = doc.content 289 | self.has_text = True 290 | 291 | # only set title if unset, or set to url (default) 292 | # avoid overriding user-specified title, if any 293 | if doc.title and self.pages[id_].get("title", url) == url: 294 | self.pages[id_]["title"] = doc.title 295 | 296 | except Exception as e: 297 | # skip text extraction in case of errors 298 | print("Skipping, Text Extraction Failed For: " + url) 299 | print(e) 300 | 301 | def get_record_mime_type(self, record): 302 | if record.http_headers: 303 | # if the record has HTTP headers, use the Content-Type from those (eg. 'response' record) 304 | content_type = record.http_headers["Content-Type"] 305 | else: 306 | # otherwise, use the Content-Type from WARC headers 307 | content_type = record.rec_headers["Content-Type"] 308 | 309 | mime = content_type or "" 310 | return mime.split(";")[0] 311 | 312 | def write_page_list(self, wacz, filename, page_iter): 313 | pages_file = zipfile.ZipInfo(filename, now()) 314 | pages_file.compress_type = zipfile.ZIP_DEFLATED 315 | 316 | with wacz.open(pages_file, "w") as pg_fh: 317 | for line in page_iter: 318 | pg_fh.write(line.encode("utf-8")) 319 | 320 | def serialize_json_pages(self, pages, id, title, desc=None, has_text=False): 321 | page_header = {"format": "json-pages-1.0", "id": id} 322 | 323 | if title: 324 | page_header["title"] = title 325 | 326 | if desc: 327 | page_header["description"] = desc 328 | 329 | if has_text: 330 | page_header["hasText"] = True 331 | 332 | yield json.dumps(page_header) + "\n" 333 | 334 | for line in pages: 335 | if "ts" not in line and "timestamp" in line: 336 | ts = timestamp_to_iso_date(line["timestamp"]) 337 | line["ts"] = ts 338 | del line["timestamp"] 339 | 340 | line["id"] = line.get("id") or line.get("page_id") or shortuuid.uuid() 341 | 342 | yield json.dumps(line) + "\n" 343 | 344 | def generate_datapackage(self, res, wacz): 345 | package_dict = {} 346 | 347 | package_dict["profile"] = "data-package" 348 | 349 | resources = [] 350 | 351 | for zip_entry in wacz.infolist(): 352 | res_entry = {} 353 | res_entry["name"] = os.path.basename(zip_entry.filename).lower() 354 | res_entry["path"] = zip_entry.filename 355 | 356 | with wacz.open(zip_entry, "r") as stream: 357 | size, hash_ = hash_stream(self.hash_type, stream) 358 | res_entry["hash"] = hash_ 359 | res_entry["bytes"] = size 360 | 361 | resources.append(res_entry) 362 | 363 | package_dict["resources"] = resources 364 | 365 | # set optional metadata 366 | desc = res.desc or self.desc 367 | title = res.title or self.title 368 | 369 | if title: 370 | package_dict["title"] = 
title
371 | 
372 | if desc:
373 | package_dict["description"] = desc
374 | 
375 | if self.main_url:
376 | package_dict["mainPageURL"] = self.main_url
377 | if self.main_ts:
378 | package_dict["mainPageDate"] = timestamp_to_iso_date(self.main_ts)
379 | 
380 | if res.date:
381 | package_dict["mainPageDate"] = res.date
382 | 
383 | package_dict["created"] = datetime.datetime.utcnow().strftime(
384 | "%Y-%m-%dT%H:%M:%SZ"
385 | )
386 | self._created = package_dict["created"]
387 | 
388 | package_dict["wacz_version"] = WACZ_VERSION
389 | 
390 | package_dict["software"] = "py-wacz " + get_py_wacz_version()
391 | 
392 | return json.dumps(package_dict, indent=2)
393 | 
394 | def generate_datapackage_digest(self, datapackage_bytes):
395 | digest_dict = {
396 | "path": "datapackage.json",
397 | "hash": "sha256:" + hashlib.sha256(datapackage_bytes).hexdigest(),
398 | }
399 | 
400 | if self.signing_url:
401 | self.do_sign(digest_dict)
402 | 
403 | return json.dumps(digest_dict, indent=2)
404 | 
405 | def do_sign(self, digest_dict):
406 | try:
407 | headers = {}
408 | if self.signing_token:
409 | headers["Authorization"] = "bearer " + self.signing_token
410 | 
411 | req = {"hash": digest_dict["hash"], "created": self._created}
412 | 
413 | res = requests.post(self.signing_url, headers=headers, json=req)
414 | 
415 | if res.status_code != 200:
416 | raise ValueError("Signing Failed: " + res.text)
417 | 
418 | 
419 | signed_json = res.json()
420 | if signed_json["hash"] != digest_dict["hash"] or signed_json["created"] != self._created:
421 | print("Not Signed, signing request failed")
422 | return
423 | 
424 | digest_dict["signedData"] = signed_json
425 | 
426 | print("Added Signature")
427 | except Exception:
428 | traceback.print_exc()
429 | 
-------------------------------------------------------------------------------- /tests/fixtures/logs/wr-specs-crawl.log: --------------------------------------------------------------------------------
1 | {"logLevel":"info","timestamp":"2023-02-23T20:29:36.908Z","context":"general","message":"Seeds","details":[{"url":"https://specs.webrecorder.net/","include":[{}],"exclude":[],"scopeType":"prefix","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]}
2 | {"logLevel":"info","timestamp":"2023-02-23T20:29:37.197Z","context":"state","message":"Storing state in memory","details":{}}
3 | {"logLevel":"info","timestamp":"2023-02-23T20:29:37.572Z","context":"general","message":"Text Extraction: Disabled","details":{}}
4 | {"logLevel":"info","timestamp":"2023-02-23T20:29:45.587Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/"}}
5 | {"logLevel":"info","timestamp":"2023-02-23T20:29:45.590Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}}
6 | {"logLevel":"info","timestamp":"2023-02-23T20:29:45.591Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}}
7 | {"logLevel":"info","timestamp":"2023-02-23T20:29:45.591Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/"}}
8 | {"logLevel":"warn","timestamp":"2023-02-23T20:29:46.083Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError 
(file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async ElementHandle.evaluateHandle (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/JSHandle.js:94:16)\n at async internalHandler.queryOne (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/QueryHandler.js:25:30)\n at async ElementHandle.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ElementHandle.js:78:17)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)"}} 9 | {"logLevel":"warn","timestamp":"2023-02-23T20:29:46.301Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async ElementHandle.evaluateHandle (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/JSHandle.js:94:16)\n at async internalHandler.queryOne (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/QueryHandler.js:25:30)\n at async ElementHandle.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ElementHandle.js:78:17)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)"}} 10 | {"logLevel":"warn","timestamp":"2023-02-23T20:29:46.309Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async IsolatedWorld.document (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:186:63)\n at async IsolatedWorld.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:174:26)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)\n at async /app/node_modules/puppeteer-cluster/dist/util.js:63:24"}} 11 | {"logLevel":"info","timestamp":"2023-02-23T20:29:46.594Z","context":"pageGraph","message":"Page 
graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/","seedId":0,"depth":0,"started":"2023-02-23T20:29:37.646Z"}} 12 | {"logLevel":"info","timestamp":"2023-02-23T20:29:46.594Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":0,"total":5,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/\",\"seedId\":0,\"depth\":0,\"started\":\"2023-02-23T20:29:37.646Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.593Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.633Z\"}"]}} 13 | {"logLevel":"warn","timestamp":"2023-02-23T20:29:46.821Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async IsolatedWorld.document (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:186:63)\n at async IsolatedWorld.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:174:26)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)\n at async /app/node_modules/puppeteer-cluster/dist/util.js:63:24"}} 14 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.104Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} 15 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.108Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 16 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.110Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 17 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.111Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} 18 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.600Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} 19 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.606Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 20 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.607Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 21 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.608Z","context":"general","message":"Run behaviors 
finished","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} 22 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.675Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} 23 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.678Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 24 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.678Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 25 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.680Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} 26 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.905Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} 27 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.909Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 28 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.909Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 29 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.911Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} 30 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.113Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/use-cases/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:45.593Z"}} 31 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.115Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":1,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/use-cases/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.593Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.633Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}"]}} 32 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.611Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz-auth/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:45.633Z"}} 33 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.611Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":2,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.633Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}"]}} 34 | 
{"logLevel":"info","timestamp":"2023-02-23T20:29:53.680Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:45.613Z"}} 35 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.680Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":3,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}"]}} 36 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.914Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/cdxj/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:46.595Z"}} 37 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.914Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":4,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}"]}} 38 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.380Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} 39 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.382Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 40 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.382Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 41 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.382Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} 42 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.784Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} 43 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.789Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 44 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.789Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 45 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.790Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} 46 | 
{"logLevel":"info","timestamp":"2023-02-23T20:29:59.883Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} 47 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.893Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 48 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.893Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 49 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.894Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} 50 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.090Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} 51 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.096Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 52 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.097Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 53 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.098Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} 54 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.383Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.119Z"}} 55 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.383Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":5,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} 56 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.793Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.612Z"}} 57 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.794Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":6,"total":9,"pending":3,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} 58 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.896Z","context":"pageGraph","message":"Page graph data for successfully crawled 
page","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.681Z"}} 59 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.896Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":7,"total":9,"pending":2,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} 60 | {"logLevel":"info","timestamp":"2023-02-23T20:30:01.103Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.915Z"}} 61 | {"logLevel":"info","timestamp":"2023-02-23T20:30:01.107Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":8,"total":9,"pending":1,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} 62 | {"logLevel":"info","timestamp":"2023-02-23T20:30:01.265Z","context":"general","message":"Waiting to ensure pending data is written to WARCs...","details":{}} 63 | {"logLevel":"info","timestamp":"2023-02-23T20:30:01.277Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":9,"total":9,"pending":0,"limit":{"max":0,"hit":false},"pendingPages":[]}} 64 | -------------------------------------------------------------------------------- /tests/test_optional_flags_wacz.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import os 4 | import zipfile, json, gzip 5 | from wacz.main import main, now 6 | from wacz.util import hash_file 7 | from unittest.mock import patch 8 | import jsonlines 9 | 10 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 11 | PAGES_DIR = os.path.join(TEST_DIR, "pages") 12 | 13 | 14 | class TestWaczFormat(unittest.TestCase): 15 | def test_warc_with_invalid_passed_pages(self): 16 | """If a user passes an invalid file using --page we should return an error""" 17 | with tempfile.TemporaryDirectory() as tmpdir: 18 | fp = tempfile.NamedTemporaryFile() 19 | fp.write( 20 | """{"format": "title": "All Pages"}\n{"http://www.example" "0-10-07T21:22:36Z", "title": "Example Domain"}""".encode( 21 | "utf-8" 22 | ) 23 | ) 24 | fp.seek(0) 25 | self.assertEqual( 26 | main( 27 | [ 28 | "create", 29 | "-f", 30 | os.path.join(TEST_DIR, "example-collection.warc"), 31 | "-o", 32 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), 33 | "-p", 34 | os.path.join(tmpdir, fp.name), 35 | ] 36 | ), 37 | 0, 38 | ) 39 | 40 | def test_invalid_passed_pages_copy_pages(self): 41 | """If a user passes an invalid pages.jsonl file using --page --copy-pages we should return an error""" 42 | with tempfile.TemporaryDirectory() as tmpdir: 43 | self.assertEqual( 44 | main( 45 | [ 46 | "create", 47 | "-f", 48 | os.path.join(TEST_DIR, "example-collection.warc"), 49 | "-o", 50 | os.path.join( 51 | tmpdir, "example-collection-invalid-copy-pages.wacz" 52 | ), 53 | "-p", 54 | os.path.join(PAGES_DIR, "invalid.jsonl"), 55 | "--copy-pages", 56 | ] 57 | ), 58 | 1, 59 | ) 60 | 61 | self.assertEqual( 62 | main( 63 | [ 64 | "create", 65 | "-f", 66 | os.path.join(TEST_DIR, "example-collection.warc"), 67 | "-o", 68 | os.path.join( 69 

    def test_invalid_passed_pages_copy_pages(self):
        """If a user passes an invalid pages.jsonl file using --pages with --copy-pages we should return an error"""
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(
                            tmpdir, "example-collection-invalid-copy-pages.wacz"
                        ),
                        "-p",
                        os.path.join(PAGES_DIR, "invalid.jsonl"),
                        "--copy-pages",
                    ]
                ),
                1,
            )

            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(
                            tmpdir, "example-collection-invalid-copy-pages-txt.wacz"
                        ),
                        "-p",
                        os.path.join(PAGES_DIR, "invalid.txt"),
                        "--copy-pages",
                    ]
                ),
                1,
            )

    def test_invalid_passed_extra_pages_copy_pages(self):
        """If a user passes an invalid extraPages.jsonl file using -e with --copy-pages, the WACZ should still be created, just without extra pages"""
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(
                            tmpdir, "example-collection-invalid-copy-extra-pages.wacz"
                        ),
                        "-p",
                        os.path.join(PAGES_DIR, "pages.jsonl"),
                        "-e",
                        os.path.join(PAGES_DIR, "invalid.txt"),
                        "--copy-pages",
                    ]
                ),
                0,
            )

            with zipfile.ZipFile(
                os.path.join(
                    tmpdir, "example-collection-invalid-copy-extra-pages.wacz"
                ),
                "r",
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "wacz_no_extra_pages"))

            self.assertEqual(
                main(
                    [
                        "validate",
                        "-f",
                        os.path.join(
                            tmpdir, "example-collection-invalid-copy-extra-pages.wacz"
                        ),
                    ]
                ),
                0,
            )

            self.assertNotIn(
                "extraPages.jsonl",
                os.listdir(os.path.join(tmpdir, "wacz_no_extra_pages/pages/")),
            )

    @patch("wacz.main.now")
    def test_warc_with_pages_flag(self, mock_now):
        """When passing the pages flag with a valid pages.jsonl file a pages/pages.jsonl file should be created"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)

        with tempfile.TemporaryDirectory() as tmpdir:
            fp = tempfile.NamedTemporaryFile()
            fp.write(
                """{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}\n{"id": "1db0ef709a", "url": "http://www.example.com/", "ts": "2020-10-07T21:22:36Z", "title": "Example Domain"}""".encode(
                    "utf-8"
                )
            )
            fp.seek(0)
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-url.wacz"),
                        "-p",
                        fp.name,
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-url.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_pages"))

            self.assertEqual(
                main(
                    [
                        "validate",
                        "-f",
                        os.path.join(tmpdir, "example-collection-valid-url.wacz"),
                    ]
                ),
                0,
            )
            wacz_pages = os.path.join(tmpdir, "unzipped_valid_pages/pages/pages.jsonl")
            wacz_cdx = os.path.join(tmpdir, "unzipped_valid_pages/indexes/index.cdx.gz")
            with gzip.open(wacz_cdx, "rb") as cdx:
                cdx_content = cdx.read()
            self.assertIn(
                "pages.jsonl",
                os.listdir(os.path.join(tmpdir, "unzipped_valid_pages/pages/")),
            )
            with open(wacz_pages) as f:
                next(f)  # skip the json-pages header line
                for line in f:
                    obj = json.loads(line)
                    self.assertIn("id", obj)
                    self.assertIn("ts", obj)
                    self.assertIn("url", obj)
                    # every page URL should also be present in the CDX index
                    self.assertIn(obj["url"].encode(), cdx_content)
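
    # pages.jsonl layout, as exercised above: the first line is a header
    # record, e.g.
    #   {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
    # and each subsequent line is a single JSON page record with at least
    # "id", "url", and "ts" fields; that is why readers in these tests call
    # next(f) once to skip the header before validating the page entries.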
as-is""" 192 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 193 | 194 | with tempfile.TemporaryDirectory() as tmpdir: 195 | self.assertEqual( 196 | main( 197 | [ 198 | "create", 199 | "-f", 200 | os.path.join(TEST_DIR, "example-collection.warc"), 201 | "-o", 202 | os.path.join(tmpdir, "example-collection-copy-pages.wacz"), 203 | "-p", 204 | os.path.join(PAGES_DIR, "pages.jsonl"), 205 | "-e", 206 | os.path.join(PAGES_DIR, "extraPages.jsonl"), 207 | "--copy-pages", 208 | ] 209 | ), 210 | 0, 211 | ) 212 | 213 | with zipfile.ZipFile( 214 | os.path.join(tmpdir, "example-collection-copy-pages.wacz"), "r" 215 | ) as zip_ref: 216 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_copy_pages")) 217 | zip_ref.close() 218 | 219 | self.assertEqual( 220 | main( 221 | [ 222 | "validate", 223 | "-f", 224 | os.path.join(tmpdir, "example-collection-copy-pages.wacz"), 225 | ] 226 | ), 227 | 0, 228 | ) 229 | 230 | wacz_pages = os.path.join(tmpdir, "unzipped_copy_pages/pages/pages.jsonl") 231 | wacz_extra_pages = os.path.join( 232 | tmpdir, "unzipped_copy_pages/pages/extraPages.jsonl" 233 | ) 234 | 235 | self.assertTrue( 236 | "pages.jsonl" 237 | in os.listdir(os.path.join(tmpdir, "unzipped_copy_pages/pages/")) 238 | ) 239 | self.assertTrue( 240 | "extraPages.jsonl" 241 | in os.listdir(os.path.join(tmpdir, "unzipped_copy_pages/pages/")) 242 | ) 243 | 244 | self.assertEqual( 245 | hash_file("sha256", wacz_pages), 246 | hash_file("sha256", os.path.join(PAGES_DIR, "pages.jsonl")), 247 | ) 248 | self.assertEqual( 249 | hash_file("sha256", wacz_extra_pages), 250 | hash_file("sha256", os.path.join(PAGES_DIR, "extraPages.jsonl")), 251 | ) 252 | 253 | @patch("wacz.main.now") 254 | def test_warc_with_detect_pages_flag(self, mock_now): 255 | """When passing the text index flag pages/pages.jsonl should be generated.""" 256 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 257 | with tempfile.TemporaryDirectory() as tmpdir: 258 | self.assertEqual( 259 | main( 260 | [ 261 | "create", 262 | "-f", 263 | os.path.join(TEST_DIR, "example-collection.warc"), 264 | "-o", 265 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), 266 | "--detect-pages", 267 | ] 268 | ), 269 | 0, 270 | ) 271 | with zipfile.ZipFile( 272 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), "r" 273 | ) as zip_ref: 274 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_pages")) 275 | zip_ref.close() 276 | 277 | wacz_pages = os.path.join(tmpdir, "unzipped_valid_pages/pages/pages.jsonl") 278 | wacz_cdx = os.path.join(tmpdir, "unzipped_valid_pages/indexes/index.cdx.gz") 279 | cdx_content = gzip.open(wacz_cdx, "rb").read() 280 | self.assertTrue( 281 | "pages.jsonl" 282 | in os.listdir(os.path.join(tmpdir, "unzipped_valid_pages/pages/")) 283 | ) 284 | with open(wacz_pages) as f: 285 | for _ in range(1): 286 | next(f) 287 | for line in f: 288 | obj = json.loads(line) 289 | self.assertTrue("id" in obj.keys()) 290 | self.assertTrue("ts" in obj.keys()) 291 | self.assertTrue("title" in obj.keys()) 292 | self.assertTrue("url" in obj.keys()) 293 | self.assertTrue(obj["url"].encode() in cdx_content) 294 | 295 | @patch("wacz.main.now") 296 | def test_warc_with_text_index_flag(self, mock_now): 297 | """When passing the text index flag pages/pages.jsonl should be generated with a full and accurate text index.""" 298 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 299 | with tempfile.TemporaryDirectory() as tmpdir: 300 | self.assertEqual( 301 | main( 302 | [ 303 | "create", 304 | "-f", 305 | os.path.join(TEST_DIR, 
"example-collection.warc"), 306 | "-o", 307 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), 308 | "-t", 309 | ] 310 | ), 311 | 0, 312 | ) 313 | with zipfile.ZipFile( 314 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), "r" 315 | ) as zip_ref: 316 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_text")) 317 | zip_ref.close() 318 | 319 | wacz_pages = os.path.join(tmpdir, "unzipped_valid_text/pages/pages.jsonl") 320 | wacz_cdx = os.path.join(tmpdir, "unzipped_valid_text/indexes/index.cdx.gz") 321 | cdx_content = gzip.open(wacz_cdx, "rb").read() 322 | self.assertTrue( 323 | "pages.jsonl" 324 | in os.listdir(os.path.join(tmpdir, "unzipped_valid_text/pages/")) 325 | ) 326 | with open(wacz_pages) as f: 327 | for _ in range(1): 328 | next(f) 329 | for line in f: 330 | obj = json.loads(line) 331 | self.assertTrue("id" in obj.keys()) 332 | self.assertTrue("ts" in obj.keys()) 333 | self.assertTrue("title" in obj.keys()) 334 | self.assertTrue("url" in obj.keys()) 335 | self.assertTrue(obj["url"].encode() in cdx_content) 336 | self.assertTrue("text" in obj.keys()) 337 | 338 | def test_warc_with_both_p_and_d_flag(self): 339 | """If a user passes both the --pages and --detect-pages flags we should return an error and a message about needing only one""" 340 | with tempfile.TemporaryDirectory() as tmpdir: 341 | with self.assertRaises(SystemExit): 342 | self.assertEqual( 343 | main( 344 | [ 345 | "create", 346 | "-f", 347 | os.path.join(TEST_DIR, "example-collection.warc"), 348 | "-o", 349 | os.path.join(tmpdir, "example-collection.wacz"), 350 | "--detect_pages", 351 | "-p", 352 | "test.jsonl", 353 | ] 354 | ), 355 | 0, 356 | ) 357 | 358 | def test_warc_with_only_ts_flag(self): 359 | """If a user only passes the --ts flag we should return an error and a message about needing to also pass the --url flag""" 360 | with tempfile.TemporaryDirectory() as tmpdir: 361 | with self.assertRaises(SystemExit): 362 | self.assertEqual( 363 | main( 364 | [ 365 | "create", 366 | "-f", 367 | os.path.join(TEST_DIR, "example-collection.warc"), 368 | "-o", 369 | os.path.join(tmpdir, "example-collection.wacz"), 370 | "--ts", 371 | "2020104212236", 372 | ] 373 | ), 374 | 0, 375 | ) 376 | 377 | @patch("wacz.main.now") 378 | def test_warc_with_valid_date_flag(self, mock_now): 379 | """When passing a valid date flag the datapackage should have that as the mainpageTS""" 380 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 381 | with tempfile.TemporaryDirectory() as tmpdir: 382 | self.assertEqual( 383 | main( 384 | [ 385 | "create", 386 | "-f", 387 | os.path.join(TEST_DIR, "example-collection.warc"), 388 | "-o", 389 | os.path.join(tmpdir, "example-collection-valid-desc.wacz"), 390 | "--desc", 391 | "fake desc", 392 | ] 393 | ), 394 | 0, 395 | ) 396 | with zipfile.ZipFile( 397 | os.path.join(tmpdir, "example-collection-valid-desc.wacz"), "r" 398 | ) as zip_ref: 399 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_desc")) 400 | zip_ref.close() 401 | 402 | self.wacz_json = os.path.join( 403 | tmpdir, "unzipped_valid_desc/datapackage.json" 404 | ) 405 | self.wacz_pages = os.path.join( 406 | tmpdir, "unzipped_valid_desc/pages/pages.jsonl" 407 | ) 408 | 409 | f = open(self.wacz_json, "rb") 410 | json_parse = json.loads(f.read()) 411 | 412 | self.assertEqual(json_parse["des"], "fake desc") 413 | 414 | @patch("wacz.main.now") 415 | def test_warc_with_valid_date_flag(self, mock_now): 416 | """When passing a valid title flag the datapackage should have that as the title value""" 417 | 

    @patch("wacz.main.now")
    def test_warc_with_valid_desc_flag(self, mock_now):
        """When passing a valid desc flag the datapackage should have that as its description"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-desc.wacz"),
                        "--desc",
                        "fake desc",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-desc.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_desc"))

            wacz_json = os.path.join(tmpdir, "unzipped_valid_desc/datapackage.json")

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            self.assertEqual(json_parse["description"], "fake desc")

    @patch("wacz.main.now")
    def test_warc_with_valid_title_flag(self, mock_now):
        """When passing a valid title flag the datapackage should have that as the title value"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-title.wacz"),
                        "--title",
                        "Example Title",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-title.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_title"))

            wacz_json = os.path.join(tmpdir, "unzipped_valid_title/datapackage.json")

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            self.assertEqual(json_parse["title"], "Example Title")

    @patch("wacz.main.now")
    def test_warc_with_valid_date_flag(self, mock_now):
        """When passing a valid date flag the datapackage should have that as the mainPageDate"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-date.wacz"),
                        "--date",
                        "2020-11-01",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-date.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_date"))

            wacz_json = os.path.join(tmpdir, "unzipped_valid_date/datapackage.json")

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            self.assertEqual(json_parse["mainPageDate"], "2020-11-01")
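
    # The --desc, --title, and --date flags above are carried into
    # datapackage.json as the "description", "title", and "mainPageDate"
    # keys (key names as asserted by these tests).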

    @patch("wacz.main.now")
    def test_warc_with_valid_url_flag(self, mock_now):
        """When passing a valid url flag the url should be added to the pages.jsonl file and appear in the datapackage"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-url.wacz"),
                        "--url",
                        "http://www.example.com/",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-url.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_url"))

            wacz_json = os.path.join(tmpdir, "unzipped_valid_url/datapackage.json")
            wacz_pages = os.path.join(tmpdir, "unzipped_valid_url/pages/pages.jsonl")

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            with open(wacz_pages, "rb") as f:
                json_pages = [json.loads(jline) for jline in f.read().splitlines()]
            self.assertEqual(json_pages[1]["url"], "http://www.example.com/")
            self.assertEqual(json_parse["mainPageURL"], "http://www.example.com/")
            # without --ts, no main page date should be recorded
            self.assertNotIn("mainPageDate", json_parse)

    def test_warc_with_invalid_url_flag(self):
        """When passing an invalid url flag we should raise a ValueError"""
        with tempfile.TemporaryDirectory() as tmpdir:
            with self.assertRaises(ValueError):
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection.wacz"),
                        "--url",
                        "http://www.examplefake.com/",
                    ]
                )

    def test_warc_with_valid_url_and_invalid_ts_flag(self):
        """When passing a valid url flag with an invalid ts flag we should raise a ValueError"""
        with tempfile.TemporaryDirectory() as tmpdir:
            with self.assertRaises(ValueError):
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection.wacz"),
                        "--url",
                        "http://www.example.com/",
                        "--ts",
                        "2020104212236",
                    ]
                )

    @patch("wacz.main.now")
    def test_warc_with_valid_url_and_ts_flag(self, mock_now):
        """When passing a valid url and ts flag we should see those values represented in the datapackage and pages.jsonl file"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(
                            tmpdir, "example-collection-valid-url-valid-ts.wacz"
                        ),
                        "--url",
                        "http://www.example.com/",
                        "--ts",
                        "20201007212236",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-url-valid-ts.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_url_valid_ts"))

            wacz_json = os.path.join(
                tmpdir, "unzipped_valid_url_valid_ts/datapackage.json"
            )
            wacz_pages = os.path.join(
                tmpdir, "unzipped_valid_url_valid_ts/pages/pages.jsonl"
            )

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            with open(wacz_pages, "rb") as f:
                json_pages = [json.loads(jline) for jline in f.read().splitlines()]
            self.assertEqual(json_pages[1]["url"], "http://www.example.com/")
            self.assertEqual(json_parse["mainPageURL"], "http://www.example.com/")
            # the 14-digit --ts value is normalized to an ISO 8601 timestamp
            self.assertEqual(json_parse["mainPageDate"], "2020-10-07T21:22:36Z")
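
    # --hash-type selects the digest algorithm used for the resource entries
    # recorded in datapackage.json. The two tests below only assert that the
    # chosen algorithm's name appears somewhere in each resource's "hash"
    # string; the exact "algorithm:hexdigest" layout is not checked here.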
@patch("wacz.main.now") 639 | def test_warc_with_hash_flag_sha256(self, mock_now): 640 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 641 | """When passing the --hash-type flag with a value of sha256 the datapackage should be hashed using sha256""" 642 | with tempfile.TemporaryDirectory() as tmpdir: 643 | self.assertEqual( 644 | main( 645 | [ 646 | "create", 647 | "-f", 648 | os.path.join(TEST_DIR, "example-collection.warc"), 649 | "-o", 650 | os.path.join(tmpdir, "example-collection-sha256.wacz"), 651 | "--hash-type", 652 | "sha256", 653 | ] 654 | ), 655 | 0, 656 | ) 657 | with zipfile.ZipFile( 658 | os.path.join(tmpdir, "example-collection-sha256.wacz"), "r" 659 | ) as zip_ref: 660 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_sha256")) 661 | zip_ref.close() 662 | 663 | self.wacz_json = os.path.join(tmpdir, "unzipped_sha256/datapackage.json") 664 | 665 | f = open(self.wacz_json, "rb") 666 | json_parse = json.loads(f.read()) 667 | 668 | assert "sha256" in json_parse["resources"][0]["hash"] 669 | 670 | 671 | if __name__ == "__main__": 672 | unittest.main() 673 | --------------------------------------------------------------------------------