├── tests ├── __init__.py ├── fixtures │ ├── pages │ │ ├── invalid.txt │ │ ├── invalid.jsonl │ │ ├── pages.jsonl │ │ └── extraPages.jsonl │ ├── .gitignore │ ├── example-iana.warc │ ├── valid_example_1.wacz │ ├── valid_example_2.wacz │ ├── example-collection.warc │ ├── example-resource.warc.gz │ ├── invalid_example_1.wacz │ ├── invalid_signed_example_1.wacz │ ├── valid_signed_example_1.wacz │ ├── example-warcinfo-metadata.warc │ ├── example-collection-with-lists.warc │ └── logs │ │ ├── wr-crawl.log │ │ └── wr-specs-crawl.log ├── test_verify_signed.py ├── test_util.py ├── test_create_wacz_hash_in_page.py ├── test_wacz_indexer_functions.py ├── test_create_wacz.py ├── test_validate_wacz.py ├── test_create_wacz_indexing.py └── test_optional_flags_wacz.py ├── wacz ├── __init__.py ├── __main__.py ├── util.py ├── validate.py ├── main.py └── waczindexer.py ├── requirements.txt ├── codecov.yml ├── .coveragerc ├── .gitignore ├── .github └── workflows │ ├── publish_pypi.yaml │ └── ci.yaml ├── setup.py ├── LICENSE ├── CHANGES.md └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wacz/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/fixtures/pages/invalid.txt: -------------------------------------------------------------------------------- 1 | Not a JSONL file 2 | -------------------------------------------------------------------------------- /wacz/__main__.py: -------------------------------------------------------------------------------- 1 | from wacz.main import main 2 | 3 | main() 4 | -------------------------------------------------------------------------------- /tests/fixtures/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | */ 3 | !.gitignore 4 | !*.warc 5 | !*.warc.gz 6 | !*.wacz 7 | !*.jsonl 8 | !pages/* 9 | -------------------------------------------------------------------------------- /tests/fixtures/example-iana.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-iana.warc -------------------------------------------------------------------------------- /tests/fixtures/valid_example_1.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/valid_example_1.wacz -------------------------------------------------------------------------------- /tests/fixtures/valid_example_2.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/valid_example_2.wacz -------------------------------------------------------------------------------- /tests/fixtures/example-collection.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-collection.warc -------------------------------------------------------------------------------- /tests/fixtures/example-resource.warc.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-resource.warc.gz -------------------------------------------------------------------------------- /tests/fixtures/invalid_example_1.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/invalid_example_1.wacz -------------------------------------------------------------------------------- /tests/fixtures/invalid_signed_example_1.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/invalid_signed_example_1.wacz -------------------------------------------------------------------------------- /tests/fixtures/valid_signed_example_1.wacz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/valid_signed_example_1.wacz -------------------------------------------------------------------------------- /tests/fixtures/example-warcinfo-metadata.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-warcinfo-metadata.warc -------------------------------------------------------------------------------- /tests/fixtures/example-collection-with-lists.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/py-wacz/HEAD/tests/fixtures/example-collection-with-lists.warc -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | frictionless>=3.23.4 2 | shortuuid>=1.0.1 3 | cdxj-indexer>=1.4.4 4 | boilerpy3>=1.0.2 5 | pytest-cov>=2.10.1 6 | PyYAML>=5.3.1 7 | black>=20.8b1 8 | jsonlines>=3.0.0 9 | click>=8.0.0 10 | typer==0.11.1 -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | # basic 6 | target: 80% 7 | threshold: 2% 8 | base: auto 9 | 10 | ignore: 11 | - "tests/*" 12 | - "wacz/__init__.py" 13 | - "wacz/__main__.py" 14 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = wacz 3 | branch = True 4 | omit = 5 | */test/* 6 | */tests/* 7 | *.html 8 | *.js 9 | *.css 10 | 11 | [report] 12 | exclude_lines = 13 | pragma: no cover 14 | if __name__ == .__main__.: 15 | def __repr__ 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /tests/fixtures/pages/invalid.jsonl: -------------------------------------------------------------------------------- 1 | {id": "extra-pages", "title": "Extra Pages"} 2 | {"id": "8e584989-8e90-41d6-9f27-c15d0fefe437", "url": "https://webrecorder.net/about", "title": "Webrecorder | About", "loadState": 4, "status": null, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} 3 | -------------------------------------------------------------------------------- /tests/fixtures/pages/pages.jsonl:
-------------------------------------------------------------------------------- 1 | {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"} 2 | {"id": "3e01410a-e0a8-4b6f-8a6a-fca6302d9916", "url": "https://webrecorder.net/", "title": "Webrecorder", "loadState": 4, "status": 200, "seed": true, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:17Z"} 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | .eggs 13 | *.cache 14 | parts 15 | bin 16 | var 17 | sdist 18 | develop-eggs 19 | .installed.cfg 20 | lib 21 | lib64 22 | __pycache__ 23 | 24 | # Installer logs 25 | pip-log.txt 26 | 27 | # Unit test / coverage reports 28 | .coverage 29 | .tox 30 | nosetests.xml 31 | 32 | # Translations 33 | *.mo 34 | 35 | .DS_Store 36 | **/.DS_Store 37 | 38 | # Mr Developer 39 | .mr.developer.cfg 40 | .project 41 | .pydevproject 42 | 43 | Pipfile* 44 | -------------------------------------------------------------------------------- /tests/fixtures/pages/extraPages.jsonl: -------------------------------------------------------------------------------- 1 | {"format": "json-pages-1.0", "id": "extra-pages", "title": "Extra Pages"} 2 | {"id": "e33b4ca5-ce1d-46b2-83ea-405c43b949c5", "url": "https://webrecorder.net/tools", "title": "Webrecorder | Tools", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:22Z"} 3 | {"id": "d026299c-3e37-4473-bcb4-742bc005b25d", "url": "https://webrecorder.net/blog", "title": "Webrecorder | Blog", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} 4 | {"id": "726e4e11-abb5-447d-b0be-61c4de7bb4b1", "url": "https://webrecorder.net/community", "title": "Webrecorder | Community", "loadState": 4, "status": 200, "favIconUrl": "https://webrecorder.net/assets/favicon.ico", "ts": "2024-03-20T20:41:20Z"} 5 | -------------------------------------------------------------------------------- /.github/workflows/publish_pypi.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | pypi-release: 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.9] 12 | 13 | steps: 14 | - name: checkout 15 | uses: actions/checkout@v1 16 | 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | 22 | - name: Install dependencies 23 | run: python -m pip install --upgrade pip wheel twine 24 | 25 | - name: Build Dist 26 | run: python setup.py sdist bdist_wheel 27 | 28 | - name: Publish package to PyPI 29 | uses: pypa/gh-action-pypi-publish@master 30 | with: 31 | user: __token__ 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 | 34 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # vim: set sw=4 et: 3 | from setuptools import setup, find_packages 4 | 5 | __version__ = "0.5.0" 6 | 7 | def load_requirements(filename): 8 | with open(filename, "rt") as fh: 9 | return fh.read().rstrip().split("\n") 10 | 11 |
def long_description(): 12 | with open("README.md") as f: 13 | return f.read() 14 | 15 | setup( 16 | name="wacz", 17 | version=__version__, 18 | author="Ilya Kreymer, Emma Dickson", 19 | author_email="info@webrecorder.net", 20 | license="Apache 2.0", 21 | packages=find_packages(exclude=["tests"]), 22 | url="https://github.com/webrecorder/py-wacz", 23 | description="WACZ Format Tools", 24 | long_description=long_description(), 25 | long_description_content_type="text/markdown", 26 | install_requires=load_requirements("requirements.txt"), 27 | extras_require={"signing": ["authsign>=0.5.1", "requests"]}, 28 | zip_safe=True, 29 | setup_requires=["pytest-runner"], 30 | entry_points=""" 31 | [console_scripts] 32 | wacz = wacz.main:main 33 | """, 34 | ) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Webrecorder 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | 4 | jobs: 5 | unit-tests: 6 | runs-on: ubuntu-latest 7 | strategy: 8 | max-parallel: 3 9 | matrix: 10 | python-version: ['3.8', '3.9', '3.10'] 11 | 12 | steps: 13 | - name: checkout 14 | uses: actions/checkout@v2 15 | 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install --upgrade -r requirements.txt 25 | python setup.py -q install 26 | pip install -e .[signing] 27 | 28 | - name: Style Check 29 | run: | 30 | black --check tests/* 31 | black --check wacz/* 32 | 33 | - name: Test with pytest 34 | run: | 35 | set -e 36 | pytest --cov-config=.coveragerc 37 | pytest -v --cov=wacz --cov-report=xml 38 | 39 | - name: Upload coverage to Codecov 40 | uses: codecov/codecov-action@v1 41 | with: 42 | verbose: true 43 | -------------------------------------------------------------------------------- /tests/test_verify_signed.py: -------------------------------------------------------------------------------- 1 | import unittest, os 2 | from wacz.main import main 3 | 4 | 5 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 6 | 7 | 8 | class TestVerifySigned(unittest.TestCase): 9 | def test_wacz_valid_and_verify_sig(self): 10 | self.assertEqual( 11 | main( 12 | [ 13 | "validate", 14 | "--verify-auth", 15 | "-f", 16 | os.path.join(TEST_DIR, "valid_signed_example_1.wacz"), 17 | ] 18 | ), 19 | 0, 20 | ) 21 | 22 | def test_wacz_valid_and_not_valid_sig(self): 23 | self.assertEqual( 24 | main( 25 | [ 26 | "validate", 27 | "--verify-auth", 28 | "-f", 29 | os.path.join(TEST_DIR, "invalid_signed_example_1.wacz"), 30 | ] 31 | ), 32 | 1, 33 | ) 34 | 35 | def test_wacz_valid_not_signed(self): 36 | self.assertEqual( 37 | main( 38 | [ 39 | "validate", 40 | "--verify-auth", 41 | "-f", 42 | os.path.join(TEST_DIR, "valid_example_1.wacz"), 43 | ] 44 | ), 45 | 1, 46 | ) 47 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import os 4 | import zipfile, json, gzip, hashlib 5 | from io import BytesIO 6 | 7 | from wacz.util import hash_stream, validateJSON 8 | 9 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 10 | 11 | 12 | class TestUtilFunctions(unittest.TestCase): 13 | def test_util_hash(self): 14 | """When invoking the util hash method a hash should be returned""" 15 | test_hash = "sha256:%s" % hashlib.sha256("test".encode("utf-8")).hexdigest() 16 | bytes_, hash_ = hash_stream("sha256", BytesIO("test".encode("utf-8"))) 17 | self.assertEqual(bytes_, 4) 18 | self.assertEqual(hash_, test_hash) 19 | 20 | test_hash = "md5:%s" % hashlib.md5("test".encode("utf-8")).hexdigest() 21 | bytes_, hash_ = hash_stream("md5", BytesIO("test".encode("utf-8"))) 22 | self.assertEqual(bytes_, 4) 23 | self.assertEqual(hash_, test_hash) 24 | 25 | def test_util_validate_json_succeed(self): 26 | """validate json method should succeed with valid json""" 27 | self.assertTrue(validateJSON('{"test": "test"}')) 28 | 29 | def test_util_validate_json_fail(self): 30 | 
"""validate json method should fail with valid json""" 31 | self.assertFalse(validateJSON('test": "test"}')) 32 | 33 | 34 | if __name__ == "__main__": 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # 0.4.8 2 | 3 | - Add -l/--log-directory option to add logs directory to WACZ 4 | 5 | # 0.4.7 6 | 7 | - include request cookie in cdxj via 'req.http:cookie' field (#27) 8 | - fix Click dependency version 9 | 10 | # 0.4.6 11 | 12 | - wacz zip write: ensure zip file is fully closed on exit (fixes #20 13 | - ci: add ci for py3.10 14 | - wacz create: support --url, --detect-pages and --split-seeds to write detect pages to extraPages.jsonl, specified seed to pages.jsonl 15 | - text extract: don't raise exception, keep parsed text 16 | 17 | # 0.4.5 18 | - Pages: also ignore pages with invalid utf-8 encoding 19 | 20 | # 0.4.4 21 | 22 | - Pages: read pages line by line in case of large pages file 23 | 24 | # 0.4.3 25 | 26 | - Pages: Better page parsing fix, more lenient on page parsing errors: print error and continue, ignoring invalid page 27 | 28 | # 0.4.2 29 | 30 | - Pages: Fix parsing of page URLs that contain extra ':' 31 | 32 | # 0.4.1 33 | 34 | - More efficient hash computation 35 | 36 | # 0.4.0 37 | 38 | - Add support for signing and verification! 39 | 40 | # 0.3.1 41 | 42 | - Ensure passed in pages are check via both http and https URLs 43 | - Update to cdxj-indexer 1.4.1, supporting improved indexing of JSON POST requests 44 | 45 | # 0.3.0 46 | 47 | - Add `name` field to `resources` for better compatibility with frictionless spec. 48 | 49 | # wacz 0.3.0b1 50 | 51 | Improved compatibility with frictionless data spec 52 | 53 | - Top-level `title`, `description`, `created`, `software` fields and optional `mainPageURL` and `mainPageTS` fields. 
54 | - Include full WARC record digests in `recordDigest` field in CDX, `digest` in IDX 55 | - Support for `pages/extraPages.jsonl` passed in via --extra-pages/-e flag 56 | -------------------------------------------------------------------------------- /tests/test_create_wacz_hash_in_page.py: -------------------------------------------------------------------------------- 1 | import unittest, os, zipfile, sys, gzip, json, tempfile 2 | from wacz.main import main, now 3 | from unittest.mock import patch 4 | from wacz.util import hash_stream 5 | from frictionless import validate, Report 6 | 7 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 8 | 9 | 10 | class TestWaczFor(unittest.TestCase): 11 | @classmethod 12 | @patch("wacz.main.now") 13 | def setUpClass(self, mock_now): 14 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 15 | self.tmpdir = tempfile.TemporaryDirectory() 16 | with open(os.path.join(self.tmpdir.name, "test-pages.jsonl"), "wt") as fh: 17 | fh.write('{"format": "json-pages-1.0", "id": "pages", "title": "Pages"}\n') 18 | fh.write( 19 | '{"id": "abcdef", "url": "https://www.example.com/#hashtag", "title": "Example", "loadState": 4}\n' 20 | ) 21 | 22 | main( 23 | [ 24 | "create", 25 | "-f", 26 | os.path.join(TEST_DIR, "example-collection.warc"), 27 | "-p", 28 | os.path.join(self.tmpdir.name, "test-pages.jsonl"), 29 | "-o", 30 | os.path.join(self.tmpdir.name, "example-custom-page.wacz"), 31 | ] 32 | ) 33 | 34 | def test_hash(self): 35 | with zipfile.ZipFile( 36 | os.path.join(self.tmpdir.name, "example-custom-page.wacz"), "r" 37 | ) as zip_ref: 38 | zip_ref.extract( 39 | "pages/pages.jsonl", 40 | os.path.join(self.tmpdir.name, "extract-custom-page"), 41 | ) 42 | zip_ref.close() 43 | 44 | with open( 45 | os.path.join( 46 | self.tmpdir.name, "extract-custom-page", "pages", "pages.jsonl" 47 | ), 48 | "rt", 49 | ) as f: 50 | content = f.read() 51 | 52 | assert ( 53 | content 54 | == """\ 55 | {"format": "json-pages-1.0", "id": "pages", "title": "Pages"} 56 | {"id": "abcdef", "url": "https://www.example.com/#hashtag", "title": "Example", "loadState": 4, "ts": "2020-10-07T21:22:36Z"} 57 | """ 58 | ) 59 | -------------------------------------------------------------------------------- /tests/fixtures/logs/wr-crawl.log: -------------------------------------------------------------------------------- 1 | {"logLevel":"info","timestamp":"2023-02-23T23:44:39.665Z","context":"general","message":"Page context being used with 1 worker","details":{}} 2 | {"logLevel":"info","timestamp":"2023-02-23T23:44:39.666Z","context":"general","message":"Set netIdleWait to 15 seconds","details":{}} 3 | {"logLevel":"info","timestamp":"2023-02-23T23:44:39.666Z","context":"general","message":"Seeds","details":[{"url":"https://webrecorder.net/","include":[],"exclude":[],"scopeType":"page","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]} 4 | {"logLevel":"info","timestamp":"2023-02-23T23:44:40.016Z","context":"state","message":"Storing state in memory","details":{}} 5 | {"logLevel":"info","timestamp":"2023-02-23T23:44:40.473Z","context":"general","message":"Text Extraction: Disabled","details":{}} 6 | {"logLevel":"info","timestamp":"2023-02-23T23:44:40.590Z","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":0,"total":1,"pending":1,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://webrecorder.net/\",\"seedId\":0,\"depth\":0,\"started\":\"2023-02-23T23:44:40.517Z\"}"]}} 7 | 
{"logLevel":"error","timestamp":"2023-02-23T23:44:43.279Z","context":"general","message":"Invalid Seed \"mailto:info@webrecorder.net\" - URL must start with http:// or https://","details":{}} 8 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.286Z","context":"behavior","message":"Behaviors started","details":{"behaviorTimeout":90,"page":"https://webrecorder.net/"}} 9 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.287Z","context":"behavior","message":"Run Script Started","details":{"url":"https://webrecorder.net/","page":"https://webrecorder.net/"}} 10 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.291Z","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://webrecorder.net/"}} 11 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.291Z","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!","page":"https://webrecorder.net/"}} 12 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.293Z","context":"behavior","message":"Run Script Finished","details":{"url":"https://webrecorder.net/","page":"https://webrecorder.net/"}} 13 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.293Z","context":"behavior","message":"Behaviors finished","details":{"finished":1,"page":"https://webrecorder.net/"}} 14 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.293Z","context":"pageStatus","message":"Page finished","details":{"page":"https://webrecorder.net/"}} 15 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.358Z","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":1,"total":1,"pending":0,"limit":{"max":0,"hit":false},"pendingPages":[]}} 16 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.358Z","context":"general","message":"Waiting to ensure pending data is written to WARCs...","details":{}} 17 | {"logLevel":"info","timestamp":"2023-02-23T23:44:43.364Z","context":"general","message":"Crawl status: done","details":{}} 18 | -------------------------------------------------------------------------------- /tests/test_wacz_indexer_functions.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import os 4 | import zipfile, json, gzip 5 | from wacz.main import main, now 6 | from wacz.waczindexer import WACZIndexer 7 | 8 | PAGE_INDEX = "pages/pages.jsonl" 9 | 10 | 11 | def match_detected_pages(self, detected_pages, passed_pages_url, passed_pages_ts): 12 | for page in detected_pages: 13 | page = detected_pages[page] 14 | url = page["url"] 15 | ts = page["timestamp"] 16 | if passed_pages_url == url and passed_pages_ts == None: 17 | return page 18 | if passed_pages_url == url and passed_pages_ts == ts: 19 | return page 20 | return 0 21 | 22 | 23 | class TestWaczIndexerFunctions(unittest.TestCase): 24 | def test_match_detected_page_invalid(self): 25 | """When passed invalid urls and invalid timestamps the function should return 0""" 26 | detected_pages = { 27 | "20201007212236/http://www.example.com/": { 28 | "url": "http://www.example.com/", 29 | "timestamp": "20201007212236", 30 | "title": "Example Domain", 31 | "rec": "fbt5hqmtseanlxzt", 32 | "id": "1db0ef709a", 33 | "text": "Example Domain\nThis domain is for use in illustrative examples in documents. 
You may use this\n domain in literature without prior coordination or asking for permission.\n", 34 | } 35 | } 36 | self.assertEqual( 37 | match_detected_pages(self, detected_pages, "fake_url", None), 0 38 | ) 39 | self.assertEqual( 40 | match_detected_pages(self, detected_pages, "fake_url", "fake-ts"), 41 | 0, 42 | ) 43 | 44 | def test_match_detected_page_valid(self): 45 | """When passed valid urls and valid timestamps the function should return the page""" 46 | detected_pages = { 47 | "20201007212236/http://www.example.com/": { 48 | "url": "http://www.example.com/", 49 | "timestamp": "20201007212236", 50 | "title": "Example Domain", 51 | "rec": "fbt5hqmtseanlxzt", 52 | "id": "1db0ef709a", 53 | "text": "Example Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\n", 54 | } 55 | } 56 | self.assertEqual( 57 | match_detected_pages(self, detected_pages, "http://www.example.com/", None), 58 | { 59 | "url": "http://www.example.com/", 60 | "timestamp": "20201007212236", 61 | "title": "Example Domain", 62 | "rec": "fbt5hqmtseanlxzt", 63 | "id": "1db0ef709a", 64 | "text": "Example Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\n", 65 | }, 66 | ) 67 | self.assertEqual( 68 | match_detected_pages( 69 | self, detected_pages, "http://www.example.com/", "20201007212236" 70 | ), 71 | { 72 | "url": "http://www.example.com/", 73 | "timestamp": "20201007212236", 74 | "title": "Example Domain", 75 | "rec": "fbt5hqmtseanlxzt", 76 | "id": "1db0ef709a", 77 | "text": "Example Domain\nThis domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.\n", 78 | }, 79 | ) 80 | 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /wacz/util.py: -------------------------------------------------------------------------------- 1 | import hashlib, datetime, json, os 2 | from warcio.timeutils import iso_date_to_timestamp 3 | import pkg_resources 4 | 5 | WACZ_VERSION = "1.1.1" 6 | 7 | 8 | BUFF_SIZE = 1024 * 64 9 | 10 | 11 | def check_http_and_https(url, ts, pages_dict): 12 | """Checks for http and https versions of the passed url 13 | in the pages dict 14 | :param url: the url to check; :param ts: the timestamp; :param pages_dict: the pages dict the user passed 15 | :returns: the matching key if a match was found, otherwise an empty string 16 | :rtype: str 17 | """ 18 | parts = url.split(":", 1) 19 | if len(parts) < 2: 20 | return "" 21 | 22 | url_body = parts[1] 23 | checks = [ 24 | f"http:{url_body}", 25 | f"https:{url_body}", 26 | f"{ts}/http:{url_body}", 27 | f"{ts}/https:{url_body}", 28 | ] 29 | 30 | for check in checks: 31 | if check in pages_dict: 32 | return check 33 | 34 | return "" 35 | 36 | 37 | def get_py_wacz_version(): 38 | """Get version of the py-wacz package""" 39 | return pkg_resources.get_distribution("wacz").version 40 | 41 | 42 | def hash_stream(hash_type, stream): 43 | """Hashes the stream with given hash_type hasher""" 44 | try: 45 | hasher = hashlib.new(hash_type) 46 | except: 47 | return 0, "" 48 | 49 | size = 0 50 | 51 | while True: 52 | buff = stream.read(BUFF_SIZE) 53 | size += len(buff) 54 | hasher.update(buff) 55 | if not buff: 56 | break 57 | 58 | return size, hash_type + ":" + hasher.hexdigest() 59 | 60 | 61 | def hash_file(type_, filename): 62 | with
open(filename, "rb") as fh: 63 | size_, hash_ = hash_stream(type_, fh) 64 | 65 | return hash_ 66 | 67 | 68 | def construct_passed_pages_dict(passed_pages_list): 69 | """Creates a dictionary of the passed pages, keyed by url (or 'ts/url' when a ts is present), with the title and text as values when they have been passed""" 70 | passed_pages_dict = {} 71 | 72 | for page_data in passed_pages_list: 73 | # Skip invalid page data 74 | try: 75 | page_dict = json.loads(page_data) 76 | except: 77 | print("Warning: Skipping invalid page {0}".format(page_data)) 78 | continue 79 | 80 | # Skip the file's header if it's been set 81 | if "format" not in page_dict: 82 | url = page_dict.get("url", "") 83 | 84 | # Set the default key to the url without the hashtag, so pages match 85 | # URLs without the hashtag, while keeping the hashtag in the page list 86 | key = url.split("#", 1)[0] 87 | 88 | # If timestamp is present overwrite the key to be 'ts/url' 89 | if "ts" in page_dict: 90 | key = iso_date_to_timestamp(page_dict.pop("ts")) + "/" + url 91 | 92 | # Add the key to the dictionary with remaining data 93 | passed_pages_dict[key] = page_dict 94 | 95 | return passed_pages_dict 96 | 97 | 98 | def now(): 99 | """Returns the current time""" 100 | return tuple(datetime.datetime.utcnow().timetuple()[:6]) 101 | 102 | 103 | def validateJSON(jsonData): 104 | """Attempts to validate a string as json""" 105 | try: 106 | json.loads(jsonData) 107 | except ValueError: 108 | return False 109 | return True 110 | 111 | 112 | def validate_pages_jsonl_file(json_file_path): 113 | """Attempt to validate a pages.jsonl file""" 114 | filename = os.path.basename(json_file_path) 115 | if not filename.lower().endswith(".jsonl"): 116 | return False 117 | 118 | line_index = 0 119 | 120 | with open(json_file_path, "r") as jsonl_file: 121 | for line in jsonl_file: 122 | try: 123 | data = json.loads(line) 124 | if line_index == 0: 125 | data["format"] 126 | data["id"] 127 | else: 128 | data["url"] 129 | data["ts"] 130 | line_index += 1 131 | except json.JSONDecodeError: 132 | print(f"File {filename} is invalid JSONL") 133 | return False 134 | except KeyError: 135 | print(f"File {filename} missing required fields") 136 | return False 137 | 138 | return True 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## py-wacz 2 | 3 | The **py-wacz** repository contains a Python module and command line utility 4 | for working with web archive data using the [WACZ] format specification. Web 5 | Archive Collection Zipped (WACZ) allows web archives to be shared and 6 | distributed by providing a predictable way of packaging up web archive data and 7 | metadata as a ZIP file. The **wacz** command line utility supports converting 8 | any [WARC] files into WACZ files, and optionally generating full-text search 9 | indices of pages. 10 | 11 | ## Install 12 | 13 | Use pip to install the module and a command line utility: 14 | 15 | ``` 16 | pip install wacz 17 | ``` 18 | 19 | Once installed you can use the **wacz** command line utility to *create* and *validate* WACZ files. 20 | 21 | ## Create 22 | 23 | To create a WACZ package you can point **wacz** at a WARC file and tell it 24 | where to write the WACZ with the `-o` option: 25 | 26 | ``` 27 | wacz create -o myfile.wacz <path/to/WARC> 28 | ``` 29 | 30 | The resulting `myfile.wacz` should be loadable via [ReplayWeb.page].
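The create command can also be invoked programmatically, which is how this repository's own test suite drives it (see tests/test_create_wacz.py). A minimal sketch; the WARC and WACZ file names here are placeholders:

```
from wacz.main import main

# Equivalent to: wacz create -f my-crawl.warc -o myfile.wacz
# main() returns 0 on success and a non-zero code on error.
exit_code = main(["create", "-f", "my-crawl.warc", "-o", "myfile.wacz"])
```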
31 | 32 | **wacz** accepts the following options for customizing how the WACZ file is assembled. 33 | 34 | ### -f --file 35 | 36 | Explicitly declare the file being passed to the create function. 37 | 38 | ``` 39 | wacz create -f tests/fixtures/example-collection.warc 40 | ``` 41 | 42 | ### -o --output 43 | 44 | Explicitly declare the name of the wacz being created. 45 | 46 | ``` 47 | wacz create tests/fixtures/example-collection.warc -o mywacz.wacz 48 | ``` 49 | 50 | ### -t --text 51 | 52 | Generates a pages.jsonl page index with a full-text index; must be run in conjunction with --detect-pages and has no effect if run alone. 53 | 54 | ``` 55 | wacz create tests/fixtures/example-collection.warc -t 56 | ``` 57 | 58 | ### --detect-pages 59 | 60 | Generates a pages.jsonl page index without a full-text index. 61 | 62 | ``` 63 | wacz create tests/fixtures/example-collection.warc --detect-pages 64 | ``` 65 | 66 | ### -p --pages 67 | 68 | Overrides the pages index generation with the passed jsonl pages. 69 | 70 | ``` 71 | wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl 72 | ``` 73 | 74 | ### -e --extra-pages 75 | 76 | Overrides the extra pages index generation with the passed extra jsonl pages. 77 | 78 | ``` 79 | wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl -e extra_pages.jsonl 80 | ``` 81 | 82 | ### -c --copy-pages 83 | 84 | Overrides the behavior of the --pages and --extra-pages options to copy existing pages.jsonl and/or extraPages.jsonl files as-is directly into the WACZ rather than attempting to match each page to a WARC record. The files are still parsed for basic correctness. 85 | 86 | ``` 87 | wacz create tests/fixtures/example-collection.warc --pages pages/pages.jsonl --extra-pages pages/extraPages.jsonl --copy-pages 88 | ``` 89 | 90 | ### -t --text 91 | 92 | You can add a full-text index by including the --text flag. 93 | 94 | ``` 95 | wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl --text 96 | ``` 97 | 98 | ### -l --log-directory 99 | 100 | Adds log files in the specified directory to the WACZ. 101 | 102 | ``` 103 | wacz create tests/fixtures/example-collection.warc -l tests/fixtures/logs 104 | ``` 105 | 106 | ### --ts 107 | 108 | Overrides the ts metadata value in the datapackage.json file. 109 | 110 | ``` 111 | wacz create tests/fixtures/example-collection.warc --ts TIMESTAMP 112 | ``` 113 | 114 | ### --url 115 | 116 | Overrides the url metadata value in the datapackage.json file. 117 | 118 | ``` 119 | wacz create tests/fixtures/example-collection.warc --url URL 120 | ``` 121 | 122 | ### --title 123 | 124 | Overrides the title metadata value in the datapackage.json file. 125 | 126 | ``` 127 | wacz create tests/fixtures/example-collection.warc --title TITLE 128 | ``` 129 | 130 | ### --desc 131 | 132 | Overrides the desc metadata value in the datapackage.json file. 133 | 134 | ``` 135 | wacz create tests/fixtures/example-collection.warc --desc DESC 136 | ``` 137 | 138 | ### --hash-type 139 | 140 | Allows the user to specify the hash type used (sha256 or md5). 141 | 142 | ``` 143 | wacz create tests/fixtures/example-collection.warc --hash-type md5 144 | ``` 145 | 146 | ### --signing-url 147 | 148 | An optional URL for a [WACZ signing server](https://github.com/webrecorder/authsign) which will be used to add a signature to the new WACZ. 149 | 150 | This URL should point to an authsign `/sign` API endpoint. 151 | 152 | See the section on `--verify-auth` for more info on signing and verification.
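For example, a WACZ can be created and signed in one step; the signing server URL and token below are placeholders for your own authsign deployment (`--signing-token` is described in the next section):

```
wacz create tests/fixtures/example-collection.warc -o signed.wacz --signing-url https://authsign.example.com/sign --signing-token YOUR_TOKEN
```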
153 | 154 | ### --signing-token 155 | 156 | An optional, secret token passed to the signing server to allow access. See `authsign` for more details. 157 | 158 | 159 | ## Validate 160 | 161 | You can also validate an existing WACZ file by running: 162 | 163 | ``` 164 | wacz validate myfile.wacz 165 | ``` 166 | 167 | ### -f --file 168 | 169 | Explicitly declare the file being passed to the validate function. 170 | 171 | ``` 172 | wacz validate -f tests/fixtures/valid_example_1.wacz 173 | ``` 174 | 175 | ### --verify-auth 176 | 177 | New in 0.4.0, this option additionally verifies that the WACZ is signed, using [authsign](https://github.com/webrecorder/authsign). 178 | 179 | The verification can be done locally, or via a remote signing/verification server. 180 | 181 | To use a remote server, add `--verifier-url`, which should be a URL pointing to the authsign `/verify` endpoint. 182 | 183 | To verify locally, `authsign` must be installed, which can be done by running `pip install wacz[signing]`. 184 | 185 | See the [WACZ Authentication Spec](https://github.com/webrecorder/wacz-auth-spec) for details on WACZ authentication. 186 | 187 | This feature and the specification are still in development (alpha-quality) and are subject to change. 188 | 189 | 190 | 191 | ## Testing 192 | 193 | If you are developing wacz you can run the unit tests with [pytest]: 194 | 195 | ``` 196 | pytest tests 197 | ``` 198 | 199 | [WACZ]: https://github.com/webrecorder/wacz-format 200 | [WARC]: https://en.wikipedia.org/wiki/Web_ARChive 201 | [ReplayWeb.page]: https://replayweb.page 202 | [pytest]: https://docs.pytest.org/ 203 | -------------------------------------------------------------------------------- /tests/test_create_wacz.py: -------------------------------------------------------------------------------- 1 | import unittest, os, zipfile, sys, gzip, json, tempfile 2 | from wacz.main import main, now 3 | from unittest.mock import patch 4 | from wacz.util import hash_file 5 | from frictionless import validate, Report 6 | 7 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 8 | 9 | 10 | class TestWaczFormat(unittest.TestCase): 11 | def find_resource(self, resource_list, filename): 12 | for file in resource_list: 13 | if filename in file["path"]: 14 | return file 15 | 16 | @classmethod 17 | @patch("wacz.main.now") 18 | def setUpClass(self, mock_now): 19 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 20 | self.tmpdir = tempfile.TemporaryDirectory() 21 | main( 22 | [ 23 | "create", 24 | "-f", 25 | os.path.join(TEST_DIR, "example-collection.warc"), 26 | "-o", 27 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), 28 | "-l", 29 | os.path.join(TEST_DIR, "logs"), 30 | ] 31 | ) 32 | with zipfile.ZipFile( 33 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), "r" 34 | ) as zip_ref: 35 | zip_ref.extractall(os.path.join(self.tmpdir.name, "unzipped_wacz_1")) 36 | zip_ref.close() 37 | 38 | self.wacz_file = os.path.join(self.tmpdir.name, "valid_example_1.wacz") 39 | self.warc_file = os.path.join(TEST_DIR, "example-collection.warc") 40 | 41 | self.wacz_archive_warc = os.path.join( 42 | self.tmpdir.name, 43 | "unzipped_wacz_1/archive/example-collection.warc", 44 | ) 45 | self.wacz_index_cdx = os.path.join( 46 | self.tmpdir.name, 47 | "unzipped_wacz_1/indexes/index.cdx.gz", 48 | ) 49 | self.wacz_index_idx = os.path.join( 50 | self.tmpdir.name, 51 | "unzipped_wacz_1/indexes/index.idx", 52 | ) 53 | self.wacz_json = os.path.join( 54 | self.tmpdir.name, 55 | "unzipped_wacz_1/datapackage.json", 56 | ) 57 |
self.wacz_log = os.path.join( 58 | self.tmpdir.name, "unzipped_wacz_1/logs/wr-specs-crawl.log" 59 | ) 60 | self.wacz_second_log = os.path.join( 61 | self.tmpdir.name, "unzipped_wacz_1/logs/wr-crawl.log" 62 | ) 63 | 64 | def test_components(self): 65 | """Check that the basic components of a wacz file exist""" 66 | self.assertTrue( 67 | "example-collection.warc" 68 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/archive")) 69 | ) 70 | self.assertTrue( 71 | "index.cdx.gz" 72 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/indexes")) 73 | ) 74 | self.assertTrue( 75 | "index.idx" 76 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/indexes")) 77 | ) 78 | self.assertTrue( 79 | "pages.jsonl" 80 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/pages")) 81 | ) 82 | self.assertTrue( 83 | "datapackage.json" 84 | in os.listdir(os.path.join(self.tmpdir.name, "unzipped_wacz_1/")) 85 | ) 86 | 87 | def test_archive_structure(self): 88 | """Check that the hash of the original warc file matches that of the warc file in the archive folder""" 89 | original_warc = hash_file("sha256", self.warc_file) 90 | 91 | archive_warc = hash_file("sha256", self.wacz_archive_warc) 92 | 93 | self.assertEqual(original_warc, archive_warc) 94 | 95 | def test_idx_structure(self): 96 | """Check that the idx file has the expected content""" 97 | with open(self.wacz_index_idx, "rb") as f: 98 | content = f.read() 99 | f.close() 100 | 101 | # doing a startswith because compressed gzip block may be different depending on platform, so sha256 is platform dependent 102 | # just checking that the hash is set 103 | self.assertTrue( 104 | content.startswith( 105 | b'!meta 0 {"format": "cdxj-gzip-1.0", "filename": "index.cdx.gz"}\ncom,example)/ 20201007212236 {"offset": 0, "length": 256, "digest": "sha256:', 106 | ) 107 | ) 108 | 109 | def test_cdx_structure(self): 110 | """Check that the cdx file has the expected content""" 111 | content = "" 112 | with gzip.open(self.wacz_index_cdx, "rb") as f: 113 | for line in f: 114 | content = content + line.decode() 115 | f.close() 116 | self.assertEqual( 117 | content, 118 | 'com,example)/ 20201007212236 {"url": "http://www.example.com/", "mime": "text/html", "status": "200", "digest": "sha1:WJM2KPM4GF3QK2BISVUH2ASX64NOUY7L", "length": "1293", "offset": "845", "filename": "example-collection.warc", "recordDigest": "sha256:f78838ace891c96f7a6299e9e085b55a5aba8950a6d77f0f2e9ffe90f63255f2"}\n', 119 | ) 120 | 121 | def test_logs(self): 122 | with open(self.wacz_log, "rb") as f: 123 | content = f.read() 124 | f.close() 125 | 126 | with open(self.wacz_second_log, "rb") as f: 127 | second_content = f.read() 128 | f.close() 129 | 130 | self.assertTrue( 131 | content.startswith( 132 | b'{"logLevel":"info","timestamp":"2023-02-23T20:29:36.908Z","context":"general","message":"Seeds","details":[{"url":"https://specs.webrecorder.net/","include":[{}],"exclude":[],"scopeType":"prefix","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]}\n', 133 | ) 134 | ) 135 | self.assertTrue( 136 | second_content.startswith( 137 | b'{"logLevel":"info","timestamp":"2023-02-23T23:44:39.665Z","context":"general","message":"Page context being used with 1 worker","details":{}}\n' 138 | ) 139 | ) 140 | 141 | def test_data_package_structure(self): 142 | """Check that the package_descriptor is valid""" 143 | f = open(self.wacz_json, "rb") 144 | json_parse = json.loads(f.read()) 145 | # Make sure it's recording the correct number of resources 146 | 
self.assertEqual(len(json_parse["resources"]), 6) 147 | 148 | # Check that the correct hash was recorded for a warc 149 | original_warc = hash_file("sha256", self.warc_file) 150 | 151 | warc_resource = self.find_resource( 152 | json_parse["resources"], "example-collection.warc" 153 | ) 154 | self.assertEqual(original_warc, warc_resource["hash"]) 155 | 156 | # Check that the correct hash was recorded for the index.idx 157 | original_wacz_index_idx = hash_file("sha256", self.wacz_index_idx) 158 | idx_resource = self.find_resource(json_parse["resources"], "idx") 159 | self.assertEqual(original_wacz_index_idx, idx_resource["hash"]) 160 | 161 | # Check that the correct hash was recorded for the index.cdx.gz 162 | original_wacz_index_cdx = hash_file("sha256", self.wacz_index_cdx) 163 | cdx_resource = self.find_resource(json_parse["resources"], "cdx") 164 | self.assertEqual(original_wacz_index_cdx, cdx_resource["hash"]) 165 | 166 | # Check that the correct hash was recorded for the log files 167 | original_wacz_log = hash_file("sha256", self.wacz_log) 168 | log_resource = self.find_resource(json_parse["resources"], "wr-specs-crawl.log") 169 | self.assertEqual(original_wacz_log, log_resource["hash"]) 170 | 171 | second_wacz_log = hash_file("sha256", self.wacz_second_log) 172 | log_resource = self.find_resource(json_parse["resources"], "wr-crawl.log") 173 | self.assertEqual(second_wacz_log, log_resource["hash"]) 174 | 175 | # Use frictionless validation 176 | valid = validate(self.wacz_json) 177 | self.assertTrue(valid.valid) 178 | 179 | 180 | if __name__ == "__main__": 181 | unittest.main() 182 | -------------------------------------------------------------------------------- /tests/test_validate_wacz.py: -------------------------------------------------------------------------------- 1 | import unittest, os, zipfile, sys, gzip, json, tempfile 2 | from wacz.main import main 3 | from frictionless import validate 4 | from wacz.validate import Validation 5 | from unittest.mock import patch 6 | 7 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 8 | 9 | 10 | class TestWaczFormat(unittest.TestCase): 11 | @classmethod 12 | @patch("wacz.main.now") 13 | def setUpClass(self, mock_now): 14 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 15 | self.tmpdir = tempfile.TemporaryDirectory() 16 | main( 17 | [ 18 | "create", 19 | "-f", 20 | os.path.join(TEST_DIR, "example-collection.warc"), 21 | "-o", 22 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), 23 | ] 24 | ) 25 | with zipfile.ZipFile( 26 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), "r" 27 | ) as zip_ref: 28 | zip_ref.extractall(os.path.join(self.tmpdir.name, "unzipped_wacz_1")) 29 | zip_ref.close() 30 | 31 | self.validation_class_valid_1 = Validation( 32 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 33 | ) 34 | self.validation_class_invalid = Validation( 35 | os.path.join(TEST_DIR, "invalid_example_1.wacz") 36 | ) 37 | 38 | def test_overall_command(self): 39 | self.assertEqual( 40 | main( 41 | [ 42 | "validate", 43 | "-f", 44 | os.path.join(self.tmpdir.name, "valid_example_1.wacz"), 45 | ] 46 | ), 47 | 0, 48 | ) 49 | 50 | def test_check_indexes_valid(self): 51 | self.assertTrue(self.validation_class_valid_1.check_indexes()) 52 | 53 | def test_check_compression_valid(self): 54 | self.assertTrue(self.validation_class_valid_1.check_compression()) 55 | 56 | def test_frictionless_validate_valid_wacz(self): 57 | """Check that the frictionless validation feature identifies a valid wacz data 
package as valid""" 58 | # Use frictionless validation 59 | valid_1 = self.validation_class_valid_1.frictionless_validate() 60 | self.assertTrue(valid_1) 61 | 62 | def test_frictionless_validate_invalid_wacz(self): 63 | """Check that the frictionless validation feature identifies an invalid wacz data package as invalid""" 64 | # Use frictionless validation 65 | valid = self.validation_class_invalid.frictionless_validate() 66 | self.assertFalse(valid) 67 | 68 | def test_filepaths_invalid_wacz(self): 69 | """Correctly fail on a wacz with invalid files""" 70 | valid = self.validation_class_invalid.check_file_paths() 71 | self.assertFalse(valid) 72 | 73 | def test_filepaths_valid_wacz(self): 74 | """Correctly succeed on a wacz with valid files""" 75 | valid_1 = self.validation_class_valid_1.check_file_paths() 76 | self.assertTrue(valid_1) 77 | 78 | def test_hashes_valid_wacz(self): 79 | """Correctly succeed on a wacz with matching hashes""" 80 | valid_1 = self.validation_class_valid_1.check_file_hashes() 81 | self.assertTrue(valid_1) 82 | 83 | def test_hashes_invalid_wacz(self): 84 | """Correctly fail on a wacz with nonmatching hashes""" 85 | valid = self.validation_class_invalid.check_file_hashes() 86 | self.assertFalse(valid) 87 | 88 | def test_ability_to_detect_hash_md5(self): 89 | """Correctly identify the hash type of a file as md5""" 90 | tmpdir = tempfile.TemporaryDirectory() 91 | main( 92 | [ 93 | "create", 94 | "-f", 95 | os.path.join(TEST_DIR, "example-collection.warc"), 96 | "-o", 97 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 98 | "--hash-type", 99 | "md5", 100 | ] 101 | ) 102 | with zipfile.ZipFile( 103 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 104 | ) as zip_ref: 105 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 106 | zip_ref.close() 107 | 108 | validation_class = Validation( 109 | os.path.join(tmpdir.name, "valid_example_1.wacz") 110 | ) 111 | valid = validation_class.detect_hash_type() 112 | self.assertEqual(valid, 0) 113 | valid = validation_class.hash_type 114 | self.assertEqual(valid, "md5") 115 | 116 | def test_ability_to_detect_hash_sha256(self): 117 | """Correctly validate hashes and identify the type when the flag is set to sha256""" 118 | tmpdir = tempfile.TemporaryDirectory() 119 | main( 120 | [ 121 | "create", 122 | "-f", 123 | os.path.join(TEST_DIR, "example-collection.warc"), 124 | "-o", 125 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 126 | "--hash-type", 127 | "sha256", 128 | ] 129 | ) 130 | with zipfile.ZipFile( 131 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 132 | ) as zip_ref: 133 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 134 | zip_ref.close() 135 | 136 | validation_class = Validation( 137 | os.path.join(tmpdir.name, "valid_example_1.wacz") 138 | ) 139 | valid = validation_class.detect_hash_type() 140 | self.assertEqual(valid, 0) 141 | valid = validation_class.hash_type 142 | self.assertEqual(valid, "sha256") 143 | 144 | def test_ability_to_detect_hash_sha256_flag(self): 145 | """Correctly validate hashes and identify the type when the hash-type flag is set to sha256""" 146 | tmpdir = tempfile.TemporaryDirectory() 147 | main( 148 | [ 149 | "create", 150 | "-f", 151 | os.path.join(TEST_DIR, "example-collection.warc"), 152 | "-o", 153 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 154 | "--hash-type", 155 | "sha256", 156 | ] 157 | ) 158 | with zipfile.ZipFile( 159 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 160 | ) as zip_ref: 161 | zip_ref.extractall(os.path.join(tmpdir.name,
"unzipped_wacz_1")) 162 | zip_ref.close() 163 | 164 | validation_class = Validation( 165 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 166 | ) 167 | valid = validation_class.detect_hash_type() 168 | self.assertEqual(valid, 0) 169 | valid = validation_class.hash_type 170 | self.assertEqual(valid, "sha256") 171 | 172 | def test_invalid_wacz_missing_datapackage(self): 173 | """Correctly validate hashes and identify the type when no flag is set""" 174 | tmpdir = tempfile.TemporaryDirectory() 175 | main( 176 | [ 177 | "create", 178 | "-f", 179 | os.path.join(TEST_DIR, "example-collection.warc"), 180 | "-o", 181 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 182 | ] 183 | ) 184 | with zipfile.ZipFile( 185 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 186 | ) as zip_ref: 187 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 188 | zip_ref.close() 189 | 190 | os.remove(os.path.join(tmpdir.name, "unzipped_wacz_1/datapackage.json")) 191 | validation_class = Validation( 192 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 193 | ) 194 | 195 | valid = validation_class.check_required_contents() 196 | self.assertEqual(valid, 0) 197 | 198 | def test_invalid_wacz_missing_index(self): 199 | """Correctly fail on a wacz with no index""" 200 | tmpdir = tempfile.TemporaryDirectory() 201 | main( 202 | [ 203 | "create", 204 | "-f", 205 | os.path.join(TEST_DIR, "example-collection.warc"), 206 | "-o", 207 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 208 | ] 209 | ) 210 | with zipfile.ZipFile( 211 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 212 | ) as zip_ref: 213 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 214 | zip_ref.close() 215 | 216 | os.remove(os.path.join(tmpdir.name, "unzipped_wacz_1/indexes/index.cdx.gz")) 217 | validation_class = Validation( 218 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 219 | ) 220 | valid = validation_class.check_required_contents() 221 | self.assertEqual(valid, 0) 222 | 223 | def test_invalid_wacz_missing_warc(self): 224 | """Correctly fail on a wacz with no warc file""" 225 | tmpdir = tempfile.TemporaryDirectory() 226 | main( 227 | [ 228 | "create", 229 | "-f", 230 | os.path.join(TEST_DIR, "example-collection.warc"), 231 | "-o", 232 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 233 | ] 234 | ) 235 | with zipfile.ZipFile( 236 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 237 | ) as zip_ref: 238 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 239 | zip_ref.close() 240 | 241 | os.remove( 242 | os.path.join(tmpdir.name, "unzipped_wacz_1/archive/example-collection.warc") 243 | ) 244 | validation_class = Validation( 245 | os.path.join(self.tmpdir.name, "valid_example_1.wacz") 246 | ) 247 | valid = validation_class.check_required_contents() 248 | self.assertEqual(valid, 0) 249 | 250 | def test_invalid_wacz_missing_pages(self): 251 | """Correctly fail on a wacz with no pages file""" 252 | tmpdir = tempfile.TemporaryDirectory() 253 | main( 254 | [ 255 | "create", 256 | "-f", 257 | os.path.join(TEST_DIR, "example-collection.warc"), 258 | "-o", 259 | os.path.join(tmpdir.name, "valid_example_1.wacz"), 260 | ] 261 | ) 262 | with zipfile.ZipFile( 263 | os.path.join(tmpdir.name, "valid_example_1.wacz"), "r" 264 | ) as zip_ref: 265 | zip_ref.extractall(os.path.join(tmpdir.name, "unzipped_wacz_1")) 266 | zip_ref.close() 267 | 268 | os.remove(os.path.join(tmpdir.name, "unzipped_wacz_1/pages/pages.jsonl")) 269 | validation_class = Validation( 270 | 
os.path.join(self.tmpdir.name, "valid_example_1.wacz") 271 | ) 272 | valid = validation_class.check_required_contents() 273 | self.assertEqual(valid, 0) 274 | 275 | 276 | if __name__ == "__main__": 277 | unittest.main() 278 | -------------------------------------------------------------------------------- /tests/test_create_wacz_indexing.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import os 4 | from wacz.main import main, now 5 | from wacz.util import check_http_and_https 6 | 7 | import zipfile 8 | 9 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 10 | 11 | 12 | class TestWaczIndexing(unittest.TestCase): 13 | def test_check_http_and_https_changed(self): 14 | pages_dict = {"https://www.example.org/": "1db0ef709a"} 15 | check_url = "http://www.example.org/" 16 | match = check_http_and_https(check_url, "", pages_dict) 17 | self.assertEqual(match, "https://www.example.org/") 18 | 19 | def test_check_http_and_https_not_found(self): 20 | pages_dict = {"https://www.example.org/": "1db0ef709a"} 21 | check_url = "http://fake" 22 | match = check_http_and_https(check_url, "", pages_dict) 23 | self.assertEqual(match, "") 24 | 25 | def test_warc_with_other_metadata(self): 26 | with tempfile.TemporaryDirectory() as tmpdir: 27 | self.assertEqual( 28 | main( 29 | [ 30 | "create", 31 | "-f", 32 | os.path.join(TEST_DIR, "example-warcinfo-metadata.warc"), 33 | "-o", 34 | os.path.join(tmpdir, "example-warcinfo-metadata.wacz"), 35 | ] 36 | ), 37 | 0, 38 | ) 39 | 40 | self.assertEqual( 41 | main( 42 | [ 43 | "validate", 44 | "-f", 45 | os.path.join(tmpdir, "example-warcinfo-metadata.wacz"), 46 | ] 47 | ), 48 | 0, 49 | ) 50 | 51 | def test_warc_with_extra_lists(self): 52 | with tempfile.TemporaryDirectory() as tmpdir: 53 | self.assertEqual( 54 | main( 55 | [ 56 | "create", 57 | "-f", 58 | os.path.join(TEST_DIR, "example-collection-with-lists.warc"), 59 | "-o", 60 | os.path.join(tmpdir, "example-collection-with-lists.wacz"), 61 | ] 62 | ), 63 | 0, 64 | ) 65 | 66 | self.assertEqual( 67 | main( 68 | [ 69 | "validate", 70 | "-f", 71 | os.path.join(tmpdir, "example-collection-with-lists.wacz"), 72 | ] 73 | ), 74 | 0, 75 | ) 76 | 77 | with zipfile.ZipFile( 78 | os.path.join(tmpdir, "example-collection-with-lists.wacz") 79 | ) as zf: 80 | filelist = sorted(zf.namelist()) 81 | 82 | # verify pages file added for each list 83 | self.assertEqual( 84 | filelist, 85 | [ 86 | "archive/example-collection-with-lists.warc", 87 | "datapackage-digest.json", 88 | "datapackage.json", 89 | "indexes/index.cdx.gz", 90 | "indexes/index.idx", 91 | "pages/example.jsonl", 92 | "pages/iana.jsonl", 93 | "pages/pages.jsonl", 94 | ], 95 | ) 96 | 97 | def test_warc_with_extra_pages(self): 98 | with tempfile.TemporaryDirectory() as tmpdir: 99 | with open(os.path.join(tmpdir, "test-extra-pages.jsonl"), "wt") as fh: 100 | fh.write( 101 | """\ 102 | {"url": "https://www.iana.org/about"} 103 | {"url": "https://www.iana.org/protocols"}\ 104 | """ 105 | ) 106 | 107 | self.assertEqual( 108 | main( 109 | [ 110 | "create", 111 | "-f", 112 | os.path.join(TEST_DIR, "example-iana.warc"), 113 | "-o", 114 | os.path.join(tmpdir, "test-extra-pages.wacz"), 115 | "-e", 116 | os.path.join(tmpdir, "test-extra-pages.jsonl"), 117 | "--detect-pages", 118 | ] 119 | ), 120 | 0, 121 | ) 122 | 123 | self.assertEqual( 124 | main( 125 | [ 126 | "validate", 127 | "-f", 128 | os.path.join(tmpdir, "test-extra-pages.wacz"), 129 | ] 130 | ), 131 | 0, 132 | ) 
133 | 134 | with zipfile.ZipFile(os.path.join(tmpdir, "test-extra-pages.wacz")) as zf: 135 | filelist = sorted(zf.namelist()) 136 | 137 | # verify pages file added for each list 138 | self.assertEqual( 139 | filelist, 140 | [ 141 | "archive/example-iana.warc", 142 | "datapackage-digest.json", 143 | "datapackage.json", 144 | "indexes/index.cdx.gz", 145 | "indexes/index.idx", 146 | "pages/extraPages.jsonl", 147 | "pages/pages.jsonl", 148 | ], 149 | ) 150 | 151 | def test_warc_with_detect_pages_split_seeds(self): 152 | with tempfile.TemporaryDirectory() as tmpdir: 153 | self.assertEqual( 154 | main( 155 | [ 156 | "create", 157 | "-f", 158 | os.path.join(TEST_DIR, "example-iana.warc"), 159 | "-o", 160 | os.path.join(tmpdir, "test-detect-extra-pages.wacz"), 161 | "--detect-pages", 162 | "--split-seeds", 163 | "--url", 164 | "https://example.com/", 165 | ] 166 | ), 167 | 0, 168 | ) 169 | 170 | self.assertEqual( 171 | main( 172 | [ 173 | "validate", 174 | "-f", 175 | os.path.join(tmpdir, "test-detect-extra-pages.wacz"), 176 | ] 177 | ), 178 | 0, 179 | ) 180 | 181 | with zipfile.ZipFile( 182 | os.path.join(tmpdir, "test-detect-extra-pages.wacz") 183 | ) as zf: 184 | filelist = sorted(zf.namelist()) 185 | 186 | # verify pages file added for each list 187 | self.assertEqual( 188 | filelist, 189 | [ 190 | "archive/example-iana.warc", 191 | "datapackage-digest.json", 192 | "datapackage.json", 193 | "indexes/index.cdx.gz", 194 | "indexes/index.idx", 195 | "pages/extraPages.jsonl", 196 | "pages/pages.jsonl", 197 | ], 198 | ) 199 | 200 | with zf.open("pages/pages.jsonl", "r") as fh: 201 | data = fh.read() 202 | 203 | self.assertTrue(b"https://example.com/" in data) 204 | 205 | self.assertTrue(len(data.strip().split(b"\n")) == 2) 206 | 207 | with zf.open("pages/extraPages.jsonl", "r") as fh: 208 | data = fh.read() 209 | 210 | self.assertTrue(len(data.strip().split(b"\n")) == 7) 211 | 212 | def test_warc_with_extra_pages_via_seeds(self): 213 | with tempfile.TemporaryDirectory() as tmpdir: 214 | with open(os.path.join(tmpdir, "pages.jsonl"), "wt") as fh: 215 | fh.write( 216 | """\ 217 | {"url": "https://example.com/", "seed": true} 218 | {"url": "https://www.iana.org/about"} 219 | {"url": "https://www.iana.org/protocols"}\ 220 | """ 221 | ) 222 | 223 | self.assertEqual( 224 | main( 225 | [ 226 | "create", 227 | "-f", 228 | os.path.join(TEST_DIR, "example-iana.warc"), 229 | "-o", 230 | os.path.join(tmpdir, "test-extra-pages.wacz"), 231 | "-p", 232 | os.path.join(tmpdir, "pages.jsonl"), 233 | "--split-seeds", 234 | ] 235 | ), 236 | 0, 237 | ) 238 | 239 | self.assertEqual( 240 | main( 241 | [ 242 | "validate", 243 | "-f", 244 | os.path.join(tmpdir, "test-extra-pages.wacz"), 245 | ] 246 | ), 247 | 0, 248 | ) 249 | 250 | with zipfile.ZipFile(os.path.join(tmpdir, "test-extra-pages.wacz")) as zf: 251 | filelist = sorted(zf.namelist()) 252 | 253 | # verify pages file added for each list 254 | self.assertEqual( 255 | filelist, 256 | [ 257 | "archive/example-iana.warc", 258 | "datapackage-digest.json", 259 | "datapackage.json", 260 | "indexes/index.cdx.gz", 261 | "indexes/index.idx", 262 | "pages/extraPages.jsonl", 263 | "pages/pages.jsonl", 264 | ], 265 | ) 266 | 267 | with zf.open("pages/extraPages.jsonl", "r") as fh: 268 | data = fh.read() 269 | self.assertTrue(b"https://www.iana.org/about" in data) 270 | self.assertTrue(b"https://www.iana.org/protocols" in data) 271 | 272 | with zf.open("pages/pages.jsonl", "r") as fh: 273 | data = fh.read() 274 | self.assertTrue(b"https://example.com/" in data) 275 | 276 | def 
test_warc_resource_record(self):
277 | with tempfile.TemporaryDirectory() as tmpdir:
278 | self.assertEqual(
279 | main(
280 | [
281 | "create",
282 | "-f",
283 | os.path.join(TEST_DIR, "example-resource.warc.gz"),
284 | "-o",
285 | os.path.join(tmpdir, "example-resource.wacz"),
286 | "--url",
287 | "https://example.com/",
288 | ]
289 | ),
290 | 0,
291 | )
292 | 
293 | self.assertEqual(
294 | main(
295 | [
296 | "validate",
297 | "-f",
298 | os.path.join(tmpdir, "example-resource.wacz"),
299 | ]
300 | ),
301 | 0,
302 | )
303 | 
304 | with zipfile.ZipFile(os.path.join(tmpdir, "example-resource.wacz")) as zf:
305 | filelist = sorted(zf.namelist())
306 | 
307 | # verify the expected files are present in the WACZ
308 | self.assertEqual(
309 | filelist,
310 | [
311 | "archive/example-resource.warc.gz",
312 | "datapackage-digest.json",
313 | "datapackage.json",
314 | "indexes/index.cdx.gz",
315 | "indexes/index.idx",
316 | "pages/pages.jsonl",
317 | ],
318 | )
319 | 
-------------------------------------------------------------------------------- /wacz/validate.py: --------------------------------------------------------------------------------
1 | import tempfile, os, zipfile, json, pathlib, pkg_resources, gzip
2 | from frictionless import validate
3 | from wacz.util import hash_stream, now
4 | from wacz.waczindexer import WACZIndexer
5 | from io import BytesIO, StringIO, TextIOWrapper
6 | import glob
7 | import datetime
8 | import logging
9 | import requests
10 | 
11 | OUTDATED_WACZ = "0.1.0"
12 | 
13 | 
14 | class Validation(object):
15 | def __init__(self, filename, verify_auth=False, verifier_url=None):
16 | self.dir = tempfile.TemporaryDirectory()
17 | self.wacz = filename
18 | with zipfile.ZipFile(filename, "r") as zip_ref:
19 | zip_ref.extractall(self.dir.name)
20 | zip_ref.close()
21 | self.detect_version()
22 | self.detect_hash_type()
23 | 
24 | self.verify_auth = verify_auth
25 | self.verifier_url = verifier_url
26 | 
27 | def check_required_contents(self):
28 | """Checks the general components of the wacz and notifies users of what's missing"""
29 | if not os.path.exists(os.path.join(self.dir.name, "datapackage.json")):
30 | print("Datapackage is missing from your wacz file")
31 | return 1
32 | if (
33 | not glob.glob(os.path.join(self.dir.name, "archive/*.warc"))
34 | and not glob.glob(os.path.join(self.dir.name, "archive/*.warc.gz"))
35 | ):
36 | print(
37 | "A warc file is missing from your archive folder; you must have a .warc or .warc.gz file in your archive folder"
38 | )
39 | return 1
40 | if (
41 | not glob.glob(os.path.join(self.dir.name, "indexes/index.cdx.gz"))
42 | and not glob.glob(os.path.join(self.dir.name, "indexes/index.cdx"))
43 | and not glob.glob(os.path.join(self.dir.name, "indexes/index.idx"))
44 | ):
45 | print(
46 | "An index file is missing from your indexes folder; you must have an index.cdx.gz, index.cdx or index.idx in your indexes folder"
47 | )
48 | return 1
49 | if not glob.glob(os.path.join(self.dir.name, "pages/pages.jsonl")):
50 | print(
51 | "A pages.jsonl file is missing from your pages folder; you must have a pages.jsonl file in your pages folder"
52 | )
53 | return 1
54 | 
55 | return 0
56 | 
57 | def detect_hash_type(self):
58 | self.hash_type = None
59 | # we know the datapackage exists at this point because we're running it after the version check
60 | self.datapackage_path = os.path.join(self.dir.name, "datapackage.json")
61 | self.datapackage = json.loads(open(self.datapackage_path, "rb").read())
62 | try:
63 | 
self.hash_type = self.datapackage["resources"][0]["hash"].split(":")[0]
64 | return 0
65 | except Exception:
66 | print(
67 | "\nHashing type could not be detected; the wacz file may have no resources"
68 | )
69 | return 1
70 | 
71 | def detect_version(self):
72 | self.version = None
73 | if os.path.exists(os.path.join(self.dir.name, "datapackage.json")):
74 | self.data_folder = os.listdir(self.dir.name)
75 | self.datapackage_path = os.path.join(self.dir.name, "datapackage.json")
76 | self.datapackage = json.loads(open(self.datapackage_path, "rb").read())
77 | 
78 | try:
79 | self.version = self.datapackage["wacz_version"]
80 | except Exception:
81 | print("\nVersion missing from datapackage.json, invalid wacz file")
82 | return
83 | 
84 | print("\nVersion detected as %s" % self.version)
85 | elif os.path.exists(os.path.join(self.dir.name, "webarchive.yaml")):
86 | self.version = OUTDATED_WACZ
87 | self.webarchive_yaml = os.path.join(self.dir.name, "webarchive.yaml")
88 | print(
89 | "\nWACZ version detected as 0.1.0. This is an outdated version of WACZ."
90 | )
91 | else:
92 | print("\nVersion could not be detected, invalid wacz file")
93 | 
94 | def frictionless_validate(self):
95 | """Uses the frictionless data package to validate the datapackage.json file"""
96 | report = validate(self.datapackage_path)
97 | if report.valid:
98 | return True
99 | print(
100 | "\nFrictionless has detected that this is an invalid package with errors %s"
101 | % report.errors
102 | )
103 | return False
104 | 
105 | def check_file_paths(self):
106 | """Uses the datapackage to check that all the files listed exist in the data folder or that the wacz contains a webarchive.yaml file"""
107 | if self.version != OUTDATED_WACZ:
108 | package_files = [item["path"] for item in self.datapackage["resources"]]
109 | for filepath in pathlib.Path(self.dir.name).glob("**/*.*"):
110 | filename = os.path.basename(filepath)
111 | if (
112 | filename != "datapackage.json"
113 | and filename != "datapackage-digest.json"
114 | ):
115 | file = str(filepath).split("/")[-2:]
116 | file = "/".join(file)
117 | if file not in package_files:
118 | print("file %s is not listed in the datapackage" % file)
119 | return False
120 | return True
121 | 
122 | def check_compression(self):
123 | """WARCs and the compressed cdx.gz should be in the ZIP with 'store' compression (not deflate); indexes and page lists can be compressed"""
124 | with zipfile.ZipFile(self.wacz) as zf:
125 | for info in zf.infolist():
126 | name = info.filename
127 | 
128 | # the cdx.gz index, if present, must be stored without ZIP compression
129 | if name == "indexes/index.cdx.gz" and info.compress_type != 0:
130 | return False
131 | 
132 | # WARCs in the archive folder must be stored without ZIP compression
133 | if name.startswith("archive/") and ".warc" in name:
134 | if info.compress_type != 0:
135 | return False
136 | 
137 | return True
138 | 
139 | def check_indexes(self):
140 | """Re-indexes the existing WARC; the result should match the index in the wacz"""
141 | if os.path.exists(os.path.join(self.dir.name, "indexes/index.cdx.gz")):
142 | cdx = None
143 | for resource in self.datapackage["resources"]:
144 | if resource["path"] == "indexes/index.cdx.gz":
145 | cdx = resource["hash"]
146 | if cdx is None:
147 | return False
148 | archive_folder = os.listdir(os.path.join(self.dir.name, "archive"))
149 | for item in archive_folder:
150 | if ".warc" in item:
151 | warc = item
152 | wacz_file = tempfile.NamedTemporaryFile(delete=False)
153 | wacz = 
zipfile.ZipFile(wacz_file.name, "w")
154 | data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())
155 | index_buff = BytesIO()
156 | text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)
157 | wacz_indexer = None
158 | with wacz.open(data_file, "w") as data:
159 | wacz_indexer = WACZIndexer(
160 | text_wrap,
161 | {},
162 | sort=True,
163 | compress=data,
164 | fields="referrer",
165 | data_out_name="index.cdx.gz",
166 | records="all",
167 | main_url="",
168 | detect_pages="",
169 | )
170 | 
171 | wacz_indexer.process_all()
172 | wacz.close()
173 | tmpdir = tempfile.TemporaryDirectory()
174 | with zipfile.ZipFile(self.wacz, "r") as zip_ref:
175 | zip_ref.extractall(tmpdir.name)
176 | zip_ref.close()
177 | 
178 | with open(os.path.join(tmpdir.name, "indexes/index.cdx.gz"), "rb") as fd:
179 | size, hash_ = hash_stream(self.hash_type, fd)
180 | # compare the hash recorded in the datapackage against the freshly computed one
181 | 
182 | return cdx == hash_
183 | 
184 | def check_file_hashes(self):
185 | """Uses the datapackage to check that the hashes of all files in the data folder match those in the datapackage"""
186 | for filepath in pathlib.Path(self.dir.name).glob("**/*.*"):
187 | filename = os.path.basename(filepath)
188 | if filename != "datapackage.json" and filename != "datapackage-digest.json":
189 | with open(filepath, "rb") as fh:
190 | size, hash_ = hash_stream(self.hash_type, fh)
191 | 
192 | path = str(filepath).split("/")[-2:]
193 | path = "/".join(path)
194 | res = None
195 | for item in self.datapackage["resources"]:
196 | if item["path"] == path:
197 | res = item
198 | if res is None or res["hash"] != hash_:
199 | print(
200 | "\nfile %s's hash does not match the hash listed in the datapackage"
201 | % path
202 | )
203 | return False
204 | return True
205 | 
206 | def check_data_package_hash_and_sig(self):
207 | data_digest_filename = os.path.join(self.dir.name, "datapackage-digest.json")
208 | if not os.path.exists(data_digest_filename):
209 | return True
210 | 
211 | with open(data_digest_filename) as fh:
212 | data_digest = json.loads(fh.read())
213 | 
214 | with open(os.path.join(self.dir.name, "datapackage.json"), "rb") as fh:
215 | size, hash_ = hash_stream(self.hash_type, fh)
216 | 
217 | if hash_ != data_digest["hash"]:
218 | print("datapackage.json hash does not match datapackage-digest.json")
219 | return False
220 | 
221 | signed_data = data_digest.get("signedData")
222 | if not signed_data:
223 | return True
224 | 
225 | try:
226 | if self.datapackage.get("created") != signed_data.get("created"):
227 | print("signed timestamp != created timestamp")
228 | return False
229 | 
230 | if not self.verify_auth:
231 | print(
232 | "Note: WACZ has a signature, but auth verification was skipped; run with --verify-auth to also verify it"
233 | )
234 | return True
235 | 
236 | if self.verifier_url:
237 | res = requests.post(self.verifier_url, json=signed_data)
238 | success = res.status_code == 200
239 | msg = self.verifier_url
240 | else:
241 | try:
242 | from authsign.verifier import Verifier
243 | except ImportError:
244 | print(
245 | "authsign package not found, cannot verify signature. 
Try installing with 'pip install wacz[signing]'"
246 | )
247 | return False
248 | 
249 | logging.basicConfig(
250 | format="%(asctime)s: [%(levelname)s]: %(message)s",
251 | level=logging.INFO,
252 | )
253 | 
254 | verifier = Verifier()
255 | success = verifier(signed_data)
256 | msg = "direct check"
257 | 
258 | if success:
259 | print("Successfully verified signature via: " + msg)
260 | return True
261 | else:
262 | print("Signature not verified via: " + msg)
263 | return False
264 | 
265 | except Exception as e:
266 | import traceback
267 | 
268 | traceback.print_exc()
269 | print("Validation failed due to error", e)
270 | return False
271 | 
272 | return True
273 | 
274 | def parse_date(self, string):
275 | if not string:
276 | return None
277 | 
278 | return datetime.datetime.strptime(string, "%Y-%m-%dT%H:%M:%SZ")
279 | 
-------------------------------------------------------------------------------- /wacz/main.py: --------------------------------------------------------------------------------
1 | from argparse import ArgumentParser, RawTextHelpFormatter
2 | from io import BytesIO, StringIO, TextIOWrapper
3 | import os, json, datetime, shutil, zipfile, sys, gzip, pkg_resources, shortuuid
4 | from wacz.waczindexer import WACZIndexer
5 | from wacz.util import now, WACZ_VERSION, construct_passed_pages_dict
6 | from wacz.validate import Validation, OUTDATED_WACZ
7 | from wacz.util import validateJSON, get_py_wacz_version, validate_pages_jsonl_file
8 | from warcio.timeutils import iso_date_to_timestamp
9 | 
10 | """
11 | WACZ Generator
12 | """
13 | 
14 | PAGE_INDEX = "pages/pages.jsonl"
15 | EXTRA_PAGES_INDEX = "pages/extraPages.jsonl"
16 | 
17 | PAGE_INDEX_TEMPLATE = "pages/{0}.jsonl"
18 | 
19 | # setting to size matching archiveweb.page defaults
20 | DEFAULT_NUM_LINES = 1024
21 | 
22 | 
23 | def main(args=None):
24 | parser = ArgumentParser(
25 | description="WACZ creator", formatter_class=RawTextHelpFormatter
26 | )
27 | 
28 | parser.add_argument("-V", "--version", action="version", version=get_version())
29 | 
30 | subparsers = parser.add_subparsers(dest="cmd")
31 | subparsers.required = True
32 | 
33 | create = subparsers.add_parser("create", help="create wacz file")
34 | create.add_argument("inputs", nargs="+")
35 | create.add_argument("-f", "--file", action="store_true")
36 | 
37 | create.add_argument("-o", "--output", default="archive.wacz")
38 | 
39 | create.add_argument("-e", "--extra-pages")
40 | 
41 | create.add_argument(
42 | "-t",
43 | "--text",
44 | help="Generates pages.jsonl with a full-text index. Must be used together with --detect-pages or it will have no effect",
45 | action="store_true",
46 | )
47 | 
48 | create.add_argument(
49 | "-p",
50 | "--pages",
51 | help="Overrides the pages generation with the passed jsonl pages",
52 | action="store",
53 | )
54 | 
55 | create.add_argument(
56 | "-d",
57 | "--detect-pages",
58 | help="Generates pages.jsonl without a text index",
59 | action="store_true",
60 | )
61 | 
62 | create.add_argument(
63 | "-c",
64 | "--copy-pages",
65 | help="Overrides the pages/extra-pages options by copying files to WACZ without parsing",
66 | action="store_true",
67 | )
68 | 
69 | create.add_argument(
70 | "--hash-type",
71 | choices=["sha256", "md5"],
72 | help="Allows the user to specify the hash type used. 
Currently we allow sha256 and md5",
73 | )
74 | 
75 | create.add_argument(
76 | "-l",
77 | "--log-directory",
78 | help="Adds log files in specified directory to WACZ",
79 | action="store",
80 | )
81 | 
82 | create.add_argument("--split-seeds", action="store_true")
83 | 
84 | create.add_argument("--ts")
85 | create.add_argument("--url")
86 | create.add_argument("--date")
87 | create.add_argument("--title")
88 | create.add_argument("--desc")
89 | 
90 | create.add_argument(
91 | "--signing-url",
92 | help="URL of signing server to obtain signature for datapackage-digest.json",
93 | )
94 | create.add_argument("--signing-token", help="Auth token for signing URL")
95 | 
96 | create.set_defaults(func=create_wacz)
97 | 
98 | validate = subparsers.add_parser("validate", help="validate a wacz file")
99 | validate.add_argument("-f", "--file", required=True)
100 | validate.set_defaults(func=validate_wacz)
101 | 
102 | validate.add_argument(
103 | "--verify-auth",
104 | action="store_true",
105 | help="If set, will attempt to validate authenticity of the WACZ, either directly or via remote server if --verifier-url is also set",
106 | )
107 | 
108 | validate.add_argument(
109 | "--verifier-url",
110 | help="URL of verify server to verify the signature, if any, in datapackage-digest.json",
111 | )
112 | 
113 | cmd = parser.parse_args(args=args)
114 | 
115 | if cmd.cmd == "create" and cmd.ts is not None and cmd.url is None:
116 | parser.error("--url must be specified when --ts is passed")
117 | 
118 | if cmd.cmd == "create" and cmd.detect_pages and cmd.pages is not None:
119 | parser.error(
120 | "--pages and --detect-pages can't be set at the same time; they cancel each other out."
121 | )
122 | 
123 | value = cmd.func(cmd)
124 | return value
125 | 
126 | 
127 | def get_version():
128 | return "%(prog)s " + get_py_wacz_version() + " -- WACZ File Format: " + WACZ_VERSION
129 | 
130 | 
131 | def validate_wacz(res):
132 | validate = Validation(
133 | res.file, verify_auth=res.verify_auth, verifier_url=res.verifier_url
134 | )
135 | version = validate.version
136 | validation_tests = []
137 | 
138 | if version == OUTDATED_WACZ:
139 | print("Validation succeeded, the passed WACZ is outdated but valid")
140 | return 0
141 | 
142 | elif version == WACZ_VERSION:
143 | validation_tests += [
144 | validate.check_required_contents,
145 | validate.frictionless_validate,
146 | validate.check_file_paths,
147 | validate.check_file_hashes,
148 | validate.check_data_package_hash_and_sig,
149 | ]
150 | else:
151 | print("Validation failed, the passed WACZ is invalid")
152 | return 1
153 | 
154 | for func in validation_tests:
155 | success = func()
156 | if success is False:
157 | print("Validation failed, the passed WACZ is invalid")
158 | return 1
159 | 
160 | print("Validation succeeded, the passed WACZ is valid")
161 | return 0
162 | 
163 | 
164 | def create_wacz(res):
165 | wacz = zipfile.ZipFile(res.output, "w")
166 | 
167 | # write index
168 | data_file = zipfile.ZipInfo("indexes/index.cdx.gz", now())
169 | 
170 | index_file = zipfile.ZipInfo("indexes/index.idx", now())
171 | index_file.compress_type = zipfile.ZIP_DEFLATED
172 | 
173 | index_buff = BytesIO()
174 | 
175 | text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)
176 | 
177 | wacz_indexer = None
178 | 
179 | passed_pages_dict = {}
180 | 
181 | # Handle pages
182 | if res.pages is not None:
183 | if res.copy_pages:
184 | print("Copying passed pages.jsonl file to WACZ")
185 | 
186 | if not validate_pages_jsonl_file(res.pages):
187 | print("Unable to create WACZ 
without valid pages.jsonl file, quitting") 188 | wacz.close() 189 | return 1 190 | 191 | with open(res.pages, "rb") as fh: 192 | pages_jsonl = zipfile.ZipInfo("pages/pages.jsonl", now()) 193 | with wacz.open(pages_jsonl, "w") as pages_file: 194 | shutil.copyfileobj(fh, pages_file) 195 | 196 | else: 197 | print("Validating passed pages.jsonl file") 198 | passed_content = [] 199 | with open(res.pages, "rb") as fh: 200 | for line in fh: 201 | if not line: 202 | continue 203 | 204 | try: 205 | line = line.decode("utf-8") 206 | passed_content.append(line) 207 | except: 208 | print("Page data not utf-8 encoded, skipping", line) 209 | 210 | # Create a dict of the passed pages that will be used in the construction of the index 211 | passed_pages_dict = construct_passed_pages_dict(passed_content) 212 | 213 | if res.extra_pages: 214 | if res.copy_pages: 215 | print("Copying passed extraPages.jsonl file to WACZ") 216 | if validate_pages_jsonl_file(res.extra_pages): 217 | extra_pages_jsonl = zipfile.ZipInfo("pages/extraPages.jsonl", now()) 218 | with open(res.extra_pages, "rb") as fh: 219 | with wacz.open(extra_pages_jsonl, "w") as extra_pages_file: 220 | shutil.copyfileobj(fh, extra_pages_file) 221 | else: 222 | print("Ignoring invalid extraPages.jsonl file") 223 | else: 224 | print("Validating extra pages file") 225 | extra_page_data = [] 226 | with open(res.extra_pages) as fh: 227 | data = fh.read() 228 | for page_str in data.strip().split("\n"): 229 | page_json = validateJSON(page_str) 230 | 231 | if not page_json: 232 | print("Warning: Ignoring invalid extra page\n %s" % page_str) 233 | continue 234 | 235 | extra_page_data.append(page_str.encode("utf-8")) 236 | 237 | extra_pages_file = zipfile.ZipInfo(EXTRA_PAGES_INDEX, now()) 238 | with wacz.open(extra_pages_file, "w") as efh: 239 | efh.write(b"\n".join(extra_page_data)) 240 | 241 | print("Reading and Indexing All WARCs") 242 | with wacz.open(data_file, "w") as data: 243 | wacz_indexer = WACZIndexer( 244 | text_wrap, 245 | res.inputs, 246 | sort=True, 247 | post_append=True, 248 | compress=data, 249 | lines=DEFAULT_NUM_LINES, 250 | digest_records=True, 251 | fields="referrer,req.http:cookie", 252 | data_out_name="index.cdx.gz", 253 | hash_type=res.hash_type, 254 | main_url=res.url, 255 | main_ts=res.ts, 256 | detect_pages=res.detect_pages, 257 | passed_pages_dict=passed_pages_dict, 258 | extract_text=res.text, 259 | signing_url=res.signing_url, 260 | signing_token=res.signing_token, 261 | split_seeds=res.split_seeds, 262 | ) 263 | 264 | wacz_indexer.process_all() 265 | 266 | index_buff.seek(0) 267 | 268 | with wacz.open(index_file, "w") as index: 269 | shutil.copyfileobj(index_buff, index) 270 | 271 | # write archives 272 | print("Writing archives...") 273 | for _input in res.inputs: 274 | archive_file = zipfile.ZipInfo.from_file( 275 | _input, "archive/" + os.path.basename(_input) 276 | ) 277 | with wacz.open(archive_file, "w") as out_fh: 278 | with open(_input, "rb") as in_fh: 279 | shutil.copyfileobj(in_fh, out_fh) 280 | path = "archive/" + os.path.basename(_input) 281 | 282 | if wacz_indexer.passed_pages_dict != None: 283 | for key in wacz_indexer.passed_pages_dict: 284 | print( 285 | "Invalid passed page. 
We were unable to find a match for %s" % str(key) 286 | ) 287 | 288 | if res.log_directory: 289 | print("Writing logs...") 290 | log_dir = os.path.abspath(res.log_directory) 291 | for log_file in os.listdir(log_dir): 292 | log_path = os.path.join(log_dir, log_file) 293 | log_wacz_file = zipfile.ZipInfo.from_file( 294 | log_path, "logs/{}".format(log_file) 295 | ) 296 | with wacz.open(log_wacz_file, "w") as out_fh: 297 | with open(log_path, "rb") as in_fh: 298 | shutil.copyfileobj(in_fh, out_fh) 299 | path = "logs/{}".format(log_file) 300 | 301 | if len(wacz_indexer.pages) > 0 and res.pages == None and not res.copy_pages: 302 | print("Generating page index...") 303 | # generate pages/text 304 | wacz_indexer.write_page_list( 305 | wacz, 306 | PAGE_INDEX, 307 | wacz_indexer.serialize_json_pages( 308 | wacz_indexer.pages.values(), 309 | id="pages", 310 | title="Pages", 311 | has_text=wacz_indexer.has_text, 312 | ), 313 | ) 314 | 315 | if len(wacz_indexer.pages) > 0 and res.pages != None and not res.copy_pages: 316 | print("Generating page index from passed pages...") 317 | # Initially set the default value of the header id and title 318 | id_value = "pages" 319 | title_value = "Pages" 320 | 321 | # If the user has provided a title or an id in a header of their file we will use those instead of our default. 322 | try: 323 | header = json.loads(passed_content[0]) 324 | except: 325 | print("Warning: Ignoring invalid page header: " + passed_content[0]) 326 | header = {} 327 | 328 | if "format" in header: 329 | print("Header detected in the passed pages.jsonl file") 330 | if "id" in header: 331 | id_value = header["id"] 332 | if "title" in header: 333 | title_value = header["title"] 334 | 335 | wacz_indexer.write_page_list( 336 | wacz, 337 | PAGE_INDEX, 338 | wacz_indexer.serialize_json_pages( 339 | wacz_indexer.pages.values(), 340 | id=id_value, 341 | title=title_value, 342 | has_text=wacz_indexer.has_text, 343 | ), 344 | ) 345 | 346 | if len(wacz_indexer.extra_pages) > 0 and not res.copy_pages: 347 | wacz_indexer.write_page_list( 348 | wacz, 349 | EXTRA_PAGES_INDEX, 350 | wacz_indexer.serialize_json_pages( 351 | wacz_indexer.extra_pages.values(), 352 | id="extra-pages", 353 | title="Extra Pages", 354 | has_text=wacz_indexer.has_text, 355 | ), 356 | ) 357 | 358 | if len(wacz_indexer.extra_page_lists) > 0 and not res.copy_pages: 359 | print("Generating extra page lists...") 360 | 361 | for name, pagelist in wacz_indexer.extra_page_lists.items(): 362 | if name == "pages": 363 | name = shortuuid.uuid() 364 | filename = PAGE_INDEX_TEMPLATE.format(name) 365 | 366 | wacz_indexer.write_page_list(wacz, filename, pagelist) 367 | 368 | # generate datapackage 369 | print("Generating datapackage.json") 370 | 371 | datapackage = wacz_indexer.generate_datapackage(res, wacz) 372 | datapackage_file = zipfile.ZipInfo("datapackage.json", now()) 373 | datapackage_file.compress_type = zipfile.ZIP_DEFLATED 374 | datapackage_bytes = datapackage.encode("utf-8") 375 | wacz.writestr(datapackage_file, datapackage_bytes) 376 | 377 | print("Generating datapackage-digest.json") 378 | datapackage_digest_file = zipfile.ZipInfo("datapackage-digest.json", now()) 379 | datapackage_digest_file.compress_type = zipfile.ZIP_DEFLATED 380 | wacz.writestr( 381 | datapackage_digest_file, 382 | wacz_indexer.generate_datapackage_digest(datapackage_bytes), 383 | ) 384 | 385 | wacz.close() 386 | 387 | return 0 388 | 389 | 390 | if __name__ == "__main__": 391 | main() 392 | 
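Usage note: the main() entry point above can be driven programmatically as well as from the command line, which is exactly how the tests in this repository invoke it. A minimal sketch, assuming py-wacz is installed; the input WARC and output filenames here are hypothetical, not fixtures shipped with the repo:

from wacz.main import main

# create a WACZ from a WARC, detecting pages and extracting full text;
# main() returns 0 on success and 1 on failure
rc = main(
    [
        "create",
        "-f",
        "my-crawl.warc.gz",  # hypothetical input WARC
        "-o",
        "my-crawl.wacz",
        "--detect-pages",
        "--text",
    ]
)
assert rc == 0

# validate the resulting WACZ (add --verify-auth to also check any signature)
assert main(["validate", "-f", "my-crawl.wacz"]) == 0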
-------------------------------------------------------------------------------- /wacz/waczindexer.py: --------------------------------------------------------------------------------
1 | import json, shortuuid
2 | from urllib.parse import quote, urlsplit, urlunsplit
3 | import os, gzip, glob, zipfile, traceback
4 | from cdxj_indexer.main import CDXJIndexer
5 | from warcio.warcwriter import BufferWARCWriter
6 | from warcio.timeutils import iso_date_to_timestamp, timestamp_to_iso_date
7 | from boilerpy3 import extractors
8 | from wacz.util import (
9 | hash_stream,
10 | now,
11 | WACZ_VERSION,
12 | get_py_wacz_version,
13 | check_http_and_https,
14 | )
15 | 
16 | import datetime
17 | import hashlib
18 | import requests
19 | 
20 | HTML_MIME_TYPES = ("text/html", "application/xhtml", "application/xhtml+xml")
21 | 
22 | # Add warcinfo as a default record for indexing to simplify filtering logic
23 | CDXJIndexer.DEFAULT_RECORDS.append("warcinfo")
24 | 
25 | 
26 | # ============================================================================
27 | class WACZIndexer(CDXJIndexer):
28 | def __init__(self, *args, **kwargs):
29 | super().__init__(*args, **kwargs)
30 | self.pages = {}
31 | self.extra_pages = {}
32 | self.extra_page_lists = {}
33 | self.title = ""
34 | self.desc = ""
35 | self.has_text = False
36 | self.main_url = kwargs.pop("main_url", "")
37 | self.main_ts = kwargs.pop("main_ts", "")
38 | self.main_page_entry = None
39 | self.main_page_id = None
40 | self.hash_type = kwargs.pop("hash_type", "")
41 | 
42 | self.signing_url = kwargs.pop("signing_url", "")
43 | self.signing_token = kwargs.pop("signing_token", "")
44 | 
45 | self._created = None
46 | 
47 | # If the user has specified a hash type, use that; otherwise default to sha256
48 | if not self.hash_type:
49 | self.hash_type = "sha256"
50 | 
51 | self.passed_pages_dict = kwargs.pop("passed_pages_dict", {})
52 | self.split_seeds = kwargs.pop("split_seeds", False)
53 | 
54 | if self.main_url:
55 | self.main_url_flag = False
56 | self.main_ts_flag = False
57 | # if url is missing a path segment, ensure it is set to '/'
58 | try:
59 | parts = list(urlsplit(self.main_url))
60 | if not parts[2]:
61 | parts[2] = "/"
62 | self.main_url = urlunsplit(parts)
63 | except ValueError:
64 | pass
65 | 
66 | self.detect_pages = kwargs.get("detect_pages")
67 | self.detect_referrer_check = True
68 | self.extract_text = kwargs.get("extract_text")
69 | if self.extract_text and not self.detect_pages:
70 | print(
71 | "Warning: You've passed the --text flag without the --detect-pages flag. No pages.jsonl file will be generated. You must enable the --detect-pages and --text flags together in order to get a pages.jsonl file with full text."
72 | ) 73 | self.referrers = set() 74 | 75 | def process_index_entry(self, it, record, *args): 76 | type_ = record.rec_type 77 | if type_ == "warcinfo": 78 | self.parse_warcinfo(record) 79 | 80 | elif self.filter_record(record): 81 | if type_ in ("response", "resource", "revisit"): 82 | self.check_pages_and_text(record) 83 | 84 | super().process_index_entry(it, record, *args) 85 | 86 | def process_all(self): 87 | super().process_all() 88 | 89 | if self.detect_pages: 90 | if self.detect_referrer_check: 91 | to_delete = [ 92 | id_ 93 | for id_, value in self.pages.items() 94 | if value["url"] not in self.referrers 95 | ] 96 | for delete in to_delete: 97 | del self.pages[delete] 98 | 99 | if self.passed_pages_dict == {}: 100 | print("Num Pages Detected: {0}".format(len(self.pages))) 101 | 102 | if self.split_seeds and self.main_page_entry: 103 | self.extra_pages = self.pages 104 | self.pages = {self.main_page_id: self.main_page_entry} 105 | 106 | if ( 107 | hasattr(self, "main_url_flag") 108 | and hasattr(self, "main_ts_flag") 109 | and self.main_url_flag == False 110 | and self.main_ts_flag == False 111 | ): 112 | raise ValueError( 113 | "ts %s not found in index with %s" % (self.main_ts, self.main_url) 114 | ) 115 | 116 | if hasattr(self, "main_url_flag") and self.main_url_flag == False: 117 | raise ValueError("Url %s not found in index" % (self.main_url)) 118 | 119 | def _do_write(self, urlkey, ts, index, out): 120 | if self.detect_pages: 121 | self.detect_page(ts, index) 122 | 123 | super()._do_write(urlkey, ts, index, out) 124 | 125 | def detect_page(self, ts, index): 126 | referrer = index.get("referrer") 127 | if referrer: 128 | self.referrers.add(referrer) 129 | 130 | def _read_record(self, record): 131 | if hasattr(record, "buffered_stream"): 132 | content = record.buffered_stream.read() 133 | else: 134 | content = record.content_stream().read() 135 | 136 | return content 137 | 138 | def parse_warcinfo(self, record): 139 | """Parse WARC information. 
140 | :param record: WARC information 141 | :returns: WARC information or None 142 | :rtype: dict or None 143 | """ 144 | warcinfo = {} 145 | warcinfo_buff = self._read_record(record) 146 | warcinfo_buff = warcinfo_buff.decode("utf-8") 147 | metadata = None 148 | for line in warcinfo_buff.rstrip().split("\n"): 149 | parts = line.split(":", 1) 150 | if parts[0] == "json-metadata": 151 | metadata = json.loads(parts[1]) 152 | elif len(parts) == 2: 153 | warcinfo[parts[0]] = parts[1].strip() 154 | 155 | if not metadata or "type" not in metadata: 156 | return 157 | 158 | if metadata["type"] == "collection": 159 | self.title = metadata.get("title", "") 160 | self.desc = metadata.get("desc", "") 161 | lists = metadata.get("lists") 162 | if lists: 163 | self.extract_page_lists(lists) 164 | 165 | # Don't add the record to the self.pages if were evaluating passed in pages 166 | elif metadata["type"] == "recording" and self.passed_pages_dict == {}: 167 | pages = metadata.get("pages", []) 168 | for page in pages: 169 | id_ = page["timestamp"] + "/" + page["url"] 170 | self.pages[id_] = page 171 | 172 | self.detect_referrer_check = False 173 | 174 | def extract_page_lists(self, lists): 175 | for pagelist in lists: 176 | pagelist_header = {} 177 | # unique id for this page list, will also be the filename 178 | if "slug" in pagelist: 179 | uid = pagelist["slug"] 180 | else: 181 | uid = shortuuid.uuid() 182 | 183 | text_list = list( 184 | self.serialize_json_pages( 185 | pages=pagelist["bookmarks"], 186 | id=uid, 187 | title=pagelist.get("title"), 188 | desc=pagelist.get("desc"), 189 | ) 190 | ) 191 | 192 | self.extra_page_lists[uid] = text_list 193 | 194 | def check_pages_and_text(self, record): 195 | url = record.rec_headers.get("WARC-Target-URI") 196 | date = record.rec_headers.get("WARC-Date") 197 | ts = iso_date_to_timestamp(date) 198 | id_ = ts + "/" + url 199 | matched_id = "" 200 | # Check for both a matching url/ts and url entry 201 | 202 | # if id_ in self.passed_pages_dict: 203 | # matched_id = id_ 204 | 205 | matched_id = check_http_and_https(url, ts, self.passed_pages_dict) 206 | # If we find a match build a record 207 | if matched_id: 208 | page_data = self.passed_pages_dict[matched_id] 209 | page_data["timestamp"] = ts 210 | if "url" not in page_data: 211 | page_data["url"] = url 212 | if "title" not in page_data: 213 | page_data["title"] = url 214 | 215 | if self.split_seeds and not page_data.get("seed"): 216 | self.extra_pages[matched_id] = page_data 217 | else: 218 | self.pages[matched_id] = page_data 219 | 220 | # Delete the entry from our pages_dict so we can't match it again 221 | del self.passed_pages_dict[matched_id] 222 | 223 | if ( 224 | self.main_url 225 | and self.main_url == url 226 | and self.main_ts 227 | and self.main_ts == ts 228 | ): 229 | self.main_ts_flag = True 230 | self.main_url_flag = True 231 | print("Found Main Url: {0}".format(url)) 232 | print("Found Main ts: {0}".format(ts)) 233 | # If were not relying on passed in pages we want to add all records to the self.pages object 234 | if self.passed_pages_dict == {}: 235 | self.main_page_entry = { 236 | "timestamp": ts, 237 | "url": url, 238 | "title": url, 239 | "seed": True, 240 | } 241 | self.main_page_id = id_ 242 | self.pages[id_] = self.main_page_entry 243 | if self.main_url and self.main_url == url and self.main_ts == None: 244 | self.main_url_flag = True 245 | print("Found Main Url: {0}".format(url)) 246 | if id_ not in self.pages: 247 | self.main_page_entry = { 248 | "timestamp": ts, 249 | "url": url, 250 | 
"title": url, 251 | "seed": True, 252 | } 253 | self.main_page_id = id_ 254 | self.pages[id_] = self.main_page_entry 255 | 256 | mime = self.get_record_mime_type(record) 257 | 258 | if mime not in HTML_MIME_TYPES: 259 | return 260 | 261 | if record.http_headers and record.http_headers.get_statuscode().startswith("3"): 262 | return 263 | 264 | if id_ not in self.pages: 265 | if self.detect_pages: 266 | self.pages[id_] = {"timestamp": ts, "url": url, "title": url} 267 | else: 268 | return 269 | 270 | # if not extracting text, then finish here 271 | if not self.extract_text: 272 | return 273 | 274 | content = self._read_record(record) 275 | if not content: 276 | return 277 | 278 | try: 279 | extractor = extractors.ArticleExtractor(raise_on_failure=False) 280 | 281 | content = content.decode("utf-8") 282 | 283 | doc = extractor.get_doc(content) 284 | 285 | curr_page = self.pages[id_] 286 | 287 | if doc.content: 288 | self.pages[id_]["text"] = doc.content 289 | self.has_text = True 290 | 291 | # only set title if unset, or set to url (default) 292 | # avoid overriding user-specified title, if any 293 | if doc.title and self.pages[id_].get("title", url) == url: 294 | self.pages[id_]["title"] = doc.title 295 | 296 | except Exception as e: 297 | # skip text extraction in case of errors 298 | print("Skipping, Text Extraction Failed For: " + url) 299 | print(e) 300 | 301 | def get_record_mime_type(self, record): 302 | if record.http_headers: 303 | # if the record has HTTP headers, use the Content-Type from those (eg. 'response' record) 304 | content_type = record.http_headers["Content-Type"] 305 | else: 306 | # otherwise, use the Content-Type from WARC headers 307 | content_type = record.rec_headers["Content-Type"] 308 | 309 | mime = content_type or "" 310 | return mime.split(";")[0] 311 | 312 | def write_page_list(self, wacz, filename, page_iter): 313 | pages_file = zipfile.ZipInfo(filename, now()) 314 | pages_file.compress_type = zipfile.ZIP_DEFLATED 315 | 316 | with wacz.open(pages_file, "w") as pg_fh: 317 | for line in page_iter: 318 | pg_fh.write(line.encode("utf-8")) 319 | 320 | def serialize_json_pages(self, pages, id, title, desc=None, has_text=False): 321 | page_header = {"format": "json-pages-1.0", "id": id} 322 | 323 | if title: 324 | page_header["title"] = title 325 | 326 | if desc: 327 | page_header["description"] = desc 328 | 329 | if has_text: 330 | page_header["hasText"] = True 331 | 332 | yield json.dumps(page_header) + "\n" 333 | 334 | for line in pages: 335 | if "ts" not in line and "timestamp" in line: 336 | ts = timestamp_to_iso_date(line["timestamp"]) 337 | line["ts"] = ts 338 | del line["timestamp"] 339 | 340 | line["id"] = line.get("id") or line.get("page_id") or shortuuid.uuid() 341 | 342 | yield json.dumps(line) + "\n" 343 | 344 | def generate_datapackage(self, res, wacz): 345 | package_dict = {} 346 | 347 | package_dict["profile"] = "data-package" 348 | 349 | resources = [] 350 | 351 | for zip_entry in wacz.infolist(): 352 | res_entry = {} 353 | res_entry["name"] = os.path.basename(zip_entry.filename).lower() 354 | res_entry["path"] = zip_entry.filename 355 | 356 | with wacz.open(zip_entry, "r") as stream: 357 | size, hash_ = hash_stream(self.hash_type, stream) 358 | res_entry["hash"] = hash_ 359 | res_entry["bytes"] = size 360 | 361 | resources.append(res_entry) 362 | 363 | package_dict["resources"] = resources 364 | 365 | # set optional metadata 366 | desc = res.desc or self.desc 367 | title = res.title or self.title 368 | 369 | if title: 370 | package_dict["title"] = 
title
371 | 
372 | if desc:
373 | package_dict["description"] = desc
374 | 
375 | if self.main_url:
376 | package_dict["mainPageURL"] = self.main_url
377 | if self.main_ts:
378 | package_dict["mainPageDate"] = timestamp_to_iso_date(self.main_ts)
379 | 
380 | if res.date:
381 | package_dict["mainPageDate"] = res.date
382 | 
383 | package_dict["created"] = datetime.datetime.utcnow().strftime(
384 | "%Y-%m-%dT%H:%M:%SZ"
385 | )
386 | self._created = package_dict["created"]
387 | 
388 | package_dict["wacz_version"] = WACZ_VERSION
389 | 
390 | package_dict["software"] = "py-wacz " + get_py_wacz_version()
391 | 
392 | return json.dumps(package_dict, indent=2)
393 | 
394 | def generate_datapackage_digest(self, datapackage_bytes):
395 | digest_dict = {
396 | "path": "datapackage.json",
397 | "hash": "sha256:" + hashlib.sha256(datapackage_bytes).hexdigest(),
398 | }
399 | 
400 | if self.signing_url:
401 | self.do_sign(digest_dict)
402 | 
403 | return json.dumps(digest_dict, indent=2)
404 | 
405 | def do_sign(self, digest_dict):
406 | try:
407 | headers = {}
408 | if self.signing_token:
409 | headers["Authorization"] = "bearer " + self.signing_token
410 | 
411 | req = {"hash": digest_dict["hash"], "created": self._created}
412 | 
413 | res = requests.post(self.signing_url, headers=headers, json=req)
414 | 
415 | if res.status_code != 200:
416 | raise ValueError("Signing Failed: " + res.text)
417 | 
418 | 
419 | signed_json = res.json()
420 | if signed_json["hash"] != digest_dict["hash"] or signed_json["created"] != self._created:
421 | print("Not Signed, signing request failed")
422 | return
423 | 
424 | digest_dict["signedData"] = signed_json
425 | 
426 | print("Added Signature")
427 | except Exception:
428 | traceback.print_exc()
429 | 
-------------------------------------------------------------------------------- /tests/fixtures/logs/wr-specs-crawl.log: --------------------------------------------------------------------------------
1 | {"logLevel":"info","timestamp":"2023-02-23T20:29:36.908Z","context":"general","message":"Seeds","details":[{"url":"https://specs.webrecorder.net/","include":[{}],"exclude":[],"scopeType":"prefix","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]}
2 | {"logLevel":"info","timestamp":"2023-02-23T20:29:37.197Z","context":"state","message":"Storing state in memory","details":{}}
3 | {"logLevel":"info","timestamp":"2023-02-23T20:29:37.572Z","context":"general","message":"Text Extraction: Disabled","details":{}}
4 | {"logLevel":"info","timestamp":"2023-02-23T20:29:45.587Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/"}}
5 | {"logLevel":"info","timestamp":"2023-02-23T20:29:45.590Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}}
6 | {"logLevel":"info","timestamp":"2023-02-23T20:29:45.591Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}}
7 | {"logLevel":"info","timestamp":"2023-02-23T20:29:45.591Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/"}}
8 | {"logLevel":"warn","timestamp":"2023-02-23T20:29:46.083Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError 
(file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async ElementHandle.evaluateHandle (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/JSHandle.js:94:16)\n at async internalHandler.queryOne (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/QueryHandler.js:25:30)\n at async ElementHandle.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ElementHandle.js:78:17)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)"}} 9 | {"logLevel":"warn","timestamp":"2023-02-23T20:29:46.301Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async ElementHandle.evaluateHandle (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/JSHandle.js:94:16)\n at async internalHandler.queryOne (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/QueryHandler.js:25:30)\n at async ElementHandle.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ElementHandle.js:78:17)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)"}} 10 | {"logLevel":"warn","timestamp":"2023-02-23T20:29:46.309Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async IsolatedWorld.document (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:186:63)\n at async IsolatedWorld.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:174:26)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)\n at async /app/node_modules/puppeteer-cluster/dist/util.js:63:24"}} 11 | {"logLevel":"info","timestamp":"2023-02-23T20:29:46.594Z","context":"pageGraph","message":"Page 
graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/","seedId":0,"depth":0,"started":"2023-02-23T20:29:37.646Z"}} 12 | {"logLevel":"info","timestamp":"2023-02-23T20:29:46.594Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":0,"total":5,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/\",\"seedId\":0,\"depth\":0,\"started\":\"2023-02-23T20:29:37.646Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.593Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.633Z\"}"]}} 13 | {"logLevel":"warn","timestamp":"2023-02-23T20:29:46.821Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async IsolatedWorld.document (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:186:63)\n at async IsolatedWorld.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:174:26)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)\n at async /app/node_modules/puppeteer-cluster/dist/util.js:63:24"}} 14 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.104Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} 15 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.108Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 16 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.110Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 17 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.111Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} 18 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.600Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} 19 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.606Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 20 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.607Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 21 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.608Z","context":"general","message":"Run behaviors 
finished","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} 22 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.675Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} 23 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.678Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 24 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.678Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 25 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.680Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} 26 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.905Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} 27 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.909Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 28 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.909Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 29 | {"logLevel":"info","timestamp":"2023-02-23T20:29:52.911Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} 30 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.113Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/use-cases/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:45.593Z"}} 31 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.115Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":1,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/use-cases/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.593Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.633Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}"]}} 32 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.611Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz-auth/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:45.633Z"}} 33 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.611Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":2,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.633Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}"]}} 34 | 
{"logLevel":"info","timestamp":"2023-02-23T20:29:53.680Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:45.613Z"}} 35 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.680Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":3,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}"]}} 36 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.914Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/cdxj/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:46.595Z"}} 37 | {"logLevel":"info","timestamp":"2023-02-23T20:29:53.914Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":4,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}"]}} 38 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.380Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} 39 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.382Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 40 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.382Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 41 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.382Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} 42 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.784Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} 43 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.789Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 44 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.789Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 45 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.790Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} 46 | 
{"logLevel":"info","timestamp":"2023-02-23T20:29:59.883Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} 47 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.893Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 48 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.893Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 49 | {"logLevel":"info","timestamp":"2023-02-23T20:29:59.894Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} 50 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.090Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} 51 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.096Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 52 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.097Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} 53 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.098Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} 54 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.383Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.119Z"}} 55 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.383Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":5,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} 56 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.793Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.612Z"}} 57 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.794Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":6,"total":9,"pending":3,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} 58 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.896Z","context":"pageGraph","message":"Page graph data for successfully crawled 
page","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.681Z"}} 59 | {"logLevel":"info","timestamp":"2023-02-23T20:30:00.896Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":7,"total":9,"pending":2,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} 60 | {"logLevel":"info","timestamp":"2023-02-23T20:30:01.103Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.915Z"}} 61 | {"logLevel":"info","timestamp":"2023-02-23T20:30:01.107Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":8,"total":9,"pending":1,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} 62 | {"logLevel":"info","timestamp":"2023-02-23T20:30:01.265Z","context":"general","message":"Waiting to ensure pending data is written to WARCs...","details":{}} 63 | {"logLevel":"info","timestamp":"2023-02-23T20:30:01.277Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":9,"total":9,"pending":0,"limit":{"max":0,"hit":false},"pendingPages":[]}} 64 | -------------------------------------------------------------------------------- /tests/test_optional_flags_wacz.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import os 4 | import zipfile, json, gzip 5 | from wacz.main import main, now 6 | from wacz.util import hash_file 7 | from unittest.mock import patch 8 | import jsonlines 9 | 10 | TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") 11 | PAGES_DIR = os.path.join(TEST_DIR, "pages") 12 | 13 | 14 | class TestWaczFormat(unittest.TestCase): 15 | def test_warc_with_invalid_passed_pages(self): 16 | """If a user passes an invalid file using --page we should return an error""" 17 | with tempfile.TemporaryDirectory() as tmpdir: 18 | fp = tempfile.NamedTemporaryFile() 19 | fp.write( 20 | """{"format": "title": "All Pages"}\n{"http://www.example" "0-10-07T21:22:36Z", "title": "Example Domain"}""".encode( 21 | "utf-8" 22 | ) 23 | ) 24 | fp.seek(0) 25 | self.assertEqual( 26 | main( 27 | [ 28 | "create", 29 | "-f", 30 | os.path.join(TEST_DIR, "example-collection.warc"), 31 | "-o", 32 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), 33 | "-p", 34 | os.path.join(tmpdir, fp.name), 35 | ] 36 | ), 37 | 0, 38 | ) 39 | 40 | def test_invalid_passed_pages_copy_pages(self): 41 | """If a user passes an invalid pages.jsonl file using --page --copy-pages we should return an error""" 42 | with tempfile.TemporaryDirectory() as tmpdir: 43 | self.assertEqual( 44 | main( 45 | [ 46 | "create", 47 | "-f", 48 | os.path.join(TEST_DIR, "example-collection.warc"), 49 | "-o", 50 | os.path.join( 51 | tmpdir, "example-collection-invalid-copy-pages.wacz" 52 | ), 53 | "-p", 54 | os.path.join(PAGES_DIR, "invalid.jsonl"), 55 | "--copy-pages", 56 | ] 57 | ), 58 | 1, 59 | ) 60 | 61 | self.assertEqual( 62 | main( 63 | [ 64 | "create", 65 | "-f", 66 | os.path.join(TEST_DIR, "example-collection.warc"), 67 | "-o", 68 | os.path.join( 69 

    def test_invalid_passed_pages_copy_pages(self):
        """If a user passes an invalid pages.jsonl file using --pages with --copy-pages we should return an error"""
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(
                            tmpdir, "example-collection-invalid-copy-pages.wacz"
                        ),
                        "-p",
                        os.path.join(PAGES_DIR, "invalid.jsonl"),
                        "--copy-pages",
                    ]
                ),
                1,
            )

            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(
                            tmpdir, "example-collection-invalid-copy-pages-txt.wacz"
                        ),
                        "-p",
                        os.path.join(PAGES_DIR, "invalid.txt"),
                        "--copy-pages",
                    ]
                ),
                1,
            )

    def test_invalid_passed_extra_pages_copy_pages(self):
        """If a user passes an invalid extraPages.jsonl file using -e with --copy-pages, the WACZ should still be created, just without extra pages"""
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(
                            tmpdir, "example-collection-invalid-copy-extra-pages.wacz"
                        ),
                        "-p",
                        os.path.join(PAGES_DIR, "pages.jsonl"),
                        "-e",
                        os.path.join(PAGES_DIR, "invalid.txt"),
                        "--copy-pages",
                    ]
                ),
                0,
            )

            with zipfile.ZipFile(
                os.path.join(
                    tmpdir, "example-collection-invalid-copy-extra-pages.wacz"
                ),
                "r",
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "wacz_no_extra_pages"))

            self.assertEqual(
                main(
                    [
                        "validate",
                        "-f",
                        os.path.join(
                            tmpdir, "example-collection-invalid-copy-extra-pages.wacz"
                        ),
                    ]
                ),
                0,
            )

            self.assertNotIn(
                "extraPages.jsonl",
                os.listdir(os.path.join(tmpdir, "wacz_no_extra_pages/pages/")),
            )

    @patch("wacz.main.now")
    def test_warc_with_pages_flag(self, mock_now):
        """When passing the pages flag with a valid pages.jsonl file a pages/pages.jsonl file should be created"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)

        with tempfile.TemporaryDirectory() as tmpdir:
            fp = tempfile.NamedTemporaryFile()
            fp.write(
                """{"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}\n{"id": "1db0ef709a", "url": "http://www.example.com/", "ts": "2020-10-07T21:22:36Z", "title": "Example Domain"}""".encode(
                    "utf-8"
                )
            )
            fp.seek(0)
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-url.wacz"),
                        "-p",
                        fp.name,
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-url.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_pages"))

            self.assertEqual(
                main(
                    [
                        "validate",
                        "-f",
                        os.path.join(tmpdir, "example-collection-valid-url.wacz"),
                    ]
                ),
                0,
            )
            wacz_pages = os.path.join(tmpdir, "unzipped_valid_pages/pages/pages.jsonl")
            wacz_cdx = os.path.join(tmpdir, "unzipped_valid_pages/indexes/index.cdx.gz")
            with gzip.open(wacz_cdx, "rb") as cdx:
                cdx_content = cdx.read()
            self.assertIn(
                "pages.jsonl",
                os.listdir(os.path.join(tmpdir, "unzipped_valid_pages/pages/")),
            )
            with open(wacz_pages) as f:
                next(f)  # skip the json-pages header line
                for line in f:
                    obj = json.loads(line)
                    self.assertIn("id", obj)
                    self.assertIn("ts", obj)
                    self.assertIn("url", obj)
                    # every page URL should also be present in the CDX index
                    self.assertIn(obj["url"].encode(), cdx_content)
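
    # pages.jsonl layout, as exercised above: the first line is a header
    # record, e.g.
    #   {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}
    # and each subsequent line is a single JSON page record with at least
    # "id", "url", and "ts" fields; that is why readers in these tests call
    # next(f) once to skip the header before validating the page entries.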
as-is""" 192 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 193 | 194 | with tempfile.TemporaryDirectory() as tmpdir: 195 | self.assertEqual( 196 | main( 197 | [ 198 | "create", 199 | "-f", 200 | os.path.join(TEST_DIR, "example-collection.warc"), 201 | "-o", 202 | os.path.join(tmpdir, "example-collection-copy-pages.wacz"), 203 | "-p", 204 | os.path.join(PAGES_DIR, "pages.jsonl"), 205 | "-e", 206 | os.path.join(PAGES_DIR, "extraPages.jsonl"), 207 | "--copy-pages", 208 | ] 209 | ), 210 | 0, 211 | ) 212 | 213 | with zipfile.ZipFile( 214 | os.path.join(tmpdir, "example-collection-copy-pages.wacz"), "r" 215 | ) as zip_ref: 216 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_copy_pages")) 217 | zip_ref.close() 218 | 219 | self.assertEqual( 220 | main( 221 | [ 222 | "validate", 223 | "-f", 224 | os.path.join(tmpdir, "example-collection-copy-pages.wacz"), 225 | ] 226 | ), 227 | 0, 228 | ) 229 | 230 | wacz_pages = os.path.join(tmpdir, "unzipped_copy_pages/pages/pages.jsonl") 231 | wacz_extra_pages = os.path.join( 232 | tmpdir, "unzipped_copy_pages/pages/extraPages.jsonl" 233 | ) 234 | 235 | self.assertTrue( 236 | "pages.jsonl" 237 | in os.listdir(os.path.join(tmpdir, "unzipped_copy_pages/pages/")) 238 | ) 239 | self.assertTrue( 240 | "extraPages.jsonl" 241 | in os.listdir(os.path.join(tmpdir, "unzipped_copy_pages/pages/")) 242 | ) 243 | 244 | self.assertEqual( 245 | hash_file("sha256", wacz_pages), 246 | hash_file("sha256", os.path.join(PAGES_DIR, "pages.jsonl")), 247 | ) 248 | self.assertEqual( 249 | hash_file("sha256", wacz_extra_pages), 250 | hash_file("sha256", os.path.join(PAGES_DIR, "extraPages.jsonl")), 251 | ) 252 | 253 | @patch("wacz.main.now") 254 | def test_warc_with_detect_pages_flag(self, mock_now): 255 | """When passing the text index flag pages/pages.jsonl should be generated.""" 256 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 257 | with tempfile.TemporaryDirectory() as tmpdir: 258 | self.assertEqual( 259 | main( 260 | [ 261 | "create", 262 | "-f", 263 | os.path.join(TEST_DIR, "example-collection.warc"), 264 | "-o", 265 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), 266 | "--detect-pages", 267 | ] 268 | ), 269 | 0, 270 | ) 271 | with zipfile.ZipFile( 272 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), "r" 273 | ) as zip_ref: 274 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_pages")) 275 | zip_ref.close() 276 | 277 | wacz_pages = os.path.join(tmpdir, "unzipped_valid_pages/pages/pages.jsonl") 278 | wacz_cdx = os.path.join(tmpdir, "unzipped_valid_pages/indexes/index.cdx.gz") 279 | cdx_content = gzip.open(wacz_cdx, "rb").read() 280 | self.assertTrue( 281 | "pages.jsonl" 282 | in os.listdir(os.path.join(tmpdir, "unzipped_valid_pages/pages/")) 283 | ) 284 | with open(wacz_pages) as f: 285 | for _ in range(1): 286 | next(f) 287 | for line in f: 288 | obj = json.loads(line) 289 | self.assertTrue("id" in obj.keys()) 290 | self.assertTrue("ts" in obj.keys()) 291 | self.assertTrue("title" in obj.keys()) 292 | self.assertTrue("url" in obj.keys()) 293 | self.assertTrue(obj["url"].encode() in cdx_content) 294 | 295 | @patch("wacz.main.now") 296 | def test_warc_with_text_index_flag(self, mock_now): 297 | """When passing the text index flag pages/pages.jsonl should be generated with a full and accurate text index.""" 298 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 299 | with tempfile.TemporaryDirectory() as tmpdir: 300 | self.assertEqual( 301 | main( 302 | [ 303 | "create", 304 | "-f", 305 | os.path.join(TEST_DIR, 
"example-collection.warc"), 306 | "-o", 307 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), 308 | "-t", 309 | ] 310 | ), 311 | 0, 312 | ) 313 | with zipfile.ZipFile( 314 | os.path.join(tmpdir, "example-collection-valid-url.wacz"), "r" 315 | ) as zip_ref: 316 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_text")) 317 | zip_ref.close() 318 | 319 | wacz_pages = os.path.join(tmpdir, "unzipped_valid_text/pages/pages.jsonl") 320 | wacz_cdx = os.path.join(tmpdir, "unzipped_valid_text/indexes/index.cdx.gz") 321 | cdx_content = gzip.open(wacz_cdx, "rb").read() 322 | self.assertTrue( 323 | "pages.jsonl" 324 | in os.listdir(os.path.join(tmpdir, "unzipped_valid_text/pages/")) 325 | ) 326 | with open(wacz_pages) as f: 327 | for _ in range(1): 328 | next(f) 329 | for line in f: 330 | obj = json.loads(line) 331 | self.assertTrue("id" in obj.keys()) 332 | self.assertTrue("ts" in obj.keys()) 333 | self.assertTrue("title" in obj.keys()) 334 | self.assertTrue("url" in obj.keys()) 335 | self.assertTrue(obj["url"].encode() in cdx_content) 336 | self.assertTrue("text" in obj.keys()) 337 | 338 | def test_warc_with_both_p_and_d_flag(self): 339 | """If a user passes both the --pages and --detect-pages flags we should return an error and a message about needing only one""" 340 | with tempfile.TemporaryDirectory() as tmpdir: 341 | with self.assertRaises(SystemExit): 342 | self.assertEqual( 343 | main( 344 | [ 345 | "create", 346 | "-f", 347 | os.path.join(TEST_DIR, "example-collection.warc"), 348 | "-o", 349 | os.path.join(tmpdir, "example-collection.wacz"), 350 | "--detect_pages", 351 | "-p", 352 | "test.jsonl", 353 | ] 354 | ), 355 | 0, 356 | ) 357 | 358 | def test_warc_with_only_ts_flag(self): 359 | """If a user only passes the --ts flag we should return an error and a message about needing to also pass the --url flag""" 360 | with tempfile.TemporaryDirectory() as tmpdir: 361 | with self.assertRaises(SystemExit): 362 | self.assertEqual( 363 | main( 364 | [ 365 | "create", 366 | "-f", 367 | os.path.join(TEST_DIR, "example-collection.warc"), 368 | "-o", 369 | os.path.join(tmpdir, "example-collection.wacz"), 370 | "--ts", 371 | "2020104212236", 372 | ] 373 | ), 374 | 0, 375 | ) 376 | 377 | @patch("wacz.main.now") 378 | def test_warc_with_valid_date_flag(self, mock_now): 379 | """When passing a valid date flag the datapackage should have that as the mainpageTS""" 380 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 381 | with tempfile.TemporaryDirectory() as tmpdir: 382 | self.assertEqual( 383 | main( 384 | [ 385 | "create", 386 | "-f", 387 | os.path.join(TEST_DIR, "example-collection.warc"), 388 | "-o", 389 | os.path.join(tmpdir, "example-collection-valid-desc.wacz"), 390 | "--desc", 391 | "fake desc", 392 | ] 393 | ), 394 | 0, 395 | ) 396 | with zipfile.ZipFile( 397 | os.path.join(tmpdir, "example-collection-valid-desc.wacz"), "r" 398 | ) as zip_ref: 399 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_desc")) 400 | zip_ref.close() 401 | 402 | self.wacz_json = os.path.join( 403 | tmpdir, "unzipped_valid_desc/datapackage.json" 404 | ) 405 | self.wacz_pages = os.path.join( 406 | tmpdir, "unzipped_valid_desc/pages/pages.jsonl" 407 | ) 408 | 409 | f = open(self.wacz_json, "rb") 410 | json_parse = json.loads(f.read()) 411 | 412 | self.assertEqual(json_parse["des"], "fake desc") 413 | 414 | @patch("wacz.main.now") 415 | def test_warc_with_valid_date_flag(self, mock_now): 416 | """When passing a valid title flag the datapackage should have that as the title value""" 417 | 

    @patch("wacz.main.now")
    def test_warc_with_valid_desc_flag(self, mock_now):
        """When passing a valid desc flag the datapackage should have that as its description"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-desc.wacz"),
                        "--desc",
                        "fake desc",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-desc.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_desc"))

            wacz_json = os.path.join(tmpdir, "unzipped_valid_desc/datapackage.json")

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            self.assertEqual(json_parse["description"], "fake desc")

    @patch("wacz.main.now")
    def test_warc_with_valid_title_flag(self, mock_now):
        """When passing a valid title flag the datapackage should have that as the title value"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-title.wacz"),
                        "--title",
                        "Example Title",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-title.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_title"))

            wacz_json = os.path.join(tmpdir, "unzipped_valid_title/datapackage.json")

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            self.assertEqual(json_parse["title"], "Example Title")

    @patch("wacz.main.now")
    def test_warc_with_valid_date_flag(self, mock_now):
        """When passing a valid date flag the datapackage should have that as the mainPageDate"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-date.wacz"),
                        "--date",
                        "2020-11-01",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-date.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_date"))

            wacz_json = os.path.join(tmpdir, "unzipped_valid_date/datapackage.json")

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            self.assertEqual(json_parse["mainPageDate"], "2020-11-01")
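
    # The --desc, --title, and --date flags above are carried into
    # datapackage.json as the "description", "title", and "mainPageDate"
    # keys (key names as asserted by these tests).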

    @patch("wacz.main.now")
    def test_warc_with_valid_url_flag(self, mock_now):
        """When passing a valid url flag the url should be added to the pages.jsonl file and appear in the datapackage"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection-valid-url.wacz"),
                        "--url",
                        "http://www.example.com/",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-url.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_url"))

            wacz_json = os.path.join(tmpdir, "unzipped_valid_url/datapackage.json")
            wacz_pages = os.path.join(tmpdir, "unzipped_valid_url/pages/pages.jsonl")

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            with open(wacz_pages, "rb") as f:
                json_pages = [json.loads(jline) for jline in f.read().splitlines()]
            self.assertEqual(json_pages[1]["url"], "http://www.example.com/")
            self.assertEqual(json_parse["mainPageURL"], "http://www.example.com/")
            # without --ts, no main page date should be recorded
            self.assertNotIn("mainPageDate", json_parse)

    def test_warc_with_invalid_url_flag(self):
        """When passing an invalid url flag we should raise a ValueError"""
        with tempfile.TemporaryDirectory() as tmpdir:
            with self.assertRaises(ValueError):
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection.wacz"),
                        "--url",
                        "http://www.examplefake.com/",
                    ]
                )

    def test_warc_with_valid_url_and_invalid_ts_flag(self):
        """When passing a valid url flag with an invalid ts flag we should raise a ValueError"""
        with tempfile.TemporaryDirectory() as tmpdir:
            with self.assertRaises(ValueError):
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(tmpdir, "example-collection.wacz"),
                        "--url",
                        "http://www.example.com/",
                        "--ts",
                        "2020104212236",
                    ]
                )

    @patch("wacz.main.now")
    def test_warc_with_valid_url_and_ts_flag(self, mock_now):
        """When passing a valid url and ts flag we should see those values represented in the datapackage and pages.jsonl file"""
        mock_now.return_value = (2020, 10, 7, 22, 29, 10)
        with tempfile.TemporaryDirectory() as tmpdir:
            self.assertEqual(
                main(
                    [
                        "create",
                        "-f",
                        os.path.join(TEST_DIR, "example-collection.warc"),
                        "-o",
                        os.path.join(
                            tmpdir, "example-collection-valid-url-valid-ts.wacz"
                        ),
                        "--url",
                        "http://www.example.com/",
                        "--ts",
                        "20201007212236",
                    ]
                ),
                0,
            )
            with zipfile.ZipFile(
                os.path.join(tmpdir, "example-collection-valid-url-valid-ts.wacz"), "r"
            ) as zip_ref:
                zip_ref.extractall(os.path.join(tmpdir, "unzipped_valid_url_valid_ts"))

            wacz_json = os.path.join(
                tmpdir, "unzipped_valid_url_valid_ts/datapackage.json"
            )
            wacz_pages = os.path.join(
                tmpdir, "unzipped_valid_url_valid_ts/pages/pages.jsonl"
            )

            with open(wacz_json, "rb") as f:
                json_parse = json.loads(f.read())

            with open(wacz_pages, "rb") as f:
                json_pages = [json.loads(jline) for jline in f.read().splitlines()]
            self.assertEqual(json_pages[1]["url"], "http://www.example.com/")
            self.assertEqual(json_parse["mainPageURL"], "http://www.example.com/")
            # the 14-digit --ts value is normalized to an ISO 8601 timestamp
            self.assertEqual(json_parse["mainPageDate"], "2020-10-07T21:22:36Z")
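
    # --hash-type selects the digest algorithm used for the resource entries
    # recorded in datapackage.json. The two tests below only assert that the
    # chosen algorithm's name appears somewhere in each resource's "hash"
    # string; the exact "algorithm:hexdigest" layout is not checked here.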
@patch("wacz.main.now") 639 | def test_warc_with_hash_flag_sha256(self, mock_now): 640 | mock_now.return_value = (2020, 10, 7, 22, 29, 10) 641 | """When passing the --hash-type flag with a value of sha256 the datapackage should be hashed using sha256""" 642 | with tempfile.TemporaryDirectory() as tmpdir: 643 | self.assertEqual( 644 | main( 645 | [ 646 | "create", 647 | "-f", 648 | os.path.join(TEST_DIR, "example-collection.warc"), 649 | "-o", 650 | os.path.join(tmpdir, "example-collection-sha256.wacz"), 651 | "--hash-type", 652 | "sha256", 653 | ] 654 | ), 655 | 0, 656 | ) 657 | with zipfile.ZipFile( 658 | os.path.join(tmpdir, "example-collection-sha256.wacz"), "r" 659 | ) as zip_ref: 660 | zip_ref.extractall(os.path.join(tmpdir, "unzipped_sha256")) 661 | zip_ref.close() 662 | 663 | self.wacz_json = os.path.join(tmpdir, "unzipped_sha256/datapackage.json") 664 | 665 | f = open(self.wacz_json, "rb") 666 | json_parse = json.loads(f.read()) 667 | 668 | assert "sha256" in json_parse["resources"][0]["hash"] 669 | 670 | 671 | if __name__ == "__main__": 672 | unittest.main() 673 | --------------------------------------------------------------------------------