├── src └── wayback_machine_archiver │ ├── __init__.py │ ├── clients.py │ ├── sitemaps.py │ ├── cli.py │ ├── archiver.py │ └── workflow.py ├── tests ├── test_sitemap_is_local.py ├── test_get_namespace.py ├── test_download_remote_sitemap.py ├── test_load_local_sitemap.py ├── test_extract_pages_from_sitemap.py ├── test_main_logic.py ├── test_spn2_client.py ├── test_cli.py └── test_spn2_workflow.py ├── .github └── workflows │ ├── tests.yml │ └── release.yml ├── LICENSE ├── pyproject.toml ├── .gitignore └── README.md /src/wayback_machine_archiver/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "3.3.1" 2 | -------------------------------------------------------------------------------- /tests/test_sitemap_is_local.py: -------------------------------------------------------------------------------- 1 | from wayback_machine_archiver.sitemaps import sitemap_is_local, LOCAL_PREFIX 2 | 3 | 4 | def test_local(): 5 | URIS = ( 6 | "/tmp/sitemap.xml", 7 | "{prefix}/tmp/sitemap.xml".format(prefix=LOCAL_PREFIX), 8 | ) 9 | for uri in URIS: 10 | assert sitemap_is_local(uri) 11 | 12 | 13 | def test_remote(): 14 | URIS = ( 15 | "https://alexgude.com/sitemap.xml", 16 | "http://charles.uno/sitemap.xml", 17 | ) 18 | for uri in URIS: 19 | assert not sitemap_is_local(uri) 20 | -------------------------------------------------------------------------------- /tests/test_get_namespace.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from wayback_machine_archiver.sitemaps import get_namespace 3 | 4 | ELEMENT = namedtuple("Element", "tag") 5 | 6 | 7 | def test_good_namespace(): 8 | NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" 9 | test_element = ELEMENT("{namespace}urlset".format(namespace=NAMESPACE)) 10 | 11 | assert get_namespace(test_element) == NAMESPACE 12 | 13 | 14 | def test_no_match_namespace(): 15 | NAMESPACE = "" 16 | test_element = ELEMENT("{namespace}urlset".format(namespace=NAMESPACE)) 17 | 18 | assert get_namespace(test_element) == NAMESPACE 19 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 🧪 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | python-version: 12 | - '3.8' 13 | - '3.9' 14 | - '3.10' 15 | - '3.11' 16 | - '3.12' 17 | - 'pypy-3.9' 18 | - 'pypy-3.10' 19 | name: Python ${{ matrix.python-version }} Test 🧪 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v4 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 🐍 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Set up uv 💨 30 | uses: astral-sh/setup-uv@v6 31 | with: 32 | uv-version: latest 33 | 34 | - name: Install dependencies 🏗 35 | run: uv pip install --system -e ".[dev]" 36 | 37 | - name: Run Tests 🧪 38 | run: pytest -vv 39 | 40 | - name: Run Smoke Test ⚗️ 41 | run: archiver --help 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # MIT License (MIT) 2 | 3 | Copyright © 2018--2025 Alexander Gude 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated 
documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /tests/test_download_remote_sitemap.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from wayback_machine_archiver.sitemaps import download_remote_sitemap 3 | from requests.adapters import HTTPAdapter 4 | import requests 5 | 6 | 7 | SITEMAP = """ 8 | 9 | 10 | https://alexgude.com/blog/double-checking-538/ 11 | 2016-04-28T00:00:00+00:00 12 | 13 | 14 | https://alexgude.com/files/undergrad_thesis.pdf 15 | 2019-05-09T16:19:45+00:00 16 | 17 | 18 | """ 19 | 20 | 21 | @pytest.fixture 22 | def session(): 23 | session = requests.Session() 24 | session.mount("https://", HTTPAdapter()) 25 | session.mount("http://", HTTPAdapter()) 26 | return session 27 | 28 | 29 | def test_download_remote_sitemap(requests_mock, session): 30 | url = "https://www.radiokeysmusic.com/sitemap.xml" 31 | requests_mock.get(url, text=SITEMAP) 32 | returned_contents = download_remote_sitemap(url, session) 33 | assert returned_contents == SITEMAP.encode("UTF-8") 34 | 35 | 36 | def test_download_remote_sitemap_with_status_error(requests_mock, session): 37 | url = "https://www.radiokeysmusic.com/sitemap.xml" 38 | requests_mock.get(url, text=SITEMAP, status_code=404) 39 | with pytest.raises(requests.exceptions.HTTPError): 40 | download_remote_sitemap(url, session) 41 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "wayback-machine-archiver" 7 | version = "3.3.1" 8 | description = "A Python script to submit web pages to the Wayback Machine for archiving." 
9 | readme = "README.md" 10 | authors = [ 11 | { name = "Alexander Gude", email = "alex.public.account@gmail.com" }, 12 | ] 13 | license = { file = "LICENSE" } 14 | requires-python = ">=3.8" 15 | classifiers = [ 16 | "Development Status :: 5 - Production/Stable", 17 | "Environment :: Console", 18 | "Intended Audience :: System Administrators", 19 | "License :: OSI Approved :: MIT License", 20 | "Natural Language :: English", 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python", 23 | "Topic :: Utilities", 24 | ] 25 | keywords = ["Internet Archive", "Wayback Machine"] 26 | dependencies = [ 27 | "python-dotenv", 28 | "requests", 29 | "urllib3", 30 | ] 31 | 32 | [project.urls] 33 | Homepage = "https://github.com/agude/wayback-machine-archiver" 34 | 35 | [project.scripts] 36 | archiver = "wayback_machine_archiver.archiver:main" 37 | 38 | [project.optional-dependencies] 39 | dev = [ 40 | "pytest", 41 | "requests-mock", 42 | "bump-my-version", 43 | ] 44 | 45 | [tool.bumpversion] 46 | current_version = "3.3.1" 47 | commit = true 48 | tag = true 49 | message = "Bump version to {new_version}" 50 | 51 | [[tool.bumpversion.files]] 52 | filename = "pyproject.toml" 53 | search = 'version = "{current_version}"' 54 | replace = 'version = "{new_version}"' 55 | 56 | [[tool.bumpversion.files]] 57 | filename = "src/wayback_machine_archiver/__init__.py" 58 | search = '__version__ = "{current_version}"' 59 | replace = '__version__ = "{new_version}"' 60 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release Package 📦 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | python-version: 14 | - '3.8' 15 | - '3.9' 16 | - '3.10' 17 | - '3.11' 18 | - '3.12' 19 | - 'pypy-3.9' 20 | - 'pypy-3.10' 21 | name: Python ${{ matrix.python-version }} Test 🧪 22 | steps: 23 | - name: Checkout repository 24 | uses: actions/checkout@v4 25 | 26 | - name: Set up Python ${{ matrix.python-version }} 🐍 27 | uses: actions/setup-python@v4 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | 31 | - name: Set up uv 💨 32 | uses: astral-sh/setup-uv@v6 33 | with: 34 | uv-version: latest 35 | 36 | - name: Install dependencies 🏗 37 | run: uv pip install --system -e ".[dev]" 38 | 39 | - name: Run Tests 🧪 40 | run: pytest -vv 41 | 42 | - name: Run Smoke Test ⚗️ 43 | run: archiver --help 44 | 45 | release: 46 | runs-on: ubuntu-latest 47 | needs: test 48 | name: Build and Publish to PyPI 📦 49 | permissions: 50 | id-token: write 51 | steps: 52 | - name: Checkout repository 53 | uses: actions/checkout@v4 54 | 55 | - name: Set up Python for build 🐍 56 | uses: actions/setup-python@v4 57 | with: 58 | python-version: '3.12' 59 | 60 | - name: Install the modern build tool 🏗 61 | run: python -m pip install build 62 | 63 | - name: Build package 👷 64 | run: python -m build 65 | 66 | - name: Publish distribution 📦 to PyPI 67 | if: startsWith(github.ref, 'refs/tags') 68 | uses: pypa/gh-action-pypi-publish@release/v1 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # env files 2 | .env 3 | env 4 | 5 | # Sometimes I use a test sitemap.xml 6 | sitemap.xml 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | 
*.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | .mypy* 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | #Ipython Notebook 70 | .ipynb_checkpoints 71 | 72 | # swap files 73 | *.swp 74 | 75 | # OSX crap 76 | .DS_Store 77 | 78 | # pickled models 79 | **/*.pickle 80 | 81 | #other crap 82 | **/.ropeproject 83 | checkscript.sh 84 | 85 | # swap 86 | [._]*.s[a-v][a-z] 87 | [._]*.sw[a-p] 88 | [._]s[a-v][a-z] 89 | [._]sw[a-p] 90 | # session 91 | Session.vim 92 | # temporary 93 | .netrwhist 94 | *~ 95 | # auto-generated tag files 96 | tags 97 | 98 | *~ 99 | 100 | # temporary files which can be created if a process still has a handle open of a deleted file 101 | .fuse_hidden* 102 | 103 | # KDE directory preferences 104 | .directory 105 | 106 | # Linux trash folder which might appear on any partition or disk 107 | .Trash-* 108 | 109 | # .nfs files are created when an open file is removed but is still being accessed 110 | .nfs* 111 | 112 | .vscode/ 113 | -------------------------------------------------------------------------------- /tests/test_load_local_sitemap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from __future__ import unicode_literals 3 | from wayback_machine_archiver.sitemaps import load_local_sitemap, LOCAL_PREFIX 4 | import os.path 5 | import pytest 6 | 7 | 8 | SITEMAP = """ 9 | 10 | 11 | https://alexgude.com/blog/double-checking-538/ 12 | 2016-04-28T00:00:00+00:00 13 | 14 | 15 | https://alexgude.com/files/undergrad_thesis.pdf 16 | 2019-05-09T16:19:45+00:00 17 | 18 | 19 | """ 20 | 21 | 22 | def test_load_local_file_without_prefix(tmpdir): 23 | # Write a file using pytest's tmpdir so we can read it back 24 | file = tmpdir.join("sitemap.xml") 25 | file.write(SITEMAP) 26 | file_path = os.path.join(file.dirname, file.basename) 27 | 28 | # Read the file 29 | read_contents = load_local_sitemap(file_path) 30 | assert read_contents == SITEMAP 31 | 32 | 33 | def test_load_local_file_with_prefix(tmpdir): 34 | # Write a file using pytest's tmpdir so we can read it back 35 | file = tmpdir.join("sitemap.xml") 36 | file.write(SITEMAP) 37 | file_path = os.path.join(LOCAL_PREFIX, file.dirname, file.basename) 38 | 39 | # Read the file 40 | read_contents = load_local_sitemap(file_path) 41 | assert read_contents == SITEMAP 42 | 43 | 44 | def test_file_does_not_exist(tmpdir): 45 | file_path = "{}/tmp/not_a_real_file".format(LOCAL_PREFIX) 46 | 47 | with pytest.raises(IOError): 48 | load_local_sitemap(file_path) 49 | 50 | 51 | def test_file_is_remote(tmpdir): 52 | file_path = "https://alexgude.com/sitemap.xml" 53 | 54 | with pytest.raises(IOError): 55 | load_local_sitemap(file_path) 56 | 57 | 58 
| def test_file_path_is_invalid(tmpdir): 59 | file_path = "tmp/file_path" 60 | 61 | with pytest.raises(IOError): 62 | load_local_sitemap(file_path) 63 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/clients.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import requests 4 | 5 | 6 | class SPN2Client: 7 | """ 8 | Handles archiving using the authenticated SPN2 API. 9 | """ 10 | 11 | SAVE_URL = "https://web.archive.org/save" 12 | STATUS_URL = "https://web.archive.org/save/status" 13 | STATUS_URL_TEMPLATE = "https://web.archive.org/save/status/{job_id}" 14 | 15 | def __init__(self, session, access_key, secret_key): 16 | self.session = session 17 | self.is_authenticated = True # Always true now 18 | 19 | self.session.headers.update({"Accept": "application/json"}) 20 | auth_header = f"LOW {access_key}:{secret_key}" 21 | self.session.headers.update({"Authorization": auth_header}) 22 | 23 | def submit_capture(self, url_to_archive, rate_limit_wait, api_params=None): 24 | """Submits a capture request to the SPN2 API.""" 25 | if rate_limit_wait > 0: 26 | logging.debug("Sleeping for %s seconds", rate_limit_wait) 27 | time.sleep(rate_limit_wait) 28 | logging.info("Submitting %s to SPN2", url_to_archive) 29 | data = {"url": url_to_archive} 30 | if api_params: 31 | data.update(api_params) 32 | 33 | r = self.session.post(self.SAVE_URL, data=data) 34 | r.raise_for_status() 35 | response_json = r.json() 36 | job_id = response_json.get("job_id") 37 | logging.info("Successfully submitted %s, job_id: %s", url_to_archive, job_id) 38 | 39 | if job_id: 40 | status_check_url = self.STATUS_URL_TEMPLATE.format(job_id=job_id) 41 | logging.debug( 42 | "Manual status check URL for %s: %s", url_to_archive, status_check_url 43 | ) 44 | 45 | return job_id 46 | 47 | def check_status(self, job_id): 48 | """Checks the status of a single capture job.""" 49 | status_url = self.STATUS_URL_TEMPLATE.format(job_id=job_id) 50 | logging.debug("Checking status for single job_id: %s", job_id) 51 | r = self.session.get(status_url) 52 | r.raise_for_status() 53 | return r.json() 54 | 55 | def check_status_batch(self, job_ids): 56 | """Checks the status of multiple capture jobs in a single request.""" 57 | logging.debug("Checking status for %d jobs in a batch.", len(job_ids)) 58 | data = {"job_ids": ",".join(job_ids)} 59 | r = self.session.post(self.STATUS_URL, data=data) 60 | r.raise_for_status() 61 | return r.json() 62 | -------------------------------------------------------------------------------- /tests/test_extract_pages_from_sitemap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from __future__ import unicode_literals 3 | from wayback_machine_archiver.sitemaps import extract_urls_from_sitemap 4 | 5 | 6 | def test_ascii_sitemap(): 7 | SITEMAP = """ 8 | 9 | 10 | https://alexgude.com/blog/double-checking-538/ 11 | 2016-04-28T00:00:00+00:00 12 | 13 | 14 | https://alexgude.com/files/undergrad_thesis.pdf 15 | 2019-05-09T16:19:45+00:00 16 | 17 | 18 | """.encode("UTF-8") 19 | 20 | URLS = set( 21 | ( 22 | "https://alexgude.com/blog/double-checking-538/", 23 | "https://alexgude.com/files/undergrad_thesis.pdf", 24 | ) 25 | ) 26 | 27 | assert extract_urls_from_sitemap(SITEMAP) == URLS 28 | 29 | 30 | def test_unicode_sitemap(): 31 | SITEMAP = """ 32 | 33 | 34 | https://www.radiokeysmusic.com/home 35 | daily 36 | 1.0 37 | 2018-12-17 38 | 
39 | https://static1.squarespace.com/static/5c06e0ab1137a66237a2399c/t/5c0d6a4d562fa7678539d405/1544383062969/ 40 | Home 41 | Tom, Stewart, Allante, & Emily. Photo by Cory Cullington, 2018. 42 | 43 | 44 | 45 | https://www.radiokeysmusic.com/about 46 | daily 47 | 0.75 48 | 2019-01-05 49 | 50 | https://static1.squarespace.com/static/5c06e0ab1137a66237a2399c/t/5c0d6b5b6d2a7379672b9b34/1544896195646/IMG_9107.jpg 51 | About - Story 52 | instrumentation complimented by Emily’s velvety voice and Stewart’s 53 | 54 | 55 | 56 | """.encode("UTF-8") 57 | 58 | URLS = set( 59 | ( 60 | "https://www.radiokeysmusic.com/home", 61 | "https://www.radiokeysmusic.com/about", 62 | ) 63 | ) 64 | 65 | assert extract_urls_from_sitemap(SITEMAP) == URLS 66 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/sitemaps.py: -------------------------------------------------------------------------------- 1 | # src/wayback_machine_archiver/sitemaps.py 2 | import logging 3 | import re 4 | import xml.etree.ElementTree as ET 5 | from xml.etree.ElementTree import ParseError 6 | 7 | LOCAL_PREFIX = "file://" 8 | 9 | 10 | def get_namespace(element): 11 | """Extract the namespace from an XML element.""" 12 | match = re.match(r"\{.*\}", element.tag) 13 | return match.group(0) if match else "" 14 | 15 | 16 | def download_remote_sitemap(sitemap_url, session): 17 | """Download a remote sitemap file.""" 18 | logging.debug("Downloading: %s", sitemap_url) 19 | r = session.get(sitemap_url) 20 | r.raise_for_status() 21 | return r.text.encode("utf-8") 22 | 23 | 24 | def load_local_sitemap(sitemap_filepath): 25 | """Load a local sitemap file.""" 26 | logging.debug("Loading local sitemap: %s", sitemap_filepath) 27 | if sitemap_filepath.startswith(LOCAL_PREFIX): 28 | sitemap_filepath = sitemap_filepath[len(LOCAL_PREFIX) :] 29 | with open(sitemap_filepath, "r") as fp: 30 | return fp.read() 31 | 32 | 33 | def sitemap_is_local(sitemap_url): 34 | """Check if a sitemap URI is local.""" 35 | return sitemap_url.startswith(LOCAL_PREFIX) or sitemap_url.startswith("/") 36 | 37 | 38 | def extract_urls_from_sitemap(site_map_text): 39 | """Parse XML sitemap text and extract URLs.""" 40 | root = ET.fromstring(site_map_text) 41 | namespace = get_namespace(root) 42 | loc_nodes = root.findall(".//{}loc".format(namespace)) 43 | return {node.text for node in loc_nodes} 44 | 45 | 46 | def process_sitemaps(sitemap_urls, session): 47 | """ 48 | Given a list of sitemap URLs, downloads/loads them and returns a set of all unique URLs found. 49 | """ 50 | all_urls = set() 51 | for sitemap_url in sitemap_urls: 52 | try: 53 | if sitemap_is_local(sitemap_url): 54 | logging.debug("The sitemap '%s' is local.", sitemap_url) 55 | sitemap_xml = load_local_sitemap(sitemap_url) 56 | else: 57 | logging.debug("The sitemap '%s' is remote.", sitemap_url) 58 | sitemap_xml = download_remote_sitemap(sitemap_url, session) 59 | 60 | extracted_urls = extract_urls_from_sitemap(sitemap_xml) 61 | all_urls.update(extracted_urls) 62 | except ParseError: 63 | logging.error( 64 | "Failed to parse sitemap from '%s'. The content is not valid XML. Please ensure the URL points directly to a sitemap.xml file. Skipping this sitemap.", 65 | sitemap_url, 66 | ) 67 | except Exception as e: 68 | logging.error( 69 | "An error occurred while processing sitemap '%s': %s. 
Skipping.", 70 | sitemap_url, 71 | e, 72 | ) 73 | return all_urls 74 | -------------------------------------------------------------------------------- /tests/test_main_logic.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from unittest import mock 3 | from wayback_machine_archiver.archiver import main 4 | 5 | # This test file now mocks the main workflow and assumes credentials are present 6 | # to test the URL gathering and shuffling logic. 7 | 8 | 9 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 10 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 11 | @mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key") 12 | @mock.patch("wayback_machine_archiver.archiver.random.shuffle") 13 | def test_random_order_flag_shuffles_urls( 14 | mock_shuffle, mock_getenv, mock_workflow, mock_sitemaps 15 | ): 16 | """Verify that when --random-order is passed, random.shuffle is called.""" 17 | urls_to_archive = ["http://test.com/a", "http://test.com/b"] 18 | sys.argv = ["archiver", "--random-order"] + urls_to_archive 19 | main() 20 | mock_shuffle.assert_called_once() 21 | 22 | # Check for membership, not order, by comparing sets. 23 | # The second argument to the mock_workflow call is the list of URLs. 24 | passed_urls = mock_workflow.call_args[0][1] 25 | assert set(passed_urls) == set(urls_to_archive) 26 | 27 | 28 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 29 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 30 | @mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key") 31 | @mock.patch("wayback_machine_archiver.archiver.random.shuffle") 32 | def test_default_order_does_not_shuffle( 33 | mock_shuffle, mock_getenv, mock_workflow, mock_sitemaps 34 | ): 35 | """Verify that without --random-order, shuffle is not called.""" 36 | urls_to_archive = ["http://test.com/a", "http://test.com/b"] 37 | sys.argv = ["archiver"] + urls_to_archive 38 | main() 39 | mock_shuffle.assert_not_called() 40 | 41 | # Check for membership, not order, by comparing sets. 42 | passed_urls = mock_workflow.call_args[0][1] 43 | assert set(passed_urls) == set(urls_to_archive) 44 | 45 | 46 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 47 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 48 | @mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key") 49 | def test_main_builds_and_passes_api_params(mock_getenv, mock_workflow, mock_sitemaps): 50 | """ 51 | Verify that main() correctly constructs the api_params dictionary from CLI 52 | flags and passes it to the workflow. 53 | """ 54 | sys.argv = [ 55 | "archiver", 56 | "http://test.com", 57 | "--capture-screenshot", 58 | "--js-behavior-timeout", 59 | "10", 60 | "--if-not-archived-within", 61 | "5d", 62 | "--user-agent", 63 | "TestBot/1.0", 64 | ] 65 | main() 66 | 67 | # The fourth argument to the mock_workflow call is the api_params dict. 
68 | passed_params = mock_workflow.call_args[0][3] 69 | expected_params = { 70 | "capture_screenshot": "1", 71 | "js_behavior_timeout": 10, 72 | "if_not_archived_within": "5d", 73 | "use_user_agent": "TestBot/1.0", 74 | } 75 | assert passed_params == expected_params 76 | -------------------------------------------------------------------------------- /tests/test_spn2_client.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from wayback_machine_archiver.clients import SPN2Client 3 | from requests.adapters import HTTPAdapter 4 | import requests 5 | import urllib.parse 6 | 7 | 8 | @pytest.fixture 9 | def session(): 10 | session = requests.Session() 11 | session.mount("https://", HTTPAdapter()) 12 | session.mount("http://", HTTPAdapter()) 13 | return session 14 | 15 | 16 | @pytest.mark.parametrize( 17 | "api_params", 18 | [ 19 | (None), 20 | ({"capture_outlinks": "1", "js_behavior_timeout": 0}), 21 | ({"capture_screenshot": "1", "force_get": "1"}), 22 | ], 23 | ) 24 | def test_spn2_client_submit_capture(requests_mock, session, api_params): 25 | """ 26 | Verify that submit_capture sends a correct POST request, including optional 27 | API parameters, and returns the job_id. 28 | """ 29 | access_key = "test-access" 30 | secret_key = "test-secret" 31 | url_to_archive = "https://example.com" 32 | expected_job_id = "c4b1-4f2a-ac04-1d1225e98695" 33 | 34 | requests_mock.post( 35 | SPN2Client.SAVE_URL, json={"job_id": expected_job_id}, status_code=200 36 | ) 37 | 38 | client = SPN2Client(session=session, access_key=access_key, secret_key=secret_key) 39 | job_id = client.submit_capture( 40 | url_to_archive, rate_limit_wait=0, api_params=api_params 41 | ) 42 | 43 | # Assertions 44 | assert job_id == expected_job_id 45 | history = requests_mock.request_history 46 | assert len(history) == 1 47 | request = history[0] 48 | assert request.method == "POST" 49 | assert request.url == SPN2Client.SAVE_URL 50 | assert f"LOW {access_key}:{secret_key}" == request.headers["Authorization"] 51 | 52 | expected_payload = {"url": url_to_archive} 53 | if api_params: 54 | expected_payload.update(api_params) 55 | expected_body = urllib.parse.urlencode(expected_payload) 56 | assert request.text == expected_body 57 | 58 | 59 | def test_spn2_client_check_status_success(requests_mock, session): 60 | """ 61 | Verify check_status correctly parses a 'success' response. 62 | """ 63 | job_id = "test-job-123" 64 | status_url = SPN2Client.STATUS_URL_TEMPLATE.format(job_id=job_id) 65 | success_payload = { 66 | "status": "success", 67 | "original_url": "https://example.com", 68 | "timestamp": "20250101000000", 69 | } 70 | requests_mock.get(status_url, json=success_payload) 71 | 72 | client = SPN2Client(session=session, access_key="key", secret_key="secret") 73 | status_data = client.check_status(job_id) 74 | 75 | assert status_data == success_payload 76 | 77 | 78 | def test_spn2_client_check_status_pending(requests_mock, session): 79 | """ 80 | Verify check_status correctly parses a 'pending' response. 
81 | """ 82 | job_id = "test-job-456" 83 | status_url = SPN2Client.STATUS_URL_TEMPLATE.format(job_id=job_id) 84 | pending_payload = {"status": "pending"} 85 | requests_mock.get(status_url, json=pending_payload) 86 | 87 | client = SPN2Client(session=session, access_key="key", secret_key="secret") 88 | status_data = client.check_status(job_id) 89 | 90 | assert status_data == pending_payload 91 | 92 | 93 | def test_spn2_client_check_status_error(requests_mock, session): 94 | """ 95 | Verify check_status correctly parses an 'error' response. 96 | """ 97 | job_id = "test-job-789" 98 | status_url = SPN2Client.STATUS_URL_TEMPLATE.format(job_id=job_id) 99 | error_payload = {"status": "error", "message": "Too many redirects."} 100 | requests_mock.get(status_url, json=error_payload) 101 | 102 | client = SPN2Client(session=session, access_key="key", secret_key="secret") 103 | status_data = client.check_status(job_id) 104 | 105 | assert status_data == error_payload 106 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from . import __version__ 4 | 5 | LOCAL_PREFIX = "file://" 6 | 7 | 8 | def create_parser(): 9 | """Creates and returns the argparse parser.""" 10 | parser = argparse.ArgumentParser( 11 | prog="archiver", 12 | description="A script to backup a web pages with Internet Archive", 13 | ) 14 | parser.add_argument( 15 | "--version", 16 | action="version", 17 | version="%(prog)s {version}".format(version=__version__), 18 | ) 19 | parser.add_argument( 20 | "urls", 21 | nargs="*", 22 | default=[], 23 | help="Specifies the URLs of the pages to archive.", 24 | ) 25 | parser.add_argument( 26 | "--file", 27 | help="Specifies the path to a file containing URLs to save, one per line.", 28 | required=False, 29 | ) 30 | parser.add_argument( 31 | "--sitemaps", 32 | nargs="+", 33 | default=[], 34 | help="Specifies one or more URIs to sitemaps listing pages to archive. Local paths must be prefixed with '{f}'.".format( 35 | f=LOCAL_PREFIX 36 | ), 37 | required=False, 38 | ) 39 | parser.add_argument( 40 | "--log", 41 | help="Sets the logging level. Defaults to WARNING (case-insensitive).", 42 | dest="log_level", 43 | default=logging.WARNING, 44 | type=str.upper, 45 | choices=[ 46 | "DEBUG", 47 | "INFO", 48 | "WARNING", 49 | "ERROR", 50 | "CRITICAL", 51 | ], 52 | ) 53 | parser.add_argument( 54 | "--log-to-file", 55 | help="Redirects logs to a specified file instead of the console.", 56 | dest="log_file", 57 | default=None, 58 | ) 59 | parser.add_argument( 60 | "--archive-sitemap-also", 61 | help="Submits the URL of the sitemap itself to be archived.", 62 | dest="archive_sitemap", 63 | default=False, 64 | action="store_true", 65 | ) 66 | parser.add_argument( 67 | "--rate-limit-wait", 68 | help="Specifies the number of seconds to wait between submissions. A minimum of 5 seconds is enforced for authenticated users. Defaults to 15.", 69 | dest="rate_limit_in_sec", 70 | default=15, 71 | type=int, 72 | ) 73 | parser.add_argument( 74 | "--random-order", 75 | help="Randomizes the order of pages before archiving.", 76 | dest="random_order", 77 | default=False, 78 | action="store_true", 79 | ) 80 | 81 | # --- SPN2 API Options --- 82 | api_group = parser.add_argument_group( 83 | "SPN2 API Options", "Control the behavior of the Internet Archive capture API." 
84 | ) 85 | api_group.add_argument( 86 | "--capture-all", 87 | action="store_true", 88 | help="Captures a web page even if it returns an error (e.g., 404, 500).", 89 | ) 90 | api_group.add_argument( 91 | "--capture-outlinks", 92 | action="store_true", 93 | help="Captures web page outlinks automatically. Note: this can significantly increase the total number of captures and runtime.", 94 | ) 95 | api_group.add_argument( 96 | "--capture-screenshot", 97 | action="store_true", 98 | help="Captures a full page screenshot.", 99 | ) 100 | api_group.add_argument( 101 | "--delay-wb-availability", 102 | action="store_true", 103 | help="Reduces load on Internet Archive systems by making the capture publicly available after ~12 hours instead of immediately.", 104 | ) 105 | api_group.add_argument( 106 | "--force-get", 107 | action="store_true", 108 | help="Bypasses the headless browser check, which can speed up captures for non-HTML content (e.g., PDFs, images).", 109 | ) 110 | api_group.add_argument( 111 | "--skip-first-archive", 112 | action="store_true", 113 | help="Speeds up captures by skipping the check for whether this is the first time a URL has been archived.", 114 | ) 115 | api_group.add_argument( 116 | "--email-result", 117 | action="store_true", 118 | help="Sends an email report of the captured URLs to the user's registered email.", 119 | ) 120 | api_group.add_argument( 121 | "--if-not-archived-within", 122 | type=str, 123 | metavar="", 124 | help="Captures only if the latest capture is older than (e.g., '3d 5h').", 125 | ) 126 | api_group.add_argument( 127 | "--js-behavior-timeout", 128 | type=int, 129 | metavar="", 130 | help="Runs JS code for seconds after page load to trigger dynamic content. Defaults to 5, max is 30. Use 0 to disable for static pages.", 131 | ) 132 | api_group.add_argument( 133 | "--capture-cookie", 134 | type=str, 135 | metavar="", 136 | help="Uses an extra HTTP Cookie value when capturing the target page.", 137 | ) 138 | api_group.add_argument( 139 | "--user-agent", 140 | type=str, 141 | metavar="", 142 | dest="use_user_agent", 143 | help="Uses a custom HTTP User-Agent value when capturing the target page.", 144 | ) 145 | 146 | return parser 147 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/archiver.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import sys 5 | import requests 6 | from requests.adapters import HTTPAdapter 7 | from urllib3.util.retry import Retry 8 | from dotenv import load_dotenv 9 | 10 | from .clients import SPN2Client 11 | from .cli import create_parser 12 | from .sitemaps import process_sitemaps 13 | from .workflow import run_archive_workflow 14 | 15 | 16 | def main(): 17 | """Main entry point for the archiver script.""" 18 | parser = create_parser() 19 | args = parser.parse_args() 20 | 21 | logging.basicConfig(level=args.log_level, filename=args.log_file) 22 | load_dotenv() 23 | 24 | # --- Load and REQUIRE credentials --- 25 | access_key = os.getenv("INTERNET_ARCHIVE_ACCESS_KEY") 26 | secret_key = os.getenv("INTERNET_ARCHIVE_SECRET_KEY") 27 | 28 | if not (access_key and secret_key): 29 | logging.error( 30 | "Authentication required. Please provide your Internet Archive S3-style keys." 
31 | ) 32 | logging.error("You can get your keys from: https://archive.org/account/s3.php") 33 | logging.error("Then, create a .env file or set the environment variables:") 34 | logging.error("INTERNET_ARCHIVE_ACCESS_KEY and INTERNET_ARCHIVE_SECRET_KEY") 35 | sys.exit(1) 36 | 37 | # --- Enforce API rate-limiting minimums for authenticated users --- 38 | MIN_WAIT_SEC = 5 39 | if args.rate_limit_in_sec < MIN_WAIT_SEC: 40 | logging.warning( 41 | "Provided rate limit of %d seconds is below the API minimum of %d for authenticated users. Overriding to %d seconds.", 42 | args.rate_limit_in_sec, 43 | MIN_WAIT_SEC, 44 | MIN_WAIT_SEC, 45 | ) 46 | args.rate_limit_in_sec = MIN_WAIT_SEC 47 | 48 | # --- Build API parameters dictionary from CLI args --- 49 | api_params = {} 50 | if args.capture_all: 51 | api_params["capture_all"] = "1" 52 | if args.capture_outlinks: 53 | api_params["capture_outlinks"] = "1" 54 | if args.capture_screenshot: 55 | api_params["capture_screenshot"] = "1" 56 | if args.delay_wb_availability: 57 | api_params["delay_wb_availability"] = "1" 58 | if args.force_get: 59 | api_params["force_get"] = "1" 60 | if args.skip_first_archive: 61 | api_params["skip_first_archive"] = "1" 62 | if args.email_result: 63 | api_params["email_result"] = "1" 64 | if args.if_not_archived_within: 65 | api_params["if_not_archived_within"] = args.if_not_archived_within 66 | if args.js_behavior_timeout is not None: 67 | api_params["js_behavior_timeout"] = args.js_behavior_timeout 68 | if args.capture_cookie: 69 | api_params["capture_cookie"] = args.capture_cookie 70 | if args.use_user_agent: 71 | api_params["use_user_agent"] = args.use_user_agent 72 | 73 | if api_params: 74 | logging.info(f"Using the following API parameters: {api_params}") 75 | 76 | # --- Gather all URLs to archive --- 77 | urls_to_archive = set() 78 | logging.info("Gathering URLs to archive...") 79 | if args.urls: 80 | logging.info(f"Found {len(args.urls)} URLs from command-line arguments.") 81 | urls_to_archive.update(args.urls) 82 | if args.sitemaps: 83 | session = requests.Session() 84 | retries = Retry( 85 | total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504] 86 | ) 87 | session.mount("https://", HTTPAdapter(max_retries=retries)) 88 | session.mount("http://", HTTPAdapter(max_retries=retries)) 89 | logging.info(f"Processing {len(args.sitemaps)} sitemap(s)...") 90 | sitemap_urls = process_sitemaps(args.sitemaps, session) 91 | logging.info(f"Found {len(sitemap_urls)} URLs from sitemaps.") 92 | urls_to_archive.update(sitemap_urls) 93 | if args.archive_sitemap: 94 | remote_sitemaps = {s for s in args.sitemaps if not s.startswith("file://")} 95 | urls_to_archive.update(remote_sitemaps) 96 | if args.file: 97 | with open(args.file) as f: 98 | urls_from_file = {line.strip() for line in f if line.strip()} 99 | logging.info(f"Found {len(urls_from_file)} URLs from file: {args.file}") 100 | urls_to_archive.update(urls_from_file) 101 | 102 | urls_to_process = list(urls_to_archive) 103 | if not urls_to_process: 104 | logging.warning("No unique URLs found to archive. Exiting.") 105 | return 106 | logging.info(f"Found a total of {len(urls_to_process)} unique URLs to archive.") 107 | if args.random_order: 108 | logging.info("Randomizing the order of URLs.") 109 | random.shuffle(urls_to_process) 110 | 111 | # --- Run the archiving workflow --- 112 | logging.info("SPN2 credentials found. 
Using authenticated API workflow.") 113 | client_session = requests.Session() 114 | retries = Retry( 115 | total=5, 116 | backoff_factor=args.rate_limit_in_sec, 117 | status_forcelist=[500, 502, 503, 504, 520], 118 | allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"], 119 | ) 120 | client_session.mount("https://", HTTPAdapter(max_retries=retries)) 121 | client_session.mount("http://", HTTPAdapter(max_retries=retries)) 122 | 123 | client = SPN2Client( 124 | session=client_session, access_key=access_key, secret_key=secret_key 125 | ) 126 | run_archive_workflow(client, urls_to_process, args.rate_limit_in_sec, api_params) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # tests/test_cli.py 2 | import sys 3 | from unittest import mock 4 | import pytest 5 | import logging 6 | from wayback_machine_archiver.archiver import main 7 | from wayback_machine_archiver.cli import create_parser 8 | 9 | # This test file now mocks the main workflow and any I/O functions 10 | # to keep the tests focused purely on the CLI argument parsing logic. 11 | 12 | 13 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 14 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 15 | @mock.patch("wayback_machine_archiver.archiver.logging.basicConfig") 16 | @pytest.mark.parametrize( 17 | "input_level, expected_level", 18 | [("info", "INFO"), ("DEBUG", "DEBUG")], 19 | ) 20 | def test_log_level( 21 | mock_basic_config, mock_workflow, mock_sitemaps, input_level, expected_level 22 | ): 23 | """Verify that the --log argument is case-insensitive.""" 24 | with mock.patch( 25 | "wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key" 26 | ): 27 | sys.argv = ["archiver", "http://test.com", "--log", input_level] 28 | main() 29 | mock_basic_config.assert_called_once_with(level=expected_level, filename=None) 30 | 31 | 32 | def test_version_action_exits(): 33 | """Verify that the --version argument exits the program.""" 34 | sys.argv = ["archiver", "--version"] 35 | with pytest.raises(SystemExit): 36 | main() 37 | 38 | 39 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 40 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 41 | @mock.patch("wayback_machine_archiver.archiver.logging.basicConfig") 42 | def test_log_to_file(mock_basic_config, mock_workflow, mock_sitemaps): 43 | """Verify that --log-to-file passes the filename to the logging config.""" 44 | with mock.patch( 45 | "wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key" 46 | ): 47 | log_file = "archive.log" 48 | sys.argv = ["archiver", "http://test.com", "--log-to-file", log_file] 49 | main() 50 | mock_basic_config.assert_called_once_with( 51 | level=logging.WARNING, filename=log_file 52 | ) 53 | 54 | 55 | @pytest.mark.parametrize( 56 | "user_input, expected_wait", 57 | [(2, 5), (10, 10)], 58 | ) 59 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 60 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 61 | def test_rate_limit_override(mock_workflow, mock_sitemaps, user_input, expected_wait): 62 | """Verify the script enforces the minimum rate-limit for authenticated users.""" 63 | with mock.patch( 64 | "wayback_machine_archiver.archiver.os.getenv", 
return_value="dummy_key" 65 | ): 66 | sys.argv = ["archiver", "http://test.com", "--rate-limit-wait", str(user_input)] 67 | main() 68 | # The third argument to the mock_workflow call is the rate limit. 69 | final_rate_limit = mock_workflow.call_args[0][2] 70 | assert final_rate_limit == expected_wait 71 | 72 | 73 | @mock.patch("wayback_machine_archiver.archiver.logging.error") 74 | def test_main_exits_if_no_credentials(mock_logging_error): 75 | """Verify the script raises SystemExit if getenv returns None for credentials.""" 76 | with mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value=None): 77 | sys.argv = ["archiver", "http://test.com"] 78 | with pytest.raises(SystemExit) as e: 79 | main() 80 | 81 | # Check that the exit code is 1 (error) 82 | assert e.value.code == 1 83 | # Check that we logged an error message to the user 84 | assert mock_logging_error.call_count > 0 85 | 86 | 87 | def test_api_option_flags_are_parsed_correctly(): 88 | """ 89 | Directly tests the parser to ensure all API flags are correctly defined 90 | and their default values are as expected. 91 | """ 92 | parser = create_parser() 93 | 94 | # Test default values (when no flags are passed) 95 | args = parser.parse_args([]) 96 | assert args.capture_all is False 97 | assert args.capture_outlinks is False 98 | assert args.capture_screenshot is False 99 | assert args.delay_wb_availability is False 100 | assert args.force_get is False 101 | assert args.skip_first_archive is False 102 | assert args.email_result is False 103 | assert args.if_not_archived_within is None 104 | assert args.js_behavior_timeout is None 105 | assert args.capture_cookie is None 106 | assert args.use_user_agent is None 107 | 108 | # Test boolean flags are set to True 109 | args = parser.parse_args( 110 | [ 111 | "--capture-all", 112 | "--capture-outlinks", 113 | "--capture-screenshot", 114 | "--delay-wb-availability", 115 | "--force-get", 116 | "--skip-first-archive", 117 | "--email-result", 118 | ] 119 | ) 120 | assert args.capture_all is True 121 | assert args.capture_outlinks is True 122 | assert args.capture_screenshot is True 123 | assert args.delay_wb_availability is True 124 | assert args.force_get is True 125 | assert args.skip_first_archive is True 126 | assert args.email_result is True 127 | 128 | # Test value-based flags 129 | args = parser.parse_args( 130 | [ 131 | "--if-not-archived-within", 132 | "10d 5h", 133 | "--js-behavior-timeout", 134 | "25", 135 | "--capture-cookie", 136 | "name=value", 137 | "--user-agent", 138 | "MyTestAgent/1.0", 139 | ] 140 | ) 141 | assert args.if_not_archived_within == "10d 5h" 142 | assert args.js_behavior_timeout == 25 143 | assert args.capture_cookie == "name=value" 144 | assert args.use_user_agent == "MyTestAgent/1.0" 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Wayback Machine Archiver 2 | 3 | Wayback Machine Archiver (Archiver for short) is a command-line utility 4 | written in Python to back up web pages using the [Internet Archive][ia]. 
5 | 6 | [ia]: https://archive.org/ 7 | 8 | ## Installation 9 | 10 | The best way to install Archiver is with `pip`: 11 | 12 | ```bash 13 | pip install wayback-machine-archiver 14 | ``` 15 | 16 | This will give you access to the script simply by calling: 17 | 18 | ```bash 19 | archiver --help 20 | ``` 21 | 22 | You can also install it directly from a local clone of this repository: 23 | 24 | ```bash 25 | git clone https://github.com/agude/wayback-machine-archiver.git 26 | cd wayback-machine-archiver 27 | pip install . 28 | ``` 29 | 30 | All dependencies are handled automatically. Archiver supports Python 3.8+. 31 | 32 | ## Usage 33 | 34 | The archiver is simple to use from the command line. 35 | 36 | ### Command-Line Examples 37 | 38 | **Archive a single page:** 39 | ```bash 40 | archiver https://alexgude.com 41 | ``` 42 | 43 | **Archive all pages from a sitemap:** 44 | ```bash 45 | archiver --sitemaps https://alexgude.com/sitemap.xml 46 | ``` 47 | 48 | **Archive from a local sitemap file:** 49 | (Note the `file://` prefix is required) 50 | ```bash 51 | archiver --sitemaps file://sitemap.xml 52 | ``` 53 | 54 | **Archive from a text file of URLs:** 55 | (The file should contain one URL per line) 56 | ```bash 57 | archiver --file urls.txt 58 | ``` 59 | 60 | **Combine multiple sources:** 61 | ```bash 62 | archiver https://radiokeysmusic.com --sitemaps https://charles.uno/sitemap.xml 63 | ``` 64 | 65 | **Use advanced API options:** 66 | (Capture a screenshot and skip if archived in the last 10 days) 67 | ```bash 68 | archiver https://alexgude.com --capture-screenshot --if-not-archived-within 10d 69 | ``` 70 | 71 | **Archive the sitemap URL itself:** 72 | ```bash 73 | archiver --sitemaps https://alexgude.com/sitemaps.xml --archive-sitemap-also 74 | ``` 75 | 76 | ## Authentication (Required) 77 | 78 | As of version 3.0.0, this tool requires authentication with the Internet 79 | Archive's SPN2 API. This change was made to ensure all archiving jobs are 80 | reliable and their final success or failure status can be confirmed. The 81 | previous, less reliable method for unauthenticated users has been removed. 82 | 83 | If you run the script without credentials, it will exit with an error message. 84 | 85 | **To set up authentication:** 86 | 87 | 1. Get your S3-style API keys from your Internet Archive account settings: 88 | [https://archive.org/account/s3.php](https://archive.org/account/s3.php) 89 | 90 | 2. Create a `.env` file in the directory where you run the `archiver` 91 | command. Add your keys to it: 92 | ``` 93 | INTERNET_ARCHIVE_ACCESS_KEY="YOUR_ACCESS_KEY_HERE" 94 | INTERNET_ARCHIVE_SECRET_KEY="YOUR_SECRET_KEY_HERE" 95 | ``` 96 | 97 | The script will automatically detect this file (or the equivalent environment 98 | variables) and use the authenticated API. 99 | 100 | ## Help 101 | 102 | For a full list of command-line flags, Archiver has built-in help displayed 103 | with `archiver --help`: 104 | 105 | ``` 106 | usage: archiver [-h] [--version] [--file FILE] 107 | [--sitemaps SITEMAPS [SITEMAPS ...]] 108 | [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] 109 | [--log-to-file LOG_FILE] 110 | [--archive-sitemap-also] 111 | [--rate-limit-wait RATE_LIMIT_IN_SEC] 112 | [--random-order] [--capture-all] 113 | [--capture-outlinks] [--capture-screenshot] 114 | [--delay-wb-availability] [--force-get] 115 | [--skip-first-archive] [--email-result] 116 | [--if-not-archived-within ] 117 | [--js-behavior-timeout ] 118 | [--capture-cookie ] 119 | [--user-agent ] 120 | [urls ...] 
121 | 122 | A script to backup a web pages with Internet Archive 123 | 124 | positional arguments: 125 | urls Specifies the URLs of the pages to archive. 126 | 127 | options: 128 | -h, --help show this help message and exit 129 | --version show program's version number and exit 130 | --file FILE Specifies the path to a file containing URLs to save, 131 | one per line. 132 | --sitemaps SITEMAPS [SITEMAPS ...] 133 | Specifies one or more URIs to sitemaps listing pages 134 | to archive. Local paths must be prefixed with 135 | 'file://'. 136 | --log {DEBUG,INFO,WARNING,ERROR,CRITICAL} 137 | Sets the logging level. Defaults to WARNING 138 | (case-insensitive). 139 | --log-to-file LOG_FILE 140 | Redirects logs to a specified file instead of the 141 | console. 142 | --archive-sitemap-also 143 | Submits the URL of the sitemap itself to be archived. 144 | --rate-limit-wait RATE_LIMIT_IN_SEC 145 | Specifies the number of seconds to wait between 146 | submissions. A minimum of 5 seconds is enforced for 147 | authenticated users. Defaults to 15. 148 | --random-order Randomizes the order of pages before archiving. 149 | 150 | SPN2 API Options: 151 | Control the behavior of the Internet Archive capture API. 152 | 153 | --capture-all Captures a web page even if it returns an error (e.g., 154 | 404, 500). 155 | --capture-outlinks Captures web page outlinks automatically. Note: this 156 | can significantly increase the total number of 157 | captures and runtime. 158 | --capture-screenshot Captures a full page screenshot. 159 | --delay-wb-availability 160 | Reduces load on Internet Archive systems by making the 161 | capture publicly available after ~12 hours instead of 162 | immediately. 163 | --force-get Bypasses the headless browser check, which can speed 164 | up captures for non-HTML content (e.g., PDFs, images). 165 | --skip-first-archive Speeds up captures by skipping the check for whether 166 | this is the first time a URL has been archived. 167 | --email-result Sends an email report of the captured URLs to the 168 | user's registered email. 169 | --if-not-archived-within 170 | Captures only if the latest capture is older than 171 | (e.g., '3d 5h'). 172 | --js-behavior-timeout 173 | Runs JS code for seconds after page load to 174 | trigger dynamic content. Defaults to 5, max is 30. Use 175 | 0 to disable for static pages. 176 | --capture-cookie 177 | Uses an extra HTTP Cookie value when capturing the 178 | target page. 179 | --user-agent 180 | Uses a custom HTTP User-Agent value when capturing the 181 | target page. 182 | ``` 183 | 184 | ## Setting Up a `Sitemap.xml` for Github Pages 185 | 186 | It is easy to automatically generate a sitemap for a Github Pages Jekyll site. 187 | Simply use [jekyll/jekyll-sitemap][jsm]. 188 | 189 | Setup instructions can be found on the above site; they require changing just 190 | a single line of your site's `_config.yml`. 
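For a typical modern Jekyll site this amounts to listing the plugin in `_config.yml`. The snippet below is a rough example, not taken from this repository; check the plugin's README (linked below) for the exact instructions for your Jekyll version:

```yaml
# _config.yml (example; Jekyll versions before 3.5.0 use the `gems:` key instead)
plugins:
  - jekyll-sitemap
```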
191 | 192 | [jsm]: https://github.com/jekyll/jekyll-sitemap 193 | -------------------------------------------------------------------------------- /tests/test_spn2_workflow.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from unittest import mock 3 | import pytest 4 | import time 5 | from wayback_machine_archiver.workflow import ( 6 | _submit_next_url, 7 | _poll_pending_jobs, 8 | run_archive_workflow, 9 | PERMANENT_ERROR_MESSAGES, 10 | TRANSIENT_ERROR_MESSAGES, 11 | ) 12 | 13 | # --- Tests for _submit_next_url --- 14 | 15 | 16 | def test_submit_next_url_success(): 17 | """ 18 | Verify that a successful submission adds the job_id to the pending_jobs 19 | dictionary, consumes the URL, and clears the attempts tracker for that URL. 20 | """ 21 | mock_client = mock.Mock() 22 | mock_client.submit_capture.return_value = "job-123" 23 | 24 | urls_to_process = ["http://example.com"] 25 | pending_jobs = {} 26 | # Simulate a previous failure to ensure the tracker is cleared on success 27 | submission_attempts = {"http://example.com": 1} 28 | 29 | _submit_next_url( 30 | urls_to_process, 31 | mock_client, 32 | pending_jobs, 33 | 5, 34 | submission_attempts, 35 | api_params={}, 36 | ) 37 | 38 | # Assertions 39 | mock_client.submit_capture.assert_called_once_with( 40 | "http://example.com", rate_limit_wait=5, api_params={} 41 | ) 42 | # --- Check the new data structure --- 43 | assert "job-123" in pending_jobs 44 | assert pending_jobs["job-123"]["url"] == "http://example.com" 45 | assert "submitted_at" in pending_jobs["job-123"] 46 | assert not urls_to_process, "URL should have been consumed from the list" 47 | assert "http://example.com" not in submission_attempts, ( 48 | "Attempts tracker should be cleared on success" 49 | ) 50 | 51 | 52 | def test_submit_next_url_failure_requeues_and_tracks_attempt(): 53 | """ 54 | Verify that a failed submission re-queues the URL at the end of the list 55 | and increments its attempt count. 56 | """ 57 | mock_client = mock.Mock() 58 | mock_client.submit_capture.side_effect = Exception("API Error") 59 | 60 | urls_to_process = ["http://a.com", "http://b.com"] 61 | pending_jobs = {} 62 | submission_attempts = {} 63 | 64 | _submit_next_url( 65 | urls_to_process, 66 | mock_client, 67 | pending_jobs, 68 | 5, 69 | submission_attempts, 70 | api_params={}, 71 | ) 72 | 73 | # Assertions 74 | assert not pending_jobs, "No job should have been added on failure" 75 | assert urls_to_process == ["http://b.com", "http://a.com"], ( 76 | "Failed URL should be at the end of the list" 77 | ) 78 | assert submission_attempts == {"http://a.com": 1}, ( 79 | "Attempt count should be incremented" 80 | ) 81 | 82 | 83 | def test_submit_next_url_gives_up_after_max_retries(): 84 | """ 85 | Verify that if a URL has reached its max retry count, it is not 86 | re-queued and the submission is not attempted. 
87 | """ 88 | mock_client = mock.Mock() 89 | 90 | urls_to_process = ["http://will-fail.com"] 91 | pending_jobs = {} 92 | # Simulate that the URL has already failed 3 times 93 | submission_attempts = {"http://will-fail.com": 3} 94 | 95 | _submit_next_url( 96 | urls_to_process, 97 | mock_client, 98 | pending_jobs, 99 | 5, 100 | submission_attempts, 101 | api_params={}, 102 | max_retries=3, 103 | ) 104 | 105 | # Assertions 106 | mock_client.submit_capture.assert_not_called() 107 | assert not pending_jobs 108 | assert not urls_to_process, "URL should be consumed but not re-queued" 109 | assert submission_attempts == {"http://will-fail.com": 4}, ( 110 | "Attempt count is still updated" 111 | ) 112 | 113 | 114 | def test_submit_next_url_passes_api_params_to_client(): 115 | """ 116 | Verify that the api_params dictionary is correctly passed to the client's 117 | submit_capture method. 118 | """ 119 | mock_client = mock.Mock() 120 | mock_client.submit_capture.return_value = "job-123" 121 | urls_to_process = ["http://example.com"] 122 | pending_jobs = {} 123 | submission_attempts = {} 124 | api_params = {"capture_screenshot": "1", "force_get": "1"} 125 | 126 | _submit_next_url( 127 | urls_to_process, 128 | mock_client, 129 | pending_jobs, 130 | 0, 131 | submission_attempts, 132 | api_params, 133 | ) 134 | 135 | mock_client.submit_capture.assert_called_once_with( 136 | "http://example.com", rate_limit_wait=0, api_params=api_params 137 | ) 138 | 139 | 140 | # --- Tests for _poll_pending_jobs --- 141 | 142 | 143 | @mock.patch("wayback_machine_archiver.workflow.time.sleep") 144 | def test_poll_uses_batch_and_removes_completed_jobs(mock_sleep): 145 | """ 146 | Verify that jobs with 'success' or 'error' status are removed from the 147 | pending list via the batch endpoint, while 'pending' jobs remain. 
148 | """ 149 | mock_client = mock.Mock() 150 | # Define the return value for the single batch request 151 | mock_client.check_status_batch.return_value = [ 152 | {"status": "success", "job_id": "job-success", "timestamp": "20250101"}, 153 | {"status": "error", "job_id": "job-error", "message": "Too many redirects."}, 154 | {"status": "pending", "job_id": "job-pending"}, 155 | ] 156 | 157 | # --- Use the new data structure for pending_jobs --- 158 | now = time.time() 159 | pending_jobs = { 160 | "job-success": {"url": "http://a.com", "submitted_at": now}, 161 | "job-error": {"url": "http://b.com", "submitted_at": now}, 162 | "job-pending": {"url": "http://c.com", "submitted_at": now}, 163 | } 164 | 165 | # --- Provide the new required arguments --- 166 | successful, failed, requeued = _poll_pending_jobs( 167 | mock_client, 168 | pending_jobs, 169 | transient_error_retries={}, 170 | max_transient_retries=3, 171 | job_timeout_sec=7200, 172 | ) 173 | 174 | # Assertions 175 | mock_client.check_status_batch.assert_called_once_with( 176 | ["job-success", "job-error", "job-pending"] 177 | ) 178 | # --- Check the new data structure in the assertion --- 179 | assert list(pending_jobs.keys()) == ["job-pending"] 180 | assert pending_jobs["job-pending"]["url"] == "http://c.com" 181 | assert successful == ["http://a.com"] 182 | assert failed == ["http://b.com"] 183 | assert requeued == [] 184 | mock_sleep.assert_called_once() 185 | 186 | 187 | @pytest.mark.parametrize( 188 | "status_ext, api_message, expected_outcome, expected_log_level, expected_log_snippet", 189 | [ 190 | ( 191 | "error:service-unavailable", 192 | "Service is down", 193 | "requeue", 194 | logging.WARNING, 195 | TRANSIENT_ERROR_MESSAGES["error:service-unavailable"], 196 | ), 197 | ( 198 | "error:not-found", 199 | "Page not found", 200 | "fail", 201 | logging.ERROR, 202 | PERMANENT_ERROR_MESSAGES["error:not-found"], 203 | ), 204 | ( 205 | "error:some-new-unseen-error", 206 | "A new error", 207 | "fail", 208 | logging.ERROR, 209 | "An unrecoverable error occurred.", 210 | ), 211 | # --- NEW TEST CASE --- 212 | # This simulates the bug: the status_ext is a generic failure, but the 213 | # message contains "RecursionError", which should trigger a requeue. 214 | ( 215 | "error:job-failed", 216 | "encoding with 'idna' codec failed (RecursionError: maximum recursion depth exceeded)", 217 | "requeue", 218 | logging.WARNING, 219 | TRANSIENT_ERROR_MESSAGES["error:recursion-error"], 220 | ), 221 | ], 222 | ) 223 | @mock.patch("wayback_machine_archiver.workflow.time.sleep") 224 | def test_poll_pending_jobs_handles_errors_intelligently( 225 | mock_sleep, 226 | caplog, 227 | status_ext, 228 | api_message, 229 | expected_outcome, 230 | expected_log_level, 231 | expected_log_snippet, 232 | ): 233 | """ 234 | Verify that _poll_pending_jobs correctly categorizes errors as either 235 | transient (re-queue) or permanent (fail) and logs helpful messages. 
236 | """ 237 | mock_client = mock.Mock() 238 | mock_client.check_status_batch.return_value = [ 239 | { 240 | "status": "error", 241 | "job_id": "job-1", 242 | "status_ext": status_ext, 243 | "message": api_message, 244 | } 245 | ] 246 | # --- Use the new data structure for pending_jobs --- 247 | pending_jobs = {"job-1": {"url": "http://example.com", "submitted_at": time.time()}} 248 | 249 | with caplog.at_level(logging.WARNING): 250 | # --- Provide the new required arguments --- 251 | successful, failed, requeued = _poll_pending_jobs( 252 | mock_client, 253 | pending_jobs, 254 | transient_error_retries={}, 255 | max_transient_retries=3, 256 | job_timeout_sec=7200, 257 | ) 258 | 259 | assert not successful 260 | if expected_outcome == "requeue": 261 | assert requeued == ["http://example.com"] 262 | assert not failed 263 | else: # fail 264 | assert not requeued 265 | assert failed == ["http://example.com"] 266 | 267 | assert len(caplog.records) == 1 268 | log_record = caplog.records[0] 269 | assert log_record.levelno == expected_log_level 270 | assert expected_log_snippet in log_record.message 271 | 272 | 273 | # --- Corrected test for run_archive_workflow dynamic polling --- 274 | 275 | 276 | @mock.patch("wayback_machine_archiver.workflow.time.sleep") 277 | @mock.patch("wayback_machine_archiver.workflow._poll_pending_jobs") 278 | @mock.patch("wayback_machine_archiver.workflow._submit_next_url") 279 | def test_run_archive_workflow_dynamic_polling_is_fast_and_correct( 280 | mock_submit, mock_poll, mock_sleep 281 | ): 282 | """ 283 | Verify that the polling wait time increases exponentially when jobs are pending 284 | and the submission queue is empty, and that the test runs quickly. 285 | """ 286 | mock_client = mock.Mock() 287 | initial_urls = ["http://a.com"] 288 | # Use a mutable list for the test to simulate its modification by _submit_next_url 289 | urls_to_process_list = list(initial_urls) 290 | rate_limit_in_sec = 0 291 | api_params = {} 292 | 293 | # Configure mock_submit to simulate a successful submission 294 | # It needs to modify the urls_to_process_list and pending_jobs_dict passed to it 295 | def submit_side_effect(urls_proc, client_arg, pending_jobs_dict, *args, **kwargs): 296 | url = urls_proc.pop(0) # Remove the URL from the list 297 | job_id = f"job-{url}" 298 | # --- Use the new data structure --- 299 | pending_jobs_dict[job_id] = {"url": url, "submitted_at": time.time()} 300 | return job_id 301 | 302 | mock_submit.side_effect = submit_side_effect 303 | 304 | # Configure mock_poll to simulate jobs staying pending, then succeeding 305 | poll_calls = 0 306 | 307 | def poll_side_effect(client_arg, pending_jobs_dict, *args, **kwargs): 308 | nonlocal poll_calls 309 | poll_calls += 1 310 | if poll_calls <= 3: # Simulate pending for 3 calls 311 | return [], [], [] # No success, no failure, no requeue 312 | else: # Simulate success on the 4th call 313 | # --- Extract URLs from the new data structure --- 314 | successful_urls = [job["url"] for job in pending_jobs_dict.values()] 315 | pending_jobs_dict.clear() 316 | return successful_urls, [], [] 317 | 318 | mock_poll.side_effect = poll_side_effect 319 | 320 | # Call the main workflow function 321 | run_archive_workflow( 322 | mock_client, urls_to_process_list, rate_limit_in_sec, api_params 323 | ) 324 | 325 | # Assertions 326 | # Check the calls to time.sleep 327 | # We expect sleep to be called between polling cycles when the submission 328 | # queue is empty. 329 | # Cycle 1: Submits URL. Polls. Loop continues. 
330 | # Cycle 2: No URLs to submit. Polls. Sleeps for 5s. 331 | # Cycle 3: No URLs to submit. Polls. Sleeps for 7s (5 * 1.5). 332 | # Cycle 4: No URLs to submit. Polls. Sleeps for 10s (7 * 1.5). 333 | # Cycle 5: No URLs to submit. Polls (job succeeds). Loop terminates. 334 | # We filter out the small 0.2s sleeps that happen inside _poll_pending_jobs. 335 | sleep_calls = [call[0][0] for call in mock_sleep.call_args_list if call[0][0] > 1] 336 | 337 | assert sleep_calls == [5, 7, 10] 338 | assert mock_submit.call_count == 1 339 | # The poll side effect now runs 4 times to get to the success case 340 | assert mock_poll.call_count == 4 341 | assert not urls_to_process_list # Ensure the initial URL list is empty 342 | 343 | 344 | def test_poll_gives_up_after_max_transient_retries(caplog): 345 | """ 346 | Verify that if a URL fails with a transient error more times than allowed, 347 | it is marked as a permanent failure and not re-queued. 348 | """ 349 | mock_client = mock.Mock() 350 | mock_client.check_status_batch.return_value = [ 351 | { 352 | "status": "error", 353 | "job_id": "job-1", 354 | "status_ext": "error:service-unavailable", # A transient error 355 | "message": "API message", 356 | } 357 | ] 358 | 359 | url = "http://example.com" 360 | max_retries = 3 361 | 362 | # Simulate that this URL has already failed 3 times with a transient error 363 | transient_error_retries = {url: 3} 364 | pending_jobs = {"job-1": {"url": url, "submitted_at": time.time()}} 365 | 366 | with caplog.at_level(logging.INFO): 367 | successful, failed, requeued = _poll_pending_jobs( 368 | mock_client, 369 | pending_jobs, 370 | transient_error_retries, 371 | max_transient_retries=max_retries, 372 | job_timeout_sec=7200, 373 | ) 374 | 375 | # Assertions 376 | assert not successful 377 | assert not requeued, "URL should not have been re-queued" 378 | assert failed == [url], "URL should have been marked as failed" 379 | assert "Marking as a permanent failure" in caplog.text 380 | 381 | 382 | def test_poll_fails_job_after_timeout(caplog): 383 | """ 384 | Verify that a job that remains in a 'pending' state for longer than the 385 | timeout period is marked as a failure. 386 | """ 387 | mock_client = mock.Mock() 388 | mock_client.check_status_batch.return_value = [ 389 | {"status": "pending", "job_id": "job-stuck"} 390 | ] 391 | 392 | url = "http://stuck.com" 393 | timeout_sec = 3600 # 1 hour 394 | 395 | # Simulate a job that was submitted long ago, well before the timeout 396 | stale_timestamp = time.time() - (timeout_sec + 60) 397 | pending_jobs = {"job-stuck": {"url": url, "submitted_at": stale_timestamp}} 398 | 399 | with caplog.at_level(logging.INFO): 400 | successful, failed, requeued = _poll_pending_jobs( 401 | mock_client, 402 | pending_jobs, 403 | transient_error_retries={}, 404 | max_transient_retries=3, 405 | job_timeout_sec=timeout_sec, 406 | ) 407 | 408 | # Assertions 409 | assert not successful 410 | assert not requeued 411 | assert failed == [url], "Stuck job should have been marked as failed" 412 | assert "timed out after being pending" in caplog.text 413 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/workflow.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | # A set of transient errors that suggest a retry might be successful. 
5 | REQUEUE_ERRORS = { 6 | "error:bad-gateway", 7 | "error:bandwidth-limit-exceeded", 8 | "error:browsing-timeout", 9 | "error:cannot-fetch", 10 | "error:capture-location-error", 11 | "error:celery", 12 | "error:gateway-timeout", 13 | "error:internal-server-error", 14 | "error:invalid-server-response", 15 | "error:job-failed", 16 | "error:no-browsers-available", 17 | "error:protocol-error", 18 | "error:proxy-error", 19 | "error:read-timeout", 20 | "error:recursion-error", 21 | "error:service-unavailable", 22 | "error:soft-time-limit-exceeded", 23 | "error:too-many-requests", 24 | "error:user-session-limit", 25 | } 26 | 27 | # A map of transient error codes to user-friendly, explanatory messages. 28 | TRANSIENT_ERROR_MESSAGES = { 29 | "error:bad-gateway": "The server reported a temporary upstream issue (Bad Gateway).", 30 | "error:bandwidth-limit-exceeded": "The target server has exceeded its bandwidth limit.", 31 | "error:browsing-timeout": "The headless browser timed out, possibly due to high server load.", 32 | "error:cannot-fetch": "The Internet Archive's systems are temporarily overloaded.", 33 | "error:capture-location-error": "An internal Internet Archive system error occurred.", 34 | "error:celery": "An error occurred in the Internet Archive's internal job queue.", 35 | "error:gateway-timeout": "The server reported a temporary upstream timeout (Gateway Timeout).", 36 | "error:internal-server-error": "The Internet Archive's server reported a temporary internal error.", 37 | "error:invalid-server-response": "The target server sent a malformed response, possibly due to a network glitch.", 38 | "error:job-failed": "The capture failed due to a generic Internet Archive system error.", 39 | "error:no-browsers-available": "The Internet Archive's capture browsers are temporarily at capacity.", 40 | "error:protocol-error": "The HTTP connection was broken, likely due to a network issue.", 41 | "error:proxy-error": "An internal Internet Archive proxy error occurred.", 42 | "error:read-timeout": "The connection timed out while reading data from the server.", 43 | "error:recursion-error": "The server encountered a temporary processing error (RecursionError).", 44 | "error:service-unavailable": "The Internet Archive's service is temporarily unavailable.", 45 | "error:soft-time-limit-exceeded": "The capture took too long and was terminated; a retry may succeed.", 46 | "error:too-many-requests": "The target server is rate-limiting requests.", 47 | "error:user-session-limit": "Your Internet Archive account has reached its concurrent job limit.", 48 | } 49 | 50 | # A map of permanent error codes to user-friendly, explanatory messages. 51 | PERMANENT_ERROR_MESSAGES = { 52 | "error:bad-request": "The API reported a bad request. This may be a bug in the archiver script.", 53 | "error:blocked": "The target site is actively blocking the Internet Archive's requests. 
To save the block page, use the --capture-all flag.", 54 | "error:blocked-client-ip": "Your IP address is on a blocklist (e.g., Spamhaus), and the Internet Archive is refusing the request.", 55 | "error:blocked-url": "This URL is on a blocklist (e.g., a tracking domain) and cannot be archived.", 56 | "error:filesize-limit": "The file at this URL is larger than the 2GB limit and cannot be archived.", 57 | "error:ftp-access-denied": "Access to the FTP resource was denied due to a permissions issue.", 58 | "error:http-version-not-supported": "The target server uses an unsupported HTTP version.", 59 | "error:invalid-host-resolution": "The domain name could not be found. Check for typos in the URL.", 60 | "error:invalid-url-syntax": "The URL is malformed. Please check its structure.", 61 | "error:method-not-allowed": "The server forbids the HTTP method used for archiving. To save this error page, use the --capture-all flag.", 62 | "error:network-authentication-required": "A captive portal or proxy is requiring authentication. To save the login page, use the --capture-all flag.", 63 | "error:no-access": "The page is forbidden (403 Forbidden). To save this error page, use the --capture-all flag.", 64 | "error:not-found": "The page could not be found (404 Not Found). To save this error page, use the --capture-all flag.", 65 | "error:not-implemented": "The server does not support the functionality required to archive the page.", 66 | "error:too-many-daily-captures": "This URL has already been captured the maximum number of times today.", 67 | "error:too-many-redirects": "The URL has too many redirects, likely indicating a redirect loop.", 68 | "error:unauthorized": "The page requires a login (401 Unauthorized). To save the login/error page, use the --capture-all flag.", 69 | } 70 | 71 | 72 | def _submit_next_url( 73 | urls_to_process, 74 | client, 75 | pending_jobs, 76 | rate_limit_in_sec, 77 | submission_attempts, 78 | api_params, 79 | max_retries=3, 80 | ): 81 | """ 82 | Pops the next URL, submits it, and adds its job_id to pending_jobs. 83 | Returns 'failed' on a definitive failure, otherwise None. 84 | """ 85 | url = urls_to_process.pop(0) 86 | attempt_num = submission_attempts.get(url, 0) + 1 87 | submission_attempts[url] = attempt_num 88 | 89 | if attempt_num > max_retries: 90 | logging.error("URL %s failed submission %d times, giving up.", url, max_retries) 91 | return "failed" 92 | 93 | try: 94 | logging.info("Submitting %s (attempt %d/%d)...", url, attempt_num, max_retries) 95 | job_id = client.submit_capture( 96 | url, rate_limit_wait=rate_limit_in_sec, api_params=api_params 97 | ) 98 | 99 | if not job_id: 100 | # The API accepted the request but didn't provide a job_id. 101 | # This is treated as a transient error to trigger a retry. 102 | raise ValueError( 103 | "API did not return a job_id, likely due to rate limiting." 104 | ) 105 | 106 | # --- Store a dictionary with URL and timestamp --- 107 | pending_jobs[job_id] = {"url": url, "submitted_at": time.time()} 108 | if url in submission_attempts: 109 | del submission_attempts[url] 110 | 111 | except ValueError as _: 112 | # This block specifically catches the "no job_id" case. 113 | logging.warning( 114 | "Submission for %s was accepted but no job_id was returned. This can happen under high load or due to rate limits. Re-queuing for another attempt.", 115 | url, 116 | ) 117 | urls_to_process.append(url) 118 | 119 | except Exception as e: 120 | # This block now catches all OTHER submission errors (e.g., network). 
121 | logging.warning( 122 | "Failed to submit URL %s due to a connection or API error: %s. Re-queuing for another attempt.", 123 | url, 124 | e, 125 | ) 126 | urls_to_process.append(url) 127 | 128 | return None 129 | 130 | 131 | def _poll_pending_jobs( 132 | client, 133 | pending_jobs, 134 | transient_error_retries, 135 | max_transient_retries, 136 | job_timeout_sec, 137 | poll_interval_sec=0.2, 138 | ): 139 | """ 140 | Checks the status of all pending jobs using a single batch request. 141 | Returns a tuple of (successful_urls, failed_urls, requeued_urls) for completed jobs. 142 | """ 143 | successful_urls = [] 144 | failed_urls = [] 145 | requeued_urls = [] 146 | 147 | # Get all job IDs that need to be checked. 148 | job_ids_to_check = list(pending_jobs.keys()) 149 | if not job_ids_to_check: 150 | return [], [], [] 151 | 152 | try: 153 | # Make a single batch request for all pending jobs. 154 | # The API is expected to return a list of status objects. 155 | batch_statuses = client.check_status_batch(job_ids_to_check) 156 | 157 | # It's possible the API returns a single object if only one job was queried. 158 | if not isinstance(batch_statuses, list): 159 | batch_statuses = [batch_statuses] 160 | 161 | for status_data in batch_statuses: 162 | job_id = status_data.get("job_id") 163 | if not job_id or job_id not in pending_jobs: 164 | continue 165 | 166 | # --- URL is now inside a dictionary --- 167 | original_url = pending_jobs[job_id]["url"] 168 | status = status_data.get("status") 169 | 170 | if status == "success": 171 | timestamp = status_data.get("timestamp") 172 | archive_url = f"https://web.archive.org/web/{timestamp}/{original_url}" 173 | logging.info("Success for job %s: %s", job_id, archive_url) 174 | del pending_jobs[job_id] 175 | successful_urls.append(original_url) 176 | elif status == "error": 177 | status_ext = status_data.get("status_ext") 178 | api_message = status_data.get("message", "Unknown error") 179 | 180 | # The API can return a generic error code for what is actually a transient 181 | # server-side processing error. We check the message for this specific case. 182 | if "RecursionError" in api_message: 183 | status_ext = "error:recursion-error" 184 | 185 | if status_ext in REQUEUE_ERRORS: 186 | # --- Check if this URL has exceeded its transient retry limit --- 187 | retry_count = transient_error_retries.get(original_url, 0) + 1 188 | transient_error_retries[original_url] = retry_count 189 | 190 | if retry_count > max_transient_retries: 191 | logging.error( 192 | "URL %s failed with a transient error %d times. Marking as a permanent failure. (API code: %s)", 193 | original_url, 194 | max_transient_retries, 195 | status_ext, 196 | ) 197 | del pending_jobs[job_id] 198 | failed_urls.append(original_url) 199 | else: 200 | # --- This is the original re-queue logic --- 201 | helpful_message = TRANSIENT_ERROR_MESSAGES.get( 202 | status_ext, "A transient error occurred." 203 | ) 204 | logging.warning( 205 | "Transient error for %s: %s Re-queuing for another attempt (%d/%d). (API code: %s)", 206 | original_url, 207 | helpful_message, 208 | retry_count, 209 | max_transient_retries, 210 | status_ext, 211 | ) 212 | del pending_jobs[job_id] 213 | requeued_urls.append(original_url) 214 | else: 215 | # Look up the helpful message, with a fallback for unknown permanent errors. 216 | helpful_message = PERMANENT_ERROR_MESSAGES.get( 217 | status_ext, "An unrecoverable error occurred." 
218 | ) 219 | logging.error( 220 | "Permanent error for %s: %s (API message: %s)", 221 | original_url, 222 | helpful_message, 223 | api_message, 224 | ) 225 | del pending_jobs[job_id] 226 | failed_urls.append(original_url) 227 | else: 228 | # --- Check for job timeout if status is pending --- 229 | submitted_at = pending_jobs[job_id]["submitted_at"] 230 | job_age = time.time() - submitted_at 231 | if job_age > job_timeout_sec: 232 | logging.error( 233 | "Job for %s timed out after being pending for over %d seconds. Marking as failed.", 234 | original_url, 235 | job_timeout_sec, 236 | ) 237 | del pending_jobs[job_id] 238 | failed_urls.append(original_url) 239 | else: 240 | logging.debug( 241 | "Job %s (%s) is still pending...", job_id, original_url 242 | ) 243 | 244 | except Exception as e: 245 | logging.error( 246 | "An exception occurred during batch polling: %s. Clearing all pending jobs for this cycle to prevent loops.", 247 | e, 248 | ) 249 | # --- Must extract URLs from the dictionary values --- 250 | failed_urls.extend([job["url"] for job in pending_jobs.values()]) 251 | pending_jobs.clear() 252 | 253 | # A short sleep after each batch poll to be nice to the API. 254 | time.sleep(poll_interval_sec) 255 | 256 | return successful_urls, failed_urls, requeued_urls 257 | 258 | 259 | def run_archive_workflow(client, urls_to_process, rate_limit_in_sec, api_params): 260 | """Manages the main loop for submitting and polling URLs.""" 261 | pending_jobs = {} 262 | submission_attempts = {} 263 | # --- Dictionary to track retries for transient polling errors --- 264 | transient_error_retries = {} 265 | MAX_TRANSIENT_RETRIES = 3 266 | # --- Timeout for jobs stuck in pending state --- 267 | JOB_TIMEOUT_SEC = 7200 # 2 hours 268 | 269 | total_urls = len(urls_to_process) 270 | success_count = 0 271 | failure_count = 0 272 | 273 | # --- Variables for dynamic polling --- 274 | INITIAL_POLLING_WAIT = 5 275 | MAX_POLLING_WAIT = 60 276 | POLLING_BACKOFF_FACTOR = 1.5 277 | polling_wait_time = INITIAL_POLLING_WAIT 278 | 279 | logging.info( 280 | "Beginning interleaved submission and polling of %d URLs...", 281 | total_urls, 282 | ) 283 | 284 | while urls_to_process or pending_jobs: 285 | if urls_to_process: 286 | status = _submit_next_url( 287 | urls_to_process, 288 | client, 289 | pending_jobs, 290 | rate_limit_in_sec, 291 | submission_attempts, 292 | api_params, 293 | ) 294 | if status == "failed": 295 | failure_count += 1 296 | # Reset polling wait time after a new submission 297 | polling_wait_time = INITIAL_POLLING_WAIT 298 | 299 | if pending_jobs: 300 | # --- Pass job timeout to the polling function --- 301 | successful, failed, requeued = _poll_pending_jobs( 302 | client, 303 | pending_jobs, 304 | transient_error_retries, 305 | MAX_TRANSIENT_RETRIES, 306 | JOB_TIMEOUT_SEC, 307 | ) 308 | success_count += len(successful) 309 | failure_count += len(failed) 310 | if requeued: 311 | urls_to_process.extend(requeued) 312 | logging.info( 313 | "Re-queued %d URLs due to transient API errors.", len(requeued) 314 | ) 315 | 316 | if not urls_to_process and pending_jobs: 317 | logging.info( 318 | "%d captures remaining, starting next polling cycle in %d seconds...", 319 | len(pending_jobs), 320 | polling_wait_time, 321 | ) 322 | time.sleep(polling_wait_time) 323 | # Increase wait time for the next cycle 324 | polling_wait_time = min( 325 | int(polling_wait_time * POLLING_BACKOFF_FACTOR), MAX_POLLING_WAIT 326 | ) 327 | 328 | logging.info("--------------------------------------------------") 329 | 
logging.info("Archive workflow complete.") 330 | logging.info(f"Total URLs processed: {total_urls}") 331 | logging.info(f"Successful captures: {success_count}") 332 | logging.info(f"Failed captures: {failure_count}") 333 | logging.info("--------------------------------------------------") 334 | --------------------------------------------------------------------------------