├── src
│   └── wayback_machine_archiver
│       ├── __init__.py
│       ├── clients.py
│       ├── sitemaps.py
│       ├── cli.py
│       ├── archiver.py
│       └── workflow.py
├── tests
│   ├── test_sitemap_is_local.py
│   ├── test_get_namespace.py
│   ├── test_download_remote_sitemap.py
│   ├── test_load_local_sitemap.py
│   ├── test_extract_pages_from_sitemap.py
│   ├── test_main_logic.py
│   ├── test_spn2_client.py
│   ├── test_cli.py
│   └── test_spn2_workflow.py
├── .github
│   └── workflows
│       ├── tests.yml
│       └── release.yml
├── LICENSE
├── pyproject.toml
├── .gitignore
└── README.md
/src/wayback_machine_archiver/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "3.3.1"
2 |
--------------------------------------------------------------------------------
/tests/test_sitemap_is_local.py:
--------------------------------------------------------------------------------
1 | from wayback_machine_archiver.sitemaps import sitemap_is_local, LOCAL_PREFIX
2 |
3 |
4 | def test_local():
5 | URIS = (
6 | "/tmp/sitemap.xml",
7 | "{prefix}/tmp/sitemap.xml".format(prefix=LOCAL_PREFIX),
8 | )
9 | for uri in URIS:
10 | assert sitemap_is_local(uri)
11 |
12 |
13 | def test_remote():
14 | URIS = (
15 | "https://alexgude.com/sitemap.xml",
16 | "http://charles.uno/sitemap.xml",
17 | )
18 | for uri in URIS:
19 | assert not sitemap_is_local(uri)
20 |
--------------------------------------------------------------------------------
/tests/test_get_namespace.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | from wayback_machine_archiver.sitemaps import get_namespace
3 |
4 | ELEMENT = namedtuple("Element", "tag")
5 |
6 |
7 | def test_good_namespace():
8 | NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
9 | test_element = ELEMENT("{namespace}urlset".format(namespace=NAMESPACE))
10 |
11 | assert get_namespace(test_element) == NAMESPACE
12 |
13 |
14 | def test_no_match_namespace():
15 | NAMESPACE = ""
16 | test_element = ELEMENT("{namespace}urlset".format(namespace=NAMESPACE))
17 |
18 | assert get_namespace(test_element) == NAMESPACE
19 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests 🧪
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | runs-on: ubuntu-latest
8 | strategy:
9 | fail-fast: false
10 | matrix:
11 | python-version:
12 | - '3.8'
13 | - '3.9'
14 | - '3.10'
15 | - '3.11'
16 | - '3.12'
17 | - 'pypy-3.9'
18 | - 'pypy-3.10'
19 | name: Python ${{ matrix.python-version }} Test 🧪
20 | steps:
21 | - name: Checkout repository
22 | uses: actions/checkout@v4
23 |
24 | - name: Set up Python ${{ matrix.python-version }} 🐍
25 | uses: actions/setup-python@v4
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 |
29 | - name: Set up uv 💨
30 | uses: astral-sh/setup-uv@v6
31 | with:
32 | version: latest
33 |
34 | - name: Install dependencies 🏗
35 | run: uv pip install --system -e ".[dev]"
36 |
37 | - name: Run Tests 🧪
38 | run: pytest -vv
39 |
40 | - name: Run Smoke Test ⚗️
41 | run: archiver --help
42 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | # MIT License (MIT)
2 |
3 | Copyright © 2018-2025 Alexander Gude
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/tests/test_download_remote_sitemap.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from wayback_machine_archiver.sitemaps import download_remote_sitemap
3 | from requests.adapters import HTTPAdapter
4 | import requests
5 |
6 |
7 | SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
8 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
9 |   <url>
10 |     <loc>https://alexgude.com/blog/double-checking-538/</loc>
11 |     <lastmod>2016-04-28T00:00:00+00:00</lastmod>
12 |   </url>
13 |   <url>
14 |     <loc>https://alexgude.com/files/undergrad_thesis.pdf</loc>
15 |     <lastmod>2019-05-09T16:19:45+00:00</lastmod>
16 |   </url>
17 | </urlset>
18 | """
19 |
20 |
21 | @pytest.fixture
22 | def session():
23 | session = requests.Session()
24 | session.mount("https://", HTTPAdapter())
25 | session.mount("http://", HTTPAdapter())
26 | return session
27 |
28 |
29 | def test_download_remote_sitemap(requests_mock, session):
30 | url = "https://www.radiokeysmusic.com/sitemap.xml"
31 | requests_mock.get(url, text=SITEMAP)
32 | returned_contents = download_remote_sitemap(url, session)
33 | assert returned_contents == SITEMAP.encode("UTF-8")
34 |
35 |
36 | def test_download_remote_sitemap_with_status_error(requests_mock, session):
37 | url = "https://www.radiokeysmusic.com/sitemap.xml"
38 | requests_mock.get(url, text=SITEMAP, status_code=404)
39 | with pytest.raises(requests.exceptions.HTTPError):
40 | download_remote_sitemap(url, session)
41 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "wayback-machine-archiver"
7 | version = "3.3.1"
8 | description = "A Python script to submit web pages to the Wayback Machine for archiving."
9 | readme = "README.md"
10 | authors = [
11 | { name = "Alexander Gude", email = "alex.public.account@gmail.com" },
12 | ]
13 | license = { file = "LICENSE" }
14 | requires-python = ">=3.8"
15 | classifiers = [
16 | "Development Status :: 5 - Production/Stable",
17 | "Environment :: Console",
18 | "Intended Audience :: System Administrators",
19 | "License :: OSI Approved :: MIT License",
20 | "Natural Language :: English",
21 | "Operating System :: OS Independent",
22 | "Programming Language :: Python",
23 | "Topic :: Utilities",
24 | ]
25 | keywords = ["Internet Archive", "Wayback Machine"]
26 | dependencies = [
27 | "python-dotenv",
28 | "requests",
29 | "urllib3",
30 | ]
31 |
32 | [project.urls]
33 | Homepage = "https://github.com/agude/wayback-machine-archiver"
34 |
35 | [project.scripts]
36 | archiver = "wayback_machine_archiver.archiver:main"
37 |
38 | [project.optional-dependencies]
39 | dev = [
40 | "pytest",
41 | "requests-mock",
42 | "bump-my-version",
43 | ]
44 |
45 | [tool.bumpversion]
46 | current_version = "3.3.1"
47 | commit = true
48 | tag = true
49 | message = "Bump version to {new_version}"
50 |
51 | [[tool.bumpversion.files]]
52 | filename = "pyproject.toml"
53 | search = 'version = "{current_version}"'
54 | replace = 'version = "{new_version}"'
55 |
56 | [[tool.bumpversion.files]]
57 | filename = "src/wayback_machine_archiver/__init__.py"
58 | search = '__version__ = "{current_version}"'
59 | replace = '__version__ = "{new_version}"'
60 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Build and Release Package 📦
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | jobs:
8 | test:
9 | runs-on: ubuntu-latest
10 | strategy:
11 | fail-fast: false
12 | matrix:
13 | python-version:
14 | - '3.8'
15 | - '3.9'
16 | - '3.10'
17 | - '3.11'
18 | - '3.12'
19 | - 'pypy-3.9'
20 | - 'pypy-3.10'
21 | name: Python ${{ matrix.python-version }} Test 🧪
22 | steps:
23 | - name: Checkout repository
24 | uses: actions/checkout@v4
25 |
26 | - name: Set up Python ${{ matrix.python-version }} 🐍
27 | uses: actions/setup-python@v4
28 | with:
29 | python-version: ${{ matrix.python-version }}
30 |
31 | - name: Set up uv 💨
32 | uses: astral-sh/setup-uv@v6
33 | with:
34 | version: latest
35 |
36 | - name: Install dependencies 🏗
37 | run: uv pip install --system -e ".[dev]"
38 |
39 | - name: Run Tests 🧪
40 | run: pytest -vv
41 |
42 | - name: Run Smoke Test ⚗️
43 | run: archiver --help
44 |
45 | release:
46 | runs-on: ubuntu-latest
47 | needs: test
48 | name: Build and Publish to PyPI 📦
49 | permissions:
50 | id-token: write
51 | steps:
52 | - name: Checkout repository
53 | uses: actions/checkout@v4
54 |
55 | - name: Set up Python for build 🐍
56 | uses: actions/setup-python@v4
57 | with:
58 | python-version: '3.12'
59 |
60 | - name: Install the modern build tool 🏗
61 | run: python -m pip install build
62 |
63 | - name: Build package 👷
64 | run: python -m build
65 |
66 | - name: Publish distribution 📦 to PyPI
67 | if: startsWith(github.ref, 'refs/tags')
68 | uses: pypa/gh-action-pypi-publish@release/v1
69 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # env files
2 | .env
3 | env
4 |
5 | # Sometimes I use a test sitemap.xml
6 | sitemap.xml
7 |
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | env/
19 | build/
20 | develop-eggs/
21 | dist/
22 | downloads/
23 | eggs/
24 | .eggs/
25 | lib/
26 | lib64/
27 | parts/
28 | sdist/
29 | var/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | .mypy*
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *,cover
54 | .hypothesis/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | #Ipython Notebook
70 | .ipynb_checkpoints
71 |
72 | # swap files
73 | *.swp
74 |
75 | # OSX crap
76 | .DS_Store
77 |
78 | # pickled models
79 | **/*.pickle
80 |
81 | #other crap
82 | **/.ropeproject
83 | checkscript.sh
84 |
85 | # swap
86 | [._]*.s[a-v][a-z]
87 | [._]*.sw[a-p]
88 | [._]s[a-v][a-z]
89 | [._]sw[a-p]
90 | # session
91 | Session.vim
92 | # temporary
93 | .netrwhist
94 | *~
95 | # auto-generated tag files
96 | tags
97 |
98 | *~
99 |
100 | # temporary files which can be created if a process still has a handle open of a deleted file
101 | .fuse_hidden*
102 |
103 | # KDE directory preferences
104 | .directory
105 |
106 | # Linux trash folder which might appear on any partition or disk
107 | .Trash-*
108 |
109 | # .nfs files are created when an open file is removed but is still being accessed
110 | .nfs*
111 |
112 | .vscode/
113 |
--------------------------------------------------------------------------------
/tests/test_load_local_sitemap.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from __future__ import unicode_literals
3 | from wayback_machine_archiver.sitemaps import load_local_sitemap, LOCAL_PREFIX
4 | import os.path
5 | import pytest
6 |
7 |
8 | SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
9 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
10 |   <url>
11 |     <loc>https://alexgude.com/blog/double-checking-538/</loc>
12 |     <lastmod>2016-04-28T00:00:00+00:00</lastmod>
13 |   </url>
14 |   <url>
15 |     <loc>https://alexgude.com/files/undergrad_thesis.pdf</loc>
16 |     <lastmod>2019-05-09T16:19:45+00:00</lastmod>
17 |   </url>
18 | </urlset>
19 | """
20 |
21 |
22 | def test_load_local_file_without_prefix(tmpdir):
23 | # Write a file using pytest's tmpdir so we can read it back
24 | file = tmpdir.join("sitemap.xml")
25 | file.write(SITEMAP)
26 | file_path = os.path.join(file.dirname, file.basename)
27 |
28 | # Read the file
29 | read_contents = load_local_sitemap(file_path)
30 | assert read_contents == SITEMAP
31 |
32 |
33 | def test_load_local_file_with_prefix(tmpdir):
34 | # Write a file using pytest's tmpdir so we can read it back
35 | file = tmpdir.join("sitemap.xml")
36 | file.write(SITEMAP)
37 | file_path = os.path.join(LOCAL_PREFIX, file.dirname, file.basename)
38 |
39 | # Read the file
40 | read_contents = load_local_sitemap(file_path)
41 | assert read_contents == SITEMAP
42 |
43 |
44 | def test_file_does_not_exist(tmpdir):
45 | file_path = "{}/tmp/not_a_real_file".format(LOCAL_PREFIX)
46 |
47 | with pytest.raises(IOError):
48 | load_local_sitemap(file_path)
49 |
50 |
51 | def test_file_is_remote(tmpdir):
52 | file_path = "https://alexgude.com/sitemap.xml"
53 |
54 | with pytest.raises(IOError):
55 | load_local_sitemap(file_path)
56 |
57 |
58 | def test_file_path_is_invalid(tmpdir):
59 | file_path = "tmp/file_path"
60 |
61 | with pytest.raises(IOError):
62 | load_local_sitemap(file_path)
63 |
--------------------------------------------------------------------------------
/src/wayback_machine_archiver/clients.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | import requests
4 |
5 |
6 | class SPN2Client:
7 | """
8 | Handles archiving using the authenticated SPN2 API.
9 | """
10 |
11 | SAVE_URL = "https://web.archive.org/save"
12 | STATUS_URL = "https://web.archive.org/save/status"
13 | STATUS_URL_TEMPLATE = "https://web.archive.org/save/status/{job_id}"
14 |
15 | def __init__(self, session, access_key, secret_key):
16 | self.session = session
17 | self.is_authenticated = True # Always true now
18 |
19 | self.session.headers.update({"Accept": "application/json"})
20 | auth_header = f"LOW {access_key}:{secret_key}"
21 | self.session.headers.update({"Authorization": auth_header})
22 |
23 | def submit_capture(self, url_to_archive, rate_limit_wait, api_params=None):
24 | """Submits a capture request to the SPN2 API."""
25 | if rate_limit_wait > 0:
26 | logging.debug("Sleeping for %s seconds", rate_limit_wait)
27 | time.sleep(rate_limit_wait)
28 | logging.info("Submitting %s to SPN2", url_to_archive)
29 | data = {"url": url_to_archive}
30 | if api_params:
31 | data.update(api_params)
32 |
33 | r = self.session.post(self.SAVE_URL, data=data)
34 | r.raise_for_status()
35 | response_json = r.json()
36 | job_id = response_json.get("job_id")
37 | logging.info("Successfully submitted %s, job_id: %s", url_to_archive, job_id)
38 |
39 | if job_id:
40 | status_check_url = self.STATUS_URL_TEMPLATE.format(job_id=job_id)
41 | logging.debug(
42 | "Manual status check URL for %s: %s", url_to_archive, status_check_url
43 | )
44 |
45 | return job_id
46 |
47 | def check_status(self, job_id):
48 | """Checks the status of a single capture job."""
49 | status_url = self.STATUS_URL_TEMPLATE.format(job_id=job_id)
50 | logging.debug("Checking status for single job_id: %s", job_id)
51 | r = self.session.get(status_url)
52 | r.raise_for_status()
53 | return r.json()
54 |
55 | def check_status_batch(self, job_ids):
56 | """Checks the status of multiple capture jobs in a single request."""
57 | logging.debug("Checking status for %d jobs in a batch.", len(job_ids))
58 | data = {"job_ids": ",".join(job_ids)}
59 | r = self.session.post(self.STATUS_URL, data=data)
60 | r.raise_for_status()
61 | return r.json()
62 |
--------------------------------------------------------------------------------
/tests/test_extract_pages_from_sitemap.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | from __future__ import unicode_literals
3 | from wayback_machine_archiver.sitemaps import extract_urls_from_sitemap
4 |
5 |
6 | def test_ascii_sitemap():
7 | SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
8 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
9 |   <url>
10 |     <loc>https://alexgude.com/blog/double-checking-538/</loc>
11 |     <lastmod>2016-04-28T00:00:00+00:00</lastmod>
12 |   </url>
13 |   <url>
14 |     <loc>https://alexgude.com/files/undergrad_thesis.pdf</loc>
15 |     <lastmod>2019-05-09T16:19:45+00:00</lastmod>
16 |   </url>
17 | </urlset>
18 | """.encode("UTF-8")
19 |
20 | URLS = set(
21 | (
22 | "https://alexgude.com/blog/double-checking-538/",
23 | "https://alexgude.com/files/undergrad_thesis.pdf",
24 | )
25 | )
26 |
27 | assert extract_urls_from_sitemap(SITEMAP) == URLS
28 |
29 |
30 | def test_unicode_sitemap():
31 | SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
32 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
33 |         xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
34 |   <url>
35 |     <loc>https://www.radiokeysmusic.com/home</loc>
36 |     <changefreq>daily</changefreq>
37 |     <priority>1.0</priority>
38 |     <lastmod>2018-12-17</lastmod>
39 |     <image:image>
40 |       <image:loc>https://static1.squarespace.com/static/5c06e0ab1137a66237a2399c/t/5c0d6a4d562fa7678539d405/1544383062969/</image:loc>
41 |       <image:title>Home</image:title>
42 |       <image:caption>Tom, Stewart, Allante, &amp; Emily. Photo by Cory Cullington, 2018.</image:caption>
43 |     </image:image>
44 |   </url>
45 |   <url>
46 |     <loc>https://www.radiokeysmusic.com/about</loc>
47 |     <changefreq>daily</changefreq>
48 |     <priority>0.75</priority>
49 |     <lastmod>2019-01-05</lastmod>
50 |     <image:image>
51 |       <image:loc>https://static1.squarespace.com/static/5c06e0ab1137a66237a2399c/t/5c0d6b5b6d2a7379672b9b34/1544896195646/IMG_9107.jpg</image:loc>
52 |       <image:title>About - Story</image:title>
53 |       <image:caption>instrumentation complimented by Emily’s velvety voice and Stewart’s</image:caption>
54 |     </image:image>
55 |   </url>
56 | </urlset>""".encode("UTF-8")
57 |
58 | URLS = set(
59 | (
60 | "https://www.radiokeysmusic.com/home",
61 | "https://www.radiokeysmusic.com/about",
62 | )
63 | )
64 |
65 | assert extract_urls_from_sitemap(SITEMAP) == URLS
66 |
--------------------------------------------------------------------------------
/src/wayback_machine_archiver/sitemaps.py:
--------------------------------------------------------------------------------
1 | # src/wayback_machine_archiver/sitemaps.py
2 | import logging
3 | import re
4 | import xml.etree.ElementTree as ET
5 | from xml.etree.ElementTree import ParseError
6 |
7 | LOCAL_PREFIX = "file://"
8 |
9 |
10 | def get_namespace(element):
11 | """Extract the namespace from an XML element."""
12 | match = re.match(r"\{.*\}", element.tag)
13 | return match.group(0) if match else ""
14 |
15 |
16 | def download_remote_sitemap(sitemap_url, session):
17 | """Download a remote sitemap file."""
18 | logging.debug("Downloading: %s", sitemap_url)
19 | r = session.get(sitemap_url)
20 | r.raise_for_status()
21 | return r.text.encode("utf-8")
22 |
23 |
24 | def load_local_sitemap(sitemap_filepath):
25 | """Load a local sitemap file."""
26 | logging.debug("Loading local sitemap: %s", sitemap_filepath)
27 | if sitemap_filepath.startswith(LOCAL_PREFIX):
28 | sitemap_filepath = sitemap_filepath[len(LOCAL_PREFIX) :]
29 | with open(sitemap_filepath, "r") as fp:
30 | return fp.read()
31 |
32 |
33 | def sitemap_is_local(sitemap_url):
34 | """Check if a sitemap URI is local."""
35 | return sitemap_url.startswith(LOCAL_PREFIX) or sitemap_url.startswith("/")
36 |
37 |
38 | def extract_urls_from_sitemap(site_map_text):
39 | """Parse XML sitemap text and extract URLs."""
40 | root = ET.fromstring(site_map_text)
41 | namespace = get_namespace(root)
42 | loc_nodes = root.findall(".//{}loc".format(namespace))
43 | return {node.text for node in loc_nodes}
44 |
45 |
46 | def process_sitemaps(sitemap_urls, session):
47 | """
48 | Given a list of sitemap URLs, downloads/loads them and returns a set of all unique URLs found.
49 | """
50 | all_urls = set()
51 | for sitemap_url in sitemap_urls:
52 | try:
53 | if sitemap_is_local(sitemap_url):
54 | logging.debug("The sitemap '%s' is local.", sitemap_url)
55 | sitemap_xml = load_local_sitemap(sitemap_url)
56 | else:
57 | logging.debug("The sitemap '%s' is remote.", sitemap_url)
58 | sitemap_xml = download_remote_sitemap(sitemap_url, session)
59 |
60 | extracted_urls = extract_urls_from_sitemap(sitemap_xml)
61 | all_urls.update(extracted_urls)
62 | except ParseError:
63 | logging.error(
64 | "Failed to parse sitemap from '%s'. The content is not valid XML. Please ensure the URL points directly to a sitemap.xml file. Skipping this sitemap.",
65 | sitemap_url,
66 | )
67 | except Exception as e:
68 | logging.error(
69 | "An error occurred while processing sitemap '%s': %s. Skipping.",
70 | sitemap_url,
71 | e,
72 | )
73 | return all_urls
74 |
--------------------------------------------------------------------------------
/tests/test_main_logic.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from unittest import mock
3 | from wayback_machine_archiver.archiver import main
4 |
5 | # This test file now mocks the main workflow and assumes credentials are present
6 | # to test the URL gathering and shuffling logic.
7 |
8 |
9 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set())
10 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow")
11 | @mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key")
12 | @mock.patch("wayback_machine_archiver.archiver.random.shuffle")
13 | def test_random_order_flag_shuffles_urls(
14 | mock_shuffle, mock_getenv, mock_workflow, mock_sitemaps
15 | ):
16 | """Verify that when --random-order is passed, random.shuffle is called."""
17 | urls_to_archive = ["http://test.com/a", "http://test.com/b"]
18 | sys.argv = ["archiver", "--random-order"] + urls_to_archive
19 | main()
20 | mock_shuffle.assert_called_once()
21 |
22 | # Check for membership, not order, by comparing sets.
23 | # The second argument to the mock_workflow call is the list of URLs.
24 | passed_urls = mock_workflow.call_args[0][1]
25 | assert set(passed_urls) == set(urls_to_archive)
26 |
27 |
28 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set())
29 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow")
30 | @mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key")
31 | @mock.patch("wayback_machine_archiver.archiver.random.shuffle")
32 | def test_default_order_does_not_shuffle(
33 | mock_shuffle, mock_getenv, mock_workflow, mock_sitemaps
34 | ):
35 | """Verify that without --random-order, shuffle is not called."""
36 | urls_to_archive = ["http://test.com/a", "http://test.com/b"]
37 | sys.argv = ["archiver"] + urls_to_archive
38 | main()
39 | mock_shuffle.assert_not_called()
40 |
41 | # Check for membership, not order, by comparing sets.
42 | passed_urls = mock_workflow.call_args[0][1]
43 | assert set(passed_urls) == set(urls_to_archive)
44 |
45 |
46 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set())
47 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow")
48 | @mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key")
49 | def test_main_builds_and_passes_api_params(mock_getenv, mock_workflow, mock_sitemaps):
50 | """
51 | Verify that main() correctly constructs the api_params dictionary from CLI
52 | flags and passes it to the workflow.
53 | """
54 | sys.argv = [
55 | "archiver",
56 | "http://test.com",
57 | "--capture-screenshot",
58 | "--js-behavior-timeout",
59 | "10",
60 | "--if-not-archived-within",
61 | "5d",
62 | "--user-agent",
63 | "TestBot/1.0",
64 | ]
65 | main()
66 |
67 | # The fourth argument to the mock_workflow call is the api_params dict.
68 | passed_params = mock_workflow.call_args[0][3]
69 | expected_params = {
70 | "capture_screenshot": "1",
71 | "js_behavior_timeout": 10,
72 | "if_not_archived_within": "5d",
73 | "use_user_agent": "TestBot/1.0",
74 | }
75 | assert passed_params == expected_params
76 |
--------------------------------------------------------------------------------
/tests/test_spn2_client.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from wayback_machine_archiver.clients import SPN2Client
3 | from requests.adapters import HTTPAdapter
4 | import requests
5 | import urllib.parse
6 |
7 |
8 | @pytest.fixture
9 | def session():
10 | session = requests.Session()
11 | session.mount("https://", HTTPAdapter())
12 | session.mount("http://", HTTPAdapter())
13 | return session
14 |
15 |
16 | @pytest.mark.parametrize(
17 | "api_params",
18 | [
19 | (None),
20 | ({"capture_outlinks": "1", "js_behavior_timeout": 0}),
21 | ({"capture_screenshot": "1", "force_get": "1"}),
22 | ],
23 | )
24 | def test_spn2_client_submit_capture(requests_mock, session, api_params):
25 | """
26 | Verify that submit_capture sends a correct POST request, including optional
27 | API parameters, and returns the job_id.
28 | """
29 | access_key = "test-access"
30 | secret_key = "test-secret"
31 | url_to_archive = "https://example.com"
32 | expected_job_id = "c4b1-4f2a-ac04-1d1225e98695"
33 |
34 | requests_mock.post(
35 | SPN2Client.SAVE_URL, json={"job_id": expected_job_id}, status_code=200
36 | )
37 |
38 | client = SPN2Client(session=session, access_key=access_key, secret_key=secret_key)
39 | job_id = client.submit_capture(
40 | url_to_archive, rate_limit_wait=0, api_params=api_params
41 | )
42 |
43 | # Assertions
44 | assert job_id == expected_job_id
45 | history = requests_mock.request_history
46 | assert len(history) == 1
47 | request = history[0]
48 | assert request.method == "POST"
49 | assert request.url == SPN2Client.SAVE_URL
50 | assert f"LOW {access_key}:{secret_key}" == request.headers["Authorization"]
51 |
52 | expected_payload = {"url": url_to_archive}
53 | if api_params:
54 | expected_payload.update(api_params)
55 | expected_body = urllib.parse.urlencode(expected_payload)
56 | assert request.text == expected_body
57 |
58 |
59 | def test_spn2_client_check_status_success(requests_mock, session):
60 | """
61 | Verify check_status correctly parses a 'success' response.
62 | """
63 | job_id = "test-job-123"
64 | status_url = SPN2Client.STATUS_URL_TEMPLATE.format(job_id=job_id)
65 | success_payload = {
66 | "status": "success",
67 | "original_url": "https://example.com",
68 | "timestamp": "20250101000000",
69 | }
70 | requests_mock.get(status_url, json=success_payload)
71 |
72 | client = SPN2Client(session=session, access_key="key", secret_key="secret")
73 | status_data = client.check_status(job_id)
74 |
75 | assert status_data == success_payload
76 |
77 |
78 | def test_spn2_client_check_status_pending(requests_mock, session):
79 | """
80 | Verify check_status correctly parses a 'pending' response.
81 | """
82 | job_id = "test-job-456"
83 | status_url = SPN2Client.STATUS_URL_TEMPLATE.format(job_id=job_id)
84 | pending_payload = {"status": "pending"}
85 | requests_mock.get(status_url, json=pending_payload)
86 |
87 | client = SPN2Client(session=session, access_key="key", secret_key="secret")
88 | status_data = client.check_status(job_id)
89 |
90 | assert status_data == pending_payload
91 |
92 |
93 | def test_spn2_client_check_status_error(requests_mock, session):
94 | """
95 | Verify check_status correctly parses an 'error' response.
96 | """
97 | job_id = "test-job-789"
98 | status_url = SPN2Client.STATUS_URL_TEMPLATE.format(job_id=job_id)
99 | error_payload = {"status": "error", "message": "Too many redirects."}
100 | requests_mock.get(status_url, json=error_payload)
101 |
102 | client = SPN2Client(session=session, access_key="key", secret_key="secret")
103 | status_data = client.check_status(job_id)
104 |
105 | assert status_data == error_payload
106 |
--------------------------------------------------------------------------------
/src/wayback_machine_archiver/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | from . import __version__
4 |
5 | LOCAL_PREFIX = "file://"
6 |
7 |
8 | def create_parser():
9 | """Creates and returns the argparse parser."""
10 | parser = argparse.ArgumentParser(
11 | prog="archiver",
12 | description="A script to back up web pages with the Internet Archive",
13 | )
14 | parser.add_argument(
15 | "--version",
16 | action="version",
17 | version="%(prog)s {version}".format(version=__version__),
18 | )
19 | parser.add_argument(
20 | "urls",
21 | nargs="*",
22 | default=[],
23 | help="Specifies the URLs of the pages to archive.",
24 | )
25 | parser.add_argument(
26 | "--file",
27 | help="Specifies the path to a file containing URLs to save, one per line.",
28 | required=False,
29 | )
30 | parser.add_argument(
31 | "--sitemaps",
32 | nargs="+",
33 | default=[],
34 | help="Specifies one or more URIs to sitemaps listing pages to archive. Local paths must be prefixed with '{f}'.".format(
35 | f=LOCAL_PREFIX
36 | ),
37 | required=False,
38 | )
39 | parser.add_argument(
40 | "--log",
41 | help="Sets the logging level. Defaults to WARNING (case-insensitive).",
42 | dest="log_level",
43 | default=logging.WARNING,
44 | type=str.upper,
45 | choices=[
46 | "DEBUG",
47 | "INFO",
48 | "WARNING",
49 | "ERROR",
50 | "CRITICAL",
51 | ],
52 | )
53 | parser.add_argument(
54 | "--log-to-file",
55 | help="Redirects logs to a specified file instead of the console.",
56 | dest="log_file",
57 | default=None,
58 | )
59 | parser.add_argument(
60 | "--archive-sitemap-also",
61 | help="Submits the URL of the sitemap itself to be archived.",
62 | dest="archive_sitemap",
63 | default=False,
64 | action="store_true",
65 | )
66 | parser.add_argument(
67 | "--rate-limit-wait",
68 | help="Specifies the number of seconds to wait between submissions. A minimum of 5 seconds is enforced for authenticated users. Defaults to 15.",
69 | dest="rate_limit_in_sec",
70 | default=15,
71 | type=int,
72 | )
73 | parser.add_argument(
74 | "--random-order",
75 | help="Randomizes the order of pages before archiving.",
76 | dest="random_order",
77 | default=False,
78 | action="store_true",
79 | )
80 |
81 | # --- SPN2 API Options ---
82 | api_group = parser.add_argument_group(
83 | "SPN2 API Options", "Control the behavior of the Internet Archive capture API."
84 | )
85 | api_group.add_argument(
86 | "--capture-all",
87 | action="store_true",
88 | help="Captures a web page even if it returns an error (e.g., 404, 500).",
89 | )
90 | api_group.add_argument(
91 | "--capture-outlinks",
92 | action="store_true",
93 | help="Captures web page outlinks automatically. Note: this can significantly increase the total number of captures and runtime.",
94 | )
95 | api_group.add_argument(
96 | "--capture-screenshot",
97 | action="store_true",
98 | help="Captures a full page screenshot.",
99 | )
100 | api_group.add_argument(
101 | "--delay-wb-availability",
102 | action="store_true",
103 | help="Reduces load on Internet Archive systems by making the capture publicly available after ~12 hours instead of immediately.",
104 | )
105 | api_group.add_argument(
106 | "--force-get",
107 | action="store_true",
108 | help="Bypasses the headless browser check, which can speed up captures for non-HTML content (e.g., PDFs, images).",
109 | )
110 | api_group.add_argument(
111 | "--skip-first-archive",
112 | action="store_true",
113 | help="Speeds up captures by skipping the check for whether this is the first time a URL has been archived.",
114 | )
115 | api_group.add_argument(
116 | "--email-result",
117 | action="store_true",
118 | help="Sends an email report of the captured URLs to the user's registered email.",
119 | )
120 | api_group.add_argument(
121 | "--if-not-archived-within",
122 | type=str,
123 | metavar="",
124 | help="Captures only if the latest capture is older than (e.g., '3d 5h').",
125 | )
126 | api_group.add_argument(
127 | "--js-behavior-timeout",
128 | type=int,
129 | metavar="",
130 | help="Runs JS code for seconds after page load to trigger dynamic content. Defaults to 5, max is 30. Use 0 to disable for static pages.",
131 | )
132 | api_group.add_argument(
133 | "--capture-cookie",
134 | type=str,
135 | metavar="",
136 | help="Uses an extra HTTP Cookie value when capturing the target page.",
137 | )
138 | api_group.add_argument(
139 | "--user-agent",
140 | type=str,
141 | metavar="",
142 | dest="use_user_agent",
143 | help="Uses a custom HTTP User-Agent value when capturing the target page.",
144 | )
145 |
146 | return parser
147 |
--------------------------------------------------------------------------------
/src/wayback_machine_archiver/archiver.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import random
4 | import sys
5 | import requests
6 | from requests.adapters import HTTPAdapter
7 | from urllib3.util.retry import Retry
8 | from dotenv import load_dotenv
9 |
10 | from .clients import SPN2Client
11 | from .cli import create_parser
12 | from .sitemaps import process_sitemaps
13 | from .workflow import run_archive_workflow
14 |
15 |
16 | def main():
17 | """Main entry point for the archiver script."""
18 | parser = create_parser()
19 | args = parser.parse_args()
20 |
21 | logging.basicConfig(level=args.log_level, filename=args.log_file)
22 | load_dotenv()
23 |
24 | # --- Load and REQUIRE credentials ---
25 | access_key = os.getenv("INTERNET_ARCHIVE_ACCESS_KEY")
26 | secret_key = os.getenv("INTERNET_ARCHIVE_SECRET_KEY")
27 |
28 | if not (access_key and secret_key):
29 | logging.error(
30 | "Authentication required. Please provide your Internet Archive S3-style keys."
31 | )
32 | logging.error("You can get your keys from: https://archive.org/account/s3.php")
33 | logging.error("Then, create a .env file or set the environment variables:")
34 | logging.error("INTERNET_ARCHIVE_ACCESS_KEY and INTERNET_ARCHIVE_SECRET_KEY")
35 | sys.exit(1)
36 |
37 | # --- Enforce API rate-limiting minimums for authenticated users ---
38 | MIN_WAIT_SEC = 5
39 | if args.rate_limit_in_sec < MIN_WAIT_SEC:
40 | logging.warning(
41 | "Provided rate limit of %d seconds is below the API minimum of %d for authenticated users. Overriding to %d seconds.",
42 | args.rate_limit_in_sec,
43 | MIN_WAIT_SEC,
44 | MIN_WAIT_SEC,
45 | )
46 | args.rate_limit_in_sec = MIN_WAIT_SEC
47 |
48 | # --- Build API parameters dictionary from CLI args ---
49 | api_params = {}
50 | if args.capture_all:
51 | api_params["capture_all"] = "1"
52 | if args.capture_outlinks:
53 | api_params["capture_outlinks"] = "1"
54 | if args.capture_screenshot:
55 | api_params["capture_screenshot"] = "1"
56 | if args.delay_wb_availability:
57 | api_params["delay_wb_availability"] = "1"
58 | if args.force_get:
59 | api_params["force_get"] = "1"
60 | if args.skip_first_archive:
61 | api_params["skip_first_archive"] = "1"
62 | if args.email_result:
63 | api_params["email_result"] = "1"
64 | if args.if_not_archived_within:
65 | api_params["if_not_archived_within"] = args.if_not_archived_within
66 | if args.js_behavior_timeout is not None:
67 | api_params["js_behavior_timeout"] = args.js_behavior_timeout
68 | if args.capture_cookie:
69 | api_params["capture_cookie"] = args.capture_cookie
70 | if args.use_user_agent:
71 | api_params["use_user_agent"] = args.use_user_agent
72 |
73 | if api_params:
74 | logging.info(f"Using the following API parameters: {api_params}")
75 |
76 | # --- Gather all URLs to archive ---
77 | urls_to_archive = set()
78 | logging.info("Gathering URLs to archive...")
79 | if args.urls:
80 | logging.info(f"Found {len(args.urls)} URLs from command-line arguments.")
81 | urls_to_archive.update(args.urls)
82 | if args.sitemaps:
83 | session = requests.Session()
84 | retries = Retry(
85 | total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504]
86 | )
87 | session.mount("https://", HTTPAdapter(max_retries=retries))
88 | session.mount("http://", HTTPAdapter(max_retries=retries))
89 | logging.info(f"Processing {len(args.sitemaps)} sitemap(s)...")
90 | sitemap_urls = process_sitemaps(args.sitemaps, session)
91 | logging.info(f"Found {len(sitemap_urls)} URLs from sitemaps.")
92 | urls_to_archive.update(sitemap_urls)
93 | if args.archive_sitemap:
94 | remote_sitemaps = {s for s in args.sitemaps if not s.startswith("file://")}
95 | urls_to_archive.update(remote_sitemaps)
96 | if args.file:
97 | with open(args.file) as f:
98 | urls_from_file = {line.strip() for line in f if line.strip()}
99 | logging.info(f"Found {len(urls_from_file)} URLs from file: {args.file}")
100 | urls_to_archive.update(urls_from_file)
101 |
102 | urls_to_process = list(urls_to_archive)
103 | if not urls_to_process:
104 | logging.warning("No unique URLs found to archive. Exiting.")
105 | return
106 | logging.info(f"Found a total of {len(urls_to_process)} unique URLs to archive.")
107 | if args.random_order:
108 | logging.info("Randomizing the order of URLs.")
109 | random.shuffle(urls_to_process)
110 |
111 | # --- Run the archiving workflow ---
112 | logging.info("SPN2 credentials found. Using authenticated API workflow.")
113 | client_session = requests.Session()
114 | retries = Retry(
115 | total=5,
116 | backoff_factor=args.rate_limit_in_sec,
117 | status_forcelist=[500, 502, 503, 504, 520],
118 | allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"],
119 | )
120 | client_session.mount("https://", HTTPAdapter(max_retries=retries))
121 | client_session.mount("http://", HTTPAdapter(max_retries=retries))
122 |
123 | client = SPN2Client(
124 | session=client_session, access_key=access_key, secret_key=secret_key
125 | )
126 | run_archive_workflow(client, urls_to_process, args.rate_limit_in_sec, api_params)
127 |
128 |
129 | if __name__ == "__main__":
130 | main()
131 |
--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
1 | # tests/test_cli.py
2 | import sys
3 | from unittest import mock
4 | import pytest
5 | import logging
6 | from wayback_machine_archiver.archiver import main
7 | from wayback_machine_archiver.cli import create_parser
8 |
9 | # This test file now mocks the main workflow and any I/O functions
10 | # to keep the tests focused purely on the CLI argument parsing logic.
11 |
12 |
13 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set())
14 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow")
15 | @mock.patch("wayback_machine_archiver.archiver.logging.basicConfig")
16 | @pytest.mark.parametrize(
17 | "input_level, expected_level",
18 | [("info", "INFO"), ("DEBUG", "DEBUG")],
19 | )
20 | def test_log_level(
21 | mock_basic_config, mock_workflow, mock_sitemaps, input_level, expected_level
22 | ):
23 | """Verify that the --log argument is case-insensitive."""
24 | with mock.patch(
25 | "wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key"
26 | ):
27 | sys.argv = ["archiver", "http://test.com", "--log", input_level]
28 | main()
29 | mock_basic_config.assert_called_once_with(level=expected_level, filename=None)
30 |
31 |
32 | def test_version_action_exits():
33 | """Verify that the --version argument exits the program."""
34 | sys.argv = ["archiver", "--version"]
35 | with pytest.raises(SystemExit):
36 | main()
37 |
38 |
39 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set())
40 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow")
41 | @mock.patch("wayback_machine_archiver.archiver.logging.basicConfig")
42 | def test_log_to_file(mock_basic_config, mock_workflow, mock_sitemaps):
43 | """Verify that --log-to-file passes the filename to the logging config."""
44 | with mock.patch(
45 | "wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key"
46 | ):
47 | log_file = "archive.log"
48 | sys.argv = ["archiver", "http://test.com", "--log-to-file", log_file]
49 | main()
50 | mock_basic_config.assert_called_once_with(
51 | level=logging.WARNING, filename=log_file
52 | )
53 |
54 |
55 | @pytest.mark.parametrize(
56 | "user_input, expected_wait",
57 | [(2, 5), (10, 10)],
58 | )
59 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set())
60 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow")
61 | def test_rate_limit_override(mock_workflow, mock_sitemaps, user_input, expected_wait):
62 | """Verify the script enforces the minimum rate-limit for authenticated users."""
63 | with mock.patch(
64 | "wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key"
65 | ):
66 | sys.argv = ["archiver", "http://test.com", "--rate-limit-wait", str(user_input)]
67 | main()
68 | # The third argument to the mock_workflow call is the rate limit.
69 | final_rate_limit = mock_workflow.call_args[0][2]
70 | assert final_rate_limit == expected_wait
71 |
72 |
73 | @mock.patch("wayback_machine_archiver.archiver.logging.error")
74 | def test_main_exits_if_no_credentials(mock_logging_error):
75 | """Verify the script raises SystemExit if getenv returns None for credentials."""
76 | with mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value=None):
77 | sys.argv = ["archiver", "http://test.com"]
78 | with pytest.raises(SystemExit) as e:
79 | main()
80 |
81 | # Check that the exit code is 1 (error)
82 | assert e.value.code == 1
83 | # Check that we logged an error message to the user
84 | assert mock_logging_error.call_count > 0
85 |
86 |
87 | def test_api_option_flags_are_parsed_correctly():
88 | """
89 | Directly tests the parser to ensure all API flags are correctly defined
90 | and their default values are as expected.
91 | """
92 | parser = create_parser()
93 |
94 | # Test default values (when no flags are passed)
95 | args = parser.parse_args([])
96 | assert args.capture_all is False
97 | assert args.capture_outlinks is False
98 | assert args.capture_screenshot is False
99 | assert args.delay_wb_availability is False
100 | assert args.force_get is False
101 | assert args.skip_first_archive is False
102 | assert args.email_result is False
103 | assert args.if_not_archived_within is None
104 | assert args.js_behavior_timeout is None
105 | assert args.capture_cookie is None
106 | assert args.use_user_agent is None
107 |
108 | # Test boolean flags are set to True
109 | args = parser.parse_args(
110 | [
111 | "--capture-all",
112 | "--capture-outlinks",
113 | "--capture-screenshot",
114 | "--delay-wb-availability",
115 | "--force-get",
116 | "--skip-first-archive",
117 | "--email-result",
118 | ]
119 | )
120 | assert args.capture_all is True
121 | assert args.capture_outlinks is True
122 | assert args.capture_screenshot is True
123 | assert args.delay_wb_availability is True
124 | assert args.force_get is True
125 | assert args.skip_first_archive is True
126 | assert args.email_result is True
127 |
128 | # Test value-based flags
129 | args = parser.parse_args(
130 | [
131 | "--if-not-archived-within",
132 | "10d 5h",
133 | "--js-behavior-timeout",
134 | "25",
135 | "--capture-cookie",
136 | "name=value",
137 | "--user-agent",
138 | "MyTestAgent/1.0",
139 | ]
140 | )
141 | assert args.if_not_archived_within == "10d 5h"
142 | assert args.js_behavior_timeout == 25
143 | assert args.capture_cookie == "name=value"
144 | assert args.use_user_agent == "MyTestAgent/1.0"
145 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Wayback Machine Archiver
2 |
3 | Wayback Machine Archiver (Archiver for short) is a command-line utility
4 | written in Python to back up web pages using the [Internet Archive][ia].
5 |
6 | [ia]: https://archive.org/
7 |
8 | ## Installation
9 |
10 | The best way to install Archiver is with `pip`:
11 |
12 | ```bash
13 | pip install wayback-machine-archiver
14 | ```
15 |
16 | This will give you access to the script simply by calling:
17 |
18 | ```bash
19 | archiver --help
20 | ```
21 |
22 | You can also install it directly from a local clone of this repository:
23 |
24 | ```bash
25 | git clone https://github.com/agude/wayback-machine-archiver.git
26 | cd wayback-machine-archiver
27 | pip install .
28 | ```
29 |
30 | All dependencies are handled automatically. Archiver supports Python 3.8+.
31 |
32 | ## Usage
33 |
34 | The archiver is simple to use from the command line.
35 |
36 | ### Command-Line Examples
37 |
38 | **Archive a single page:**
39 | ```bash
40 | archiver https://alexgude.com
41 | ```
42 |
43 | **Archive all pages from a sitemap:**
44 | ```bash
45 | archiver --sitemaps https://alexgude.com/sitemap.xml
46 | ```
47 |
48 | **Archive from a local sitemap file:**
49 | (Note the `file://` prefix is required)
50 | ```bash
51 | archiver --sitemaps file://sitemap.xml
52 | ```
53 |
54 | **Archive from a text file of URLs:**
55 | (The file should contain one URL per line)
56 | ```bash
57 | archiver --file urls.txt
58 | ```
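
For reference, a `urls.txt` file is just plain text with one URL per line; a minimal (hypothetical) example:
```
https://alexgude.com/blog/double-checking-538/
https://charles.uno/
```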
59 |
60 | **Combine multiple sources:**
61 | ```bash
62 | archiver https://radiokeysmusic.com --sitemaps https://charles.uno/sitemap.xml
63 | ```
64 |
65 | **Use advanced API options:**
66 | (Capture a screenshot and skip if archived in the last 10 days)
67 | ```bash
68 | archiver https://alexgude.com --capture-screenshot --if-not-archived-within 10d
69 | ```
70 |
71 | **Archive the sitemap URL itself:**
72 | ```bash
73 | archiver --sitemaps https://alexgude.com/sitemap.xml --archive-sitemap-also
74 | ```
75 |
76 | ## Authentication (Required)
77 |
78 | As of version 3.0.0, this tool requires authentication with the Internet
79 | Archive's SPN2 API. This change was made to ensure all archiving jobs are
80 | reliable and their final success or failure status can be confirmed. The
81 | previous, less reliable method for unauthenticated users has been removed.
82 |
83 | If you run the script without credentials, it will exit with an error message.
84 |
85 | **To set up authentication:**
86 |
87 | 1. Get your S3-style API keys from your Internet Archive account settings:
88 | [https://archive.org/account/s3.php](https://archive.org/account/s3.php)
89 |
90 | 2. Create a `.env` file in the directory where you run the `archiver`
91 | command. Add your keys to it:
92 | ```
93 | INTERNET_ARCHIVE_ACCESS_KEY="YOUR_ACCESS_KEY_HERE"
94 | INTERNET_ARCHIVE_SECRET_KEY="YOUR_SECRET_KEY_HERE"
95 | ```
96 |
97 | The script will automatically detect this file (or the equivalent environment
98 | variables) and use the authenticated API.
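
If you prefer not to use a `.env` file, exporting the same variables in your shell should work equally well (the values below are placeholders):
```bash
export INTERNET_ARCHIVE_ACCESS_KEY="YOUR_ACCESS_KEY_HERE"
export INTERNET_ARCHIVE_SECRET_KEY="YOUR_SECRET_KEY_HERE"
archiver https://alexgude.com
```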
99 |
100 | ## Help
101 |
102 | For a full list of command-line flags, Archiver has built-in help displayed
103 | with `archiver --help`:
104 |
105 | ```
106 | usage: archiver [-h] [--version] [--file FILE]
107 | [--sitemaps SITEMAPS [SITEMAPS ...]]
108 | [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
109 | [--log-to-file LOG_FILE]
110 | [--archive-sitemap-also]
111 | [--rate-limit-wait RATE_LIMIT_IN_SEC]
112 | [--random-order] [--capture-all]
113 | [--capture-outlinks] [--capture-screenshot]
114 | [--delay-wb-availability] [--force-get]
115 | [--skip-first-archive] [--email-result]
116 | [--if-not-archived-within ]
117 | [--js-behavior-timeout ]
118 | [--capture-cookie ]
119 | [--user-agent ]
120 | [urls ...]
121 |
122 | A script to back up web pages with the Internet Archive
123 |
124 | positional arguments:
125 | urls Specifies the URLs of the pages to archive.
126 |
127 | options:
128 | -h, --help show this help message and exit
129 | --version show program's version number and exit
130 | --file FILE Specifies the path to a file containing URLs to save,
131 | one per line.
132 | --sitemaps SITEMAPS [SITEMAPS ...]
133 | Specifies one or more URIs to sitemaps listing pages
134 | to archive. Local paths must be prefixed with
135 | 'file://'.
136 | --log {DEBUG,INFO,WARNING,ERROR,CRITICAL}
137 | Sets the logging level. Defaults to WARNING
138 | (case-insensitive).
139 | --log-to-file LOG_FILE
140 | Redirects logs to a specified file instead of the
141 | console.
142 | --archive-sitemap-also
143 | Submits the URL of the sitemap itself to be archived.
144 | --rate-limit-wait RATE_LIMIT_IN_SEC
145 | Specifies the number of seconds to wait between
146 | submissions. A minimum of 5 seconds is enforced for
147 | authenticated users. Defaults to 15.
148 | --random-order Randomizes the order of pages before archiving.
149 |
150 | SPN2 API Options:
151 | Control the behavior of the Internet Archive capture API.
152 |
153 | --capture-all Captures a web page even if it returns an error (e.g.,
154 | 404, 500).
155 | --capture-outlinks Captures web page outlinks automatically. Note: this
156 | can significantly increase the total number of
157 | captures and runtime.
158 | --capture-screenshot Captures a full page screenshot.
159 | --delay-wb-availability
160 | Reduces load on Internet Archive systems by making the
161 | capture publicly available after ~12 hours instead of
162 | immediately.
163 | --force-get Bypasses the headless browser check, which can speed
164 | up captures for non-HTML content (e.g., PDFs, images).
165 | --skip-first-archive Speeds up captures by skipping the check for whether
166 | this is the first time a URL has been archived.
167 | --email-result Sends an email report of the captured URLs to the
168 | user's registered email.
169 | --if-not-archived-within
170 | Captures only if the latest capture is older than the
171 | given period (e.g., '3d 5h').
172 | --js-behavior-timeout
173 | Runs JS code for the given number of seconds after page
174 | load to trigger dynamic content. Defaults to 5, max is
175 | 30. Use 0 to disable for static pages.
176 | --capture-cookie
177 | Uses an extra HTTP Cookie value when capturing the
178 | target page.
179 | --user-agent
180 | Uses a custom HTTP User-Agent value when capturing the
181 | target page.
182 | ```
183 |
184 | ## Setting Up a `Sitemap.xml` for GitHub Pages
185 |
186 | It is easy to automatically generate a sitemap for a GitHub Pages Jekyll site.
187 | Simply use [jekyll/jekyll-sitemap][jsm].
188 |
189 | Setup instructions can be found on the above site; they require changing just
190 | a single line of your site's `_config.yml`.
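
For example, on a recent Jekyll version the relevant `_config.yml` entry is typically just the following (check the jekyll-sitemap README for the exact setup for your version):
```yaml
plugins:
  - jekyll-sitemap
```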
191 |
192 | [jsm]: https://github.com/jekyll/jekyll-sitemap
193 |
--------------------------------------------------------------------------------
/tests/test_spn2_workflow.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from unittest import mock
3 | import pytest
4 | import time
5 | from wayback_machine_archiver.workflow import (
6 | _submit_next_url,
7 | _poll_pending_jobs,
8 | run_archive_workflow,
9 | PERMANENT_ERROR_MESSAGES,
10 | TRANSIENT_ERROR_MESSAGES,
11 | )
12 |
13 | # --- Tests for _submit_next_url ---
14 |
15 |
16 | def test_submit_next_url_success():
17 | """
18 | Verify that a successful submission adds the job_id to the pending_jobs
19 | dictionary, consumes the URL, and clears the attempts tracker for that URL.
20 | """
21 | mock_client = mock.Mock()
22 | mock_client.submit_capture.return_value = "job-123"
23 |
24 | urls_to_process = ["http://example.com"]
25 | pending_jobs = {}
26 | # Simulate a previous failure to ensure the tracker is cleared on success
27 | submission_attempts = {"http://example.com": 1}
28 |
29 | _submit_next_url(
30 | urls_to_process,
31 | mock_client,
32 | pending_jobs,
33 | 5,
34 | submission_attempts,
35 | api_params={},
36 | )
37 |
38 | # Assertions
39 | mock_client.submit_capture.assert_called_once_with(
40 | "http://example.com", rate_limit_wait=5, api_params={}
41 | )
42 | # --- Check the new data structure ---
43 | assert "job-123" in pending_jobs
44 | assert pending_jobs["job-123"]["url"] == "http://example.com"
45 | assert "submitted_at" in pending_jobs["job-123"]
46 | assert not urls_to_process, "URL should have been consumed from the list"
47 | assert "http://example.com" not in submission_attempts, (
48 | "Attempts tracker should be cleared on success"
49 | )
50 |
51 |
52 | def test_submit_next_url_failure_requeues_and_tracks_attempt():
53 | """
54 | Verify that a failed submission re-queues the URL at the end of the list
55 | and increments its attempt count.
56 | """
57 | mock_client = mock.Mock()
58 | mock_client.submit_capture.side_effect = Exception("API Error")
59 |
60 | urls_to_process = ["http://a.com", "http://b.com"]
61 | pending_jobs = {}
62 | submission_attempts = {}
63 |
64 | _submit_next_url(
65 | urls_to_process,
66 | mock_client,
67 | pending_jobs,
68 | 5,
69 | submission_attempts,
70 | api_params={},
71 | )
72 |
73 | # Assertions
74 | assert not pending_jobs, "No job should have been added on failure"
75 | assert urls_to_process == ["http://b.com", "http://a.com"], (
76 | "Failed URL should be at the end of the list"
77 | )
78 | assert submission_attempts == {"http://a.com": 1}, (
79 | "Attempt count should be incremented"
80 | )
81 |
82 |
83 | def test_submit_next_url_gives_up_after_max_retries():
84 | """
85 | Verify that if a URL has reached its max retry count, it is not
86 | re-queued and the submission is not attempted.
87 | """
88 | mock_client = mock.Mock()
89 |
90 | urls_to_process = ["http://will-fail.com"]
91 | pending_jobs = {}
92 | # Simulate that the URL has already failed 3 times
93 | submission_attempts = {"http://will-fail.com": 3}
94 |
95 | _submit_next_url(
96 | urls_to_process,
97 | mock_client,
98 | pending_jobs,
99 | 5,
100 | submission_attempts,
101 | api_params={},
102 | max_retries=3,
103 | )
104 |
105 | # Assertions
106 | mock_client.submit_capture.assert_not_called()
107 | assert not pending_jobs
108 | assert not urls_to_process, "URL should be consumed but not re-queued"
109 | assert submission_attempts == {"http://will-fail.com": 4}, (
110 | "Attempt count is still updated"
111 | )
112 |
113 |
114 | def test_submit_next_url_passes_api_params_to_client():
115 | """
116 | Verify that the api_params dictionary is correctly passed to the client's
117 | submit_capture method.
118 | """
119 | mock_client = mock.Mock()
120 | mock_client.submit_capture.return_value = "job-123"
121 | urls_to_process = ["http://example.com"]
122 | pending_jobs = {}
123 | submission_attempts = {}
124 | api_params = {"capture_screenshot": "1", "force_get": "1"}
125 |
126 | _submit_next_url(
127 | urls_to_process,
128 | mock_client,
129 | pending_jobs,
130 | 0,
131 | submission_attempts,
132 | api_params,
133 | )
134 |
135 | mock_client.submit_capture.assert_called_once_with(
136 | "http://example.com", rate_limit_wait=0, api_params=api_params
137 | )
138 |
139 |
140 | # --- Tests for _poll_pending_jobs ---
141 |
142 |
143 | @mock.patch("wayback_machine_archiver.workflow.time.sleep")
144 | def test_poll_uses_batch_and_removes_completed_jobs(mock_sleep):
145 | """
146 | Verify that jobs with 'success' or 'error' status are removed from the
147 | pending list via the batch endpoint, while 'pending' jobs remain.
148 | """
149 | mock_client = mock.Mock()
150 | # Define the return value for the single batch request
151 | mock_client.check_status_batch.return_value = [
152 | {"status": "success", "job_id": "job-success", "timestamp": "20250101"},
153 | {"status": "error", "job_id": "job-error", "message": "Too many redirects."},
154 | {"status": "pending", "job_id": "job-pending"},
155 | ]
156 |
157 | # --- Use the new data structure for pending_jobs ---
158 | now = time.time()
159 | pending_jobs = {
160 | "job-success": {"url": "http://a.com", "submitted_at": now},
161 | "job-error": {"url": "http://b.com", "submitted_at": now},
162 | "job-pending": {"url": "http://c.com", "submitted_at": now},
163 | }
164 |
165 | # --- Provide the new required arguments ---
166 | successful, failed, requeued = _poll_pending_jobs(
167 | mock_client,
168 | pending_jobs,
169 | transient_error_retries={},
170 | max_transient_retries=3,
171 | job_timeout_sec=7200,
172 | )
173 |
174 | # Assertions
175 | mock_client.check_status_batch.assert_called_once_with(
176 | ["job-success", "job-error", "job-pending"]
177 | )
178 | # --- Check the new data structure in the assertion ---
179 | assert list(pending_jobs.keys()) == ["job-pending"]
180 | assert pending_jobs["job-pending"]["url"] == "http://c.com"
181 | assert successful == ["http://a.com"]
182 | assert failed == ["http://b.com"]
183 | assert requeued == []
184 | mock_sleep.assert_called_once()
185 |
186 |
187 | @pytest.mark.parametrize(
188 | "status_ext, api_message, expected_outcome, expected_log_level, expected_log_snippet",
189 | [
190 | (
191 | "error:service-unavailable",
192 | "Service is down",
193 | "requeue",
194 | logging.WARNING,
195 | TRANSIENT_ERROR_MESSAGES["error:service-unavailable"],
196 | ),
197 | (
198 | "error:not-found",
199 | "Page not found",
200 | "fail",
201 | logging.ERROR,
202 | PERMANENT_ERROR_MESSAGES["error:not-found"],
203 | ),
204 | (
205 | "error:some-new-unseen-error",
206 | "A new error",
207 | "fail",
208 | logging.ERROR,
209 | "An unrecoverable error occurred.",
210 | ),
211 | # --- NEW TEST CASE ---
212 | # This simulates the bug: the status_ext is a generic failure, but the
213 | # message contains "RecursionError", which should trigger a requeue.
214 | (
215 | "error:job-failed",
216 | "encoding with 'idna' codec failed (RecursionError: maximum recursion depth exceeded)",
217 | "requeue",
218 | logging.WARNING,
219 | TRANSIENT_ERROR_MESSAGES["error:recursion-error"],
220 | ),
221 | ],
222 | )
223 | @mock.patch("wayback_machine_archiver.workflow.time.sleep")
224 | def test_poll_pending_jobs_handles_errors_intelligently(
225 | mock_sleep,
226 | caplog,
227 | status_ext,
228 | api_message,
229 | expected_outcome,
230 | expected_log_level,
231 | expected_log_snippet,
232 | ):
233 | """
234 | Verify that _poll_pending_jobs correctly categorizes errors as either
235 | transient (re-queue) or permanent (fail) and logs helpful messages.
236 | """
237 | mock_client = mock.Mock()
238 | mock_client.check_status_batch.return_value = [
239 | {
240 | "status": "error",
241 | "job_id": "job-1",
242 | "status_ext": status_ext,
243 | "message": api_message,
244 | }
245 | ]
246 |     # pending_jobs maps each job_id to a dict holding the URL and submission time.
247 | pending_jobs = {"job-1": {"url": "http://example.com", "submitted_at": time.time()}}
248 |
249 | with caplog.at_level(logging.WARNING):
250 |         # _poll_pending_jobs also needs retry tracking, a retry limit, and a job timeout.
251 | successful, failed, requeued = _poll_pending_jobs(
252 | mock_client,
253 | pending_jobs,
254 | transient_error_retries={},
255 | max_transient_retries=3,
256 | job_timeout_sec=7200,
257 | )
258 |
259 | assert not successful
260 | if expected_outcome == "requeue":
261 | assert requeued == ["http://example.com"]
262 | assert not failed
263 | else: # fail
264 | assert not requeued
265 | assert failed == ["http://example.com"]
266 |
267 | assert len(caplog.records) == 1
268 | log_record = caplog.records[0]
269 | assert log_record.levelno == expected_log_level
270 | assert expected_log_snippet in log_record.message
271 |
272 |
273 | # --- Test for run_archive_workflow dynamic polling ---
274 |
275 |
276 | @mock.patch("wayback_machine_archiver.workflow.time.sleep")
277 | @mock.patch("wayback_machine_archiver.workflow._poll_pending_jobs")
278 | @mock.patch("wayback_machine_archiver.workflow._submit_next_url")
279 | def test_run_archive_workflow_dynamic_polling_is_fast_and_correct(
280 | mock_submit, mock_poll, mock_sleep
281 | ):
282 | """
283 | Verify that the polling wait time increases exponentially when jobs are pending
284 | and the submission queue is empty, and that the test runs quickly.
285 | """
286 | mock_client = mock.Mock()
287 | initial_urls = ["http://a.com"]
288 | # Use a mutable list for the test to simulate its modification by _submit_next_url
289 | urls_to_process_list = list(initial_urls)
290 | rate_limit_in_sec = 0
291 | api_params = {}
292 |
293 | # Configure mock_submit to simulate a successful submission
294 | # It needs to modify the urls_to_process_list and pending_jobs_dict passed to it
295 | def submit_side_effect(urls_proc, client_arg, pending_jobs_dict, *args, **kwargs):
296 | url = urls_proc.pop(0) # Remove the URL from the list
297 | job_id = f"job-{url}"
298 |         # Record the job using the {url, submitted_at} entry format.
299 | pending_jobs_dict[job_id] = {"url": url, "submitted_at": time.time()}
300 | return job_id
301 |
302 | mock_submit.side_effect = submit_side_effect
303 |
304 | # Configure mock_poll to simulate jobs staying pending, then succeeding
305 | poll_calls = 0
306 |
307 | def poll_side_effect(client_arg, pending_jobs_dict, *args, **kwargs):
308 | nonlocal poll_calls
309 | poll_calls += 1
310 | if poll_calls <= 3: # Simulate pending for 3 calls
311 | return [], [], [] # No success, no failure, no requeue
312 | else: # Simulate success on the 4th call
313 |             # Extract the URLs from the pending-job entries.
314 | successful_urls = [job["url"] for job in pending_jobs_dict.values()]
315 | pending_jobs_dict.clear()
316 | return successful_urls, [], []
317 |
318 | mock_poll.side_effect = poll_side_effect
319 |
320 | # Call the main workflow function
321 | run_archive_workflow(
322 | mock_client, urls_to_process_list, rate_limit_in_sec, api_params
323 | )
324 |
325 | # Assertions
326 |     # Check the calls to time.sleep.
327 |     # We expect a sleep at the end of every cycle in which jobs remain pending
328 |     # and the submission queue is empty.
329 |     # Cycle 1: Submits the URL. Polls (still pending). Sleeps for 5s.
330 |     # Cycle 2: No URLs to submit. Polls (still pending). Sleeps for 7s (int(5 * 1.5)).
331 |     # Cycle 3: No URLs to submit. Polls (still pending). Sleeps for 10s (int(7 * 1.5)).
332 |     # Cycle 4: No URLs to submit. Polls (job succeeds). Loop terminates.
333 |     # Filter to waits longer than one second so that any sub-second sleeps
334 |     # (such as the 0.2s pause inside _poll_pending_jobs) are ignored.
335 | sleep_calls = [call[0][0] for call in mock_sleep.call_args_list if call[0][0] > 1]
336 |
337 | assert sleep_calls == [5, 7, 10]
338 | assert mock_submit.call_count == 1
339 |     # The poll side effect runs 4 times before reporting success
340 | assert mock_poll.call_count == 4
341 | assert not urls_to_process_list # Ensure the initial URL list is empty
342 |
343 |
344 | def test_poll_gives_up_after_max_transient_retries(caplog):
345 | """
346 | Verify that if a URL fails with a transient error more times than allowed,
347 | it is marked as a permanent failure and not re-queued.
348 | """
349 | mock_client = mock.Mock()
350 | mock_client.check_status_batch.return_value = [
351 | {
352 | "status": "error",
353 | "job_id": "job-1",
354 | "status_ext": "error:service-unavailable", # A transient error
355 | "message": "API message",
356 | }
357 | ]
358 |
359 | url = "http://example.com"
360 | max_retries = 3
361 |
362 | # Simulate that this URL has already failed 3 times with a transient error
363 | transient_error_retries = {url: 3}
364 | pending_jobs = {"job-1": {"url": url, "submitted_at": time.time()}}
365 |
366 | with caplog.at_level(logging.INFO):
367 | successful, failed, requeued = _poll_pending_jobs(
368 | mock_client,
369 | pending_jobs,
370 | transient_error_retries,
371 | max_transient_retries=max_retries,
372 | job_timeout_sec=7200,
373 | )
374 |
375 | # Assertions
376 | assert not successful
377 | assert not requeued, "URL should not have been re-queued"
378 | assert failed == [url], "URL should have been marked as failed"
379 | assert "Marking as a permanent failure" in caplog.text
380 |
381 |
382 | def test_poll_fails_job_after_timeout(caplog):
383 | """
384 | Verify that a job that remains in a 'pending' state for longer than the
385 | timeout period is marked as a failure.
386 | """
387 | mock_client = mock.Mock()
388 | mock_client.check_status_batch.return_value = [
389 | {"status": "pending", "job_id": "job-stuck"}
390 | ]
391 |
392 | url = "http://stuck.com"
393 | timeout_sec = 3600 # 1 hour
394 |
395 | # Simulate a job that was submitted long ago, well before the timeout
396 | stale_timestamp = time.time() - (timeout_sec + 60)
397 | pending_jobs = {"job-stuck": {"url": url, "submitted_at": stale_timestamp}}
398 |
399 | with caplog.at_level(logging.INFO):
400 | successful, failed, requeued = _poll_pending_jobs(
401 | mock_client,
402 | pending_jobs,
403 | transient_error_retries={},
404 | max_transient_retries=3,
405 | job_timeout_sec=timeout_sec,
406 | )
407 |
408 | # Assertions
409 | assert not successful
410 | assert not requeued
411 | assert failed == [url], "Stuck job should have been marked as failed"
412 | assert "timed out after being pending" in caplog.text
413 |
--------------------------------------------------------------------------------
/src/wayback_machine_archiver/workflow.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 |
4 | # A set of transient errors that suggest a retry might be successful.
5 | REQUEUE_ERRORS = {
6 | "error:bad-gateway",
7 | "error:bandwidth-limit-exceeded",
8 | "error:browsing-timeout",
9 | "error:cannot-fetch",
10 | "error:capture-location-error",
11 | "error:celery",
12 | "error:gateway-timeout",
13 | "error:internal-server-error",
14 | "error:invalid-server-response",
15 | "error:job-failed",
16 | "error:no-browsers-available",
17 | "error:protocol-error",
18 | "error:proxy-error",
19 | "error:read-timeout",
20 | "error:recursion-error",
21 | "error:service-unavailable",
22 | "error:soft-time-limit-exceeded",
23 | "error:too-many-requests",
24 | "error:user-session-limit",
25 | }
26 |
27 | # A map of transient error codes to user-friendly, explanatory messages.
28 | TRANSIENT_ERROR_MESSAGES = {
29 | "error:bad-gateway": "The server reported a temporary upstream issue (Bad Gateway).",
30 | "error:bandwidth-limit-exceeded": "The target server has exceeded its bandwidth limit.",
31 | "error:browsing-timeout": "The headless browser timed out, possibly due to high server load.",
32 | "error:cannot-fetch": "The Internet Archive's systems are temporarily overloaded.",
33 | "error:capture-location-error": "An internal Internet Archive system error occurred.",
34 | "error:celery": "An error occurred in the Internet Archive's internal job queue.",
35 | "error:gateway-timeout": "The server reported a temporary upstream timeout (Gateway Timeout).",
36 | "error:internal-server-error": "The Internet Archive's server reported a temporary internal error.",
37 | "error:invalid-server-response": "The target server sent a malformed response, possibly due to a network glitch.",
38 | "error:job-failed": "The capture failed due to a generic Internet Archive system error.",
39 | "error:no-browsers-available": "The Internet Archive's capture browsers are temporarily at capacity.",
40 | "error:protocol-error": "The HTTP connection was broken, likely due to a network issue.",
41 | "error:proxy-error": "An internal Internet Archive proxy error occurred.",
42 | "error:read-timeout": "The connection timed out while reading data from the server.",
43 | "error:recursion-error": "The server encountered a temporary processing error (RecursionError).",
44 | "error:service-unavailable": "The Internet Archive's service is temporarily unavailable.",
45 | "error:soft-time-limit-exceeded": "The capture took too long and was terminated; a retry may succeed.",
46 | "error:too-many-requests": "The target server is rate-limiting requests.",
47 | "error:user-session-limit": "Your Internet Archive account has reached its concurrent job limit.",
48 | }
49 |
50 | # A map of permanent error codes to user-friendly, explanatory messages.
51 | PERMANENT_ERROR_MESSAGES = {
52 | "error:bad-request": "The API reported a bad request. This may be a bug in the archiver script.",
53 | "error:blocked": "The target site is actively blocking the Internet Archive's requests. To save the block page, use the --capture-all flag.",
54 | "error:blocked-client-ip": "Your IP address is on a blocklist (e.g., Spamhaus), and the Internet Archive is refusing the request.",
55 | "error:blocked-url": "This URL is on a blocklist (e.g., a tracking domain) and cannot be archived.",
56 | "error:filesize-limit": "The file at this URL is larger than the 2GB limit and cannot be archived.",
57 | "error:ftp-access-denied": "Access to the FTP resource was denied due to a permissions issue.",
58 | "error:http-version-not-supported": "The target server uses an unsupported HTTP version.",
59 | "error:invalid-host-resolution": "The domain name could not be found. Check for typos in the URL.",
60 | "error:invalid-url-syntax": "The URL is malformed. Please check its structure.",
61 | "error:method-not-allowed": "The server forbids the HTTP method used for archiving. To save this error page, use the --capture-all flag.",
62 | "error:network-authentication-required": "A captive portal or proxy is requiring authentication. To save the login page, use the --capture-all flag.",
63 | "error:no-access": "The page is forbidden (403 Forbidden). To save this error page, use the --capture-all flag.",
64 | "error:not-found": "The page could not be found (404 Not Found). To save this error page, use the --capture-all flag.",
65 | "error:not-implemented": "The server does not support the functionality required to archive the page.",
66 | "error:too-many-daily-captures": "This URL has already been captured the maximum number of times today.",
67 | "error:too-many-redirects": "The URL has too many redirects, likely indicating a redirect loop.",
68 | "error:unauthorized": "The page requires a login (401 Unauthorized). To save the login/error page, use the --capture-all flag.",
69 | }
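# Illustrative triage of a status_ext code by _poll_pending_jobs below:
#   "error:service-unavailable"  -> listed in REQUEUE_ERRORS, so the URL is re-queued,
#                                   up to max_transient_retries attempts.
#   "error:not-found"            -> permanent; logged with its message above and failed.
#   "error:unknown-code" (hypothetical) -> in neither map; treated as permanent and
#                                   logged as "An unrecoverable error occurred."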
70 |
71 |
72 | def _submit_next_url(
73 | urls_to_process,
74 | client,
75 | pending_jobs,
76 | rate_limit_in_sec,
77 | submission_attempts,
78 | api_params,
79 | max_retries=3,
80 | ):
81 | """
82 | Pops the next URL, submits it, and adds its job_id to pending_jobs.
83 | Returns 'failed' on a definitive failure, otherwise None.
84 | """
85 | url = urls_to_process.pop(0)
86 | attempt_num = submission_attempts.get(url, 0) + 1
87 | submission_attempts[url] = attempt_num
88 |
89 | if attempt_num > max_retries:
90 | logging.error("URL %s failed submission %d times, giving up.", url, max_retries)
91 | return "failed"
92 |
93 | try:
94 | logging.info("Submitting %s (attempt %d/%d)...", url, attempt_num, max_retries)
95 | job_id = client.submit_capture(
96 | url, rate_limit_wait=rate_limit_in_sec, api_params=api_params
97 | )
98 |
99 | if not job_id:
100 | # The API accepted the request but didn't provide a job_id.
101 | # This is treated as a transient error to trigger a retry.
102 | raise ValueError(
103 | "API did not return a job_id, likely due to rate limiting."
104 | )
105 |
106 |         # Record the job along with its URL and submission timestamp.
107 | pending_jobs[job_id] = {"url": url, "submitted_at": time.time()}
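        # Illustrative entry (the job id shown is a placeholder, not a real SPN2 id):
        #   pending_jobs["job-abc123"] == {"url": "https://example.com",
        #                                  "submitted_at": 1700000000.0}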
108 | if url in submission_attempts:
109 | del submission_attempts[url]
110 |
111 |     except ValueError:
112 | # This block specifically catches the "no job_id" case.
113 | logging.warning(
114 | "Submission for %s was accepted but no job_id was returned. This can happen under high load or due to rate limits. Re-queuing for another attempt.",
115 | url,
116 | )
117 | urls_to_process.append(url)
118 |
119 | except Exception as e:
120 | # This block now catches all OTHER submission errors (e.g., network).
121 | logging.warning(
122 | "Failed to submit URL %s due to a connection or API error: %s. Re-queuing for another attempt.",
123 | url,
124 | e,
125 | )
126 | urls_to_process.append(url)
127 |
128 | return None
129 |
130 |
131 | def _poll_pending_jobs(
132 | client,
133 | pending_jobs,
134 | transient_error_retries,
135 | max_transient_retries,
136 | job_timeout_sec,
137 | poll_interval_sec=0.2,
138 | ):
139 | """
140 | Checks the status of all pending jobs using a single batch request.
141 | Returns a tuple of (successful_urls, failed_urls, requeued_urls) for completed jobs.
142 | """
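    # Illustrative return value when one job succeeded, one failed permanently,
    # and one hit a transient error (URLs are placeholders):
    #   (["https://a.example"], ["https://b.example"], ["https://c.example"])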
143 | successful_urls = []
144 | failed_urls = []
145 | requeued_urls = []
146 |
147 | # Get all job IDs that need to be checked.
148 | job_ids_to_check = list(pending_jobs.keys())
149 | if not job_ids_to_check:
150 | return [], [], []
151 |
152 | try:
153 | # Make a single batch request for all pending jobs.
154 | # The API is expected to return a list of status objects.
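        # Example shape, assumed from the fields read below and the test fixtures
        # (real responses may carry additional fields):
        #   [{"status": "success", "job_id": "job-1", "timestamp": "20250101"},
        #    {"status": "error", "job_id": "job-2",
        #     "status_ext": "error:not-found", "message": "..."},
        #    {"status": "pending", "job_id": "job-3"}]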
155 | batch_statuses = client.check_status_batch(job_ids_to_check)
156 |
157 | # It's possible the API returns a single object if only one job was queried.
158 | if not isinstance(batch_statuses, list):
159 | batch_statuses = [batch_statuses]
160 |
161 | for status_data in batch_statuses:
162 | job_id = status_data.get("job_id")
163 | if not job_id or job_id not in pending_jobs:
164 | continue
165 |
166 |             # Look up the original URL from the pending-job entry.
167 | original_url = pending_jobs[job_id]["url"]
168 | status = status_data.get("status")
169 |
170 | if status == "success":
171 | timestamp = status_data.get("timestamp")
172 | archive_url = f"https://web.archive.org/web/{timestamp}/{original_url}"
173 | logging.info("Success for job %s: %s", job_id, archive_url)
174 | del pending_jobs[job_id]
175 | successful_urls.append(original_url)
176 | elif status == "error":
177 | status_ext = status_data.get("status_ext")
178 | api_message = status_data.get("message", "Unknown error")
179 |
180 | # The API can return a generic error code for what is actually a transient
181 | # server-side processing error. We check the message for this specific case.
182 | if "RecursionError" in api_message:
183 | status_ext = "error:recursion-error"
184 |
185 | if status_ext in REQUEUE_ERRORS:
186 | # --- Check if this URL has exceeded its transient retry limit ---
187 | retry_count = transient_error_retries.get(original_url, 0) + 1
188 | transient_error_retries[original_url] = retry_count
189 |
190 | if retry_count > max_transient_retries:
191 | logging.error(
192 | "URL %s failed with a transient error %d times. Marking as a permanent failure. (API code: %s)",
193 | original_url,
194 | max_transient_retries,
195 | status_ext,
196 | )
197 | del pending_jobs[job_id]
198 | failed_urls.append(original_url)
199 | else:
200 |                         # Re-queue the URL for another submission attempt.
201 | helpful_message = TRANSIENT_ERROR_MESSAGES.get(
202 | status_ext, "A transient error occurred."
203 | )
204 | logging.warning(
205 | "Transient error for %s: %s Re-queuing for another attempt (%d/%d). (API code: %s)",
206 | original_url,
207 | helpful_message,
208 | retry_count,
209 | max_transient_retries,
210 | status_ext,
211 | )
212 | del pending_jobs[job_id]
213 | requeued_urls.append(original_url)
214 | else:
215 | # Look up the helpful message, with a fallback for unknown permanent errors.
216 | helpful_message = PERMANENT_ERROR_MESSAGES.get(
217 | status_ext, "An unrecoverable error occurred."
218 | )
219 | logging.error(
220 | "Permanent error for %s: %s (API message: %s)",
221 | original_url,
222 | helpful_message,
223 | api_message,
224 | )
225 | del pending_jobs[job_id]
226 | failed_urls.append(original_url)
227 | else:
228 | # --- Check for job timeout if status is pending ---
229 | submitted_at = pending_jobs[job_id]["submitted_at"]
230 | job_age = time.time() - submitted_at
231 | if job_age > job_timeout_sec:
232 | logging.error(
233 | "Job for %s timed out after being pending for over %d seconds. Marking as failed.",
234 | original_url,
235 | job_timeout_sec,
236 | )
237 | del pending_jobs[job_id]
238 | failed_urls.append(original_url)
239 | else:
240 | logging.debug(
241 | "Job %s (%s) is still pending...", job_id, original_url
242 | )
243 |
244 | except Exception as e:
245 | logging.error(
246 | "An exception occurred during batch polling: %s. Clearing all pending jobs for this cycle to prevent loops.",
247 | e,
248 | )
249 |         # Treat every still-pending URL as failed for this polling run.
250 | failed_urls.extend([job["url"] for job in pending_jobs.values()])
251 | pending_jobs.clear()
252 |
253 | # A short sleep after each batch poll to be nice to the API.
254 | time.sleep(poll_interval_sec)
255 |
256 | return successful_urls, failed_urls, requeued_urls
257 |
258 |
259 | def run_archive_workflow(client, urls_to_process, rate_limit_in_sec, api_params):
260 | """Manages the main loop for submitting and polling URLs."""
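    # Illustrative call (the client only needs to expose submit_capture and
    # check_status_batch; callers elsewhere in the package supply a real client):
    #   run_archive_workflow(client, ["https://example.com"], rate_limit_in_sec=5,
    #                        api_params={})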
261 | pending_jobs = {}
262 | submission_attempts = {}
263 | # --- Dictionary to track retries for transient polling errors ---
264 | transient_error_retries = {}
265 | MAX_TRANSIENT_RETRIES = 3
266 | # --- Timeout for jobs stuck in pending state ---
267 | JOB_TIMEOUT_SEC = 7200 # 2 hours
268 |
269 | total_urls = len(urls_to_process)
270 | success_count = 0
271 | failure_count = 0
272 |
273 | # --- Variables for dynamic polling ---
274 | INITIAL_POLLING_WAIT = 5
275 | MAX_POLLING_WAIT = 60
276 | POLLING_BACKOFF_FACTOR = 1.5
277 | polling_wait_time = INITIAL_POLLING_WAIT
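    # With these settings, the successive waits while jobs stay pending and the
    # submission queue is empty are: 5, 7, 10, 15, 22, 33, 49, 60, 60, ... seconds,
    # since each wait is int(previous * 1.5), capped at MAX_POLLING_WAIT.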
278 |
279 | logging.info(
280 | "Beginning interleaved submission and polling of %d URLs...",
281 | total_urls,
282 | )
283 |
284 | while urls_to_process or pending_jobs:
285 | if urls_to_process:
286 | status = _submit_next_url(
287 | urls_to_process,
288 | client,
289 | pending_jobs,
290 | rate_limit_in_sec,
291 | submission_attempts,
292 | api_params,
293 | )
294 | if status == "failed":
295 | failure_count += 1
296 | # Reset polling wait time after a new submission
297 | polling_wait_time = INITIAL_POLLING_WAIT
298 |
299 | if pending_jobs:
300 |             # Poll all pending jobs in a single batch and triage the results.
301 | successful, failed, requeued = _poll_pending_jobs(
302 | client,
303 | pending_jobs,
304 | transient_error_retries,
305 | MAX_TRANSIENT_RETRIES,
306 | JOB_TIMEOUT_SEC,
307 | )
308 | success_count += len(successful)
309 | failure_count += len(failed)
310 | if requeued:
311 | urls_to_process.extend(requeued)
312 | logging.info(
313 | "Re-queued %d URLs due to transient API errors.", len(requeued)
314 | )
315 |
316 | if not urls_to_process and pending_jobs:
317 | logging.info(
318 | "%d captures remaining, starting next polling cycle in %d seconds...",
319 | len(pending_jobs),
320 | polling_wait_time,
321 | )
322 | time.sleep(polling_wait_time)
323 | # Increase wait time for the next cycle
324 | polling_wait_time = min(
325 | int(polling_wait_time * POLLING_BACKOFF_FACTOR), MAX_POLLING_WAIT
326 | )
327 |
328 | logging.info("--------------------------------------------------")
329 | logging.info("Archive workflow complete.")
330 |     logging.info("Total URLs processed: %d", total_urls)
331 |     logging.info("Successful captures: %d", success_count)
332 |     logging.info("Failed captures: %d", failure_count)
333 | logging.info("--------------------------------------------------")
334 |
--------------------------------------------------------------------------------