├── src └── wayback_machine_archiver │ ├── __init__.py │ ├── clients.py │ ├── sitemaps.py │ ├── cli.py │ ├── archiver.py │ └── workflow.py ├── tests ├── test_sitemap_is_local.py ├── test_get_namespace.py ├── test_download_remote_sitemap.py ├── test_load_local_sitemap.py ├── test_extract_pages_from_sitemap.py ├── test_main_logic.py ├── test_spn2_client.py ├── test_cli.py └── test_spn2_workflow.py ├── .github └── workflows │ ├── tests.yml │ └── release.yml ├── LICENSE ├── pyproject.toml ├── .gitignore └── README.md /src/wayback_machine_archiver/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "3.3.1" 2 | -------------------------------------------------------------------------------- /tests/test_sitemap_is_local.py: -------------------------------------------------------------------------------- 1 | from wayback_machine_archiver.sitemaps import sitemap_is_local, LOCAL_PREFIX 2 | 3 | 4 | def test_local(): 5 | URIS = ( 6 | "/tmp/sitemap.xml", 7 | "{prefix}/tmp/sitemap.xml".format(prefix=LOCAL_PREFIX), 8 | ) 9 | for uri in URIS: 10 | assert sitemap_is_local(uri) 11 | 12 | 13 | def test_remote(): 14 | URIS = ( 15 | "https://alexgude.com/sitemap.xml", 16 | "http://charles.uno/sitemap.xml", 17 | ) 18 | for uri in URIS: 19 | assert not sitemap_is_local(uri) 20 | -------------------------------------------------------------------------------- /tests/test_get_namespace.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from wayback_machine_archiver.sitemaps import get_namespace 3 | 4 | ELEMENT = namedtuple("Element", "tag") 5 | 6 | 7 | def test_good_namespace(): 8 | NAMESPACE = "{http://www.sitemaps.org/schemas/sitemap/0.9}" 9 | test_element = ELEMENT("{namespace}urlset".format(namespace=NAMESPACE)) 10 | 11 | assert get_namespace(test_element) == NAMESPACE 12 | 13 | 14 | def test_no_match_namespace(): 15 | NAMESPACE = "" 16 | test_element = ELEMENT("{namespace}urlset".format(namespace=NAMESPACE)) 17 | 18 | assert get_namespace(test_element) == NAMESPACE 19 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 🧪 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | fail-fast: false 10 | matrix: 11 | python-version: 12 | - '3.8' 13 | - '3.9' 14 | - '3.10' 15 | - '3.11' 16 | - '3.12' 17 | - 'pypy-3.9' 18 | - 'pypy-3.10' 19 | name: Python ${{ matrix.python-version }} Test 🧪 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v4 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 🐍 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Set up uv 💨 30 | uses: astral-sh/setup-uv@v6 31 | with: 32 | uv-version: latest 33 | 34 | - name: Install dependencies 🏗 35 | run: uv pip install --system -e ".[dev]" 36 | 37 | - name: Run Tests 🧪 38 | run: pytest -vv 39 | 40 | - name: Run Smoke Test ⚗️ 41 | run: archiver --help 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # MIT License (MIT) 2 | 3 | Copyright © 2018--2025 Alexander Gude 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated 
documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /tests/test_download_remote_sitemap.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from wayback_machine_archiver.sitemaps import download_remote_sitemap 3 | from requests.adapters import HTTPAdapter 4 | import requests 5 | 6 | 7 | SITEMAP = """ 8 | 9 | 10 | https://alexgude.com/blog/double-checking-538/ 11 | 2016-04-28T00:00:00+00:00 12 | 13 | 14 | https://alexgude.com/files/undergrad_thesis.pdf 15 | 2019-05-09T16:19:45+00:00 16 | 17 | 18 | """ 19 | 20 | 21 | @pytest.fixture 22 | def session(): 23 | session = requests.Session() 24 | session.mount("https://", HTTPAdapter()) 25 | session.mount("http://", HTTPAdapter()) 26 | return session 27 | 28 | 29 | def test_download_remote_sitemap(requests_mock, session): 30 | url = "https://www.radiokeysmusic.com/sitemap.xml" 31 | requests_mock.get(url, text=SITEMAP) 32 | returned_contents = download_remote_sitemap(url, session) 33 | assert returned_contents == SITEMAP.encode("UTF-8") 34 | 35 | 36 | def test_download_remote_sitemap_with_status_error(requests_mock, session): 37 | url = "https://www.radiokeysmusic.com/sitemap.xml" 38 | requests_mock.get(url, text=SITEMAP, status_code=404) 39 | with pytest.raises(requests.exceptions.HTTPError): 40 | download_remote_sitemap(url, session) 41 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "wayback-machine-archiver" 7 | version = "3.3.1" 8 | description = "A Python script to submit web pages to the Wayback Machine for archiving." 
9 | readme = "README.md" 10 | authors = [ 11 | { name = "Alexander Gude", email = "alex.public.account@gmail.com" }, 12 | ] 13 | license = { file = "LICENSE" } 14 | requires-python = ">=3.8" 15 | classifiers = [ 16 | "Development Status :: 5 - Production/Stable", 17 | "Environment :: Console", 18 | "Intended Audience :: System Administrators", 19 | "License :: OSI Approved :: MIT License", 20 | "Natural Language :: English", 21 | "Operating System :: OS Independent", 22 | "Programming Language :: Python", 23 | "Topic :: Utilities", 24 | ] 25 | keywords = ["Internet Archive", "Wayback Machine"] 26 | dependencies = [ 27 | "python-dotenv", 28 | "requests", 29 | "urllib3", 30 | ] 31 | 32 | [project.urls] 33 | Homepage = "https://github.com/agude/wayback-machine-archiver" 34 | 35 | [project.scripts] 36 | archiver = "wayback_machine_archiver.archiver:main" 37 | 38 | [project.optional-dependencies] 39 | dev = [ 40 | "pytest", 41 | "requests-mock", 42 | "bump-my-version", 43 | ] 44 | 45 | [tool.bumpversion] 46 | current_version = "3.3.1" 47 | commit = true 48 | tag = true 49 | message = "Bump version to {new_version}" 50 | 51 | [[tool.bumpversion.files]] 52 | filename = "pyproject.toml" 53 | search = 'version = "{current_version}"' 54 | replace = 'version = "{new_version}"' 55 | 56 | [[tool.bumpversion.files]] 57 | filename = "src/wayback_machine_archiver/__init__.py" 58 | search = '__version__ = "{current_version}"' 59 | replace = '__version__ = "{new_version}"' 60 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release Package 📦 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | python-version: 14 | - '3.8' 15 | - '3.9' 16 | - '3.10' 17 | - '3.11' 18 | - '3.12' 19 | - 'pypy-3.9' 20 | - 'pypy-3.10' 21 | name: Python ${{ matrix.python-version }} Test 🧪 22 | steps: 23 | - name: Checkout repository 24 | uses: actions/checkout@v4 25 | 26 | - name: Set up Python ${{ matrix.python-version }} 🐍 27 | uses: actions/setup-python@v4 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | 31 | - name: Set up uv 💨 32 | uses: astral-sh/setup-uv@v6 33 | with: 34 | uv-version: latest 35 | 36 | - name: Install dependencies 🏗 37 | run: uv pip install --system -e ".[dev]" 38 | 39 | - name: Run Tests 🧪 40 | run: pytest -vv 41 | 42 | - name: Run Smoke Test ⚗️ 43 | run: archiver --help 44 | 45 | release: 46 | runs-on: ubuntu-latest 47 | needs: test 48 | name: Build and Publish to PyPI 📦 49 | permissions: 50 | id-token: write 51 | steps: 52 | - name: Checkout repository 53 | uses: actions/checkout@v4 54 | 55 | - name: Set up Python for build 🐍 56 | uses: actions/setup-python@v4 57 | with: 58 | python-version: '3.12' 59 | 60 | - name: Install the modern build tool 🏗 61 | run: python -m pip install build 62 | 63 | - name: Build package 👷 64 | run: python -m build 65 | 66 | - name: Publish distribution 📦 to PyPI 67 | if: startsWith(github.ref, 'refs/tags') 68 | uses: pypa/gh-action-pypi-publish@release/v1 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # env files 2 | .env 3 | env 4 | 5 | # Sometimes I use a test sitemap.xml 6 | sitemap.xml 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | 
*.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | .mypy* 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | #Ipython Notebook 70 | .ipynb_checkpoints 71 | 72 | # swap files 73 | *.swp 74 | 75 | # OSX crap 76 | .DS_Store 77 | 78 | # pickled models 79 | **/*.pickle 80 | 81 | #other crap 82 | **/.ropeproject 83 | checkscript.sh 84 | 85 | # swap 86 | [._]*.s[a-v][a-z] 87 | [._]*.sw[a-p] 88 | [._]s[a-v][a-z] 89 | [._]sw[a-p] 90 | # session 91 | Session.vim 92 | # temporary 93 | .netrwhist 94 | *~ 95 | # auto-generated tag files 96 | tags 97 | 98 | *~ 99 | 100 | # temporary files which can be created if a process still has a handle open of a deleted file 101 | .fuse_hidden* 102 | 103 | # KDE directory preferences 104 | .directory 105 | 106 | # Linux trash folder which might appear on any partition or disk 107 | .Trash-* 108 | 109 | # .nfs files are created when an open file is removed but is still being accessed 110 | .nfs* 111 | 112 | .vscode/ 113 | -------------------------------------------------------------------------------- /tests/test_load_local_sitemap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from __future__ import unicode_literals 3 | from wayback_machine_archiver.sitemaps import load_local_sitemap, LOCAL_PREFIX 4 | import os.path 5 | import pytest 6 | 7 | 8 | SITEMAP = """ 9 | 10 | 11 | https://alexgude.com/blog/double-checking-538/ 12 | 2016-04-28T00:00:00+00:00 13 | 14 | 15 | https://alexgude.com/files/undergrad_thesis.pdf 16 | 2019-05-09T16:19:45+00:00 17 | 18 | 19 | """ 20 | 21 | 22 | def test_load_local_file_without_prefix(tmpdir): 23 | # Write a file using pytest's tmpdir so we can read it back 24 | file = tmpdir.join("sitemap.xml") 25 | file.write(SITEMAP) 26 | file_path = os.path.join(file.dirname, file.basename) 27 | 28 | # Read the file 29 | read_contents = load_local_sitemap(file_path) 30 | assert read_contents == SITEMAP 31 | 32 | 33 | def test_load_local_file_with_prefix(tmpdir): 34 | # Write a file using pytest's tmpdir so we can read it back 35 | file = tmpdir.join("sitemap.xml") 36 | file.write(SITEMAP) 37 | file_path = os.path.join(LOCAL_PREFIX, file.dirname, file.basename) 38 | 39 | # Read the file 40 | read_contents = load_local_sitemap(file_path) 41 | assert read_contents == SITEMAP 42 | 43 | 44 | def test_file_does_not_exist(tmpdir): 45 | file_path = "{}/tmp/not_a_real_file".format(LOCAL_PREFIX) 46 | 47 | with pytest.raises(IOError): 48 | load_local_sitemap(file_path) 49 | 50 | 51 | def test_file_is_remote(tmpdir): 52 | file_path = "https://alexgude.com/sitemap.xml" 53 | 54 | with pytest.raises(IOError): 55 | load_local_sitemap(file_path) 56 | 57 | 58 
| def test_file_path_is_invalid(tmpdir): 59 | file_path = "tmp/file_path" 60 | 61 | with pytest.raises(IOError): 62 | load_local_sitemap(file_path) 63 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/clients.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import requests 4 | 5 | 6 | class SPN2Client: 7 | """ 8 | Handles archiving using the authenticated SPN2 API. 9 | """ 10 | 11 | SAVE_URL = "https://web.archive.org/save" 12 | STATUS_URL = "https://web.archive.org/save/status" 13 | STATUS_URL_TEMPLATE = "https://web.archive.org/save/status/{job_id}" 14 | 15 | def __init__(self, session, access_key, secret_key): 16 | self.session = session 17 | self.is_authenticated = True # Always true now 18 | 19 | self.session.headers.update({"Accept": "application/json"}) 20 | auth_header = f"LOW {access_key}:{secret_key}" 21 | self.session.headers.update({"Authorization": auth_header}) 22 | 23 | def submit_capture(self, url_to_archive, rate_limit_wait, api_params=None): 24 | """Submits a capture request to the SPN2 API.""" 25 | if rate_limit_wait > 0: 26 | logging.debug("Sleeping for %s seconds", rate_limit_wait) 27 | time.sleep(rate_limit_wait) 28 | logging.info("Submitting %s to SPN2", url_to_archive) 29 | data = {"url": url_to_archive} 30 | if api_params: 31 | data.update(api_params) 32 | 33 | r = self.session.post(self.SAVE_URL, data=data) 34 | r.raise_for_status() 35 | response_json = r.json() 36 | job_id = response_json.get("job_id") 37 | logging.info("Successfully submitted %s, job_id: %s", url_to_archive, job_id) 38 | 39 | if job_id: 40 | status_check_url = self.STATUS_URL_TEMPLATE.format(job_id=job_id) 41 | logging.debug( 42 | "Manual status check URL for %s: %s", url_to_archive, status_check_url 43 | ) 44 | 45 | return job_id 46 | 47 | def check_status(self, job_id): 48 | """Checks the status of a single capture job.""" 49 | status_url = self.STATUS_URL_TEMPLATE.format(job_id=job_id) 50 | logging.debug("Checking status for single job_id: %s", job_id) 51 | r = self.session.get(status_url) 52 | r.raise_for_status() 53 | return r.json() 54 | 55 | def check_status_batch(self, job_ids): 56 | """Checks the status of multiple capture jobs in a single request.""" 57 | logging.debug("Checking status for %d jobs in a batch.", len(job_ids)) 58 | data = {"job_ids": ",".join(job_ids)} 59 | r = self.session.post(self.STATUS_URL, data=data) 60 | r.raise_for_status() 61 | return r.json() 62 | -------------------------------------------------------------------------------- /tests/test_extract_pages_from_sitemap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | from __future__ import unicode_literals 3 | from wayback_machine_archiver.sitemaps import extract_urls_from_sitemap 4 | 5 | 6 | def test_ascii_sitemap(): 7 | SITEMAP = """ 8 | 9 | 10 | https://alexgude.com/blog/double-checking-538/ 11 | 2016-04-28T00:00:00+00:00 12 | 13 | 14 | https://alexgude.com/files/undergrad_thesis.pdf 15 | 2019-05-09T16:19:45+00:00 16 | 17 | 18 | """.encode("UTF-8") 19 | 20 | URLS = set( 21 | ( 22 | "https://alexgude.com/blog/double-checking-538/", 23 | "https://alexgude.com/files/undergrad_thesis.pdf", 24 | ) 25 | ) 26 | 27 | assert extract_urls_from_sitemap(SITEMAP) == URLS 28 | 29 | 30 | def test_unicode_sitemap(): 31 | SITEMAP = """ 32 | 33 | 34 | https://www.radiokeysmusic.com/home 35 | daily 36 | 1.0 37 | 2018-12-17 38 | 
39 | https://static1.squarespace.com/static/5c06e0ab1137a66237a2399c/t/5c0d6a4d562fa7678539d405/1544383062969/ 40 | Home 41 | Tom, Stewart, Allante, & Emily. Photo by Cory Cullington, 2018. 42 | 43 | 44 | 45 | https://www.radiokeysmusic.com/about 46 | daily 47 | 0.75 48 | 2019-01-05 49 | 50 | https://static1.squarespace.com/static/5c06e0ab1137a66237a2399c/t/5c0d6b5b6d2a7379672b9b34/1544896195646/IMG_9107.jpg 51 | About - Story 52 | instrumentation complimented by Emily’s velvety voice and Stewart’s 53 | 54 | 55 | 56 | """.encode("UTF-8") 57 | 58 | URLS = set( 59 | ( 60 | "https://www.radiokeysmusic.com/home", 61 | "https://www.radiokeysmusic.com/about", 62 | ) 63 | ) 64 | 65 | assert extract_urls_from_sitemap(SITEMAP) == URLS 66 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/sitemaps.py: -------------------------------------------------------------------------------- 1 | # src/wayback_machine_archiver/sitemaps.py 2 | import logging 3 | import re 4 | import xml.etree.ElementTree as ET 5 | from xml.etree.ElementTree import ParseError 6 | 7 | LOCAL_PREFIX = "file://" 8 | 9 | 10 | def get_namespace(element): 11 | """Extract the namespace from an XML element.""" 12 | match = re.match(r"\{.*\}", element.tag) 13 | return match.group(0) if match else "" 14 | 15 | 16 | def download_remote_sitemap(sitemap_url, session): 17 | """Download a remote sitemap file.""" 18 | logging.debug("Downloading: %s", sitemap_url) 19 | r = session.get(sitemap_url) 20 | r.raise_for_status() 21 | return r.text.encode("utf-8") 22 | 23 | 24 | def load_local_sitemap(sitemap_filepath): 25 | """Load a local sitemap file.""" 26 | logging.debug("Loading local sitemap: %s", sitemap_filepath) 27 | if sitemap_filepath.startswith(LOCAL_PREFIX): 28 | sitemap_filepath = sitemap_filepath[len(LOCAL_PREFIX) :] 29 | with open(sitemap_filepath, "r") as fp: 30 | return fp.read() 31 | 32 | 33 | def sitemap_is_local(sitemap_url): 34 | """Check if a sitemap URI is local.""" 35 | return sitemap_url.startswith(LOCAL_PREFIX) or sitemap_url.startswith("/") 36 | 37 | 38 | def extract_urls_from_sitemap(site_map_text): 39 | """Parse XML sitemap text and extract URLs.""" 40 | root = ET.fromstring(site_map_text) 41 | namespace = get_namespace(root) 42 | loc_nodes = root.findall(".//{}loc".format(namespace)) 43 | return {node.text for node in loc_nodes} 44 | 45 | 46 | def process_sitemaps(sitemap_urls, session): 47 | """ 48 | Given a list of sitemap URLs, downloads/loads them and returns a set of all unique URLs found. 49 | """ 50 | all_urls = set() 51 | for sitemap_url in sitemap_urls: 52 | try: 53 | if sitemap_is_local(sitemap_url): 54 | logging.debug("The sitemap '%s' is local.", sitemap_url) 55 | sitemap_xml = load_local_sitemap(sitemap_url) 56 | else: 57 | logging.debug("The sitemap '%s' is remote.", sitemap_url) 58 | sitemap_xml = download_remote_sitemap(sitemap_url, session) 59 | 60 | extracted_urls = extract_urls_from_sitemap(sitemap_xml) 61 | all_urls.update(extracted_urls) 62 | except ParseError: 63 | logging.error( 64 | "Failed to parse sitemap from '%s'. The content is not valid XML. Please ensure the URL points directly to a sitemap.xml file. Skipping this sitemap.", 65 | sitemap_url, 66 | ) 67 | except Exception as e: 68 | logging.error( 69 | "An error occurred while processing sitemap '%s': %s. 
Skipping.", 70 | sitemap_url, 71 | e, 72 | ) 73 | return all_urls 74 | -------------------------------------------------------------------------------- /tests/test_main_logic.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from unittest import mock 3 | from wayback_machine_archiver.archiver import main 4 | 5 | # This test file now mocks the main workflow and assumes credentials are present 6 | # to test the URL gathering and shuffling logic. 7 | 8 | 9 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 10 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 11 | @mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key") 12 | @mock.patch("wayback_machine_archiver.archiver.random.shuffle") 13 | def test_random_order_flag_shuffles_urls( 14 | mock_shuffle, mock_getenv, mock_workflow, mock_sitemaps 15 | ): 16 | """Verify that when --random-order is passed, random.shuffle is called.""" 17 | urls_to_archive = ["http://test.com/a", "http://test.com/b"] 18 | sys.argv = ["archiver", "--random-order"] + urls_to_archive 19 | main() 20 | mock_shuffle.assert_called_once() 21 | 22 | # Check for membership, not order, by comparing sets. 23 | # The second argument to the mock_workflow call is the list of URLs. 24 | passed_urls = mock_workflow.call_args[0][1] 25 | assert set(passed_urls) == set(urls_to_archive) 26 | 27 | 28 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 29 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 30 | @mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key") 31 | @mock.patch("wayback_machine_archiver.archiver.random.shuffle") 32 | def test_default_order_does_not_shuffle( 33 | mock_shuffle, mock_getenv, mock_workflow, mock_sitemaps 34 | ): 35 | """Verify that without --random-order, shuffle is not called.""" 36 | urls_to_archive = ["http://test.com/a", "http://test.com/b"] 37 | sys.argv = ["archiver"] + urls_to_archive 38 | main() 39 | mock_shuffle.assert_not_called() 40 | 41 | # Check for membership, not order, by comparing sets. 42 | passed_urls = mock_workflow.call_args[0][1] 43 | assert set(passed_urls) == set(urls_to_archive) 44 | 45 | 46 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 47 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 48 | @mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key") 49 | def test_main_builds_and_passes_api_params(mock_getenv, mock_workflow, mock_sitemaps): 50 | """ 51 | Verify that main() correctly constructs the api_params dictionary from CLI 52 | flags and passes it to the workflow. 53 | """ 54 | sys.argv = [ 55 | "archiver", 56 | "http://test.com", 57 | "--capture-screenshot", 58 | "--js-behavior-timeout", 59 | "10", 60 | "--if-not-archived-within", 61 | "5d", 62 | "--user-agent", 63 | "TestBot/1.0", 64 | ] 65 | main() 66 | 67 | # The fourth argument to the mock_workflow call is the api_params dict. 
68 | passed_params = mock_workflow.call_args[0][3] 69 | expected_params = { 70 | "capture_screenshot": "1", 71 | "js_behavior_timeout": 10, 72 | "if_not_archived_within": "5d", 73 | "use_user_agent": "TestBot/1.0", 74 | } 75 | assert passed_params == expected_params 76 | -------------------------------------------------------------------------------- /tests/test_spn2_client.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from wayback_machine_archiver.clients import SPN2Client 3 | from requests.adapters import HTTPAdapter 4 | import requests 5 | import urllib.parse 6 | 7 | 8 | @pytest.fixture 9 | def session(): 10 | session = requests.Session() 11 | session.mount("https://", HTTPAdapter()) 12 | session.mount("http://", HTTPAdapter()) 13 | return session 14 | 15 | 16 | @pytest.mark.parametrize( 17 | "api_params", 18 | [ 19 | (None), 20 | ({"capture_outlinks": "1", "js_behavior_timeout": 0}), 21 | ({"capture_screenshot": "1", "force_get": "1"}), 22 | ], 23 | ) 24 | def test_spn2_client_submit_capture(requests_mock, session, api_params): 25 | """ 26 | Verify that submit_capture sends a correct POST request, including optional 27 | API parameters, and returns the job_id. 28 | """ 29 | access_key = "test-access" 30 | secret_key = "test-secret" 31 | url_to_archive = "https://example.com" 32 | expected_job_id = "c4b1-4f2a-ac04-1d1225e98695" 33 | 34 | requests_mock.post( 35 | SPN2Client.SAVE_URL, json={"job_id": expected_job_id}, status_code=200 36 | ) 37 | 38 | client = SPN2Client(session=session, access_key=access_key, secret_key=secret_key) 39 | job_id = client.submit_capture( 40 | url_to_archive, rate_limit_wait=0, api_params=api_params 41 | ) 42 | 43 | # Assertions 44 | assert job_id == expected_job_id 45 | history = requests_mock.request_history 46 | assert len(history) == 1 47 | request = history[0] 48 | assert request.method == "POST" 49 | assert request.url == SPN2Client.SAVE_URL 50 | assert f"LOW {access_key}:{secret_key}" == request.headers["Authorization"] 51 | 52 | expected_payload = {"url": url_to_archive} 53 | if api_params: 54 | expected_payload.update(api_params) 55 | expected_body = urllib.parse.urlencode(expected_payload) 56 | assert request.text == expected_body 57 | 58 | 59 | def test_spn2_client_check_status_success(requests_mock, session): 60 | """ 61 | Verify check_status correctly parses a 'success' response. 62 | """ 63 | job_id = "test-job-123" 64 | status_url = SPN2Client.STATUS_URL_TEMPLATE.format(job_id=job_id) 65 | success_payload = { 66 | "status": "success", 67 | "original_url": "https://example.com", 68 | "timestamp": "20250101000000", 69 | } 70 | requests_mock.get(status_url, json=success_payload) 71 | 72 | client = SPN2Client(session=session, access_key="key", secret_key="secret") 73 | status_data = client.check_status(job_id) 74 | 75 | assert status_data == success_payload 76 | 77 | 78 | def test_spn2_client_check_status_pending(requests_mock, session): 79 | """ 80 | Verify check_status correctly parses a 'pending' response. 
81 | """ 82 | job_id = "test-job-456" 83 | status_url = SPN2Client.STATUS_URL_TEMPLATE.format(job_id=job_id) 84 | pending_payload = {"status": "pending"} 85 | requests_mock.get(status_url, json=pending_payload) 86 | 87 | client = SPN2Client(session=session, access_key="key", secret_key="secret") 88 | status_data = client.check_status(job_id) 89 | 90 | assert status_data == pending_payload 91 | 92 | 93 | def test_spn2_client_check_status_error(requests_mock, session): 94 | """ 95 | Verify check_status correctly parses an 'error' response. 96 | """ 97 | job_id = "test-job-789" 98 | status_url = SPN2Client.STATUS_URL_TEMPLATE.format(job_id=job_id) 99 | error_payload = {"status": "error", "message": "Too many redirects."} 100 | requests_mock.get(status_url, json=error_payload) 101 | 102 | client = SPN2Client(session=session, access_key="key", secret_key="secret") 103 | status_data = client.check_status(job_id) 104 | 105 | assert status_data == error_payload 106 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from . import __version__ 4 | 5 | LOCAL_PREFIX = "file://" 6 | 7 | 8 | def create_parser(): 9 | """Creates and returns the argparse parser.""" 10 | parser = argparse.ArgumentParser( 11 | prog="archiver", 12 | description="A script to backup a web pages with Internet Archive", 13 | ) 14 | parser.add_argument( 15 | "--version", 16 | action="version", 17 | version="%(prog)s {version}".format(version=__version__), 18 | ) 19 | parser.add_argument( 20 | "urls", 21 | nargs="*", 22 | default=[], 23 | help="Specifies the URLs of the pages to archive.", 24 | ) 25 | parser.add_argument( 26 | "--file", 27 | help="Specifies the path to a file containing URLs to save, one per line.", 28 | required=False, 29 | ) 30 | parser.add_argument( 31 | "--sitemaps", 32 | nargs="+", 33 | default=[], 34 | help="Specifies one or more URIs to sitemaps listing pages to archive. Local paths must be prefixed with '{f}'.".format( 35 | f=LOCAL_PREFIX 36 | ), 37 | required=False, 38 | ) 39 | parser.add_argument( 40 | "--log", 41 | help="Sets the logging level. Defaults to WARNING (case-insensitive).", 42 | dest="log_level", 43 | default=logging.WARNING, 44 | type=str.upper, 45 | choices=[ 46 | "DEBUG", 47 | "INFO", 48 | "WARNING", 49 | "ERROR", 50 | "CRITICAL", 51 | ], 52 | ) 53 | parser.add_argument( 54 | "--log-to-file", 55 | help="Redirects logs to a specified file instead of the console.", 56 | dest="log_file", 57 | default=None, 58 | ) 59 | parser.add_argument( 60 | "--archive-sitemap-also", 61 | help="Submits the URL of the sitemap itself to be archived.", 62 | dest="archive_sitemap", 63 | default=False, 64 | action="store_true", 65 | ) 66 | parser.add_argument( 67 | "--rate-limit-wait", 68 | help="Specifies the number of seconds to wait between submissions. A minimum of 5 seconds is enforced for authenticated users. Defaults to 15.", 69 | dest="rate_limit_in_sec", 70 | default=15, 71 | type=int, 72 | ) 73 | parser.add_argument( 74 | "--random-order", 75 | help="Randomizes the order of pages before archiving.", 76 | dest="random_order", 77 | default=False, 78 | action="store_true", 79 | ) 80 | 81 | # --- SPN2 API Options --- 82 | api_group = parser.add_argument_group( 83 | "SPN2 API Options", "Control the behavior of the Internet Archive capture API." 
84 | ) 85 | api_group.add_argument( 86 | "--capture-all", 87 | action="store_true", 88 | help="Captures a web page even if it returns an error (e.g., 404, 500).", 89 | ) 90 | api_group.add_argument( 91 | "--capture-outlinks", 92 | action="store_true", 93 | help="Captures web page outlinks automatically. Note: this can significantly increase the total number of captures and runtime.", 94 | ) 95 | api_group.add_argument( 96 | "--capture-screenshot", 97 | action="store_true", 98 | help="Captures a full page screenshot.", 99 | ) 100 | api_group.add_argument( 101 | "--delay-wb-availability", 102 | action="store_true", 103 | help="Reduces load on Internet Archive systems by making the capture publicly available after ~12 hours instead of immediately.", 104 | ) 105 | api_group.add_argument( 106 | "--force-get", 107 | action="store_true", 108 | help="Bypasses the headless browser check, which can speed up captures for non-HTML content (e.g., PDFs, images).", 109 | ) 110 | api_group.add_argument( 111 | "--skip-first-archive", 112 | action="store_true", 113 | help="Speeds up captures by skipping the check for whether this is the first time a URL has been archived.", 114 | ) 115 | api_group.add_argument( 116 | "--email-result", 117 | action="store_true", 118 | help="Sends an email report of the captured URLs to the user's registered email.", 119 | ) 120 | api_group.add_argument( 121 | "--if-not-archived-within", 122 | type=str, 123 | metavar="", 124 | help="Captures only if the latest capture is older than (e.g., '3d 5h').", 125 | ) 126 | api_group.add_argument( 127 | "--js-behavior-timeout", 128 | type=int, 129 | metavar="", 130 | help="Runs JS code for seconds after page load to trigger dynamic content. Defaults to 5, max is 30. Use 0 to disable for static pages.", 131 | ) 132 | api_group.add_argument( 133 | "--capture-cookie", 134 | type=str, 135 | metavar="", 136 | help="Uses an extra HTTP Cookie value when capturing the target page.", 137 | ) 138 | api_group.add_argument( 139 | "--user-agent", 140 | type=str, 141 | metavar="", 142 | dest="use_user_agent", 143 | help="Uses a custom HTTP User-Agent value when capturing the target page.", 144 | ) 145 | 146 | return parser 147 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/archiver.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import random 4 | import sys 5 | import requests 6 | from requests.adapters import HTTPAdapter 7 | from urllib3.util.retry import Retry 8 | from dotenv import load_dotenv 9 | 10 | from .clients import SPN2Client 11 | from .cli import create_parser 12 | from .sitemaps import process_sitemaps 13 | from .workflow import run_archive_workflow 14 | 15 | 16 | def main(): 17 | """Main entry point for the archiver script.""" 18 | parser = create_parser() 19 | args = parser.parse_args() 20 | 21 | logging.basicConfig(level=args.log_level, filename=args.log_file) 22 | load_dotenv() 23 | 24 | # --- Load and REQUIRE credentials --- 25 | access_key = os.getenv("INTERNET_ARCHIVE_ACCESS_KEY") 26 | secret_key = os.getenv("INTERNET_ARCHIVE_SECRET_KEY") 27 | 28 | if not (access_key and secret_key): 29 | logging.error( 30 | "Authentication required. Please provide your Internet Archive S3-style keys." 
31 | ) 32 | logging.error("You can get your keys from: https://archive.org/account/s3.php") 33 | logging.error("Then, create a .env file or set the environment variables:") 34 | logging.error("INTERNET_ARCHIVE_ACCESS_KEY and INTERNET_ARCHIVE_SECRET_KEY") 35 | sys.exit(1) 36 | 37 | # --- Enforce API rate-limiting minimums for authenticated users --- 38 | MIN_WAIT_SEC = 5 39 | if args.rate_limit_in_sec < MIN_WAIT_SEC: 40 | logging.warning( 41 | "Provided rate limit of %d seconds is below the API minimum of %d for authenticated users. Overriding to %d seconds.", 42 | args.rate_limit_in_sec, 43 | MIN_WAIT_SEC, 44 | MIN_WAIT_SEC, 45 | ) 46 | args.rate_limit_in_sec = MIN_WAIT_SEC 47 | 48 | # --- Build API parameters dictionary from CLI args --- 49 | api_params = {} 50 | if args.capture_all: 51 | api_params["capture_all"] = "1" 52 | if args.capture_outlinks: 53 | api_params["capture_outlinks"] = "1" 54 | if args.capture_screenshot: 55 | api_params["capture_screenshot"] = "1" 56 | if args.delay_wb_availability: 57 | api_params["delay_wb_availability"] = "1" 58 | if args.force_get: 59 | api_params["force_get"] = "1" 60 | if args.skip_first_archive: 61 | api_params["skip_first_archive"] = "1" 62 | if args.email_result: 63 | api_params["email_result"] = "1" 64 | if args.if_not_archived_within: 65 | api_params["if_not_archived_within"] = args.if_not_archived_within 66 | if args.js_behavior_timeout is not None: 67 | api_params["js_behavior_timeout"] = args.js_behavior_timeout 68 | if args.capture_cookie: 69 | api_params["capture_cookie"] = args.capture_cookie 70 | if args.use_user_agent: 71 | api_params["use_user_agent"] = args.use_user_agent 72 | 73 | if api_params: 74 | logging.info(f"Using the following API parameters: {api_params}") 75 | 76 | # --- Gather all URLs to archive --- 77 | urls_to_archive = set() 78 | logging.info("Gathering URLs to archive...") 79 | if args.urls: 80 | logging.info(f"Found {len(args.urls)} URLs from command-line arguments.") 81 | urls_to_archive.update(args.urls) 82 | if args.sitemaps: 83 | session = requests.Session() 84 | retries = Retry( 85 | total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504] 86 | ) 87 | session.mount("https://", HTTPAdapter(max_retries=retries)) 88 | session.mount("http://", HTTPAdapter(max_retries=retries)) 89 | logging.info(f"Processing {len(args.sitemaps)} sitemap(s)...") 90 | sitemap_urls = process_sitemaps(args.sitemaps, session) 91 | logging.info(f"Found {len(sitemap_urls)} URLs from sitemaps.") 92 | urls_to_archive.update(sitemap_urls) 93 | if args.archive_sitemap: 94 | remote_sitemaps = {s for s in args.sitemaps if not s.startswith("file://")} 95 | urls_to_archive.update(remote_sitemaps) 96 | if args.file: 97 | with open(args.file) as f: 98 | urls_from_file = {line.strip() for line in f if line.strip()} 99 | logging.info(f"Found {len(urls_from_file)} URLs from file: {args.file}") 100 | urls_to_archive.update(urls_from_file) 101 | 102 | urls_to_process = list(urls_to_archive) 103 | if not urls_to_process: 104 | logging.warning("No unique URLs found to archive. Exiting.") 105 | return 106 | logging.info(f"Found a total of {len(urls_to_process)} unique URLs to archive.") 107 | if args.random_order: 108 | logging.info("Randomizing the order of URLs.") 109 | random.shuffle(urls_to_process) 110 | 111 | # --- Run the archiving workflow --- 112 | logging.info("SPN2 credentials found. 
Using authenticated API workflow.") 113 | client_session = requests.Session() 114 | retries = Retry( 115 | total=5, 116 | backoff_factor=args.rate_limit_in_sec, 117 | status_forcelist=[500, 502, 503, 504, 520], 118 | allowed_methods=["HEAD", "GET", "POST", "PUT", "DELETE", "OPTIONS", "TRACE"], 119 | ) 120 | client_session.mount("https://", HTTPAdapter(max_retries=retries)) 121 | client_session.mount("http://", HTTPAdapter(max_retries=retries)) 122 | 123 | client = SPN2Client( 124 | session=client_session, access_key=access_key, secret_key=secret_key 125 | ) 126 | run_archive_workflow(client, urls_to_process, args.rate_limit_in_sec, api_params) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # tests/test_cli.py 2 | import sys 3 | from unittest import mock 4 | import pytest 5 | import logging 6 | from wayback_machine_archiver.archiver import main 7 | from wayback_machine_archiver.cli import create_parser 8 | 9 | # This test file now mocks the main workflow and any I/O functions 10 | # to keep the tests focused purely on the CLI argument parsing logic. 11 | 12 | 13 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 14 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 15 | @mock.patch("wayback_machine_archiver.archiver.logging.basicConfig") 16 | @pytest.mark.parametrize( 17 | "input_level, expected_level", 18 | [("info", "INFO"), ("DEBUG", "DEBUG")], 19 | ) 20 | def test_log_level( 21 | mock_basic_config, mock_workflow, mock_sitemaps, input_level, expected_level 22 | ): 23 | """Verify that the --log argument is case-insensitive.""" 24 | with mock.patch( 25 | "wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key" 26 | ): 27 | sys.argv = ["archiver", "http://test.com", "--log", input_level] 28 | main() 29 | mock_basic_config.assert_called_once_with(level=expected_level, filename=None) 30 | 31 | 32 | def test_version_action_exits(): 33 | """Verify that the --version argument exits the program.""" 34 | sys.argv = ["archiver", "--version"] 35 | with pytest.raises(SystemExit): 36 | main() 37 | 38 | 39 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 40 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 41 | @mock.patch("wayback_machine_archiver.archiver.logging.basicConfig") 42 | def test_log_to_file(mock_basic_config, mock_workflow, mock_sitemaps): 43 | """Verify that --log-to-file passes the filename to the logging config.""" 44 | with mock.patch( 45 | "wayback_machine_archiver.archiver.os.getenv", return_value="dummy_key" 46 | ): 47 | log_file = "archive.log" 48 | sys.argv = ["archiver", "http://test.com", "--log-to-file", log_file] 49 | main() 50 | mock_basic_config.assert_called_once_with( 51 | level=logging.WARNING, filename=log_file 52 | ) 53 | 54 | 55 | @pytest.mark.parametrize( 56 | "user_input, expected_wait", 57 | [(2, 5), (10, 10)], 58 | ) 59 | @mock.patch("wayback_machine_archiver.archiver.process_sitemaps", return_value=set()) 60 | @mock.patch("wayback_machine_archiver.archiver.run_archive_workflow") 61 | def test_rate_limit_override(mock_workflow, mock_sitemaps, user_input, expected_wait): 62 | """Verify the script enforces the minimum rate-limit for authenticated users.""" 63 | with mock.patch( 64 | "wayback_machine_archiver.archiver.os.getenv", 
return_value="dummy_key" 65 | ): 66 | sys.argv = ["archiver", "http://test.com", "--rate-limit-wait", str(user_input)] 67 | main() 68 | # The third argument to the mock_workflow call is the rate limit. 69 | final_rate_limit = mock_workflow.call_args[0][2] 70 | assert final_rate_limit == expected_wait 71 | 72 | 73 | @mock.patch("wayback_machine_archiver.archiver.logging.error") 74 | def test_main_exits_if_no_credentials(mock_logging_error): 75 | """Verify the script raises SystemExit if getenv returns None for credentials.""" 76 | with mock.patch("wayback_machine_archiver.archiver.os.getenv", return_value=None): 77 | sys.argv = ["archiver", "http://test.com"] 78 | with pytest.raises(SystemExit) as e: 79 | main() 80 | 81 | # Check that the exit code is 1 (error) 82 | assert e.value.code == 1 83 | # Check that we logged an error message to the user 84 | assert mock_logging_error.call_count > 0 85 | 86 | 87 | def test_api_option_flags_are_parsed_correctly(): 88 | """ 89 | Directly tests the parser to ensure all API flags are correctly defined 90 | and their default values are as expected. 91 | """ 92 | parser = create_parser() 93 | 94 | # Test default values (when no flags are passed) 95 | args = parser.parse_args([]) 96 | assert args.capture_all is False 97 | assert args.capture_outlinks is False 98 | assert args.capture_screenshot is False 99 | assert args.delay_wb_availability is False 100 | assert args.force_get is False 101 | assert args.skip_first_archive is False 102 | assert args.email_result is False 103 | assert args.if_not_archived_within is None 104 | assert args.js_behavior_timeout is None 105 | assert args.capture_cookie is None 106 | assert args.use_user_agent is None 107 | 108 | # Test boolean flags are set to True 109 | args = parser.parse_args( 110 | [ 111 | "--capture-all", 112 | "--capture-outlinks", 113 | "--capture-screenshot", 114 | "--delay-wb-availability", 115 | "--force-get", 116 | "--skip-first-archive", 117 | "--email-result", 118 | ] 119 | ) 120 | assert args.capture_all is True 121 | assert args.capture_outlinks is True 122 | assert args.capture_screenshot is True 123 | assert args.delay_wb_availability is True 124 | assert args.force_get is True 125 | assert args.skip_first_archive is True 126 | assert args.email_result is True 127 | 128 | # Test value-based flags 129 | args = parser.parse_args( 130 | [ 131 | "--if-not-archived-within", 132 | "10d 5h", 133 | "--js-behavior-timeout", 134 | "25", 135 | "--capture-cookie", 136 | "name=value", 137 | "--user-agent", 138 | "MyTestAgent/1.0", 139 | ] 140 | ) 141 | assert args.if_not_archived_within == "10d 5h" 142 | assert args.js_behavior_timeout == 25 143 | assert args.capture_cookie == "name=value" 144 | assert args.use_user_agent == "MyTestAgent/1.0" 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Wayback Machine Archiver 2 | 3 | Wayback Machine Archiver (Archiver for short) is a command-line utility 4 | written in Python to back up web pages using the [Internet Archive][ia]. 
5 | 6 | [ia]: https://archive.org/ 7 | 8 | ## Installation 9 | 10 | The best way to install Archiver is with `pip`: 11 | 12 | ```bash 13 | pip install wayback-machine-archiver 14 | ``` 15 | 16 | This will give you access to the script simply by calling: 17 | 18 | ```bash 19 | archiver --help 20 | ``` 21 | 22 | You can also install it directly from a local clone of this repository: 23 | 24 | ```bash 25 | git clone https://github.com/agude/wayback-machine-archiver.git 26 | cd wayback-machine-archiver 27 | pip install . 28 | ``` 29 | 30 | All dependencies are handled automatically. Archiver supports Python 3.8+. 31 | 32 | ## Usage 33 | 34 | The archiver is simple to use from the command line. 35 | 36 | ### Command-Line Examples 37 | 38 | **Archive a single page:** 39 | ```bash 40 | archiver https://alexgude.com 41 | ``` 42 | 43 | **Archive all pages from a sitemap:** 44 | ```bash 45 | archiver --sitemaps https://alexgude.com/sitemap.xml 46 | ``` 47 | 48 | **Archive from a local sitemap file:** 49 | (Note the `file://` prefix is required) 50 | ```bash 51 | archiver --sitemaps file://sitemap.xml 52 | ``` 53 | 54 | **Archive from a text file of URLs:** 55 | (The file should contain one URL per line) 56 | ```bash 57 | archiver --file urls.txt 58 | ``` 59 | 60 | **Combine multiple sources:** 61 | ```bash 62 | archiver https://radiokeysmusic.com --sitemaps https://charles.uno/sitemap.xml 63 | ``` 64 | 65 | **Use advanced API options:** 66 | (Capture a screenshot and skip if archived in the last 10 days) 67 | ```bash 68 | archiver https://alexgude.com --capture-screenshot --if-not-archived-within 10d 69 | ``` 70 | 71 | **Archive the sitemap URL itself:** 72 | ```bash 73 | archiver --sitemaps https://alexgude.com/sitemaps.xml --archive-sitemap-also 74 | ``` 75 | 76 | ## Authentication (Required) 77 | 78 | As of version 3.0.0, this tool requires authentication with the Internet 79 | Archive's SPN2 API. This change was made to ensure all archiving jobs are 80 | reliable and their final success or failure status can be confirmed. The 81 | previous, less reliable method for unauthenticated users has been removed. 82 | 83 | If you run the script without credentials, it will exit with an error message. 84 | 85 | **To set up authentication:** 86 | 87 | 1. Get your S3-style API keys from your Internet Archive account settings: 88 | [https://archive.org/account/s3.php](https://archive.org/account/s3.php) 89 | 90 | 2. Create a `.env` file in the directory where you run the `archiver` 91 | command. Add your keys to it: 92 | ``` 93 | INTERNET_ARCHIVE_ACCESS_KEY="YOUR_ACCESS_KEY_HERE" 94 | INTERNET_ARCHIVE_SECRET_KEY="YOUR_SECRET_KEY_HERE" 95 | ``` 96 | 97 | The script will automatically detect this file (or the equivalent environment 98 | variables) and use the authenticated API. 99 | 100 | ## Help 101 | 102 | For a full list of command-line flags, Archiver has built-in help displayed 103 | with `archiver --help`: 104 | 105 | ``` 106 | usage: archiver [-h] [--version] [--file FILE] 107 | [--sitemaps SITEMAPS [SITEMAPS ...]] 108 | [--log {DEBUG,INFO,WARNING,ERROR,CRITICAL}] 109 | [--log-to-file LOG_FILE] 110 | [--archive-sitemap-also] 111 | [--rate-limit-wait RATE_LIMIT_IN_SEC] 112 | [--random-order] [--capture-all] 113 | [--capture-outlinks] [--capture-screenshot] 114 | [--delay-wb-availability] [--force-get] 115 | [--skip-first-archive] [--email-result] 116 | [--if-not-archived-within ] 117 | [--js-behavior-timeout ] 118 | [--capture-cookie ] 119 | [--user-agent ] 120 | [urls ...] 
121 | 122 | A script to backup a web pages with Internet Archive 123 | 124 | positional arguments: 125 | urls Specifies the URLs of the pages to archive. 126 | 127 | options: 128 | -h, --help show this help message and exit 129 | --version show program's version number and exit 130 | --file FILE Specifies the path to a file containing URLs to save, 131 | one per line. 132 | --sitemaps SITEMAPS [SITEMAPS ...] 133 | Specifies one or more URIs to sitemaps listing pages 134 | to archive. Local paths must be prefixed with 135 | 'file://'. 136 | --log {DEBUG,INFO,WARNING,ERROR,CRITICAL} 137 | Sets the logging level. Defaults to WARNING 138 | (case-insensitive). 139 | --log-to-file LOG_FILE 140 | Redirects logs to a specified file instead of the 141 | console. 142 | --archive-sitemap-also 143 | Submits the URL of the sitemap itself to be archived. 144 | --rate-limit-wait RATE_LIMIT_IN_SEC 145 | Specifies the number of seconds to wait between 146 | submissions. A minimum of 5 seconds is enforced for 147 | authenticated users. Defaults to 15. 148 | --random-order Randomizes the order of pages before archiving. 149 | 150 | SPN2 API Options: 151 | Control the behavior of the Internet Archive capture API. 152 | 153 | --capture-all Captures a web page even if it returns an error (e.g., 154 | 404, 500). 155 | --capture-outlinks Captures web page outlinks automatically. Note: this 156 | can significantly increase the total number of 157 | captures and runtime. 158 | --capture-screenshot Captures a full page screenshot. 159 | --delay-wb-availability 160 | Reduces load on Internet Archive systems by making the 161 | capture publicly available after ~12 hours instead of 162 | immediately. 163 | --force-get Bypasses the headless browser check, which can speed 164 | up captures for non-HTML content (e.g., PDFs, images). 165 | --skip-first-archive Speeds up captures by skipping the check for whether 166 | this is the first time a URL has been archived. 167 | --email-result Sends an email report of the captured URLs to the 168 | user's registered email. 169 | --if-not-archived-within 170 | Captures only if the latest capture is older than 171 | (e.g., '3d 5h'). 172 | --js-behavior-timeout 173 | Runs JS code for seconds after page load to 174 | trigger dynamic content. Defaults to 5, max is 30. Use 175 | 0 to disable for static pages. 176 | --capture-cookie 177 | Uses an extra HTTP Cookie value when capturing the 178 | target page. 179 | --user-agent 180 | Uses a custom HTTP User-Agent value when capturing the 181 | target page. 182 | ``` 183 | 184 | ## Setting Up a `Sitemap.xml` for Github Pages 185 | 186 | It is easy to automatically generate a sitemap for a Github Pages Jekyll site. 187 | Simply use [jekyll/jekyll-sitemap][jsm]. 188 | 189 | Setup instructions can be found on the above site; they require changing just 190 | a single line of your site's `_config.yml`. 
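For a typical modern Jekyll site this amounts to listing the plugin in `_config.yml`. The snippet below is a rough example, not taken from this repository; check the plugin's README (linked below) for the exact instructions for your Jekyll version:

```yaml
# _config.yml (example; Jekyll versions before 3.5.0 use the `gems:` key instead)
plugins:
  - jekyll-sitemap
```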
191 | 192 | [jsm]: https://github.com/jekyll/jekyll-sitemap 193 | -------------------------------------------------------------------------------- /tests/test_spn2_workflow.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from unittest import mock 3 | import pytest 4 | import time 5 | from wayback_machine_archiver.workflow import ( 6 | _submit_next_url, 7 | _poll_pending_jobs, 8 | run_archive_workflow, 9 | PERMANENT_ERROR_MESSAGES, 10 | TRANSIENT_ERROR_MESSAGES, 11 | ) 12 | 13 | # --- Tests for _submit_next_url --- 14 | 15 | 16 | def test_submit_next_url_success(): 17 | """ 18 | Verify that a successful submission adds the job_id to the pending_jobs 19 | dictionary, consumes the URL, and clears the attempts tracker for that URL. 20 | """ 21 | mock_client = mock.Mock() 22 | mock_client.submit_capture.return_value = "job-123" 23 | 24 | urls_to_process = ["http://example.com"] 25 | pending_jobs = {} 26 | # Simulate a previous failure to ensure the tracker is cleared on success 27 | submission_attempts = {"http://example.com": 1} 28 | 29 | _submit_next_url( 30 | urls_to_process, 31 | mock_client, 32 | pending_jobs, 33 | 5, 34 | submission_attempts, 35 | api_params={}, 36 | ) 37 | 38 | # Assertions 39 | mock_client.submit_capture.assert_called_once_with( 40 | "http://example.com", rate_limit_wait=5, api_params={} 41 | ) 42 | # --- Check the new data structure --- 43 | assert "job-123" in pending_jobs 44 | assert pending_jobs["job-123"]["url"] == "http://example.com" 45 | assert "submitted_at" in pending_jobs["job-123"] 46 | assert not urls_to_process, "URL should have been consumed from the list" 47 | assert "http://example.com" not in submission_attempts, ( 48 | "Attempts tracker should be cleared on success" 49 | ) 50 | 51 | 52 | def test_submit_next_url_failure_requeues_and_tracks_attempt(): 53 | """ 54 | Verify that a failed submission re-queues the URL at the end of the list 55 | and increments its attempt count. 56 | """ 57 | mock_client = mock.Mock() 58 | mock_client.submit_capture.side_effect = Exception("API Error") 59 | 60 | urls_to_process = ["http://a.com", "http://b.com"] 61 | pending_jobs = {} 62 | submission_attempts = {} 63 | 64 | _submit_next_url( 65 | urls_to_process, 66 | mock_client, 67 | pending_jobs, 68 | 5, 69 | submission_attempts, 70 | api_params={}, 71 | ) 72 | 73 | # Assertions 74 | assert not pending_jobs, "No job should have been added on failure" 75 | assert urls_to_process == ["http://b.com", "http://a.com"], ( 76 | "Failed URL should be at the end of the list" 77 | ) 78 | assert submission_attempts == {"http://a.com": 1}, ( 79 | "Attempt count should be incremented" 80 | ) 81 | 82 | 83 | def test_submit_next_url_gives_up_after_max_retries(): 84 | """ 85 | Verify that if a URL has reached its max retry count, it is not 86 | re-queued and the submission is not attempted. 
87 | """ 88 | mock_client = mock.Mock() 89 | 90 | urls_to_process = ["http://will-fail.com"] 91 | pending_jobs = {} 92 | # Simulate that the URL has already failed 3 times 93 | submission_attempts = {"http://will-fail.com": 3} 94 | 95 | _submit_next_url( 96 | urls_to_process, 97 | mock_client, 98 | pending_jobs, 99 | 5, 100 | submission_attempts, 101 | api_params={}, 102 | max_retries=3, 103 | ) 104 | 105 | # Assertions 106 | mock_client.submit_capture.assert_not_called() 107 | assert not pending_jobs 108 | assert not urls_to_process, "URL should be consumed but not re-queued" 109 | assert submission_attempts == {"http://will-fail.com": 4}, ( 110 | "Attempt count is still updated" 111 | ) 112 | 113 | 114 | def test_submit_next_url_passes_api_params_to_client(): 115 | """ 116 | Verify that the api_params dictionary is correctly passed to the client's 117 | submit_capture method. 118 | """ 119 | mock_client = mock.Mock() 120 | mock_client.submit_capture.return_value = "job-123" 121 | urls_to_process = ["http://example.com"] 122 | pending_jobs = {} 123 | submission_attempts = {} 124 | api_params = {"capture_screenshot": "1", "force_get": "1"} 125 | 126 | _submit_next_url( 127 | urls_to_process, 128 | mock_client, 129 | pending_jobs, 130 | 0, 131 | submission_attempts, 132 | api_params, 133 | ) 134 | 135 | mock_client.submit_capture.assert_called_once_with( 136 | "http://example.com", rate_limit_wait=0, api_params=api_params 137 | ) 138 | 139 | 140 | # --- Tests for _poll_pending_jobs --- 141 | 142 | 143 | @mock.patch("wayback_machine_archiver.workflow.time.sleep") 144 | def test_poll_uses_batch_and_removes_completed_jobs(mock_sleep): 145 | """ 146 | Verify that jobs with 'success' or 'error' status are removed from the 147 | pending list via the batch endpoint, while 'pending' jobs remain. 
148 | """ 149 | mock_client = mock.Mock() 150 | # Define the return value for the single batch request 151 | mock_client.check_status_batch.return_value = [ 152 | {"status": "success", "job_id": "job-success", "timestamp": "20250101"}, 153 | {"status": "error", "job_id": "job-error", "message": "Too many redirects."}, 154 | {"status": "pending", "job_id": "job-pending"}, 155 | ] 156 | 157 | # --- Use the new data structure for pending_jobs --- 158 | now = time.time() 159 | pending_jobs = { 160 | "job-success": {"url": "http://a.com", "submitted_at": now}, 161 | "job-error": {"url": "http://b.com", "submitted_at": now}, 162 | "job-pending": {"url": "http://c.com", "submitted_at": now}, 163 | } 164 | 165 | # --- Provide the new required arguments --- 166 | successful, failed, requeued = _poll_pending_jobs( 167 | mock_client, 168 | pending_jobs, 169 | transient_error_retries={}, 170 | max_transient_retries=3, 171 | job_timeout_sec=7200, 172 | ) 173 | 174 | # Assertions 175 | mock_client.check_status_batch.assert_called_once_with( 176 | ["job-success", "job-error", "job-pending"] 177 | ) 178 | # --- Check the new data structure in the assertion --- 179 | assert list(pending_jobs.keys()) == ["job-pending"] 180 | assert pending_jobs["job-pending"]["url"] == "http://c.com" 181 | assert successful == ["http://a.com"] 182 | assert failed == ["http://b.com"] 183 | assert requeued == [] 184 | mock_sleep.assert_called_once() 185 | 186 | 187 | @pytest.mark.parametrize( 188 | "status_ext, api_message, expected_outcome, expected_log_level, expected_log_snippet", 189 | [ 190 | ( 191 | "error:service-unavailable", 192 | "Service is down", 193 | "requeue", 194 | logging.WARNING, 195 | TRANSIENT_ERROR_MESSAGES["error:service-unavailable"], 196 | ), 197 | ( 198 | "error:not-found", 199 | "Page not found", 200 | "fail", 201 | logging.ERROR, 202 | PERMANENT_ERROR_MESSAGES["error:not-found"], 203 | ), 204 | ( 205 | "error:some-new-unseen-error", 206 | "A new error", 207 | "fail", 208 | logging.ERROR, 209 | "An unrecoverable error occurred.", 210 | ), 211 | # --- NEW TEST CASE --- 212 | # This simulates the bug: the status_ext is a generic failure, but the 213 | # message contains "RecursionError", which should trigger a requeue. 214 | ( 215 | "error:job-failed", 216 | "encoding with 'idna' codec failed (RecursionError: maximum recursion depth exceeded)", 217 | "requeue", 218 | logging.WARNING, 219 | TRANSIENT_ERROR_MESSAGES["error:recursion-error"], 220 | ), 221 | ], 222 | ) 223 | @mock.patch("wayback_machine_archiver.workflow.time.sleep") 224 | def test_poll_pending_jobs_handles_errors_intelligently( 225 | mock_sleep, 226 | caplog, 227 | status_ext, 228 | api_message, 229 | expected_outcome, 230 | expected_log_level, 231 | expected_log_snippet, 232 | ): 233 | """ 234 | Verify that _poll_pending_jobs correctly categorizes errors as either 235 | transient (re-queue) or permanent (fail) and logs helpful messages. 
236 | """ 237 | mock_client = mock.Mock() 238 | mock_client.check_status_batch.return_value = [ 239 | { 240 | "status": "error", 241 | "job_id": "job-1", 242 | "status_ext": status_ext, 243 | "message": api_message, 244 | } 245 | ] 246 | # --- Use the new data structure for pending_jobs --- 247 | pending_jobs = {"job-1": {"url": "http://example.com", "submitted_at": time.time()}} 248 | 249 | with caplog.at_level(logging.WARNING): 250 | # --- Provide the new required arguments --- 251 | successful, failed, requeued = _poll_pending_jobs( 252 | mock_client, 253 | pending_jobs, 254 | transient_error_retries={}, 255 | max_transient_retries=3, 256 | job_timeout_sec=7200, 257 | ) 258 | 259 | assert not successful 260 | if expected_outcome == "requeue": 261 | assert requeued == ["http://example.com"] 262 | assert not failed 263 | else: # fail 264 | assert not requeued 265 | assert failed == ["http://example.com"] 266 | 267 | assert len(caplog.records) == 1 268 | log_record = caplog.records[0] 269 | assert log_record.levelno == expected_log_level 270 | assert expected_log_snippet in log_record.message 271 | 272 | 273 | # --- Corrected test for run_archive_workflow dynamic polling --- 274 | 275 | 276 | @mock.patch("wayback_machine_archiver.workflow.time.sleep") 277 | @mock.patch("wayback_machine_archiver.workflow._poll_pending_jobs") 278 | @mock.patch("wayback_machine_archiver.workflow._submit_next_url") 279 | def test_run_archive_workflow_dynamic_polling_is_fast_and_correct( 280 | mock_submit, mock_poll, mock_sleep 281 | ): 282 | """ 283 | Verify that the polling wait time increases exponentially when jobs are pending 284 | and the submission queue is empty, and that the test runs quickly. 285 | """ 286 | mock_client = mock.Mock() 287 | initial_urls = ["http://a.com"] 288 | # Use a mutable list for the test to simulate its modification by _submit_next_url 289 | urls_to_process_list = list(initial_urls) 290 | rate_limit_in_sec = 0 291 | api_params = {} 292 | 293 | # Configure mock_submit to simulate a successful submission 294 | # It needs to modify the urls_to_process_list and pending_jobs_dict passed to it 295 | def submit_side_effect(urls_proc, client_arg, pending_jobs_dict, *args, **kwargs): 296 | url = urls_proc.pop(0) # Remove the URL from the list 297 | job_id = f"job-{url}" 298 | # --- Use the new data structure --- 299 | pending_jobs_dict[job_id] = {"url": url, "submitted_at": time.time()} 300 | return job_id 301 | 302 | mock_submit.side_effect = submit_side_effect 303 | 304 | # Configure mock_poll to simulate jobs staying pending, then succeeding 305 | poll_calls = 0 306 | 307 | def poll_side_effect(client_arg, pending_jobs_dict, *args, **kwargs): 308 | nonlocal poll_calls 309 | poll_calls += 1 310 | if poll_calls <= 3: # Simulate pending for 3 calls 311 | return [], [], [] # No success, no failure, no requeue 312 | else: # Simulate success on the 4th call 313 | # --- Extract URLs from the new data structure --- 314 | successful_urls = [job["url"] for job in pending_jobs_dict.values()] 315 | pending_jobs_dict.clear() 316 | return successful_urls, [], [] 317 | 318 | mock_poll.side_effect = poll_side_effect 319 | 320 | # Call the main workflow function 321 | run_archive_workflow( 322 | mock_client, urls_to_process_list, rate_limit_in_sec, api_params 323 | ) 324 | 325 | # Assertions 326 | # Check the calls to time.sleep 327 | # We expect sleep to be called between polling cycles when the submission 328 | # queue is empty. 329 | # Cycle 1: Submits URL. Polls. Loop continues. 
330 | # Cycle 2: No URLs to submit. Polls. Sleeps for 5s. 331 | # Cycle 3: No URLs to submit. Polls. Sleeps for 7s (5 * 1.5). 332 | # Cycle 4: No URLs to submit. Polls. Sleeps for 10s (7 * 1.5). 333 | # Cycle 5: No URLs to submit. Polls (job succeeds). Loop terminates. 334 | # We filter out the small 0.2s sleeps that happen inside _poll_pending_jobs. 335 | sleep_calls = [call[0][0] for call in mock_sleep.call_args_list if call[0][0] > 1] 336 | 337 | assert sleep_calls == [5, 7, 10] 338 | assert mock_submit.call_count == 1 339 | # The poll side effect now runs 4 times to get to the success case 340 | assert mock_poll.call_count == 4 341 | assert not urls_to_process_list # Ensure the initial URL list is empty 342 | 343 | 344 | def test_poll_gives_up_after_max_transient_retries(caplog): 345 | """ 346 | Verify that if a URL fails with a transient error more times than allowed, 347 | it is marked as a permanent failure and not re-queued. 348 | """ 349 | mock_client = mock.Mock() 350 | mock_client.check_status_batch.return_value = [ 351 | { 352 | "status": "error", 353 | "job_id": "job-1", 354 | "status_ext": "error:service-unavailable", # A transient error 355 | "message": "API message", 356 | } 357 | ] 358 | 359 | url = "http://example.com" 360 | max_retries = 3 361 | 362 | # Simulate that this URL has already failed 3 times with a transient error 363 | transient_error_retries = {url: 3} 364 | pending_jobs = {"job-1": {"url": url, "submitted_at": time.time()}} 365 | 366 | with caplog.at_level(logging.INFO): 367 | successful, failed, requeued = _poll_pending_jobs( 368 | mock_client, 369 | pending_jobs, 370 | transient_error_retries, 371 | max_transient_retries=max_retries, 372 | job_timeout_sec=7200, 373 | ) 374 | 375 | # Assertions 376 | assert not successful 377 | assert not requeued, "URL should not have been re-queued" 378 | assert failed == [url], "URL should have been marked as failed" 379 | assert "Marking as a permanent failure" in caplog.text 380 | 381 | 382 | def test_poll_fails_job_after_timeout(caplog): 383 | """ 384 | Verify that a job that remains in a 'pending' state for longer than the 385 | timeout period is marked as a failure. 386 | """ 387 | mock_client = mock.Mock() 388 | mock_client.check_status_batch.return_value = [ 389 | {"status": "pending", "job_id": "job-stuck"} 390 | ] 391 | 392 | url = "http://stuck.com" 393 | timeout_sec = 3600 # 1 hour 394 | 395 | # Simulate a job that was submitted long ago, well before the timeout 396 | stale_timestamp = time.time() - (timeout_sec + 60) 397 | pending_jobs = {"job-stuck": {"url": url, "submitted_at": stale_timestamp}} 398 | 399 | with caplog.at_level(logging.INFO): 400 | successful, failed, requeued = _poll_pending_jobs( 401 | mock_client, 402 | pending_jobs, 403 | transient_error_retries={}, 404 | max_transient_retries=3, 405 | job_timeout_sec=timeout_sec, 406 | ) 407 | 408 | # Assertions 409 | assert not successful 410 | assert not requeued 411 | assert failed == [url], "Stuck job should have been marked as failed" 412 | assert "timed out after being pending" in caplog.text 413 | -------------------------------------------------------------------------------- /src/wayback_machine_archiver/workflow.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | # A set of transient errors that suggest a retry might be successful. 
5 | REQUEUE_ERRORS = { 6 | "error:bad-gateway", 7 | "error:bandwidth-limit-exceeded", 8 | "error:browsing-timeout", 9 | "error:cannot-fetch", 10 | "error:capture-location-error", 11 | "error:celery", 12 | "error:gateway-timeout", 13 | "error:internal-server-error", 14 | "error:invalid-server-response", 15 | "error:job-failed", 16 | "error:no-browsers-available", 17 | "error:protocol-error", 18 | "error:proxy-error", 19 | "error:read-timeout", 20 | "error:recursion-error", 21 | "error:service-unavailable", 22 | "error:soft-time-limit-exceeded", 23 | "error:too-many-requests", 24 | "error:user-session-limit", 25 | } 26 | 27 | # A map of transient error codes to user-friendly, explanatory messages. 28 | TRANSIENT_ERROR_MESSAGES = { 29 | "error:bad-gateway": "The server reported a temporary upstream issue (Bad Gateway).", 30 | "error:bandwidth-limit-exceeded": "The target server has exceeded its bandwidth limit.", 31 | "error:browsing-timeout": "The headless browser timed out, possibly due to high server load.", 32 | "error:cannot-fetch": "The Internet Archive's systems are temporarily overloaded.", 33 | "error:capture-location-error": "An internal Internet Archive system error occurred.", 34 | "error:celery": "An error occurred in the Internet Archive's internal job queue.", 35 | "error:gateway-timeout": "The server reported a temporary upstream timeout (Gateway Timeout).", 36 | "error:internal-server-error": "The Internet Archive's server reported a temporary internal error.", 37 | "error:invalid-server-response": "The target server sent a malformed response, possibly due to a network glitch.", 38 | "error:job-failed": "The capture failed due to a generic Internet Archive system error.", 39 | "error:no-browsers-available": "The Internet Archive's capture browsers are temporarily at capacity.", 40 | "error:protocol-error": "The HTTP connection was broken, likely due to a network issue.", 41 | "error:proxy-error": "An internal Internet Archive proxy error occurred.", 42 | "error:read-timeout": "The connection timed out while reading data from the server.", 43 | "error:recursion-error": "The server encountered a temporary processing error (RecursionError).", 44 | "error:service-unavailable": "The Internet Archive's service is temporarily unavailable.", 45 | "error:soft-time-limit-exceeded": "The capture took too long and was terminated; a retry may succeed.", 46 | "error:too-many-requests": "The target server is rate-limiting requests.", 47 | "error:user-session-limit": "Your Internet Archive account has reached its concurrent job limit.", 48 | } 49 | 50 | # A map of permanent error codes to user-friendly, explanatory messages. 51 | PERMANENT_ERROR_MESSAGES = { 52 | "error:bad-request": "The API reported a bad request. This may be a bug in the archiver script.", 53 | "error:blocked": "The target site is actively blocking the Internet Archive's requests. 
To save the block page, use the --capture-all flag.", 54 | "error:blocked-client-ip": "Your IP address is on a blocklist (e.g., Spamhaus), and the Internet Archive is refusing the request.", 55 | "error:blocked-url": "This URL is on a blocklist (e.g., a tracking domain) and cannot be archived.", 56 | "error:filesize-limit": "The file at this URL is larger than the 2GB limit and cannot be archived.", 57 | "error:ftp-access-denied": "Access to the FTP resource was denied due to a permissions issue.", 58 | "error:http-version-not-supported": "The target server uses an unsupported HTTP version.", 59 | "error:invalid-host-resolution": "The domain name could not be found. Check for typos in the URL.", 60 | "error:invalid-url-syntax": "The URL is malformed. Please check its structure.", 61 | "error:method-not-allowed": "The server forbids the HTTP method used for archiving. To save this error page, use the --capture-all flag.", 62 | "error:network-authentication-required": "A captive portal or proxy is requiring authentication. To save the login page, use the --capture-all flag.", 63 | "error:no-access": "The page is forbidden (403 Forbidden). To save this error page, use the --capture-all flag.", 64 | "error:not-found": "The page could not be found (404 Not Found). To save this error page, use the --capture-all flag.", 65 | "error:not-implemented": "The server does not support the functionality required to archive the page.", 66 | "error:too-many-daily-captures": "This URL has already been captured the maximum number of times today.", 67 | "error:too-many-redirects": "The URL has too many redirects, likely indicating a redirect loop.", 68 | "error:unauthorized": "The page requires a login (401 Unauthorized). To save the login/error page, use the --capture-all flag.", 69 | } 70 | 71 | 72 | def _submit_next_url( 73 | urls_to_process, 74 | client, 75 | pending_jobs, 76 | rate_limit_in_sec, 77 | submission_attempts, 78 | api_params, 79 | max_retries=3, 80 | ): 81 | """ 82 | Pops the next URL, submits it, and adds its job_id to pending_jobs. 83 | Returns 'failed' on a definitive failure, otherwise None. 84 | """ 85 | url = urls_to_process.pop(0) 86 | attempt_num = submission_attempts.get(url, 0) + 1 87 | submission_attempts[url] = attempt_num 88 | 89 | if attempt_num > max_retries: 90 | logging.error("URL %s failed submission %d times, giving up.", url, max_retries) 91 | return "failed" 92 | 93 | try: 94 | logging.info("Submitting %s (attempt %d/%d)...", url, attempt_num, max_retries) 95 | job_id = client.submit_capture( 96 | url, rate_limit_wait=rate_limit_in_sec, api_params=api_params 97 | ) 98 | 99 | if not job_id: 100 | # The API accepted the request but didn't provide a job_id. 101 | # This is treated as a transient error to trigger a retry. 102 | raise ValueError( 103 | "API did not return a job_id, likely due to rate limiting." 104 | ) 105 | 106 | # --- Store a dictionary with URL and timestamp --- 107 | pending_jobs[job_id] = {"url": url, "submitted_at": time.time()} 108 | if url in submission_attempts: 109 | del submission_attempts[url] 110 | 111 | except ValueError as _: 112 | # This block specifically catches the "no job_id" case. 113 | logging.warning( 114 | "Submission for %s was accepted but no job_id was returned. This can happen under high load or due to rate limits. Re-queuing for another attempt.", 115 | url, 116 | ) 117 | urls_to_process.append(url) 118 | 119 | except Exception as e: 120 | # This block now catches all OTHER submission errors (e.g., network). 
121 | logging.warning( 122 | "Failed to submit URL %s due to a connection or API error: %s. Re-queuing for another attempt.", 123 | url, 124 | e, 125 | ) 126 | urls_to_process.append(url) 127 | 128 | return None 129 | 130 | 131 | def _poll_pending_jobs( 132 | client, 133 | pending_jobs, 134 | transient_error_retries, 135 | max_transient_retries, 136 | job_timeout_sec, 137 | poll_interval_sec=0.2, 138 | ): 139 | """ 140 | Checks the status of all pending jobs using a single batch request. 141 | Returns a tuple of (successful_urls, failed_urls, requeued_urls) for completed jobs. 142 | """ 143 | successful_urls = [] 144 | failed_urls = [] 145 | requeued_urls = [] 146 | 147 | # Get all job IDs that need to be checked. 148 | job_ids_to_check = list(pending_jobs.keys()) 149 | if not job_ids_to_check: 150 | return [], [], [] 151 | 152 | try: 153 | # Make a single batch request for all pending jobs. 154 | # The API is expected to return a list of status objects. 155 | batch_statuses = client.check_status_batch(job_ids_to_check) 156 | 157 | # It's possible the API returns a single object if only one job was queried. 158 | if not isinstance(batch_statuses, list): 159 | batch_statuses = [batch_statuses] 160 | 161 | for status_data in batch_statuses: 162 | job_id = status_data.get("job_id") 163 | if not job_id or job_id not in pending_jobs: 164 | continue 165 | 166 | # --- URL is now inside a dictionary --- 167 | original_url = pending_jobs[job_id]["url"] 168 | status = status_data.get("status") 169 | 170 | if status == "success": 171 | timestamp = status_data.get("timestamp") 172 | archive_url = f"https://web.archive.org/web/{timestamp}/{original_url}" 173 | logging.info("Success for job %s: %s", job_id, archive_url) 174 | del pending_jobs[job_id] 175 | successful_urls.append(original_url) 176 | elif status == "error": 177 | status_ext = status_data.get("status_ext") 178 | api_message = status_data.get("message", "Unknown error") 179 | 180 | # The API can return a generic error code for what is actually a transient 181 | # server-side processing error. We check the message for this specific case. 182 | if "RecursionError" in api_message: 183 | status_ext = "error:recursion-error" 184 | 185 | if status_ext in REQUEUE_ERRORS: 186 | # --- Check if this URL has exceeded its transient retry limit --- 187 | retry_count = transient_error_retries.get(original_url, 0) + 1 188 | transient_error_retries[original_url] = retry_count 189 | 190 | if retry_count > max_transient_retries: 191 | logging.error( 192 | "URL %s failed with a transient error %d times. Marking as a permanent failure. (API code: %s)", 193 | original_url, 194 | max_transient_retries, 195 | status_ext, 196 | ) 197 | del pending_jobs[job_id] 198 | failed_urls.append(original_url) 199 | else: 200 | # --- This is the original re-queue logic --- 201 | helpful_message = TRANSIENT_ERROR_MESSAGES.get( 202 | status_ext, "A transient error occurred." 203 | ) 204 | logging.warning( 205 | "Transient error for %s: %s Re-queuing for another attempt (%d/%d). (API code: %s)", 206 | original_url, 207 | helpful_message, 208 | retry_count, 209 | max_transient_retries, 210 | status_ext, 211 | ) 212 | del pending_jobs[job_id] 213 | requeued_urls.append(original_url) 214 | else: 215 | # Look up the helpful message, with a fallback for unknown permanent errors. 216 | helpful_message = PERMANENT_ERROR_MESSAGES.get( 217 | status_ext, "An unrecoverable error occurred." 
218 | ) 219 | logging.error( 220 | "Permanent error for %s: %s (API message: %s)", 221 | original_url, 222 | helpful_message, 223 | api_message, 224 | ) 225 | del pending_jobs[job_id] 226 | failed_urls.append(original_url) 227 | else: 228 | # --- Check for job timeout if status is pending --- 229 | submitted_at = pending_jobs[job_id]["submitted_at"] 230 | job_age = time.time() - submitted_at 231 | if job_age > job_timeout_sec: 232 | logging.error( 233 | "Job for %s timed out after being pending for over %d seconds. Marking as failed.", 234 | original_url, 235 | job_timeout_sec, 236 | ) 237 | del pending_jobs[job_id] 238 | failed_urls.append(original_url) 239 | else: 240 | logging.debug( 241 | "Job %s (%s) is still pending...", job_id, original_url 242 | ) 243 | 244 | except Exception as e: 245 | logging.error( 246 | "An exception occurred during batch polling: %s. Clearing all pending jobs for this cycle to prevent loops.", 247 | e, 248 | ) 249 | # --- Must extract URLs from the dictionary values --- 250 | failed_urls.extend([job["url"] for job in pending_jobs.values()]) 251 | pending_jobs.clear() 252 | 253 | # A short sleep after each batch poll to be nice to the API. 254 | time.sleep(poll_interval_sec) 255 | 256 | return successful_urls, failed_urls, requeued_urls 257 | 258 | 259 | def run_archive_workflow(client, urls_to_process, rate_limit_in_sec, api_params): 260 | """Manages the main loop for submitting and polling URLs.""" 261 | pending_jobs = {} 262 | submission_attempts = {} 263 | # --- Dictionary to track retries for transient polling errors --- 264 | transient_error_retries = {} 265 | MAX_TRANSIENT_RETRIES = 3 266 | # --- Timeout for jobs stuck in pending state --- 267 | JOB_TIMEOUT_SEC = 7200 # 2 hours 268 | 269 | total_urls = len(urls_to_process) 270 | success_count = 0 271 | failure_count = 0 272 | 273 | # --- Variables for dynamic polling --- 274 | INITIAL_POLLING_WAIT = 5 275 | MAX_POLLING_WAIT = 60 276 | POLLING_BACKOFF_FACTOR = 1.5 277 | polling_wait_time = INITIAL_POLLING_WAIT 278 | 279 | logging.info( 280 | "Beginning interleaved submission and polling of %d URLs...", 281 | total_urls, 282 | ) 283 | 284 | while urls_to_process or pending_jobs: 285 | if urls_to_process: 286 | status = _submit_next_url( 287 | urls_to_process, 288 | client, 289 | pending_jobs, 290 | rate_limit_in_sec, 291 | submission_attempts, 292 | api_params, 293 | ) 294 | if status == "failed": 295 | failure_count += 1 296 | # Reset polling wait time after a new submission 297 | polling_wait_time = INITIAL_POLLING_WAIT 298 | 299 | if pending_jobs: 300 | # --- Pass job timeout to the polling function --- 301 | successful, failed, requeued = _poll_pending_jobs( 302 | client, 303 | pending_jobs, 304 | transient_error_retries, 305 | MAX_TRANSIENT_RETRIES, 306 | JOB_TIMEOUT_SEC, 307 | ) 308 | success_count += len(successful) 309 | failure_count += len(failed) 310 | if requeued: 311 | urls_to_process.extend(requeued) 312 | logging.info( 313 | "Re-queued %d URLs due to transient API errors.", len(requeued) 314 | ) 315 | 316 | if not urls_to_process and pending_jobs: 317 | logging.info( 318 | "%d captures remaining, starting next polling cycle in %d seconds...", 319 | len(pending_jobs), 320 | polling_wait_time, 321 | ) 322 | time.sleep(polling_wait_time) 323 | # Increase wait time for the next cycle 324 | polling_wait_time = min( 325 | int(polling_wait_time * POLLING_BACKOFF_FACTOR), MAX_POLLING_WAIT 326 | ) 327 | 328 | logging.info("--------------------------------------------------") 329 | 
logging.info("Archive workflow complete.") 330 | logging.info(f"Total URLs processed: {total_urls}") 331 | logging.info(f"Successful captures: {success_count}") 332 | logging.info(f"Failed captures: {failure_count}") 333 | logging.info("--------------------------------------------------") 334 | --------------------------------------------------------------------------------