├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── build-test.yml │ ├── codeql-analysis.yml │ ├── python-publish.yml │ └── unit-test.yml ├── .gitignore ├── .pep8speaks.yml ├── .whitesource ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── _config.yml ├── assets └── waybackpy_logo.svg ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── snapcraft.yaml ├── tests ├── __init__.py ├── test_availability_api.py ├── test_cdx_api.py ├── test_cdx_snapshot.py ├── test_cdx_utils.py ├── test_cli.py ├── test_save_api.py ├── test_utils.py └── test_wrapper.py └── waybackpy ├── __init__.py ├── availability_api.py ├── cdx_api.py ├── cdx_snapshot.py ├── cdx_utils.py ├── cli.py ├── exceptions.py ├── py.typed ├── save_api.py ├── utils.py └── wrapper.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: akamhy 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | 1. Go to '...' 17 | 2. Click on '....' 18 | 3. Scroll down to '....' 19 | 4. See error 20 | 21 | **Expected behavior** 22 | A clear and concise description of what you expected to happen. 23 | 24 | **Screenshots** 25 | If applicable, add screenshots to help explain your problem. 26 | 27 | **Version:** 28 | 29 | - OS: [e.g. iOS] 30 | - Version [e.g. 22] 31 | - Is latest version? [e.g. Yes/No] 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 
35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: akamhy 7 | --- 8 | 9 | **Is your feature request related to a problem? Please describe.** 10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | 12 | **Describe the solution you'd like** 13 | A clear and concise description of what you want to happen. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the feature request here. 20 | -------------------------------------------------------------------------------- /.github/workflows/build-test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ['3.7', '3.10'] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -U setuptools wheel 28 | - name: Build test the package 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | 
-------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '30 6 * * 1' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 
51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/unit-test.yml: 
-------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Tests 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ['3.10'] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install '.[dev]' 29 | - name: Lint with flake8 30 | run: | 31 | flake8 . --count --show-source --statistics 32 | - name: Lint with black 33 | run: | 34 | black . --check --diff 35 | - name: Static type test with mypy 36 | run: | 37 | mypy -p waybackpy -p tests 38 | - name: Test with pytest 39 | run: | 40 | pytest 41 | - name: Upload coverage to Codecov 42 | run: | 43 | bash <(curl -s https://codecov.io/bash) -t ${{ secrets.CODECOV_TOKEN }} 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files generated while testing 2 | *-urls-*.txt 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these 
files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /.pep8speaks.yml: -------------------------------------------------------------------------------- 1 | scanner: 2 | diff_only: True 3 | linter: flake8 4 | 5 | flake8: 6 | max-line-length: 88 7 | extend-ignore: W503,W605 8 | -------------------------------------------------------------------------------- /.whitesource: -------------------------------------------------------------------------------- 1 | { 2 | "scanSettings": { 3 | "baseBranches": [] 4 | }, 5 | "checkRunSettings": { 6 | "vulnerableCheckRunConclusionLevel": "failure", 7 | "displayMode": "diff" 8 | }, 9 | "issueSettings": { 10 | "minSeverityLevel": "LOW" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | title: waybackpy 4 | abstract: "Python package that interfaces with the Internet Archive's Wayback Machine APIs. Archive pages and retrieve archived pages easily." 
5 | version: '3.0.6' 6 | doi: 10.5281/ZENODO.3977276 7 | date-released: 2022-03-15 8 | type: software 9 | authors: 10 | - given-names: Akash 11 | family-names: Mahanty 12 | email: akamhy@yahoo.com 13 | orcid: https://orcid.org/0000-0003-2482-8227 14 | keywords: 15 | - Archive Website 16 | - Wayback Machine 17 | - Internet Archive 18 | - Wayback Machine CLI 19 | - Wayback Machine Python 20 | - Internet Archiving 21 | - Availability API 22 | - CDX API 23 | - savepagenow 24 | license: MIT 25 | repository-code: "https://github.com/akamhy/waybackpy" 26 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | akamhy@yahoo.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | <https://www.contributor-covenant.org/version/2/0/code_of_conduct.html>. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | <https://www.contributor-covenant.org/faq>. Translations are available at 128 | <https://www.contributor-covenant.org/translations>. 129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome to the waybackpy contributing guide 2 | 3 | 4 | ## Getting started 5 | 6 | Read our [Code of Conduct](./CODE_OF_CONDUCT.md). 7 | 8 | ## Creating an issue 9 | 10 | It's a good idea to open an issue and discuss suspected bugs and new feature ideas with the maintainers. Somebody might be working on your bug/idea and it would be best to discuss it to avoid wasting your time. It is a recommendation. You may avoid creating an issue and directly open pull requests. 11 | 12 | ## Fork this repository 13 | 14 | Fork this repository. 
See '[Fork a repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo)' for help forking this repository on GitHub. 15 | 16 | ## Make changes to the forked copy 17 | 18 | Make the required changes to your forked copy of waybackpy. Please don't forget to add or update comments and docstrings. 19 | 20 | ## Add tests for your changes 21 | 22 | You have made the required changes to the codebase, now go ahead and add tests for newly written methods/functions and update the tests of code that you changed. 23 | 24 | ## Testing and Linting 25 | 26 | You must run the tests and linter on your changes before opening a pull request. 27 | 28 | ### pytest 29 | 30 | Runs all the tests in the tests directory. pytest is a mature full-featured Python testing tool. 31 | ```bash 32 | pytest 33 | ``` 34 | 35 | ### mypy 36 | 37 | Mypy is a static type checker for Python. Type checkers help ensure that you're using variables and functions in your code correctly. 38 | ```bash 39 | mypy -p waybackpy -p tests 40 | ``` 41 | 42 | ### black 43 | 44 | After testing with pytest and type checking with mypy, run black on the code base. The codestyle used by the project is 'black'. 45 | 46 | ```bash 47 | black . 48 | ``` 49 | 50 | ## Create a pull request 51 | 52 | Read [Creating a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request). 53 | 54 | Try to make sure that all automated tests are passing, and if some of them do not pass then don't worry. Tests are meant to catch bugs and a failed test is better than introducing bugs to the master branch. 
55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2022 waybackpy contributors ( https://github.com/akamhy/waybackpy/graphs/contributors ) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 |
5 | 6 |

Python package & CLI tool that interfaces with the Wayback Machine APIs

7 | 8 |
9 | 10 |

11 | Unit Tests 12 | codecov 13 | pypi 14 | Downloads 15 | Codacy Badge 16 | GitHub latest commit 17 | PyPI - Python Version 18 | Code style: black 19 |

20 | 21 | --- 22 | 23 | # Introduction 24 | 25 | Waybackpy is a Python package and a CLI tool that interfaces with the Wayback Machine APIs. 26 | 27 | Internet Archive's Wayback Machine has 3 useful public APIs. 28 | 29 | - SavePageNow or Save API 30 | - CDX Server API 31 | - Availability API 32 | 33 | These three APIs can be accessed via waybackpy, either by importing it in a Python file/module or from the command-line interface. 34 | 35 | ## Installation 36 | 37 | **Using [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)), from [PyPI](https://pypi.org/) (recommended)**: 38 | 39 | ```bash 40 | pip install waybackpy -U 41 | ``` 42 | 43 | **Using [conda](https://en.wikipedia.org/wiki/Conda_(package_manager)), from [conda-forge](https://anaconda.org/conda-forge/waybackpy) (recommended)**: 44 | 45 | See also [waybackpy feedstock](https://github.com/conda-forge/waybackpy-feedstock), maintainers are [@rafaelrdealmeida](https://github.com/rafaelrdealmeida/), 46 | [@labriunesp](https://github.com/labriunesp/) 47 | and [@akamhy](https://github.com/akamhy/). 48 | 49 | ```bash 50 | conda install -c conda-forge waybackpy 51 | ``` 52 | 53 | **Install directly from [this git repository](https://github.com/akamhy/waybackpy) (NOT recommended)**: 54 | 55 | ```bash 56 | pip install git+https://github.com/akamhy/waybackpy.git 57 | ``` 58 | 59 | ## Docker Image 60 | 61 | Docker Hub: [hub.docker.com/r/secsi/waybackpy](https://hub.docker.com/r/secsi/waybackpy) 62 | 63 | Docker image is automatically updated on every release by [Regularly and Automatically Updated Docker Images](https://github.com/cybersecsi/RAUDI) (RAUDI). 64 | 65 | RAUDI is a tool by [SecSI](https://secsi.io), an Italian cybersecurity startup. 
66 | 67 | ## Usage 68 | 69 | ### As a Python package 70 | 71 | #### Save API aka SavePageNow 72 | 73 | ```python 74 | >>> from waybackpy import WaybackMachineSaveAPI 75 | >>> url = "https://github.com" 76 | >>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" 77 | >>> 78 | >>> save_api = WaybackMachineSaveAPI(url, user_agent) 79 | >>> save_api.save() 80 | https://web.archive.org/web/20220118125249/https://github.com/ 81 | >>> save_api.cached_save 82 | False 83 | >>> save_api.timestamp() 84 | datetime.datetime(2022, 1, 18, 12, 52, 49) 85 | ``` 86 | 87 | #### CDX API aka CDXServerAPI 88 | 89 | ```python 90 | >>> from waybackpy import WaybackMachineCDXServerAPI 91 | >>> url = "https://google.com" 92 | >>> user_agent = "my new app's user agent" 93 | >>> cdx_api = WaybackMachineCDXServerAPI(url, user_agent) 94 | ``` 95 | ##### oldest 96 | ```python 97 | >>> cdx_api.oldest() 98 | com,google)/ 19981111184551 http://google.com:80/ text/html 200 HOQ2TGPYAEQJPNUA6M4SMZ3NGQRBXDZ3 381 99 | >>> oldest = cdx_api.oldest() 100 | >>> oldest 101 | com,google)/ 19981111184551 http://google.com:80/ text/html 200 HOQ2TGPYAEQJPNUA6M4SMZ3NGQRBXDZ3 381 102 | >>> oldest.archive_url 103 | 'https://web.archive.org/web/19981111184551/http://google.com:80/' 104 | >>> oldest.original 105 | 'http://google.com:80/' 106 | >>> oldest.urlkey 107 | 'com,google)/' 108 | >>> oldest.timestamp 109 | '19981111184551' 110 | >>> oldest.datetime_timestamp 111 | datetime.datetime(1998, 11, 11, 18, 45, 51) 112 | >>> oldest.statuscode 113 | '200' 114 | >>> oldest.mimetype 115 | 'text/html' 116 | ``` 117 | ##### newest 118 | ```python 119 | >>> newest = cdx_api.newest() 120 | >>> newest 121 | com,google)/ 20220217234427 http://@google.com/ text/html 301 Y6PVK4XWOI3BXQEXM5WLLWU5JKUVNSFZ 563 122 | >>> newest.archive_url 123 | 'https://web.archive.org/web/20220217234427/http://@google.com/' 124 | >>> newest.timestamp 125 | '20220217234427' 126 | ``` 127 | ##### near 128 | ```python 
129 | >>> near = cdx_api.near(year=2010, month=10, day=10, hour=10, minute=10) 130 | >>> near.archive_url 131 | 'https://web.archive.org/web/20101010101435/http://google.com/' 132 | >>> near 133 | com,google)/ 20101010101435 http://google.com/ text/html 301 Y6PVK4XWOI3BXQEXM5WLLWU5JKUVNSFZ 391 134 | >>> near.timestamp 135 | '20101010101435' 136 | >>> near.timestamp 137 | '20101010101435' 138 | >>> near = cdx_api.near(wayback_machine_timestamp=2008080808) 139 | >>> near.archive_url 140 | 'https://web.archive.org/web/20080808051143/http://google.com/' 141 | >>> near = cdx_api.near(unix_timestamp=1286705410) 142 | >>> near 143 | com,google)/ 20101010101435 http://google.com/ text/html 301 Y6PVK4XWOI3BXQEXM5WLLWU5JKUVNSFZ 391 144 | >>> near.archive_url 145 | 'https://web.archive.org/web/20101010101435/http://google.com/' 146 | >>> 147 | ``` 148 | ##### snapshots 149 | ```python 150 | >>> from waybackpy import WaybackMachineCDXServerAPI 151 | >>> url = "https://pypi.org" 152 | >>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" 153 | >>> cdx = WaybackMachineCDXServerAPI(url, user_agent, start_timestamp=2016, end_timestamp=2017) 154 | >>> for item in cdx.snapshots(): 155 | ... print(item.archive_url) 156 | ... 157 | https://web.archive.org/web/20160110011047/http://pypi.org/ 158 | https://web.archive.org/web/20160305104847/http://pypi.org/ 159 | . 160 | . # URLS REDACTED FOR READABILITY 161 | . 162 | https://web.archive.org/web/20171127171549/https://pypi.org/ 163 | https://web.archive.org/web/20171206002737/http://pypi.org:80/ 164 | ``` 165 | 166 | #### Availability API 167 | 168 | It is recommended to not use the availability API due to performance issues. All the methods of availability API interface class, `WaybackMachineAvailabilityAPI`, are also implemented in the CDX server API interface class, `WaybackMachineCDXServerAPI`. 
Also note 169 | that the snapshot returned by the `newest()` method of `WaybackMachineAvailabilityAPI` can be more recent than the one returned by the same method of `WaybackMachineCDXServerAPI`. 170 | 171 | ```python 172 | >>> from waybackpy import WaybackMachineAvailabilityAPI 173 | >>> 174 | >>> url = "https://google.com" 175 | >>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0" 176 | >>> 177 | >>> availability_api = WaybackMachineAvailabilityAPI(url, user_agent) 178 | ``` 179 | ##### oldest 180 | ```python 181 | >>> availability_api.oldest() 182 | https://web.archive.org/web/19981111184551/http://google.com:80/ 183 | ``` 184 | ##### newest 185 | ```python 186 | >>> availability_api.newest() 187 | https://web.archive.org/web/20220118150444/https://www.google.com/ 188 | ``` 189 | ##### near 190 | ```python 191 | >>> availability_api.near(year=2010, month=10, day=10, hour=10) 192 | https://web.archive.org/web/20101010101708/http://www.google.com/ 193 | ``` 194 | 195 | > Documentation is at <https://github.com/akamhy/waybackpy/wiki/Python-package-docs>. 196 | 197 | ### As a CLI tool 198 | 199 | Demo video on [asciinema.org](https://asciinema.org/a/469890), you can copy the text from video: 200 | 201 | [![asciicast](https://asciinema.org/a/469890.svg)](https://asciinema.org/a/469890) 202 | 203 | > CLI documentation is at <https://github.com/akamhy/waybackpy/wiki/CLI-docs>. 204 | 205 | 206 | ## CONTRIBUTORS 207 | 208 | ### AUTHORS 209 | 210 | - akamhy (<https://github.com/akamhy>) 211 | - eggplants (<https://github.com/eggplants>) 212 | - danvalen1 (<https://github.com/danvalen1>) 213 | - AntiCompositeNumber (<https://github.com/AntiCompositeNumber>) 214 | - rafaelrdealmeida (<https://github.com/rafaelrdealmeida>) 215 | - jonasjancarik (<https://github.com/jonasjancarik>) 216 | - jfinkhaeuser (<https://github.com/jfinkhaeuser>) 217 | 218 | ### ACKNOWLEDGEMENTS 219 | 220 | - mhmdiaa (<https://github.com/mhmdiaa>) `--known-urls` is based on [this](https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050) gist. 221 | - dequeued0 (<https://github.com/dequeued0>) for reporting bugs and useful feature requests. 
222 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman 2 | -------------------------------------------------------------------------------- /assets/waybackpy_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["wheel", "setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | click 3 | codecov 4 | flake8 5 | mypy 6 | pytest 7 | pytest-cov 8 | requests 9 | setuptools>=46.4.0 10 | types-requests 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | requests 3 | urllib3 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = waybackpy 3 | version = attr: waybackpy.__version__ 4 | description = Python package that interfaces with the Internet Archive's Wayback Machine APIs. Archive pages and retrieve archived pages easily. 
5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | license = MIT 8 | author = Akash Mahanty 9 | author_email = akamhy@yahoo.com 10 | url = https://akamhy.github.io/waybackpy/ 11 | download_url = https://github.com/akamhy/waybackpy/releases 12 | project_urls = 13 | Documentation = https://github.com/akamhy/waybackpy/wiki 14 | Source = https://github.com/akamhy/waybackpy 15 | Tracker = https://github.com/akamhy/waybackpy/issues 16 | keywords = 17 | Archive Website 18 | Wayback Machine 19 | Internet Archive 20 | Wayback Machine CLI 21 | Wayback Machine Python 22 | Internet Archiving 23 | Availability API 24 | CDX API 25 | savepagenow 26 | classifiers = 27 | Development Status :: 5 - Production/Stable 28 | Intended Audience :: Developers 29 | Intended Audience :: End Users/Desktop 30 | Natural Language :: English 31 | Typing :: Typed 32 | License :: OSI Approved :: MIT License 33 | Programming Language :: Python 34 | Programming Language :: Python :: 3 35 | Programming Language :: Python :: 3.6 36 | Programming Language :: Python :: 3.7 37 | Programming Language :: Python :: 3.8 38 | Programming Language :: Python :: 3.9 39 | Programming Language :: Python :: 3.10 40 | Programming Language :: Python :: 3.11 41 | Programming Language :: Python :: Implementation :: CPython 42 | 43 | [options] 44 | packages = find: 45 | include-package-data = True 46 | python_requires = >= 3.6 47 | install_requires = 48 | click 49 | requests 50 | urllib3 51 | 52 | [options.package_data] 53 | waybackpy = py.typed 54 | 55 | [options.extras_require] 56 | dev = 57 | black 58 | codecov 59 | flake8 60 | mypy 61 | pytest 62 | pytest-cov 63 | setuptools>=46.4.0 64 | types-requests 65 | 66 | [options.entry_points] 67 | console_scripts = 68 | waybackpy = waybackpy.cli:main 69 | 70 | [isort] 71 | profile = black 72 | 73 | [flake8] 74 | indent-size = 4 75 | max-line-length = 88 76 | extend-ignore = W503,W605 77 | exclude = 78 | venv 79 | __pycache__ 80 | 
.venv 81 | ./env 82 | venv/ 83 | env 84 | .env 85 | ./build 86 | 87 | [mypy] 88 | python_version = 3.9 89 | show_error_codes = True 90 | pretty = True 91 | strict = True 92 | 93 | [tool:pytest] 94 | addopts = 95 | # show summary of all tests that did not pass 96 | -ra 97 | # enable all warnings 98 | -Wd 99 | # coverage and html report 100 | --cov=waybackpy 101 | --cov-report=html 102 | testpaths = 103 | tests 104 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /snapcraft.yaml: -------------------------------------------------------------------------------- 1 | name: waybackpy 2 | summary: Wayback Machine API interface and a command-line tool 3 | description: | 4 | Waybackpy is a CLI tool that interfaces with the Wayback Machine APIs. 5 | Wayback Machine has three client side public APIs, Save API, 6 | Availability API and CDX API. These three APIs can be accessed via 7 | the waybackpy from the terminal. 
8 | version: git 9 | grade: stable 10 | confinement: strict 11 | base: core20 12 | architectures: 13 | - build-on: [arm64, armhf, amd64] 14 | 15 | apps: 16 | waybackpy: 17 | command: bin/waybackpy 18 | plugs: [home, network, network-bind, removable-media] 19 | 20 | parts: 21 | waybackpy: 22 | plugin: python 23 | source: https://github.com/akamhy/waybackpy.git 24 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamhy/waybackpy/3b3e78d901a600bb22943202c6a8981ca04a5e48/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_availability_api.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | from datetime import datetime, timedelta 4 | 5 | import pytest 6 | 7 | from waybackpy.availability_api import WaybackMachineAvailabilityAPI 8 | from waybackpy.exceptions import ( 9 | ArchiveNotInAvailabilityAPIResponse, 10 | InvalidJSONInAvailabilityAPIResponse, 11 | ) 12 | 13 | now = datetime.utcnow() 14 | url = "https://example.com/" 15 | user_agent = ( 16 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " 17 | "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" 18 | ) 19 | 20 | 21 | def rndstr(n: int) -> str: 22 | return "".join( 23 | random.choice(string.ascii_uppercase + string.digits) for _ in range(n) 24 | ) 25 | 26 | 27 | def test_oldest() -> None: 28 | """ 29 | Test the oldest archive of Google.com and also checks the attributes. 
30 | """ 31 | url = "https://example.com/" 32 | user_agent = ( 33 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " 34 | "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36" 35 | ) 36 | availability_api = WaybackMachineAvailabilityAPI(url, user_agent) 37 | oldest = availability_api.oldest() 38 | oldest_archive_url = oldest.archive_url 39 | assert "2002" in oldest_archive_url 40 | oldest_timestamp = oldest.timestamp() 41 | assert abs(oldest_timestamp - now) > timedelta(days=7000) # More than 19 years 42 | assert ( 43 | availability_api.json is not None 44 | and availability_api.json["archived_snapshots"]["closest"]["available"] is True 45 | ) 46 | assert repr(oldest).find("example.com") != -1 47 | assert "2002" in str(oldest) 48 | 49 | 50 | def test_newest() -> None: 51 | """ 52 | Assumes that the most recent YouTube archive was made no earlier than 53 | three days ago (86400 * 3 seconds). 54 | """ 55 | url = "https://www.youtube.com/" 56 | user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0" 57 | availability_api = WaybackMachineAvailabilityAPI(url, user_agent) 58 | newest = availability_api.newest() 59 | newest_timestamp = newest.timestamp() 60 | # betting in favor that latest youtube archive was not before the last 3 days 61 | # high traffic sites like youtube are archived many times a day, so seems 62 | # very reasonable to me. 63 | assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3) 64 | 65 | 66 | def test_invalid_json() -> None: 67 | """ 68 | When the API is malfunctioning or we don't pass a URL, 69 | it may return invalid JSON data. 
70 | """ 71 | with pytest.raises(InvalidJSONInAvailabilityAPIResponse): 72 | availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent) 73 | _ = availability_api.archive_url 74 | 75 | 76 | def test_no_archive() -> None: 77 | """ 78 | ArchiveNotInAvailabilityAPIResponse may be raised if Wayback Machine did not 79 | reply with the archive despite the fact that we know the site has millions 80 | of archives. Don't know the reason for this weird behavior. 81 | 82 | And also if really there are no archives for the passed URL this exception 83 | is raised. 84 | """ 85 | with pytest.raises(ArchiveNotInAvailabilityAPIResponse): 86 | availability_api = WaybackMachineAvailabilityAPI( 87 | url=f"https://{rndstr(30)}.cn", user_agent=user_agent 88 | ) 89 | _ = availability_api.archive_url 90 | 91 | 92 | def test_no_api_call_str_repr() -> None: 93 | """ 94 | Some entitled users may want to see what is the string representation 95 | if they don’t make any API requests. 96 | 97 | str() must not return None so we return "" 98 | """ 99 | availability_api = WaybackMachineAvailabilityAPI( 100 | url=f"https://{rndstr(30)}.gov", user_agent=user_agent 101 | ) 102 | assert str(availability_api) == "" 103 | 104 | 105 | def test_no_call_timestamp() -> None: 106 | """ 107 | If no API requests were made the bound timestamp() method returns 108 | the datetime.max as a default value. 
109 | """ 110 | availability_api = WaybackMachineAvailabilityAPI( 111 | url=f"https://{rndstr(30)}.in", user_agent=user_agent 112 | ) 113 | assert datetime.max == availability_api.timestamp() 114 | -------------------------------------------------------------------------------- /tests/test_cdx_api.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import pytest 5 | 6 | from waybackpy.cdx_api import WaybackMachineCDXServerAPI 7 | from waybackpy.exceptions import NoCDXRecordFound 8 | 9 | 10 | def rndstr(n: int) -> str: 11 | return "".join( 12 | random.choice(string.ascii_uppercase + string.digits) for _ in range(n) 13 | ) 14 | 15 | 16 | def test_a() -> None: 17 | user_agent = ( 18 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " 19 | "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" 20 | ) 21 | url = "https://twitter.com/jack" 22 | 23 | wayback = WaybackMachineCDXServerAPI( 24 | url=url, 25 | user_agent=user_agent, 26 | match_type="prefix", 27 | collapses=["urlkey"], 28 | start_timestamp="201001", 29 | end_timestamp="201002", 30 | ) 31 | # timeframe bound prefix matching enabled along with active urlkey based collapsing 32 | 33 | snapshots = wayback.snapshots() # 34 | 35 | for snapshot in snapshots: 36 | assert snapshot.timestamp.startswith("2010") 37 | 38 | 39 | def test_b() -> None: 40 | user_agent = ( 41 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " 42 | "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" 43 | ) 44 | url = "https://www.google.com" 45 | 46 | wayback = WaybackMachineCDXServerAPI( 47 | url=url, 48 | user_agent=user_agent, 49 | start_timestamp="202101", 50 | end_timestamp="202112", 51 | collapses=["urlkey"], 52 | ) 53 | # timeframe bound prefix matching enabled along with active urlkey based collapsing 54 | 55 | snapshots = wayback.snapshots() # 56 | 57 | for snapshot in snapshots: 58 | assert snapshot.timestamp.startswith("2021") 
59 | 60 | 61 | def test_c() -> None: 62 | user_agent = ( 63 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " 64 | "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" 65 | ) 66 | url = "https://www.google.com" 67 | 68 | cdx = WaybackMachineCDXServerAPI( 69 | url=url, 70 | user_agent=user_agent, 71 | closest="201010101010", 72 | sort="closest", 73 | limit="1", 74 | ) 75 | snapshots = cdx.snapshots() 76 | for snapshot in snapshots: 77 | archive_url = snapshot.archive_url 78 | timestamp = snapshot.timestamp 79 | break 80 | 81 | assert str(archive_url).find("google.com") 82 | assert "20101010" in timestamp 83 | 84 | 85 | def test_d() -> None: 86 | user_agent = ( 87 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " 88 | "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" 89 | ) 90 | 91 | cdx = WaybackMachineCDXServerAPI( 92 | url="akamhy.github.io", 93 | user_agent=user_agent, 94 | match_type="prefix", 95 | use_pagination=True, 96 | filters=["statuscode:200"], 97 | ) 98 | snapshots = cdx.snapshots() 99 | 100 | count = 0 101 | for snapshot in snapshots: 102 | count += 1 103 | assert str(snapshot.archive_url).find("akamhy.github.io") 104 | assert count > 50 105 | 106 | 107 | def test_oldest() -> None: 108 | user_agent = ( 109 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " 110 | "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" 111 | ) 112 | 113 | cdx = WaybackMachineCDXServerAPI( 114 | url="google.com", 115 | user_agent=user_agent, 116 | filters=["statuscode:200"], 117 | ) 118 | oldest = cdx.oldest() 119 | assert "1998" in oldest.timestamp 120 | assert "google" in oldest.urlkey 121 | assert oldest.original.find("google.com") != -1 122 | assert oldest.archive_url.find("google.com") != -1 123 | 124 | 125 | def test_newest() -> None: 126 | user_agent = ( 127 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " 128 | "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" 129 | ) 130 | 131 | cdx = 
WaybackMachineCDXServerAPI( 132 | url="google.com", 133 | user_agent=user_agent, 134 | filters=["statuscode:200"], 135 | ) 136 | newest = cdx.newest() 137 | assert "google" in newest.urlkey 138 | assert newest.original.find("google.com") != -1 139 | assert newest.archive_url.find("google.com") != -1 140 | 141 | 142 | def test_near() -> None: 143 | user_agent = ( 144 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " 145 | "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" 146 | ) 147 | 148 | cdx = WaybackMachineCDXServerAPI( 149 | url="google.com", 150 | user_agent=user_agent, 151 | filters=["statuscode:200"], 152 | ) 153 | near = cdx.near(year=2010, month=10, day=10, hour=10, minute=10) 154 | assert "2010101010" in near.timestamp 155 | assert "google" in near.urlkey 156 | assert near.original.find("google.com") != -1 157 | assert near.archive_url.find("google.com") != -1 158 | 159 | near = cdx.near(wayback_machine_timestamp="201010101010") 160 | assert "2010101010" in near.timestamp 161 | assert "google" in near.urlkey 162 | assert near.original.find("google.com") != -1 163 | assert near.archive_url.find("google.com") != -1 164 | 165 | near = cdx.near(unix_timestamp=1286705410) 166 | assert "2010101010" in near.timestamp 167 | assert "google" in near.urlkey 168 | assert near.original.find("google.com") != -1 169 | assert near.archive_url.find("google.com") != -1 170 | 171 | with pytest.raises(NoCDXRecordFound): 172 | dne_url = f"https://{rndstr(30)}.in" 173 | cdx = WaybackMachineCDXServerAPI( 174 | url=dne_url, 175 | user_agent=user_agent, 176 | filters=["statuscode:200"], 177 | ) 178 | cdx.near(unix_timestamp=1286705410) 179 | 180 | 181 | def test_before() -> None: 182 | user_agent = ( 183 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " 184 | "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" 185 | ) 186 | 187 | cdx = WaybackMachineCDXServerAPI( 188 | url="http://www.google.com/", 189 | user_agent=user_agent, 190 | 
filters=["statuscode:200"], 191 | ) 192 | before = cdx.before(wayback_machine_timestamp=20160731235949) 193 | assert "20160731233347" in before.timestamp 194 | assert "google" in before.urlkey 195 | assert before.original.find("google.com") != -1 196 | assert before.archive_url.find("google.com") != -1 197 | 198 | 199 | def test_after() -> None: 200 | user_agent = ( 201 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) " 202 | "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1" 203 | ) 204 | 205 | cdx = WaybackMachineCDXServerAPI( 206 | url="http://www.google.com/", 207 | user_agent=user_agent, 208 | filters=["statuscode:200"], 209 | ) 210 | after = cdx.after(wayback_machine_timestamp=20160731235949) 211 | assert "20160801000917" in after.timestamp, after.timestamp 212 | assert "google" in after.urlkey 213 | assert after.original.find("google.com") != -1 214 | assert after.archive_url.find("google.com") != -1 215 | -------------------------------------------------------------------------------- /tests/test_cdx_snapshot.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from waybackpy.cdx_snapshot import CDXSnapshot 4 | 5 | 6 | def test_CDXSnapshot() -> None: 7 | sample_input = ( 8 | "org,archive)/ 20080126045828 http://github.com " 9 | "text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415" 10 | ) 11 | prop_values = sample_input.split(" ") 12 | properties = {} 13 | ( 14 | properties["urlkey"], 15 | properties["timestamp"], 16 | properties["original"], 17 | properties["mimetype"], 18 | properties["statuscode"], 19 | properties["digest"], 20 | properties["length"], 21 | ) = prop_values 22 | 23 | snapshot = CDXSnapshot(properties) 24 | 25 | assert properties["urlkey"] == snapshot.urlkey 26 | assert properties["timestamp"] == snapshot.timestamp 27 | assert properties["original"] == snapshot.original 28 | assert properties["mimetype"] == snapshot.mimetype 29 | assert 
properties["statuscode"] == snapshot.statuscode 30 | assert properties["digest"] == snapshot.digest 31 | assert properties["length"] == snapshot.length 32 | assert ( 33 | datetime.strptime(properties["timestamp"], "%Y%m%d%H%M%S") 34 | == snapshot.datetime_timestamp 35 | ) 36 | archive_url = ( 37 | "https://web.archive.org/web/" 38 | + properties["timestamp"] 39 | + "/" 40 | + properties["original"] 41 | ) 42 | assert archive_url == snapshot.archive_url 43 | assert sample_input == str(snapshot) 44 | assert sample_input == repr(snapshot) 45 | -------------------------------------------------------------------------------- /tests/test_cdx_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | 3 | import pytest 4 | 5 | from waybackpy.cdx_utils import ( 6 | check_collapses, 7 | check_filters, 8 | check_match_type, 9 | check_sort, 10 | full_url, 11 | get_response, 12 | get_total_pages, 13 | ) 14 | from waybackpy.exceptions import WaybackError 15 | 16 | 17 | def test_get_total_pages() -> None: 18 | url = "twitter.com" 19 | user_agent = ( 20 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 " 21 | "(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15" 22 | ) 23 | assert get_total_pages(url=url, user_agent=user_agent) >= 56 24 | 25 | 26 | def test_full_url() -> None: 27 | endpoint = "https://web.archive.org/cdx/search/cdx" 28 | params: Dict[str, Any] = {} 29 | assert endpoint == full_url(endpoint, params) 30 | 31 | params = {"a": "1"} 32 | assert full_url(endpoint, params) == "https://web.archive.org/cdx/search/cdx?a=1" 33 | assert ( 34 | full_url(endpoint + "?", params) == "https://web.archive.org/cdx/search/cdx?a=1" 35 | ) 36 | 37 | params["b"] = 2 38 | assert ( 39 | full_url(endpoint + "?", params) 40 | == "https://web.archive.org/cdx/search/cdx?a=1&b=2" 41 | ) 42 | 43 | params["c"] = "foo bar" 44 | assert ( 45 | full_url(endpoint + "?", params) 46 | == 
"https://web.archive.org/cdx/search/cdx?a=1&b=2&c=foo%20bar" 47 | ) 48 | 49 | 50 | def test_get_response() -> None: 51 | url = "https://github.com" 52 | user_agent = ( 53 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0" 54 | ) 55 | headers = {"User-Agent": str(user_agent)} 56 | response = get_response(url, headers=headers) 57 | assert not isinstance(response, Exception) and response.status_code == 200 58 | 59 | 60 | def test_check_filters() -> None: 61 | filters: List[str] = [] 62 | check_filters(filters) 63 | 64 | filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"] 65 | check_filters(filters) 66 | 67 | with pytest.raises(WaybackError): 68 | check_filters("not-list") # type: ignore[arg-type] 69 | 70 | with pytest.raises(WaybackError): 71 | check_filters(["invalid"]) 72 | 73 | 74 | def test_check_collapses() -> None: 75 | collapses: List[str] = [] 76 | check_collapses(collapses) 77 | 78 | collapses = ["timestamp:10"] 79 | check_collapses(collapses) 80 | 81 | collapses = ["urlkey"] 82 | check_collapses(collapses) 83 | 84 | collapses = "urlkey" # type: ignore[assignment] 85 | with pytest.raises(WaybackError): 86 | check_collapses(collapses) 87 | 88 | collapses = ["also illegal collapse"] 89 | with pytest.raises(WaybackError): 90 | check_collapses(collapses) 91 | 92 | 93 | def test_check_match_type() -> None: 94 | assert check_match_type(None, "url") 95 | match_type = "exact" 96 | url = "test_url" 97 | assert check_match_type(match_type, url) 98 | 99 | url = "has * in it" 100 | with pytest.raises(WaybackError): 101 | check_match_type("domain", url) 102 | 103 | with pytest.raises(WaybackError): 104 | check_match_type("not a valid type", "url") 105 | 106 | 107 | def test_check_sort() -> None: 108 | assert check_sort("default") 109 | assert check_sort("closest") 110 | assert check_sort("reverse") 111 | 112 | with pytest.raises(WaybackError): 113 | assert check_sort("random crap") 114 | 
-------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from click.testing import CliRunner 3 | 4 | from waybackpy import __version__ 5 | from waybackpy.cli import main 6 | 7 | 8 | def test_oldest() -> None: 9 | runner = CliRunner() 10 | result = runner.invoke(main, ["--url", " https://github.com ", "--oldest"]) 11 | assert result.exit_code == 0 12 | assert ( 13 | result.output 14 | == "Archive URL:\nhttps://web.archive.org/web/2008051421\ 15 | 0148/http://github.com/\n" 16 | ) 17 | 18 | 19 | def test_near() -> None: 20 | runner = CliRunner() 21 | result = runner.invoke( 22 | main, 23 | [ 24 | "--url", 25 | " https://facebook.com ", 26 | "--near", 27 | "--year", 28 | "2010", 29 | "--month", 30 | "5", 31 | "--day", 32 | "10", 33 | "--hour", 34 | "6", 35 | ], 36 | ) 37 | assert result.exit_code == 0 38 | assert ( 39 | result.output 40 | == "Archive URL:\nhttps://web.archive.org/web/2010051008\ 41 | 2647/http://www.facebook.com/\n" 42 | ) 43 | 44 | 45 | def test_newest() -> None: 46 | runner = CliRunner() 47 | result = runner.invoke(main, ["--url", " https://microsoft.com ", "--newest"]) 48 | assert result.exit_code == 0 49 | assert ( 50 | result.output.find("microsoft.com") != -1 51 | and result.output.find("Archive URL:\n") != -1 52 | ) 53 | 54 | 55 | def test_cdx() -> None: 56 | runner = CliRunner() 57 | result = runner.invoke( 58 | main, 59 | "--url https://twitter.com/jack --cdx --user-agent some-user-agent \ 60 | --start-timestamp 2010 --end-timestamp 2012 --collapse urlkey \ 61 | --match-type prefix --cdx-print archiveurl --cdx-print length \ 62 | --cdx-print digest --cdx-print statuscode --cdx-print mimetype \ 63 | --cdx-print original --cdx-print timestamp --cdx-print urlkey".split( 64 | " " 65 | ), 66 | ) 67 | assert result.exit_code == 0 68 | assert result.output.count("\n") > 3000 69 | 70 | 71 | def 
test_save() -> None: 72 | runner = CliRunner() 73 | result = runner.invoke( 74 | main, 75 | "--url https://yahoo.com --user_agent my-unique-user-agent \ 76 | --save --headers".split( 77 | " " 78 | ), 79 | ) 80 | assert result.exit_code == 0 81 | assert result.output.find("Archive URL:") != -1 82 | assert (result.output.find("Cached save:\nTrue") != -1) or ( 83 | result.output.find("Cached save:\nFalse") != -1 84 | ) 85 | assert result.output.find("Save API headers:\n") != -1 86 | assert result.output.find("yahoo.com") != -1 87 | 88 | 89 | def test_version() -> None: 90 | runner = CliRunner() 91 | result = runner.invoke(main, ["--version"]) 92 | assert result.exit_code == 0 93 | assert result.output == f"waybackpy version {__version__}\n" 94 | 95 | 96 | def test_license() -> None: 97 | runner = CliRunner() 98 | result = runner.invoke(main, ["--license"]) 99 | assert result.exit_code == 0 100 | assert ( 101 | result.output 102 | == requests.get( 103 | url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE" 104 | ).text 105 | + "\n" 106 | ) 107 | 108 | 109 | def test_only_url() -> None: 110 | runner = CliRunner() 111 | result = runner.invoke(main, ["--url", "https://google.com"]) 112 | assert result.exit_code == 0 113 | assert ( 114 | result.output 115 | == "NoCommandFound: Only URL passed, but did not specify what to do with the URL. 
Use \ 116 | --help flag for help using waybackpy.\n" 117 | ) 118 | 119 | 120 | def test_known_url() -> None: 121 | # with file generator enabled 122 | runner = CliRunner() 123 | result = runner.invoke( 124 | main, ["--url", "https://akamhy.github.io", "--known-urls", "--file"] 125 | ) 126 | assert result.exit_code == 0 127 | assert result.output.count("\n") > 40 128 | assert result.output.count("akamhy.github.io") > 40 129 | assert result.output.find("in the current working directory.\n") != -1 130 | 131 | # without file 132 | runner = CliRunner() 133 | result = runner.invoke(main, ["--url", "https://akamhy.github.io", "--known-urls"]) 134 | assert result.exit_code == 0 135 | assert result.output.count("\n") > 40 136 | assert result.output.count("akamhy.github.io") > 40 137 | -------------------------------------------------------------------------------- /tests/test_save_api.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | import time 4 | from datetime import datetime 5 | from typing import cast 6 | 7 | import pytest 8 | from requests.structures import CaseInsensitiveDict 9 | 10 | from waybackpy.exceptions import MaximumSaveRetriesExceeded 11 | from waybackpy.save_api import WaybackMachineSaveAPI 12 | 13 | 14 | def rndstr(n: int) -> str: 15 | return "".join( 16 | random.choice(string.ascii_uppercase + string.digits) for _ in range(n) 17 | ) 18 | 19 | 20 | def test_save() -> None: 21 | url = "https://github.com/akamhy/waybackpy" 22 | user_agent = ( 23 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " 24 | "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" 25 | ) 26 | save_api = WaybackMachineSaveAPI(url, user_agent) 27 | save_api.save() 28 | archive_url = save_api.archive_url 29 | timestamp = save_api.timestamp() 30 | headers = save_api.headers # CaseInsensitiveDict 31 | cached_save = save_api.cached_save 32 | assert cached_save in [True, False] 33 | assert 
archive_url.find("github.com/akamhy/waybackpy") != -1 34 | assert timestamp is not None 35 | assert str(headers).find("github.com/akamhy/waybackpy") != -1 36 | assert isinstance(save_api.timestamp(), datetime) 37 | 38 | 39 | def test_max_redirect_exceeded() -> None: 40 | with pytest.raises(MaximumSaveRetriesExceeded): 41 | url = f"https://{rndstr}.gov" 42 | user_agent = ( 43 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " 44 | "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" 45 | ) 46 | save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3) 47 | save_api.save() 48 | 49 | 50 | def test_sleep() -> None: 51 | """ 52 | sleeping is actually very important for SaveAPI 53 | interface stability. 54 | The test checks that the time taken by sleep method 55 | is as intended. 56 | """ 57 | url = "https://example.com" 58 | user_agent = ( 59 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " 60 | "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" 61 | ) 62 | save_api = WaybackMachineSaveAPI(url, user_agent) 63 | s_time = int(time.time()) 64 | save_api.sleep(6) # multiple of 3 sleep for 10 seconds 65 | e_time = int(time.time()) 66 | assert (e_time - s_time) >= 10 67 | 68 | s_time = int(time.time()) 69 | save_api.sleep(7) # sleeps for 5 seconds 70 | e_time = int(time.time()) 71 | assert (e_time - s_time) >= 5 72 | 73 | 74 | def test_timestamp() -> None: 75 | url = "https://example.com" 76 | user_agent = ( 77 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " 78 | "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" 79 | ) 80 | save_api = WaybackMachineSaveAPI(url, user_agent) 81 | now = datetime.utcnow().strftime("%Y%m%d%H%M%S") 82 | save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/" 83 | save_api.timestamp() 84 | assert save_api.cached_save is False 85 | now = "20100124063622" 86 | save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/" 87 | save_api.timestamp() 88 | assert 
save_api.cached_save is True 89 | 90 | 91 | def test_archive_url_parser() -> None: 92 | """ 93 | Testing three regex for matches and also tests the response URL. 94 | """ 95 | url = "https://example.com" 96 | user_agent = ( 97 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " 98 | "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" 99 | ) 100 | save_api = WaybackMachineSaveAPI(url, user_agent) 101 | 102 | h = ( 103 | "\nSTART\nContent-Location: " 104 | "/web/20201126185327/https://www.scribbr.com/citing-sources/et-al" 105 | "\nEND\n" 106 | ) 107 | save_api.headers = h # type: ignore[assignment] 108 | 109 | expected_url = ( 110 | "https://web.archive.org/web/20201126185327/" 111 | "https://www.scribbr.com/citing-sources/et-al" 112 | ) 113 | assert save_api.archive_url_parser() == expected_url 114 | 115 | headers = { 116 | "Server": "nginx/1.15.8", 117 | "Date": "Sat, 02 Jan 2021 09:40:25 GMT", 118 | "Content-Type": "text/html; charset=UTF-8", 119 | "Transfer-Encoding": "chunked", 120 | "Connection": "keep-alive", 121 | "X-Archive-Orig-Server": "nginx", 122 | "X-Archive-Orig-Date": "Sat, 02 Jan 2021 09:40:09 GMT", 123 | "X-Archive-Orig-Transfer-Encoding": "chunked", 124 | "X-Archive-Orig-Connection": "keep-alive", 125 | "X-Archive-Orig-Vary": "Accept-Encoding", 126 | "X-Archive-Orig-Last-Modified": "Fri, 01 Jan 2021 12:19:00 GMT", 127 | "X-Archive-Orig-Strict-Transport-Security": "max-age=31536000, max-age=0;", 128 | "X-Archive-Guessed-Content-Type": "text/html", 129 | "X-Archive-Guessed-Charset": "utf-8", 130 | "Memento-Datetime": "Sat, 02 Jan 2021 09:40:09 GMT", 131 | "Link": ( 132 | '; rel="original", ' 133 | "; rel="timemap"; type="application/link-format", ' 135 | "; rel="timegate", ; rel="first memento"; ' 138 | 'datetime="Mon, 01 Jun 2020 08:29:11 GMT", ; " 140 | 'rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", ' 141 | "; rel="memento"; datetime="Sat, 02 Jan 2021 ' 143 | '09:40:09 GMT", ; " 145 | 'rel="last memento"; 
datetime="Sat, 02 Jan 2021 09:40:09 GMT"' 146 | ), 147 | "Content-Security-Policy": ( 148 | "default-src 'self' 'unsafe-eval' 'unsafe-inline' " 149 | "data: blob: archive.org web.archive.org analytics.archive.org " 150 | "pragma.archivelab.org", 151 | ), 152 | "X-Archive-Src": "spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz", 153 | "Server-Timing": ( 154 | "captures_list;dur=112.646325, exclusion.robots;dur=0.172010, " 155 | "exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, " 156 | "esindex;dur=0.014647, LoadShardBlock;dur=82.205012, " 157 | "PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, " 158 | "load_resource;dur=26.520179" 159 | ), 160 | "X-App-Server": "wwwb-app200", 161 | "X-ts": "200", 162 | "X-location": "All", 163 | "X-Cache-Key": ( 164 | "httpsweb.archive.org/web/20210102094009/" 165 | "https://www.scribbr.com/citing-sources/et-al/IN", 166 | ), 167 | "X-RL": "0", 168 | "X-Page-Cache": "MISS", 169 | "X-Archive-Screenname": "0", 170 | "Content-Encoding": "gzip", 171 | } 172 | 173 | save_api.headers = cast(CaseInsensitiveDict[str], headers) 174 | 175 | expected_url2 = ( 176 | "https://web.archive.org/web/20210102094009/" 177 | "https://www.scribbr.com/citing-sources/et-al/" 178 | ) 179 | assert save_api.archive_url_parser() == expected_url2 180 | 181 | expected_url_3 = ( 182 | "https://web.archive.org/web/20171128185327/" 183 | "https://www.scribbr.com/citing-sources/et-al/US" 184 | ) 185 | h = f"START\nX-Cache-Key: {expected_url_3}\nEND\n" 186 | save_api.headers = h # type: ignore[assignment] 187 | 188 | expected_url4 = ( 189 | "https://web.archive.org/web/20171128185327/" 190 | "https://www.scribbr.com/citing-sources/et-al/" 191 | ) 192 | assert save_api.archive_url_parser() == expected_url4 193 | 194 | h = "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING" 195 | save_api.headers = h # type: ignore[assignment] 196 | save_api.response_url = ( 197 | "https://web.archive.org/web/20171128185327/" 
198 | "https://www.scribbr.com/citing-sources/et-al" 199 | ) 200 | expected_url5 = ( 201 | "https://web.archive.org/web/20171128185327/" 202 | "https://www.scribbr.com/citing-sources/et-al" 203 | ) 204 | assert save_api.archive_url_parser() == expected_url5 205 | 206 | 207 | def test_archive_url() -> None: 208 | """ 209 | Checks the attribute archive_url's value when the save method was not 210 | explicitly invoked by the end-user but the save method was invoked implicitly 211 | by the archive_url method which is an attribute due to @property. 212 | """ 213 | url = "https://example.com" 214 | user_agent = ( 215 | "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 " 216 | "(KHTML, like Gecko) Version/14.1.1 Safari/604.1" 217 | ) 218 | save_api = WaybackMachineSaveAPI(url, user_agent) 219 | save_api.saved_archive = ( 220 | "https://web.archive.org/web/20220124063056/https://example.com/" 221 | ) 222 | save_api._archive_url = save_api.saved_archive 223 | assert save_api.archive_url == save_api.saved_archive 224 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from waybackpy import __version__ 2 | from waybackpy.utils import DEFAULT_USER_AGENT 3 | 4 | 5 | def test_default_user_agent() -> None: 6 | assert ( 7 | DEFAULT_USER_AGENT 8 | == f"waybackpy {__version__} - https://github.com/akamhy/waybackpy" 9 | ) 10 | -------------------------------------------------------------------------------- /tests/test_wrapper.py: -------------------------------------------------------------------------------- 1 | from waybackpy.wrapper import Url 2 | 3 | 4 | def test_oldest() -> None: 5 | url = "https://bing.com" 6 | oldest_archive = ( 7 | "https://web.archive.org/web/20030726111100/http://www.bing.com:80/" 8 | ) 9 | wayback = Url(url).oldest() 10 | assert wayback.archive_url == oldest_archive 11 | assert str(wayback) == 
oldest_archive 12 | assert len(wayback) > 365 * 15 # days in a year times years 13 | 14 | 15 | def test_newest() -> None: 16 | url = "https://www.youtube.com/" 17 | wayback = Url(url).newest() 18 | assert "youtube" in str(wayback.archive_url) 19 | assert "archived_snapshots" in str(wayback.json) 20 | 21 | 22 | def test_near() -> None: 23 | url = "https://www.google.com" 24 | wayback = Url(url).near(year=2010, month=10, day=10, hour=10, minute=10) 25 | assert "20101010" in str(wayback.archive_url) 26 | 27 | 28 | def test_total_archives() -> None: 29 | wayback = Url("https://akamhy.github.io") 30 | assert wayback.total_archives() > 10 31 | 32 | wayback = Url("https://gaha.ef4i3n.m5iai3kifp6ied.cima/gahh2718gs/ahkst63t7gad8") 33 | assert wayback.total_archives() == 0 34 | 35 | 36 | def test_known_urls() -> None: 37 | wayback = Url("akamhy.github.io") 38 | assert len(list(wayback.known_urls(subdomain=True))) > 40 39 | 40 | 41 | def test_Save() -> None: 42 | wayback = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property") 43 | wayback.save() 44 | archive_url = str(wayback.archive_url) 45 | assert archive_url.find("Asymptotic_equipartition_property") != -1 46 | -------------------------------------------------------------------------------- /waybackpy/__init__.py: -------------------------------------------------------------------------------- 1 | """Module initializer and provider of static information.""" 2 | 3 | __version__ = "3.0.6" 4 | 5 | from .availability_api import WaybackMachineAvailabilityAPI 6 | from .cdx_api import WaybackMachineCDXServerAPI 7 | from .save_api import WaybackMachineSaveAPI 8 | from .wrapper import Url 9 | 10 | __all__ = [ 11 | "__version__", 12 | "WaybackMachineAvailabilityAPI", 13 | "WaybackMachineCDXServerAPI", 14 | "WaybackMachineSaveAPI", 15 | "Url", 16 | ] 17 | -------------------------------------------------------------------------------- /waybackpy/availability_api.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This module interfaces the Wayback Machine's availability API. 3 | 4 | The interface is useful for looking up archives and finding archives 5 | that are close to a specific date and time. 6 | 7 | It has a class WaybackMachineAvailabilityAPI, and the class has 8 | methods like: 9 | 10 | near() for retrieving archives close to a specific date and time. 11 | 12 | oldest() for retrieving the first archive URL of the webpage. 13 | 14 | newest() for retrieving the latest archive of the webpage. 15 | 16 | The Wayback Machine Availability API response must be a valid JSON and 17 | if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised. 18 | 19 | If the Availability API returned valid JSON but archive URL could not be found 20 | in it then ArchiveNotInAvailabilityAPIResponse is raised. 21 | """ 22 | 23 | import json 24 | import time 25 | from datetime import datetime 26 | from typing import Any, Dict, Optional 27 | 28 | import requests 29 | from requests.models import Response 30 | 31 | from .exceptions import ( 32 | ArchiveNotInAvailabilityAPIResponse, 33 | InvalidJSONInAvailabilityAPIResponse, 34 | ) 35 | from .utils import ( 36 | DEFAULT_USER_AGENT, 37 | unix_timestamp_to_wayback_timestamp, 38 | wayback_timestamp, 39 | ) 40 | 41 | ResponseJSON = Dict[str, Any] 42 | 43 | 44 | class WaybackMachineAvailabilityAPI: 45 | """ 46 | Class that interfaces the Wayback Machine's availability API. 
47 | """ 48 | 49 | def __init__( 50 | self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3 51 | ) -> None: 52 | 53 | self.url = str(url).strip().replace(" ", "%20") 54 | self.user_agent = user_agent 55 | self.headers: Dict[str, str] = {"User-Agent": self.user_agent} 56 | self.payload: Dict[str, str] = {"url": self.url} 57 | self.endpoint: str = "https://archive.org/wayback/available" 58 | self.max_tries: int = max_tries 59 | self.tries: int = 0 60 | self.last_api_call_unix_time: int = int(time.time()) 61 | self.api_call_time_gap: int = 5 62 | self.json: Optional[ResponseJSON] = None 63 | self.response: Optional[Response] = None 64 | 65 | def __repr__(self) -> str: 66 | """ 67 | Same as string representation, just return the archive URL as a string. 68 | """ 69 | return str(self) 70 | 71 | def __str__(self) -> str: 72 | """ 73 | String representation of the class. If at least one API 74 | call was successfully made then return the archive URL 75 | as a string. Else returns "" (empty string literal). 76 | """ 77 | # __str__ can not return anything other than a string object 78 | # So, if a string repr is asked even before making an API request 79 | # just return "" 80 | if not self.json: 81 | return "" 82 | 83 | return self.archive_url 84 | 85 | def setup_json(self) -> Optional[ResponseJSON]: 86 | """ 87 | Makes the API call to the availability API and set the JSON response 88 | to the JSON attribute of the instance and also returns the JSON 89 | attribute. 90 | 91 | time_diff and sleep_time make sure that you are not making too many 92 | requests in a short interval of time, making too many requests is bad 93 | as Wayback Machine may reject them above a certain threshold. 94 | 95 | The end-user can change the api_call_time_gap attribute of the instance 96 | to increase or decrease the default time gap between two successive API 97 | calls, but it is not recommended to decrease it. 
98 | """ 99 | time_diff = int(time.time()) - self.last_api_call_unix_time 100 | sleep_time = self.api_call_time_gap - time_diff 101 | 102 | if sleep_time > 0: 103 | time.sleep(sleep_time) 104 | 105 | self.response = requests.get( 106 | self.endpoint, params=self.payload, headers=self.headers 107 | ) 108 | self.last_api_call_unix_time = int(time.time()) 109 | self.tries += 1 110 | try: 111 | self.json = None if self.response is None else self.response.json() 112 | except json.decoder.JSONDecodeError as json_decode_error: 113 | raise InvalidJSONInAvailabilityAPIResponse( 114 | f"Response data:\n{self.response.text}" 115 | ) from json_decode_error 116 | 117 | return self.json 118 | 119 | def timestamp(self) -> datetime: 120 | """ 121 | Converts the timestamp from the JSON response to a datetime object. 122 | If JSON attribute of the instance is None it implies that either 123 | the last API call failed or one was never made. 124 | 125 | If there is no JSON or no archived_snapshots key in the JSON response then 126 | returns the maximum value for datetime object that is possible. 127 | 128 | If you get a URL as a response from the availability API it is 129 | guaranteed that you can get the datetime object from the timestamp. 
130 | """ 131 | if self.json is None or "archived_snapshots" not in self.json: 132 | return datetime.max 133 | 134 | if ( 135 | self.json is not None 136 | and "archived_snapshots" in self.json 137 | and self.json["archived_snapshots"] is not None 138 | and "closest" in self.json["archived_snapshots"] 139 | and self.json["archived_snapshots"]["closest"] is not None 140 | and "timestamp" in self.json["archived_snapshots"]["closest"] 141 | ): 142 | return datetime.strptime( 143 | self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S" 144 | ) 145 | 146 | raise ValueError("Timestamp not found in the Availability API's JSON response.") 147 | 148 | @property 149 | def archive_url(self) -> str: 150 | """ 151 | Reads the JSON response data and returns 152 | the archive URL if found and if not found raises 153 | ArchiveNotInAvailabilityAPIResponse. 154 | """ 155 | archive_url = "" 156 | data = self.json 157 | 158 | # If the user didn't invoke oldest, newest or near but tries to access 159 | # archive_url attribute then assume that they are fine with any archive 160 | # and invoke the oldest method. 161 | if not data: 162 | self.oldest() 163 | 164 | # If data is still empty then probably there is no 165 | # archive for the requested URL. 166 | if not data or not data["archived_snapshots"]: 167 | while (self.tries < self.max_tries) and ( 168 | not data or not data["archived_snapshots"] 169 | ): 170 | self.setup_json() # It makes a new API call 171 | data = self.json # setup_json() updates value of json attribute 172 | 173 | # If exhausted max_tries, then give up and 174 | # raise ArchiveNotInAvailabilityAPIResponse. 175 | 176 | if not data or not data["archived_snapshots"]: 177 | raise ArchiveNotInAvailabilityAPIResponse( 178 | "Archive not found in the availability " 179 | "API response, the URL you requested may not have any archives " 180 | "yet. 
You may retry after some time or archive the webpage now.\n" 181 | "Response data:\n" 182 | "" 183 | if self.response is None 184 | else self.response.text 185 | ) 186 | else: 187 | archive_url = data["archived_snapshots"]["closest"]["url"] 188 | archive_url = archive_url.replace( 189 | "http://web.archive.org/web/", "https://web.archive.org/web/", 1 190 | ) 191 | return archive_url 192 | 193 | def oldest(self) -> "WaybackMachineAvailabilityAPI": 194 | """ 195 | Passes the date 1994-01-01 to near which should return the oldest archive 196 | because Wayback Machine was started in May, 1996 and it is assumed that 197 | there would be no archive older than January 1, 1994. 198 | """ 199 | return self.near(year=1994, month=1, day=1) 200 | 201 | def newest(self) -> "WaybackMachineAvailabilityAPI": 202 | """ 203 | Passes the current UNIX time to near() for retrieving the newest archive 204 | from the availability API. 205 | 206 | Remember UNIX time is UTC and Wayback Machine is also UTC based. 207 | """ 208 | return self.near(unix_timestamp=int(time.time())) 209 | 210 | def near( 211 | self, 212 | year: Optional[int] = None, 213 | month: Optional[int] = None, 214 | day: Optional[int] = None, 215 | hour: Optional[int] = None, 216 | minute: Optional[int] = None, 217 | unix_timestamp: Optional[int] = None, 218 | ) -> "WaybackMachineAvailabilityAPI": 219 | """ 220 | The most important method of this Class, oldest() and newest() are 221 | dependent on it. 222 | 223 | It generates the timestamp based on the input either by calling the 224 | unix_timestamp_to_wayback_timestamp or wayback_timestamp method with 225 | appropriate arguments for their respective parameters. 226 | 227 | Adds the timestamp to the payload dictionary. 228 | 229 | And finally invokes the setup_json method to make the API call then 230 | finally returns the instance. 
231 | """ 232 | if unix_timestamp: 233 | timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp) 234 | else: 235 | now = datetime.utcnow().timetuple() 236 | timestamp = wayback_timestamp( 237 | year=now.tm_year if year is None else year, 238 | month=now.tm_mon if month is None else month, 239 | day=now.tm_mday if day is None else day, 240 | hour=now.tm_hour if hour is None else hour, 241 | minute=now.tm_min if minute is None else minute, 242 | ) 243 | 244 | self.payload["timestamp"] = timestamp 245 | self.setup_json() 246 | return self 247 | -------------------------------------------------------------------------------- /waybackpy/cdx_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module interfaces the Wayback Machine's CDX server API. 3 | 4 | The module has WaybackMachineCDXServerAPI which should be used by the users of 5 | this module to consume the CDX server API. 6 | 7 | WaybackMachineCDXServerAPI has a snapshot method that yields the snapshots, and 8 | the snapshots are yielded as instances of the CDXSnapshot class. 9 | """ 10 | 11 | 12 | import time 13 | from datetime import datetime 14 | from typing import Dict, Generator, List, Optional, Union, cast 15 | 16 | from .cdx_snapshot import CDXSnapshot 17 | from .cdx_utils import ( 18 | check_collapses, 19 | check_filters, 20 | check_match_type, 21 | check_sort, 22 | full_url, 23 | get_response, 24 | get_total_pages, 25 | ) 26 | from .exceptions import NoCDXRecordFound, WaybackError 27 | from .utils import ( 28 | DEFAULT_USER_AGENT, 29 | unix_timestamp_to_wayback_timestamp, 30 | wayback_timestamp, 31 | ) 32 | 33 | 34 | class WaybackMachineCDXServerAPI: 35 | """ 36 | Class that interfaces the CDX server API of the Wayback Machine. 37 | 38 | snapshot() returns a generator that can be iterated upon by the end-user, 39 | the generator returns the snapshots/entries as instance of CDXSnapshot to 40 | make the usage easy, just use '.' 
to get any attribute as the attributes are 41 | accessible via a dot ".". 42 | """ 43 | 44 | # start_timestamp: from, can not use from as it's a keyword 45 | # end_timestamp: to, not using to as can not use from 46 | def __init__( 47 | self, 48 | url: str, 49 | user_agent: str = DEFAULT_USER_AGENT, 50 | start_timestamp: Optional[str] = None, 51 | end_timestamp: Optional[str] = None, 52 | filters: Optional[List[str]] = None, 53 | match_type: Optional[str] = None, 54 | sort: Optional[str] = None, 55 | gzip: Optional[str] = None, 56 | collapses: Optional[List[str]] = None, 57 | limit: Optional[str] = None, 58 | max_tries: int = 3, 59 | use_pagination: bool = False, 60 | closest: Optional[str] = None, 61 | ) -> None: 62 | self.url = str(url).strip().replace(" ", "%20") 63 | self.user_agent = user_agent 64 | self.start_timestamp = None if start_timestamp is None else str(start_timestamp) 65 | self.end_timestamp = None if end_timestamp is None else str(end_timestamp) 66 | self.filters = [] if filters is None else filters 67 | check_filters(self.filters) 68 | self.match_type = None if match_type is None else str(match_type).strip() 69 | check_match_type(self.match_type, self.url) 70 | self.sort = None if sort is None else str(sort).strip() 71 | check_sort(self.sort) 72 | self.gzip = gzip 73 | self.collapses = [] if collapses is None else collapses 74 | check_collapses(self.collapses) 75 | self.limit = 25000 if limit is None else limit 76 | self.max_tries = max_tries 77 | self.use_pagination = use_pagination 78 | self.closest = None if closest is None else str(closest) 79 | self.last_api_request_url: Optional[str] = None 80 | self.endpoint = "https://web.archive.org/cdx/search/cdx" 81 | 82 | def cdx_api_manager( 83 | self, payload: Dict[str, str], headers: Dict[str, str] 84 | ) -> Generator[str, None, None]: 85 | """ 86 | This method uses the pagination API of the CDX server if 87 | use_pagination attribute is True else uses the standard 88 | CDX server response data. 
89 | """ 90 | 91 | # When using the pagination API of the CDX server. 92 | if self.use_pagination is True: 93 | 94 | total_pages = get_total_pages(self.url, self.user_agent) 95 | successive_blank_pages = 0 96 | 97 | for i in range(total_pages): 98 | payload["page"] = str(i) 99 | 100 | url = full_url(self.endpoint, params=payload) 101 | res = get_response(url, headers=headers) 102 | 103 | if isinstance(res, Exception): 104 | raise res 105 | 106 | self.last_api_request_url = url 107 | text = res.text 108 | 109 | # Reset the counter if the last page was blank 110 | # but the current page is not. 111 | if successive_blank_pages == 1: 112 | if len(text) != 0: 113 | successive_blank_pages = 0 114 | 115 | # Increase the successive blank page counter on encountering 116 | # a blank page. 117 | if len(text) == 0: 118 | successive_blank_pages += 1 119 | 120 | # If two successive pages are blank 121 | # then we don't have any more pages left to 122 | # iterate. 123 | if successive_blank_pages >= 2: 124 | break 125 | 126 | yield text 127 | 128 | # When not using the pagination API of the CDX server 129 | else: 130 | payload["showResumeKey"] = "true" 131 | payload["limit"] = str(self.limit) 132 | resume_key = None 133 | more = True 134 | while more: 135 | if resume_key: 136 | payload["resumeKey"] = resume_key 137 | 138 | url = full_url(self.endpoint, params=payload) 139 | res = get_response(url, headers=headers) 140 | if isinstance(res, Exception): 141 | raise res 142 | 143 | self.last_api_request_url = url 144 | 145 | text = res.text.strip() 146 | lines = text.splitlines() 147 | 148 | more = False 149 | 150 | if len(lines) >= 3: 151 | 152 | second_last_line = lines[-2] 153 | 154 | if len(second_last_line) == 0: 155 | 156 | resume_key = lines[-1].strip() 157 | text = text.replace(resume_key, "", 1).strip() 158 | more = True 159 | 160 | yield text 161 | 162 | def add_payload(self, payload: Dict[str, str]) -> None: 163 | """ 164 | Adds the payload to the payload dictionary. 
165 | """ 166 | if self.start_timestamp: 167 | payload["from"] = self.start_timestamp 168 | 169 | if self.end_timestamp: 170 | payload["to"] = self.end_timestamp 171 | 172 | if self.gzip is None: 173 | payload["gzip"] = "false" 174 | 175 | if self.closest: 176 | payload["closest"] = self.closest 177 | 178 | if self.match_type: 179 | payload["matchType"] = self.match_type 180 | 181 | if self.sort: 182 | payload["sort"] = self.sort 183 | 184 | if self.filters and len(self.filters) > 0: 185 | for i, _filter in enumerate(self.filters): 186 | payload["filter" + str(i)] = _filter 187 | 188 | if self.collapses and len(self.collapses) > 0: 189 | for i, collapse in enumerate(self.collapses): 190 | payload["collapse" + str(i)] = collapse 191 | 192 | payload["url"] = self.url 193 | 194 | def before( 195 | self, 196 | year: Optional[int] = None, 197 | month: Optional[int] = None, 198 | day: Optional[int] = None, 199 | hour: Optional[int] = None, 200 | minute: Optional[int] = None, 201 | unix_timestamp: Optional[int] = None, 202 | wayback_machine_timestamp: Optional[Union[int, str]] = None, 203 | ) -> CDXSnapshot: 204 | """ 205 | Gets the nearest archive before the given datetime. 206 | """ 207 | if unix_timestamp: 208 | timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp) 209 | elif wayback_machine_timestamp: 210 | timestamp = str(wayback_machine_timestamp) 211 | else: 212 | now = datetime.utcnow().timetuple() 213 | timestamp = wayback_timestamp( 214 | year=now.tm_year if year is None else year, 215 | month=now.tm_mon if month is None else month, 216 | day=now.tm_mday if day is None else day, 217 | hour=now.tm_hour if hour is None else hour, 218 | minute=now.tm_min if minute is None else minute, 219 | ) 220 | self.closest = timestamp 221 | self.sort = "closest" 222 | self.limit = 25000 223 | for snapshot in self.snapshots(): 224 | if snapshot.timestamp < timestamp: 225 | return snapshot 226 | 227 | # If a snapshot isn't returned, then none were found. 
228 | raise NoCDXRecordFound( 229 | "No records were found before the given date for the query." 230 | + "Either there are no archives before the given date," 231 | + " the URL may not have any archived, or the URL may have been" 232 | + " recently archived and is still not available on the CDX server." 233 | ) 234 | 235 | def after( 236 | self, 237 | year: Optional[int] = None, 238 | month: Optional[int] = None, 239 | day: Optional[int] = None, 240 | hour: Optional[int] = None, 241 | minute: Optional[int] = None, 242 | unix_timestamp: Optional[int] = None, 243 | wayback_machine_timestamp: Optional[Union[int, str]] = None, 244 | ) -> CDXSnapshot: 245 | """ 246 | Gets the nearest archive after the given datetime. 247 | """ 248 | if unix_timestamp: 249 | timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp) 250 | elif wayback_machine_timestamp: 251 | timestamp = str(wayback_machine_timestamp) 252 | else: 253 | now = datetime.utcnow().timetuple() 254 | timestamp = wayback_timestamp( 255 | year=now.tm_year if year is None else year, 256 | month=now.tm_mon if month is None else month, 257 | day=now.tm_mday if day is None else day, 258 | hour=now.tm_hour if hour is None else hour, 259 | minute=now.tm_min if minute is None else minute, 260 | ) 261 | self.closest = timestamp 262 | self.sort = "closest" 263 | self.limit = 25000 264 | for snapshot in self.snapshots(): 265 | if snapshot.timestamp > timestamp: 266 | return snapshot 267 | 268 | # If a snapshot isn't returned, then none were found. 269 | raise NoCDXRecordFound( 270 | "No records were found after the given date for the query." 271 | + "Either there are no archives after the given date," 272 | + " the URL may not have any archives, or the URL may have been" 273 | + " recently archived and is still not available on the CDX server." 
274 | ) 275 | 276 | def near( 277 | self, 278 | year: Optional[int] = None, 279 | month: Optional[int] = None, 280 | day: Optional[int] = None, 281 | hour: Optional[int] = None, 282 | minute: Optional[int] = None, 283 | unix_timestamp: Optional[int] = None, 284 | wayback_machine_timestamp: Optional[Union[int, str]] = None, 285 | ) -> CDXSnapshot: 286 | """ 287 | Fetch archive close to a datetime, it can only return 288 | a single URL. If you want more do not use this method 289 | instead use the class. 290 | """ 291 | if unix_timestamp: 292 | timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp) 293 | elif wayback_machine_timestamp: 294 | timestamp = str(wayback_machine_timestamp) 295 | else: 296 | now = datetime.utcnow().timetuple() 297 | timestamp = wayback_timestamp( 298 | year=now.tm_year if year is None else year, 299 | month=now.tm_mon if month is None else month, 300 | day=now.tm_mday if day is None else day, 301 | hour=now.tm_hour if hour is None else hour, 302 | minute=now.tm_min if minute is None else minute, 303 | ) 304 | self.closest = timestamp 305 | self.sort = "closest" 306 | self.limit = 1 307 | first_snapshot = None 308 | for snapshot in self.snapshots(): 309 | first_snapshot = snapshot 310 | break 311 | 312 | if not first_snapshot: 313 | raise NoCDXRecordFound( 314 | "Wayback Machine's CDX server did not return any records " 315 | + "for the query. The URL may not have any archives " 316 | + " on the Wayback Machine or the URL may have been recently " 317 | + "archived and is still not available on the CDX server." 318 | ) 319 | 320 | return first_snapshot 321 | 322 | def newest(self) -> CDXSnapshot: 323 | """ 324 | Passes the current UNIX time to near() for retrieving the newest archive 325 | from the availability API. 326 | 327 | Remember UNIX time is UTC and Wayback Machine is also UTC based. 
328 | """ 329 | return self.near(unix_timestamp=int(time.time())) 330 | 331 | def oldest(self) -> CDXSnapshot: 332 | """ 333 | Passes the date 1994-01-01 to near which should return the oldest archive 334 | because Wayback Machine was started in May, 1996 and it is assumed that 335 | there would be no archive older than January 1, 1994. 336 | """ 337 | return self.near(year=1994, month=1, day=1) 338 | 339 | def snapshots(self) -> Generator[CDXSnapshot, None, None]: 340 | """ 341 | This function yields the CDX data lines as snapshots. 342 | 343 | As it is a generator it is exhaustible, the reasons that this is 344 | a generator and not a list are: 345 | 346 | a) CDX server API can return millions of entries for a query and list 347 | is not suitable for such cases. 348 | 349 | b) Preventing memory usage issues, as told before this method may yield 350 | millions of records for some queries and your system may not have enough 351 | memory for such a big list. Also remember this if outputting to Jupyter 352 | Notebooks. 353 | 354 | The objects yielded by this method are instances of the CDXSnapshot class, 355 | you can access the attributes of the entries as the attribute of the instance 356 | itself. 357 | """ 358 | payload: Dict[str, str] = {} 359 | headers = {"User-Agent": self.user_agent} 360 | 361 | self.add_payload(payload) 362 | 363 | entries = self.cdx_api_manager(payload, headers) 364 | 365 | for entry in entries: 366 | 367 | if entry.isspace() or len(entry) <= 1 or not entry: 368 | continue 369 | 370 | # each line is a snapshot aka entry of the CDX server API. 371 | # We are able to split the page by lines because it only 372 | # splits the lines on a single page and not all the entries 373 | # at once, thus there should be no issues of too much memory usage. 374 | snapshot_list = entry.split("\n") 375 | 376 | for snapshot in snapshot_list: 377 | 378 | # 14 + 32 == 46 ( timestamp + digest ), ignore the invalid entries. 
379 | # they are invalid if their length is smaller than sum of length 380 | # of a standard wayback_timestamp and standard digest of an entry. 381 | if len(snapshot) < 46: 382 | continue 383 | 384 | properties: Dict[str, Optional[str]] = { 385 | "urlkey": None, 386 | "timestamp": None, 387 | "original": None, 388 | "mimetype": None, 389 | "statuscode": None, 390 | "digest": None, 391 | "length": None, 392 | } 393 | 394 | property_value = snapshot.split(" ") 395 | 396 | total_property_values = len(property_value) 397 | warranted_total_property_values = len(properties) 398 | 399 | if total_property_values != warranted_total_property_values: 400 | raise WaybackError( 401 | f"Snapshot returned by CDX API has {total_property_values} prop" 402 | f"erties instead of expected {warranted_total_property_values} " 403 | f"properties.\nProblematic Snapshot: {snapshot}" 404 | ) 405 | 406 | ( 407 | properties["urlkey"], 408 | properties["timestamp"], 409 | properties["original"], 410 | properties["mimetype"], 411 | properties["statuscode"], 412 | properties["digest"], 413 | properties["length"], 414 | ) = property_value 415 | 416 | yield CDXSnapshot(cast(Dict[str, str], properties)) 417 | -------------------------------------------------------------------------------- /waybackpy/cdx_snapshot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module that contains the CDXSnapshot class, CDX records/lines are casted 3 | to CDXSnapshot objects for easier access. 4 | 5 | The CDX index format is plain text data. Each line ('record') indicates a 6 | crawled document. And these lines are casted to CDXSnapshot. 7 | """ 8 | 9 | 10 | from datetime import datetime 11 | from typing import Dict 12 | 13 | 14 | class CDXSnapshot: 15 | """ 16 | Class for the CDX snapshot lines('record') returned by the CDX API, 17 | Each valid line of the CDX API is casted to an CDXSnapshot object 18 | by the CDX API interface, just use "." 
to access any attribute of the 19 | CDX server API snapshot. 20 | 21 | This provides the end-user the ease of using the data as attributes 22 | of the CDXSnapshot. 23 | 24 | The string representation of the class is identical to the line returned 25 | by the CDX server API. 26 | 27 | Besides all the attributes of the CDX server API this class also provides 28 | archive_url attribute, yes it is the archive url of the snapshot. 29 | 30 | Attributes of this class, what they represent and what they are useful for: 31 | 32 | urlkey: The document captured, expressed as a SURT 33 | SURT stands for Sort-friendly URI Reordering Transform, and is a 34 | transformation applied to URIs which makes their left-to-right 35 | representation better match the natural hierarchy of domain names. 36 | A URI scheme://domain.tld/path?query has SURT 37 | form scheme://(tld,domain,)/path?query. 38 | 39 | timestamp: The timestamp of the archive, format is yyyyMMddhhmmss and type 40 | is string. 41 | 42 | datetime_timestamp: The timestamp as a datetime object. 43 | 44 | original: The original URL of the archive. If archive_url is 45 | https://web.archive.org/web/20220113130051/https://google.com then the 46 | original URL is https://google.com 47 | 48 | mimetype: The document’s file type. e.g. text/html 49 | 50 | statuscode: HTTP response code for the document at the time of its crawling 51 | 52 | digest: Base32-encoded SHA-1 checksum of the document for discriminating 53 | with others 54 | 55 | length: Document’s volume of bytes in the WARC file 56 | 57 | archive_url: The archive url of the snapshot, this is not returned by the 58 | CDX server API but created by this class on init. 
59 | """ 60 | 61 | def __init__(self, properties: Dict[str, str]) -> None: 62 | self.urlkey: str = properties["urlkey"] 63 | self.timestamp: str = properties["timestamp"] 64 | self.datetime_timestamp: datetime = datetime.strptime( 65 | self.timestamp, "%Y%m%d%H%M%S" 66 | ) 67 | self.original: str = properties["original"] 68 | self.mimetype: str = properties["mimetype"] 69 | self.statuscode: str = properties["statuscode"] 70 | self.digest: str = properties["digest"] 71 | self.length: str = properties["length"] 72 | self.archive_url: str = ( 73 | f"https://web.archive.org/web/{self.timestamp}/{self.original}" 74 | ) 75 | 76 | def __repr__(self) -> str: 77 | """ 78 | Same as __str__() 79 | """ 80 | return str(self) 81 | 82 | def __str__(self) -> str: 83 | """ 84 | The string representation is same as the line returned by the 85 | CDX server API for the snapshot. 86 | """ 87 | return ( 88 | f"{self.urlkey} {self.timestamp} {self.original} " 89 | f"{self.mimetype} {self.statuscode} {self.digest} {self.length}" 90 | ) 91 | -------------------------------------------------------------------------------- /waybackpy/cdx_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions required for accessing the CDX server API. 3 | 4 | These are here in this module so that we don’t make any module too 5 | long. 
6 | """ 7 | 8 | import re 9 | from typing import Any, Dict, List, Optional, Union 10 | from urllib.parse import quote 11 | 12 | import requests 13 | from requests.adapters import HTTPAdapter 14 | from urllib3.util.retry import Retry 15 | 16 | from .exceptions import BlockedSiteError, WaybackError 17 | from .utils import DEFAULT_USER_AGENT 18 | 19 | 20 | def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int: 21 | """ 22 | When using the pagination use adding showNumPages=true to the request 23 | URL makes the CDX server return an integer which is the number of pages 24 | of CDX pages available for us to query using the pagination API. 25 | """ 26 | endpoint = "https://web.archive.org/cdx/search/cdx?" 27 | payload = {"showNumPages": "true", "url": str(url)} 28 | headers = {"User-Agent": user_agent} 29 | request_url = full_url(endpoint, params=payload) 30 | response = get_response(request_url, headers=headers) 31 | check_for_blocked_site(response, url) 32 | if isinstance(response, requests.Response): 33 | return int(response.text.strip()) 34 | raise response 35 | 36 | 37 | def check_for_blocked_site( 38 | response: Union[requests.Response, Exception], url: Optional[str] = None 39 | ) -> None: 40 | """ 41 | Checks that the URL can be archived by wayback machine or not. 42 | robots.txt policy of the site may prevent the wayback machine. 43 | """ 44 | # see https://github.com/akamhy/waybackpy/issues/157 45 | 46 | # the following if block is to make mypy happy. 47 | if isinstance(response, Exception): 48 | raise response 49 | 50 | if not url: 51 | url = "The requested content" 52 | if ( 53 | "org.archive.util.io.RuntimeIOException: " 54 | + "org.archive.wayback.exception.AdministrativeAccessControlException: " 55 | + "Blocked Site Error" 56 | in response.text.strip() 57 | ): 58 | raise BlockedSiteError( 59 | f"{url} is excluded from Wayback Machine by the site's robots.txt policy." 
60 | ) 61 | 62 | 63 | def full_url(endpoint: str, params: Dict[str, Any]) -> str: 64 | """ 65 | As the function's name already implies that it returns 66 | full URL, but why we need a function for generating full URL? 67 | The CDX server can support multiple arguments for parameters 68 | such as filter and collapse and this function adds them without 69 | overwriting earlier added arguments. 70 | """ 71 | if not params: 72 | return endpoint 73 | _full_url = endpoint if endpoint.endswith("?") else (endpoint + "?") 74 | 75 | for key, val in params.items(): 76 | key = "filter" if key.startswith("filter") else key 77 | key = "collapse" if key.startswith("collapse") else key 78 | amp = "" if _full_url.endswith("?") else "&" 79 | val = quote(str(val), safe="") 80 | _full_url += f"{amp}{key}={val}" 81 | 82 | return _full_url 83 | 84 | 85 | def get_response( 86 | url: str, 87 | headers: Optional[Dict[str, str]] = None, 88 | retries: int = 5, 89 | backoff_factor: float = 0.5, 90 | ) -> Union[requests.Response, Exception]: 91 | """ 92 | Makes get request to the CDX server and returns the response. 93 | """ 94 | session = requests.Session() 95 | 96 | retries_ = Retry( 97 | total=retries, 98 | backoff_factor=backoff_factor, 99 | status_forcelist=[500, 502, 503, 504], 100 | ) 101 | 102 | session.mount("https://", HTTPAdapter(max_retries=retries_)) 103 | response = session.get(url, headers=headers) 104 | session.close() 105 | check_for_blocked_site(response) 106 | return response 107 | 108 | 109 | def check_filters(filters: List[str]) -> None: 110 | """ 111 | Check that the filter arguments passed by the end-user are valid. 112 | If not valid then raise WaybackError. 
def check_collapses(collapses: List[str]) -> bool:
    """
    Check that the collapse arguments passed by the end-user are valid.

    Each entry must be exactly a CDX field name (``urlkey``, ``timestamp``,
    ``original``, ``mimetype``, ``statuscode``, ``digest`` or ``length``),
    optionally followed by ``:N`` where ``N`` is a run of digits.

    Returns ``True`` when every entry is valid (an empty list is valid).
    Raises ``WaybackError`` when ``collapses`` is not a list or when any
    entry does not follow the syntax above.
    """
    if not isinstance(collapses, list):
        raise WaybackError("collapses must be a list.")

    # The whole argument has to match: a plain search would accept any string
    # that merely contains a field name somewhere, e.g. "xtimestamp".
    pattern = (
        r"(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)"
        r"(?::[0-9]{1,99})?"
    )

    for collapse in collapses:
        if not isinstance(collapse, str) or re.fullmatch(pattern, collapse) is None:
            exc_message = (
                f"collapse argument '{collapse}' "
                "is not following the cdx collapse syntax."
            )
            raise WaybackError(exc_message)

    return True
def handle_cdx_closest_derivative_methods(
    cdx_api: "WaybackMachineCDXServerAPI",
    oldest: bool,
    near: bool,
    newest: bool,
    near_args: Optional[Dict[str, int]] = None,
) -> None:
    """
    Handles the closest parameter derivative methods.

    Exactly one lookup is performed on ``cdx_api``; when several flags are
    set the precedence is ``near``, then ``newest``, then ``oldest``.
    ``near_args`` is forwarded as keyword arguments to ``cdx_api.near`` and
    is ignored for the other two lookups.

    On success the archive URL is echoed to standard output. A
    ``NoCDXRecordFound`` or ``BlockedSiteError`` raised by the lookup is
    reported on standard error instead of propagating.

    Raises ``ValueError`` when none of the three flags is set.
    """
    if not (near or newest or oldest):
        # Previously this fell through to an UnboundLocalError on archive_url.
        raise ValueError("One of near, newest or oldest must be requested.")

    try:
        if near:
            snapshot = cdx_api.near(**(near_args or {}))
        elif newest:
            snapshot = cdx_api.newest()
        else:
            snapshot = cdx_api.oldest()
        archive_url = snapshot.archive_url
    except NoCDXRecordFound as exc:
        click.echo(click.style("NoCDXRecordFound: ", fg="red") + str(exc), err=True)
        return
    except BlockedSiteError as exc:
        click.echo(click.style("BlockedSiteError: ", fg="red") + str(exc), err=True)
        return

    click.echo("Archive URL:")
    click.echo(archive_url)
def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
    """
    Echo every URL from ``url_gen`` and append it to a text file.
    Mainly here because of backwards compatibility.

    The file lives in the current working directory and is named
    ``<domain>-urls-<uid>.txt``: ``<domain>`` is the host parsed from the
    first URL (``domain-unknown`` when it cannot be parsed) and ``<uid>`` is
    six random lowercase letters/digits. The file is opened once, in append
    mode, and closed when the generator is exhausted or raises.

    When ``url_gen`` yields nothing, no file is created and a hint is echoed
    instead.
    """
    sys_random = random.SystemRandom()
    uid = "".join(
        sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
    )

    urls = iter(url_gen)
    first_url = next(urls, None)
    if first_url is None:
        click.echo("No known URLs found. Please try a different input!")
        return

    # The first URL decides the file name for the whole run.
    match = re.search("https?://([A-Za-z_0-9.-]+).*", first_url)
    domain = match.group(1) if match else "domain-unknown"
    file_name = f"{domain}-urls-{uid}.txt"
    file_path = os.path.join(os.getcwd(), file_name)

    url_count = 0
    # Append mode creates the file when missing and never truncates an
    # existing one, so no separate "create" step is needed.
    with open(file_path, "a", encoding="utf-8") as file:
        url: Optional[str] = first_url
        while url is not None:
            file.write(f"{url}\n")
            click.echo(url)
            url_count += 1
            # Pull lazily so URLs are echoed as soon as they are produced.
            url = next(urls, None)

    click.echo(
        f"\n\n{url_count} URLs saved inside '{file_name}' in the current "
        + "working directory."
    )
233 | "--known_urls", 234 | default=False, 235 | is_flag=True, 236 | help="List known URLs. Uses CDX API.", 237 | ) 238 | @click.option( 239 | "-sub", 240 | "--subdomain", 241 | default=False, 242 | is_flag=True, 243 | help="Use with '--known_urls' to include known URLs for subdomains.", 244 | ) 245 | @click.option( 246 | "-f", 247 | "--file", 248 | default=False, 249 | is_flag=True, 250 | help="Use with '--known_urls' to save the URLs in file at current directory.", 251 | ) 252 | @click.option( 253 | "--cdx", 254 | default=False, 255 | is_flag=True, 256 | help="Flag for using CDX API.", 257 | ) 258 | @click.option( 259 | "-st", 260 | "--start-timestamp", 261 | "--start_timestamp", 262 | "--from", 263 | help="Start timestamp for CDX API in yyyyMMddhhmmss format.", 264 | ) 265 | @click.option( 266 | "-et", 267 | "--end-timestamp", 268 | "--end_timestamp", 269 | "--to", 270 | help="End timestamp for CDX API in yyyyMMddhhmmss format.", 271 | ) 272 | @click.option( 273 | "-C", 274 | "--closest", 275 | help="Archive that are closest the timestamp passed as arguments to this " 276 | + "parameter.", 277 | ) 278 | @click.option( 279 | "-f", 280 | "--cdx-filter", 281 | "--cdx_filter", 282 | "--filter", 283 | multiple=True, 284 | help="Filter on a specific field or all the CDX fields.", 285 | ) 286 | @click.option( 287 | "-mt", 288 | "--match-type", 289 | "--match_type", 290 | help="The default behavior is to return matches for an exact URL. " 291 | + "However, the CDX server can also return results matching a certain prefix, " 292 | + "a certain host, or all sub-hosts by using the match_type", 293 | ) 294 | @click.option( 295 | "-st", 296 | "--sort", 297 | help="Choose one from default, closest or reverse. 
It returns sorted CDX entries " 298 | + "in the response.", 299 | ) 300 | @click.option( 301 | "-up", 302 | "--use-pagination", 303 | "--use_pagination", 304 | default=False, 305 | is_flag=True, 306 | help="Use the pagination API of the CDX server instead of the default one.", 307 | ) 308 | @click.option( 309 | "-gz", 310 | "--gzip", 311 | help="To disable gzip compression pass false as argument to this parameter. " 312 | + "The default behavior is gzip compression enabled.", 313 | ) 314 | @click.option( 315 | "-c", 316 | "--collapse", 317 | multiple=True, 318 | help="Filtering or 'collapse' results based on a field, or a substring of a field.", 319 | ) 320 | @click.option( 321 | "-l", 322 | "--limit", 323 | help="Number of maximum record that CDX API is asked to return per API call, " 324 | + "default value is 25000 records.", 325 | ) 326 | @click.option( 327 | "-cp", 328 | "--cdx-print", 329 | "--cdx_print", 330 | multiple=True, 331 | help="Print only certain fields of the CDX API response, " 332 | + "if this parameter is not used then the plain text response of the CDX API " 333 | + "will be printed.", 334 | ) 335 | def main( # pylint: disable=no-value-for-parameter 336 | user_agent: str, 337 | version: bool, 338 | show_license: bool, 339 | newest: bool, 340 | oldest: bool, 341 | near: bool, 342 | save: bool, 343 | headers: bool, 344 | known_urls: bool, 345 | subdomain: bool, 346 | file: bool, 347 | cdx: bool, 348 | use_pagination: bool, 349 | cdx_filter: List[str], 350 | collapse: List[str], 351 | cdx_print: List[str], 352 | url: Optional[str] = None, 353 | year: Optional[int] = None, 354 | month: Optional[int] = None, 355 | day: Optional[int] = None, 356 | hour: Optional[int] = None, 357 | minute: Optional[int] = None, 358 | start_timestamp: Optional[str] = None, 359 | end_timestamp: Optional[str] = None, 360 | closest: Optional[str] = None, 361 | match_type: Optional[str] = None, 362 | sort: Optional[str] = None, 363 | gzip: Optional[str] = None, 364 | limit: 
Optional[str] = None, 365 | ) -> None: 366 | """\b 367 | _ _ 368 | | | | | 369 | __ ____ _ _ _| |__ __ _ ___| | ___ __ _ _ 370 | \\ \\ /\\ / / _` | | | | '_ \\ / _` |/ __| |/ / '_ \\| | | | 371 | \\ V V / (_| | |_| | |_) | (_| | (__| <| |_) | |_| | 372 | \\_/\\_/ \\__,_|\\__, |_.__/ \\__,_|\\___|_|\\_\\ .__/ \\__, | 373 | __/ | | | __/ | 374 | |___/ |_| |___/ 375 | 376 | Python package & CLI tool that interfaces the Wayback Machine APIs 377 | 378 | Repository: https://github.com/akamhy/waybackpy 379 | 380 | Documentation: https://github.com/akamhy/waybackpy/wiki/CLI-docs 381 | 382 | waybackpy - CLI usage(Demo video): https://asciinema.org/a/469890 383 | 384 | Released under the MIT License. Use the flag --license for license. 385 | 386 | """ 387 | if version: 388 | click.echo(f"waybackpy version {__version__}") 389 | 390 | elif show_license: 391 | click.echo( 392 | requests.get( 393 | url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE" 394 | ).text 395 | ) 396 | elif url is None: 397 | click.echo( 398 | click.style("NoURLDetected: ", fg="red") 399 | + "No URL detected. 
" 400 | + "Please provide an URL.", 401 | err=True, 402 | ) 403 | 404 | elif oldest: 405 | cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent) 406 | handle_cdx_closest_derivative_methods(cdx_api, oldest, near, newest) 407 | 408 | elif newest: 409 | cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent) 410 | handle_cdx_closest_derivative_methods(cdx_api, oldest, near, newest) 411 | 412 | elif near: 413 | cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent) 414 | near_args = {} 415 | keys = ["year", "month", "day", "hour", "minute"] 416 | args_arr = [year, month, day, hour, minute] 417 | for key, arg in zip(keys, args_arr): 418 | if arg: 419 | near_args[key] = arg 420 | handle_cdx_closest_derivative_methods( 421 | cdx_api, oldest, near, newest, near_args=near_args 422 | ) 423 | 424 | elif save: 425 | save_api = WaybackMachineSaveAPI(url, user_agent=user_agent) 426 | save_api.save() 427 | click.echo("Archive URL:") 428 | click.echo(save_api.archive_url) 429 | click.echo("Cached save:") 430 | click.echo(save_api.cached_save) 431 | if headers: 432 | click.echo("Save API headers:") 433 | click.echo(save_api.headers) 434 | 435 | elif known_urls: 436 | wayback = Url(url, user_agent) 437 | url_gen = wayback.known_urls(subdomain=subdomain) 438 | 439 | if file: 440 | save_urls_on_file(url_gen) 441 | else: 442 | for url_ in url_gen: 443 | click.echo(url_) 444 | 445 | elif cdx: 446 | data = [ 447 | url, 448 | user_agent, 449 | start_timestamp, 450 | end_timestamp, 451 | cdx_filter, 452 | collapse, 453 | cdx_print, 454 | limit, 455 | gzip, 456 | match_type, 457 | sort, 458 | use_pagination, 459 | closest, 460 | ] 461 | handle_cdx(data) 462 | 463 | else: 464 | 465 | click.echo( 466 | click.style("NoCommandFound: ", fg="red") 467 | + "Only URL passed, but did not specify what to do with the URL. 
" 468 | + "Use --help flag for help using waybackpy.", 469 | err=True, 470 | ) 471 | 472 | 473 | if __name__ == "__main__": 474 | main() # type: ignore # pylint: disable=no-value-for-parameter 475 | -------------------------------------------------------------------------------- /waybackpy/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | waybackpy.exceptions 3 | ~~~~~~~~~~~~~~~~~~~ 4 | This module contains the set of Waybackpy's exceptions. 5 | """ 6 | 7 | 8 | class WaybackError(Exception): 9 | """ 10 | Raised when Waybackpy can not return what you asked for. 11 | 12 | 1) Wayback Machine API Service is unreachable/down. 13 | 2) You passed illegal arguments. 14 | 15 | All other exceptions are inherited from this main exception. 16 | """ 17 | 18 | 19 | class NoCDXRecordFound(WaybackError): 20 | """ 21 | No records returned by the CDX server for a query. 22 | Raised when the user invokes near(), newest() or oldest() methods 23 | and there are no archives. 24 | """ 25 | 26 | 27 | class BlockedSiteError(WaybackError): 28 | """ 29 | Raised when the archives for website/URLs that was excluded from Wayback 30 | Machine are requested via the CDX server API. 31 | """ 32 | 33 | 34 | class TooManyRequestsError(WaybackError): 35 | """ 36 | Raised when you make more than 15 requests per 37 | minute and the Wayback Machine returns 429. 38 | 39 | See https://github.com/akamhy/waybackpy/issues/131 40 | """ 41 | 42 | 43 | class MaximumRetriesExceeded(WaybackError): 44 | """ 45 | MaximumRetriesExceeded 46 | """ 47 | 48 | 49 | class MaximumSaveRetriesExceeded(MaximumRetriesExceeded): 50 | """ 51 | MaximumSaveRetriesExceeded 52 | """ 53 | 54 | 55 | class ArchiveNotInAvailabilityAPIResponse(WaybackError): 56 | """ 57 | Could not parse the archive in the JSON response of the availability API. 
58 | """ 59 | 60 | 61 | class InvalidJSONInAvailabilityAPIResponse(WaybackError): 62 | """ 63 | availability api returned invalid JSON 64 | """ 65 | -------------------------------------------------------------------------------- /waybackpy/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akamhy/waybackpy/3b3e78d901a600bb22943202c6a8981ca04a5e48/waybackpy/py.typed -------------------------------------------------------------------------------- /waybackpy/save_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module interfaces the Wayback Machine's SavePageNow (SPN) API. 3 | 4 | The module has WaybackMachineSaveAPI class which should be used by the users of 5 | this module to use the SavePageNow API. 6 | """ 7 | 8 | import re 9 | import time 10 | from datetime import datetime 11 | from typing import Dict, Optional 12 | 13 | import requests 14 | from requests.adapters import HTTPAdapter 15 | from requests.models import Response 16 | from requests.structures import CaseInsensitiveDict 17 | from urllib3.util.retry import Retry 18 | 19 | from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError, WaybackError 20 | from .utils import DEFAULT_USER_AGENT 21 | 22 | 23 | class WaybackMachineSaveAPI: 24 | """ 25 | WaybackMachineSaveAPI class provides an interface for saving URLs on the 26 | Wayback Machine. 
def get_save_request_headers(self) -> None:
    """
    Send one GET request to the save endpoint and record the outcome.

    The request goes to ``self.request_url`` with ``self.request_headers``
    through a session whose HTTPS adapter retries according to
    ``self.total_save_retries``, ``self.backoff_factor`` and
    ``self.status_forcelist``.

    On return ``self.response``, ``self.headers``, ``self.status_code`` and
    ``self.response_url`` describe the response. The archive is usually in
    the headers but it can also be the response URL, as the Wayback Machine
    redirects to the archive after a successful capture.

    Raises ``TooManyRequestsError`` on HTTP 429 and ``WaybackError`` on
    HTTP 509.
    """
    retries = Retry(
        total=self.total_save_retries,
        backoff_factor=self.backoff_factor,
        status_forcelist=self.status_forcelist,
    )

    # The context manager closes the session even when the request raises,
    # so a failed save no longer leaks the connection pool.
    with requests.Session() as session:
        session.mount("https://", HTTPAdapter(max_retries=retries))
        self.response = session.get(self.request_url, headers=self.request_headers)

    # requests.response.headers is requests.structures.CaseInsensitiveDict
    self.headers = self.response.headers
    self.status_code = self.response.status_code
    self.response_url = self.response.url

    if self.status_code == 429:
        # why wait 5 minutes and 429?
        # see https://github.com/akamhy/waybackpy/issues/97
        raise TooManyRequestsError(
            f"Can not save '{self.url}'. "
            f"Save request refused by the server. "
            f"Save Page Now limits saving 15 URLs per minutes. "
            f"Try waiting for 5 minutes and then try again."
        )

    # why 509?
    # see https://github.com/akamhy/waybackpy/pull/99
    # also https://t.co/xww4YJ0Iwc
    if self.status_code == 509:
        raise WaybackError(
            f"Can not save '{self.url}'. You have probably reached the "
            f"limit of active sessions."
        )
def timestamp(self) -> datetime:
    """
    Read the 14-digit timestamp off ``self._archive_url`` and return it as a
    naive ``datetime``.

    As a side effect ``self.cached_save`` is set: ``True`` when the archive
    timestamp is older than ``self.instance_birth_time`` (compared at
    one-second resolution), ``False`` otherwise. The flag can be used to
    check whether the Wayback Machine served an already existing capture
    instead of making a new one.

    Raises ``ValueError`` when no timestamp can be parsed from the archive
    URL.
    """
    regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
    match = re.search(regex, str(self._archive_url))

    if match is None:
        raise ValueError(
            f"Can not parse timestamp from archive URL, '{self._archive_url}'."
        )

    timestamp = datetime.strptime(match.group(1), "%Y%m%d%H%M%S")

    # Compare the datetimes directly. Going through time.mktime would
    # reinterpret both values as local time and can misorder them around
    # DST transitions. Archive timestamps have one-second resolution, so the
    # birth time is truncated to whole seconds to keep the same granularity.
    birth_time = self.instance_birth_time.replace(microsecond=0)
    self.cached_save = timestamp < birth_time

    return timestamp
def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
    """
    Converts Unix time to Wayback Machine timestamp, Wayback Machine
    timestamp format is yyyyMMddhhmmss.

    ``unix_timestamp`` is passed through ``int`` first and is interpreted
    as seconds since the epoch in UTC.
    """
    # Imported locally so this function does not depend on a change to the
    # module-level import line, which only brings in ``datetime``.
    from datetime import timezone  # pylint: disable=import-outside-toplevel

    # datetime.utcfromtimestamp is deprecated since Python 3.12; the
    # timezone-aware call below yields the same UTC wall-clock fields.
    moment = datetime.fromtimestamp(int(unix_timestamp), tz=timezone.utc)
    return moment.strftime("%Y%m%d%H%M%S")


def wayback_timestamp(**kwargs: int) -> str:
    """
    Concatenate ``year``, ``month``, ``day``, ``hour`` and ``minute`` into a
    Wayback Machine style timestamp prefix.

    Each value is converted with ``str`` and left-padded with zeros to at
    least two characters, so a four-digit year gives a 12-character
    ``YYYYMMDDhhmm`` string (no seconds component is produced).

    Raises ``KeyError`` when any of the five keyword arguments is missing.
    """
    return "".join(
        str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
    )
34 | """ 35 | 36 | def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None: 37 | self.url = url 38 | self.user_agent = str(user_agent) 39 | self.archive_url: Optional[str] = None 40 | self.timestamp: Optional[datetime] = None 41 | self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI( 42 | self.url, user_agent=self.user_agent 43 | ) 44 | self.wayback_machine_save_api: Optional[WaybackMachineSaveAPI] = None 45 | self.headers: Optional[CaseInsensitiveDict[str]] = None 46 | self.json: Optional[ResponseJSON] = None 47 | 48 | def __str__(self) -> str: 49 | if not self.archive_url: 50 | self.newest() 51 | return str(self.archive_url) 52 | 53 | def __len__(self) -> int: 54 | td_max = timedelta( 55 | days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999 56 | ) 57 | 58 | if not isinstance(self.timestamp, datetime): 59 | self.oldest() 60 | 61 | if not isinstance(self.timestamp, datetime): 62 | raise TypeError("timestamp must be a datetime") 63 | 64 | if self.timestamp == datetime.max: 65 | return td_max.days 66 | 67 | return (datetime.utcnow() - self.timestamp).days 68 | 69 | def save(self) -> "Url": 70 | """Save the URL on wayback machine.""" 71 | self.wayback_machine_save_api = WaybackMachineSaveAPI( 72 | self.url, user_agent=self.user_agent 73 | ) 74 | self.archive_url = self.wayback_machine_save_api.archive_url 75 | self.timestamp = self.wayback_machine_save_api.timestamp() 76 | self.headers = self.wayback_machine_save_api.headers 77 | return self 78 | 79 | def near( 80 | self, 81 | year: Optional[int] = None, 82 | month: Optional[int] = None, 83 | day: Optional[int] = None, 84 | hour: Optional[int] = None, 85 | minute: Optional[int] = None, 86 | unix_timestamp: Optional[int] = None, 87 | ) -> "Url": 88 | """Returns the archive of the URL close to a date and time.""" 89 | self.wayback_machine_availability_api.near( 90 | year=year, 91 | month=month, 92 | day=day, 93 | hour=hour, 94 | minute=minute, 95 | 
unix_timestamp=unix_timestamp, 96 | ) 97 | self.set_availability_api_attrs() 98 | return self 99 | 100 | def oldest(self) -> "Url": 101 | """Returns the oldest archive of the URL.""" 102 | self.wayback_machine_availability_api.oldest() 103 | self.set_availability_api_attrs() 104 | return self 105 | 106 | def newest(self) -> "Url": 107 | """Returns the newest archive of the URL.""" 108 | self.wayback_machine_availability_api.newest() 109 | self.set_availability_api_attrs() 110 | return self 111 | 112 | def set_availability_api_attrs(self) -> None: 113 | """Set the attributes for total backwards compatibility.""" 114 | self.archive_url = self.wayback_machine_availability_api.archive_url 115 | self.json = self.wayback_machine_availability_api.json 116 | self.JSON = self.json # for backwards compatibility, do not remove it. 117 | self.timestamp = self.wayback_machine_availability_api.timestamp() 118 | 119 | def total_archives( 120 | self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None 121 | ) -> int: 122 | """ 123 | Returns an integer which indicates total number of archives for an URL. 124 | Useless in my opinion, only here because of backwards compatibility. 
125 | """ 126 | cdx = WaybackMachineCDXServerAPI( 127 | self.url, 128 | user_agent=self.user_agent, 129 | start_timestamp=start_timestamp, 130 | end_timestamp=end_timestamp, 131 | ) 132 | 133 | count = 0 134 | for _ in cdx.snapshots(): 135 | count = count + 1 136 | return count 137 | 138 | def known_urls( 139 | self, 140 | subdomain: bool = False, 141 | host: bool = False, 142 | start_timestamp: Optional[str] = None, 143 | end_timestamp: Optional[str] = None, 144 | match_type: str = "prefix", 145 | ) -> Generator[str, None, None]: 146 | """Yields known URLs for any URL.""" 147 | if subdomain: 148 | match_type = "domain" 149 | if host: 150 | match_type = "host" 151 | 152 | cdx = WaybackMachineCDXServerAPI( 153 | self.url, 154 | user_agent=self.user_agent, 155 | start_timestamp=start_timestamp, 156 | end_timestamp=end_timestamp, 157 | match_type=match_type, 158 | collapses=["urlkey"], 159 | ) 160 | 161 | for snapshot in cdx.snapshots(): 162 | yield snapshot.original 163 | --------------------------------------------------------------------------------