├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
    └── workflows
    │   ├── build-test.yml
    │   ├── codeql-analysis.yml
    │   ├── python-publish.yml
    │   └── unit-test.yml
├── .gitignore
├── .pep8speaks.yml
├── .whitesource
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── _config.yml
├── assets
    └── waybackpy_logo.svg
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── setup.cfg
├── setup.py
├── snapcraft.yaml
├── tests
    ├── __init__.py
    ├── test_availability_api.py
    ├── test_cdx_api.py
    ├── test_cdx_snapshot.py
    ├── test_cdx_utils.py
    ├── test_cli.py
    ├── test_save_api.py
    ├── test_utils.py
    └── test_wrapper.py
└── waybackpy
    ├── __init__.py
    ├── availability_api.py
    ├── cdx_api.py
    ├── cdx_snapshot.py
    ├── cdx_utils.py
    ├── cli.py
    ├── exceptions.py
    ├── py.typed
    ├── save_api.py
    ├── utils.py
    └── wrapper.py


/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: akamhy
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 
16 | 1. Go to '...'
17 | 2. Click on '....'
18 | 3. Scroll down to '....'
19 | 4. See error
20 | 
21 | **Expected behavior**
22 | A clear and concise description of what you expected to happen.
23 | 
24 | **Screenshots**
25 | If applicable, add screenshots to help explain your problem.
26 | 
27 | **Version:**
28 | 
29 | - OS: [e.g. iOS]
30 | - Version [e.g. 22]
31 | - Is latest version? [e.g. Yes/No]
32 | 
33 | **Additional context**
34 | Add any other context about the problem here.
35 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: enhancement
 6 | assignees: akamhy
 7 | ---
 8 | 
 9 | **Is your feature request related to a problem? Please describe.**
10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
11 | 
12 | **Describe the solution you'd like**
13 | A clear and concise description of what you want to happen.
14 | 
15 | **Describe alternatives you've considered**
16 | A clear and concise description of any alternative solutions or features you've considered.
17 | 
18 | **Additional context**
19 | Add any other context or screenshots about the feature request here.
20 | 


--------------------------------------------------------------------------------
/.github/workflows/build-test.yml:
--------------------------------------------------------------------------------
 1 | # This workflow installs the build dependencies and builds the sdist and wheel with a variety of Python versions
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Build
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ master ]
 9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       matrix:
17 |         python-version: ['3.7', '3.10']
18 |     steps:
19 |     - uses: actions/checkout@v2
20 |     - name: Set up Python ${{ matrix.python-version }}
21 |       uses: actions/setup-python@v2
22 |       with:
23 |         python-version: ${{ matrix.python-version }}
24 |     - name: Install dependencies
25 |       run: |
26 |         python -m pip install --upgrade pip
27 |         pip install -U setuptools wheel
28 |     - name: Build test the package
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ master ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ master ]
20 |   schedule:
21 |     - cron: '30 6 * * 1'
22 | 
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 | 
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: [ 'python' ]
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 |         # Learn more about CodeQL language support at https://git.io/codeql-language-support
38 | 
39 |     steps:
40 |     - name: Checkout repository
41 |       uses: actions/checkout@v2
42 | 
43 |     # Initializes the CodeQL tools for scanning.
44 |     - name: Initialize CodeQL
45 |       uses: github/codeql-action/init@v1
46 |       with:
47 |         languages: ${{ matrix.language }}
48 |         # If you wish to specify custom queries, you can do so here or in a config file.
49 |         # By default, queries listed here will override any specified in a config file.
50 |         # Prefix the list here with "+" to use these queries and those in the config file.
51 |         # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 | 
53 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
54 |     # If this step fails, then you should remove it and run the build manually (see below)
55 |     - name: Autobuild
56 |       uses: github/codeql-action/autobuild@v1
57 | 
58 |     # ℹ️ Command-line programs to run using the OS shell.
59 |     # 📚 https://git.io/JvXDl
60 | 
61 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 |     #    and modify them (or add more) to build your code if your project
63 |     #    uses a compiled language
64 | 
65 |     #- run: |
66 |     #   make bootstrap
67 |     #   make release
68 | 
69 |     - name: Perform CodeQL Analysis
70 |       uses: github/codeql-action/analyze@v1
71 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: Upload Python Package
 5 | 
 6 | on:
 7 |   release:
 8 |     types: [created]
 9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*
32 | 


--------------------------------------------------------------------------------
/.github/workflows/unit-test.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Tests
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ master ]
 9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ['3.10']
19 |     steps:
20 |     - uses: actions/checkout@v2
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v2
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - name: Install dependencies
26 |       run: |
27 |         python -m pip install --upgrade pip
28 |         pip install '.[dev]'
29 |     - name: Lint with flake8
30 |       run: |
31 |         flake8 . --count --show-source --statistics
32 |     - name: Lint with black
33 |       run: |
34 |         black . --check --diff
35 |     - name: Static type test with mypy
36 |       run: |
37 |         mypy -p waybackpy -p tests
38 |     - name: Test with pytest
39 |       run: |
40 |         pytest
41 |     - name: Upload coverage to Codecov
42 |       run: |
43 |         bash <(curl -s https://codecov.io/bash) -t ${{ secrets.CODECOV_TOKEN }}
44 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Files generated while testing
  2 | *-urls-*.txt
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | pip-wheel-metadata/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .nox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | *.py,cover
 54 | .hypothesis/
 55 | .pytest_cache/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 


--------------------------------------------------------------------------------
/.pep8speaks.yml:
--------------------------------------------------------------------------------
1 | scanner:
2 |     diff_only: True
3 |     linter: flake8
4 | 
5 | flake8:
6 |     max-line-length: 88
7 |     extend-ignore: W503,W605
8 | 


--------------------------------------------------------------------------------
/.whitesource:
--------------------------------------------------------------------------------
 1 | {
 2 |   "scanSettings": {
 3 |     "baseBranches": []
 4 |   },
 5 |   "checkRunSettings": {
 6 |     "vulnerableCheckRunConclusionLevel": "failure",
 7 |     "displayMode": "diff"
 8 |   },
 9 |   "issueSettings": {
10 |     "minSeverityLevel": "LOW"
11 |   }
12 | }
13 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite it as below."
 3 | title: waybackpy
 4 | abstract: "Python package that interfaces with the Internet Archive's Wayback Machine APIs. Archive pages and retrieve archived pages easily."
 5 | version: '3.0.6'
 6 | doi: 10.5281/ZENODO.3977276
 7 | date-released: 2022-03-15
 8 | type: software
 9 | authors:
10 |   - given-names: Akash
11 |     family-names: Mahanty
12 |     email: akamhy@yahoo.com
13 |     orcid: https://orcid.org/0000-0003-2482-8227
14 | keywords:
15 |     - Archive Website
16 |     - Wayback Machine
17 |     - Internet Archive
18 |     - Wayback Machine CLI
19 |     - Wayback Machine Python
20 |     - Internet Archiving
21 |     - Availability API
22 |     - CDX API
23 |     - savepagenow
24 | license: MIT
25 | repository-code: "https://github.com/akamhy/waybackpy"
26 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
  1 | # Contributor Covenant Code of Conduct
  2 | 
  3 | ## Our Pledge
  4 | 
  5 | We as members, contributors, and leaders pledge to make participation in our
  6 | community a harassment-free experience for everyone, regardless of age, body
  7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
  8 | identity and expression, level of experience, education, socio-economic status,
  9 | nationality, personal appearance, race, religion, or sexual identity
 10 | and orientation.
 11 | 
 12 | We pledge to act and interact in ways that contribute to an open, welcoming,
 13 | diverse, inclusive, and healthy community.
 14 | 
 15 | ## Our Standards
 16 | 
 17 | Examples of behavior that contributes to a positive environment for our
 18 | community include:
 19 | 
 20 | * Demonstrating empathy and kindness toward other people
 21 | * Being respectful of differing opinions, viewpoints, and experiences
 22 | * Giving and gracefully accepting constructive feedback
 23 | * Accepting responsibility and apologizing to those affected by our mistakes,
 24 |   and learning from the experience
 25 | * Focusing on what is best not just for us as individuals, but for the
 26 |   overall community
 27 | 
 28 | Examples of unacceptable behavior include:
 29 | 
 30 | * The use of sexualized language or imagery, and sexual attention or
 31 |   advances of any kind
 32 | * Trolling, insulting or derogatory comments, and personal or political attacks
 33 | * Public or private harassment
 34 | * Publishing others' private information, such as a physical or email
 35 |   address, without their explicit permission
 36 | * Other conduct which could reasonably be considered inappropriate in a
 37 |   professional setting
 38 | 
 39 | ## Enforcement Responsibilities
 40 | 
 41 | Community leaders are responsible for clarifying and enforcing our standards of
 42 | acceptable behavior and will take appropriate and fair corrective action in
 43 | response to any behavior that they deem inappropriate, threatening, offensive,
 44 | or harmful.
 45 | 
 46 | Community leaders have the right and responsibility to remove, edit, or reject
 47 | comments, commits, code, wiki edits, issues, and other contributions that are
 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
 49 | decisions when appropriate.
 50 | 
 51 | ## Scope
 52 | 
 53 | This Code of Conduct applies within all community spaces, and also applies when
 54 | an individual is officially representing the community in public spaces.
 55 | Examples of representing our community include using an official e-mail address,
 56 | posting via an official social media account, or acting as an appointed
 57 | representative at an online or offline event.
 58 | 
 59 | ## Enforcement
 60 | 
 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
 62 | reported to the community leaders responsible for enforcement at
 63 | akamhy@yahoo.com.
 64 | All complaints will be reviewed and investigated promptly and fairly.
 65 | 
 66 | All community leaders are obligated to respect the privacy and security of the
 67 | reporter of any incident.
 68 | 
 69 | ## Enforcement Guidelines
 70 | 
 71 | Community leaders will follow these Community Impact Guidelines in determining
 72 | the consequences for any action they deem in violation of this Code of Conduct:
 73 | 
 74 | ### 1. Correction
 75 | 
 76 | **Community Impact**: Use of inappropriate language or other behavior deemed
 77 | unprofessional or unwelcome in the community.
 78 | 
 79 | **Consequence**: A private, written warning from community leaders, providing
 80 | clarity around the nature of the violation and an explanation of why the
 81 | behavior was inappropriate. A public apology may be requested.
 82 | 
 83 | ### 2. Warning
 84 | 
 85 | **Community Impact**: A violation through a single incident or series
 86 | of actions.
 87 | 
 88 | **Consequence**: A warning with consequences for continued behavior. No
 89 | interaction with the people involved, including unsolicited interaction with
 90 | those enforcing the Code of Conduct, for a specified period of time. This
 91 | includes avoiding interactions in community spaces as well as external channels
 92 | like social media. Violating these terms may lead to a temporary or
 93 | permanent ban.
 94 | 
 95 | ### 3. Temporary Ban
 96 | 
 97 | **Community Impact**: A serious violation of community standards, including
 98 | sustained inappropriate behavior.
 99 | 
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 | 
106 | ### 4. Permanent Ban
107 | 
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior,  harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 | 
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 | 
115 | ## Attribution
116 | 
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | <https://www.contributor-covenant.org/version/2/0/code_of_conduct.html>.
120 | 
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 | 
124 | [homepage]: https://www.contributor-covenant.org
125 | 
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | <https://www.contributor-covenant.org/faq>. Translations are available at
128 | <https://www.contributor-covenant.org/translations>.
129 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Welcome to the waybackpy contributing guide
 2 | 
 3 | 
 4 | ## Getting started
 5 | 
 6 | Read our [Code of Conduct](./CODE_OF_CONDUCT.md).
 7 | 
 8 | ## Creating an issue
 9 | 
10 | It's a good idea to open an issue and discuss suspected bugs and new feature ideas with the maintainers. Somebody might be working on your bug/idea and it would be best to discuss it to avoid wasting your time. It is a recommendation. You may avoid creating an issue and directly open pull requests.
11 | 
12 | ## Fork this repository
13 | 
14 | Fork this repository. See '[Fork a repo](https://docs.github.com/en/get-started/quickstart/fork-a-repo)' for help forking this repository on GitHub.
15 | 
16 | ## Make changes to the forked copy
17 | 
18 | Make the required changes to your forked copy of waybackpy. Please don't forget to add or update comments and docstrings.
19 | 
20 | ## Add tests for your changes
21 | 
22 | You have made the required changes to the codebase, now go ahead and add tests for newly written methods/functions and update the tests of code that you changed.
23 | 
24 | ## Testing and Linting
25 | 
26 | You must run the tests and linter on your changes before opening a pull request.
27 | 
28 | ### pytest
29 | 
30 | Runs all tests from the tests directory. pytest is a mature full-featured Python testing tool.
31 | ```bash
32 | pytest
33 | ```
34 | 
35 | ### mypy
36 | 
37 | Mypy is a static type checker for Python. Type checkers help ensure that you're using variables and functions in your code correctly.
38 | ```bash
39 | mypy -p waybackpy -p tests
40 | ```
41 | 
42 | ### black
43 | 
44 | After testing with pytest and type checking with mypy, run black on the code base. The code style used by the project is 'black'.
45 | 
46 | ```bash
47 | black .
48 | ```
49 | 
50 | ## Create a pull request
51 | 
52 | Read [Creating a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request).
53 | 
54 | Try to make sure that all automated tests are passing, and if some of them do not pass then don't worry. Tests are meant to catch bugs and a failed test is better than introducing bugs to the master branch.
55 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020-2022 waybackpy contributors ( https://github.com/akamhy/waybackpy/graphs/contributors )
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <!-- markdownlint-disable MD033 MD041 -->
  2 | <div align="center">
  3 | 
  4 | <img src="https://raw.githubusercontent.com/akamhy/waybackpy/master/assets/waybackpy_logo.svg"><br>
  5 | 
  6 | <h3>Python package & CLI tool that interfaces the Wayback Machine APIs</h3>
  7 | 
  8 | </div>
  9 | 
 10 | <p align="center">
 11 | <a href="https://github.com/akamhy/waybackpy/actions?query=workflow%3ATests"><img alt="Unit Tests" src="https://github.com/akamhy/waybackpy/workflows/Tests/badge.svg"></a>
 12 | <a href="https://codecov.io/gh/akamhy/waybackpy"><img alt="codecov" src="https://codecov.io/gh/akamhy/waybackpy/branch/master/graph/badge.svg"></a>
 13 | <a href="https://pypi.org/project/waybackpy/"><img alt="pypi" src="https://img.shields.io/pypi/v/waybackpy.svg"></a>
 14 | <a href="https://pepy.tech/project/waybackpy?versions=2*&versions=1*&versions=3*"><img alt="Downloads" src="https://pepy.tech/badge/waybackpy/month"></a>
 15 | <a href="https://app.codacy.com/gh/akamhy/waybackpy?utm_source=github.com&utm_medium=referral&utm_content=akamhy/waybackpy&utm_campaign=Badge_Grade_Settings"><img alt="Codacy Badge" src="https://api.codacy.com/project/badge/Grade/6d777d8509f642ac89a20715bb3a6193"></a>
 16 | <a href="https://github.com/akamhy/waybackpy/commits/master"><img alt="GitHub latest commit" src="https://img.shields.io/github/last-commit/akamhy/waybackpy?color=blue&style=flat-square"></a>
 17 | <a href="#"><img alt="PyPI - Python Version" src="https://img.shields.io/pypi/pyversions/waybackpy?style=flat-square"></a>
 18 | <a href="https://github.com/psf/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
 19 | </p>
 20 | 
 21 | ---
 22 | 
 23 | # <img src="https://github.githubassets.com/images/icons/emoji/unicode/2b50.png" width="30"></img> Introduction
 24 | 
 25 | Waybackpy is a Python package and a CLI tool that interfaces with the Wayback Machine APIs.
 26 | 
 27 | Internet Archive's Wayback Machine has 3 useful public APIs.
 28 | 
 29 | - SavePageNow or Save API
 30 | - CDX Server API
 31 | - Availability API
 32 | 
 33 | These three APIs can be accessed via waybackpy, either by importing it in a Python file/module or from the command-line interface.
 34 | 
 35 | ## <img src="https://github.githubassets.com/images/icons/emoji/unicode/1f3d7.png" width="20"></img> Installation
 36 | 
 37 | **Using [pip](https://en.wikipedia.org/wiki/Pip_(package_manager)), from [PyPI](https://pypi.org/) (recommended)**:
 38 | 
 39 | ```bash
 40 | pip install waybackpy -U
 41 | ```
 42 | 
 43 | **Using [conda](https://en.wikipedia.org/wiki/Conda_(package_manager)), from [conda-forge](https://anaconda.org/conda-forge/waybackpy) (recommended)**:
 44 | 
 45 | See also [waybackpy feedstock](https://github.com/conda-forge/waybackpy-feedstock), maintainers are [@rafaelrdealmeida](https://github.com/rafaelrdealmeida/),
 46 |  [@labriunesp](https://github.com/labriunesp/)
 47 |  and [@akamhy](https://github.com/akamhy/).
 48 | 
 49 | ```bash
 50 | conda install -c conda-forge waybackpy
 51 | ```
 52 | 
 53 | **Install directly from [this git repository](https://github.com/akamhy/waybackpy) (NOT recommended)**:
 54 | 
 55 | ```bash
 56 | pip install git+https://github.com/akamhy/waybackpy.git
 57 | ```
 58 | 
 59 | ## <img src="https://github.githubassets.com/images/icons/emoji/unicode/1f433.png" width="20"></img> Docker Image
 60 | 
 61 | Docker Hub: [hub.docker.com/r/secsi/waybackpy](https://hub.docker.com/r/secsi/waybackpy)
 62 | 
 63 | The Docker image is automatically updated on every release by [Regularly and Automatically Updated Docker Images](https://github.com/cybersecsi/RAUDI) (RAUDI).
 64 | 
 65 | RAUDI is a tool by [SecSI](https://secsi.io), an Italian cybersecurity startup.
 66 | 
 67 | ## <img src="https://github.githubassets.com/images/icons/emoji/unicode/1f680.png" width="20"></img> Usage
 68 | 
 69 | ### As a Python package
 70 | 
 71 | #### Save API aka SavePageNow
 72 | 
 73 | ```python
 74 | >>> from waybackpy import WaybackMachineSaveAPI
 75 | >>> url = "https://github.com"
 76 | >>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
 77 | >>>
 78 | >>> save_api = WaybackMachineSaveAPI(url, user_agent)
 79 | >>> save_api.save()
 80 | https://web.archive.org/web/20220118125249/https://github.com/
 81 | >>> save_api.cached_save
 82 | False
 83 | >>> save_api.timestamp()
 84 | datetime.datetime(2022, 1, 18, 12, 52, 49)
 85 | ```
 86 | 
 87 | #### CDX API aka CDXServerAPI
 88 | 
 89 | ```python
 90 | >>> from waybackpy import WaybackMachineCDXServerAPI
 91 | >>> url = "https://google.com"
 92 | >>> user_agent = "my new app's user agent"
 93 | >>> cdx_api = WaybackMachineCDXServerAPI(url, user_agent)
 94 | ```
 95 | ##### oldest
 96 | ```python
 97 | >>> cdx_api.oldest()
 98 | com,google)/ 19981111184551 http://google.com:80/ text/html 200 HOQ2TGPYAEQJPNUA6M4SMZ3NGQRBXDZ3 381
 99 | >>> oldest = cdx_api.oldest()
100 | >>> oldest
101 | com,google)/ 19981111184551 http://google.com:80/ text/html 200 HOQ2TGPYAEQJPNUA6M4SMZ3NGQRBXDZ3 381
102 | >>> oldest.archive_url
103 | 'https://web.archive.org/web/19981111184551/http://google.com:80/'
104 | >>> oldest.original
105 | 'http://google.com:80/'
106 | >>> oldest.urlkey
107 | 'com,google)/'
108 | >>> oldest.timestamp
109 | '19981111184551'
110 | >>> oldest.datetime_timestamp
111 | datetime.datetime(1998, 11, 11, 18, 45, 51)
112 | >>> oldest.statuscode
113 | '200'
114 | >>> oldest.mimetype
115 | 'text/html'
116 | ```
117 | ##### newest
118 | ```python
119 | >>> newest = cdx_api.newest()
120 | >>> newest
121 | com,google)/ 20220217234427 http://@google.com/ text/html 301 Y6PVK4XWOI3BXQEXM5WLLWU5JKUVNSFZ 563
122 | >>> newest.archive_url
123 | 'https://web.archive.org/web/20220217234427/http://@google.com/'
124 | >>> newest.timestamp
125 | '20220217234427'
126 | ```
127 | ##### near
128 | ```python
129 | >>> near = cdx_api.near(year=2010, month=10, day=10, hour=10, minute=10)
130 | >>> near.archive_url
131 | 'https://web.archive.org/web/20101010101435/http://google.com/'
132 | >>> near
133 | com,google)/ 20101010101435 http://google.com/ text/html 301 Y6PVK4XWOI3BXQEXM5WLLWU5JKUVNSFZ 391
134 | >>> near.timestamp
135 | '20101010101435'
136 | >>> near.timestamp
137 | '20101010101435'
138 | >>> near = cdx_api.near(wayback_machine_timestamp=2008080808)
139 | >>> near.archive_url
140 | 'https://web.archive.org/web/20080808051143/http://google.com/'
141 | >>> near = cdx_api.near(unix_timestamp=1286705410)
142 | >>> near
143 | com,google)/ 20101010101435 http://google.com/ text/html 301 Y6PVK4XWOI3BXQEXM5WLLWU5JKUVNSFZ 391
144 | >>> near.archive_url
145 | 'https://web.archive.org/web/20101010101435/http://google.com/'
146 | >>>
147 | ```
148 | ##### snapshots
149 | ```python
150 | >>> from waybackpy import WaybackMachineCDXServerAPI
151 | >>> url = "https://pypi.org"
152 | >>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
153 | >>> cdx = WaybackMachineCDXServerAPI(url, user_agent, start_timestamp=2016, end_timestamp=2017)
154 | >>> for item in cdx.snapshots():
155 | ...     print(item.archive_url)
156 | ...
157 | https://web.archive.org/web/20160110011047/http://pypi.org/
158 | https://web.archive.org/web/20160305104847/http://pypi.org/
159 | .
160 | . # URLS REDACTED FOR READABILITY
161 | .
162 | https://web.archive.org/web/20171127171549/https://pypi.org/
163 | https://web.archive.org/web/20171206002737/http://pypi.org:80/
164 | ```
165 | 
166 | #### Availability API
167 | 
168 | Using the availability API is not recommended due to performance issues. All the methods of the availability API interface class, `WaybackMachineAvailabilityAPI`, are also implemented in the CDX server API interface class, `WaybackMachineCDXServerAPI`. Also note
169 | that the archive returned by the `newest()` method of `WaybackMachineAvailabilityAPI` can be more recent than the one returned by the same method of `WaybackMachineCDXServerAPI`.
170 | 
171 | ```python
172 | >>> from waybackpy import WaybackMachineAvailabilityAPI
173 | >>>
174 | >>> url = "https://google.com"
175 | >>> user_agent = "Mozilla/5.0 (Windows NT 5.1; rv:40.0) Gecko/20100101 Firefox/40.0"
176 | >>>
177 | >>> availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
178 | ```
179 | ##### oldest
180 | ```python
181 | >>> availability_api.oldest()
182 | https://web.archive.org/web/19981111184551/http://google.com:80/
183 | ```
184 | ##### newest
185 | ```python
186 | >>> availability_api.newest()
187 | https://web.archive.org/web/20220118150444/https://www.google.com/
188 | ```
189 | ##### near
190 | ```python
191 | >>> availability_api.near(year=2010, month=10, day=10, hour=10)
192 | https://web.archive.org/web/20101010101708/http://www.google.com/
193 | ```
194 | 
195 | > Documentation is at <https://github.com/akamhy/waybackpy/wiki/Python-package-docs>.
196 | 
197 | ### As a CLI tool
198 | 
199 | A demo video is available on [asciinema.org](https://asciinema.org/a/469890); you can copy the text from the video:
200 | 
201 | [![asciicast](https://asciinema.org/a/469890.svg)](https://asciinema.org/a/469890)
202 | 
203 | > CLI documentation is at <https://github.com/akamhy/waybackpy/wiki/CLI-docs>.
204 | 
205 | 
206 | ## CONTRIBUTORS
207 | 
208 | ### AUTHORS
209 | 
210 | - akamhy (<https://github.com/akamhy>)
211 | - eggplants (<https://github.com/eggplants>)
212 | - danvalen1 (<https://github.com/danvalen1>)
213 | - AntiCompositeNumber (<https://github.com/AntiCompositeNumber>)
214 | - rafaelrdealmeida (<https://github.com/rafaelrdealmeida>)
215 | - jonasjancarik (<https://github.com/jonasjancarik>)
216 | - jfinkhaeuser (<https://github.com/jfinkhaeuser>)
217 | 
218 | ### ACKNOWLEDGEMENTS
219 | 
220 | - mhmdiaa (<https://github.com/mhmdiaa>): `--known-urls` is based on [this](https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050) gist.
221 | - dequeued0 (<https://github.com/dequeued0>) for reporting bugs and useful feature requests.
222 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
2 | 


--------------------------------------------------------------------------------
/assets/waybackpy_logo.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <svg width="711.80188pt" height="258.30469pt" viewBox="0 0 711.80188 258.30469" version="1.1" id="svg2" xmlns="http://www.w3.org/2000/svg">
 3 |   <g id="surface1" transform="translate(-40.045801,-148)">
 4 |     <path style="fill: rgb(171, 46, 51); fill-opacity: 1; fill-rule: nonzero; stroke: none;" d="M 224.09 309.814 L 224.09 197.997 L 204.768 197.994 L 204.768 312.635 C 204.768 312.635 205.098 312.9 204.105 313.698 C 203.113 314.497 202.408 313.849 202.408 313.849 L 200.518 313.849 L 200.518 197.991 L 181.139 197.991 L 181.139 313.849 L 179.253 313.849 C 179.253 313.849 178.544 314.497 177.551 313.698 C 176.558 312.9 176.888 312.635 176.888 312.635 L 176.888 197.994 L 157.57 197.997 L 157.57 309.814 C 157.57 309.814 156.539 316.772 162.615 321.658 C 168.691 326.546 177.551 326.049 177.551 326.049 L 204.11 326.049 C 204.11 326.049 212.965 326.546 219.041 321.658 C 225.118 316.772 224.09 309.814 224.09 309.814" id="path5"/>
 5 |     <path style="fill: rgb(171, 46, 51); fill-opacity: 1; fill-rule: nonzero; stroke: none;" d="M 253.892 299.821 C 253.892 299.821 253.632 300.965 251.888 300.965 C 250.143 300.965 249.629 299.821 249.629 299.821 L 249.629 278.477 C 249.629 278.477 249.433 278.166 250.078 277.645 C 250.726 277.124 251.243 277.179 251.243 277.179 L 253.892 277.228 Z M 251.588 199.144 C 230.266 199.144 231.071 213.218 231.071 213.218 L 231.071 254.303 L 249.675 254.303 L 249.675 213.69 C 249.675 213.69 249.775 211.276 251.787 211.276 C 253.8 211.276 254 213.542 254 213.542 L 254 265.146 L 246.156 265.146 C 246.156 265.146 240.022 264.579 235.495 268.22 C 230.968 271.858 231.071 276.791 231.071 276.791 L 231.071 298.955 C 231.071 298.955 229.461 308.016 238.914 312.058 C 248.368 316.103 254.805 309.795 254.805 309.795 L 254.805 312.706 L 272.508 312.706 L 272.508 212.895 C 272.508 212.895 272.907 199.144 251.588 199.144" id="path7"/>
 6 |     <path style="fill: rgb(171, 46, 51); fill-opacity: 1; fill-rule: nonzero; stroke: none;" d="M 404.682 318.261 C 404.682 318.261 404.398 319.494 402.485 319.494 C 400.568 319.494 400.001 318.261 400.001 318.261 L 400.001 295.216 C 400.001 295.216 399.786 294.879 400.496 294.315 C 401.208 293.757 401.776 293.812 401.776 293.812 L 404.682 293.868 Z M 402.152 209.568 C 378.728 209.568 379.61 224.761 379.61 224.761 L 379.61 269.117 L 400.051 269.117 L 400.051 225.273 C 400.051 225.273 400.162 222.665 402.374 222.665 C 404.582 222.665 404.805 225.109 404.805 225.109 L 404.805 280.82 L 396.187 280.82 C 396.187 280.82 389.447 280.213 384.475 284.141 C 379.499 288.072 379.61 293.396 379.61 293.396 L 379.61 317.324 C 379.61 317.324 377.843 327.104 388.232 331.469 C 398.616 335.838 405.69 329.027 405.69 329.027 L 405.69 332.169 L 425.133 332.169 L 425.133 224.413 C 425.133 224.413 425.578 209.568 402.152 209.568" id="path9"/>
 7 |     <path style="fill: rgb(171, 46, 51); fill-opacity: 1; fill-rule: nonzero; stroke: none;" d="M 321.114 328.636 L 321.114 206.587 L 302.582 206.587 L 302.582 304.902 C 302.582 304.902 303.211 307.094 300.624 307.094 C 298.035 307.094 298.316 304.902 298.316 304.902 L 298.316 206.587 L 279.784 206.587 C 279.784 206.587 279.922 304.338 279.922 306.756 C 279.922 309.175 280.27 310.526 280.831 312.379 C 281.391 314.238 282.579 318.116 290.901 319.186 C 299.224 320.256 302.44 315.813 302.44 315.813 L 302.44 327.736 C 302.44 327.736 302.862 329.366 300.554 329.366 C 298.246 329.366 298.316 327.849 298.316 327.849 L 298.316 322.957 L 279.642 322.957 L 279.642 327.791 C 279.642 327.791 278.523 341.514 300.274 341.514 C 322.026 341.514 321.114 328.636 321.114 328.636" id="path11"/>
 8 |     <path style="fill: rgb(171, 46, 51); fill-opacity: 1; fill-rule: nonzero; stroke: none;" d="M 352.449 209.811 L 352.449 273.495 C 352.449 277.49 347.911 277.194 347.911 277.194 L 347.911 207.592 C 347.911 207.592 346.929 207.542 349.567 207.542 C 352.817 207.542 352.449 209.811 352.449 209.811 M 352.326 310.393 C 352.326 310.393 352.143 312.366 350.425 312.366 L 348.033 312.366 L 348.033 289.478 L 349.628 289.478 C 349.628 289.478 352.326 289.428 352.326 292.092 Z M 371.341 287.505 C 371.341 284.791 370.727 282.966 368.826 280.993 C 366.925 279.02 363.367 277.441 363.367 277.441 C 363.367 277.441 365.514 276.948 368.704 274.728 C 371.893 272.509 371.525 267.921 371.525 267.921 L 371.525 212.919 C 371.525 212.919 371.801 204.509 366.925 200.587 C 362.049 196.665 352.515 196.363 352.515 196.363 L 328.711 196.363 L 328.711 324.107 L 350.609 324.107 C 360.055 324.107 364.594 322.232 368.336 318.286 C 372.077 314.34 371.341 308.321 371.341 308.321 Z M 371.341 287.505" id="path13"/>
 9 |     <path style="fill: rgb(171, 46, 51); fill-opacity: 1; fill-rule: nonzero; stroke: none;" d="M 452.747 226.744 L 452.747 268.806 L 471.581 268.806 L 471.581 227.459 C 471.581 227.459 471.846 213.532 450.516 213.532 C 429.182 213.532 430.076 227.533 430.076 227.533 L 430.076 313.381 C 430.076 313.381 428.825 327.523 450.872 327.523 C 472.919 327.523 471.401 313.526 471.401 313.526 L 471.401 292.064 L 452.835 292.064 L 452.835 314.389 C 452.835 314.389 452.923 315.61 450.961 315.61 C 448.997 315.61 448.729 314.389 448.729 314.389 L 448.729 226.524 C 448.729 226.524 448.821 225.378 450.692 225.378 C 452.566 225.378 452.747 226.744 452.747 226.744" id="path15"/>
10 |     <path style="fill: rgb(171, 46, 51); fill-opacity: 1; fill-rule: nonzero; stroke: none;" d="M 520.624 281.841 C 517.672 278.98 514.317 277.904 514.317 277.904 C 514.317 277.904 517.538 277.796 520.489 274.775 C 523.442 271.753 523.173 267.924 523.173 267.924 L 523.173 208.211 L 503.185 208.211 L 503.185 276.014 C 503.185 276.014 503.185 277.361 501.172 277.361 L 498.761 277.309 L 498.761 191.655 L 478.973 191.655 L 478.973 327.905 L 498.692 327.905 L 498.692 290.039 L 501.709 290.039 C 501.709 290.039 502.112 290.039 502.648 290.523 C 503.185 291.01 503.185 291.602 503.185 291.602 L 503.185 327.905 L 523.307 327.905 L 523.307 288.636 C 523.307 288.636 523.576 284.699 520.624 281.841" id="path17"/>
11 |     <path style="fill-opacity: 1; fill-rule: nonzero; stroke: none; fill: rgb(255, 222, 87);" d="M 638.021 327.182 L 638.021 205.132 L 619.489 205.132 L 619.489 303.448 C 619.489 303.448 620.119 305.64 617.53 305.64 C 614.944 305.64 615.223 303.448 615.223 303.448 L 615.223 205.132 L 596.692 205.132 C 596.692 205.132 596.83 302.884 596.83 305.301 C 596.83 307.721 597.178 309.071 597.738 310.924 C 598.299 312.784 599.487 316.662 607.809 317.732 C 616.132 318.802 619.349 314.359 619.349 314.359 L 619.349 326.281 C 619.349 326.281 619.77 327.913 617.462 327.913 C 615.154 327.913 615.223 326.396 615.223 326.396 L 615.223 321.502 L 596.55 321.502 L 596.55 326.336 C 596.55 326.336 595.43 340.059 617.182 340.059 C 638.934 340.059 638.021 327.182 638.021 327.182" id="path-1"/>
12 |     <path d="M 592.159 233.846 C 593.222 238.576 593.75 243.873 593.745 249.735 C 593.74 255.598 593.135 261.281 591.931 266.782 C 590.726 272.285 588.901 277.144 586.453 281.361 C 584.006 285.578 580.938 288.946 577.248 291.466 C 573.559 293.985 569.226 295.246 564.25 295.246 C 561.585 295.246 559.008 294.936 556.521 294.32 C 554.033 293.703 551.813 292.854 549.859 291.774 C 547.905 290.694 546.284 289.512 544.997 288.226 C 543.71 286.94 542.934 285.578 542.668 284.138 L 542.629 328.722 L 526.369 328.722 L 526.475 207.466 L 541.003 207.466 L 542.728 216.259 C 544.507 213.38 547.197 211.065 550.797 209.317 C 554.397 207.568 558.374 206.694 562.728 206.694 C 565.66 206.694 568.637 207.157 571.657 208.083 C 574.677 209.008 577.497 210.551 580.116 212.711 C 582.735 214.871 585.11 217.698 587.239 221.196 C 589.369 224.692 591.009 228.909 592.159 233.846 Z M 558.932 280.744 C 561.597 280.744 564.019 279.972 566.197 278.429 C 568.376 276.887 570.243 274.804 571.801 272.182 C 573.358 269.559 574.582 266.423 575.474 262.772 C 576.366 259.121 576.814 255.238 576.817 251.124 C 576.821 247.113 576.424 243.307 575.628 239.708 C 574.831 236.108 573.701 232.92 572.237 230.143 C 570.774 227.366 568.999 225.155 566.912 223.51 C 564.825 221.864 562.405 221.041 559.65 221.041 C 556.985 221.041 554.54 221.813 552.318 223.356 C 550.095 224.898 548.183 226.981 546.581 229.603 C 544.98 232.226 543.755 235.311 542.908 238.86 C 542.061 242.408 541.635 246.239 541.632 250.353 C 541.628 254.466 542.002 258.349 542.754 262 C 543.506 265.651 544.637 268.865 546.145 271.642 C 547.653 274.419 549.472 276.63 551.603 278.276 C 553.734 279.922 556.177 280.744 558.932 280.744 Z" style="fill: rgb(69, 132, 182); white-space: pre;"/>
13 |   </g>
14 | </svg>


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["wheel", "setuptools"]
3 | build-backend = "setuptools.build_meta"
4 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
 1 | black
 2 | click
 3 | codecov
 4 | flake8
 5 | mypy
 6 | pytest
 7 | pytest-cov
 8 | requests
 9 | setuptools>=46.4.0
10 | types-requests
11 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | click
2 | requests
3 | urllib3
4 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
  1 | [metadata]
  2 | name = waybackpy
  3 | version = attr: waybackpy.__version__
  4 | description = Python package that interfaces with the Internet Archive's Wayback Machine APIs. Archive pages and retrieve archived pages easily.
  5 | long_description = file: README.md
  6 | long_description_content_type = text/markdown
  7 | license = MIT
  8 | author = Akash Mahanty
  9 | author_email = akamhy@yahoo.com
 10 | url = https://akamhy.github.io/waybackpy/
 11 | download_url = https://github.com/akamhy/waybackpy/releases
 12 | project_urls =
 13 |     Documentation = https://github.com/akamhy/waybackpy/wiki
 14 |     Source = https://github.com/akamhy/waybackpy
 15 |     Tracker = https://github.com/akamhy/waybackpy/issues
 16 | keywords =
 17 |     Archive Website
 18 |     Wayback Machine
 19 |     Internet Archive
 20 |     Wayback Machine CLI
 21 |     Wayback Machine Python
 22 |     Internet Archiving
 23 |     Availability API
 24 |     CDX API
 25 |     savepagenow
 26 | classifiers =
 27 |     Development Status :: 5 - Production/Stable
 28 |     Intended Audience :: Developers
 29 |     Intended Audience :: End Users/Desktop
 30 |     Natural Language :: English
 31 |     Typing :: Typed
 32 |     License :: OSI Approved :: MIT License
 33 |     Programming Language :: Python
 34 |     Programming Language :: Python :: 3
 35 |     Programming Language :: Python :: 3.6
 36 |     Programming Language :: Python :: 3.7
 37 |     Programming Language :: Python :: 3.8
 38 |     Programming Language :: Python :: 3.9
 39 |     Programming Language :: Python :: 3.10
 40 |     Programming Language :: Python :: 3.11
 41 |     Programming Language :: Python :: Implementation :: CPython
 42 | 
 43 | [options]
 44 | packages = find:
 45 | include-package-data = True
 46 | python_requires = >= 3.6
 47 | install_requires =
 48 |     click
 49 |     requests
 50 |     urllib3
 51 | 
 52 | [options.package_data]
 53 | waybackpy = py.typed
 54 | 
 55 | [options.extras_require]
 56 | dev =
 57 |     black
 58 |     codecov
 59 |     flake8
 60 |     mypy
 61 |     pytest
 62 |     pytest-cov
 63 |     setuptools>=46.4.0
 64 |     types-requests
 65 | 
 66 | [options.entry_points]
 67 | console_scripts =
 68 |     waybackpy = waybackpy.cli:main
 69 | 
 70 | [isort]
 71 | profile = black
 72 | 
 73 | [flake8]
 74 | indent-size = 4
 75 | max-line-length = 88
 76 | extend-ignore = W503,W605
 77 | exclude =
 78 |     venv
 79 |     __pycache__
 80 |     .venv
 81 |     ./env
 82 |     venv/
 83 |     env
 84 |     .env
 85 |     ./build
 86 | 
 87 | [mypy]
 88 | python_version = 3.9
 89 | show_error_codes = True
 90 | pretty = True
 91 | strict = True
 92 | 
 93 | [tool:pytest]
 94 | addopts =
 95 |     # show summary of all tests that did not pass
 96 |     -ra
 97 |     # enable all warnings
 98 |     -Wd
 99 |     # coverage and html report
100 |     --cov=waybackpy
101 |     --cov-report=html
102 | testpaths =
103 |     tests
104 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

# No arguments are passed here: the package metadata and options are declared
# in setup.cfg.
setup()
4 | 


--------------------------------------------------------------------------------
/snapcraft.yaml:
--------------------------------------------------------------------------------
 1 | name: waybackpy
 2 | summary: Wayback Machine API interface and a command-line tool
 3 | description: |
 4 |   Waybackpy is a CLI tool that interfaces with the Wayback Machine APIs.
 5 |   Wayback Machine has three client side public APIs, Save API, 
 6 |   Availability API and CDX API. These three APIs can be accessed via 
 7 |   the waybackpy from the terminal.
 8 | version: git
 9 | grade: stable
10 | confinement: strict
11 | base: core20
12 | architectures:
13 |   - build-on: [arm64, armhf, amd64]
14 | 
15 | apps:
16 |   waybackpy:
17 |     command: bin/waybackpy
18 |     plugs: [home, network, network-bind, removable-media]
19 | 
20 | parts:
21 |   waybackpy:
22 |     plugin: python
23 |     source: https://github.com/akamhy/waybackpy.git
24 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akamhy/waybackpy/3b3e78d901a600bb22943202c6a8981ca04a5e48/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_availability_api.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import string
  3 | from datetime import datetime, timedelta
  4 | 
  5 | import pytest
  6 | 
  7 | from waybackpy.availability_api import WaybackMachineAvailabilityAPI
  8 | from waybackpy.exceptions import (
  9 |     ArchiveNotInAvailabilityAPIResponse,
 10 |     InvalidJSONInAvailabilityAPIResponse,
 11 | )
 12 | 
# Reference "current time" (naive UTC), captured once at import; the tests
# below compare archive timestamps against it.
now = datetime.utcnow()
# Module-level URL and browser-like User-Agent; tests that do not define
# their own locals fall back to these.
url = "https://example.com/"
user_agent = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
)
 19 | 
 20 | 
 21 | def rndstr(n: int) -> str:
 22 |     return "".join(
 23 |         random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
 24 |     )
 25 | 
 26 | 
 27 | def test_oldest() -> None:
 28 |     """
 29 |     Test the oldest archive of Google.com and also checks the attributes.
 30 |     """
 31 |     url = "https://example.com/"
 32 |     user_agent = (
 33 |         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
 34 |         "(KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
 35 |     )
 36 |     availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
 37 |     oldest = availability_api.oldest()
 38 |     oldest_archive_url = oldest.archive_url
 39 |     assert "2002" in oldest_archive_url
 40 |     oldest_timestamp = oldest.timestamp()
 41 |     assert abs(oldest_timestamp - now) > timedelta(days=7000)  # More than 19 years
 42 |     assert (
 43 |         availability_api.json is not None
 44 |         and availability_api.json["archived_snapshots"]["closest"]["available"] is True
 45 |     )
 46 |     assert repr(oldest).find("example.com") != -1
 47 |     assert "2002" in str(oldest)
 48 | 
 49 | 
 50 | def test_newest() -> None:
 51 |     """
 52 |     Assuming that the recent most Google Archive was made no more earlier than
 53 |     last one day which is 86400 seconds.
 54 |     """
 55 |     url = "https://www.youtube.com/"
 56 |     user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0"
 57 |     availability_api = WaybackMachineAvailabilityAPI(url, user_agent)
 58 |     newest = availability_api.newest()
 59 |     newest_timestamp = newest.timestamp()
 60 |     # betting in favor that latest youtube archive was not before the last 3 days
 61 |     # high tarffic sites like youtube are archived mnay times a day, so seems
 62 |     # very reasonable to me.
 63 |     assert abs(newest_timestamp - now) < timedelta(seconds=86400 * 3)
 64 | 
 65 | 
 66 | def test_invalid_json() -> None:
 67 |     """
 68 |     When the API is malfunctioning or we don't pass a URL,
 69 |     it may return invalid JSON data.
 70 |     """
 71 |     with pytest.raises(InvalidJSONInAvailabilityAPIResponse):
 72 |         availability_api = WaybackMachineAvailabilityAPI(url="", user_agent=user_agent)
 73 |         _ = availability_api.archive_url
 74 | 
 75 | 
 76 | def test_no_archive() -> None:
 77 |     """
 78 |     ArchiveNotInAvailabilityAPIResponse may be raised if Wayback Machine did not
 79 |     replied with the archive despite the fact that we know the site has million
 80 |     of archives. Don't know the reason for this wierd behavior.
 81 | 
 82 |     And also if really there are no archives for the passed URL this exception
 83 |     is raised.
 84 |     """
 85 |     with pytest.raises(ArchiveNotInAvailabilityAPIResponse):
 86 |         availability_api = WaybackMachineAvailabilityAPI(
 87 |             url=f"https://{rndstr(30)}.cn", user_agent=user_agent
 88 |         )
 89 |         _ = availability_api.archive_url
 90 | 
 91 | 
 92 | def test_no_api_call_str_repr() -> None:
 93 |     """
 94 |     Some entitled users maybe want to see what is the string representation
 95 |     if they don’t make any API requests.
 96 | 
 97 |     str() must not return None so we return ""
 98 |     """
 99 |     availability_api = WaybackMachineAvailabilityAPI(
100 |         url=f"https://{rndstr(30)}.gov", user_agent=user_agent
101 |     )
102 |     assert str(availability_api) == ""
103 | 
104 | 
def test_no_call_timestamp() -> None:
    """
    Before any API request is made, ``timestamp()`` falls back to
    ``datetime.max``.
    """
    target = f"https://{rndstr(30)}.in"
    api = WaybackMachineAvailabilityAPI(url=target, user_agent=user_agent)
    default_timestamp = api.timestamp()
    assert datetime.max == default_timestamp
114 | 


--------------------------------------------------------------------------------
/tests/test_cdx_api.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import string
  3 | 
  4 | import pytest
  5 | 
  6 | from waybackpy.cdx_api import WaybackMachineCDXServerAPI
  7 | from waybackpy.exceptions import NoCDXRecordFound
  8 | 
  9 | 
 10 | def rndstr(n: int) -> str:
 11 |     return "".join(
 12 |         random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
 13 |     )
 14 | 
 15 | 
 16 | def test_a() -> None:
 17 |     user_agent = (
 18 |         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
 19 |         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
 20 |     )
 21 |     url = "https://twitter.com/jack"
 22 | 
 23 |     wayback = WaybackMachineCDXServerAPI(
 24 |         url=url,
 25 |         user_agent=user_agent,
 26 |         match_type="prefix",
 27 |         collapses=["urlkey"],
 28 |         start_timestamp="201001",
 29 |         end_timestamp="201002",
 30 |     )
 31 |     #  timeframe bound prefix matching enabled along with active urlkey based collapsing
 32 | 
 33 |     snapshots = wayback.snapshots()  # <class 'generator'>
 34 | 
 35 |     for snapshot in snapshots:
 36 |         assert snapshot.timestamp.startswith("2010")
 37 | 
 38 | 
 39 | def test_b() -> None:
 40 |     user_agent = (
 41 |         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
 42 |         "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
 43 |     )
 44 |     url = "https://www.google.com"
 45 | 
 46 |     wayback = WaybackMachineCDXServerAPI(
 47 |         url=url,
 48 |         user_agent=user_agent,
 49 |         start_timestamp="202101",
 50 |         end_timestamp="202112",
 51 |         collapses=["urlkey"],
 52 |     )
 53 |     #  timeframe bound prefix matching enabled along with active urlkey based collapsing
 54 | 
 55 |     snapshots = wayback.snapshots()  # <class 'generator'>
 56 | 
 57 |     for snapshot in snapshots:
 58 |         assert snapshot.timestamp.startswith("2021")
 59 | 
 60 | 
 61 | def test_c() -> None:
 62 |     user_agent = (
 63 |         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
 64 |         "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
 65 |     )
 66 |     url = "https://www.google.com"
 67 | 
 68 |     cdx = WaybackMachineCDXServerAPI(
 69 |         url=url,
 70 |         user_agent=user_agent,
 71 |         closest="201010101010",
 72 |         sort="closest",
 73 |         limit="1",
 74 |     )
 75 |     snapshots = cdx.snapshots()
 76 |     for snapshot in snapshots:
 77 |         archive_url = snapshot.archive_url
 78 |         timestamp = snapshot.timestamp
 79 |         break
 80 | 
 81 |     assert str(archive_url).find("google.com")
 82 |     assert "20101010" in timestamp
 83 | 
 84 | 
 85 | def test_d() -> None:
 86 |     user_agent = (
 87 |         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
 88 |         "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
 89 |     )
 90 | 
 91 |     cdx = WaybackMachineCDXServerAPI(
 92 |         url="akamhy.github.io",
 93 |         user_agent=user_agent,
 94 |         match_type="prefix",
 95 |         use_pagination=True,
 96 |         filters=["statuscode:200"],
 97 |     )
 98 |     snapshots = cdx.snapshots()
 99 | 
100 |     count = 0
101 |     for snapshot in snapshots:
102 |         count += 1
103 |         assert str(snapshot.archive_url).find("akamhy.github.io")
104 |     assert count > 50
105 | 
106 | 
def test_oldest() -> None:
    """The oldest status-200 snapshot of google.com is expected to be from 1998."""
    cdx = WaybackMachineCDXServerAPI(
        url="google.com",
        user_agent=(
            "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
        ),
        filters=["statuscode:200"],
    )
    first_snapshot = cdx.oldest()

    assert "1998" in first_snapshot.timestamp
    assert "google" in first_snapshot.urlkey
    # Both the original URL and the archive URL must mention the host.
    assert "google.com" in first_snapshot.original
    assert "google.com" in first_snapshot.archive_url
123 | 
124 | 
def test_newest() -> None:
    """The newest status-200 snapshot of google.com must point at google.com."""
    cdx = WaybackMachineCDXServerAPI(
        url="google.com",
        user_agent=(
            "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
        ),
        filters=["statuscode:200"],
    )
    last_snapshot = cdx.newest()

    assert "google" in last_snapshot.urlkey
    # Both the original URL and the archive URL must mention the host.
    assert "google.com" in last_snapshot.original
    assert "google.com" in last_snapshot.archive_url
140 | 
141 | 
def test_near() -> None:
    """
    ``near()`` must find the same 2010-10-10 10:xx google.com snapshot whether
    the moment is given as date parts, a Wayback Machine timestamp or a Unix
    timestamp, and must raise NoCDXRecordFound for a URL with no records.
    """
    user_agent = (
        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
    )
    status_ok = ["statuscode:200"]

    cdx = WaybackMachineCDXServerAPI(
        url="google.com",
        user_agent=user_agent,
        filters=status_ok,
    )

    # 1) Moment given as individual date parts.
    by_parts = cdx.near(year=2010, month=10, day=10, hour=10, minute=10)
    assert "2010101010" in by_parts.timestamp
    assert "google" in by_parts.urlkey
    assert "google.com" in by_parts.original
    assert "google.com" in by_parts.archive_url

    # 2) Moment given as a Wayback Machine timestamp.
    by_wayback_ts = cdx.near(wayback_machine_timestamp="201010101010")
    assert "2010101010" in by_wayback_ts.timestamp
    assert "google" in by_wayback_ts.urlkey
    assert "google.com" in by_wayback_ts.original
    assert "google.com" in by_wayback_ts.archive_url

    # 3) Moment given as a Unix timestamp.
    by_unix_ts = cdx.near(unix_timestamp=1286705410)
    assert "2010101010" in by_unix_ts.timestamp
    assert "google" in by_unix_ts.urlkey
    assert "google.com" in by_unix_ts.original
    assert "google.com" in by_unix_ts.archive_url

    # 4) A random host name is used as a URL with no CDX records.
    with pytest.raises(NoCDXRecordFound):
        dne_url = f"https://{rndstr(30)}.in"
        cdx = WaybackMachineCDXServerAPI(
            url=dne_url,
            user_agent=user_agent,
            filters=["statuscode:200"],
        )
        cdx.near(unix_timestamp=1286705410)
179 | 
180 | 
def test_before() -> None:
    """
    ``before()`` must return the status-200 snapshot that precedes
    20160731235949, expected to be timestamped 20160731233347.
    """
    cdx = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=(
            "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
        ),
        filters=["statuscode:200"],
    )
    previous = cdx.before(wayback_machine_timestamp=20160731235949)

    assert "20160731233347" in previous.timestamp
    assert "google" in previous.urlkey
    assert "google.com" in previous.original
    assert "google.com" in previous.archive_url
197 | 
198 | 
def test_after() -> None:
    """
    ``after()`` must return the status-200 snapshot that follows
    20160731235949, expected to be timestamped 20160801000917.
    """
    cdx = WaybackMachineCDXServerAPI(
        url="http://www.google.com/",
        user_agent=(
            "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
        ),
        filters=["statuscode:200"],
    )
    following = cdx.after(wayback_machine_timestamp=20160731235949)

    # The actual timestamp is shown as the failure message for diagnosis.
    assert "20160801000917" in following.timestamp, following.timestamp
    assert "google" in following.urlkey
    assert "google.com" in following.original
    assert "google.com" in following.archive_url
215 | 


--------------------------------------------------------------------------------
/tests/test_cdx_snapshot.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | from waybackpy.cdx_snapshot import CDXSnapshot
 4 | 
 5 | 
 6 | def test_CDXSnapshot() -> None:
 7 |     sample_input = (
 8 |         "org,archive)/ 20080126045828 http://github.com "
 9 |         "text/html 200 Q4YULN754FHV2U6Q5JUT6Q2P57WEWNNY 1415"
10 |     )
11 |     prop_values = sample_input.split(" ")
12 |     properties = {}
13 |     (
14 |         properties["urlkey"],
15 |         properties["timestamp"],
16 |         properties["original"],
17 |         properties["mimetype"],
18 |         properties["statuscode"],
19 |         properties["digest"],
20 |         properties["length"],
21 |     ) = prop_values
22 | 
23 |     snapshot = CDXSnapshot(properties)
24 | 
25 |     assert properties["urlkey"] == snapshot.urlkey
26 |     assert properties["timestamp"] == snapshot.timestamp
27 |     assert properties["original"] == snapshot.original
28 |     assert properties["mimetype"] == snapshot.mimetype
29 |     assert properties["statuscode"] == snapshot.statuscode
30 |     assert properties["digest"] == snapshot.digest
31 |     assert properties["length"] == snapshot.length
32 |     assert (
33 |         datetime.strptime(properties["timestamp"], "%Y%m%d%H%M%S")
34 |         == snapshot.datetime_timestamp
35 |     )
36 |     archive_url = (
37 |         "https://web.archive.org/web/"
38 |         + properties["timestamp"]
39 |         + "/"
40 |         + properties["original"]
41 |     )
42 |     assert archive_url == snapshot.archive_url
43 |     assert sample_input == str(snapshot)
44 |     assert sample_input == repr(snapshot)
45 | 


--------------------------------------------------------------------------------
/tests/test_cdx_utils.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Dict, List
  2 | 
  3 | import pytest
  4 | 
  5 | from waybackpy.cdx_utils import (
  6 |     check_collapses,
  7 |     check_filters,
  8 |     check_match_type,
  9 |     check_sort,
 10 |     full_url,
 11 |     get_response,
 12 |     get_total_pages,
 13 | )
 14 | from waybackpy.exceptions import WaybackError
 15 | 
 16 | 
 17 | def test_get_total_pages() -> None:
 18 |     url = "twitter.com"
 19 |     user_agent = (
 20 |         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 "
 21 |         "(KHTML, like Gecko) Version/14.0.2 Safari/605.1.15"
 22 |     )
 23 |     assert get_total_pages(url=url, user_agent=user_agent) >= 56
 24 | 
 25 | 
 26 | def test_full_url() -> None:
 27 |     endpoint = "https://web.archive.org/cdx/search/cdx"
 28 |     params: Dict[str, Any] = {}
 29 |     assert endpoint == full_url(endpoint, params)
 30 | 
 31 |     params = {"a": "1"}
 32 |     assert full_url(endpoint, params) == "https://web.archive.org/cdx/search/cdx?a=1"
 33 |     assert (
 34 |         full_url(endpoint + "?", params) == "https://web.archive.org/cdx/search/cdx?a=1"
 35 |     )
 36 | 
 37 |     params["b"] = 2
 38 |     assert (
 39 |         full_url(endpoint + "?", params)
 40 |         == "https://web.archive.org/cdx/search/cdx?a=1&b=2"
 41 |     )
 42 | 
 43 |     params["c"] = "foo bar"
 44 |     assert (
 45 |         full_url(endpoint + "?", params)
 46 |         == "https://web.archive.org/cdx/search/cdx?a=1&b=2&c=foo%20bar"
 47 |     )
 48 | 
 49 | 
 50 | def test_get_response() -> None:
 51 |     url = "https://github.com"
 52 |     user_agent = (
 53 |         "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"
 54 |     )
 55 |     headers = {"User-Agent": str(user_agent)}
 56 |     response = get_response(url, headers=headers)
 57 |     assert not isinstance(response, Exception) and response.status_code == 200
 58 | 
 59 | 
 60 | def test_check_filters() -> None:
 61 |     filters: List[str] = []
 62 |     check_filters(filters)
 63 | 
 64 |     filters = ["statuscode:200", "timestamp:20215678901234", "original:https://url.com"]
 65 |     check_filters(filters)
 66 | 
 67 |     with pytest.raises(WaybackError):
 68 |         check_filters("not-list")  # type: ignore[arg-type]
 69 | 
 70 |     with pytest.raises(WaybackError):
 71 |         check_filters(["invalid"])
 72 | 
 73 | 
 74 | def test_check_collapses() -> None:
 75 |     collapses: List[str] = []
 76 |     check_collapses(collapses)
 77 | 
 78 |     collapses = ["timestamp:10"]
 79 |     check_collapses(collapses)
 80 | 
 81 |     collapses = ["urlkey"]
 82 |     check_collapses(collapses)
 83 | 
 84 |     collapses = "urlkey"  # type: ignore[assignment]
 85 |     with pytest.raises(WaybackError):
 86 |         check_collapses(collapses)
 87 | 
 88 |     collapses = ["also illegal collapse"]
 89 |     with pytest.raises(WaybackError):
 90 |         check_collapses(collapses)
 91 | 
 92 | 
 93 | def test_check_match_type() -> None:
 94 |     assert check_match_type(None, "url")
 95 |     match_type = "exact"
 96 |     url = "test_url"
 97 |     assert check_match_type(match_type, url)
 98 | 
 99 |     url = "has * in it"
100 |     with pytest.raises(WaybackError):
101 |         check_match_type("domain", url)
102 | 
103 |     with pytest.raises(WaybackError):
104 |         check_match_type("not a valid type", "url")
105 | 
106 | 
107 | def test_check_sort() -> None:
108 |     assert check_sort("default")
109 |     assert check_sort("closest")
110 |     assert check_sort("reverse")
111 | 
112 |     with pytest.raises(WaybackError):
113 |         assert check_sort("random crap")
114 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | from click.testing import CliRunner
  3 | 
  4 | from waybackpy import __version__
  5 | from waybackpy.cli import main
  6 | 
  7 | 
  8 | def test_oldest() -> None:
  9 |     runner = CliRunner()
 10 |     result = runner.invoke(main, ["--url", " https://github.com ", "--oldest"])
 11 |     assert result.exit_code == 0
 12 |     assert (
 13 |         result.output
 14 |         == "Archive URL:\nhttps://web.archive.org/web/2008051421\
 15 | 0148/http://github.com/\n"
 16 |     )
 17 | 
 18 | 
 19 | def test_near() -> None:
 20 |     runner = CliRunner()
 21 |     result = runner.invoke(
 22 |         main,
 23 |         [
 24 |             "--url",
 25 |             " https://facebook.com ",
 26 |             "--near",
 27 |             "--year",
 28 |             "2010",
 29 |             "--month",
 30 |             "5",
 31 |             "--day",
 32 |             "10",
 33 |             "--hour",
 34 |             "6",
 35 |         ],
 36 |     )
 37 |     assert result.exit_code == 0
 38 |     assert (
 39 |         result.output
 40 |         == "Archive URL:\nhttps://web.archive.org/web/2010051008\
 41 | 2647/http://www.facebook.com/\n"
 42 |     )
 43 | 
 44 | 
 45 | def test_newest() -> None:
 46 |     runner = CliRunner()
 47 |     result = runner.invoke(main, ["--url", " https://microsoft.com ", "--newest"])
 48 |     assert result.exit_code == 0
 49 |     assert (
 50 |         result.output.find("microsoft.com") != -1
 51 |         and result.output.find("Archive URL:\n") != -1
 52 |     )
 53 | 
 54 | 
 55 | def test_cdx() -> None:
 56 |     runner = CliRunner()
 57 |     result = runner.invoke(
 58 |         main,
 59 |         "--url https://twitter.com/jack --cdx --user-agent some-user-agent \
 60 | --start-timestamp 2010 --end-timestamp 2012 --collapse urlkey \
 61 | --match-type prefix --cdx-print archiveurl --cdx-print length \
 62 | --cdx-print digest --cdx-print statuscode --cdx-print mimetype \
 63 | --cdx-print original --cdx-print timestamp --cdx-print urlkey".split(
 64 |             " "
 65 |         ),
 66 |     )
 67 |     assert result.exit_code == 0
 68 |     assert result.output.count("\n") > 3000
 69 | 
 70 | 
 71 | def test_save() -> None:
 72 |     runner = CliRunner()
 73 |     result = runner.invoke(
 74 |         main,
 75 |         "--url https://yahoo.com --user_agent my-unique-user-agent \
 76 | --save --headers".split(
 77 |             " "
 78 |         ),
 79 |     )
 80 |     assert result.exit_code == 0
 81 |     assert result.output.find("Archive URL:") != -1
 82 |     assert (result.output.find("Cached save:\nTrue") != -1) or (
 83 |         result.output.find("Cached save:\nFalse") != -1
 84 |     )
 85 |     assert result.output.find("Save API headers:\n") != -1
 86 |     assert result.output.find("yahoo.com") != -1
 87 | 
 88 | 
 89 | def test_version() -> None:
 90 |     runner = CliRunner()
 91 |     result = runner.invoke(main, ["--version"])
 92 |     assert result.exit_code == 0
 93 |     assert result.output == f"waybackpy version {__version__}\n"
 94 | 
 95 | 
 96 | def test_license() -> None:
 97 |     runner = CliRunner()
 98 |     result = runner.invoke(main, ["--license"])
 99 |     assert result.exit_code == 0
100 |     assert (
101 |         result.output
102 |         == requests.get(
103 |             url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE"
104 |         ).text
105 |         + "\n"
106 |     )
107 | 
108 | 
109 | def test_only_url() -> None:
110 |     runner = CliRunner()
111 |     result = runner.invoke(main, ["--url", "https://google.com"])
112 |     assert result.exit_code == 0
113 |     assert (
114 |         result.output
115 |         == "NoCommandFound: Only URL passed, but did not specify what to do with the URL. Use \
116 | --help flag for help using waybackpy.\n"
117 |     )
118 | 
119 | 
120 | def test_known_url() -> None:
121 |     # with file generator enabled
122 |     runner = CliRunner()
123 |     result = runner.invoke(
124 |         main, ["--url", "https://akamhy.github.io", "--known-urls", "--file"]
125 |     )
126 |     assert result.exit_code == 0
127 |     assert result.output.count("\n") > 40
128 |     assert result.output.count("akamhy.github.io") > 40
129 |     assert result.output.find("in the current working directory.\n") != -1
130 | 
131 |     # without file
132 |     runner = CliRunner()
133 |     result = runner.invoke(main, ["--url", "https://akamhy.github.io", "--known-urls"])
134 |     assert result.exit_code == 0
135 |     assert result.output.count("\n") > 40
136 |     assert result.output.count("akamhy.github.io") > 40
137 | 


--------------------------------------------------------------------------------
/tests/test_save_api.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import string
  3 | import time
  4 | from datetime import datetime
  5 | from typing import cast
  6 | 
  7 | import pytest
  8 | from requests.structures import CaseInsensitiveDict
  9 | 
 10 | from waybackpy.exceptions import MaximumSaveRetriesExceeded
 11 | from waybackpy.save_api import WaybackMachineSaveAPI
 12 | 
 13 | 
 14 | def rndstr(n: int) -> str:
 15 |     return "".join(
 16 |         random.choice(string.ascii_uppercase + string.digits) for _ in range(n)
 17 |     )
 18 | 
 19 | 
 20 | def test_save() -> None:
 21 |     url = "https://github.com/akamhy/waybackpy"
 22 |     user_agent = (
 23 |         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
 24 |         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
 25 |     )
 26 |     save_api = WaybackMachineSaveAPI(url, user_agent)
 27 |     save_api.save()
 28 |     archive_url = save_api.archive_url
 29 |     timestamp = save_api.timestamp()
 30 |     headers = save_api.headers  # CaseInsensitiveDict
 31 |     cached_save = save_api.cached_save
 32 |     assert cached_save in [True, False]
 33 |     assert archive_url.find("github.com/akamhy/waybackpy") != -1
 34 |     assert timestamp is not None
 35 |     assert str(headers).find("github.com/akamhy/waybackpy") != -1
 36 |     assert isinstance(save_api.timestamp(), datetime)
 37 | 
 38 | 
 39 | def test_max_redirect_exceeded() -> None:
 40 |     with pytest.raises(MaximumSaveRetriesExceeded):
 41 |         url = f"https://{rndstr}.gov"
 42 |         user_agent = (
 43 |             "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
 44 |             "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
 45 |         )
 46 |         save_api = WaybackMachineSaveAPI(url, user_agent, max_tries=3)
 47 |         save_api.save()
 48 | 
 49 | 
 50 | def test_sleep() -> None:
 51 |     """
 52 |     sleeping is actually very important for SaveAPI
 53 |     interface stability.
 54 |     The test checks that the time taken by sleep method
 55 |     is as intended.
 56 |     """
 57 |     url = "https://example.com"
 58 |     user_agent = (
 59 |         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
 60 |         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
 61 |     )
 62 |     save_api = WaybackMachineSaveAPI(url, user_agent)
 63 |     s_time = int(time.time())
 64 |     save_api.sleep(6)  # multiple of 3 sleep for 10 seconds
 65 |     e_time = int(time.time())
 66 |     assert (e_time - s_time) >= 10
 67 | 
 68 |     s_time = int(time.time())
 69 |     save_api.sleep(7)  # sleeps for 5 seconds
 70 |     e_time = int(time.time())
 71 |     assert (e_time - s_time) >= 5
 72 | 
 73 | 
 74 | def test_timestamp() -> None:
 75 |     url = "https://example.com"
 76 |     user_agent = (
 77 |         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
 78 |         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
 79 |     )
 80 |     save_api = WaybackMachineSaveAPI(url, user_agent)
 81 |     now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
 82 |     save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
 83 |     save_api.timestamp()
 84 |     assert save_api.cached_save is False
 85 |     now = "20100124063622"
 86 |     save_api._archive_url = f"https://web.archive.org/web/{now}/{url}/"
 87 |     save_api.timestamp()
 88 |     assert save_api.cached_save is True
 89 | 
 90 | 
 91 | def test_archive_url_parser() -> None:
 92 |     """
 93 |     Testing three regex for matches and also tests the response URL.
 94 |     """
 95 |     url = "https://example.com"
 96 |     user_agent = (
 97 |         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
 98 |         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
 99 |     )
100 |     save_api = WaybackMachineSaveAPI(url, user_agent)
101 | 
102 |     h = (
103 |         "\nSTART\nContent-Location: "
104 |         "/web/20201126185327/https://www.scribbr.com/citing-sources/et-al"
105 |         "\nEND\n"
106 |     )
107 |     save_api.headers = h  # type: ignore[assignment]
108 | 
109 |     expected_url = (
110 |         "https://web.archive.org/web/20201126185327/"
111 |         "https://www.scribbr.com/citing-sources/et-al"
112 |     )
113 |     assert save_api.archive_url_parser() == expected_url
114 | 
115 |     headers = {
116 |         "Server": "nginx/1.15.8",
117 |         "Date": "Sat, 02 Jan 2021 09:40:25 GMT",
118 |         "Content-Type": "text/html; charset=UTF-8",
119 |         "Transfer-Encoding": "chunked",
120 |         "Connection": "keep-alive",
121 |         "X-Archive-Orig-Server": "nginx",
122 |         "X-Archive-Orig-Date": "Sat, 02 Jan 2021 09:40:09 GMT",
123 |         "X-Archive-Orig-Transfer-Encoding": "chunked",
124 |         "X-Archive-Orig-Connection": "keep-alive",
125 |         "X-Archive-Orig-Vary": "Accept-Encoding",
126 |         "X-Archive-Orig-Last-Modified": "Fri, 01 Jan 2021 12:19:00 GMT",
127 |         "X-Archive-Orig-Strict-Transport-Security": "max-age=31536000, max-age=0;",
128 |         "X-Archive-Guessed-Content-Type": "text/html",
129 |         "X-Archive-Guessed-Charset": "utf-8",
130 |         "Memento-Datetime": "Sat, 02 Jan 2021 09:40:09 GMT",
131 |         "Link": (
132 |             '<https://www.scribbr.com/citing-sources/et-al/>; rel="original", '
133 |             "<https://web.archive.org/web/timemap/link/https://www.scribbr.com/"
134 |             'citing-sources/et-al/>; rel="timemap"; type="application/link-format", '
135 |             "<https://web.archive.org/web/https://www.scribbr.com/citing-sources/"
136 |             'et-al/>; rel="timegate", <https://web.archive.org/web/20200601082911/'
137 |             'https://www.scribbr.com/citing-sources/et-al/>; rel="first memento"; '
138 |             'datetime="Mon, 01 Jun 2020 08:29:11 GMT", <https://web.archive.org/web/'
139 |             "20201126185327/https://www.scribbr.com/citing-sources/et-al/>; "
140 |             'rel="prev memento"; datetime="Thu, 26 Nov 2020 18:53:27 GMT", '
141 |             "<https://web.archive.org/web/20210102094009/https://www.scribbr.com/"
142 |             'citing-sources/et-al/>; rel="memento"; datetime="Sat, 02 Jan 2021 '
143 |             '09:40:09 GMT", <https://web.archive.org/web/20210102094009/'
144 |             "https://www.scribbr.com/citing-sources/et-al/>; "
145 |             'rel="last memento"; datetime="Sat, 02 Jan 2021 09:40:09 GMT"'
146 |         ),
147 |         "Content-Security-Policy": (
148 |             "default-src 'self' 'unsafe-eval' 'unsafe-inline' "
149 |             "data: blob: archive.org web.archive.org analytics.archive.org "
150 |             "pragma.archivelab.org",
151 |         ),
152 |         "X-Archive-Src": "spn2-20210102092956-wwwb-spn20.us.archive.org-8001.warc.gz",
153 |         "Server-Timing": (
154 |             "captures_list;dur=112.646325, exclusion.robots;dur=0.172010, "
155 |             "exclusion.robots.policy;dur=0.158205, RedisCDXSource;dur=2.205932, "
156 |             "esindex;dur=0.014647, LoadShardBlock;dur=82.205012, "
157 |             "PetaboxLoader3.datanode;dur=70.750239, CDXLines.iter;dur=24.306278, "
158 |             "load_resource;dur=26.520179"
159 |         ),
160 |         "X-App-Server": "wwwb-app200",
161 |         "X-ts": "200",
162 |         "X-location": "All",
163 |         "X-Cache-Key": (
164 |             "httpsweb.archive.org/web/20210102094009/"
165 |             "https://www.scribbr.com/citing-sources/et-al/IN",
166 |         ),
167 |         "X-RL": "0",
168 |         "X-Page-Cache": "MISS",
169 |         "X-Archive-Screenname": "0",
170 |         "Content-Encoding": "gzip",
171 |     }
172 | 
173 |     save_api.headers = cast(CaseInsensitiveDict[str], headers)
174 | 
175 |     expected_url2 = (
176 |         "https://web.archive.org/web/20210102094009/"
177 |         "https://www.scribbr.com/citing-sources/et-al/"
178 |     )
179 |     assert save_api.archive_url_parser() == expected_url2
180 | 
181 |     expected_url_3 = (
182 |         "https://web.archive.org/web/20171128185327/"
183 |         "https://www.scribbr.com/citing-sources/et-al/US"
184 |     )
185 |     h = f"START\nX-Cache-Key: {expected_url_3}\nEND\n"
186 |     save_api.headers = h  # type: ignore[assignment]
187 | 
188 |     expected_url4 = (
189 |         "https://web.archive.org/web/20171128185327/"
190 |         "https://www.scribbr.com/citing-sources/et-al/"
191 |     )
192 |     assert save_api.archive_url_parser() == expected_url4
193 | 
194 |     h = "TEST TEST TEST AND NO MATCH - TEST FOR RESPONSE URL MATCHING"
195 |     save_api.headers = h  # type: ignore[assignment]
196 |     save_api.response_url = (
197 |         "https://web.archive.org/web/20171128185327/"
198 |         "https://www.scribbr.com/citing-sources/et-al"
199 |     )
200 |     expected_url5 = (
201 |         "https://web.archive.org/web/20171128185327/"
202 |         "https://www.scribbr.com/citing-sources/et-al"
203 |     )
204 |     assert save_api.archive_url_parser() == expected_url5
205 | 
206 | 
207 | def test_archive_url() -> None:
208 |     """
209 |     Checks the attribute archive_url's value when the save method was not
210 |     explicitly invoked by the end-user but the save method was invoked implicitly
211 |     by the archive_url method which is an attribute due to @property.
212 |     """
213 |     url = "https://example.com"
214 |     user_agent = (
215 |         "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) AppleWebKit/605.1.15 "
216 |         "(KHTML, like Gecko) Version/14.1.1 Safari/604.1"
217 |     )
218 |     save_api = WaybackMachineSaveAPI(url, user_agent)
219 |     save_api.saved_archive = (
220 |         "https://web.archive.org/web/20220124063056/https://example.com/"
221 |     )
222 |     save_api._archive_url = save_api.saved_archive
223 |     assert save_api.archive_url == save_api.saved_archive
224 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | from waybackpy import __version__
 2 | from waybackpy.utils import DEFAULT_USER_AGENT
 3 | 
 4 | 
 5 | def test_default_user_agent() -> None:
 6 |     assert (
 7 |         DEFAULT_USER_AGENT
 8 |         == f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
 9 |     )
10 | 


--------------------------------------------------------------------------------
/tests/test_wrapper.py:
--------------------------------------------------------------------------------
 1 | from waybackpy.wrapper import Url
 2 | 
 3 | 
 4 | def test_oldest() -> None:
 5 |     url = "https://bing.com"
 6 |     oldest_archive = (
 7 |         "https://web.archive.org/web/20030726111100/http://www.bing.com:80/"
 8 |     )
 9 |     wayback = Url(url).oldest()
10 |     assert wayback.archive_url == oldest_archive
11 |     assert str(wayback) == oldest_archive
12 |     assert len(wayback) > 365 * 15  # days in a year times years
13 | 
14 | 
15 | def test_newest() -> None:
16 |     url = "https://www.youtube.com/"
17 |     wayback = Url(url).newest()
18 |     assert "youtube" in str(wayback.archive_url)
19 |     assert "archived_snapshots" in str(wayback.json)
20 | 
21 | 
22 | def test_near() -> None:
23 |     url = "https://www.google.com"
24 |     wayback = Url(url).near(year=2010, month=10, day=10, hour=10, minute=10)
25 |     assert "20101010" in str(wayback.archive_url)
26 | 
27 | 
28 | def test_total_archives() -> None:
29 |     wayback = Url("https://akamhy.github.io")
30 |     assert wayback.total_archives() > 10
31 | 
32 |     wayback = Url("https://gaha.ef4i3n.m5iai3kifp6ied.cima/gahh2718gs/ahkst63t7gad8")
33 |     assert wayback.total_archives() == 0
34 | 
35 | 
36 | def test_known_urls() -> None:
37 |     wayback = Url("akamhy.github.io")
38 |     assert len(list(wayback.known_urls(subdomain=True))) > 40
39 | 
40 | 
41 | def test_Save() -> None:
42 |     wayback = Url("https://en.wikipedia.org/wiki/Asymptotic_equipartition_property")
43 |     wayback.save()
44 |     archive_url = str(wayback.archive_url)
45 |     assert archive_url.find("Asymptotic_equipartition_property") != -1
46 | 


--------------------------------------------------------------------------------
/waybackpy/__init__.py:
--------------------------------------------------------------------------------
 1 | """Module initializer and provider of static information."""
 2 | 
 3 | __version__ = "3.0.6"
 4 | 
 5 | from .availability_api import WaybackMachineAvailabilityAPI
 6 | from .cdx_api import WaybackMachineCDXServerAPI
 7 | from .save_api import WaybackMachineSaveAPI
 8 | from .wrapper import Url
 9 | 
10 | __all__ = [
11 |     "__version__",
12 |     "WaybackMachineAvailabilityAPI",
13 |     "WaybackMachineCDXServerAPI",
14 |     "WaybackMachineSaveAPI",
15 |     "Url",
16 | ]
17 | 


--------------------------------------------------------------------------------
/waybackpy/availability_api.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module interfaces the Wayback Machine's availability API.
  3 | 
  4 | The interface is useful for looking up archives and finding archives
  5 | that are close to a specific date and time.
  6 | 
  7 | It has a class WaybackMachineAvailabilityAPI, and the class has
  8 | methods like:
  9 | 
 10 | near() for retrieving archives close to a specific date and time.
 11 | 
 12 | oldest() for retrieving the first archive URL of the webpage.
 13 | 
 14 | newest() for retrieving the latest archive of the webpage.
 15 | 
 16 | The Wayback Machine Availability API response must be a valid JSON and
 17 | if it is not then an exception, InvalidJSONInAvailabilityAPIResponse is raised.
 18 | 
 19 | If the Availability API returned valid JSON but the archive URL could not be found
 20 | in it, then ArchiveNotInAvailabilityAPIResponse is raised.
 21 | """
 22 | 
 23 | import json
 24 | import time
 25 | from datetime import datetime
 26 | from typing import Any, Dict, Optional
 27 | 
 28 | import requests
 29 | from requests.models import Response
 30 | 
 31 | from .exceptions import (
 32 |     ArchiveNotInAvailabilityAPIResponse,
 33 |     InvalidJSONInAvailabilityAPIResponse,
 34 | )
 35 | from .utils import (
 36 |     DEFAULT_USER_AGENT,
 37 |     unix_timestamp_to_wayback_timestamp,
 38 |     wayback_timestamp,
 39 | )
 40 | 
 41 | ResponseJSON = Dict[str, Any]
 42 | 
 43 | 
 44 | class WaybackMachineAvailabilityAPI:
 45 |     """
 46 |     Class that interfaces the Wayback Machine's availability API.
 47 |     """
 48 | 
 49 |     def __init__(
 50 |         self, url: str, user_agent: str = DEFAULT_USER_AGENT, max_tries: int = 3
 51 |     ) -> None:
 52 | 
 53 |         self.url = str(url).strip().replace(" ", "%20")
 54 |         self.user_agent = user_agent
 55 |         self.headers: Dict[str, str] = {"User-Agent": self.user_agent}
 56 |         self.payload: Dict[str, str] = {"url": self.url}
 57 |         self.endpoint: str = "https://archive.org/wayback/available"
 58 |         self.max_tries: int = max_tries
 59 |         self.tries: int = 0
 60 |         self.last_api_call_unix_time: int = int(time.time())
 61 |         self.api_call_time_gap: int = 5
 62 |         self.json: Optional[ResponseJSON] = None
 63 |         self.response: Optional[Response] = None
 64 | 
 65 |     def __repr__(self) -> str:
 66 |         """
 67 |         Same as string representation, just return the archive URL as a string.
 68 |         """
 69 |         return str(self)
 70 | 
 71 |     def __str__(self) -> str:
 72 |         """
 73 |         String representation of the class. If atleast one API
 74 |         call was successfully made then return the archive URL
 75 |         as a string. Else returns "" (empty string literal).
 76 |         """
 77 |         # __str__ can not return anything other than a string object
 78 |         # So, if a string repr is asked even before making a API request
 79 |         # just return ""
 80 |         if not self.json:
 81 |             return ""
 82 | 
 83 |         return self.archive_url
 84 | 
 85 |     def setup_json(self) -> Optional[ResponseJSON]:
 86 |         """
 87 |         Makes the API call to the availability API and set the JSON response
 88 |         to the JSON attribute of the instance and also returns the JSON
 89 |         attribute.
 90 | 
 91 |         time_diff and sleep_time makes sure that you are not making too many
 92 |         requests in a short interval of item, making too many requests is bad
 93 |         as Wayback Machine may reject them above a certain threshold.
 94 | 
 95 |         The end-user can change the api_call_time_gap attribute of the instance
 96 |         to increase or decrease the default time gap between two successive API
 97 |         calls, but it is not recommended to increase it.
 98 |         """
 99 |         time_diff = int(time.time()) - self.last_api_call_unix_time
100 |         sleep_time = self.api_call_time_gap - time_diff
101 | 
102 |         if sleep_time > 0:
103 |             time.sleep(sleep_time)
104 | 
105 |         self.response = requests.get(
106 |             self.endpoint, params=self.payload, headers=self.headers
107 |         )
108 |         self.last_api_call_unix_time = int(time.time())
109 |         self.tries += 1
110 |         try:
111 |             self.json = None if self.response is None else self.response.json()
112 |         except json.decoder.JSONDecodeError as json_decode_error:
113 |             raise InvalidJSONInAvailabilityAPIResponse(
114 |                 f"Response data:\n{self.response.text}"
115 |             ) from json_decode_error
116 | 
117 |         return self.json
118 | 
119 |     def timestamp(self) -> datetime:
120 |         """
121 |         Converts the timestamp form the JSON response to datetime object.
122 |         If JSON attribute of the instance is None it implies that the either
123 |         the the last API call failed or one was never made.
124 | 
125 |         If not JSON or if JSON but no timestamp in the JSON response then
126 |         returns the maximum value for datetime object that is possible.
127 | 
128 |         If you get an URL as a response form the availability API it is
129 |         guaranteed that you can get the datetime object from the timestamp.
130 |         """
131 |         if self.json is None or "archived_snapshots" not in self.json:
132 |             return datetime.max
133 | 
134 |         if (
135 |             self.json is not None
136 |             and "archived_snapshots" in self.json
137 |             and self.json["archived_snapshots"] is not None
138 |             and "closest" in self.json["archived_snapshots"]
139 |             and self.json["archived_snapshots"]["closest"] is not None
140 |             and "timestamp" in self.json["archived_snapshots"]["closest"]
141 |         ):
142 |             return datetime.strptime(
143 |                 self.json["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S"
144 |             )
145 | 
146 |         raise ValueError("Timestamp not found in the Availability API's JSON response.")
147 | 
148 |     @property
149 |     def archive_url(self) -> str:
150 |         """
151 |         Reads the the JSON response data and returns
152 |         the timestamp if found and if not found raises
153 |         ArchiveNotInAvailabilityAPIResponse.
154 |         """
155 |         archive_url = ""
156 |         data = self.json
157 | 
158 |         # If the user didn't invoke oldest, newest or near but tries to access
159 |         # archive_url attribute then assume they that are fine with any archive
160 |         # and invoke the oldest method.
161 |         if not data:
162 |             self.oldest()
163 | 
164 |         # If data is still not none then probably there are no
165 |         # archive for the requested URL.
166 |         if not data or not data["archived_snapshots"]:
167 |             while (self.tries < self.max_tries) and (
168 |                 not data or not data["archived_snapshots"]
169 |             ):
170 |                 self.setup_json()  # It makes a new API call
171 |                 data = self.json  # setup_json() updates value of json attribute
172 | 
173 |             # If exhausted max_tries, then give up and
174 |             # raise ArchiveNotInAvailabilityAPIResponse.
175 | 
176 |             if not data or not data["archived_snapshots"]:
177 |                 raise ArchiveNotInAvailabilityAPIResponse(
178 |                     "Archive not found in the availability "
179 |                     "API response, the URL you requested may not have any archives "
180 |                     "yet. You may retry after some time or archive the webpage now.\n"
181 |                     "Response data:\n"
182 |                     ""
183 |                     if self.response is None
184 |                     else self.response.text
185 |                 )
186 |         else:
187 |             archive_url = data["archived_snapshots"]["closest"]["url"]
188 |             archive_url = archive_url.replace(
189 |                 "http://web.archive.org/web/", "https://web.archive.org/web/", 1
190 |             )
191 |         return archive_url
192 | 
193 |     def oldest(self) -> "WaybackMachineAvailabilityAPI":
194 |         """
195 |         Passes the date 1994-01-01 to near which should return the oldest archive
196 |         because Wayback Machine was started in May, 1996 and it is assumed that
197 |         there would be no archive older than January 1, 1994.
198 |         """
199 |         return self.near(year=1994, month=1, day=1)
200 | 
201 |     def newest(self) -> "WaybackMachineAvailabilityAPI":
202 |         """
203 |         Passes the current UNIX time to near() for retrieving the newest archive
204 |         from the availability API.
205 | 
206 |         Remember UNIX time is UTC and Wayback Machine is also UTC based.
207 |         """
208 |         return self.near(unix_timestamp=int(time.time()))
209 | 
210 |     def near(
211 |         self,
212 |         year: Optional[int] = None,
213 |         month: Optional[int] = None,
214 |         day: Optional[int] = None,
215 |         hour: Optional[int] = None,
216 |         minute: Optional[int] = None,
217 |         unix_timestamp: Optional[int] = None,
218 |     ) -> "WaybackMachineAvailabilityAPI":
219 |         """
220 |         The most important method of this Class, oldest() and newest() are
221 |         dependent on it.
222 | 
223 |         It generates the timestamp based on the input either by calling the
224 |         unix_timestamp_to_wayback_timestamp or wayback_timestamp method with
225 |         appropriate arguments for their respective parameters.
226 | 
227 |         Adds the timestamp to the payload dictionary.
228 | 
229 |         And finally invokes the setup_json method to make the API call then
230 |         finally returns the instance.
231 |         """
232 |         if unix_timestamp:
233 |             timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
234 |         else:
235 |             now = datetime.utcnow().timetuple()
236 |             timestamp = wayback_timestamp(
237 |                 year=now.tm_year if year is None else year,
238 |                 month=now.tm_mon if month is None else month,
239 |                 day=now.tm_mday if day is None else day,
240 |                 hour=now.tm_hour if hour is None else hour,
241 |                 minute=now.tm_min if minute is None else minute,
242 |             )
243 | 
244 |         self.payload["timestamp"] = timestamp
245 |         self.setup_json()
246 |         return self
247 | 


--------------------------------------------------------------------------------
/waybackpy/cdx_api.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module interfaces the Wayback Machine's CDX server API.
  3 | 
  4 | The module has WaybackMachineCDXServerAPI which should be used by the users of
  5 | this module to consume the CDX server API.
  6 | 
  7 | WaybackMachineCDXServerAPI has a snapshot method that yields the snapshots, and
  8 | the snapshots are yielded as instances of the CDXSnapshot class.
  9 | """
 10 | 
 11 | 
 12 | import time
 13 | from datetime import datetime
 14 | from typing import Dict, Generator, List, Optional, Union, cast
 15 | 
 16 | from .cdx_snapshot import CDXSnapshot
 17 | from .cdx_utils import (
 18 |     check_collapses,
 19 |     check_filters,
 20 |     check_match_type,
 21 |     check_sort,
 22 |     full_url,
 23 |     get_response,
 24 |     get_total_pages,
 25 | )
 26 | from .exceptions import NoCDXRecordFound, WaybackError
 27 | from .utils import (
 28 |     DEFAULT_USER_AGENT,
 29 |     unix_timestamp_to_wayback_timestamp,
 30 |     wayback_timestamp,
 31 | )
 32 | 
 33 | 
 34 | class WaybackMachineCDXServerAPI:
 35 |     """
 36 |     Class that interfaces the CDX server API of the Wayback Machine.
 37 | 
 38 |     snapshot() returns a generator that can be iterated upon by the end-user,
 39 |     the generator returns the snapshots/entries as instance of CDXSnapshot to
 40 |     make the usage easy, just use '.' to get any attribute as the attributes are
 41 |     accessible via a dot ".".
 42 |     """
 43 | 
 44 |     # start_timestamp: from, can not use from as it's a keyword
 45 |     # end_timestamp: to, not using to as can not use from
 46 |     def __init__(
 47 |         self,
 48 |         url: str,
 49 |         user_agent: str = DEFAULT_USER_AGENT,
 50 |         start_timestamp: Optional[str] = None,
 51 |         end_timestamp: Optional[str] = None,
 52 |         filters: Optional[List[str]] = None,
 53 |         match_type: Optional[str] = None,
 54 |         sort: Optional[str] = None,
 55 |         gzip: Optional[str] = None,
 56 |         collapses: Optional[List[str]] = None,
 57 |         limit: Optional[str] = None,
 58 |         max_tries: int = 3,
 59 |         use_pagination: bool = False,
 60 |         closest: Optional[str] = None,
 61 |     ) -> None:
 62 |         self.url = str(url).strip().replace(" ", "%20")
 63 |         self.user_agent = user_agent
 64 |         self.start_timestamp = None if start_timestamp is None else str(start_timestamp)
 65 |         self.end_timestamp = None if end_timestamp is None else str(end_timestamp)
 66 |         self.filters = [] if filters is None else filters
 67 |         check_filters(self.filters)
 68 |         self.match_type = None if match_type is None else str(match_type).strip()
 69 |         check_match_type(self.match_type, self.url)
 70 |         self.sort = None if sort is None else str(sort).strip()
 71 |         check_sort(self.sort)
 72 |         self.gzip = gzip
 73 |         self.collapses = [] if collapses is None else collapses
 74 |         check_collapses(self.collapses)
 75 |         self.limit = 25000 if limit is None else limit
 76 |         self.max_tries = max_tries
 77 |         self.use_pagination = use_pagination
 78 |         self.closest = None if closest is None else str(closest)
 79 |         self.last_api_request_url: Optional[str] = None
 80 |         self.endpoint = "https://web.archive.org/cdx/search/cdx"
 81 | 
    def cdx_api_manager(
        self, payload: Dict[str, str], headers: Dict[str, str]
    ) -> Generator[str, None, None]:
        """
        Query the CDX server and yield the text of every response.

        If the use_pagination attribute is True the pagination API is used:
        the number of pages is fetched with get_total_pages() and one request
        per page is made. Otherwise the standard API is used and further
        requests are chained through the resume key found in each response.

        payload: query parameters; this dictionary is modified in place
                 ("page" in pagination mode, "showResumeKey", "limit" and
                 "resumeKey" otherwise).
        headers: HTTP headers passed to get_response() for every request.

        Yields the response text of each request; in the non-pagination mode
        the text is stripped and the resume key is removed from it.

        Raises whatever exception object get_response() hands back.

        last_api_request_url is updated after every successful request.
        """

        # When using the pagination API of the CDX server.
        if self.use_pagination is True:

            total_pages = get_total_pages(self.url, self.user_agent)
            successive_blank_pages = 0

            for i in range(total_pages):
                # The page number travels as a string in the query.
                payload["page"] = str(i)

                url = full_url(self.endpoint, params=payload)
                res = get_response(url, headers=headers)

                # get_response() is annotated as possibly returning an
                # exception object instead of raising it.
                if isinstance(res, Exception):
                    raise res

                self.last_api_request_url = url
                text = res.text

                # Reset the counter if the last page was blank
                # but the current page is not.
                if successive_blank_pages == 1:
                    if len(text) != 0:
                        successive_blank_pages = 0

                # Increase the successive page counter on encountering
                # blank page.
                if len(text) == 0:
                    successive_blank_pages += 1

                # If two successive pages are blank
                # then we don't have any more pages left to
                # iterate.
                if successive_blank_pages >= 2:
                    break

                # Note that a single blank page is still yielded; only the
                # second blank page in a row stops the iteration.
                yield text

        # When not using the pagination API of the CDX server
        else:
            payload["showResumeKey"] = "true"
            payload["limit"] = str(self.limit)
            resume_key = None
            more = True
            while more:
                # From the second request on, continue where the previous
                # response stopped.
                if resume_key:
                    payload["resumeKey"] = resume_key

                url = full_url(self.endpoint, params=payload)
                res = get_response(url, headers=headers)
                if isinstance(res, Exception):
                    raise res

                self.last_api_request_url = url

                text = res.text.strip()
                lines = text.splitlines()

                # Assume this is the last response unless a resume key
                # is detected below.
                more = False

                # A resume key is recognised by an empty second-to-last line
                # followed by the key itself on the last line.
                if len(lines) >= 3:

                    second_last_line = lines[-2]

                    if len(second_last_line) == 0:

                        resume_key = lines[-1].strip()
                        # NOTE(review): this removes the first occurrence of
                        # the key in the text, which is presumably the
                        # trailing one - confirm the key can not also appear
                        # inside a record.
                        text = text.replace(resume_key, "", 1).strip()
                        more = True

                yield text
161 | 
162 |     def add_payload(self, payload: Dict[str, str]) -> None:
163 |         """
164 |         Adds the payload to the payload dictionary.
165 |         """
166 |         if self.start_timestamp:
167 |             payload["from"] = self.start_timestamp
168 | 
169 |         if self.end_timestamp:
170 |             payload["to"] = self.end_timestamp
171 | 
172 |         if self.gzip is None:
173 |             payload["gzip"] = "false"
174 | 
175 |         if self.closest:
176 |             payload["closest"] = self.closest
177 | 
178 |         if self.match_type:
179 |             payload["matchType"] = self.match_type
180 | 
181 |         if self.sort:
182 |             payload["sort"] = self.sort
183 | 
184 |         if self.filters and len(self.filters) > 0:
185 |             for i, _filter in enumerate(self.filters):
186 |                 payload["filter" + str(i)] = _filter
187 | 
188 |         if self.collapses and len(self.collapses) > 0:
189 |             for i, collapse in enumerate(self.collapses):
190 |                 payload["collapse" + str(i)] = collapse
191 | 
192 |         payload["url"] = self.url
193 | 
194 |     def before(
195 |         self,
196 |         year: Optional[int] = None,
197 |         month: Optional[int] = None,
198 |         day: Optional[int] = None,
199 |         hour: Optional[int] = None,
200 |         minute: Optional[int] = None,
201 |         unix_timestamp: Optional[int] = None,
202 |         wayback_machine_timestamp: Optional[Union[int, str]] = None,
203 |     ) -> CDXSnapshot:
204 |         """
205 |         Gets the nearest archive before the given datetime.
206 |         """
207 |         if unix_timestamp:
208 |             timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
209 |         elif wayback_machine_timestamp:
210 |             timestamp = str(wayback_machine_timestamp)
211 |         else:
212 |             now = datetime.utcnow().timetuple()
213 |             timestamp = wayback_timestamp(
214 |                 year=now.tm_year if year is None else year,
215 |                 month=now.tm_mon if month is None else month,
216 |                 day=now.tm_mday if day is None else day,
217 |                 hour=now.tm_hour if hour is None else hour,
218 |                 minute=now.tm_min if minute is None else minute,
219 |             )
220 |         self.closest = timestamp
221 |         self.sort = "closest"
222 |         self.limit = 25000
223 |         for snapshot in self.snapshots():
224 |             if snapshot.timestamp < timestamp:
225 |                 return snapshot
226 | 
227 |         # If a snapshot isn't returned, then none were found.
228 |         raise NoCDXRecordFound(
229 |             "No records were found before the given date for the query."
230 |             + "Either there are no archives before the given date,"
231 |             + " the URL may not have any archived, or the URL may have been"
232 |             + " recently archived and is still not available on the CDX server."
233 |         )
234 | 
235 |     def after(
236 |         self,
237 |         year: Optional[int] = None,
238 |         month: Optional[int] = None,
239 |         day: Optional[int] = None,
240 |         hour: Optional[int] = None,
241 |         minute: Optional[int] = None,
242 |         unix_timestamp: Optional[int] = None,
243 |         wayback_machine_timestamp: Optional[Union[int, str]] = None,
244 |     ) -> CDXSnapshot:
245 |         """
246 |         Gets the nearest archive after the given datetime.
247 |         """
248 |         if unix_timestamp:
249 |             timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
250 |         elif wayback_machine_timestamp:
251 |             timestamp = str(wayback_machine_timestamp)
252 |         else:
253 |             now = datetime.utcnow().timetuple()
254 |             timestamp = wayback_timestamp(
255 |                 year=now.tm_year if year is None else year,
256 |                 month=now.tm_mon if month is None else month,
257 |                 day=now.tm_mday if day is None else day,
258 |                 hour=now.tm_hour if hour is None else hour,
259 |                 minute=now.tm_min if minute is None else minute,
260 |             )
261 |         self.closest = timestamp
262 |         self.sort = "closest"
263 |         self.limit = 25000
264 |         for snapshot in self.snapshots():
265 |             if snapshot.timestamp > timestamp:
266 |                 return snapshot
267 | 
268 |         # If a snapshot isn't returned, then none were found.
269 |         raise NoCDXRecordFound(
270 |             "No records were found after the given date for the query."
271 |             + "Either there are no archives after the given date,"
272 |             + " the URL may not have any archives, or the URL may have been"
273 |             + " recently archived and is still not available on the CDX server."
274 |         )
275 | 
276 |     def near(
277 |         self,
278 |         year: Optional[int] = None,
279 |         month: Optional[int] = None,
280 |         day: Optional[int] = None,
281 |         hour: Optional[int] = None,
282 |         minute: Optional[int] = None,
283 |         unix_timestamp: Optional[int] = None,
284 |         wayback_machine_timestamp: Optional[Union[int, str]] = None,
285 |     ) -> CDXSnapshot:
286 |         """
287 |         Fetch archive close to a datetime, it can only return
288 |         a single URL. If you want more do not use this method
289 |         instead use the class.
290 |         """
291 |         if unix_timestamp:
292 |             timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
293 |         elif wayback_machine_timestamp:
294 |             timestamp = str(wayback_machine_timestamp)
295 |         else:
296 |             now = datetime.utcnow().timetuple()
297 |             timestamp = wayback_timestamp(
298 |                 year=now.tm_year if year is None else year,
299 |                 month=now.tm_mon if month is None else month,
300 |                 day=now.tm_mday if day is None else day,
301 |                 hour=now.tm_hour if hour is None else hour,
302 |                 minute=now.tm_min if minute is None else minute,
303 |             )
304 |         self.closest = timestamp
305 |         self.sort = "closest"
306 |         self.limit = 1
307 |         first_snapshot = None
308 |         for snapshot in self.snapshots():
309 |             first_snapshot = snapshot
310 |             break
311 | 
312 |         if not first_snapshot:
313 |             raise NoCDXRecordFound(
314 |                 "Wayback Machine's CDX server did not return any records "
315 |                 + "for the query. The URL may not have any archives "
316 |                 + " on the Wayback Machine or the URL may have been recently "
317 |                 + "archived and is still not available on the CDX server."
318 |             )
319 | 
320 |         return first_snapshot
321 | 
322 |     def newest(self) -> CDXSnapshot:
323 |         """
324 |         Passes the current UNIX time to near() for retrieving the newest archive
325 |         from the availability API.
326 | 
327 |         Remember UNIX time is UTC and Wayback Machine is also UTC based.
328 |         """
329 |         return self.near(unix_timestamp=int(time.time()))
330 | 
331 |     def oldest(self) -> CDXSnapshot:
332 |         """
333 |         Passes the date 1994-01-01 to near which should return the oldest archive
334 |         because Wayback Machine was started in May, 1996 and it is assumed that
335 |         there would be no archive older than January 1, 1994.
336 |         """
337 |         return self.near(year=1994, month=1, day=1)
338 | 
339 |     def snapshots(self) -> Generator[CDXSnapshot, None, None]:
340 |         """
341 |         This function yields the CDX data lines as snapshots.
342 | 
343 |         As it is a generator it exhaustible, the reason that this is
344 |         a generator and not a list are:
345 | 
346 |         a) CDX server API can return millions of entries for a query and list
347 |         is not suitable for such cases.
348 | 
349 |         b) Preventing memory usage issues, as told before this method may yield
350 |         millions of records for some queries and your system may not have enough
351 |         memory for such a big list. Also Remember this if outputing to Jupyter
352 |         Notebooks.
353 | 
354 |         The objects yielded by this method are instance of CDXSnapshot class,
355 |         you can access the attributes of the entries as the attribute of the instance
356 |         itself.
357 |         """
358 |         payload: Dict[str, str] = {}
359 |         headers = {"User-Agent": self.user_agent}
360 | 
361 |         self.add_payload(payload)
362 | 
363 |         entries = self.cdx_api_manager(payload, headers)
364 | 
365 |         for entry in entries:
366 | 
367 |             if entry.isspace() or len(entry) <= 1 or not entry:
368 |                 continue
369 | 
370 |             # each line is a snapshot aka entry of the CDX server API.
371 |             # We are able to split the page by lines because it only
372 |             # splits the lines on a sinlge page and not all the entries
373 |             # at once, thus there should be no issues of too much memory usage.
374 |             snapshot_list = entry.split("\n")
375 | 
376 |             for snapshot in snapshot_list:
377 | 
378 |                 # 14 + 32 == 46 ( timestamp + digest ), ignore the invalid entries.
379 |                 # they are invalid if their length is smaller than sum of length
380 |                 # of a standard wayback_timestamp and standard digest of an entry.
381 |                 if len(snapshot) < 46:
382 |                     continue
383 | 
384 |                 properties: Dict[str, Optional[str]] = {
385 |                     "urlkey": None,
386 |                     "timestamp": None,
387 |                     "original": None,
388 |                     "mimetype": None,
389 |                     "statuscode": None,
390 |                     "digest": None,
391 |                     "length": None,
392 |                 }
393 | 
394 |                 property_value = snapshot.split(" ")
395 | 
396 |                 total_property_values = len(property_value)
397 |                 warranted_total_property_values = len(properties)
398 | 
399 |                 if total_property_values != warranted_total_property_values:
400 |                     raise WaybackError(
401 |                         f"Snapshot returned by CDX API has {total_property_values} prop"
402 |                         f"erties instead of expected {warranted_total_property_values} "
403 |                         f"properties.\nProblematic Snapshot: {snapshot}"
404 |                     )
405 | 
406 |                 (
407 |                     properties["urlkey"],
408 |                     properties["timestamp"],
409 |                     properties["original"],
410 |                     properties["mimetype"],
411 |                     properties["statuscode"],
412 |                     properties["digest"],
413 |                     properties["length"],
414 |                 ) = property_value
415 | 
416 |                 yield CDXSnapshot(cast(Dict[str, str], properties))
417 | 


--------------------------------------------------------------------------------
/waybackpy/cdx_snapshot.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Module that contains the CDXSnapshot class, CDX records/lines are casted
 3 | to CDXSnapshot objects for easier access.
 4 | 
 5 | The CDX index format is plain text data. Each line ('record') indicates a
 6 | crawled document. And these lines are casted to CDXSnapshot.
 7 | """
 8 | 
 9 | 
10 | from datetime import datetime
11 | from typing import Dict
12 | 
13 | 
class CDXSnapshot:
    """
    One record ('line') of the CDX server API exposed as an object.

    Every field of the record is available as an attribute, and the string
    representation of an instance is the space separated record itself.

    Attributes:

    urlkey: The document captured, expressed as a SURT.
            SURT stands for Sort-friendly URI Reordering Transform, a
            transformation applied to URIs which makes their left-to-right
            representation better match the natural hierarchy of domain
            names. A URI <scheme://domain.tld/path?query> has SURT
            form <scheme://(tld,domain,)/path?query>.

    timestamp: The timestamp of the archive as a string in the format
               yyyyMMddhhmmss.

    datetime_timestamp: The same timestamp parsed into a datetime object.

    original: The original URL of the archive. If archive_url is
              https://web.archive.org/web/20220113130051/https://google.com
              then the original URL is https://google.com

    mimetype: The document's file type, e.g. text/html

    statuscode: HTTP response code for the document at the time of its
                crawling.

    digest: Base32-encoded SHA-1 checksum of the document, used to tell it
            apart from others.

    length: The document's volume of bytes in the WARC file.

    archive_url: The archive URL of the snapshot. It is not part of the
                 record; it is built from timestamp and original on init.
    """

    def __init__(self, properties: Dict[str, str]) -> None:
        # Read the fields in record order so that a missing key or a
        # malformed timestamp is reported before later fields are touched.
        urlkey = properties["urlkey"]
        timestamp = properties["timestamp"]
        parsed_timestamp = datetime.strptime(timestamp, "%Y%m%d%H%M%S")
        original = properties["original"]
        mimetype = properties["mimetype"]
        statuscode = properties["statuscode"]
        digest = properties["digest"]
        length = properties["length"]

        self.urlkey: str = urlkey
        self.timestamp: str = timestamp
        self.datetime_timestamp: datetime = parsed_timestamp
        self.original: str = original
        self.mimetype: str = mimetype
        self.statuscode: str = statuscode
        self.digest: str = digest
        self.length: str = length
        self.archive_url: str = f"https://web.archive.org/web/{timestamp}/{original}"

    def __repr__(self) -> str:
        """
        Identical to the string representation.
        """
        return self.__str__()

    def __str__(self) -> str:
        """
        The seven record fields joined by single spaces, in the column
        order used by the CDX server API.
        """
        columns = (
            self.urlkey,
            self.timestamp,
            self.original,
            self.mimetype,
            self.statuscode,
            self.digest,
            self.length,
        )
        return " ".join(f"{column}" for column in columns)
91 | 


--------------------------------------------------------------------------------
/waybackpy/cdx_utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utility functions required for accessing the CDX server API.
  3 | 
  4 | These are here in this module so that we don’t make any module too
  5 | long.
  6 | """
  7 | 
  8 | import re
  9 | from typing import Any, Dict, List, Optional, Union
 10 | from urllib.parse import quote
 11 | 
 12 | import requests
 13 | from requests.adapters import HTTPAdapter
 14 | from urllib3.util.retry import Retry
 15 | 
 16 | from .exceptions import BlockedSiteError, WaybackError
 17 | from .utils import DEFAULT_USER_AGENT
 18 | 
 19 | 
 20 | def get_total_pages(url: str, user_agent: str = DEFAULT_USER_AGENT) -> int:
 21 |     """
 22 |     When using the pagination use adding showNumPages=true to the request
 23 |     URL makes the CDX server return an integer which is the number of pages
 24 |     of CDX pages available for us to query using the pagination API.
 25 |     """
 26 |     endpoint = "https://web.archive.org/cdx/search/cdx?"
 27 |     payload = {"showNumPages": "true", "url": str(url)}
 28 |     headers = {"User-Agent": user_agent}
 29 |     request_url = full_url(endpoint, params=payload)
 30 |     response = get_response(request_url, headers=headers)
 31 |     check_for_blocked_site(response, url)
 32 |     if isinstance(response, requests.Response):
 33 |         return int(response.text.strip())
 34 |     raise response
 35 | 
 36 | 
 37 | def check_for_blocked_site(
 38 |     response: Union[requests.Response, Exception], url: Optional[str] = None
 39 | ) -> None:
 40 |     """
 41 |     Checks that the URL can be archived by wayback machine or not.
 42 |     robots.txt policy of the site may prevent the wayback machine.
 43 |     """
 44 |     # see https://github.com/akamhy/waybackpy/issues/157
 45 | 
 46 |     # the following if block is to make mypy happy.
 47 |     if isinstance(response, Exception):
 48 |         raise response
 49 | 
 50 |     if not url:
 51 |         url = "The requested content"
 52 |     if (
 53 |         "org.archive.util.io.RuntimeIOException: "
 54 |         + "org.archive.wayback.exception.AdministrativeAccessControlException: "
 55 |         + "Blocked Site Error"
 56 |         in response.text.strip()
 57 |     ):
 58 |         raise BlockedSiteError(
 59 |             f"{url} is excluded from Wayback Machine by the site's robots.txt policy."
 60 |         )
 61 | 
 62 | 
 63 | def full_url(endpoint: str, params: Dict[str, Any]) -> str:
 64 |     """
 65 |     As the function's name already implies that it returns
 66 |     full URL, but why we need a function for generating full URL?
 67 |     The CDX server can support multiple arguments for parameters
 68 |     such as filter and collapse and this function adds them without
 69 |     overwriting earlier added arguments.
 70 |     """
 71 |     if not params:
 72 |         return endpoint
 73 |     _full_url = endpoint if endpoint.endswith("?") else (endpoint + "?")
 74 | 
 75 |     for key, val in params.items():
 76 |         key = "filter" if key.startswith("filter") else key
 77 |         key = "collapse" if key.startswith("collapse") else key
 78 |         amp = "" if _full_url.endswith("?") else "&"
 79 |         val = quote(str(val), safe="")
 80 |         _full_url += f"{amp}{key}={val}"
 81 | 
 82 |     return _full_url
 83 | 
 84 | 
 85 | def get_response(
 86 |     url: str,
 87 |     headers: Optional[Dict[str, str]] = None,
 88 |     retries: int = 5,
 89 |     backoff_factor: float = 0.5,
 90 | ) -> Union[requests.Response, Exception]:
 91 |     """
 92 |     Makes get request to the CDX server and returns the response.
 93 |     """
 94 |     session = requests.Session()
 95 | 
 96 |     retries_ = Retry(
 97 |         total=retries,
 98 |         backoff_factor=backoff_factor,
 99 |         status_forcelist=[500, 502, 503, 504],
100 |     )
101 | 
102 |     session.mount("https://", HTTPAdapter(max_retries=retries_))
103 |     response = session.get(url, headers=headers)
104 |     session.close()
105 |     check_for_blocked_site(response)
106 |     return response
107 | 
108 | 
def check_filters(filters: List[str]) -> None:
    """
    Validate the filter arguments passed by the end-user.

    Raises WaybackError when filters is not a list or when one of its items
    does not contain the "[!]field:regex" form, where field is one of the
    CDX field names.
    """
    if not isinstance(filters, list):
        raise WaybackError("filters must be a list.")

    # [!]field:regex
    filter_pattern = re.compile(
        r"(\!?(?:urlkey|timestamp|original|mimetype|statuscode|digest|length)):"
        r"(.*)"
    )

    for candidate in filters:
        found = filter_pattern.search(candidate)

        if found is not None and len(found.groups()) == 2:
            continue

        exc_message = f"Filter '{candidate}' is not following the cdx filter syntax."
        raise WaybackError(exc_message)
129 | 
130 | 
def check_collapses(collapses: List[str]) -> bool:
    """
    Validate the collapse arguments passed by the end-user.

    Returns True when every item contains a CDX field name, optionally
    followed by a number. Raises WaybackError when collapses is not a list
    or when an item does not follow that syntax.
    """
    if not isinstance(collapses, list):
        raise WaybackError("collapses must be a list.")

    if not collapses:
        return True

    # field[:N]
    collapse_pattern = re.compile(
        r"(urlkey|timestamp|original|mimetype|statuscode|digest|length)"
        r"(:?[0-9]{1,99})?"
    )

    for candidate in collapses:
        found = collapse_pattern.search(candidate)
        if found is not None and len(found.groups()) == 2:
            continue
        exc_message = (
            f"collapse argument '{candidate}' "
            "is not following the cdx collapse syntax."
        )
        raise WaybackError(exc_message)

    return True
156 | 
157 | 
def check_match_type(match_type: Optional[str], url: str) -> bool:
    """
    Validate the match_type argument passed by the end-user.

    Returns True when no match_type is given or when it is one of "exact",
    "prefix", "host" and "domain". Raises WaybackError when a match_type is
    combined with a wildcard in the URL or when it is not one of the
    allowed values.
    """
    # Nothing to validate without a match type.
    if not match_type:
        return True

    if "*" in url:
        raise WaybackError(
            "Can not use wildcard in the URL along with the match_type arguments."
        )

    if match_type in ("exact", "prefix", "host", "domain"):
        return True

    exc_message = (
        f"{match_type} is not an allowed match type.\n"
        "Use one from 'exact', 'prefix', 'host' or 'domain'"
    )
    raise WaybackError(exc_message)
181 | 
182 | 
def check_sort(sort: Optional[str]) -> bool:
    """
    Validate the sort argument passed by the end-user.

    Returns True when no sort is given or when it is one of "default",
    "closest" and "reverse". Raises WaybackError for any other value.
    """
    # Nothing to validate without a sort argument.
    if not sort:
        return True

    if sort in ("default", "closest", "reverse"):
        return True

    exc_message = (
        f"{sort} is not an allowed argument for sort.\n"
        "Use one from 'default', 'closest' or 'reverse'"
    )
    raise WaybackError(exc_message)
202 | 


--------------------------------------------------------------------------------
/waybackpy/cli.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Module responsible for enabling waybackpy to function as a CLI tool.
  3 | """
  4 | 
  5 | import os
  6 | import random
  7 | import re
  8 | import string
  9 | from typing import Any, Dict, Generator, List, Optional
 10 | 
 11 | import click
 12 | import requests
 13 | 
 14 | from . import __version__
 15 | from .cdx_api import WaybackMachineCDXServerAPI
 16 | from .exceptions import BlockedSiteError, NoCDXRecordFound
 17 | from .save_api import WaybackMachineSaveAPI
 18 | from .utils import DEFAULT_USER_AGENT
 19 | from .wrapper import Url
 20 | 
 21 | 
 22 | def handle_cdx_closest_derivative_methods(
 23 |     cdx_api: "WaybackMachineCDXServerAPI",
 24 |     oldest: bool,
 25 |     near: bool,
 26 |     newest: bool,
 27 |     near_args: Optional[Dict[str, int]] = None,
 28 | ) -> None:
 29 |     """
 30 |     Handles the closest parameter derivative methods.
 31 | 
 32 |     near, newest and oldest use the closest parameter with active
 33 |     closest based sorting.
 34 |     """
 35 |     try:
 36 |         if near:
 37 |             if near_args:
 38 |                 archive_url = cdx_api.near(**near_args).archive_url
 39 |             else:
 40 |                 archive_url = cdx_api.near().archive_url
 41 |         elif newest:
 42 |             archive_url = cdx_api.newest().archive_url
 43 |         elif oldest:
 44 |             archive_url = cdx_api.oldest().archive_url
 45 |         click.echo("Archive URL:")
 46 |         click.echo(archive_url)
 47 |     except NoCDXRecordFound as exc:
 48 |         click.echo(click.style("NoCDXRecordFound: ", fg="red") + str(exc), err=True)
 49 |     except BlockedSiteError as exc:
 50 |         click.echo(click.style("BlockedSiteError: ", fg="red") + str(exc), err=True)
 51 | 
 52 | 
 53 | def handle_cdx(data: List[Any]) -> None:
 54 |     """
 55 |     Handles the CDX CLI options and output format.
 56 |     """
 57 |     url = data[0]
 58 |     user_agent = data[1]
 59 |     start_timestamp = data[2]
 60 |     end_timestamp = data[3]
 61 |     cdx_filter = data[4]
 62 |     collapse = data[5]
 63 |     cdx_print = data[6]
 64 |     limit = data[7]
 65 |     gzip = data[8]
 66 |     match_type = data[9]
 67 |     sort = data[10]
 68 |     use_pagination = data[11]
 69 |     closest = data[12]
 70 | 
 71 |     filters = list(cdx_filter)
 72 |     collapses = list(collapse)
 73 |     cdx_print = list(cdx_print)
 74 | 
 75 |     cdx_api = WaybackMachineCDXServerAPI(
 76 |         url,
 77 |         user_agent=user_agent,
 78 |         start_timestamp=start_timestamp,
 79 |         end_timestamp=end_timestamp,
 80 |         closest=closest,
 81 |         filters=filters,
 82 |         match_type=match_type,
 83 |         sort=sort,
 84 |         use_pagination=use_pagination,
 85 |         gzip=gzip,
 86 |         collapses=collapses,
 87 |         limit=limit,
 88 |     )
 89 | 
 90 |     snapshots = cdx_api.snapshots()
 91 | 
 92 |     for snapshot in snapshots:
 93 |         if len(cdx_print) == 0:
 94 |             click.echo(snapshot)
 95 |         else:
 96 |             output_string = []
 97 |             if any(val in cdx_print for val in ["urlkey", "url-key", "url_key"]):
 98 |                 output_string.append(snapshot.urlkey)
 99 |             if any(
100 |                 val in cdx_print for val in ["timestamp", "time-stamp", "time_stamp"]
101 |             ):
102 |                 output_string.append(snapshot.timestamp)
103 |             if "original" in cdx_print:
104 |                 output_string.append(snapshot.original)
105 |             if any(val in cdx_print for val in ["mimetype", "mime-type", "mime_type"]):
106 |                 output_string.append(snapshot.mimetype)
107 |             if any(
108 |                 val in cdx_print for val in ["statuscode", "status-code", "status_code"]
109 |             ):
110 |                 output_string.append(snapshot.statuscode)
111 |             if "digest" in cdx_print:
112 |                 output_string.append(snapshot.digest)
113 |             if "length" in cdx_print:
114 |                 output_string.append(snapshot.length)
115 |             if any(
116 |                 val in cdx_print for val in ["archiveurl", "archive-url", "archive_url"]
117 |             ):
118 |                 output_string.append(snapshot.archive_url)
119 | 
120 |             click.echo(" ".join(output_string))
121 | 
122 | 
123 | def save_urls_on_file(url_gen: Generator[str, None, None]) -> None:
124 |     """
125 |     Save output of CDX API on file.
126 |     Mainly here because of backwards compatibility.
127 |     """
128 |     domain = None
129 |     sys_random = random.SystemRandom()
130 |     uid = "".join(
131 |         sys_random.choice(string.ascii_lowercase + string.digits) for _ in range(6)
132 |     )
133 |     url_count = 0
134 |     file_name = None
135 | 
136 |     for url in url_gen:
137 |         url_count += 1
138 |         if not domain:
139 |             match = re.search("https?://([A-Za-z_0-9.-]+).*", url)
140 | 
141 |             domain = "domain-unknown"
142 | 
143 |             if match:
144 |                 domain = match.group(1)
145 | 
146 |             file_name = f"{domain}-urls-{uid}.txt"
147 |             file_path = os.path.join(os.getcwd(), file_name)
148 |             if not os.path.isfile(file_path):
149 |                 with open(file_path, "w+", encoding="utf-8") as file:
150 |                     file.close()
151 | 
152 |         with open(file_path, "a", encoding="utf-8") as file:
153 |             file.write(f"{url}\n")
154 | 
155 |         click.echo(url)
156 | 
157 |     if url_count > 0:
158 |         click.echo(
159 |             f"\n\n{url_count} URLs saved inside '{file_name}' in the current "
160 |             + "working directory."
161 |         )
162 |     else:
163 |         click.echo("No known URLs found. Please try a diffrent input!")
164 | 
165 | 
166 | @click.command()
167 | @click.option(
168 |     "-u", "--url", help="URL on which Wayback machine operations are to be performed."
169 | )
170 | @click.option(
171 |     "-ua",
172 |     "--user-agent",
173 |     "--user_agent",
174 |     default=DEFAULT_USER_AGENT,
175 |     help=f"User agent, default value is '{DEFAULT_USER_AGENT}'.",
176 | )
177 | @click.option("-v", "--version", is_flag=True, default=False, help="waybackpy version.")
178 | @click.option(
179 |     "-l",
180 |     "--show-license",
181 |     "--show_license",
182 |     "--license",
183 |     is_flag=True,
184 |     default=False,
185 |     help="Show license of Waybackpy.",
186 | )
187 | @click.option(
188 |     "-n",
189 |     "--newest",
190 |     "-au",
191 |     "--archive_url",
192 |     "--archive-url",
193 |     default=False,
194 |     is_flag=True,
195 |     help="Retrieve the newest archive of URL.",
196 | )
197 | @click.option(
198 |     "-o",
199 |     "--oldest",
200 |     default=False,
201 |     is_flag=True,
202 |     help="Retrieve the oldest archive of URL.",
203 | )
204 | @click.option(
205 |     "-N",
206 |     "--near",
207 |     default=False,
208 |     is_flag=True,
209 |     help="Archive close to a specified time.",
210 | )
211 | @click.option("-Y", "--year", type=click.IntRange(1994, 9999), help="Year in integer.")
212 | @click.option("-M", "--month", type=click.IntRange(1, 12), help="Month in integer.")
213 | @click.option("-D", "--day", type=click.IntRange(1, 31), help="Day in integer.")
214 | @click.option("-H", "--hour", type=click.IntRange(0, 24), help="Hour in integer.")
215 | @click.option("-MIN", "--minute", type=click.IntRange(0, 60), help="Minute in integer.")
216 | @click.option(
217 |     "-s",
218 |     "--save",
219 |     default=False,
220 |     is_flag=True,
221 |     help="Save the specified URL's webpage and print the archive URL.",
222 | )
223 | @click.option(
224 |     "-h",
225 |     "--headers",
226 |     default=False,
227 |     is_flag=True,
228 |     help="Headers data of the SavePageNow API.",
229 | )
230 | @click.option(
231 |     "-ku",
232 |     "--known-urls",
233 |     "--known_urls",
234 |     default=False,
235 |     is_flag=True,
236 |     help="List known URLs. Uses CDX API.",
237 | )
238 | @click.option(
239 |     "-sub",
240 |     "--subdomain",
241 |     default=False,
242 |     is_flag=True,
243 |     help="Use with '--known_urls' to include known URLs for subdomains.",
244 | )
245 | @click.option(
246 |     "-f",
247 |     "--file",
248 |     default=False,
249 |     is_flag=True,
250 |     help="Use with '--known_urls' to save the URLs in file at current directory.",
251 | )
252 | @click.option(
253 |     "--cdx",
254 |     default=False,
255 |     is_flag=True,
256 |     help="Flag for using CDX API.",
257 | )
258 | @click.option(
259 |     "-st",
260 |     "--start-timestamp",
261 |     "--start_timestamp",
262 |     "--from",
263 |     help="Start timestamp for CDX API in yyyyMMddhhmmss format.",
264 | )
265 | @click.option(
266 |     "-et",
267 |     "--end-timestamp",
268 |     "--end_timestamp",
269 |     "--to",
270 |     help="End timestamp for CDX API in yyyyMMddhhmmss format.",
271 | )
272 | @click.option(
273 |     "-C",
274 |     "--closest",
275 |     help="Archive that are closest the timestamp passed as arguments to this "
276 |     + "parameter.",
277 | )
278 | @click.option(
279 |     "-f",
280 |     "--cdx-filter",
281 |     "--cdx_filter",
282 |     "--filter",
283 |     multiple=True,
284 |     help="Filter on a specific field or all the CDX fields.",
285 | )
286 | @click.option(
287 |     "-mt",
288 |     "--match-type",
289 |     "--match_type",
290 |     help="The default behavior is to return matches for an exact URL. "
291 |     + "However, the CDX server can also return results matching a certain prefix, "
292 |     + "a certain host, or all sub-hosts by using the match_type",
293 | )
294 | @click.option(
295 |     "-st",
296 |     "--sort",
297 |     help="Choose one from default, closest or reverse. It returns sorted CDX entries "
298 |     + "in the response.",
299 | )
300 | @click.option(
301 |     "-up",
302 |     "--use-pagination",
303 |     "--use_pagination",
304 |     default=False,
305 |     is_flag=True,
306 |     help="Use the pagination API of the CDX server instead of the default one.",
307 | )
308 | @click.option(
309 |     "-gz",
310 |     "--gzip",
311 |     help="To disable gzip compression pass false as argument to this parameter. "
312 |     + "The default behavior is gzip compression enabled.",
313 | )
314 | @click.option(
315 |     "-c",
316 |     "--collapse",
317 |     multiple=True,
318 |     help="Filtering or 'collapse' results based on a field, or a substring of a field.",
319 | )
320 | @click.option(
321 |     "-l",
322 |     "--limit",
323 |     help="Number of maximum record that CDX API is asked to return per API call, "
324 |     + "default value is 25000 records.",
325 | )
326 | @click.option(
327 |     "-cp",
328 |     "--cdx-print",
329 |     "--cdx_print",
330 |     multiple=True,
331 |     help="Print only certain fields of the CDX API response, "
332 |     + "if this parameter is not used then the plain text response of the CDX API "
333 |     + "will be printed.",
334 | )
335 | def main(  # pylint: disable=no-value-for-parameter
336 |     user_agent: str,
337 |     version: bool,
338 |     show_license: bool,
339 |     newest: bool,
340 |     oldest: bool,
341 |     near: bool,
342 |     save: bool,
343 |     headers: bool,
344 |     known_urls: bool,
345 |     subdomain: bool,
346 |     file: bool,
347 |     cdx: bool,
348 |     use_pagination: bool,
349 |     cdx_filter: List[str],
350 |     collapse: List[str],
351 |     cdx_print: List[str],
352 |     url: Optional[str] = None,
353 |     year: Optional[int] = None,
354 |     month: Optional[int] = None,
355 |     day: Optional[int] = None,
356 |     hour: Optional[int] = None,
357 |     minute: Optional[int] = None,
358 |     start_timestamp: Optional[str] = None,
359 |     end_timestamp: Optional[str] = None,
360 |     closest: Optional[str] = None,
361 |     match_type: Optional[str] = None,
362 |     sort: Optional[str] = None,
363 |     gzip: Optional[str] = None,
364 |     limit: Optional[str] = None,
365 | ) -> None:
366 |     """\b
367 |                          _                _
368 |                         | |              | |
369 |     __      ____ _ _   _| |__   __ _  ___| | ___ __  _   _
370 |     \\ \\ /\\ / / _` | | | | '_ \\ / _` |/ __| |/ / '_ \\| | | |
371 |      \\ V  V / (_| | |_| | |_) | (_| | (__|   <| |_) | |_| |
372 |       \\_/\\_/ \\__,_|\\__, |_.__/ \\__,_|\\___|_|\\_\\ .__/ \\__, |
373 |                     __/ |                     | |     __/ |
374 |                    |___/                      |_|    |___/
375 | 
376 |     Python package & CLI tool that interfaces the Wayback Machine APIs
377 | 
378 |     Repository: https://github.com/akamhy/waybackpy
379 | 
380 |     Documentation: https://github.com/akamhy/waybackpy/wiki/CLI-docs
381 | 
382 |     waybackpy - CLI usage(Demo video): https://asciinema.org/a/469890
383 | 
384 |     Released under the MIT License. Use the flag --license for license.
385 | 
386 |     """
387 |     if version:
388 |         click.echo(f"waybackpy version {__version__}")
389 | 
390 |     elif show_license:
391 |         click.echo(
392 |             requests.get(
393 |                 url="https://raw.githubusercontent.com/akamhy/waybackpy/master/LICENSE"
394 |             ).text
395 |         )
396 |     elif url is None:
397 |         click.echo(
398 |             click.style("NoURLDetected: ", fg="red")
399 |             + "No URL detected. "
400 |             + "Please provide an URL.",
401 |             err=True,
402 |         )
403 | 
404 |     elif oldest:
405 |         cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
406 |         handle_cdx_closest_derivative_methods(cdx_api, oldest, near, newest)
407 | 
408 |     elif newest:
409 |         cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
410 |         handle_cdx_closest_derivative_methods(cdx_api, oldest, near, newest)
411 | 
412 |     elif near:
413 |         cdx_api = WaybackMachineCDXServerAPI(url, user_agent=user_agent)
414 |         near_args = {}
415 |         keys = ["year", "month", "day", "hour", "minute"]
416 |         args_arr = [year, month, day, hour, minute]
417 |         for key, arg in zip(keys, args_arr):
418 |             if arg:
419 |                 near_args[key] = arg
420 |         handle_cdx_closest_derivative_methods(
421 |             cdx_api, oldest, near, newest, near_args=near_args
422 |         )
423 | 
424 |     elif save:
425 |         save_api = WaybackMachineSaveAPI(url, user_agent=user_agent)
426 |         save_api.save()
427 |         click.echo("Archive URL:")
428 |         click.echo(save_api.archive_url)
429 |         click.echo("Cached save:")
430 |         click.echo(save_api.cached_save)
431 |         if headers:
432 |             click.echo("Save API headers:")
433 |             click.echo(save_api.headers)
434 | 
435 |     elif known_urls:
436 |         wayback = Url(url, user_agent)
437 |         url_gen = wayback.known_urls(subdomain=subdomain)
438 | 
439 |         if file:
440 |             save_urls_on_file(url_gen)
441 |         else:
442 |             for url_ in url_gen:
443 |                 click.echo(url_)
444 | 
445 |     elif cdx:
446 |         data = [
447 |             url,
448 |             user_agent,
449 |             start_timestamp,
450 |             end_timestamp,
451 |             cdx_filter,
452 |             collapse,
453 |             cdx_print,
454 |             limit,
455 |             gzip,
456 |             match_type,
457 |             sort,
458 |             use_pagination,
459 |             closest,
460 |         ]
461 |         handle_cdx(data)
462 | 
463 |     else:
464 | 
465 |         click.echo(
466 |             click.style("NoCommandFound: ", fg="red")
467 |             + "Only URL passed, but did not specify what to do with the URL. "
468 |             + "Use --help flag for help using waybackpy.",
469 |             err=True,
470 |         )
471 | 
472 | 
473 | if __name__ == "__main__":  # Run the CLI when executed as a script.
474 |     main()  # type: ignore # pylint: disable=no-value-for-parameter
475 | 


--------------------------------------------------------------------------------
/waybackpy/exceptions.py:
--------------------------------------------------------------------------------
 1 | """
 2 | waybackpy.exceptions
 3 | ~~~~~~~~~~~~~~~~~~~
 4 | This module contains the set of Waybackpy's exceptions.
 5 | """
 6 | 
 7 | 
 8 | class WaybackError(Exception):
 9 |     """
10 |     Base exception of Waybackpy; raised when it can not return what was asked.
11 | 
12 |     1) The Wayback Machine API service is unreachable or down.
13 |     2) Illegal arguments were passed.
14 | 
15 |     Every other Waybackpy exception inherits from this class.
16 |     """
17 | 
18 | 
19 | class NoCDXRecordFound(WaybackError):
20 |     """
21 |     The CDX server returned no records for a query.
22 |     Raised when the near(), newest() or oldest() methods are invoked
23 |     and there are no archives.
24 |     """
25 | 
26 | 
27 | class BlockedSiteError(WaybackError):
28 |     """
29 |     Raised when archives of a website/URL that was excluded from the Wayback
30 |     Machine are requested via the CDX server API.
31 |     """
32 | 
33 | 
34 | class TooManyRequestsError(WaybackError):
35 |     """
36 |     Raised when more than 15 requests are made per
37 |     minute and the Wayback Machine responds with HTTP 429.
38 | 
39 |     See https://github.com/akamhy/waybackpy/issues/131
40 |     """
41 | 
42 | 
43 | class MaximumRetriesExceeded(WaybackError):
44 |     """
45 |     Base class for errors signalling that the allowed retries were exhausted.
46 |     """
47 | 
48 | 
49 | class MaximumSaveRetriesExceeded(MaximumRetriesExceeded):
50 |     """
51 |     Raised by WaybackMachineSaveAPI.save() when every try fails to get an archive.
52 |     """
53 | 
54 | 
55 | class ArchiveNotInAvailabilityAPIResponse(WaybackError):
56 |     """
57 |     The archive could not be parsed from the availability API's JSON response.
58 |     """
59 | 
60 | 
61 | class InvalidJSONInAvailabilityAPIResponse(WaybackError):
62 |     """
63 |     The availability API returned invalid JSON.
64 |     """
65 | 


--------------------------------------------------------------------------------
/waybackpy/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akamhy/waybackpy/3b3e78d901a600bb22943202c6a8981ca04a5e48/waybackpy/py.typed


--------------------------------------------------------------------------------
/waybackpy/save_api.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module interfaces the Wayback Machine's SavePageNow (SPN) API.
  3 | 
  4 | The module has WaybackMachineSaveAPI class which should be used by the users of
  5 | this module to use the SavePageNow API.
  6 | """
  7 | 
  8 | import re
  9 | import time
 10 | from datetime import datetime
 11 | from typing import Dict, Optional
 12 | 
 13 | import requests
 14 | from requests.adapters import HTTPAdapter
 15 | from requests.models import Response
 16 | from requests.structures import CaseInsensitiveDict
 17 | from urllib3.util.retry import Retry
 18 | 
 19 | from .exceptions import MaximumSaveRetriesExceeded, TooManyRequestsError, WaybackError
 20 | from .utils import DEFAULT_USER_AGENT
 21 | 
 22 | 
 23 | class WaybackMachineSaveAPI:
 24 |     """
 25 |     WaybackMachineSaveAPI class provides an interface for saving URLs on the
 26 |     Wayback Machine.
 27 |     """
 28 | 
 29 |     def __init__(
 30 |         self,
 31 |         url: str,
 32 |         user_agent: str = DEFAULT_USER_AGENT,
 33 |         max_tries: int = 8,
 34 |     ) -> None:
 35 |         self.url = str(url).strip().replace(" ", "%20")
 36 |         self.request_url = "https://web.archive.org/save/" + self.url
 37 |         self.user_agent = user_agent
 38 |         self.request_headers: Dict[str, str] = {"User-Agent": self.user_agent}
 39 |         if max_tries < 1:
 40 |             raise ValueError("max_tries should be positive")
 41 |         self.max_tries = max_tries
 42 |         self.total_save_retries = 5
 43 |         self.backoff_factor = 0.5
 44 |         self.status_forcelist = [500, 502, 503, 504]
 45 |         self._archive_url: Optional[str] = None
 46 |         self.instance_birth_time = datetime.utcnow()
 47 |         self.response: Optional[Response] = None
 48 |         self.headers: Optional[CaseInsensitiveDict[str]] = None
 49 |         self.status_code: Optional[int] = None
 50 |         self.response_url: Optional[str] = None
 51 |         self.cached_save: Optional[bool] = None
 52 |         self.saved_archive: Optional[str] = None
 53 | 
 54 |     @property
 55 |     def archive_url(self) -> str:
 56 |         """
 57 |         Returns the archive URL is already cached by _archive_url
 58 |         else invoke the save method to save the archive which returns the
 59 |         archive thus we return the methods return value.
 60 |         """
 61 |         if self._archive_url:
 62 |             return self._archive_url
 63 | 
 64 |         return self.save()
 65 | 
 66 |     def get_save_request_headers(self) -> None:
 67 |         """
 68 |         Creates a session and tries 'retries' number of times to
 69 |         retrieve the archive.
 70 | 
 71 |         If successful in getting the response, sets the headers, status_code
 72 |         and response_url attributes.
 73 | 
 74 |         The archive is usually in the headers but it can also be the response URL
 75 |         as the Wayback Machine redirects to the archive after a successful capture
 76 |         of the webpage.
 77 | 
 78 |         Wayback Machine's save API is known
 79 |         to be very unreliable thus if it fails first check opening
 80 |         the response URL yourself in the browser.
 81 |         """
 82 |         session = requests.Session()
 83 |         retries = Retry(
 84 |             total=self.total_save_retries,
 85 |             backoff_factor=self.backoff_factor,
 86 |             status_forcelist=self.status_forcelist,
 87 |         )
 88 |         session.mount("https://", HTTPAdapter(max_retries=retries))
 89 |         self.response = session.get(self.request_url, headers=self.request_headers)
 90 |         # requests.response.headers is requests.structures.CaseInsensitiveDict
 91 |         self.headers = self.response.headers
 92 |         self.status_code = self.response.status_code
 93 |         self.response_url = self.response.url
 94 |         session.close()
 95 | 
 96 |         if self.status_code == 429:
 97 |             # why wait 5 minutes and 429?
 98 |             # see https://github.com/akamhy/waybackpy/issues/97
 99 |             raise TooManyRequestsError(
100 |                 f"Can not save '{self.url}'. "
101 |                 f"Save request refused by the server. "
102 |                 f"Save Page Now limits saving 15 URLs per minutes. "
103 |                 f"Try waiting for 5 minutes and then try again."
104 |             )
105 | 
106 |         # why 509?
107 |         # see https://github.com/akamhy/waybackpy/pull/99
108 |         # also https://t.co/xww4YJ0Iwc
109 |         if self.status_code == 509:
110 |             raise WaybackError(
111 |                 f"Can not save '{self.url}'. You have probably reached the "
112 |                 f"limit of active sessions."
113 |             )
114 | 
115 |     def archive_url_parser(self) -> Optional[str]:
116 |         """
117 |         Three regexen (like oxen?) are used to search for the
118 |         archive URL in the headers and finally look in the response URL
119 |         for the archive URL.
120 |         """
121 |         regex1 = r"Content-Location: (/web/[0-9]{14}/.*)"
122 |         match = re.search(regex1, str(self.headers))
123 |         if match:
124 |             return "https://web.archive.org" + match.group(1)
125 | 
126 |         regex2 = r"rel=\"memento.*?(web\.archive\.org/web/[0-9]{14}/.*?)>"
127 |         match = re.search(regex2, str(self.headers))
128 |         if match is not None and len(match.groups()) == 1:
129 |             return "https://" + match.group(1)
130 | 
131 |         regex3 = r"X-Cache-Key:\shttps(.*)[A-Z]{2}"
132 |         match = re.search(regex3, str(self.headers))
133 |         if match is not None and len(match.groups()) == 1:
134 |             return "https" + match.group(1)
135 | 
136 |         self.response_url = (
137 |             "" if self.response_url is None else self.response_url.strip()
138 |         )
139 |         regex4 = r"web\.archive\.org/web/(?:[0-9]*?)/(?:.*)$"
140 |         match = re.search(regex4, self.response_url)
141 |         if match is not None:
142 |             return "https://" + match.group(0)
143 | 
144 |         return None
145 | 
146 |     @staticmethod
147 |     def sleep(tries: int) -> None:
148 |         """
149 |         Ensure that the we wait some time before succesive retries so that we
150 |         don't waste the retries before the page is even captured by the Wayback
151 |         Machine crawlers also ensures that we are not putting too much load on
152 |         the Wayback Machine's save API.
153 | 
154 |         If tries are multiple of 3 sleep 10 seconds else sleep 5 seconds.
155 |         """
156 |         sleep_seconds = 5
157 |         if tries % 3 == 0:
158 |             sleep_seconds = 10
159 |         time.sleep(sleep_seconds)
160 | 
161 |     def timestamp(self) -> datetime:
162 |         """
163 |         Read the timestamp off the archive URL and convert the Wayback Machine
164 |         timestamp to datetime object.
165 | 
166 |         Also check if the time on archive is URL and compare it to instance birth
167 |         time.
168 | 
169 |         If time on the archive is older than the instance creation time set the
170 |         cached_save to True else set it to False. The flag can be used to check
171 |         if the Wayback Machine didn't serve a Cached URL. It is quite common for
172 |         the Wayback Machine to serve cached archive if last archive was captured
173 |         before last 45 minutes.
174 |         """
175 |         regex = r"https?://web\.archive.org/web/([0-9]{14})/http"
176 |         match = re.search(regex, str(self._archive_url))
177 | 
178 |         if match is None or len(match.groups()) != 1:
179 |             raise ValueError(
180 |                 f"Can not parse timestamp from archive URL, '{self._archive_url}'."
181 |             )
182 | 
183 |         string_timestamp = match.group(1)
184 |         timestamp = datetime.strptime(string_timestamp, "%Y%m%d%H%M%S")
185 |         timestamp_unixtime = time.mktime(timestamp.timetuple())
186 |         instance_birth_time_unixtime = time.mktime(self.instance_birth_time.timetuple())
187 | 
188 |         if timestamp_unixtime < instance_birth_time_unixtime:
189 |             self.cached_save = True
190 |         else:
191 |             self.cached_save = False
192 | 
193 |         return timestamp
194 | 
195 |     def save(self) -> str:
196 |         """
197 |         Calls the SavePageNow API of the Wayback Machine with required parameters
198 |         and headers to save the URL.
199 | 
200 |         Raises MaximumSaveRetriesExceeded is maximum retries are exhausted but still
201 |         we were unable to retrieve the archive from the Wayback Machine.
202 |         """
203 |         self.saved_archive = None
204 |         tries = 0
205 | 
206 |         while True:
207 |             if tries >= 1:
208 |                 self.sleep(tries)
209 | 
210 |             self.get_save_request_headers()
211 |             self.saved_archive = self.archive_url_parser()
212 | 
213 |             if isinstance(self.saved_archive, str):
214 |                 self._archive_url = self.saved_archive
215 |                 self.timestamp()
216 |                 return self.saved_archive
217 | 
218 |             tries += 1
219 |             if tries >= self.max_tries:
220 |                 raise MaximumSaveRetriesExceeded(
221 |                     f"Tried {tries} times but failed to save "
222 |                     f"and retrieve the archive for {self.url}.\n"
223 |                     f"Response URL:\n{self.response_url}\n"
224 |                     f"Response Header:\n{self.headers}"
225 |                 )
226 | 


--------------------------------------------------------------------------------
/waybackpy/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Utility functions and shared variables like DEFAULT_USER_AGENT are here.
 3 | """
 4 | 
 5 | from datetime import datetime
 6 | 
 7 | from . import __version__
 8 | 
 9 | DEFAULT_USER_AGENT: str = (
10 |     f"waybackpy {__version__} - https://github.com/akamhy/waybackpy"
11 | )  # Default User-Agent string: package name, version and repository URL.
12 | 
13 | 
14 | def unix_timestamp_to_wayback_timestamp(unix_timestamp: int) -> str:
15 |     """
16 |     Converts Unix time to Wayback Machine timestamp, Wayback Machine
17 |     timestamp format is yyyyMMddhhmmss.
18 |     """
19 |     return datetime.utcfromtimestamp(int(unix_timestamp)).strftime("%Y%m%d%H%M%S")
20 | 
21 | 
22 | def wayback_timestamp(**kwargs: int) -> str:
23 |     """
24 |     Prepends zero before the year, month, day, hour and minute so that they
25 |     are conformable with the YYYYMMDDhhmmss Wayback Machine timestamp format.
26 |     """
27 |     return "".join(
28 |         str(kwargs[key]).zfill(2) for key in ["year", "month", "day", "hour", "minute"]
29 |     )
30 | 


--------------------------------------------------------------------------------
/waybackpy/wrapper.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module exists because backwards compatibility matters.
  3 | Don't touch this or add any new functionality here and don't use
  4 | the Url class.
  5 | """
  6 | 
  7 | from datetime import datetime, timedelta
  8 | from typing import Generator, Optional
  9 | 
 10 | from requests.structures import CaseInsensitiveDict
 11 | 
 12 | from .availability_api import ResponseJSON, WaybackMachineAvailabilityAPI
 13 | from .cdx_api import WaybackMachineCDXServerAPI
 14 | from .save_api import WaybackMachineSaveAPI
 15 | from .utils import DEFAULT_USER_AGENT
 16 | 
 17 | 
 18 | class Url:
 19 |     """
 20 |     The Url class is not recommended to be used anymore, instead use:
 21 | 
 22 |     - WaybackMachineSaveAPI
 23 |     - WaybackMachineAvailabilityAPI
 24 |     - WaybackMachineCDXServerAPI
 25 | 
 26 |     The reason it is still in the code is backwards compatibility with 2.x.x
 27 |     versions.
 28 | 
 29 |     If you were using Url before the update to version 3.x.x, your code should
 30 |     still work fine and there is no hurry to update the interface, but it is
 31 |     recommended that you do not use the Url class for new code as it will be
 32 |     removed after 2025. The first 3.x.x version was released in January 2022
 33 |     and three years are more than enough to update the older interface code.
 34 |     """
 35 | 
 36 |     def __init__(self, url: str, user_agent: str = DEFAULT_USER_AGENT) -> None:
 37 |         self.url = url
 38 |         self.user_agent = str(user_agent)
 39 |         self.archive_url: Optional[str] = None
 40 |         self.timestamp: Optional[datetime] = None
 41 |         self.wayback_machine_availability_api = WaybackMachineAvailabilityAPI(
 42 |             self.url, user_agent=self.user_agent
 43 |         )
 44 |         self.wayback_machine_save_api: Optional[WaybackMachineSaveAPI] = None
 45 |         self.headers: Optional[CaseInsensitiveDict[str]] = None
 46 |         self.json: Optional[ResponseJSON] = None
 47 | 
 48 |     def __str__(self) -> str:
 49 |         if not self.archive_url:
 50 |             self.newest()
 51 |         return str(self.archive_url)
 52 | 
 53 |     def __len__(self) -> int:
 54 |         td_max = timedelta(
 55 |             days=999999999, hours=23, minutes=59, seconds=59, microseconds=999999
 56 |         )
 57 | 
 58 |         if not isinstance(self.timestamp, datetime):
 59 |             self.oldest()
 60 | 
 61 |         if not isinstance(self.timestamp, datetime):
 62 |             raise TypeError("timestamp must be a datetime")
 63 | 
 64 |         if self.timestamp == datetime.max:
 65 |             return td_max.days
 66 | 
 67 |         return (datetime.utcnow() - self.timestamp).days
 68 | 
 69 |     def save(self) -> "Url":
 70 |         """Save the URL on wayback machine."""
 71 |         self.wayback_machine_save_api = WaybackMachineSaveAPI(
 72 |             self.url, user_agent=self.user_agent
 73 |         )
 74 |         self.archive_url = self.wayback_machine_save_api.archive_url
 75 |         self.timestamp = self.wayback_machine_save_api.timestamp()
 76 |         self.headers = self.wayback_machine_save_api.headers
 77 |         return self
 78 | 
 79 |     def near(
 80 |         self,
 81 |         year: Optional[int] = None,
 82 |         month: Optional[int] = None,
 83 |         day: Optional[int] = None,
 84 |         hour: Optional[int] = None,
 85 |         minute: Optional[int] = None,
 86 |         unix_timestamp: Optional[int] = None,
 87 |     ) -> "Url":
 88 |         """Returns the archive of the URL close to a date and time."""
 89 |         self.wayback_machine_availability_api.near(
 90 |             year=year,
 91 |             month=month,
 92 |             day=day,
 93 |             hour=hour,
 94 |             minute=minute,
 95 |             unix_timestamp=unix_timestamp,
 96 |         )
 97 |         self.set_availability_api_attrs()
 98 |         return self
 99 | 
100 |     def oldest(self) -> "Url":
101 |         """Returns the oldest archive of the URL."""
102 |         self.wayback_machine_availability_api.oldest()
103 |         self.set_availability_api_attrs()
104 |         return self
105 | 
106 |     def newest(self) -> "Url":
107 |         """Returns the newest archive of the URL."""
108 |         self.wayback_machine_availability_api.newest()
109 |         self.set_availability_api_attrs()
110 |         return self
111 | 
112 |     def set_availability_api_attrs(self) -> None:
113 |         """Set the attributes for total backwards compatibility."""
114 |         self.archive_url = self.wayback_machine_availability_api.archive_url
115 |         self.json = self.wayback_machine_availability_api.json
116 |         self.JSON = self.json  # for backwards compatibility, do not remove it.
117 |         self.timestamp = self.wayback_machine_availability_api.timestamp()
118 | 
119 |     def total_archives(
120 |         self, start_timestamp: Optional[str] = None, end_timestamp: Optional[str] = None
121 |     ) -> int:
122 |         """
123 |         Returns an integer which indicates total number of archives for an URL.
124 |         Useless in my opinion, only here because of backwards compatibility.
125 |         """
126 |         cdx = WaybackMachineCDXServerAPI(
127 |             self.url,
128 |             user_agent=self.user_agent,
129 |             start_timestamp=start_timestamp,
130 |             end_timestamp=end_timestamp,
131 |         )
132 | 
133 |         count = 0
134 |         for _ in cdx.snapshots():
135 |             count = count + 1
136 |         return count
137 | 
138 |     def known_urls(
139 |         self,
140 |         subdomain: bool = False,
141 |         host: bool = False,
142 |         start_timestamp: Optional[str] = None,
143 |         end_timestamp: Optional[str] = None,
144 |         match_type: str = "prefix",
145 |     ) -> Generator[str, None, None]:
146 |         """Yields known URLs for any URL."""
147 |         if subdomain:
148 |             match_type = "domain"
149 |         if host:
150 |             match_type = "host"
151 | 
152 |         cdx = WaybackMachineCDXServerAPI(
153 |             self.url,
154 |             user_agent=self.user_agent,
155 |             start_timestamp=start_timestamp,
156 |             end_timestamp=end_timestamp,
157 |             match_type=match_type,
158 |             collapses=["urlkey"],
159 |         )
160 | 
161 |         for snapshot in cdx.snapshots():
162 |             yield snapshot.original
163 | 


--------------------------------------------------------------------------------