├── .bumpversion.cfg ├── .darglint ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ ├── pre-commit-autoupdate.yml │ ├── release.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE.rst ├── README.md ├── codecov.yml ├── img └── web-transpose-cover.png ├── noxfile.py ├── poetry.lock ├── pyproject.toml ├── src └── webtranspose │ ├── __init__.py │ ├── chat.py │ ├── crawl.py │ ├── openai.py │ ├── scrape.py │ ├── search.py │ └── webt_api.py ├── tasks.py └── tests ├── Untitled.ipynb ├── __init__.py └── test_webtranspose.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | commit = True 3 | tag = False 4 | current_version = 0.1.0 5 | 6 | [bumpversion:file:pyproject.toml] 7 | search = version = "{current_version}" 8 | replace = version = "{new_version}" 9 | 10 | [bumpversion:file:src/webtranspose/__init__.py] 11 | search = __version__ = "{current_version}" 12 | replace = __version__ = "{new_version}" 13 | 14 | [bumpversion:file(title):CHANGELOG.md] 15 | search = {#}{#} [Unreleased] 16 | replace = {#}{#} [Unreleased] 17 | 18 | {#}{#} [{new_version}] - {now:%Y-%m-%d} 19 | 20 | [bumpversion:file(links):CHANGELOG.md] 21 | search = [Unreleased]: https://github.com/mike-gee/webtranspose/compare/v{current_version}...HEAD 22 | replace = [Unreleased]: https://github.com/mike-gee/webtranspose/compare/v{new_version}...HEAD 23 | [{new_version}]: https://github.com/mike-gee/webtranspose/compare/v{current_version}...v{new_version} 24 | -------------------------------------------------------------------------------- /.darglint: -------------------------------------------------------------------------------- 1 | [darglint] 2 | strictness = short 3 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🐛 Bug report 3 | about: Create a report to help us improve 4 | labels: bug 5 | assignees: '' 6 | 7 | --- 8 | 9 | ## Expected Behavior 10 | 11 | 12 | ## Actual Behavior 13 | 14 | 15 | ## Steps to Reproduce the Problem 16 | 17 | 1. 18 | 1. 19 | 1. 20 | 21 | ## Specifications 22 | 23 | - Version: 24 | - Platform: 25 | - Subsystem: 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: [] 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🚀 Feature request 3 | about: Suggest an idea for this project 4 | labels: enhancement 5 | assignees: '' 6 | 7 | --- 8 | 9 | **Is your feature request related to a problem? Please describe.** 10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | 12 | **Describe the solution you'd like** 13 | A clear and concise description of what you want to happen. 14 | 15 | **Describe alternatives you've considered** 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the feature request here. 
20 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | ## Proposed Changes 4 | 5 | - 6 | - 7 | - 8 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | - package-ecosystem: pip 8 | directory: "/docs" 9 | schedule: 10 | interval: daily 11 | - package-ecosystem: pip 12 | directory: "/" 13 | schedule: 14 | interval: daily 15 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: 6 | pull_request: 7 | branches: 8 | schedule: 9 | - cron: '0 6 * * 1' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ 'python' ] 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v3.5.2 28 | 29 | # Initializes the CodeQL tools for scanning. 30 | - name: Initialize CodeQL 31 | uses: github/codeql-action/init@v2 32 | with: 33 | languages: ${{ matrix.language }} 34 | 35 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
36 | # If this step fails, then you should remove it and run the build manually (see below) 37 | - name: Autobuild 38 | uses: github/codeql-action/autobuild@v2 39 | 40 | - name: Perform CodeQL Analysis 41 | uses: github/codeql-action/analyze@v2 42 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit-autoupdate.yml: -------------------------------------------------------------------------------- 1 | name: "Pre-commit autoupdate" 2 | 3 | on: 4 | schedule: 5 | - cron: '0 6 * * 1' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | autoupdate: 10 | name: autoupdate 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3.5.2 14 | 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v4.6.1 17 | with: 18 | python-version: 3.8 19 | 20 | - name: Install system deps 21 | shell: bash 22 | run: | 23 | pip install poetry 24 | poetry config virtualenvs.in-project true 25 | poetry install --no-root --only dev --only linters --sync 26 | 27 | - name: Run autoupdate 28 | run: poetry run pre-commit autoupdate 29 | 30 | - name: Run pre-commit 31 | run: poetry run pre-commit run --all-files 32 | 33 | - uses: peter-evans/create-pull-request@v5.0.1 34 | with: 35 | token: ${{ secrets.GITHUB_TOKEN }} 36 | branch: chore-update-pre-commit-hooks 37 | title: Update pre-commit hooks 38 | commit-message: "Update pre-commit hooks" 39 | body: | 40 | # Update pre-commit hooks 41 | 42 | - Update pre-commit hooks to the latest version. 
43 | delete-branch: true 44 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | 2 | name: release 3 | 4 | on: 5 | push: 6 | tags: 7 | - 'v*' 8 | 9 | jobs: 10 | release: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3.5.2 14 | 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v4.6.1 17 | with: 18 | python-version: 3.8 19 | 20 | - name: Install system deps 21 | shell: bash 22 | run: | 23 | pip install poetry 24 | poetry config virtualenvs.in-project true 25 | 26 | - name: Build package 27 | run: | 28 | poetry build --ansi 29 | 30 | - name: Publish package on PyPI 31 | uses: pypa/gh-action-pypi-publish@v1.4.2 32 | with: 33 | user: __token__ 34 | password: ${{ secrets.PYPI_TOKEN }} 35 | 36 | - name: Publish package on TestPyPI 37 | uses: pypa/gh-action-pypi-publish@v1.4.2 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.TEST_PYPI_TOKEN }} 41 | repository_url: https://test.pypi.org/legacy/ 42 | 43 | 44 | github_release: 45 | needs: release 46 | name: Create Github Release 47 | runs-on: ubuntu-latest 48 | steps: 49 | - uses: actions/checkout@v3.5.2 50 | 51 | - name: Get version from tag 52 | id: tag_name 53 | shell: bash 54 | run: | 55 | echo ::set-output name=current_version::${GITHUB_REF#refs/tags/v} 56 | 57 | - name: Get Changelog Entry 58 | id: changelog_reader 59 | uses: mindsers/changelog-reader-action@v2.2.2 60 | with: 61 | version: ${{ steps.tag_name.outputs.current_version }} 62 | path: ./CHANGELOG.md 63 | 64 | - name: Create Release 65 | id: create_release 66 | uses: actions/create-release@v1.1.4 67 | env: 68 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 69 | with: 70 | tag_name: ${{ github.ref }} 71 | release_name: Release ${{ github.ref }} 72 | body: ${{ steps.changelog_reader.outputs.changes }} 73 | draft: false 74 | prerelease: false 75 | 
-------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: 6 | pull_request: 7 | branches: 8 | 9 | jobs: 10 | linting: 11 | name: Linting 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3.5.2 15 | 16 | - name: Set up Python 3.8 17 | uses: actions/setup-python@v4.6.1 18 | with: 19 | python-version: 3.8 20 | 21 | - name: Install system deps 22 | shell: bash 23 | run: | 24 | pip install poetry 25 | poetry config virtualenvs.in-project true 26 | poetry install --no-root --only dev --only linters --sync 27 | 28 | - name: Linting 29 | shell: bash 30 | run: poetry run pre-commit run --all-files 31 | 32 | tests: 33 | needs: linting 34 | name: ${{ matrix.os }} / ${{ matrix.python-version }} 35 | runs-on: ${{ matrix.os }}-latest 36 | strategy: 37 | matrix: 38 | os: [Ubuntu, MacOS, Windows] 39 | python-version: ['3.8', '3.9', '3.10', '3.11'] 40 | fail-fast: true 41 | steps: 42 | - uses: actions/checkout@v3.5.2 43 | 44 | - name: Set up Python ${{ matrix.python-version }} 45 | uses: actions/setup-python@v4.6.1 46 | with: 47 | python-version: ${{ matrix.python-version }} 48 | 49 | - name: Install system deps 50 | shell: bash 51 | run: | 52 | pip install nox-poetry 53 | pip install poetry 54 | poetry config virtualenvs.in-project true 55 | 56 | - name: Run mypy with nox 57 | shell: bash 58 | run: nox --force-color -s mypy-${{ matrix.python-version }} 59 | 60 | - name: Run tests with nox 61 | shell: bash 62 | run: nox --force-color -s tests-${{ matrix.python-version }} 63 | 64 | - name: Run security check 65 | if: matrix.python-version == '3.11' && matrix.os == 'Ubuntu' 66 | shell: bash 67 | run: nox --force-color -s security 68 | 69 | - name: Upload coverage data 70 | uses: actions/upload-artifact@v2.2.4 71 | with: 72 | name: coverage-data 73 | path: ".coverage.*" 74 | 75 | 
coverage: 76 | needs: tests 77 | runs-on: ubuntu-latest 78 | steps: 79 | - uses: actions/checkout@v3.5.2 80 | 81 | - name: Set up Python 3.8 82 | uses: actions/setup-python@v4.6.1 83 | with: 84 | python-version: 3.8 85 | 86 | - name: Install system deps 87 | shell: bash 88 | run: | 89 | pip install nox-poetry 90 | pip install poetry 91 | poetry config virtualenvs.in-project true 92 | 93 | - name: Download coverage data 94 | uses: actions/download-artifact@v2.0.10 95 | with: 96 | name: coverage-data 97 | 98 | - name: Create coverage report 99 | shell: bash 100 | run: | 101 | nox --force-color --session=coverage -- --fmt xml 102 | 103 | - name: Upload coverage report 104 | uses: codecov/codecov-action@v3.1.4 105 | with: 106 | token: ${{ secrets.CODECOV_TOKEN }} 107 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | .ipynb 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # pytype 130 | .pytype/ 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # Code editors 136 | .vscode 137 | 138 | # Caches 139 | .flakeheaven_cache 140 | 141 | # Web Transpose 142 | webtranspose-out/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-toml 6 | - id: check-yaml 7 | - id: debug-statements 8 | - id: check-merge-conflict 9 | - id: check-json 10 | - id: end-of-file-fixer 11 | - repo: https://github.com/timothycrosley/isort 12 | rev: 5.12.0 13 | hooks: 14 | - id: isort 15 | - repo: https://github.com/psf/black 16 | rev: 23.3.0 17 | hooks: 18 | - id: black 19 | - repo: local 20 | hooks: 21 | - id: flakeheaven 22 | name: flakeheaven 23 | description: "`FlakeHeaven` is a Flake8 wrapper to make it cool." 
24 | entry: poetry run flakeheaven 25 | args: [lint] 26 | language: system 27 | types: [python] 28 | require_serial: true 29 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | image: latest 5 | 6 | formats: all 7 | 8 | sphinx: 9 | configuration: docs/conf.py 10 | 11 | python: 12 | version: 3.8 13 | install: 14 | - requirements: docs/requirements.txt 15 | - method: pip 16 | path: . 17 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | 8 | ## [Unreleased] 9 | 10 | ## [0.1.0] - 2023-10-21 11 | ### Added 12 | - First release on PyPI. 13 | 14 | [Unreleased]: https://github.com/mike-gee/webtranspose/compare/v0.1.0...HEAD 15 | [0.1.0]: https://github.com/mike-gee/webtranspose/releases/tag/v0.1.0 16 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, caste, color, religion, or sexual identity 11 | and orientation. 
12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the 27 | overall community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or 32 | advances of any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email 36 | address, without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 
56 | Examples of representing our community include using an official e-mail address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement at mike@webtranspose.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available 126 | at [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | [translations]: https://www.contributor-covenant.org/translations 133 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | Copyright (C) 2023 Vetro Technologies, Inc. 
(Web Transpose) 2 | 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU Affero General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | 8 | This program is distributed in the hope that it will be useful, 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | GNU Affero General Public License for more details. 12 | 13 | You should have received a copy of the GNU Affero General Public License 14 | along with this program. If not, see . 15 | 16 | 17 | GNU AFFERO GENERAL PUBLIC LICENSE 18 | Version 3, 19 November 2007 19 | 20 | Copyright (C) 2007 Free Software Foundation, Inc. 21 | Everyone is permitted to copy and distribute verbatim copies 22 | of this license document, but changing it is not allowed. 23 | 24 | Preamble 25 | 26 | The GNU Affero General Public License is a free, copyleft license for 27 | software and other kinds of works, specifically designed to ensure 28 | cooperation with the community in the case of network server software. 29 | 30 | The licenses for most software and other practical works are designed 31 | to take away your freedom to share and change the works. By contrast, 32 | our General Public Licenses are intended to guarantee your freedom to 33 | share and change all versions of a program--to make sure it remains free 34 | software for all its users. 35 | 36 | When we speak of free software, we are referring to freedom, not 37 | price. Our General Public Licenses are designed to make sure that you 38 | have the freedom to distribute copies of free software (and charge for 39 | them if you wish), that you receive source code or can get it if you 40 | want it, that you can change the software or use pieces of it in new 41 | free programs, and that you know you can do these things. 
42 | 43 | Developers that use our General Public Licenses protect your rights 44 | with two steps: (1) assert copyright on the software, and (2) offer 45 | you this License which gives you legal permission to copy, distribute 46 | and/or modify the software. 47 | 48 | A secondary benefit of defending all users' freedom is that 49 | improvements made in alternate versions of the program, if they 50 | receive widespread use, become available for other developers to 51 | incorporate. Many developers of free software are heartened and 52 | encouraged by the resulting cooperation. However, in the case of 53 | software used on network servers, this result may fail to come about. 54 | The GNU General Public License permits making a modified version and 55 | letting the public access it on a server without ever releasing its 56 | source code to the public. 57 | 58 | The GNU Affero General Public License is designed specifically to 59 | ensure that, in such cases, the modified source code becomes available 60 | to the community. It requires the operator of a network server to 61 | provide the source code of the modified version running there to the 62 | users of that server. Therefore, public use of a modified version, on 63 | a publicly accessible server, gives the public access to the source 64 | code of the modified version. 65 | 66 | An older license, called the Affero General Public License and 67 | published by Affero, was designed to accomplish similar goals. This is 68 | a different license, not a version of the Affero GPL, but Affero has 69 | released a new version of the Affero GPL which permits relicensing under 70 | this license. 71 | 72 | The precise terms and conditions for copying, distribution and 73 | modification follow. 74 | 75 | TERMS AND CONDITIONS 76 | 77 | 0. Definitions. 78 | 79 | "This License" refers to version 3 of the GNU Affero General Public License. 
80 | 81 | "Copyright" also means copyright-like laws that apply to other kinds of 82 | works, such as semiconductor masks. 83 | 84 | "The Program" refers to any copyrightable work licensed under this 85 | License. Each licensee is addressed as "you". "Licensees" and 86 | "recipients" may be individuals or organizations. 87 | 88 | To "modify" a work means to copy from or adapt all or part of the work 89 | in a fashion requiring copyright permission, other than the making of an 90 | exact copy. The resulting work is called a "modified version" of the 91 | earlier work or a work "based on" the earlier work. 92 | 93 | A "covered work" means either the unmodified Program or a work based 94 | on the Program. 95 | 96 | To "propagate" a work means to do anything with it that, without 97 | permission, would make you directly or secondarily liable for 98 | infringement under applicable copyright law, except executing it on a 99 | computer or modifying a private copy. Propagation includes copying, 100 | distribution (with or without modification), making available to the 101 | public, and in some countries other activities as well. 102 | 103 | To "convey" a work means any kind of propagation that enables other 104 | parties to make or receive copies. Mere interaction with a user through 105 | a computer network, with no transfer of a copy, is not conveying. 106 | 107 | An interactive user interface displays "Appropriate Legal Notices" 108 | to the extent that it includes a convenient and prominently visible 109 | feature that (1) displays an appropriate copyright notice, and (2) 110 | tells the user that there is no warranty for the work (except to the 111 | extent that warranties are provided), that licensees may convey the 112 | work under this License, and how to view a copy of this License. If 113 | the interface presents a list of user commands or options, such as a 114 | menu, a prominent item in the list meets this criterion. 115 | 116 | 1. Source Code. 
117 | 118 | The "source code" for a work means the preferred form of the work 119 | for making modifications to it. "Object code" means any non-source 120 | form of a work. 121 | 122 | A "Standard Interface" means an interface that either is an official 123 | standard defined by a recognized standards body, or, in the case of 124 | interfaces specified for a particular programming language, one that 125 | is widely used among developers working in that language. 126 | 127 | The "System Libraries" of an executable work include anything, other 128 | than the work as a whole, that (a) is included in the normal form of 129 | packaging a Major Component, but which is not part of that Major 130 | Component, and (b) serves only to enable use of the work with that 131 | Major Component, or to implement a Standard Interface for which an 132 | implementation is available to the public in source code form. A 133 | "Major Component", in this context, means a major essential component 134 | (kernel, window system, and so on) of the specific operating system 135 | (if any) on which the executable work runs, or a compiler used to 136 | produce the work, or an object code interpreter used to run it. 137 | 138 | The "Corresponding Source" for a work in object code form means all 139 | the source code needed to generate, install, and (for an executable 140 | work) run the object code and to modify the work, including scripts to 141 | control those activities. However, it does not include the work's 142 | System Libraries, or general-purpose tools or generally available free 143 | programs which are used unmodified in performing those activities but 144 | which are not part of the work. 
For example, Corresponding Source 145 | includes interface definition files associated with source files for 146 | the work, and the source code for shared libraries and dynamically 147 | linked subprograms that the work is specifically designed to require, 148 | such as by intimate data communication or control flow between those 149 | subprograms and other parts of the work. 150 | 151 | The Corresponding Source need not include anything that users 152 | can regenerate automatically from other parts of the Corresponding 153 | Source. 154 | 155 | The Corresponding Source for a work in source code form is that 156 | same work. 157 | 158 | 2. Basic Permissions. 159 | 160 | All rights granted under this License are granted for the term of 161 | copyright on the Program, and are irrevocable provided the stated 162 | conditions are met. This License explicitly affirms your unlimited 163 | permission to run the unmodified Program. The output from running a 164 | covered work is covered by this License only if the output, given its 165 | content, constitutes a covered work. This License acknowledges your 166 | rights of fair use or other equivalent, as provided by copyright law. 167 | 168 | You may make, run and propagate covered works that you do not 169 | convey, without conditions so long as your license otherwise remains 170 | in force. You may convey covered works to others for the sole purpose 171 | of having them make modifications exclusively for you, or provide you 172 | with facilities for running those works, provided that you comply with 173 | the terms of this License in conveying all material for which you do 174 | not control copyright. Those thus making or running the covered works 175 | for you must do so exclusively on your behalf, under your direction 176 | and control, on terms that prohibit them from making any copies of 177 | your copyrighted material outside their relationship with you. 
178 | 179 | Conveying under any other circumstances is permitted solely under 180 | the conditions stated below. Sublicensing is not allowed; section 10 181 | makes it unnecessary. 182 | 183 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 184 | 185 | No covered work shall be deemed part of an effective technological 186 | measure under any applicable law fulfilling obligations under article 187 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 188 | similar laws prohibiting or restricting circumvention of such 189 | measures. 190 | 191 | When you convey a covered work, you waive any legal power to forbid 192 | circumvention of technological measures to the extent such circumvention 193 | is effected by exercising rights under this License with respect to 194 | the covered work, and you disclaim any intention to limit operation or 195 | modification of the work as a means of enforcing, against the work's 196 | users, your or third parties' legal rights to forbid circumvention of 197 | technological measures. 198 | 199 | 4. Conveying Verbatim Copies. 200 | 201 | You may convey verbatim copies of the Program's source code as you 202 | receive it, in any medium, provided that you conspicuously and 203 | appropriately publish on each copy an appropriate copyright notice; 204 | keep intact all notices stating that this License and any 205 | non-permissive terms added in accord with section 7 apply to the code; 206 | keep intact all notices of the absence of any warranty; and give all 207 | recipients a copy of this License along with the Program. 208 | 209 | You may charge any price or no price for each copy that you convey, 210 | and you may offer support or warranty protection for a fee. 211 | 212 | 5. Conveying Modified Source Versions. 
213 | 214 | You may convey a work based on the Program, or the modifications to 215 | produce it from the Program, in the form of source code under the 216 | terms of section 4, provided that you also meet all of these conditions: 217 | 218 | a) The work must carry prominent notices stating that you modified 219 | it, and giving a relevant date. 220 | 221 | b) The work must carry prominent notices stating that it is 222 | released under this License and any conditions added under section 223 | 7. This requirement modifies the requirement in section 4 to 224 | "keep intact all notices". 225 | 226 | c) You must license the entire work, as a whole, under this 227 | License to anyone who comes into possession of a copy. This 228 | License will therefore apply, along with any applicable section 7 229 | additional terms, to the whole of the work, and all its parts, 230 | regardless of how they are packaged. This License gives no 231 | permission to license the work in any other way, but it does not 232 | invalidate such permission if you have separately received it. 233 | 234 | d) If the work has interactive user interfaces, each must display 235 | Appropriate Legal Notices; however, if the Program has interactive 236 | interfaces that do not display Appropriate Legal Notices, your 237 | work need not make them do so. 238 | 239 | A compilation of a covered work with other separate and independent 240 | works, which are not by their nature extensions of the covered work, 241 | and which are not combined with it such as to form a larger program, 242 | in or on a volume of a storage or distribution medium, is called an 243 | "aggregate" if the compilation and its resulting copyright are not 244 | used to limit the access or legal rights of the compilation's users 245 | beyond what the individual works permit. Inclusion of a covered work 246 | in an aggregate does not cause this License to apply to the other 247 | parts of the aggregate. 248 | 249 | 6. 
Conveying Non-Source Forms. 250 | 251 | You may convey a covered work in object code form under the terms 252 | of sections 4 and 5, provided that you also convey the 253 | machine-readable Corresponding Source under the terms of this License, 254 | in one of these ways: 255 | 256 | a) Convey the object code in, or embodied in, a physical product 257 | (including a physical distribution medium), accompanied by the 258 | Corresponding Source fixed on a durable physical medium 259 | customarily used for software interchange. 260 | 261 | b) Convey the object code in, or embodied in, a physical product 262 | (including a physical distribution medium), accompanied by a 263 | written offer, valid for at least three years and valid for as 264 | long as you offer spare parts or customer support for that product 265 | model, to give anyone who possesses the object code either (1) a 266 | copy of the Corresponding Source for all the software in the 267 | product that is covered by this License, on a durable physical 268 | medium customarily used for software interchange, for a price no 269 | more than your reasonable cost of physically performing this 270 | conveying of source, or (2) access to copy the 271 | Corresponding Source from a network server at no charge. 272 | 273 | c) Convey individual copies of the object code with a copy of the 274 | written offer to provide the Corresponding Source. This 275 | alternative is allowed only occasionally and noncommercially, and 276 | only if you received the object code with such an offer, in accord 277 | with subsection 6b. 278 | 279 | d) Convey the object code by offering access from a designated 280 | place (gratis or for a charge), and offer equivalent access to the 281 | Corresponding Source in the same way through the same place at no 282 | further charge. You need not require recipients to copy the 283 | Corresponding Source along with the object code. 
If the place to 284 | copy the object code is a network server, the Corresponding Source 285 | may be on a different server (operated by you or a third party) 286 | that supports equivalent copying facilities, provided you maintain 287 | clear directions next to the object code saying where to find the 288 | Corresponding Source. Regardless of what server hosts the 289 | Corresponding Source, you remain obligated to ensure that it is 290 | available for as long as needed to satisfy these requirements. 291 | 292 | e) Convey the object code using peer-to-peer transmission, provided 293 | you inform other peers where the object code and Corresponding 294 | Source of the work are being offered to the general public at no 295 | charge under subsection 6d. 296 | 297 | A separable portion of the object code, whose source code is excluded 298 | from the Corresponding Source as a System Library, need not be 299 | included in conveying the object code work. 300 | 301 | A "User Product" is either (1) a "consumer product", which means any 302 | tangible personal property which is normally used for personal, family, 303 | or household purposes, or (2) anything designed or sold for incorporation 304 | into a dwelling. In determining whether a product is a consumer product, 305 | doubtful cases shall be resolved in favor of coverage. For a particular 306 | product received by a particular user, "normally used" refers to a 307 | typical or common use of that class of product, regardless of the status 308 | of the particular user or of the way in which the particular user 309 | actually uses, or expects or is expected to use, the product. A product 310 | is a consumer product regardless of whether the product has substantial 311 | commercial, industrial or non-consumer uses, unless such uses represent 312 | the only significant mode of use of the product. 
313 | 314 | "Installation Information" for a User Product means any methods, 315 | procedures, authorization keys, or other information required to install 316 | and execute modified versions of a covered work in that User Product from 317 | a modified version of its Corresponding Source. The information must 318 | suffice to ensure that the continued functioning of the modified object 319 | code is in no case prevented or interfered with solely because 320 | modification has been made. 321 | 322 | If you convey an object code work under this section in, or with, or 323 | specifically for use in, a User Product, and the conveying occurs as 324 | part of a transaction in which the right of possession and use of the 325 | User Product is transferred to the recipient in perpetuity or for a 326 | fixed term (regardless of how the transaction is characterized), the 327 | Corresponding Source conveyed under this section must be accompanied 328 | by the Installation Information. But this requirement does not apply 329 | if neither you nor any third party retains the ability to install 330 | modified object code on the User Product (for example, the work has 331 | been installed in ROM). 332 | 333 | The requirement to provide Installation Information does not include a 334 | requirement to continue to provide support service, warranty, or updates 335 | for a work that has been modified or installed by the recipient, or for 336 | the User Product in which it has been modified or installed. Access to a 337 | network may be denied when the modification itself materially and 338 | adversely affects the operation of the network or violates the rules and 339 | protocols for communication across the network. 
340 | 341 | Corresponding Source conveyed, and Installation Information provided, 342 | in accord with this section must be in a format that is publicly 343 | documented (and with an implementation available to the public in 344 | source code form), and must require no special password or key for 345 | unpacking, reading or copying. 346 | 347 | 7. Additional Terms. 348 | 349 | "Additional permissions" are terms that supplement the terms of this 350 | License by making exceptions from one or more of its conditions. 351 | Additional permissions that are applicable to the entire Program shall 352 | be treated as though they were included in this License, to the extent 353 | that they are valid under applicable law. If additional permissions 354 | apply only to part of the Program, that part may be used separately 355 | under those permissions, but the entire Program remains governed by 356 | this License without regard to the additional permissions. 357 | 358 | When you convey a copy of a covered work, you may at your option 359 | remove any additional permissions from that copy, or from any part of 360 | it. (Additional permissions may be written to require their own 361 | removal in certain cases when you modify the work.) You may place 362 | additional permissions on material, added by you to a covered work, 363 | for which you have or can give appropriate copyright permission. 
364 | 365 | Notwithstanding any other provision of this License, for material you 366 | add to a covered work, you may (if authorized by the copyright holders of 367 | that material) supplement the terms of this License with terms: 368 | 369 | a) Disclaiming warranty or limiting liability differently from the 370 | terms of sections 15 and 16 of this License; or 371 | 372 | b) Requiring preservation of specified reasonable legal notices or 373 | author attributions in that material or in the Appropriate Legal 374 | Notices displayed by works containing it; or 375 | 376 | c) Prohibiting misrepresentation of the origin of that material, or 377 | requiring that modified versions of such material be marked in 378 | reasonable ways as different from the original version; or 379 | 380 | d) Limiting the use for publicity purposes of names of licensors or 381 | authors of the material; or 382 | 383 | e) Declining to grant rights under trademark law for use of some 384 | trade names, trademarks, or service marks; or 385 | 386 | f) Requiring indemnification of licensors and authors of that 387 | material by anyone who conveys the material (or modified versions of 388 | it) with contractual assumptions of liability to the recipient, for 389 | any liability that these contractual assumptions directly impose on 390 | those licensors and authors. 391 | 392 | All other non-permissive additional terms are considered "further 393 | restrictions" within the meaning of section 10. If the Program as you 394 | received it, or any part of it, contains a notice stating that it is 395 | governed by this License along with a term that is a further 396 | restriction, you may remove that term. 
If a license document contains 397 | a further restriction but permits relicensing or conveying under this 398 | License, you may add to a covered work material governed by the terms 399 | of that license document, provided that the further restriction does 400 | not survive such relicensing or conveying. 401 | 402 | If you add terms to a covered work in accord with this section, you 403 | must place, in the relevant source files, a statement of the 404 | additional terms that apply to those files, or a notice indicating 405 | where to find the applicable terms. 406 | 407 | Additional terms, permissive or non-permissive, may be stated in the 408 | form of a separately written license, or stated as exceptions; 409 | the above requirements apply either way. 410 | 411 | 8. Termination. 412 | 413 | You may not propagate or modify a covered work except as expressly 414 | provided under this License. Any attempt otherwise to propagate or 415 | modify it is void, and will automatically terminate your rights under 416 | this License (including any patent licenses granted under the third 417 | paragraph of section 11). 418 | 419 | However, if you cease all violation of this License, then your 420 | license from a particular copyright holder is reinstated (a) 421 | provisionally, unless and until the copyright holder explicitly and 422 | finally terminates your license, and (b) permanently, if the copyright 423 | holder fails to notify you of the violation by some reasonable means 424 | prior to 60 days after the cessation. 425 | 426 | Moreover, your license from a particular copyright holder is 427 | reinstated permanently if the copyright holder notifies you of the 428 | violation by some reasonable means, this is the first time you have 429 | received notice of violation of this License (for any work) from that 430 | copyright holder, and you cure the violation prior to 30 days after 431 | your receipt of the notice. 
432 | 433 | Termination of your rights under this section does not terminate the 434 | licenses of parties who have received copies or rights from you under 435 | this License. If your rights have been terminated and not permanently 436 | reinstated, you do not qualify to receive new licenses for the same 437 | material under section 10. 438 | 439 | 9. Acceptance Not Required for Having Copies. 440 | 441 | You are not required to accept this License in order to receive or 442 | run a copy of the Program. Ancillary propagation of a covered work 443 | occurring solely as a consequence of using peer-to-peer transmission 444 | to receive a copy likewise does not require acceptance. However, 445 | nothing other than this License grants you permission to propagate or 446 | modify any covered work. These actions infringe copyright if you do 447 | not accept this License. Therefore, by modifying or propagating a 448 | covered work, you indicate your acceptance of this License to do so. 449 | 450 | 10. Automatic Licensing of Downstream Recipients. 451 | 452 | Each time you convey a covered work, the recipient automatically 453 | receives a license from the original licensors, to run, modify and 454 | propagate that work, subject to this License. You are not responsible 455 | for enforcing compliance by third parties with this License. 456 | 457 | An "entity transaction" is a transaction transferring control of an 458 | organization, or substantially all assets of one, or subdividing an 459 | organization, or merging organizations. 
If propagation of a covered 460 | work results from an entity transaction, each party to that 461 | transaction who receives a copy of the work also receives whatever 462 | licenses to the work the party's predecessor in interest had or could 463 | give under the previous paragraph, plus a right to possession of the 464 | Corresponding Source of the work from the predecessor in interest, if 465 | the predecessor has it or can get it with reasonable efforts. 466 | 467 | You may not impose any further restrictions on the exercise of the 468 | rights granted or affirmed under this License. For example, you may 469 | not impose a license fee, royalty, or other charge for exercise of 470 | rights granted under this License, and you may not initiate litigation 471 | (including a cross-claim or counterclaim in a lawsuit) alleging that 472 | any patent claim is infringed by making, using, selling, offering for 473 | sale, or importing the Program or any portion of it. 474 | 475 | 11. Patents. 476 | 477 | A "contributor" is a copyright holder who authorizes use under this 478 | License of the Program or a work on which the Program is based. The 479 | work thus licensed is called the contributor's "contributor version". 480 | 481 | A contributor's "essential patent claims" are all patent claims 482 | owned or controlled by the contributor, whether already acquired or 483 | hereafter acquired, that would be infringed by some manner, permitted 484 | by this License, of making, using, or selling its contributor version, 485 | but do not include claims that would be infringed only as a 486 | consequence of further modification of the contributor version. For 487 | purposes of this definition, "control" includes the right to grant 488 | patent sublicenses in a manner consistent with the requirements of 489 | this License. 
490 | 491 | Each contributor grants you a non-exclusive, worldwide, royalty-free 492 | patent license under the contributor's essential patent claims, to 493 | make, use, sell, offer for sale, import and otherwise run, modify and 494 | propagate the contents of its contributor version. 495 | 496 | In the following three paragraphs, a "patent license" is any express 497 | agreement or commitment, however denominated, not to enforce a patent 498 | (such as an express permission to practice a patent or covenant not to 499 | sue for patent infringement). To "grant" such a patent license to a 500 | party means to make such an agreement or commitment not to enforce a 501 | patent against the party. 502 | 503 | If you convey a covered work, knowingly relying on a patent license, 504 | and the Corresponding Source of the work is not available for anyone 505 | to copy, free of charge and under the terms of this License, through a 506 | publicly available network server or other readily accessible means, 507 | then you must either (1) cause the Corresponding Source to be so 508 | available, or (2) arrange to deprive yourself of the benefit of the 509 | patent license for this particular work, or (3) arrange, in a manner 510 | consistent with the requirements of this License, to extend the patent 511 | license to downstream recipients. "Knowingly relying" means you have 512 | actual knowledge that, but for the patent license, your conveying the 513 | covered work in a country, or your recipient's use of the covered work 514 | in a country, would infringe one or more identifiable patents in that 515 | country that you have reason to believe are valid. 
516 | 517 | If, pursuant to or in connection with a single transaction or 518 | arrangement, you convey, or propagate by procuring conveyance of, a 519 | covered work, and grant a patent license to some of the parties 520 | receiving the covered work authorizing them to use, propagate, modify 521 | or convey a specific copy of the covered work, then the patent license 522 | you grant is automatically extended to all recipients of the covered 523 | work and works based on it. 524 | 525 | A patent license is "discriminatory" if it does not include within 526 | the scope of its coverage, prohibits the exercise of, or is 527 | conditioned on the non-exercise of one or more of the rights that are 528 | specifically granted under this License. You may not convey a covered 529 | work if you are a party to an arrangement with a third party that is 530 | in the business of distributing software, under which you make payment 531 | to the third party based on the extent of your activity of conveying 532 | the work, and under which the third party grants, to any of the 533 | parties who would receive the covered work from you, a discriminatory 534 | patent license (a) in connection with copies of the covered work 535 | conveyed by you (or copies made from those copies), or (b) primarily 536 | for and in connection with specific products or compilations that 537 | contain the covered work, unless you entered into that arrangement, 538 | or that patent license was granted, prior to 28 March 2007. 539 | 540 | Nothing in this License shall be construed as excluding or limiting 541 | any implied license or other defenses to infringement that may 542 | otherwise be available to you under applicable patent law. 543 | 544 | 12. No Surrender of Others' Freedom. 545 | 546 | If conditions are imposed on you (whether by court order, agreement or 547 | otherwise) that contradict the conditions of this License, they do not 548 | excuse you from the conditions of this License. 
If you cannot convey a 549 | covered work so as to satisfy simultaneously your obligations under this 550 | License and any other pertinent obligations, then as a consequence you may 551 | not convey it at all. For example, if you agree to terms that obligate you 552 | to collect a royalty for further conveying from those to whom you convey 553 | the Program, the only way you could satisfy both those terms and this 554 | License would be to refrain entirely from conveying the Program. 555 | 556 | 13. Remote Network Interaction; Use with the GNU General Public License. 557 | 558 | Notwithstanding any other provision of this License, if you modify the 559 | Program, your modified version must prominently offer all users 560 | interacting with it remotely through a computer network (if your version 561 | supports such interaction) an opportunity to receive the Corresponding 562 | Source of your version by providing access to the Corresponding Source 563 | from a network server at no charge, through some standard or customary 564 | means of facilitating copying of software. This Corresponding Source 565 | shall include the Corresponding Source for any work covered by version 3 566 | of the GNU General Public License that is incorporated pursuant to the 567 | following paragraph. 568 | 569 | Notwithstanding any other provision of this License, you have 570 | permission to link or combine any covered work with a work licensed 571 | under version 3 of the GNU General Public License into a single 572 | combined work, and to convey the resulting work. The terms of this 573 | License will continue to apply to the part which is the covered work, 574 | but the work with which it is combined will remain governed by version 575 | 3 of the GNU General Public License. 576 | 577 | 14. Revised Versions of this License. 578 | 579 | The Free Software Foundation may publish revised and/or new versions of 580 | the GNU Affero General Public License from time to time. 
Such new versions 581 | will be similar in spirit to the present version, but may differ in detail to 582 | address new problems or concerns. 583 | 584 | Each version is given a distinguishing version number. If the 585 | Program specifies that a certain numbered version of the GNU Affero General 586 | Public License "or any later version" applies to it, you have the 587 | option of following the terms and conditions either of that numbered 588 | version or of any later version published by the Free Software 589 | Foundation. If the Program does not specify a version number of the 590 | GNU Affero General Public License, you may choose any version ever published 591 | by the Free Software Foundation. 592 | 593 | If the Program specifies that a proxy can decide which future 594 | versions of the GNU Affero General Public License can be used, that proxy's 595 | public statement of acceptance of a version permanently authorizes you 596 | to choose that version for the Program. 597 | 598 | Later license versions may give you additional or different 599 | permissions. However, no additional obligations are imposed on any 600 | author or copyright holder as a result of your choosing to follow a 601 | later version. 602 | 603 | 15. Disclaimer of Warranty. 604 | 605 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 606 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 607 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 608 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 609 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 610 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 611 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 612 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 613 | 614 | 16. Limitation of Liability. 
615 | 616 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 617 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 618 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 619 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 620 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 621 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 622 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 623 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 624 | SUCH DAMAGES. 625 | 626 | 17. Interpretation of Sections 15 and 16. 627 | 628 | If the disclaimer of warranty and limitation of liability provided 629 | above cannot be given local legal effect according to their terms, 630 | reviewing courts shall apply local law that most closely approximates 631 | an absolute waiver of all civil liability in connection with the 632 | Program, unless a warranty or assumption of liability accompanies a 633 | copy of the Program in return for a fee. 634 | 635 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Web Transpose. Simple APIs to get data from the internet. 3 |

Web Transpose

4 |

5 | Web Crawler & AI Web Scraper APIs for building new web experiences. 6 |

7 |
8 | 9 | ```bash 10 | pip install webtranspose 11 | ``` 12 | 13 |

14 | 15 | X 16 | 17 | 18 | License 19 | 20 | 21 | License 22 | 23 |

24 | 25 | 26 |

27 | Introduction · 28 | Installation · 29 | Docs 30 |

31 |
32 | 33 | ## Introduction 34 | 35 | In the near future, **nobody will open websites**. Instead, we will be directly served the information we are seeking. New web experiences will combine the information from many websites into a single, unified experience. 36 | 37 | **Web Transpose** is a collection of API tools that allow building these new web experiences simple. 38 | 39 | - [Webᵀ Crawl: Distributed Web Crawler](#crawl) 40 | - [Webᵀ Scrape: AI Web Scraper](#scrape) 41 | 42 | 43 | ### Crawl 44 | 45 | ```python 46 | import webtranspose as webt 47 | 48 | import os 49 | os.environ['WEBTRANSPOSE_API_KEY'] = "YOUR WEBT API KEY" 50 | 51 | crawl = webt.Crawl( 52 | "https://www.example.com", 53 | max_pages=100, 54 | render_js=True, 55 | ) 56 | await crawl.crawl() # crawl.queue_crawl() for async 57 | ``` 58 | 59 | ## Scrape 60 | 61 | ```python 62 | import webtranspose as webt 63 | 64 | import os 65 | os.environ['WEBTRANSPOSE_API_KEY'] = "YOUR WEBT API KEY" 66 | 67 | schema = { 68 | "Merchant Name": "string", 69 | "Title of Product": "string", 70 | "Product Photo URL": "string", 71 | } 72 | 73 | scraper = webt.Scraper( 74 | schema, 75 | render_js=True, 76 | ) 77 | out_json = scraper.scrape("https://www.example.com") 78 | ``` 79 | 80 | ## Web Search (AI SERP API) 81 | 82 | ```python 83 | import webtranspose as webt 84 | 85 | import os 86 | os.environ['WEBTRANSPOSE_API_KEY'] = "YOUR WEBT API KEY" 87 | 88 | results = webt.search("what caused the fourth great ninja war?") 89 | # results.keys() 90 | # ['results'] 91 | 92 | # AI Filter 93 | results = webt.search_filter("Paul Graham's Blog") 94 | # results.keys() 95 | # ['results', 'filtered_results'] 96 | ``` 97 | 98 | 99 | ## Installation 100 | 101 | Non-Python Users: [📄 API Docs](https://docs.webtranspose.com). 102 | 103 | This repo contains a local **lite** installation of Web Transpose. This is a good option if you want to run Web Transpose locally on your machine for quick use cases. 
104 | 105 | ```shell 106 | pip install webtranspose 107 | ``` 108 | 109 | However, if you wish to leverage the full tools of Web Transpose and use in production, you should add your API key to add the **full** version. 110 | 111 | ```python 112 | os.environ["WEBTRANSPOSE_API_KEY"] = "YOUR_API_KEY_HERE" 113 | ``` 114 | 115 | 116 | ## Enterprise Support 117 | 118 | Web Transpose serves enterprises small and large. We partner with companies for the long term with hands-on support and custom solutions. 119 | 120 | Please email me directly at mike@webtranspose.com for enquiries. 121 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | target: "100" 6 | patch: 7 | default: 8 | target: "100" 9 | comment: 10 | require_changes: true 11 | -------------------------------------------------------------------------------- /img/web-transpose-cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mike-gee/webtranspose/46077b4400f72d37b983a6a7ec1bb2f0067f9e3f/img/web-transpose-cover.png -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | """Nox sessions.""" 2 | import platform 3 | 4 | import nox 5 | from nox_poetry import Session, session 6 | 7 | nox.options.sessions = ["tests", "mypy"] 8 | python_versions = ["3.8", "3.9", "3.10", "3.11"] 9 | 10 | 11 | @session(python=python_versions) 12 | def tests(session: Session) -> None: 13 | """Run the test suite.""" 14 | session.install(".") 15 | session.install("invoke", "pytest", "xdoctest", "coverage[toml]", "pytest-cov") 16 | try: 17 | session.run( 18 | "inv", 19 | "tests", 20 | env={ 21 | "COVERAGE_FILE": 
f".coverage.{platform.system()}.{platform.python_version()}", 22 | }, 23 | ) 24 | finally: 25 | if session.interactive: 26 | session.notify("coverage") 27 | 28 | 29 | @session 30 | def coverage(session: Session) -> None: 31 | """Produce the coverage report.""" 32 | args = session.posargs if session.posargs and len(session._runner.manifest) == 1 else [] 33 | session.install("invoke", "coverage[toml]") 34 | session.run("inv", "coverage", *args) 35 | 36 | 37 | @session(python=python_versions) 38 | def mypy(session: Session) -> None: 39 | """Type-check using mypy.""" 40 | session.install(".") 41 | session.install("invoke", "mypy") 42 | session.run("inv", "mypy") 43 | 44 | 45 | @session(python="3.11") 46 | def security(session: Session) -> None: 47 | """Scan dependencies for insecure packages.""" 48 | session.install("invoke", "safety") 49 | session.run("inv", "security") 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 2 | [tool.poetry] 3 | name = "webtranspose" 4 | version = "0.3.2" 5 | description = "Reliable APIs for the website data" 6 | authors = ["Mike Gee "] 7 | 8 | readme = "README.md" 9 | homepage = "https://github.com/mike-gee/webtranspose" 10 | repository = "https://github.com/mike-gee/webtranspose" 11 | documentation = "https://docs.webtranspose.com" 12 | keywords = ["webtranspose"] 13 | classifiers=[ 14 | "Development Status :: 2 - Pre-Alpha", 15 | "Intended Audience :: Developers", 16 | 17 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)", 18 | 19 | "Natural Language :: English", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.8", 22 | "Programming Language :: Python :: 3.9", 23 | "Programming Language :: Python :: 3.10", 24 | "Programming Language :: Python :: 3.11", 25 | ] 26 | 27 | 28 | [tool.poetry.urls] 29 | "Bug Tracker" = 
"https://github.com/mike-gee/webtranspose/issues" 30 | 31 | 32 | 33 | 34 | [tool.poetry.dependencies] 35 | python = "<3.12,>=3.8" 36 | requests = "^2.31.0" 37 | httpx = "^0.25.1" 38 | bs4 = "^0.0.1" 39 | openai = "^1.3.3" 40 | tiktoken = "^0.5.1" 41 | lxml = "^4.9.3" 42 | 43 | 44 | [tool.poetry.group.dev.dependencies] 45 | pre-commit = "^3.3.2" 46 | invoke = "^2.1.2" 47 | bump2version = "^1.0.1" 48 | watchdog = {version = "^3.0.0", extras = ["watchmedo"]} 49 | ipykernel = "^6.25.2" 50 | 51 | [tool.poetry.group.test.dependencies] 52 | pytest = "^7.3.1" 53 | xdoctest = "^1.1.1" 54 | coverage = {version = "^7.2.6", extras = ["toml"]} 55 | pytest-cov = "^4.1.0" 56 | 57 | [tool.poetry.group.format.dependencies] 58 | isort = "^5.12.0" 59 | black = "^23.3.0" 60 | 61 | [tool.poetry.group.linters.dependencies] 62 | flake8 = ">=4.0.1,<5.0.0" 63 | flakeheaven = "^3.3.0" 64 | flake8-builtins = "^2.1.0" 65 | flake8-blind-except = "^0.2.1" 66 | flake8-logging-format = "^0.9.0" 67 | flake8-bugbear = "^23.3.12" 68 | flake8-annotations = "^2.9.1" 69 | flake8-docstrings = "^1.7.0" 70 | flake8-bandit = "^3.0.0" 71 | flake8-broken-line = "^0.6.0" 72 | darglint = "^1.8.1" 73 | 74 | [tool.poetry.group.security.dependencies] 75 | safety = "^2.4.0b1" 76 | 77 | [tool.poetry.group.typing.dependencies] 78 | mypy = "^1.3.0" 79 | 80 | [tool.poetry.group.docs.dependencies] 81 | sphinx = "^7.0.1" 82 | recommonmark = "^0.7.1" 83 | 84 | [tool.coverage.paths] 85 | source = ["src", "*/site-packages"] 86 | 87 | [tool.coverage.run] 88 | branch = true 89 | source = ["webtranspose"] 90 | 91 | [tool.coverage.report] 92 | fail_under = 100 93 | exclude_lines = [ 94 | "pragma: no cover", 95 | "def __repr__", 96 | "if self.debug", 97 | "if settings.DEBUG:", 98 | "raise AssertionError", 99 | "raise NotImplementedError", 100 | "if 0:", 101 | "if __name__ == __main__:" 102 | ] 103 | show_missing = true 104 | 105 | [tool.coverage.html] 106 | directory = "htmlcov" 107 | 108 | [tool.flakeheaven] 109 | format = 
"grouped" 110 | max_line_length = 99 111 | show_source = true 112 | docstring-convention = "google" 113 | extended_default_ignore = [] 114 | 115 | [tool.flakeheaven.plugins] 116 | pyflakes = ["+*"] 117 | pycodestyle = ["+*"] 118 | mccabe = ["+*"] 119 | flake8-annotations = ["+*", "-ANN1??", "-ANN401"] 120 | flake8-docstrings = ["+*", "-D212"] 121 | "flake8-*" = ["+*"] 122 | pylint = ["-C????", "-E????", "+F????", "+I????", "-R????", "-W????"] 123 | 124 | [tool.flakeheaven.exceptions."tests/"] 125 | flake8-bandit = ["-S101"] 126 | 127 | [tool.isort] 128 | multi_line_output = 3 129 | include_trailing_comma = true 130 | force_grid_wrap = 0 131 | use_parentheses = true 132 | line_length = 99 133 | known_third_party = ["invoke", "nox", "nox_poetry"] 134 | 135 | [tool.black] 136 | line-length = 99 137 | target-version = ["py38"] 138 | 139 | [tool.mypy] 140 | warn_return_any = true 141 | warn_unused_configs = true 142 | 143 | [[tool.mypy.overrides]] 144 | module = ["pytest.*", "invoke.*", "nox.*", "nox_poetry.*"] 145 | allow_redefinition = false 146 | check_untyped_defs = true 147 | ignore_errors = false 148 | ignore_missing_imports = true 149 | implicit_reexport = true 150 | local_partial_types = true 151 | strict_optional = true 152 | strict_equality = true 153 | no_implicit_optional = true 154 | warn_unused_ignores = true 155 | warn_unreachable = true 156 | warn_no_return = true 157 | 158 | [build-system] 159 | requires = ["poetry>=0.12"] 160 | build-backend = "poetry.masonry.api" 161 | -------------------------------------------------------------------------------- /src/webtranspose/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for webtranspose.""" 2 | 3 | __author__ = """Mike Gee""" 4 | __email__ = "mike@webtranspose.com" 5 | __version__ = "0.3.1" 6 | 7 | from .chat import * 8 | from .crawl import * 9 | from .openai import * 10 | from .scrape import * 11 | from .search import * 
import logging
import os
from time import sleep
from typing import List, Optional

from .webt_api import run_webt_api


class Chatbot:
    """Client for creating, populating, and querying a Web Transpose chatbot."""

    def __init__(
        self,
        url_list: Optional[List[str]] = None,
        name: Optional[str] = None,
        max_pages: int = 100,
        api_key: Optional[str] = None,
        verbose: bool = False,
        chatbot_id: Optional[str] = None,
        _created: bool = False,
    ) -> None:
        """
        Initialize a Chatbot instance.

        :param url_list: A list of URLs to crawl.
        :param name: The name of the chatbot.
        :param max_pages: The maximum number of pages to crawl.
        :param api_key: The API key for accessing the Web Transpose API.
        :param verbose: Whether to enable verbose logging.
        :param chatbot_id: The ID of an existing chatbot.
        :param _created: Whether the chatbot has already been created.
        :raises ValueError: If no API key is given and none is set in the environment.
        """
        self.api_key = api_key
        if self.api_key is None:
            self.api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

        if self.api_key is None:
            raise ValueError(
                "No Web Transpose API provided. \n\nTo use Chatbots, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com."
            )

        # Fresh list per instance: a mutable default argument ([]) would be
        # shared across every Chatbot constructed with the default.
        self.url_list = url_list if url_list is not None else []
        self.name = name
        self.max_pages = max_pages
        self.verbose = verbose
        self.chatbot_id = chatbot_id
        self.created = _created

        if not self.chatbot_id:
            self.create()

    def create(self):
        """
        Create the chatbot on the API and block until creation completes.
        """
        if not self.chatbot_id:
            self._create_chat()
            status = self.status()
            # Poll until the backend reports the chatbot is fully built.
            while status["status"] != "complete":
                if self.verbose:
                    logging.info("Waiting for chat to be created...")
                sleep(5)
                status = self.status()
        else:
            logging.info("Chat already created.")

    def queue_create(self):
        """
        Queue the creation of the chatbot without waiting for completion.
        """
        if not self.chatbot_id:
            self._create_chat()
        else:
            logging.info("Chat already created.")

    def _create_chat(self):
        """
        Issue the API call that creates the chat and record its id.
        """
        if self.verbose:
            logging.info("Creating chat...")

        if self.chatbot_id is None:
            create_json = {
                "name": self.name,
                "max_pages": self.max_pages,
                "url_list": self.url_list,
            }
            out_json = run_webt_api(create_json, "v1/chat/create", self.api_key)
            self.chatbot_id = out_json["chatbot_id"]

    def query_database(self, query: str, num_records: int = 3) -> list:
        """
        Query the database of the chatbot.

        :param query: The query string.
        :param num_records: The number of records to return.
        :return: The query results.
        """
        if self.verbose:
            logging.info("Querying database...")

        if not self.chatbot_id:
            self.create()

        query_json = {
            "chatbot_id": self.chatbot_id,
            "query": query,
            "num_records": num_records,
        }
        out = run_webt_api(query_json, "v1/chat/database/query", self.api_key)
        return out["results"]

    def status(self):
        """
        Get the status of the chatbot.

        :return: The chatbot status.
        """
        if self.verbose:
            logging.info("Getting chat...")

        if not self.chatbot_id:
            self.create()

        get_json = {
            "chatbot_id": self.chatbot_id,
        }
        out = run_webt_api(get_json, "v1/chat/get", self.api_key)
        return out["chatbot"]

    def add_urls(self, url_list: list):
        """
        Add URLs to the chatbot.

        :param url_list: A list of URLs to add.
        """
        if self.verbose:
            # BUG FIX: previously logged the copy-pasted "Querying database..."
            logging.info("Adding URLs...")

        if not self.chatbot_id:
            self.create()

        query_json = {
            "chatbot_id": self.chatbot_id,
            "max_pages": self.max_pages,
            "url_list": url_list,
        }
        run_webt_api(query_json, "v1/chat/urls/add", self.api_key)

    def delete_crawls(self, crawl_id_list: list):
        """
        Delete crawls from the chatbot.

        :param crawl_id_list: A list of crawl IDs to delete.
        """
        if self.verbose:
            # BUG FIX: previously logged the copy-pasted "Querying database..."
            logging.info("Deleting crawls...")

        if not self.chatbot_id:
            self.create()

        query_json = {
            "chatbot_id": self.chatbot_id,
            "crawl_id_list": crawl_id_list,
        }
        run_webt_api(query_json, "v1/chat/crawls/delete", self.api_key)


def get_chatbot(chatbot_id: str, api_key: Optional[str] = None) -> Chatbot:
    """
    Get an existing chatbot by id.

    :param chatbot_id: The ID of the chatbot.
    :param api_key: The API key; falls back to WEBTRANSPOSE_API_KEY.
    :return: The chatbot.
    :raises ValueError: If no API key is given and none is set in the environment.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
    if api_key is None:
        raise ValueError(
            "No Web Transpose API provided. \n\nTo use Chatbots, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com."
        )
    get_json = {
        "chatbot_id": chatbot_id,
    }
    chat_json = run_webt_api(get_json, "v1/chat/get", api_key)
    chatbot_data = chat_json.get("chatbot", {})
    return Chatbot(
        chatbot_id=chatbot_data.get("id"),
        name=chatbot_data.get("name"),
        max_pages=chatbot_data.get("num_run", 100),
        # BUG FIX: the resolved api_key was dropped here, so an explicitly
        # passed key was ignored and construction raised when the env var
        # was unset even though a valid key was supplied.
        api_key=api_key,
        verbose=False,
        _created=True,
    )
import asyncio
import json
import logging
import os
import shutil
import tempfile
import urllib.parse
import uuid
import zipfile
from datetime import datetime
from fnmatch import fnmatch
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse, urlunparse

import httpx
from bs4 import BeautifulSoup

from .webt_api import run_webt_api


class Crawl:
    def __init__(
        self,
        url: str,
        allowed_urls: Optional[List[str]] = None,
        banned_urls: Optional[List[str]] = None,
        n_workers: int = 1,
        max_pages: int = 15,
        render_js: bool = False,
        output_dir: str = "webtranspose-out",
        verbose: bool = False,
        api_key: Optional[str] = None,
        _created: bool = False,
    ) -> None:
        """
        Initialize the Crawl object.

        :param url: The base URL to start crawling from.
        :param allowed_urls: A list of allowed URL patterns to crawl.
        :param banned_urls: A list of banned URL patterns to exclude from crawling.
        :param n_workers: The number of worker tasks to use for crawling.
        :param max_pages: The maximum number of pages to crawl.
        :param render_js: Whether to render JavaScript on crawled pages.
        :param output_dir: The directory to store the crawled data.
        :param verbose: Whether to print verbose logging messages.
        :param api_key: The API key to use for webt_api calls.
        """
        self.api_key = api_key
        if self.api_key is None:
            self.api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

        self.base_url = url
        # Fresh lists per instance: mutable default arguments ([]) would be
        # shared across every Crawl constructed with the defaults.
        self.allowed_urls = allowed_urls if allowed_urls is not None else []
        self.banned_urls = banned_urls if banned_urls is not None else []
        self.max_pages = max_pages
        self.queue = asyncio.Queue()
        # Seed the work queue with the crawl's entry point.
        self.queue.put_nowait(
            {
                "url": self.base_url,
                "parent_urls": [],
            }
        )
        self.output_dir = output_dir
        self.visited_urls = {}
        self.failed_urls = set()
        self.ignored_urls = set()
        self.n_workers = n_workers
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.created = _created
        self.render_js = render_js
        self.crawl_id = None
        if self.api_key is None:
            # Local (lite) crawls get a locally generated id.
            self.crawl_id = str(uuid.uuid4())
        self.verbose = verbose

        # self.api_key already includes the environment fallback above, so a
        # second os.environ lookup (as before) was redundant.
        if self.api_key is None:
            logging.warning(
                "No Web Transpose API provided. Lite version in use...\n\nTo run your Web Crawl on the Web Transpose API, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com. Run cheaper with logging and advanced analytics."
            )

    @staticmethod
    async def crawl_worker(
        name: str,
        queue: asyncio.Queue,
        crawl_id: str,
        visited_urls: Dict[str, str],
        allowed_urls: List[str],
        failed_urls: Set[str],
        banned_urls: List[str],
        output_dir: str,
        base_url: str,
        max_pages: int,
        leftover_queue: asyncio.Queue,
        ignored_queue: asyncio.Queue,
        verbose: bool,
    ) -> None:
        """
        Worker task: fetch queued URLs, save page data, and enqueue child links.

        :param name: The name of the worker.
        :param queue: The queue of URLs to crawl.
        :param crawl_id: The ID of the crawl.
        :param visited_urls: A dictionary of visited URLs and their file paths.
        :param allowed_urls: A list of allowed URL patterns to crawl.
        :param failed_urls: The set that collects URLs whose fetch failed.
        :param banned_urls: A list of banned URL patterns to exclude from crawling.
        :param output_dir: The directory to store the crawled data.
        :param base_url: The base URL of the crawl.
        :param max_pages: The maximum number of pages to crawl (None = unlimited).
        :param leftover_queue: The queue for in-scope URLs found after the page limit.
        :param ignored_queue: The queue for out-of-scope (ignored) URLs.
        :param verbose: Whether to print verbose logging messages.
        """

        def _lint_url(url: str) -> str:
            """Return the URL with its fragment component removed."""
            parsed_url = urlparse(url)
            return urlunparse(parsed_url._replace(fragment=""))

        def _under_limit() -> bool:
            """True while more pages may still be visited."""
            # BUG FIX: the old inline check `len(visited_urls) < max_pages`
            # raised TypeError when max_pages is None, even though the outer
            # loop explicitly supports an unlimited crawl.
            return max_pages is None or len(visited_urls) < max_pages

        if verbose:
            logging.info(f"{name}: Starting crawl of {base_url}")
        # Hoisted: the base netloc never changes during the crawl.
        base_netloc = urlparse(base_url).netloc
        while _under_limit() or not queue.empty():
            curr_url_data = await queue.get()
            curr_url = curr_url_data["url"]
            parent_urls = curr_url_data["parent_urls"]
            # In scope: same host and not banned, OR explicitly allowed.
            in_scope = (
                urlparse(curr_url).netloc == base_netloc
                and not any(fnmatch(curr_url, banned) for banned in banned_urls)
            ) or any(fnmatch(curr_url, allowed) for allowed in allowed_urls)

            if in_scope and curr_url not in visited_urls and _under_limit():
                base_dir = os.path.join(output_dir, base_netloc)
                if not os.path.exists(base_dir):
                    os.makedirs(base_dir)
                filename = urllib.parse.quote_plus(curr_url).replace("/", "_")
                filepath = os.path.join(base_dir, filename) + ".json"
                async with httpx.AsyncClient() as client:
                    try:
                        page = await client.get(curr_url)
                    except Exception:  # network/protocol errors: record and move on
                        failed_urls.add(curr_url)
                        queue.task_done()
                        continue

                page_title = None
                page_html = None
                page_text = None
                try:
                    page_type = "html"
                    soup = BeautifulSoup(page.content, "lxml")
                    page_title = soup.title.string if soup.title else ""
                    page_html = page.content.decode("utf-8")
                    page_text = soup.get_text()
                    # De-duplicate child links after stripping fragments.
                    child_urls = list(
                        {
                            _lint_url(urljoin(base_url, link.get("href")))
                            for link in soup.find_all(href=True)
                        }
                    )
                    for url in child_urls:
                        if url.startswith("http"):
                            queue.put_nowait(
                                {
                                    "url": url,
                                    "parent_urls": parent_urls + [curr_url],
                                }
                            )
                except Exception:  # non-HTML or undecodable content
                    child_urls = []
                    page_type = "other"

                visited_urls[curr_url] = filepath
                data = {
                    "crawl_id": crawl_id,
                    "url": curr_url,
                    "type": page_type,
                    "title": page_title,
                    "date": datetime.now().isoformat(),
                    "parent_urls": parent_urls,
                    "child_urls": child_urls,
                    "html": page_html,
                    "text": page_text,
                }
                with open(filepath, "w") as f:
                    json.dump(data, f)

            elif curr_url not in visited_urls and (
                urlparse(curr_url).netloc == base_netloc
                or any(fnmatch(curr_url, allowed) for allowed in allowed_urls)
            ):
                # In scope but over the page limit: keep for a future resume.
                leftover_queue.put_nowait(
                    {
                        "url": curr_url,
                        "parent_urls": parent_urls,
                    }
                )

            else:
                ignored_queue.put_nowait(curr_url)

            queue.task_done()

    def create_crawl_api(self):
        """
        Creates a Crawl on https://webtranspose.com
        """
        if self.verbose:
            logging.info(f"Creating crawl of {self.base_url} on Web Transpose...")
        create_json = {
            "url": self.base_url,
            "render_js": self.render_js,
            "max_pages": self.max_pages,
            "allowed_urls": self.allowed_urls,
            "banned_urls": self.banned_urls,
        }
        out_json = run_webt_api(
            create_json,
            "v1/crawl/create",
            self.api_key,
        )
        self.crawl_id = out_json["crawl_id"]
        self.created = True
241 | Resume crawling of Crawl object. Don't wait for it to finish crawling. 242 | """ 243 | if self.verbose: 244 | logging.info(f"Starting crawl of {self.base_url} on Web Transpose...") 245 | 246 | if self.api_key is None: 247 | logging.error("Cannot queue a local crawl. Please use the crawl() method.") 248 | 249 | else: 250 | if not self.created: 251 | self.create_crawl_api() 252 | queue_json = { 253 | "crawl_id": self.crawl_id, 254 | } 255 | out = run_webt_api( 256 | queue_json, 257 | "v1/crawl/resume", 258 | self.api_key, 259 | ) 260 | 261 | async def crawl(self): 262 | """ 263 | Resume crawling of Crawl object. 264 | """ 265 | if self.verbose: 266 | logging.info(f"Starting crawl of {self.base_url}...") 267 | if self.api_key is None: 268 | leftover_queue = asyncio.Queue() 269 | ignored_queue = asyncio.Queue() 270 | tasks = [] 271 | for i in range(self.n_workers): 272 | task = asyncio.create_task( 273 | self.crawl_worker( 274 | f"worker-{i}", 275 | self.queue, 276 | self.crawl_id, 277 | self.visited_urls, 278 | self.allowed_urls, 279 | self.failed_urls, 280 | self.banned_urls, 281 | self.output_dir, 282 | self.base_url, 283 | self.max_pages, 284 | leftover_queue, 285 | ignored_queue, 286 | self.verbose, 287 | ) 288 | ) 289 | tasks.append(task) 290 | 291 | await self.queue.join() 292 | for task in tasks: 293 | task.cancel() 294 | await asyncio.gather(*tasks, return_exceptions=True) 295 | self.queue = leftover_queue 296 | self.ignored_urls = list(ignored_queue._queue) 297 | self.to_metadata() 298 | else: 299 | self.queue_crawl() 300 | status = self.status() 301 | while status["num_queued"] + status["num_visited"] + status["num_ignored"] == 0: 302 | await asyncio.sleep(5) 303 | status = self.status() 304 | 305 | if (status["num_failed"] > 0) and ( 306 | status["num_queued"] + status["num_visited"] + status["num_ignored"] == 0 307 | ): 308 | raise Exception("The first page crawled failed") 309 | 310 | while status["num_queued"] > 0 and status["num_visited"] < 
status["max_pages"]: 311 | await asyncio.sleep(5) 312 | status = self.status() 313 | return self 314 | 315 | def get_queued(self, max_pages: int = 30) -> list: 316 | """ 317 | Get a list of URLs from the queue. 318 | 319 | Args: 320 | max_pages (int): The number of URLs to retrieve from the queue. Defaults to 30. 321 | 322 | Returns: 323 | list: A list of URLs from the queue. 324 | """ 325 | if self.api_key is None: 326 | urls = [] 327 | for _ in range(max_pages): 328 | try: 329 | url = self.queue.get_nowait() 330 | urls.append(url) 331 | except asyncio.QueueEmpty: 332 | break 333 | for url in urls: 334 | self.queue.put_nowait(url) 335 | return urls 336 | else: 337 | if not self.created: 338 | return [self.base_url] 339 | queue_json = { 340 | "crawl_id": self.crawl_id, 341 | "max_pages": max_pages, 342 | } 343 | out_json = run_webt_api( 344 | queue_json, 345 | "v1/crawl/get-queue", 346 | self.api_key, 347 | ) 348 | return out_json["urls"] 349 | 350 | def set_allowed_urls(self, allowed_urls: list) -> "Crawl": 351 | """ 352 | Set the allowed URLs for the crawl. 353 | 354 | Args: 355 | allowed_urls (list): A list of allowed URLs. 356 | 357 | Returns: 358 | self: The Crawl object. 359 | """ 360 | self.allowed_urls = allowed_urls 361 | if not self.created: 362 | self.to_metadata() 363 | else: 364 | update_json = { 365 | "crawl_id": self.crawl_id, 366 | "allowed_urls": allowed_urls, 367 | } 368 | run_webt_api( 369 | update_json, 370 | "v1/crawl/set-allowed", 371 | self.api_key, 372 | ) 373 | return self 374 | 375 | def set_banned_urls(self, banned_urls: list) -> "Crawl": 376 | """ 377 | Set the banned URLs for the crawl. 378 | 379 | Args: 380 | banned_urls (list): A list of banned URLs. 381 | 382 | Returns: 383 | self: The Crawl object. 
384 | """ 385 | self.banned_urls = banned_urls 386 | if not self.created: 387 | self.to_metadata() 388 | else: 389 | update_json = { 390 | "crawl_id": self.crawl_id, 391 | "banned_urls": banned_urls, 392 | } 393 | run_webt_api( 394 | update_json, 395 | "v1/crawl/set-banned", 396 | self.api_key, 397 | ) 398 | return self 399 | 400 | def get_filename(self, url: str) -> str: 401 | """ 402 | Get the filename associated with a visited URL. 403 | 404 | Args: 405 | url (str): The visited URL. 406 | 407 | Returns: 408 | str: The filename associated with the visited URL. 409 | 410 | Raises: 411 | ValueError: If the URL is not found in the visited URLs. 412 | """ 413 | try: 414 | return self.visited_urls[url] 415 | except KeyError: 416 | raise ValueError(f"URL {url} not found in visited URLs") 417 | 418 | def set_max_pages(self, max_pages: int) -> "Crawl": 419 | """ 420 | Set the maximum number of pages to crawl. 421 | 422 | Args: 423 | max_pages (int): The maximum number of pages to crawl. 424 | 425 | Returns: 426 | self: The Crawl object. 427 | """ 428 | if not self.created: 429 | self.max_pages = max_pages 430 | self.to_metadata() 431 | else: 432 | max_pages_json = { 433 | "crawl_id": self.crawl_id, 434 | "max_pages": max_pages, 435 | } 436 | run_webt_api( 437 | max_pages_json, 438 | "v1/crawl/set-max-pages", 439 | self.api_key, 440 | ) 441 | return self 442 | 443 | def status(self) -> dict: 444 | """ 445 | Get the status of the Crawl object. 446 | 447 | Returns: 448 | dict: The status of the Crawl object. 
449 | """ 450 | if not self.created: 451 | status_json = { 452 | "crawl_id": self.crawl_id, 453 | "loc": "local" if self.api_key is None else "cloud", 454 | "base_url": self.base_url, 455 | "max_pages": self.max_pages, 456 | "num_visited": len(self.visited_urls), 457 | "num_ignored": len(self.ignored_urls), 458 | "num_failed": len(self.failed_urls), 459 | "num_queued": self.queue.qsize(), 460 | "banned_urls": self.banned_urls, 461 | "allowed_urls": self.allowed_urls, 462 | } 463 | status_json["n_workers"] = self.n_workers 464 | return status_json 465 | 466 | status_json = { 467 | "crawl_id": self.crawl_id, 468 | } 469 | crawl_status = run_webt_api( 470 | status_json, 471 | "v1/crawl/get", 472 | self.api_key, 473 | ) 474 | crawl_status["loc"] = "cloud" 475 | if self.verbose: 476 | logging.info(f"Status of crawl {self.crawl_id}: {crawl_status}") 477 | return crawl_status 478 | 479 | def get_ignored(self) -> list: 480 | """ 481 | Get a list of ignored URLs. 482 | 483 | Returns: 484 | list: A list of ignored URLs. 485 | """ 486 | if not self.created: 487 | return list(self.ignored_urls) 488 | 489 | ignored_json = { 490 | "crawl_id": self.crawl_id, 491 | } 492 | out_json = run_webt_api( 493 | ignored_json, 494 | "v1/crawl/get/ignored", 495 | self.api_key, 496 | ) 497 | return out_json["pages"] 498 | 499 | def get_failed(self) -> list: 500 | """ 501 | Get a list of failed URLs. 502 | 503 | Returns: 504 | list: A list of failed URLs. 505 | """ 506 | if not self.created: 507 | return list(self.failed_urls) 508 | 509 | visited_json = { 510 | "crawl_id": self.crawl_id, 511 | } 512 | out_json = run_webt_api( 513 | visited_json, 514 | "v1/crawl/get/failed", 515 | self.api_key, 516 | ) 517 | return out_json["pages"] 518 | 519 | def get_visited(self) -> list: 520 | """ 521 | Get a list of visited URLs. 522 | 523 | Returns: 524 | list: A list of visited URLs. 
525 | """ 526 | if not self.created: 527 | return list(self.visited_urls) 528 | 529 | visited_json = { 530 | "crawl_id": self.crawl_id, 531 | } 532 | out_json = run_webt_api( 533 | visited_json, 534 | "v1/crawl/get/visited", 535 | self.api_key, 536 | ) 537 | return out_json["pages"] 538 | 539 | def get_banned(self) -> list: 540 | """ 541 | Get a list of banned URLs. 542 | 543 | Returns: 544 | list: A list of banned URLs. 545 | """ 546 | if not self.created: 547 | return list(self.banned_urls) 548 | 549 | banned_json = { 550 | "crawl_id": self.crawl_id, 551 | } 552 | out_json = run_webt_api( 553 | banned_json, 554 | "v1/crawl/get/banned", 555 | self.api_key, 556 | ) 557 | return out_json["pages"] 558 | 559 | def download(self): 560 | """ 561 | Download the output of the crawl. 562 | """ 563 | if self.verbose: 564 | logging.info(f"Downloading crawl of {self.base_url}...") 565 | 566 | if self.created: 567 | download_json = { 568 | "crawl_id": self.crawl_id, 569 | } 570 | out_json = run_webt_api( 571 | download_json, 572 | "v1/crawl/download", 573 | self.api_key, 574 | ) 575 | presigned_url = out_json["url"] 576 | with tempfile.TemporaryDirectory() as tmpdir: 577 | zip_file_path = os.path.join(tmpdir, "temp.zip") 578 | with open(zip_file_path, "wb") as f: 579 | response = httpx.get(presigned_url) 580 | f.write(response.content) 581 | 582 | with zipfile.ZipFile(zip_file_path, "r") as zip_ref: 583 | zip_ref.extractall(tmpdir) 584 | 585 | for root, _, files in os.walk(tmpdir): 586 | for file in files: 587 | if file.endswith(".json"): 588 | json_file = os.path.join(root, file) 589 | with open(json_file, "r") as f: 590 | data = json.load(f) 591 | url = data["url"] 592 | base_url_netloc = urlparse(self.base_url).netloc 593 | base_dir = os.path.join(self.output_dir, base_url_netloc) 594 | if not os.path.exists(base_dir): 595 | os.makedirs(base_dir) 596 | filename = urllib.parse.quote_plus(url).replace("/", "_") 597 | filepath = os.path.join(base_dir, filename) + ".json" 598 | 
shutil.move(json_file, filepath) 599 | 600 | logging.info(f"The output of the crawl can be found at: {self.output_dir}") 601 | 602 | def to_metadata(self) -> None: 603 | """ 604 | Save the metadata of the Crawl object to a file. 605 | """ 606 | if not self.created: 607 | filename = os.path.join(self.output_dir, f"{self.crawl_id}.json") 608 | metadata = { 609 | "crawl_id": self.crawl_id, 610 | "n_workers": self.n_workers, 611 | "base_url": self.base_url, 612 | "max_pages": self.max_pages, 613 | "visited_urls": self.visited_urls, 614 | "ignored_urls": list(self.ignored_urls), 615 | "render_js": self.render_js, 616 | "queue": list(self.queue._queue), 617 | "banned_urls": self.banned_urls, 618 | "allowed_urls": self.allowed_urls, 619 | "output_dir": self.output_dir, 620 | } 621 | with open(filename, "w") as file: 622 | json.dump(metadata, file) 623 | 624 | @staticmethod 625 | def from_metadata(crawl_id: str, output_dir: str = "webtranspose-out") -> "Crawl": 626 | """ 627 | Create a Crawl object from metadata stored in a file. 628 | 629 | Args: 630 | crawl_id (str): The ID of the crawl. 631 | output_dir (str, optional): The directory to store the crawled data. Defaults to "webtranspose-out". 632 | 633 | Returns: 634 | Crawl: The Crawl object. 
635 | """ 636 | filename = os.path.join(output_dir, f"{crawl_id}.json") 637 | with open(filename, "r") as file: 638 | metadata = json.load(file) 639 | crawl = Crawl( 640 | metadata["base_url"], 641 | metadata["allowed_urls"], 642 | metadata["banned_urls"], 643 | metadata["n_workers"], 644 | metadata["max_pages"], 645 | render_js=metadata["render_js"], 646 | output_dir=metadata["output_dir"], 647 | ) 648 | crawl.crawl_id = metadata["crawl_id"] 649 | crawl.visited_urls = metadata["visited_urls"] 650 | crawl.ignored_urls = set(metadata["ignored_urls"]) 651 | crawl.queue = asyncio.Queue() 652 | for url in metadata["queue"]: 653 | crawl.queue.put_nowait(url) 654 | return crawl 655 | 656 | @staticmethod 657 | def from_cloud(crawl_id: str, api_key: Optional[str] = None) -> "Crawl": 658 | """ 659 | Create a Crawl object from metadata stored in the cloud. 660 | 661 | Args: 662 | crawl_id (str): The ID of the crawl. 663 | api_key (str, optional): The API key for accessing the cloud. Defaults to None. 664 | 665 | Returns: 666 | Crawl: The Crawl object. 667 | """ 668 | if api_key is None: 669 | api_key = os.environ.get("WEBTRANSPOSE_API_KEY") 670 | 671 | if api_key is not None: 672 | get_json = { 673 | "crawl_id": crawl_id, 674 | } 675 | out_json = run_webt_api(get_json, "v1/crawl/get", api_key) 676 | crawl = Crawl( 677 | out_json["base_url"], 678 | out_json["allowed_urls"], 679 | out_json["banned_urls"], 680 | max_pages=out_json["max_pages"], 681 | render_js=out_json["render_js"], 682 | api_key=api_key, 683 | _created=True, 684 | ) 685 | crawl.crawl_id = out_json["crawl_id"] 686 | return crawl 687 | 688 | raise ValueError( 689 | "API key not found. Please set WEBTRANSPOSE_API_KEY environment variable or pass api_key argument." 690 | ) 691 | 692 | def __str__(self) -> str: 693 | """ 694 | Get a string representation of the Crawl object. 695 | 696 | Returns: 697 | str: The string representation of the Crawl object. 
698 | """ 699 | status = self.status() 700 | return ( 701 | f"WebTransposeCrawl(\n" 702 | f" Crawl ID: {status['crawl_id']}\n" 703 | f" Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n" 704 | f" Base URL: {status['base_url']}\n" 705 | f" Max Pages: {status['max_pages']}\n" 706 | f" Number of Visited URLs: {status['num_visited']}\n" 707 | f" Number of Ignored URLs: {status['num_ignored']}\n" 708 | f" Number of Queued URLs: {status['num_queued']}\n" 709 | f" Number of Failed URLs: {status['num_failed']}\n" 710 | f" Banned URLs: {status['banned_urls']}\n" 711 | f" Allowed URLs: {status['allowed_urls']}" 712 | f")" 713 | ) 714 | 715 | def __repr__(self) -> str: 716 | """ 717 | Get a string representation of the Crawl object. 718 | 719 | Returns: 720 | str: The string representation of the Crawl object. 721 | """ 722 | status = self.status() 723 | return ( 724 | f"WebTransposeCrawl(\n" 725 | f" Crawl ID: {status['crawl_id']}\n" 726 | f" Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n" 727 | f" Base URL: {status['base_url']}\n" 728 | f" Max Pages: {status['max_pages']}\n" 729 | f" Number of Visited URLs: {status['num_visited']}\n" 730 | f" Number of Ignored URLs: {status['num_ignored']}\n" 731 | f" Number of Queued URLs: {status['num_queued']}\n" 732 | f" Number of Failed URLs: {status['num_failed']}\n" 733 | f" Banned URLs: {status['banned_urls']}\n" 734 | f" Allowed URLs: {status['allowed_urls']}" 735 | f")" 736 | ) 737 | 738 | def get_page(self, url: str) -> dict: 739 | """ 740 | Get the page data for a given URL. 741 | 742 | Args: 743 | url (str): The URL of the page. 744 | 745 | Returns: 746 | dict: The page data. 
747 | """ 748 | if not self.created: 749 | fn = self.visited_urls[url] 750 | try: 751 | with open(fn, "r") as f: 752 | data = json.load(f) 753 | return data 754 | except: 755 | logging.error(f"Could not find HTML for URL {url}") 756 | else: 757 | get_json = { 758 | "crawl_id": self.crawl_id, 759 | "url": url, 760 | } 761 | out_json = run_webt_api( 762 | get_json, 763 | "v1/crawl/get-page", 764 | self.api_key, 765 | ) 766 | return out_json 767 | 768 | def get_child_urls(self, url: str) -> list: 769 | """ 770 | Get the child URLs for a given URL. 771 | 772 | Args: 773 | url (str): The URL. 774 | 775 | Returns: 776 | list: A list of child URLs. 777 | """ 778 | if not self.created: 779 | try: 780 | fn = self.visited_urls[url] 781 | except: 782 | logging.error(f"Could not find child URLs for URL {url}") 783 | return None 784 | try: 785 | with open(fn, "r") as f: 786 | data = json.load(f) 787 | return data["child_urls"] 788 | except: 789 | logging.error(f"Could not find child URLs for URL {url}") 790 | else: 791 | get_json = { 792 | "crawl_id": self.crawl_id, 793 | "url": url, 794 | } 795 | out_json = run_webt_api( 796 | get_json, 797 | "v1/crawl/get-child-urls", 798 | self.api_key, 799 | ) 800 | return out_json 801 | 802 | def retry_failed_urls(self) -> None: 803 | """ 804 | Queue failed URLs from a crawl. 805 | """ 806 | if not self.created: 807 | logging.error("Cannot retry failed URLs for un-created crawl.") 808 | elif self.api_key is not None: 809 | queue_json = { 810 | "crawl_id": self.crawl_id, 811 | } 812 | run_webt_api( 813 | queue_json, 814 | "v1/crawl/retry-failed", 815 | self.api_key, 816 | ) 817 | 818 | 819 | def get_crawl(crawl_id: str, api_key: Optional[str] = None) -> Crawl: 820 | """ 821 | Get a Crawl object based on the crawl ID. 822 | 823 | Args: 824 | crawl_id (str): The ID of the crawl. 825 | api_key (str, optional): The API key. Defaults to None. 826 | 827 | Returns: 828 | Crawl: The Crawl object. 
def retry_failed(crawl_id: str, api_key: Optional[str] = None) -> None:
    """
    Queue failed URLs from a crawl for another attempt.

    Args:
        crawl_id (str): The ID of the crawl.
        api_key (str, optional): The API key. Defaults to None, in which
            case the WEBTRANSPOSE_API_KEY environment variable is used.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
    if api_key is None:
        # No credentials available: silently do nothing, matching the
        # original behaviour of this helper.
        return
    run_webt_api(
        {"crawl_id": crawl_id},
        "v1/crawl/retry-failed",
        api_key,
    )
16 | 17 | Args: 18 | chunk_size (int, optional): The size of each chunk of text to process. Defaults to 2500. 19 | overlap_size (int, optional): The size of the overlap between chunks. Defaults to 100. 20 | """ 21 | self.api_key = os.environ.get("OPENAI_API_KEY") 22 | self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") 23 | self.chunk_size = chunk_size 24 | self.overlap_size = overlap_size 25 | 26 | @staticmethod 27 | def process_html( 28 | text: str, chunk_size: int, overlap_size: int, encoding: tiktoken.Encoding 29 | ) -> list: 30 | """ 31 | Process the HTML text into chunks. 32 | 33 | Args: 34 | text (str): The HTML text to process. 35 | chunk_size (int): The size of each chunk of text. 36 | overlap_size (int): The size of the overlap between chunks. 37 | encoding (tiktoken.Encoding): The encoding object. 38 | 39 | Returns: 40 | list: A list of decoded chunks. 41 | """ 42 | encoded = encoding.encode(text) 43 | if overlap_size >= chunk_size: 44 | raise ValueError("Overlap size should be less than chunk size.") 45 | chunks = [] 46 | idx = 0 47 | while idx < len(encoded): 48 | end_idx = idx + chunk_size 49 | chunks.append(encoded[idx:end_idx]) 50 | idx = end_idx - overlap_size 51 | decoded_chunks = [encoding.decode(chunk) for chunk in chunks] 52 | return decoded_chunks 53 | 54 | def scrape(self, html: str, schema: dict) -> dict: 55 | """ 56 | Scrape the HTML text using the provided schema. 57 | 58 | Args: 59 | html (str): The HTML text to scrape. 60 | schema (dict): The schema to use for scraping. 61 | 62 | Returns: 63 | dict: The scraped data. 
64 | """ 65 | processed_schema = self.transform_schema(schema) 66 | schema_keys = ", ".join(processed_schema.keys()) 67 | out_data = {} 68 | 69 | for sub_html in self.process_html(html, self.chunk_size, self.overlap_size, self.encoding): 70 | model = "gpt-3.5-turbo-0613" 71 | if len(self.encoding.encode(sub_html)) > 2500: 72 | model = "gpt-3.5-turbo-16k" 73 | 74 | response = openai.ChatCompletion.create( 75 | model=model, 76 | temperature=0, 77 | messages=[{"role": "user", "content": sub_html}], 78 | functions=[ 79 | { 80 | "name": "extract_info", 81 | "description": f"Extract the {schema_keys} from the website text if any exist. Empty if not found.", 82 | "parameters": { 83 | "type": "object", 84 | "properties": processed_schema, 85 | "required": list(processed_schema.keys()), 86 | }, 87 | }, 88 | ], 89 | ) 90 | out = response["choices"][0]["message"] 91 | 92 | if "function_call" in out: 93 | args = json.loads(out["function_call"]["arguments"]) 94 | 95 | for k in args.keys(): 96 | if k in processed_schema: 97 | if processed_schema[k]["type"] == "array": 98 | if k not in out_data: 99 | out_data[k] = [] 100 | out_data[k] += args[k] 101 | else: 102 | out_data[k] = args[k] 103 | del processed_schema[k] 104 | elif k not in out_data: 105 | out_data[k] = None 106 | 107 | return out_data 108 | 109 | def transform_schema(self, schema: dict) -> dict: 110 | """ 111 | Transform the schema into the format required by OpenAI. 112 | 113 | Args: 114 | schema (dict): The schema to transform. 115 | 116 | Returns: 117 | dict: The transformed schema. 
118 | """ 119 | openai_type_map = { 120 | "str": "string", 121 | "int": "number", 122 | "bool": "boolean", 123 | } 124 | 125 | properties = {} 126 | for key, value in schema.items(): 127 | if isinstance(value, dict): 128 | if "type" in value and value["type"] == "array": 129 | properties[key] = { 130 | "type": "array", 131 | "items": { 132 | "type": "object", 133 | "properties": self.transform_schema(value["items"]), 134 | }, 135 | "required": list(value["items"].keys()), 136 | } 137 | elif "type" in value: 138 | properties[key] = value 139 | else: 140 | properties[key] = self.transform_schema(value) 141 | elif isinstance(value, list): 142 | try: 143 | properties[key] = { 144 | "type": openai_type_map[type(value[0]).__name__], 145 | "enum": value, 146 | "description": key, 147 | } 148 | except IndexError: 149 | raise Exception(f"Empty list for key {key}") 150 | else: 151 | properties[key] = { 152 | "type": value, 153 | "description": key, 154 | } 155 | 156 | return properties 157 | -------------------------------------------------------------------------------- /src/webtranspose/scrape.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import re 5 | import uuid 6 | 7 | import requests 8 | from bs4 import BeautifulSoup 9 | 10 | from .openai import OpenAIScraper 11 | from .webt_api import run_webt_api 12 | 13 | 14 | class Scraper: 15 | def __init__( 16 | self, 17 | schema: dict, 18 | scraper_id: str = None, 19 | name: str = None, 20 | render_js: bool = False, 21 | verbose: bool = False, 22 | scraper: OpenAIScraper = None, 23 | api_key: str = None, 24 | proxy: str = None, 25 | _created: bool = False, 26 | ): 27 | """ 28 | Initialize the Scraper object. 29 | 30 | Args: 31 | schema (dict): The schema for scraping. 32 | scraper_id (str, optional): The ID of the scraper. Defaults to None. 33 | name (str, optional): The name of the scraper. Defaults to None. 
34 | render_js (bool, optional): Whether to render JavaScript. Defaults to False. 35 | verbose (bool, optional): Whether to print verbose output. Defaults to False. 36 | scraper (OpenAIScraper, optional): The scraper object. Defaults to None. 37 | api_key (str, optional): The API key. Defaults to None. 38 | proxy (str, optional): The proxy. Defaults to None. 39 | _created (bool, optional): Whether the scraper has been created. Defaults to False. 40 | """ 41 | self.api_key = api_key 42 | if self.api_key is None: 43 | self.api_key = os.environ.get("WEBTRANSPOSE_API_KEY") 44 | 45 | self.name = name 46 | if self.name is None: 47 | self.name = "New Scraper" 48 | self.schema = schema 49 | self.verbose = verbose 50 | self.scraper = scraper 51 | self.render_js = render_js 52 | self.scraper_id = scraper_id 53 | self.proxy = proxy 54 | if self.scraper is None: 55 | self.scraper = OpenAIScraper() 56 | if self.scraper_id is None: 57 | self.scraper_id = str(uuid.uuid4()) 58 | self.created = _created 59 | 60 | api_key = os.environ.get("WEBTRANSPOSE_API_KEY") 61 | if api_key is None and self.api_key is None: 62 | logging.warning( 63 | "No Web Transpose API provided. Lite version in use...\n\nTo run the actual WebT AI Web Scraper the Web Transpose API, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com. Run cheaper with logging and advanced analytics." 64 | ) 65 | 66 | def __str__(self) -> str: 67 | """ 68 | Get a string representation of the Scraper object. 69 | 70 | Returns: 71 | str: The string representation of the Scraper object. 72 | """ 73 | status = self.status() 74 | schema = json.dumps(status["schema"], indent=4) 75 | return ( 76 | f"WebTransposeScraper(\n" 77 | f" Status ID: {status['scraper_id']}\n" 78 | f" Name: {status['name']}\n" 79 | f" Render JS: {status['render_js']}\n" 80 | f" Schema: {schema}\n" 81 | f")" 82 | ) 83 | 84 | def __repr__(self) -> str: 85 | """ 86 | Get a string representation of the Scraper object. 
87 | 88 | Returns: 89 | str: The string representation of the Scraper object. 90 | """ 91 | status = self.status() 92 | schema = json.dumps(status["schema"], indent=4) 93 | return ( 94 | f"WebTransposeScraper(\n" 95 | f" Status ID: {status['scraper_id']}\n" 96 | f" Name: {status['name']}\n" 97 | f" Render JS: {status['render_js']}\n" 98 | f" Schema: {schema}\n" 99 | f")" 100 | ) 101 | 102 | def create_scraper_api(self): 103 | """ 104 | Creates a Scraper on https://webtranspose.com 105 | """ 106 | if self.verbose: 107 | logging.info(f"Creating AI Web Scraper on Web Transpose...") 108 | 109 | create_json = { 110 | "name": self.name, 111 | "schema": self.schema, 112 | "render_js": self.render_js, 113 | "proxy": self.proxy, 114 | } 115 | out_json = run_webt_api( 116 | create_json, 117 | "/v1/scraper/create", 118 | self.api_key, 119 | ) 120 | self.scraper_id = out_json["scraper_id"] 121 | self.created = True 122 | 123 | def scrape(self, url=None, html=None, timeout=30): 124 | """ 125 | Scrape the data from a given URL or HTML. 126 | 127 | Args: 128 | url (str, optional): The URL to scrape. Defaults to None. 129 | html (str, optional): The HTML to scrape. Defaults to None. 130 | timeout (int, optional): The timeout for the request. Defaults to 30. 131 | 132 | Returns: 133 | dict: The scraped data. 134 | 135 | Raises: 136 | ValueError: If neither URL nor HTML is provided. 
137 | """ 138 | if self.verbose: 139 | logging.info(f"Running Scraper({self.name}) on {url}...") 140 | 141 | if self.api_key is None: 142 | if url is not None: 143 | response = requests.get(url, timeout=timeout) 144 | soup = BeautifulSoup(response.content, "html.parser") 145 | body = soup.body 146 | html = re.sub("\s+", " ", str(body)).strip() 147 | 148 | if html is None: 149 | raise ValueError("Must provide either a url or html.") 150 | 151 | return self.scraper.scrape( 152 | html, 153 | self.schema, 154 | ) 155 | else: 156 | if not self.created: 157 | self.create_scraper_api() 158 | 159 | scrape_json = { 160 | "scraper_id": self.scraper_id, 161 | "url": url, 162 | "html": html, 163 | "proxy": self.proxy, 164 | } 165 | out_json = run_webt_api( 166 | scrape_json, 167 | "/v1/scraper/scrape", 168 | self.api_key, 169 | ) 170 | return out_json 171 | 172 | def status(self): 173 | """ 174 | Get the status of the Scraper. 175 | 176 | Returns: 177 | dict: The status of the Scraper. 178 | """ 179 | if self.api_key is None or not self.created: 180 | return { 181 | "scraper_id": self.scraper_id, 182 | "name": self.name, 183 | "verbose": self.verbose, 184 | "render_js": self.render_js, 185 | "schema": self.schema, 186 | "proxy": self.proxy, 187 | } 188 | else: 189 | get_json = { 190 | "scraper_id": self.scraper_id, 191 | } 192 | out_api = run_webt_api( 193 | get_json, 194 | "/v1/scraper/get", 195 | self.api_key, 196 | ) 197 | scraper = out_api["scraper"] 198 | return { 199 | "scraper_id": scraper["id"], 200 | "name": scraper["name"], 201 | "verbose": self.verbose, 202 | "render_js": scraper["render_js"], 203 | "schema": scraper["schema"], 204 | "proxy": scraper["proxy"] 205 | } 206 | 207 | 208 | def get_scraper(scraper_id, api_key: str = None): 209 | """ 210 | Get a Scraper object based on the scraper ID. 211 | 212 | Args: 213 | scraper_id (str): The ID of the scraper. 214 | api_key (str, optional): The API key. Defaults to None. 
def list_scrapers(api_key: str = None):
    """
    List all available scrapers.

    Args:
        api_key (str, optional): The API key. Defaults to None, in which
            case the WEBTRANSPOSE_API_KEY environment variable is used.

    Returns:
        list: A list of Scrapers.

    Raises:
        ValueError: If api_key is not provided.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
    if api_key is None:
        # Guard clause: fail fast before touching the API.
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")
    out_json = run_webt_api(
        {},
        "/v1/scraper/list",
        api_key,
    )
    return out_json["scrapers"]
def search_filter(query, api_key=None) -> dict:
    """
    Search for a query using the Web Transpose API with filtering.

    Args:
        query (str): The query to search for.
        api_key (str, optional): The API key to use for authentication.
            Defaults to None, in which case the WEBTRANSPOSE_API_KEY
            environment variable is used.

    Returns:
        dict: The filtered search results.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
    if api_key is None:
        # Guard clause: fail fast before touching the API.
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")
    return run_webt_api(
        {"query": query},
        "/v1/search/filter",
        api_key,
    )
def _run(c: Context, command: str) -> Optional[Result]:
    """Run *command* through the Invoke context.

    Allocates a pseudo-terminal everywhere except on Windows, where
    Invoke's pty support is unavailable.
    """
    return c.run(command, pty=platform.system() != "Windows")
@task(name="format", help={"check": "Checks if source is formatted without applying changes"})
def format_(c, check=False):
    # type: (Context, bool) -> None
    """Format code with isort and black (optionally check-only)."""
    # In check mode neither tool rewrites files; they only report diffs.
    isort_flags = "--check-only --diff" if check else ""
    _run(c, f"poetry run isort {isort_flags} {PYTHON_TARGETS_STR}")
    black_flags = "--diff --check" if check else "--quiet"
    _run(c, f"poetry run black {black_flags} {PYTHON_TARGETS_STR}")
@task(
    help={
        "fmt": "Build a local report: report, html, json, annotate, html, xml.",
        "open_browser": "Open the coverage report in the web browser (requires --fmt html)",
    }
)
def coverage(c, fmt="report", open_browser=False):
    # type: (Context, str, bool) -> None
    """Create coverage report."""
    # Parallel/noxfile test runs leave .coverage.* data shards behind;
    # merge them into a single .coverage file first.
    if any(Path().glob(".coverage.*")):
        _run(c, "poetry run coverage combine")
    # -i (--ignore-errors): don't fail the report on source files that
    # have since been moved or deleted.
    _run(c, f"poetry run coverage {fmt} -i")
    if fmt == "html" and open_browser:
        webbrowser.open(COVERAGE_REPORT.as_uri())
@task(
    help={
        "part": "Part of the version to be bumped.",
        "dry_run": "Don't write any files, just pretend. (default: False)",
    }
)
def version(c, part, dry_run=False):
    # type: (Context, str, bool) -> None
    """Bump the project version with bump2version."""
    flags = "--dry-run" if dry_run else ""
    _run(c, f"poetry run bump2version {flags} {part}")
"python", 38 | "name": "webt" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.9.5" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 5 55 | } 56 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit test package for webtranspose.""" 2 | -------------------------------------------------------------------------------- /tests/test_webtranspose.py: -------------------------------------------------------------------------------- 1 | """Tests for `webtranspose` module.""" 2 | from typing import Generator 3 | 4 | import pytest 5 | 6 | import webtranspose 7 | 8 | 9 | @pytest.fixture 10 | def version() -> Generator[str, None, None]: 11 | """Sample pytest fixture.""" 12 | yield webtranspose.__version__ 13 | 14 | 15 | def test_version(version: str) -> None: 16 | """Sample pytest test function with the pytest fixture as an argument.""" 17 | assert version == "0.1.0" 18 | --------------------------------------------------------------------------------