├── .bumpversion.cfg
├── .darglint
├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── config.yml
│ └── feature_request.md
├── PULL_REQUEST_TEMPLATE.md
├── dependabot.yml
└── workflows
│ ├── codeql-analysis.yml
│ ├── pre-commit-autoupdate.yml
│ ├── release.yml
│ └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── LICENSE.rst
├── README.md
├── codecov.yml
├── img
└── web-transpose-cover.png
├── noxfile.py
├── poetry.lock
├── pyproject.toml
├── src
└── webtranspose
│ ├── __init__.py
│ ├── chat.py
│ ├── crawl.py
│ ├── openai.py
│ ├── scrape.py
│ ├── search.py
│ └── webt_api.py
├── tasks.py
└── tests
├── Untitled.ipynb
├── __init__.py
└── test_webtranspose.py
/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | commit = True
3 | tag = False
4 | current_version = 0.1.0
5 |
6 | [bumpversion:file:pyproject.toml]
7 | search = version = "{current_version}"
8 | replace = version = "{new_version}"
9 |
10 | [bumpversion:file:src/webtranspose/__init__.py]
11 | search = __version__ = "{current_version}"
12 | replace = __version__ = "{new_version}"
13 |
14 | [bumpversion:file(title):CHANGELOG.md]
15 | search = {#}{#} [Unreleased]
16 | replace = {#}{#} [Unreleased]
17 |
18 | {#}{#} [{new_version}] - {now:%Y-%m-%d}
19 |
20 | [bumpversion:file(links):CHANGELOG.md]
21 | search = [Unreleased]: https://github.com/mike-gee/webtranspose/compare/v{current_version}...HEAD
22 | replace = [Unreleased]: https://github.com/mike-gee/webtranspose/compare/v{new_version}...HEAD
23 | [{new_version}]: https://github.com/mike-gee/webtranspose/compare/v{current_version}...v{new_version}
24 |
--------------------------------------------------------------------------------
/.darglint:
--------------------------------------------------------------------------------
1 | [darglint]
2 | strictness = short
3 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🐛 Bug report
3 | about: Create a report to help us improve
4 | labels: bug
5 | assignees: ''
6 |
7 | ---
8 |
9 | ## Expected Behavior
10 |
11 |
12 | ## Actual Behavior
13 |
14 |
15 | ## Steps to Reproduce the Problem
16 |
17 | 1.
18 | 1.
19 | 1.
20 |
21 | ## Specifications
22 |
23 | - Version:
24 | - Platform:
25 | - Subsystem:
26 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links: []
3 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🚀 Feature request
3 | about: Suggest an idea for this project
4 | labels: enhancement
5 | assignees: ''
6 |
7 | ---
8 |
9 | **Is your feature request related to a problem? Please describe.**
10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
11 |
12 | **Describe the solution you'd like**
13 | A clear and concise description of what you want to happen.
14 |
15 | **Describe alternatives you've considered**
16 | A clear and concise description of any alternative solutions or features you've considered.
17 |
18 | **Additional context**
19 | Add any other context or screenshots about the feature request here.
20 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Fixes #
2 |
3 | ## Proposed Changes
4 |
5 | -
6 | -
7 | -
8 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: github-actions
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | - package-ecosystem: pip
8 | directory: "/docs"
9 | schedule:
10 | interval: daily
11 | - package-ecosystem: pip
12 | directory: "/"
13 | schedule:
14 | interval: daily
15 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | name: "CodeQL"
2 |
3 | on:
4 | push:
5 | branches:
6 | pull_request:
7 | branches:
8 | schedule:
9 | - cron: '0 6 * * 1'
10 |
11 | jobs:
12 | analyze:
13 | name: Analyze
14 | runs-on: ubuntu-latest
15 | permissions:
16 | actions: read
17 | contents: read
18 | security-events: write
19 |
20 | strategy:
21 | fail-fast: false
22 | matrix:
23 | language: [ 'python' ]
24 |
25 | steps:
26 | - name: Checkout repository
27 | uses: actions/checkout@v3.5.2
28 |
29 | # Initializes the CodeQL tools for scanning.
30 | - name: Initialize CodeQL
31 | uses: github/codeql-action/init@v2
32 | with:
33 | languages: ${{ matrix.language }}
34 |
35 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
36 | # If this step fails, then you should remove it and run the build manually (see below)
37 | - name: Autobuild
38 | uses: github/codeql-action/autobuild@v2
39 |
40 | - name: Perform CodeQL Analysis
41 | uses: github/codeql-action/analyze@v2
42 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit-autoupdate.yml:
--------------------------------------------------------------------------------
1 | name: "Pre-commit autoupdate"
2 |
3 | on:
4 | schedule:
5 | - cron: '0 6 * * 1'
6 | workflow_dispatch:
7 |
8 | jobs:
9 | autoupdate:
10 | name: autoupdate
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/checkout@v3.5.2
14 |
15 | - name: Set up Python 3.8
16 | uses: actions/setup-python@v4.6.1
17 | with:
18 | python-version: 3.8
19 |
20 | - name: Install system deps
21 | shell: bash
22 | run: |
23 | pip install poetry
24 | poetry config virtualenvs.in-project true
25 | poetry install --no-root --only dev --only linters --sync
26 |
27 | - name: Run autoupdate
28 | run: poetry run pre-commit autoupdate
29 |
30 | - name: Run pre-commit
31 | run: poetry run pre-commit run --all-files
32 |
33 | - uses: peter-evans/create-pull-request@v5.0.1
34 | with:
35 | token: ${{ secrets.GITHUB_TOKEN }}
36 | branch: chore-update-pre-commit-hooks
37 | title: Update pre-commit hooks
38 | commit-message: "Update pre-commit hooks"
39 | body: |
40 | # Update pre-commit hooks
41 |
42 | - Update pre-commit hooks to the latest version.
43 | delete-branch: true
44 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 |
2 | name: release
3 |
4 | on:
5 | push:
6 | tags:
7 | - 'v*'
8 |
9 | jobs:
10 | release:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/checkout@v3.5.2
14 |
15 | - name: Set up Python 3.8
16 | uses: actions/setup-python@v4.6.1
17 | with:
18 | python-version: 3.8
19 |
20 | - name: Install system deps
21 | shell: bash
22 | run: |
23 | pip install poetry
24 | poetry config virtualenvs.in-project true
25 |
26 | - name: Build package
27 | run: |
28 | poetry build --ansi
29 |
30 | - name: Publish package on PyPI
31 | uses: pypa/gh-action-pypi-publish@v1.4.2
32 | with:
33 | user: __token__
34 | password: ${{ secrets.PYPI_TOKEN }}
35 |
36 | - name: Publish package on TestPyPI
37 | uses: pypa/gh-action-pypi-publish@v1.4.2
38 | with:
39 | user: __token__
40 | password: ${{ secrets.TEST_PYPI_TOKEN }}
41 | repository_url: https://test.pypi.org/legacy/
42 |
43 |
44 | github_release:
45 | needs: release
46 | name: Create Github Release
47 | runs-on: ubuntu-latest
48 | steps:
49 | - uses: actions/checkout@v3.5.2
50 |
51 | - name: Get version from tag
52 | id: tag_name
53 | shell: bash
54 | run: |
55 | echo ::set-output name=current_version::${GITHUB_REF#refs/tags/v}
56 |
57 | - name: Get Changelog Entry
58 | id: changelog_reader
59 | uses: mindsers/changelog-reader-action@v2.2.2
60 | with:
61 | version: ${{ steps.tag_name.outputs.current_version }}
62 | path: ./CHANGELOG.md
63 |
64 | - name: Create Release
65 | id: create_release
66 | uses: actions/create-release@v1.1.4
67 | env:
68 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
69 | with:
70 | tag_name: ${{ github.ref }}
71 | release_name: Release ${{ github.ref }}
72 | body: ${{ steps.changelog_reader.outputs.changes }}
73 | draft: false
74 | prerelease: false
75 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | branches:
6 | pull_request:
7 | branches:
8 |
9 | jobs:
10 | linting:
11 | name: Linting
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v3.5.2
15 |
16 | - name: Set up Python 3.8
17 | uses: actions/setup-python@v4.6.1
18 | with:
19 | python-version: 3.8
20 |
21 | - name: Install system deps
22 | shell: bash
23 | run: |
24 | pip install poetry
25 | poetry config virtualenvs.in-project true
26 | poetry install --no-root --only dev --only linters --sync
27 |
28 | - name: Linting
29 | shell: bash
30 | run: poetry run pre-commit run --all-files
31 |
32 | tests:
33 | needs: linting
34 | name: ${{ matrix.os }} / ${{ matrix.python-version }}
35 | runs-on: ${{ matrix.os }}-latest
36 | strategy:
37 | matrix:
38 | os: [Ubuntu, MacOS, Windows]
39 | python-version: ['3.8', '3.9', '3.10', '3.11']
40 | fail-fast: true
41 | steps:
42 | - uses: actions/checkout@v3.5.2
43 |
44 | - name: Set up Python ${{ matrix.python-version }}
45 | uses: actions/setup-python@v4.6.1
46 | with:
47 | python-version: ${{ matrix.python-version }}
48 |
49 | - name: Install system deps
50 | shell: bash
51 | run: |
52 | pip install nox-poetry
53 | pip install poetry
54 | poetry config virtualenvs.in-project true
55 |
56 | - name: Run mypy with nox
57 | shell: bash
58 | run: nox --force-color -s mypy-${{ matrix.python-version }}
59 |
60 | - name: Run tests with nox
61 | shell: bash
62 | run: nox --force-color -s tests-${{ matrix.python-version }}
63 |
64 |       - name: Run security check
65 | if: matrix.python-version == '3.11' && matrix.os == 'Ubuntu'
66 | shell: bash
67 | run: nox --force-color -s security
68 |
69 | - name: Upload coverage data
70 | uses: actions/upload-artifact@v2.2.4
71 | with:
72 | name: coverage-data
73 | path: ".coverage.*"
74 |
75 | coverage:
76 | needs: tests
77 | runs-on: ubuntu-latest
78 | steps:
79 | - uses: actions/checkout@v3.5.2
80 |
81 | - name: Set up Python 3.8
82 | uses: actions/setup-python@v4.6.1
83 | with:
84 | python-version: 3.8
85 |
86 | - name: Install system deps
87 | shell: bash
88 | run: |
89 | pip install nox-poetry
90 | pip install poetry
91 | poetry config virtualenvs.in-project true
92 |
93 | - name: Download coverage data
94 | uses: actions/download-artifact@v2.0.10
95 | with:
96 | name: coverage-data
97 |
98 | - name: Create coverage report
99 | shell: bash
100 | run: |
101 | nox --force-color --session=coverage -- --fmt xml
102 |
103 | - name: Upload coverage report
104 | uses: codecov/codecov-action@v3.1.4
105 | with:
106 | token: ${{ secrets.CODECOV_TOKEN }}
107 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 | *.ipynb
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # pytype
130 | .pytype/
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
135 | # Code editors
136 | .vscode
137 |
138 | # Caches
139 | .flakeheaven_cache
140 |
141 | # Web Transpose
142 | webtranspose-out/
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.4.0
4 | hooks:
5 | - id: check-toml
6 | - id: check-yaml
7 | - id: debug-statements
8 | - id: check-merge-conflict
9 | - id: check-json
10 | - id: end-of-file-fixer
11 | - repo: https://github.com/timothycrosley/isort
12 | rev: 5.12.0
13 | hooks:
14 | - id: isort
15 | - repo: https://github.com/psf/black
16 | rev: 23.3.0
17 | hooks:
18 | - id: black
19 | - repo: local
20 | hooks:
21 | - id: flakeheaven
22 | name: flakeheaven
23 |       description: "`FlakeHeaven` is a Flake8 wrapper to make it cool."
24 | entry: poetry run flakeheaven
25 | args: [lint]
26 | language: system
27 | types: [python]
28 | require_serial: true
29 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | image: latest
5 |
6 | formats: all
7 |
8 | sphinx:
9 | configuration: docs/conf.py
10 |
11 | python:
12 | version: 3.8
13 | install:
14 | - requirements: docs/requirements.txt
15 | - method: pip
16 | path: .
17 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | All notable changes to this project will be documented in this file.
3 |
4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6 |
7 |
8 | ## [Unreleased]
9 |
10 | ## [0.1.0] - 2023-10-21
11 | ### Added
12 | - First release on PyPI.
13 |
14 | [Unreleased]: https://github.com/mike-gee/webtranspose/compare/v0.1.0...HEAD
15 | [0.1.0]: https://github.com/mike-gee/webtranspose/releases/tag/v0.1.0
16 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 |
2 | # Contributor Covenant Code of Conduct
3 |
4 | ## Our Pledge
5 |
6 | We as members, contributors, and leaders pledge to make participation in our
7 | community a harassment-free experience for everyone, regardless of age, body
8 | size, visible or invisible disability, ethnicity, sex characteristics, gender
9 | identity and expression, level of experience, education, socio-economic status,
10 | nationality, personal appearance, race, caste, color, religion, or sexual identity
11 | and orientation.
12 |
13 | We pledge to act and interact in ways that contribute to an open, welcoming,
14 | diverse, inclusive, and healthy community.
15 |
16 | ## Our Standards
17 |
18 | Examples of behavior that contributes to a positive environment for our
19 | community include:
20 |
21 | * Demonstrating empathy and kindness toward other people
22 | * Being respectful of differing opinions, viewpoints, and experiences
23 | * Giving and gracefully accepting constructive feedback
24 | * Accepting responsibility and apologizing to those affected by our mistakes,
25 | and learning from the experience
26 | * Focusing on what is best not just for us as individuals, but for the
27 | overall community
28 |
29 | Examples of unacceptable behavior include:
30 |
31 | * The use of sexualized language or imagery, and sexual attention or
32 | advances of any kind
33 | * Trolling, insulting or derogatory comments, and personal or political attacks
34 | * Public or private harassment
35 | * Publishing others' private information, such as a physical or email
36 | address, without their explicit permission
37 | * Other conduct which could reasonably be considered inappropriate in a
38 | professional setting
39 |
40 | ## Enforcement Responsibilities
41 |
42 | Community leaders are responsible for clarifying and enforcing our standards of
43 | acceptable behavior and will take appropriate and fair corrective action in
44 | response to any behavior that they deem inappropriate, threatening, offensive,
45 | or harmful.
46 |
47 | Community leaders have the right and responsibility to remove, edit, or reject
48 | comments, commits, code, wiki edits, issues, and other contributions that are
49 | not aligned to this Code of Conduct, and will communicate reasons for moderation
50 | decisions when appropriate.
51 |
52 | ## Scope
53 |
54 | This Code of Conduct applies within all community spaces, and also applies when
55 | an individual is officially representing the community in public spaces.
56 | Examples of representing our community include using an official e-mail address,
57 | posting via an official social media account, or acting as an appointed
58 | representative at an online or offline event.
59 |
60 | ## Enforcement
61 |
62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
63 | reported to the community leaders responsible for enforcement at mike@webtranspose.com.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
120 |
121 | Community Impact Guidelines were inspired by
122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
123 |
124 | For answers to common questions about this code of conduct, see the FAQ at
125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available
126 | at [https://www.contributor-covenant.org/translations][translations].
127 |
128 | [homepage]: https://www.contributor-covenant.org
129 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
130 | [Mozilla CoC]: https://github.com/mozilla/diversity
131 | [FAQ]: https://www.contributor-covenant.org/faq
132 | [translations]: https://www.contributor-covenant.org/translations
133 |
--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
1 | Copyright (C) 2023 Vetro Technologies, Inc. (Web Transpose)
2 |
3 | This program is free software: you can redistribute it and/or modify
4 | it under the terms of the GNU Affero General Public License as published by
5 | the Free Software Foundation, either version 3 of the License, or
6 | (at your option) any later version.
7 |
8 | This program is distributed in the hope that it will be useful,
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | GNU Affero General Public License for more details.
12 |
13 | You should have received a copy of the GNU Affero General Public License
14 | along with this program. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | GNU AFFERO GENERAL PUBLIC LICENSE
18 | Version 3, 19 November 2007
19 |
20 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
21 | Everyone is permitted to copy and distribute verbatim copies
22 | of this license document, but changing it is not allowed.
23 |
24 | Preamble
25 |
26 | The GNU Affero General Public License is a free, copyleft license for
27 | software and other kinds of works, specifically designed to ensure
28 | cooperation with the community in the case of network server software.
29 |
30 | The licenses for most software and other practical works are designed
31 | to take away your freedom to share and change the works. By contrast,
32 | our General Public Licenses are intended to guarantee your freedom to
33 | share and change all versions of a program--to make sure it remains free
34 | software for all its users.
35 |
36 | When we speak of free software, we are referring to freedom, not
37 | price. Our General Public Licenses are designed to make sure that you
38 | have the freedom to distribute copies of free software (and charge for
39 | them if you wish), that you receive source code or can get it if you
40 | want it, that you can change the software or use pieces of it in new
41 | free programs, and that you know you can do these things.
42 |
43 | Developers that use our General Public Licenses protect your rights
44 | with two steps: (1) assert copyright on the software, and (2) offer
45 | you this License which gives you legal permission to copy, distribute
46 | and/or modify the software.
47 |
48 | A secondary benefit of defending all users' freedom is that
49 | improvements made in alternate versions of the program, if they
50 | receive widespread use, become available for other developers to
51 | incorporate. Many developers of free software are heartened and
52 | encouraged by the resulting cooperation. However, in the case of
53 | software used on network servers, this result may fail to come about.
54 | The GNU General Public License permits making a modified version and
55 | letting the public access it on a server without ever releasing its
56 | source code to the public.
57 |
58 | The GNU Affero General Public License is designed specifically to
59 | ensure that, in such cases, the modified source code becomes available
60 | to the community. It requires the operator of a network server to
61 | provide the source code of the modified version running there to the
62 | users of that server. Therefore, public use of a modified version, on
63 | a publicly accessible server, gives the public access to the source
64 | code of the modified version.
65 |
66 | An older license, called the Affero General Public License and
67 | published by Affero, was designed to accomplish similar goals. This is
68 | a different license, not a version of the Affero GPL, but Affero has
69 | released a new version of the Affero GPL which permits relicensing under
70 | this license.
71 |
72 | The precise terms and conditions for copying, distribution and
73 | modification follow.
74 |
75 | TERMS AND CONDITIONS
76 |
77 | 0. Definitions.
78 |
79 | "This License" refers to version 3 of the GNU Affero General Public License.
80 |
81 | "Copyright" also means copyright-like laws that apply to other kinds of
82 | works, such as semiconductor masks.
83 |
84 | "The Program" refers to any copyrightable work licensed under this
85 | License. Each licensee is addressed as "you". "Licensees" and
86 | "recipients" may be individuals or organizations.
87 |
88 | To "modify" a work means to copy from or adapt all or part of the work
89 | in a fashion requiring copyright permission, other than the making of an
90 | exact copy. The resulting work is called a "modified version" of the
91 | earlier work or a work "based on" the earlier work.
92 |
93 | A "covered work" means either the unmodified Program or a work based
94 | on the Program.
95 |
96 | To "propagate" a work means to do anything with it that, without
97 | permission, would make you directly or secondarily liable for
98 | infringement under applicable copyright law, except executing it on a
99 | computer or modifying a private copy. Propagation includes copying,
100 | distribution (with or without modification), making available to the
101 | public, and in some countries other activities as well.
102 |
103 | To "convey" a work means any kind of propagation that enables other
104 | parties to make or receive copies. Mere interaction with a user through
105 | a computer network, with no transfer of a copy, is not conveying.
106 |
107 | An interactive user interface displays "Appropriate Legal Notices"
108 | to the extent that it includes a convenient and prominently visible
109 | feature that (1) displays an appropriate copyright notice, and (2)
110 | tells the user that there is no warranty for the work (except to the
111 | extent that warranties are provided), that licensees may convey the
112 | work under this License, and how to view a copy of this License. If
113 | the interface presents a list of user commands or options, such as a
114 | menu, a prominent item in the list meets this criterion.
115 |
116 | 1. Source Code.
117 |
118 | The "source code" for a work means the preferred form of the work
119 | for making modifications to it. "Object code" means any non-source
120 | form of a work.
121 |
122 | A "Standard Interface" means an interface that either is an official
123 | standard defined by a recognized standards body, or, in the case of
124 | interfaces specified for a particular programming language, one that
125 | is widely used among developers working in that language.
126 |
127 | The "System Libraries" of an executable work include anything, other
128 | than the work as a whole, that (a) is included in the normal form of
129 | packaging a Major Component, but which is not part of that Major
130 | Component, and (b) serves only to enable use of the work with that
131 | Major Component, or to implement a Standard Interface for which an
132 | implementation is available to the public in source code form. A
133 | "Major Component", in this context, means a major essential component
134 | (kernel, window system, and so on) of the specific operating system
135 | (if any) on which the executable work runs, or a compiler used to
136 | produce the work, or an object code interpreter used to run it.
137 |
138 | The "Corresponding Source" for a work in object code form means all
139 | the source code needed to generate, install, and (for an executable
140 | work) run the object code and to modify the work, including scripts to
141 | control those activities. However, it does not include the work's
142 | System Libraries, or general-purpose tools or generally available free
143 | programs which are used unmodified in performing those activities but
144 | which are not part of the work. For example, Corresponding Source
145 | includes interface definition files associated with source files for
146 | the work, and the source code for shared libraries and dynamically
147 | linked subprograms that the work is specifically designed to require,
148 | such as by intimate data communication or control flow between those
149 | subprograms and other parts of the work.
150 |
151 | The Corresponding Source need not include anything that users
152 | can regenerate automatically from other parts of the Corresponding
153 | Source.
154 |
155 | The Corresponding Source for a work in source code form is that
156 | same work.
157 |
158 | 2. Basic Permissions.
159 |
160 | All rights granted under this License are granted for the term of
161 | copyright on the Program, and are irrevocable provided the stated
162 | conditions are met. This License explicitly affirms your unlimited
163 | permission to run the unmodified Program. The output from running a
164 | covered work is covered by this License only if the output, given its
165 | content, constitutes a covered work. This License acknowledges your
166 | rights of fair use or other equivalent, as provided by copyright law.
167 |
168 | You may make, run and propagate covered works that you do not
169 | convey, without conditions so long as your license otherwise remains
170 | in force. You may convey covered works to others for the sole purpose
171 | of having them make modifications exclusively for you, or provide you
172 | with facilities for running those works, provided that you comply with
173 | the terms of this License in conveying all material for which you do
174 | not control copyright. Those thus making or running the covered works
175 | for you must do so exclusively on your behalf, under your direction
176 | and control, on terms that prohibit them from making any copies of
177 | your copyrighted material outside their relationship with you.
178 |
179 | Conveying under any other circumstances is permitted solely under
180 | the conditions stated below. Sublicensing is not allowed; section 10
181 | makes it unnecessary.
182 |
183 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
184 |
185 | No covered work shall be deemed part of an effective technological
186 | measure under any applicable law fulfilling obligations under article
187 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
188 | similar laws prohibiting or restricting circumvention of such
189 | measures.
190 |
191 | When you convey a covered work, you waive any legal power to forbid
192 | circumvention of technological measures to the extent such circumvention
193 | is effected by exercising rights under this License with respect to
194 | the covered work, and you disclaim any intention to limit operation or
195 | modification of the work as a means of enforcing, against the work's
196 | users, your or third parties' legal rights to forbid circumvention of
197 | technological measures.
198 |
199 | 4. Conveying Verbatim Copies.
200 |
201 | You may convey verbatim copies of the Program's source code as you
202 | receive it, in any medium, provided that you conspicuously and
203 | appropriately publish on each copy an appropriate copyright notice;
204 | keep intact all notices stating that this License and any
205 | non-permissive terms added in accord with section 7 apply to the code;
206 | keep intact all notices of the absence of any warranty; and give all
207 | recipients a copy of this License along with the Program.
208 |
209 | You may charge any price or no price for each copy that you convey,
210 | and you may offer support or warranty protection for a fee.
211 |
212 | 5. Conveying Modified Source Versions.
213 |
214 | You may convey a work based on the Program, or the modifications to
215 | produce it from the Program, in the form of source code under the
216 | terms of section 4, provided that you also meet all of these conditions:
217 |
218 | a) The work must carry prominent notices stating that you modified
219 | it, and giving a relevant date.
220 |
221 | b) The work must carry prominent notices stating that it is
222 | released under this License and any conditions added under section
223 | 7. This requirement modifies the requirement in section 4 to
224 | "keep intact all notices".
225 |
226 | c) You must license the entire work, as a whole, under this
227 | License to anyone who comes into possession of a copy. This
228 | License will therefore apply, along with any applicable section 7
229 | additional terms, to the whole of the work, and all its parts,
230 | regardless of how they are packaged. This License gives no
231 | permission to license the work in any other way, but it does not
232 | invalidate such permission if you have separately received it.
233 |
234 | d) If the work has interactive user interfaces, each must display
235 | Appropriate Legal Notices; however, if the Program has interactive
236 | interfaces that do not display Appropriate Legal Notices, your
237 | work need not make them do so.
238 |
239 | A compilation of a covered work with other separate and independent
240 | works, which are not by their nature extensions of the covered work,
241 | and which are not combined with it such as to form a larger program,
242 | in or on a volume of a storage or distribution medium, is called an
243 | "aggregate" if the compilation and its resulting copyright are not
244 | used to limit the access or legal rights of the compilation's users
245 | beyond what the individual works permit. Inclusion of a covered work
246 | in an aggregate does not cause this License to apply to the other
247 | parts of the aggregate.
248 |
249 | 6. Conveying Non-Source Forms.
250 |
251 | You may convey a covered work in object code form under the terms
252 | of sections 4 and 5, provided that you also convey the
253 | machine-readable Corresponding Source under the terms of this License,
254 | in one of these ways:
255 |
256 | a) Convey the object code in, or embodied in, a physical product
257 | (including a physical distribution medium), accompanied by the
258 | Corresponding Source fixed on a durable physical medium
259 | customarily used for software interchange.
260 |
261 | b) Convey the object code in, or embodied in, a physical product
262 | (including a physical distribution medium), accompanied by a
263 | written offer, valid for at least three years and valid for as
264 | long as you offer spare parts or customer support for that product
265 | model, to give anyone who possesses the object code either (1) a
266 | copy of the Corresponding Source for all the software in the
267 | product that is covered by this License, on a durable physical
268 | medium customarily used for software interchange, for a price no
269 | more than your reasonable cost of physically performing this
270 | conveying of source, or (2) access to copy the
271 | Corresponding Source from a network server at no charge.
272 |
273 | c) Convey individual copies of the object code with a copy of the
274 | written offer to provide the Corresponding Source. This
275 | alternative is allowed only occasionally and noncommercially, and
276 | only if you received the object code with such an offer, in accord
277 | with subsection 6b.
278 |
279 | d) Convey the object code by offering access from a designated
280 | place (gratis or for a charge), and offer equivalent access to the
281 | Corresponding Source in the same way through the same place at no
282 | further charge. You need not require recipients to copy the
283 | Corresponding Source along with the object code. If the place to
284 | copy the object code is a network server, the Corresponding Source
285 | may be on a different server (operated by you or a third party)
286 | that supports equivalent copying facilities, provided you maintain
287 | clear directions next to the object code saying where to find the
288 | Corresponding Source. Regardless of what server hosts the
289 | Corresponding Source, you remain obligated to ensure that it is
290 | available for as long as needed to satisfy these requirements.
291 |
292 | e) Convey the object code using peer-to-peer transmission, provided
293 | you inform other peers where the object code and Corresponding
294 | Source of the work are being offered to the general public at no
295 | charge under subsection 6d.
296 |
297 | A separable portion of the object code, whose source code is excluded
298 | from the Corresponding Source as a System Library, need not be
299 | included in conveying the object code work.
300 |
301 | A "User Product" is either (1) a "consumer product", which means any
302 | tangible personal property which is normally used for personal, family,
303 | or household purposes, or (2) anything designed or sold for incorporation
304 | into a dwelling. In determining whether a product is a consumer product,
305 | doubtful cases shall be resolved in favor of coverage. For a particular
306 | product received by a particular user, "normally used" refers to a
307 | typical or common use of that class of product, regardless of the status
308 | of the particular user or of the way in which the particular user
309 | actually uses, or expects or is expected to use, the product. A product
310 | is a consumer product regardless of whether the product has substantial
311 | commercial, industrial or non-consumer uses, unless such uses represent
312 | the only significant mode of use of the product.
313 |
314 | "Installation Information" for a User Product means any methods,
315 | procedures, authorization keys, or other information required to install
316 | and execute modified versions of a covered work in that User Product from
317 | a modified version of its Corresponding Source. The information must
318 | suffice to ensure that the continued functioning of the modified object
319 | code is in no case prevented or interfered with solely because
320 | modification has been made.
321 |
322 | If you convey an object code work under this section in, or with, or
323 | specifically for use in, a User Product, and the conveying occurs as
324 | part of a transaction in which the right of possession and use of the
325 | User Product is transferred to the recipient in perpetuity or for a
326 | fixed term (regardless of how the transaction is characterized), the
327 | Corresponding Source conveyed under this section must be accompanied
328 | by the Installation Information. But this requirement does not apply
329 | if neither you nor any third party retains the ability to install
330 | modified object code on the User Product (for example, the work has
331 | been installed in ROM).
332 |
333 | The requirement to provide Installation Information does not include a
334 | requirement to continue to provide support service, warranty, or updates
335 | for a work that has been modified or installed by the recipient, or for
336 | the User Product in which it has been modified or installed. Access to a
337 | network may be denied when the modification itself materially and
338 | adversely affects the operation of the network or violates the rules and
339 | protocols for communication across the network.
340 |
341 | Corresponding Source conveyed, and Installation Information provided,
342 | in accord with this section must be in a format that is publicly
343 | documented (and with an implementation available to the public in
344 | source code form), and must require no special password or key for
345 | unpacking, reading or copying.
346 |
347 | 7. Additional Terms.
348 |
349 | "Additional permissions" are terms that supplement the terms of this
350 | License by making exceptions from one or more of its conditions.
351 | Additional permissions that are applicable to the entire Program shall
352 | be treated as though they were included in this License, to the extent
353 | that they are valid under applicable law. If additional permissions
354 | apply only to part of the Program, that part may be used separately
355 | under those permissions, but the entire Program remains governed by
356 | this License without regard to the additional permissions.
357 |
358 | When you convey a copy of a covered work, you may at your option
359 | remove any additional permissions from that copy, or from any part of
360 | it. (Additional permissions may be written to require their own
361 | removal in certain cases when you modify the work.) You may place
362 | additional permissions on material, added by you to a covered work,
363 | for which you have or can give appropriate copyright permission.
364 |
365 | Notwithstanding any other provision of this License, for material you
366 | add to a covered work, you may (if authorized by the copyright holders of
367 | that material) supplement the terms of this License with terms:
368 |
369 | a) Disclaiming warranty or limiting liability differently from the
370 | terms of sections 15 and 16 of this License; or
371 |
372 | b) Requiring preservation of specified reasonable legal notices or
373 | author attributions in that material or in the Appropriate Legal
374 | Notices displayed by works containing it; or
375 |
376 | c) Prohibiting misrepresentation of the origin of that material, or
377 | requiring that modified versions of such material be marked in
378 | reasonable ways as different from the original version; or
379 |
380 | d) Limiting the use for publicity purposes of names of licensors or
381 | authors of the material; or
382 |
383 | e) Declining to grant rights under trademark law for use of some
384 | trade names, trademarks, or service marks; or
385 |
386 | f) Requiring indemnification of licensors and authors of that
387 | material by anyone who conveys the material (or modified versions of
388 | it) with contractual assumptions of liability to the recipient, for
389 | any liability that these contractual assumptions directly impose on
390 | those licensors and authors.
391 |
392 | All other non-permissive additional terms are considered "further
393 | restrictions" within the meaning of section 10. If the Program as you
394 | received it, or any part of it, contains a notice stating that it is
395 | governed by this License along with a term that is a further
396 | restriction, you may remove that term. If a license document contains
397 | a further restriction but permits relicensing or conveying under this
398 | License, you may add to a covered work material governed by the terms
399 | of that license document, provided that the further restriction does
400 | not survive such relicensing or conveying.
401 |
402 | If you add terms to a covered work in accord with this section, you
403 | must place, in the relevant source files, a statement of the
404 | additional terms that apply to those files, or a notice indicating
405 | where to find the applicable terms.
406 |
407 | Additional terms, permissive or non-permissive, may be stated in the
408 | form of a separately written license, or stated as exceptions;
409 | the above requirements apply either way.
410 |
411 | 8. Termination.
412 |
413 | You may not propagate or modify a covered work except as expressly
414 | provided under this License. Any attempt otherwise to propagate or
415 | modify it is void, and will automatically terminate your rights under
416 | this License (including any patent licenses granted under the third
417 | paragraph of section 11).
418 |
419 | However, if you cease all violation of this License, then your
420 | license from a particular copyright holder is reinstated (a)
421 | provisionally, unless and until the copyright holder explicitly and
422 | finally terminates your license, and (b) permanently, if the copyright
423 | holder fails to notify you of the violation by some reasonable means
424 | prior to 60 days after the cessation.
425 |
426 | Moreover, your license from a particular copyright holder is
427 | reinstated permanently if the copyright holder notifies you of the
428 | violation by some reasonable means, this is the first time you have
429 | received notice of violation of this License (for any work) from that
430 | copyright holder, and you cure the violation prior to 30 days after
431 | your receipt of the notice.
432 |
433 | Termination of your rights under this section does not terminate the
434 | licenses of parties who have received copies or rights from you under
435 | this License. If your rights have been terminated and not permanently
436 | reinstated, you do not qualify to receive new licenses for the same
437 | material under section 10.
438 |
439 | 9. Acceptance Not Required for Having Copies.
440 |
441 | You are not required to accept this License in order to receive or
442 | run a copy of the Program. Ancillary propagation of a covered work
443 | occurring solely as a consequence of using peer-to-peer transmission
444 | to receive a copy likewise does not require acceptance. However,
445 | nothing other than this License grants you permission to propagate or
446 | modify any covered work. These actions infringe copyright if you do
447 | not accept this License. Therefore, by modifying or propagating a
448 | covered work, you indicate your acceptance of this License to do so.
449 |
450 | 10. Automatic Licensing of Downstream Recipients.
451 |
452 | Each time you convey a covered work, the recipient automatically
453 | receives a license from the original licensors, to run, modify and
454 | propagate that work, subject to this License. You are not responsible
455 | for enforcing compliance by third parties with this License.
456 |
457 | An "entity transaction" is a transaction transferring control of an
458 | organization, or substantially all assets of one, or subdividing an
459 | organization, or merging organizations. If propagation of a covered
460 | work results from an entity transaction, each party to that
461 | transaction who receives a copy of the work also receives whatever
462 | licenses to the work the party's predecessor in interest had or could
463 | give under the previous paragraph, plus a right to possession of the
464 | Corresponding Source of the work from the predecessor in interest, if
465 | the predecessor has it or can get it with reasonable efforts.
466 |
467 | You may not impose any further restrictions on the exercise of the
468 | rights granted or affirmed under this License. For example, you may
469 | not impose a license fee, royalty, or other charge for exercise of
470 | rights granted under this License, and you may not initiate litigation
471 | (including a cross-claim or counterclaim in a lawsuit) alleging that
472 | any patent claim is infringed by making, using, selling, offering for
473 | sale, or importing the Program or any portion of it.
474 |
475 | 11. Patents.
476 |
477 | A "contributor" is a copyright holder who authorizes use under this
478 | License of the Program or a work on which the Program is based. The
479 | work thus licensed is called the contributor's "contributor version".
480 |
481 | A contributor's "essential patent claims" are all patent claims
482 | owned or controlled by the contributor, whether already acquired or
483 | hereafter acquired, that would be infringed by some manner, permitted
484 | by this License, of making, using, or selling its contributor version,
485 | but do not include claims that would be infringed only as a
486 | consequence of further modification of the contributor version. For
487 | purposes of this definition, "control" includes the right to grant
488 | patent sublicenses in a manner consistent with the requirements of
489 | this License.
490 |
491 | Each contributor grants you a non-exclusive, worldwide, royalty-free
492 | patent license under the contributor's essential patent claims, to
493 | make, use, sell, offer for sale, import and otherwise run, modify and
494 | propagate the contents of its contributor version.
495 |
496 | In the following three paragraphs, a "patent license" is any express
497 | agreement or commitment, however denominated, not to enforce a patent
498 | (such as an express permission to practice a patent or covenant not to
499 | sue for patent infringement). To "grant" such a patent license to a
500 | party means to make such an agreement or commitment not to enforce a
501 | patent against the party.
502 |
503 | If you convey a covered work, knowingly relying on a patent license,
504 | and the Corresponding Source of the work is not available for anyone
505 | to copy, free of charge and under the terms of this License, through a
506 | publicly available network server or other readily accessible means,
507 | then you must either (1) cause the Corresponding Source to be so
508 | available, or (2) arrange to deprive yourself of the benefit of the
509 | patent license for this particular work, or (3) arrange, in a manner
510 | consistent with the requirements of this License, to extend the patent
511 | license to downstream recipients. "Knowingly relying" means you have
512 | actual knowledge that, but for the patent license, your conveying the
513 | covered work in a country, or your recipient's use of the covered work
514 | in a country, would infringe one or more identifiable patents in that
515 | country that you have reason to believe are valid.
516 |
517 | If, pursuant to or in connection with a single transaction or
518 | arrangement, you convey, or propagate by procuring conveyance of, a
519 | covered work, and grant a patent license to some of the parties
520 | receiving the covered work authorizing them to use, propagate, modify
521 | or convey a specific copy of the covered work, then the patent license
522 | you grant is automatically extended to all recipients of the covered
523 | work and works based on it.
524 |
525 | A patent license is "discriminatory" if it does not include within
526 | the scope of its coverage, prohibits the exercise of, or is
527 | conditioned on the non-exercise of one or more of the rights that are
528 | specifically granted under this License. You may not convey a covered
529 | work if you are a party to an arrangement with a third party that is
530 | in the business of distributing software, under which you make payment
531 | to the third party based on the extent of your activity of conveying
532 | the work, and under which the third party grants, to any of the
533 | parties who would receive the covered work from you, a discriminatory
534 | patent license (a) in connection with copies of the covered work
535 | conveyed by you (or copies made from those copies), or (b) primarily
536 | for and in connection with specific products or compilations that
537 | contain the covered work, unless you entered into that arrangement,
538 | or that patent license was granted, prior to 28 March 2007.
539 |
540 | Nothing in this License shall be construed as excluding or limiting
541 | any implied license or other defenses to infringement that may
542 | otherwise be available to you under applicable patent law.
543 |
544 | 12. No Surrender of Others' Freedom.
545 |
546 | If conditions are imposed on you (whether by court order, agreement or
547 | otherwise) that contradict the conditions of this License, they do not
548 | excuse you from the conditions of this License. If you cannot convey a
549 | covered work so as to satisfy simultaneously your obligations under this
550 | License and any other pertinent obligations, then as a consequence you may
551 | not convey it at all. For example, if you agree to terms that obligate you
552 | to collect a royalty for further conveying from those to whom you convey
553 | the Program, the only way you could satisfy both those terms and this
554 | License would be to refrain entirely from conveying the Program.
555 |
556 | 13. Remote Network Interaction; Use with the GNU General Public License.
557 |
558 | Notwithstanding any other provision of this License, if you modify the
559 | Program, your modified version must prominently offer all users
560 | interacting with it remotely through a computer network (if your version
561 | supports such interaction) an opportunity to receive the Corresponding
562 | Source of your version by providing access to the Corresponding Source
563 | from a network server at no charge, through some standard or customary
564 | means of facilitating copying of software. This Corresponding Source
565 | shall include the Corresponding Source for any work covered by version 3
566 | of the GNU General Public License that is incorporated pursuant to the
567 | following paragraph.
568 |
569 | Notwithstanding any other provision of this License, you have
570 | permission to link or combine any covered work with a work licensed
571 | under version 3 of the GNU General Public License into a single
572 | combined work, and to convey the resulting work. The terms of this
573 | License will continue to apply to the part which is the covered work,
574 | but the work with which it is combined will remain governed by version
575 | 3 of the GNU General Public License.
576 |
577 | 14. Revised Versions of this License.
578 |
579 | The Free Software Foundation may publish revised and/or new versions of
580 | the GNU Affero General Public License from time to time. Such new versions
581 | will be similar in spirit to the present version, but may differ in detail to
582 | address new problems or concerns.
583 |
584 | Each version is given a distinguishing version number. If the
585 | Program specifies that a certain numbered version of the GNU Affero General
586 | Public License "or any later version" applies to it, you have the
587 | option of following the terms and conditions either of that numbered
588 | version or of any later version published by the Free Software
589 | Foundation. If the Program does not specify a version number of the
590 | GNU Affero General Public License, you may choose any version ever published
591 | by the Free Software Foundation.
592 |
593 | If the Program specifies that a proxy can decide which future
594 | versions of the GNU Affero General Public License can be used, that proxy's
595 | public statement of acceptance of a version permanently authorizes you
596 | to choose that version for the Program.
597 |
598 | Later license versions may give you additional or different
599 | permissions. However, no additional obligations are imposed on any
600 | author or copyright holder as a result of your choosing to follow a
601 | later version.
602 |
603 | 15. Disclaimer of Warranty.
604 |
605 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
606 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
607 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
608 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
609 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
610 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
611 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
612 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
613 |
614 | 16. Limitation of Liability.
615 |
616 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
617 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
618 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
619 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
620 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
621 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
622 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
623 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
624 | SUCH DAMAGES.
625 |
626 | 17. Interpretation of Sections 15 and 16.
627 |
628 | If the disclaimer of warranty and limitation of liability provided
629 | above cannot be given local legal effect according to their terms,
630 | reviewing courts shall apply local law that most closely approximates
631 | an absolute waiver of all civil liability in connection with the
632 | Program, unless a warranty or assumption of liability accompanies a
633 | copy of the Program in return for a fee.
634 |
635 | END OF TERMS AND CONDITIONS
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | Web Transpose
4 |
5 | Web Crawler & AI Web Scraper APIs for building new web experiences.
6 |
7 |
8 |
9 | ```bash
10 | pip install webtranspose
11 | ```
12 |
13 |
24 |
25 |
26 |
27 | Introduction ·
28 | Installation ·
29 | Docs
30 |
31 |
32 |
33 | ## Introduction
34 |
35 | In the near future, **nobody will open websites**. Instead, we will be directly served the information we are seeking. New web experiences will combine the information from many websites into a single, unified experience.
36 |
37 | **Web Transpose** is a collection of API tools that make building these new web experiences simple.
38 |
39 | - [Webᵀ Crawl: Distributed Web Crawler](#crawl)
40 | - [Webᵀ Scrape: AI Web Scraper](#scrape)
41 |
42 |
43 | ### Crawl
44 |
45 | ```python
46 | import webtranspose as webt
47 |
48 | import os
49 | os.environ['WEBTRANSPOSE_API_KEY'] = "YOUR WEBT API KEY"
50 |
51 | crawl = webt.Crawl(
52 | "https://www.example.com",
53 | max_pages=100,
54 | render_js=True,
55 | )
56 | await crawl.crawl() # crawl.queue_crawl() for async
57 | ```
58 |
59 | ### Scrape
60 |
61 | ```python
62 | import webtranspose as webt
63 |
64 | import os
65 | os.environ['WEBTRANSPOSE_API_KEY'] = "YOUR WEBT API KEY"
66 |
67 | schema = {
68 | "Merchant Name": "string",
69 | "Title of Product": "string",
70 | "Product Photo URL": "string",
71 | }
72 |
73 | scraper = webt.Scraper(
74 | schema,
75 | render_js=True,
76 | )
77 | out_json = scraper.scrape("https://www.example.com")
78 | ```
79 |
80 | ## Web Search (AI SERP API)
81 |
82 | ```python
83 | import webtranspose as webt
84 |
85 | import os
86 | os.environ['WEBTRANSPOSE_API_KEY'] = "YOUR WEBT API KEY"
87 |
88 | results = webt.search("what caused the fourth great ninja war?")
89 | # results.keys()
90 | # ['results']
91 |
92 | # AI Filter
93 | results = webt.search_filter("Paul Graham's Blog")
94 | # results.keys()
95 | # ['results', 'filtered_results']
96 | ```
97 |
98 |
99 | ## Installation
100 |
101 | Non-Python Users: [📄 API Docs](https://docs.webtranspose.com).
102 |
103 | This repo contains a local **lite** installation of Web Transpose. This is a good option if you want to run Web Transpose locally on your machine for quick use cases.
104 |
105 | ```shell
106 | pip install webtranspose
107 | ```
108 |
109 | However, if you wish to leverage the full tools of Web Transpose and use them in production, you should add your API key to enable the **full** version.
110 |
111 | ```python
112 | os.environ["WEBTRANSPOSE_API_KEY"] = "YOUR_API_KEY_HERE"
113 | ```
114 |
115 |
116 | ## Enterprise Support
117 |
118 | Web Transpose serves enterprises small and large. We partner with companies for the long term with hands-on support and custom solutions.
119 |
120 | Please email me directly at mike@webtranspose.com for enquiries.
121 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 | status:
3 | project:
4 | default:
5 | target: "100"
6 | patch:
7 | default:
8 | target: "100"
9 | comment:
10 | require_changes: true
11 |
--------------------------------------------------------------------------------
/img/web-transpose-cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mike-gee/webtranspose/46077b4400f72d37b983a6a7ec1bb2f0067f9e3f/img/web-transpose-cover.png
--------------------------------------------------------------------------------
/noxfile.py:
--------------------------------------------------------------------------------
1 | """Nox sessions."""
2 | import platform
3 |
4 | import nox
5 | from nox_poetry import Session, session
6 |
7 | nox.options.sessions = ["tests", "mypy"]
8 | python_versions = ["3.8", "3.9", "3.10", "3.11"]
9 |
10 |
11 | @session(python=python_versions)
12 | def tests(session: Session) -> None:
13 | """Run the test suite."""
14 | session.install(".")
15 | session.install("invoke", "pytest", "xdoctest", "coverage[toml]", "pytest-cov")
16 | try:
17 | session.run(
18 | "inv",
19 | "tests",
20 | env={
21 | "COVERAGE_FILE": f".coverage.{platform.system()}.{platform.python_version()}",
22 | },
23 | )
24 | finally:
25 | if session.interactive:
26 | session.notify("coverage")
27 |
28 |
29 | @session
30 | def coverage(session: Session) -> None:
31 | """Produce the coverage report."""
32 | args = session.posargs if session.posargs and len(session._runner.manifest) == 1 else []
33 | session.install("invoke", "coverage[toml]")
34 | session.run("inv", "coverage", *args)
35 |
36 |
37 | @session(python=python_versions)
38 | def mypy(session: Session) -> None:
39 | """Type-check using mypy."""
40 | session.install(".")
41 | session.install("invoke", "mypy")
42 | session.run("inv", "mypy")
43 |
44 |
45 | @session(python="3.11")
46 | def security(session: Session) -> None:
47 | """Scan dependencies for insecure packages."""
48 | session.install("invoke", "safety")
49 | session.run("inv", "security")
50 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 |
2 | [tool.poetry]
3 | name = "webtranspose"
4 | version = "0.3.2"
5 | description = "Reliable APIs for the website data"
6 | authors = ["Mike Gee <mike@webtranspose.com>"]
7 |
8 | readme = "README.md"
9 | homepage = "https://github.com/mike-gee/webtranspose"
10 | repository = "https://github.com/mike-gee/webtranspose"
11 | documentation = "https://docs.webtranspose.com"
12 | keywords = ["webtranspose"]
13 | classifiers=[
14 | "Development Status :: 2 - Pre-Alpha",
15 | "Intended Audience :: Developers",
16 |
17 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
18 |
19 | "Natural Language :: English",
20 | "Programming Language :: Python :: 3",
21 | "Programming Language :: Python :: 3.8",
22 | "Programming Language :: Python :: 3.9",
23 | "Programming Language :: Python :: 3.10",
24 | "Programming Language :: Python :: 3.11",
25 | ]
26 |
27 |
28 | [tool.poetry.urls]
29 | "Bug Tracker" = "https://github.com/mike-gee/webtranspose/issues"
30 |
31 |
32 |
33 |
34 | [tool.poetry.dependencies]
35 | python = "<3.12,>=3.8"
36 | requests = "^2.31.0"
37 | httpx = "^0.25.1"
38 | bs4 = "^0.0.1"
39 | openai = "^1.3.3"
40 | tiktoken = "^0.5.1"
41 | lxml = "^4.9.3"
42 |
43 |
44 | [tool.poetry.group.dev.dependencies]
45 | pre-commit = "^3.3.2"
46 | invoke = "^2.1.2"
47 | bump2version = "^1.0.1"
48 | watchdog = {version = "^3.0.0", extras = ["watchmedo"]}
49 | ipykernel = "^6.25.2"
50 |
51 | [tool.poetry.group.test.dependencies]
52 | pytest = "^7.3.1"
53 | xdoctest = "^1.1.1"
54 | coverage = {version = "^7.2.6", extras = ["toml"]}
55 | pytest-cov = "^4.1.0"
56 |
57 | [tool.poetry.group.format.dependencies]
58 | isort = "^5.12.0"
59 | black = "^23.3.0"
60 |
61 | [tool.poetry.group.linters.dependencies]
62 | flake8 = ">=4.0.1,<5.0.0"
63 | flakeheaven = "^3.3.0"
64 | flake8-builtins = "^2.1.0"
65 | flake8-blind-except = "^0.2.1"
66 | flake8-logging-format = "^0.9.0"
67 | flake8-bugbear = "^23.3.12"
68 | flake8-annotations = "^2.9.1"
69 | flake8-docstrings = "^1.7.0"
70 | flake8-bandit = "^3.0.0"
71 | flake8-broken-line = "^0.6.0"
72 | darglint = "^1.8.1"
73 |
74 | [tool.poetry.group.security.dependencies]
75 | safety = "^2.4.0b1"
76 |
77 | [tool.poetry.group.typing.dependencies]
78 | mypy = "^1.3.0"
79 |
80 | [tool.poetry.group.docs.dependencies]
81 | sphinx = "^7.0.1"
82 | recommonmark = "^0.7.1"
83 |
84 | [tool.coverage.paths]
85 | source = ["src", "*/site-packages"]
86 |
87 | [tool.coverage.run]
88 | branch = true
89 | source = ["webtranspose"]
90 |
91 | [tool.coverage.report]
92 | fail_under = 100
93 | exclude_lines = [
94 | "pragma: no cover",
95 | "def __repr__",
96 | "if self.debug",
97 | "if settings.DEBUG:",
98 | "raise AssertionError",
99 | "raise NotImplementedError",
100 | "if 0:",
101 | "if __name__ == __main__:"
102 | ]
103 | show_missing = true
104 |
105 | [tool.coverage.html]
106 | directory = "htmlcov"
107 |
108 | [tool.flakeheaven]
109 | format = "grouped"
110 | max_line_length = 99
111 | show_source = true
112 | docstring-convention = "google"
113 | extended_default_ignore = []
114 |
115 | [tool.flakeheaven.plugins]
116 | pyflakes = ["+*"]
117 | pycodestyle = ["+*"]
118 | mccabe = ["+*"]
119 | flake8-annotations = ["+*", "-ANN1??", "-ANN401"]
120 | flake8-docstrings = ["+*", "-D212"]
121 | "flake8-*" = ["+*"]
122 | pylint = ["-C????", "-E????", "+F????", "+I????", "-R????", "-W????"]
123 |
124 | [tool.flakeheaven.exceptions."tests/"]
125 | flake8-bandit = ["-S101"]
126 |
127 | [tool.isort]
128 | multi_line_output = 3
129 | include_trailing_comma = true
130 | force_grid_wrap = 0
131 | use_parentheses = true
132 | line_length = 99
133 | known_third_party = ["invoke", "nox", "nox_poetry"]
134 |
135 | [tool.black]
136 | line-length = 99
137 | target-version = ["py38"]
138 |
139 | [tool.mypy]
140 | warn_return_any = true
141 | warn_unused_configs = true
142 |
143 | [[tool.mypy.overrides]]
144 | module = ["pytest.*", "invoke.*", "nox.*", "nox_poetry.*"]
145 | allow_redefinition = false
146 | check_untyped_defs = true
147 | ignore_errors = false
148 | ignore_missing_imports = true
149 | implicit_reexport = true
150 | local_partial_types = true
151 | strict_optional = true
152 | strict_equality = true
153 | no_implicit_optional = true
154 | warn_unused_ignores = true
155 | warn_unreachable = true
156 | warn_no_return = true
157 |
158 | [build-system]
159 | requires = ["poetry>=0.12"]
160 | build-backend = "poetry.masonry.api"
161 |
--------------------------------------------------------------------------------
/src/webtranspose/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level package for webtranspose."""
2 |
3 | __author__ = """Mike Gee"""
4 | __email__ = "mike@webtranspose.com"
5 | __version__ = "0.3.1"
6 |
7 | from .chat import *
8 | from .crawl import *
9 | from .openai import *
10 | from .scrape import *
11 | from .search import *
--------------------------------------------------------------------------------
/src/webtranspose/chat.py:
--------------------------------------------------------------------------------
import logging
import os
from time import sleep
from typing import List, Optional

from .webt_api import run_webt_api
7 |
8 |
class Chatbot:
    """Client-side wrapper around a Web Transpose chatbot."""

    def __init__(
        self,
        url_list: Optional[List[str]] = None,
        name: Optional[str] = None,
        max_pages: int = 100,
        api_key: Optional[str] = None,
        verbose: bool = False,
        chatbot_id: Optional[str] = None,
        _created: bool = False,
    ) -> None:
        """
        Initialize a Chatbot instance.

        :param url_list: A list of URLs to crawl.
        :param name: The name of the chatbot.
        :param max_pages: The maximum number of pages to crawl.
        :param api_key: The API key for accessing the Web Transpose API.
        :param verbose: Whether to enable verbose logging.
        :param chatbot_id: The ID of an existing chatbot.
        :param _created: Whether the chatbot has already been created.
        :raises ValueError: If no API key is given or found in the environment.
        """
        self.api_key = api_key
        if self.api_key is None:
            self.api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

        if self.api_key is None:
            raise ValueError(
                "No Web Transpose API provided. \n\nTo use Chatbots, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com."
            )

        # Bug fix: `url_list: List[str] = []` was a shared mutable default;
        # default to None and build a fresh list per instance instead.
        self.url_list = [] if url_list is None else url_list
        self.name = name
        self.max_pages = max_pages
        self.verbose = verbose
        self.chatbot_id = chatbot_id
        self.created = _created

        if not self.chatbot_id:
            self.create()

    def create(self) -> None:
        """Create the chatbot remotely and block until creation completes."""
        if not self.chatbot_id:
            self._create_chat()
            status = self.status()
            # Poll every 5 seconds until the backend reports completion.
            while status["status"] != "complete":
                if self.verbose:
                    logging.info("Waiting for chat to be created...")
                sleep(5)
                status = self.status()
        else:
            logging.info("Chat already created.")

    def queue_create(self) -> None:
        """Queue chatbot creation without waiting for it to finish."""
        if not self.chatbot_id:
            self._create_chat()
        else:
            logging.info("Chat already created.")

    def _create_chat(self) -> None:
        """Issue the remote create call and record the new chatbot ID."""
        if self.verbose:
            logging.info("Creating chat...")

        if self.chatbot_id is None:
            create_json = {
                "name": self.name,
                "max_pages": self.max_pages,
                "url_list": self.url_list,
            }
            out_json = run_webt_api(create_json, "v1/chat/create", self.api_key)
            self.chatbot_id = out_json["chatbot_id"]

    def query_database(self, query: str, num_records: int = 3) -> list:
        """
        Query the database of the chatbot.

        :param query: The query string.
        :param num_records: The number of records to return.
        :return: The query results.
        """
        if self.verbose:
            logging.info("Querying database...")

        if not self.chatbot_id:
            self.create()

        query_json = {
            "chatbot_id": self.chatbot_id,
            "query": query,
            "num_records": num_records,
        }
        out = run_webt_api(query_json, "v1/chat/database/query", self.api_key)
        return out["results"]

    def status(self) -> dict:
        """
        Get the status of the chatbot.

        :return: The chatbot status payload.
        """
        if self.verbose:
            logging.info("Getting chat...")

        if not self.chatbot_id:
            self.create()

        get_json = {
            "chatbot_id": self.chatbot_id,
        }
        out = run_webt_api(get_json, "v1/chat/get", self.api_key)
        return out["chatbot"]

    def add_urls(self, url_list: list) -> None:
        """
        Add URLs to the chatbot.

        :param url_list: A list of URLs to add.
        """
        if self.verbose:
            logging.info("Querying database...")

        if not self.chatbot_id:
            self.create()

        query_json = {
            "chatbot_id": self.chatbot_id,
            "max_pages": self.max_pages,
            "url_list": url_list,
        }
        run_webt_api(query_json, "v1/chat/urls/add", self.api_key)

    def delete_crawls(self, crawl_id_list: list) -> None:
        """
        Delete crawls from the chatbot.

        :param crawl_id_list: A list of crawl IDs to delete.
        """
        if self.verbose:
            logging.info("Querying database...")

        if not self.chatbot_id:
            self.create()

        query_json = {
            "chatbot_id": self.chatbot_id,
            "crawl_id_list": crawl_id_list,
        }
        run_webt_api(query_json, "v1/chat/crawls/delete", self.api_key)
167 |
168 |
def get_chatbot(chatbot_id: str, api_key: Optional[str] = None) -> Chatbot:
    """
    Fetch an existing chatbot from Web Transpose and wrap it in a Chatbot.

    :param chatbot_id: The ID of the chatbot.
    :param api_key: The API key; falls back to the WEBTRANSPOSE_API_KEY env var.
    :return: The chatbot.
    :raises ValueError: If no API key can be resolved.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
    if api_key is None:
        raise ValueError(
            "No Web Transpose API provided. \n\nTo use Chatbots, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com."
        )
    get_json = {
        "chatbot_id": chatbot_id,
    }
    chat_json = run_webt_api(get_json, "v1/chat/get", api_key)
    chatbot_data = chat_json.get('chatbot', {})
    # Bug fix: propagate the resolved api_key; previously Chatbot re-read the
    # environment and raised when only the explicit argument was provided.
    chatbot = Chatbot(
        chatbot_id=chatbot_data.get('id'),
        name=chatbot_data.get('name'),
        max_pages=chatbot_data.get('num_run', 100),
        api_key=api_key,
        verbose=False,
        _created=True
    )
    return chatbot
--------------------------------------------------------------------------------
/src/webtranspose/crawl.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | import logging
4 | import os
5 | import shutil
6 | import tempfile
7 | import urllib.parse
8 | import uuid
9 | import zipfile
10 | from datetime import datetime
11 | from fnmatch import fnmatch
12 | from typing import Dict, List, Optional, Set
13 | from urllib.parse import urljoin, urlparse, urlunparse
14 |
15 | import httpx
16 | from bs4 import BeautifulSoup
17 |
18 | from .webt_api import run_webt_api
19 |
20 |
21 | class Crawl:
22 | def __init__(
23 | self,
24 | url: str,
25 | allowed_urls: List[str] = [],
26 | banned_urls: List[str] = [],
27 | n_workers: int = 1,
28 | max_pages: int = 15,
29 | render_js: bool = False,
30 | output_dir: str = "webtranspose-out",
31 | verbose: bool = False,
32 | api_key: Optional[str] = None,
33 | _created: bool = False,
34 | ) -> None:
35 | """
36 | Initialize the Crawl object.
37 |
38 | :param url: The base URL to start crawling from.
39 | :param allowed_urls: A list of allowed URLs to crawl.
40 | :param banned_urls: A list of banned URLs to exclude from crawling.
41 | :param n_workers: The number of worker tasks to use for crawling.
42 | :param max_pages: The maximum number of pages to crawl.
43 | :param render_js: Whether to render JavaScript on crawled pages.
44 | :param output_dir: The directory to store the crawled data.
45 | :param verbose: Whether to print verbose logging messages.
46 | :param api_key: The API key to use for webt_api calls.
47 | """
48 | self.api_key = api_key
49 | if self.api_key is None:
50 | self.api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
51 |
52 | self.base_url = url
53 | self.allowed_urls = allowed_urls
54 | self.banned_urls = banned_urls
55 | self.max_pages = max_pages
56 | self.queue = asyncio.Queue()
57 | self.queue.put_nowait(
58 | {
59 | "url": self.base_url,
60 | "parent_urls": [],
61 | }
62 | )
63 | self.output_dir = output_dir
64 | self.visited_urls = {}
65 | self.failed_urls = set()
66 | self.ignored_urls = set()
67 | self.n_workers = n_workers
68 | if not os.path.exists(self.output_dir):
69 | os.makedirs(self.output_dir)
70 | self.created = _created
71 | self.render_js = render_js
72 | self.crawl_id = None
73 | if self.api_key is None:
74 | self.crawl_id = str(uuid.uuid4())
75 | self.verbose = verbose
76 |
77 | api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
78 | if api_key is None and self.api_key is None:
79 | logging.warning(
80 | "No Web Transpose API provided. Lite version in use...\n\nTo run your Web Crawl on the Web Transpose API, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com. Run cheaper with logging and advanced analytics."
81 | )
82 |
83 | @staticmethod
84 | async def crawl_worker(
85 | name: str,
86 | queue: asyncio.Queue,
87 | crawl_id: str,
88 | visited_urls: Dict[str, str],
89 | allowed_urls: List[str],
90 | failed_urls: Set[str],
91 | banned_urls: List[str],
92 | output_dir: str,
93 | base_url: str,
94 | max_pages: int,
95 | leftover_queue: asyncio.Queue,
96 | ignored_queue: asyncio.Queue,
97 | verbose: bool,
98 | ) -> None:
99 | """
100 | Worker function for crawling URLs.
101 |
102 | :param name: The name of the worker.
103 | :param queue: The queue of URLs to crawl.
104 | :param crawl_id: The ID of the crawl.
105 | :param visited_urls: A dictionary of visited URLs and their file paths.
106 | :param allowed_urls: A list of allowed URLs to crawl.
107 | :param banned_urls: A list of banned URLs to exclude from crawling.
108 | :param output_dir: The directory to store the crawled data.
109 | :param base_url: The base URL of the crawl.
110 | :param max_pages: The maximum number of pages to crawl.
111 | :param leftover_queue: The queue for leftover URLs.
112 | :param ignored_queue: The queue for ignored URLs.
113 | :param verbose: Whether to print verbose logging messages.
114 | """
115 |
116 | def _lint_url(url: str) -> str:
117 | """
118 | Lint the given URL by removing the fragment component.
119 |
120 | :param url: The URL to lint.
121 | :return: The linted URL.
122 | """
123 | parsed_url = urlparse(url)
124 | cleaned_url = parsed_url._replace(fragment="")
125 | return urlunparse(cleaned_url)
126 |
127 | if verbose:
128 | logging.info(f"{name}: Starting crawl of {base_url}")
129 | while max_pages is None or len(visited_urls) < max_pages or not queue.empty():
130 | curr_url_data = await queue.get()
131 | curr_url = curr_url_data["url"]
132 | parent_urls = curr_url_data["parent_urls"]
133 | base_url_netloc = urlparse(base_url).netloc
134 | if (
135 | (
136 | (
137 | urlparse(curr_url).netloc == base_url_netloc
138 | and not any(fnmatch(curr_url, banned) for banned in banned_urls)
139 | )
140 | or any(fnmatch(curr_url, allowed) for allowed in allowed_urls)
141 | )
142 | and curr_url not in visited_urls
143 | and len(visited_urls) < max_pages
144 | ):
145 | base_dir = os.path.join(output_dir, base_url_netloc)
146 | if not os.path.exists(base_dir):
147 | os.makedirs(base_dir)
148 | filename = urllib.parse.quote_plus(curr_url).replace("/", "_")
149 | filepath = os.path.join(base_dir, filename) + ".json"
150 | async with httpx.AsyncClient() as client:
151 | try:
152 | page = await client.get(curr_url)
153 | except:
154 | failed_urls.add(curr_url)
155 | queue.task_done()
156 | continue
157 |
158 | page_title = None
159 | page_html = None
160 | page_text = None
161 | try:
162 | page_type = "html"
163 | soup = BeautifulSoup(page.content, "lxml")
164 | page_title = soup.title.string if soup.title else ""
165 | page_html = page.content.decode("utf-8")
166 | page_text = soup.get_text()
167 | child_urls = list(
168 | set(
169 | [
170 | _lint_url(urljoin(base_url, link.get("href")))
171 | for link in soup.find_all(href=True)
172 | ]
173 | )
174 | )
175 | for url in child_urls:
176 | if url.startswith("http"):
177 | queue.put_nowait(
178 | {
179 | "url": url,
180 | "parent_urls": parent_urls + [curr_url],
181 | }
182 | )
183 | except:
184 | child_urls = []
185 | page_type = "other"
186 |
187 | visited_urls[curr_url] = filepath
188 | data = {
189 | "crawl_id": crawl_id,
190 | "url": curr_url,
191 | "type": page_type,
192 | "title": page_title,
193 | "date": datetime.now().isoformat(),
194 | "parent_urls": parent_urls,
195 | "child_urls": child_urls,
196 | "html": page_html,
197 | "text": page_text,
198 | }
199 | with open(filepath, "w") as f:
200 | json.dump(data, f)
201 |
202 | elif curr_url not in visited_urls and (
203 | urlparse(curr_url).netloc == urlparse(base_url).netloc
204 | or any(fnmatch(curr_url, allowed) for allowed in allowed_urls)
205 | ):
206 | leftover_queue.put_nowait(
207 | {
208 | "url": curr_url,
209 | "parent_urls": parent_urls,
210 | }
211 | )
212 |
213 | else:
214 | ignored_queue.put_nowait(curr_url)
215 |
216 | queue.task_done()
217 |
218 | def create_crawl_api(self):
219 | """
220 | Creates a Crawl on https://webtranspose.com
221 | """
222 | if self.verbose:
223 | logging.info(f"Creating crawl of {self.base_url} on Web Transpose...")
224 | create_json = {
225 | "url": self.base_url,
226 | "render_js": self.render_js,
227 | "max_pages": self.max_pages,
228 | "allowed_urls": self.allowed_urls,
229 | "banned_urls": self.banned_urls,
230 | }
231 | out_json = run_webt_api(
232 | create_json,
233 | "v1/crawl/create",
234 | self.api_key,
235 | )
236 | self.crawl_id = out_json["crawl_id"]
237 | self.created = True
238 |
239 | def queue_crawl(self):
240 | """
241 | Resume crawling of Crawl object. Don't wait for it to finish crawling.
242 | """
243 | if self.verbose:
244 | logging.info(f"Starting crawl of {self.base_url} on Web Transpose...")
245 |
246 | if self.api_key is None:
247 | logging.error("Cannot queue a local crawl. Please use the crawl() method.")
248 |
249 | else:
250 | if not self.created:
251 | self.create_crawl_api()
252 | queue_json = {
253 | "crawl_id": self.crawl_id,
254 | }
255 | out = run_webt_api(
256 | queue_json,
257 | "v1/crawl/resume",
258 | self.api_key,
259 | )
260 |
    async def crawl(self):
        """
        Resume crawling of Crawl object.

        Local (lite) crawls run worker tasks in-process until the queue drains;
        cloud crawls are queued on the Web Transpose API and polled until done.

        :return: This Crawl object (cloud mode); local mode returns None.
        """
        if self.verbose:
            logging.info(f"Starting crawl of {self.base_url}...")
        if self.api_key is None:
            leftover_queue = asyncio.Queue()
            ignored_queue = asyncio.Queue()
            tasks = []
            for i in range(self.n_workers):
                task = asyncio.create_task(
                    self.crawl_worker(
                        f"worker-{i}",
                        self.queue,
                        self.crawl_id,
                        self.visited_urls,
                        self.allowed_urls,
                        self.failed_urls,
                        self.banned_urls,
                        self.output_dir,
                        self.base_url,
                        self.max_pages,
                        leftover_queue,
                        ignored_queue,
                        self.verbose,
                    )
                )
                tasks.append(task)

            # Wait until every queued URL is processed, then stop the workers.
            await self.queue.join()
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            # URLs skipped due to the page budget become the queue for a resume.
            self.queue = leftover_queue
            # NOTE(review): reads asyncio.Queue internals (_queue) — fragile.
            self.ignored_urls = list(ignored_queue._queue)
            self.to_metadata()
        else:
            self.queue_crawl()
            status = self.status()
            # Poll until the remote crawl shows any progress at all.
            while status["num_queued"] + status["num_visited"] + status["num_ignored"] == 0:
                await asyncio.sleep(5)
                status = self.status()

            # NOTE(review): this condition can never hold — the loop above only
            # exits once the same sum is non-zero. Likely meant to check
            # visited/queued only; confirm intended behavior.
            if (status["num_failed"] > 0) and (
                status["num_queued"] + status["num_visited"] + status["num_ignored"] == 0
            ):
                raise Exception("The first page crawled failed")

            # Poll until the queue drains or the page budget is reached.
            while status["num_queued"] > 0 and status["num_visited"] < status["max_pages"]:
                await asyncio.sleep(5)
                status = self.status()
            return self
314 |
    def get_queued(self, max_pages: int = 30) -> list:
        """
        Get a list of URLs from the queue.

        Args:
            max_pages (int): The number of URLs to retrieve from the queue. Defaults to 30.

        Returns:
            list: A list of URLs from the queue.
        """
        if self.api_key is None:
            urls = []
            # Drain up to max_pages items, then push them back afterwards.
            for _ in range(max_pages):
                try:
                    url = self.queue.get_nowait()
                    urls.append(url)
                except asyncio.QueueEmpty:
                    break
            # NOTE(review): re-appending puts these items behind any remaining
            # entries, rotating queue order whenever it holds > max_pages items.
            for url in urls:
                self.queue.put_nowait(url)
            return urls
        else:
            if not self.created:
                # Nothing queued remotely yet; only the base URL is pending.
                return [self.base_url]
            queue_json = {
                "crawl_id": self.crawl_id,
                "max_pages": max_pages,
            }
            out_json = run_webt_api(
                queue_json,
                "v1/crawl/get-queue",
                self.api_key,
            )
            return out_json["urls"]
349 |
350 | def set_allowed_urls(self, allowed_urls: list) -> "Crawl":
351 | """
352 | Set the allowed URLs for the crawl.
353 |
354 | Args:
355 | allowed_urls (list): A list of allowed URLs.
356 |
357 | Returns:
358 | self: The Crawl object.
359 | """
360 | self.allowed_urls = allowed_urls
361 | if not self.created:
362 | self.to_metadata()
363 | else:
364 | update_json = {
365 | "crawl_id": self.crawl_id,
366 | "allowed_urls": allowed_urls,
367 | }
368 | run_webt_api(
369 | update_json,
370 | "v1/crawl/set-allowed",
371 | self.api_key,
372 | )
373 | return self
374 |
375 | def set_banned_urls(self, banned_urls: list) -> "Crawl":
376 | """
377 | Set the banned URLs for the crawl.
378 |
379 | Args:
380 | banned_urls (list): A list of banned URLs.
381 |
382 | Returns:
383 | self: The Crawl object.
384 | """
385 | self.banned_urls = banned_urls
386 | if not self.created:
387 | self.to_metadata()
388 | else:
389 | update_json = {
390 | "crawl_id": self.crawl_id,
391 | "banned_urls": banned_urls,
392 | }
393 | run_webt_api(
394 | update_json,
395 | "v1/crawl/set-banned",
396 | self.api_key,
397 | )
398 | return self
399 |
400 | def get_filename(self, url: str) -> str:
401 | """
402 | Get the filename associated with a visited URL.
403 |
404 | Args:
405 | url (str): The visited URL.
406 |
407 | Returns:
408 | str: The filename associated with the visited URL.
409 |
410 | Raises:
411 | ValueError: If the URL is not found in the visited URLs.
412 | """
413 | try:
414 | return self.visited_urls[url]
415 | except KeyError:
416 | raise ValueError(f"URL {url} not found in visited URLs")
417 |
418 | def set_max_pages(self, max_pages: int) -> "Crawl":
419 | """
420 | Set the maximum number of pages to crawl.
421 |
422 | Args:
423 | max_pages (int): The maximum number of pages to crawl.
424 |
425 | Returns:
426 | self: The Crawl object.
427 | """
428 | if not self.created:
429 | self.max_pages = max_pages
430 | self.to_metadata()
431 | else:
432 | max_pages_json = {
433 | "crawl_id": self.crawl_id,
434 | "max_pages": max_pages,
435 | }
436 | run_webt_api(
437 | max_pages_json,
438 | "v1/crawl/set-max-pages",
439 | self.api_key,
440 | )
441 | return self
442 |
443 | def status(self) -> dict:
444 | """
445 | Get the status of the Crawl object.
446 |
447 | Returns:
448 | dict: The status of the Crawl object.
449 | """
450 | if not self.created:
451 | status_json = {
452 | "crawl_id": self.crawl_id,
453 | "loc": "local" if self.api_key is None else "cloud",
454 | "base_url": self.base_url,
455 | "max_pages": self.max_pages,
456 | "num_visited": len(self.visited_urls),
457 | "num_ignored": len(self.ignored_urls),
458 | "num_failed": len(self.failed_urls),
459 | "num_queued": self.queue.qsize(),
460 | "banned_urls": self.banned_urls,
461 | "allowed_urls": self.allowed_urls,
462 | }
463 | status_json["n_workers"] = self.n_workers
464 | return status_json
465 |
466 | status_json = {
467 | "crawl_id": self.crawl_id,
468 | }
469 | crawl_status = run_webt_api(
470 | status_json,
471 | "v1/crawl/get",
472 | self.api_key,
473 | )
474 | crawl_status["loc"] = "cloud"
475 | if self.verbose:
476 | logging.info(f"Status of crawl {self.crawl_id}: {crawl_status}")
477 | return crawl_status
478 |
479 | def get_ignored(self) -> list:
480 | """
481 | Get a list of ignored URLs.
482 |
483 | Returns:
484 | list: A list of ignored URLs.
485 | """
486 | if not self.created:
487 | return list(self.ignored_urls)
488 |
489 | ignored_json = {
490 | "crawl_id": self.crawl_id,
491 | }
492 | out_json = run_webt_api(
493 | ignored_json,
494 | "v1/crawl/get/ignored",
495 | self.api_key,
496 | )
497 | return out_json["pages"]
498 |
499 | def get_failed(self) -> list:
500 | """
501 | Get a list of failed URLs.
502 |
503 | Returns:
504 | list: A list of failed URLs.
505 | """
506 | if not self.created:
507 | return list(self.failed_urls)
508 |
509 | visited_json = {
510 | "crawl_id": self.crawl_id,
511 | }
512 | out_json = run_webt_api(
513 | visited_json,
514 | "v1/crawl/get/failed",
515 | self.api_key,
516 | )
517 | return out_json["pages"]
518 |
519 | def get_visited(self) -> list:
520 | """
521 | Get a list of visited URLs.
522 |
523 | Returns:
524 | list: A list of visited URLs.
525 | """
526 | if not self.created:
527 | return list(self.visited_urls)
528 |
529 | visited_json = {
530 | "crawl_id": self.crawl_id,
531 | }
532 | out_json = run_webt_api(
533 | visited_json,
534 | "v1/crawl/get/visited",
535 | self.api_key,
536 | )
537 | return out_json["pages"]
538 |
539 | def get_banned(self) -> list:
540 | """
541 | Get a list of banned URLs.
542 |
543 | Returns:
544 | list: A list of banned URLs.
545 | """
546 | if not self.created:
547 | return list(self.banned_urls)
548 |
549 | banned_json = {
550 | "crawl_id": self.crawl_id,
551 | }
552 | out_json = run_webt_api(
553 | banned_json,
554 | "v1/crawl/get/banned",
555 | self.api_key,
556 | )
557 | return out_json["pages"]
558 |
    def download(self):
        """
        Download the output of the crawl.

        For cloud crawls, fetches the zipped output via a presigned URL and
        unpacks each page JSON into ``output_dir/<netloc>/`` using the same
        quote_plus filename scheme as local crawls.
        """
        if self.verbose:
            logging.info(f"Downloading crawl of {self.base_url}...")

        if self.created:
            download_json = {
                "crawl_id": self.crawl_id,
            }
            out_json = run_webt_api(
                download_json,
                "v1/crawl/download",
                self.api_key,
            )
            # The API returns a short-lived presigned URL for the zipped crawl.
            presigned_url = out_json["url"]
            with tempfile.TemporaryDirectory() as tmpdir:
                zip_file_path = os.path.join(tmpdir, "temp.zip")
                with open(zip_file_path, "wb") as f:
                    response = httpx.get(presigned_url)
                    f.write(response.content)

                with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
                    zip_ref.extractall(tmpdir)

                # Re-home every extracted page JSON under output_dir/<netloc>/.
                for root, _, files in os.walk(tmpdir):
                    for file in files:
                        if file.endswith(".json"):
                            json_file = os.path.join(root, file)
                            with open(json_file, "r") as f:
                                data = json.load(f)
                            url = data["url"]
                            base_url_netloc = urlparse(self.base_url).netloc
                            base_dir = os.path.join(self.output_dir, base_url_netloc)
                            if not os.path.exists(base_dir):
                                os.makedirs(base_dir)
                            filename = urllib.parse.quote_plus(url).replace("/", "_")
                            filepath = os.path.join(base_dir, filename) + ".json"
                            shutil.move(json_file, filepath)

        logging.info(f"The output of the crawl can be found at: {self.output_dir}")
601 |
602 | def to_metadata(self) -> None:
603 | """
604 | Save the metadata of the Crawl object to a file.
605 | """
606 | if not self.created:
607 | filename = os.path.join(self.output_dir, f"{self.crawl_id}.json")
608 | metadata = {
609 | "crawl_id": self.crawl_id,
610 | "n_workers": self.n_workers,
611 | "base_url": self.base_url,
612 | "max_pages": self.max_pages,
613 | "visited_urls": self.visited_urls,
614 | "ignored_urls": list(self.ignored_urls),
615 | "render_js": self.render_js,
616 | "queue": list(self.queue._queue),
617 | "banned_urls": self.banned_urls,
618 | "allowed_urls": self.allowed_urls,
619 | "output_dir": self.output_dir,
620 | }
621 | with open(filename, "w") as file:
622 | json.dump(metadata, file)
623 |
624 | @staticmethod
625 | def from_metadata(crawl_id: str, output_dir: str = "webtranspose-out") -> "Crawl":
626 | """
627 | Create a Crawl object from metadata stored in a file.
628 |
629 | Args:
630 | crawl_id (str): The ID of the crawl.
631 | output_dir (str, optional): The directory to store the crawled data. Defaults to "webtranspose-out".
632 |
633 | Returns:
634 | Crawl: The Crawl object.
635 | """
636 | filename = os.path.join(output_dir, f"{crawl_id}.json")
637 | with open(filename, "r") as file:
638 | metadata = json.load(file)
639 | crawl = Crawl(
640 | metadata["base_url"],
641 | metadata["allowed_urls"],
642 | metadata["banned_urls"],
643 | metadata["n_workers"],
644 | metadata["max_pages"],
645 | render_js=metadata["render_js"],
646 | output_dir=metadata["output_dir"],
647 | )
648 | crawl.crawl_id = metadata["crawl_id"]
649 | crawl.visited_urls = metadata["visited_urls"]
650 | crawl.ignored_urls = set(metadata["ignored_urls"])
651 | crawl.queue = asyncio.Queue()
652 | for url in metadata["queue"]:
653 | crawl.queue.put_nowait(url)
654 | return crawl
655 |
656 | @staticmethod
657 | def from_cloud(crawl_id: str, api_key: Optional[str] = None) -> "Crawl":
658 | """
659 | Create a Crawl object from metadata stored in the cloud.
660 |
661 | Args:
662 | crawl_id (str): The ID of the crawl.
663 | api_key (str, optional): The API key for accessing the cloud. Defaults to None.
664 |
665 | Returns:
666 | Crawl: The Crawl object.
667 | """
668 | if api_key is None:
669 | api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
670 |
671 | if api_key is not None:
672 | get_json = {
673 | "crawl_id": crawl_id,
674 | }
675 | out_json = run_webt_api(get_json, "v1/crawl/get", api_key)
676 | crawl = Crawl(
677 | out_json["base_url"],
678 | out_json["allowed_urls"],
679 | out_json["banned_urls"],
680 | max_pages=out_json["max_pages"],
681 | render_js=out_json["render_js"],
682 | api_key=api_key,
683 | _created=True,
684 | )
685 | crawl.crawl_id = out_json["crawl_id"]
686 | return crawl
687 |
688 | raise ValueError(
689 | "API key not found. Please set WEBTRANSPOSE_API_KEY environment variable or pass api_key argument."
690 | )
691 |
692 | def __str__(self) -> str:
693 | """
694 | Get a string representation of the Crawl object.
695 |
696 | Returns:
697 | str: The string representation of the Crawl object.
698 | """
699 | status = self.status()
700 | return (
701 | f"WebTransposeCrawl(\n"
702 | f" Crawl ID: {status['crawl_id']}\n"
703 | f" Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n"
704 | f" Base URL: {status['base_url']}\n"
705 | f" Max Pages: {status['max_pages']}\n"
706 | f" Number of Visited URLs: {status['num_visited']}\n"
707 | f" Number of Ignored URLs: {status['num_ignored']}\n"
708 | f" Number of Queued URLs: {status['num_queued']}\n"
709 | f" Number of Failed URLs: {status['num_failed']}\n"
710 | f" Banned URLs: {status['banned_urls']}\n"
711 | f" Allowed URLs: {status['allowed_urls']}"
712 | f")"
713 | )
714 |
715 | def __repr__(self) -> str:
716 | """
717 | Get a string representation of the Crawl object.
718 |
719 | Returns:
720 | str: The string representation of the Crawl object.
721 | """
722 | status = self.status()
723 | return (
724 | f"WebTransposeCrawl(\n"
725 | f" Crawl ID: {status['crawl_id']}\n"
726 | f" Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n"
727 | f" Base URL: {status['base_url']}\n"
728 | f" Max Pages: {status['max_pages']}\n"
729 | f" Number of Visited URLs: {status['num_visited']}\n"
730 | f" Number of Ignored URLs: {status['num_ignored']}\n"
731 | f" Number of Queued URLs: {status['num_queued']}\n"
732 | f" Number of Failed URLs: {status['num_failed']}\n"
733 | f" Banned URLs: {status['banned_urls']}\n"
734 | f" Allowed URLs: {status['allowed_urls']}"
735 | f")"
736 | )
737 |
def get_page(self, url: str) -> dict:
    """
    Get the page data for a given URL.

    Args:
        url (str): The URL of the page.

    Returns:
        dict: The page data, or None if the page cannot be found locally.
    """
    if not self.created:
        # Local crawl: page data is stored as a JSON file whose path is
        # recorded in self.visited_urls.
        try:
            fn = self.visited_urls[url]
            with open(fn, "r") as f:
                return json.load(f)
        except (KeyError, OSError, json.JSONDecodeError):
            # KeyError: URL was never visited; OSError: file missing or
            # unreadable; JSONDecodeError: corrupt metadata file.
            # (Previously a bare `except` that also hid KeyboardInterrupt,
            # and the visited_urls lookup could raise uncaught.)
            logging.error(f"Could not find HTML for URL {url}")
            return None
    else:
        get_json = {
            "crawl_id": self.crawl_id,
            "url": url,
        }
        out_json = run_webt_api(
            get_json,
            "v1/crawl/get-page",
            self.api_key,
        )
        return out_json
767 |
def get_child_urls(self, url: str) -> list:
    """
    Get the child URLs for a given URL.

    Args:
        url (str): The URL.

    Returns:
        list: A list of child URLs, or None if unavailable locally.
    """
    if not self.created:
        # Local crawl: the page's JSON file carries a "child_urls" list.
        try:
            fn = self.visited_urls[url]
            with open(fn, "r") as f:
                data = json.load(f)
            return data["child_urls"]
        except (KeyError, OSError, json.JSONDecodeError):
            # KeyError: URL never visited or "child_urls" absent; OSError:
            # file missing/unreadable; JSONDecodeError: corrupt file.
            # (Previously two bare `except` blocks with a duplicated message.)
            logging.error(f"Could not find child URLs for URL {url}")
            return None
    else:
        get_json = {
            "crawl_id": self.crawl_id,
            "url": url,
        }
        out_json = run_webt_api(
            get_json,
            "v1/crawl/get-child-urls",
            self.api_key,
        )
        return out_json
801 |
def retry_failed_urls(self) -> None:
    """
    Queue failed URLs from this crawl for another attempt.

    Logs an error for crawls that were never created; does nothing when no
    API key is available.
    """
    if not self.created:
        logging.error("Cannot retry failed URLs for un-created crawl.")
        return
    if self.api_key is None:
        return
    run_webt_api(
        {"crawl_id": self.crawl_id},
        "v1/crawl/retry-failed",
        self.api_key,
    )
817 |
818 |
def get_crawl(crawl_id: str, api_key: Optional[str] = None) -> Crawl:
    """
    Get a Crawl object based on the crawl ID.

    Tries the local on-disk metadata first and falls back to the cloud copy.

    Args:
        crawl_id (str): The ID of the crawl.
        api_key (str, optional): The API key. Defaults to None.

    Returns:
        Crawl: The Crawl object.
    """
    try:
        crawl = Crawl.from_metadata(crawl_id)
    except FileNotFoundError:
        crawl = Crawl.from_cloud(crawl_id, api_key=api_key)
    return crawl
834 |
835 |
def list_crawls(loc: str = "cloud", api_key: Optional[str] = None) -> list:
    """
    List all available crawls.

    Args:
        loc (str, optional): Where to look: 'cloud' or 'local'. Defaults to 'cloud'.
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Returns:
        list: Crawl metadata dicts (cloud) or Crawl objects (local).

    Raises:
        ValueError: If loc is neither 'cloud' nor 'local'.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    if api_key is not None and loc == "cloud":
        crawl_list_data = run_webt_api(
            {},
            "v1/crawl/list",
            api_key,
        )
        return crawl_list_data["crawls"]

    if loc == "local" or api_key is None:
        # Local crawls are stored as <crawl_id>.json in the working directory;
        # strip the ".json" extension to recover each crawl id.
        return [
            Crawl.from_metadata(filename[:-5])
            for filename in os.listdir(".")
            if filename.endswith(".json")
        ]

    # Previously this path fell through and silently returned None.
    raise ValueError(f"Unknown crawl location: {loc!r} (expected 'cloud' or 'local')")
864 |
865 |
def retry_failed(crawl_id: str, api_key: Optional[str] = None) -> None:
    """
    Queue failed URLs from a crawl.

    Args:
        crawl_id (str): The ID of the crawl.
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Raises:
        ValueError: If no API key is provided or found in the environment.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    if api_key is None:
        # Previously a missing key made this a silent no-op; fail loudly
        # instead, matching search() and get_scraper() in this package.
        raise ValueError(
            "Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables."
        )

    run_webt_api(
        {"crawl_id": crawl_id},
        "v1/crawl/retry-failed",
        api_key,
    )
886 |
--------------------------------------------------------------------------------
/src/webtranspose/openai.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | import openai
5 | import tiktoken
6 |
7 |
class OpenAIScraper:
    """Fallback scraper that extracts schema-shaped data from HTML via OpenAI function calling."""

    def __init__(
        self,
        chunk_size: int = 2500,
        overlap_size: int = 100,
    ):
        """
        Initialize the OpenAIScraper.

        Args:
            chunk_size (int, optional): Tokens per chunk of text to process. Defaults to 2500.
            overlap_size (int, optional): Tokens shared between consecutive chunks. Defaults to 100.
        """
        self.api_key = os.environ.get("OPENAI_API_KEY")
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.chunk_size = chunk_size
        self.overlap_size = overlap_size

    @staticmethod
    def process_html(
        text: str, chunk_size: int, overlap_size: int, encoding: "tiktoken.Encoding"
    ) -> list:
        """
        Split text into overlapping token chunks.

        Args:
            text (str): The HTML text to process.
            chunk_size (int): The size of each chunk, in tokens.
            overlap_size (int): Tokens shared between consecutive chunks.
            encoding (tiktoken.Encoding): The tokenizer used to encode/decode.

        Returns:
            list: A list of decoded text chunks.

        Raises:
            ValueError: If overlap_size >= chunk_size (the loop would not advance).
        """
        encoded = encoding.encode(text)
        if overlap_size >= chunk_size:
            raise ValueError("Overlap size should be less than chunk size.")
        chunks = []
        idx = 0
        while idx < len(encoded):
            end_idx = idx + chunk_size
            chunks.append(encoded[idx:end_idx])
            # Advance by (chunk_size - overlap_size) so chunks overlap.
            idx = end_idx - overlap_size
        decoded_chunks = [encoding.decode(chunk) for chunk in chunks]
        return decoded_chunks

    def scrape(self, html: str, schema: dict) -> dict:
        """
        Scrape the HTML text using the provided schema.

        Args:
            html (str): The HTML text to scrape.
            schema (dict): The schema to use for scraping.

        Returns:
            dict: The scraped data.
        """
        processed_schema = self.transform_schema(schema)
        schema_keys = ", ".join(processed_schema.keys())
        out_data = {}

        for sub_html in self.process_html(html, self.chunk_size, self.overlap_size, self.encoding):
            # 2500 tokens is a hard-coded cutoff for switching to the
            # larger-context model, independent of self.chunk_size.
            model = "gpt-3.5-turbo-0613"
            if len(self.encoding.encode(sub_html)) > 2500:
                model = "gpt-3.5-turbo-16k"

            response = openai.ChatCompletion.create(
                model=model,
                temperature=0,
                messages=[{"role": "user", "content": sub_html}],
                functions=[
                    {
                        "name": "extract_info",
                        "description": f"Extract the {schema_keys} from the website text if any exist. Empty if not found.",
                        "parameters": {
                            "type": "object",
                            "properties": processed_schema,
                            "required": list(processed_schema.keys()),
                        },
                    },
                ],
            )
            out = response["choices"][0]["message"]

            if "function_call" in out:
                args = json.loads(out["function_call"]["arguments"])

                for k in args.keys():
                    if k in processed_schema:
                        if processed_schema[k]["type"] == "array":
                            # Array values accumulate across chunks.
                            if k not in out_data:
                                out_data[k] = []
                            out_data[k] += args[k]
                        else:
                            # Scalar values: first hit wins; drop the key from
                            # the schema so later chunks stop asking for it.
                            out_data[k] = args[k]
                            del processed_schema[k]
                    elif k not in out_data:
                        out_data[k] = None

        return out_data

    def transform_schema(self, schema: dict) -> dict:
        """
        Transform the schema into the format required by OpenAI.

        Args:
            schema (dict): The schema to transform.

        Returns:
            dict: The transformed schema.

        Raises:
            ValueError: If a list value in the schema is empty (no enum
                options to offer). Previously a bare Exception.
        """
        openai_type_map = {
            "str": "string",
            "int": "number",
            "bool": "boolean",
        }

        properties = {}
        for key, value in schema.items():
            if isinstance(value, dict):
                if "type" in value and value["type"] == "array":
                    # Nested list-of-objects schema: recurse into the items.
                    properties[key] = {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": self.transform_schema(value["items"]),
                        },
                        "required": list(value["items"].keys()),
                    }
                elif "type" in value:
                    # Already in OpenAI property form; pass through unchanged.
                    properties[key] = value
                else:
                    properties[key] = self.transform_schema(value)
            elif isinstance(value, list):
                # A plain list is treated as an enum of allowed values.
                try:
                    properties[key] = {
                        "type": openai_type_map[type(value[0]).__name__],
                        "enum": value,
                        "description": key,
                    }
                except IndexError:
                    raise ValueError(f"Empty list for key {key}")
            else:
                # A bare string like "string" becomes a simple typed property.
                properties[key] = {
                    "type": value,
                    "description": key,
                }

        return properties
157 |
--------------------------------------------------------------------------------
/src/webtranspose/scrape.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import re
5 | import uuid
6 |
7 | import requests
8 | from bs4 import BeautifulSoup
9 |
10 | from .openai import OpenAIScraper
11 | from .webt_api import run_webt_api
12 |
13 |
class Scraper:
    """AI web scraper: uses the Web Transpose API when a key is available, else a local OpenAI fallback."""

    def __init__(
        self,
        schema: dict,
        scraper_id: str = None,
        name: str = None,
        render_js: bool = False,
        verbose: bool = False,
        scraper: "OpenAIScraper" = None,
        api_key: str = None,
        proxy: str = None,
        _created: bool = False,
    ):
        """
        Initialize the Scraper object.

        Args:
            schema (dict): The schema for scraping.
            scraper_id (str, optional): The ID of the scraper. Defaults to None (random UUID).
            name (str, optional): The name of the scraper. Defaults to None ("New Scraper").
            render_js (bool, optional): Whether to render JavaScript. Defaults to False.
            verbose (bool, optional): Whether to print verbose output. Defaults to False.
            scraper (OpenAIScraper, optional): The local fallback scraper. Defaults to None.
            api_key (str, optional): The API key. Defaults to None.
            proxy (str, optional): The proxy. Defaults to None.
            _created (bool, optional): Whether the scraper already exists remotely. Defaults to False.
        """
        self.api_key = api_key
        if self.api_key is None:
            self.api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

        self.name = name if name is not None else "New Scraper"
        self.schema = schema
        self.verbose = verbose
        self.scraper = scraper
        self.render_js = render_js
        self.scraper_id = scraper_id
        self.proxy = proxy
        if self.scraper is None:
            self.scraper = OpenAIScraper()
        if self.scraper_id is None:
            self.scraper_id = str(uuid.uuid4())
        self.created = _created

        if os.environ.get("WEBTRANSPOSE_API_KEY") is None and self.api_key is None:
            logging.warning(
                "No Web Transpose API provided. Lite version in use...\n\nTo run the actual WebT AI Web Scraper the Web Transpose API, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com. Run cheaper with logging and advanced analytics."
            )

    def __repr__(self) -> str:
        """
        Get a string representation of the Scraper object.

        Returns:
            str: The string representation of the Scraper object.
        """
        status = self.status()
        schema = json.dumps(status["schema"], indent=4)
        return (
            f"WebTransposeScraper(\n"
            f"  Status ID: {status['scraper_id']}\n"
            f"  Name: {status['name']}\n"
            f"  Render JS: {status['render_js']}\n"
            f"  Schema: {schema}\n"
            f")"
        )

    def __str__(self) -> str:
        """
        Get a string representation of the Scraper object.

        Delegates to __repr__ (the two were previously duplicated).

        Returns:
            str: The string representation of the Scraper object.
        """
        return self.__repr__()

    def create_scraper_api(self):
        """
        Creates a Scraper on https://webtranspose.com
        """
        if self.verbose:
            # Was an f-string with no placeholders; plain literal, same text.
            logging.info("Creating AI Web Scraper on Web Transpose...")

        create_json = {
            "name": self.name,
            "schema": self.schema,
            "render_js": self.render_js,
            "proxy": self.proxy,
        }
        out_json = run_webt_api(
            create_json,
            "/v1/scraper/create",
            self.api_key,
        )
        self.scraper_id = out_json["scraper_id"]
        self.created = True

    def scrape(self, url=None, html=None, timeout=30):
        """
        Scrape the data from a given URL or HTML.

        Args:
            url (str, optional): The URL to scrape. Defaults to None.
            html (str, optional): The HTML to scrape. Defaults to None.
            timeout (int, optional): The timeout for the request. Defaults to 30.

        Returns:
            dict: The scraped data.

        Raises:
            ValueError: If neither URL nor HTML is provided.
        """
        if self.verbose:
            logging.info(f"Running Scraper({self.name}) on {url}...")

        if self.api_key is None:
            # Lite mode: fetch and scrape locally with the OpenAI fallback.
            if url is not None:
                response = requests.get(url, timeout=timeout)
                soup = BeautifulSoup(response.content, "html.parser")
                body = soup.body
                # Collapse all whitespace runs; raw string fixes the invalid
                # "\s" escape sequence in the original pattern.
                html = re.sub(r"\s+", " ", str(body)).strip()

            if html is None:
                raise ValueError("Must provide either a url or html.")

            return self.scraper.scrape(
                html,
                self.schema,
            )
        else:
            if not self.created:
                self.create_scraper_api()

            scrape_json = {
                "scraper_id": self.scraper_id,
                "url": url,
                "html": html,
                "proxy": self.proxy,
            }
            out_json = run_webt_api(
                scrape_json,
                "/v1/scraper/scrape",
                self.api_key,
            )
            return out_json

    def status(self):
        """
        Get the status of the Scraper.

        Returns:
            dict: The status of the Scraper (local state when no API key is
            set or the scraper was never created remotely; otherwise the
            cloud record).
        """
        if self.api_key is None or not self.created:
            return {
                "scraper_id": self.scraper_id,
                "name": self.name,
                "verbose": self.verbose,
                "render_js": self.render_js,
                "schema": self.schema,
                "proxy": self.proxy,
            }
        else:
            get_json = {
                "scraper_id": self.scraper_id,
            }
            out_api = run_webt_api(
                get_json,
                "/v1/scraper/get",
                self.api_key,
            )
            scraper = out_api["scraper"]
            return {
                "scraper_id": scraper["id"],
                "name": scraper["name"],
                "verbose": self.verbose,
                "render_js": scraper["render_js"],
                "schema": scraper["schema"],
                "proxy": scraper["proxy"],
            }
206 |
207 |
def get_scraper(scraper_id, api_key: str = None):
    """
    Get a Scraper object based on the scraper ID.

    Args:
        scraper_id (str): The ID of the scraper.
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Returns:
        Scraper: The Scraper object.

    Raises:
        ValueError: If api_key is not provided.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    if api_key is None:
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")

    out_json = run_webt_api(
        {"scraper_id": scraper_id},
        "/v1/scraper/get",
        api_key,
    )
    scraper = out_json["scraper"]
    return Scraper(
        scraper_id=scraper["id"],
        name=scraper["name"],
        schema=scraper["schema"],
        render_js=scraper["render_js"],
        api_key=api_key,
        proxy=scraper["proxy"],
        _created=True,
    )
246 |
247 |
def list_scrapers(api_key: str = None):
    """
    List all available scrapers.

    Args:
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Returns:
        list: A list of Scrapers.

    Raises:
        ValueError: If api_key is not provided.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    if api_key is None:
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")

    out_json = run_webt_api(
        {},
        "/v1/scraper/list",
        api_key,
    )
    return out_json["scrapers"]
273 |
--------------------------------------------------------------------------------
/src/webtranspose/search.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 |
4 | from .webt_api import run_webt_api
5 |
def search(query, api_key=None) -> dict:
    """
    Search for a query using the Web Transpose API.

    Args:
        query (str): The query to search for.
        api_key (str, optional): The API key to use for authentication. Defaults to None.

    Returns:
        dict: The search results.

    Raises:
        ValueError: If no API key is provided or found in the environment.
    """
    key = api_key if api_key is not None else os.environ.get("WEBTRANSPOSE_API_KEY")
    if key is None:
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")

    return run_webt_api(
        {"query": query},
        "/v1/search",
        key,
    )
31 |
32 |
def search_filter(query, api_key=None) -> dict:
    """
    Search for a query using the Web Transpose API with filtering.

    Args:
        query (str): The query to search for.
        api_key (str, optional): The API key to use for authentication. Defaults to None.

    Returns:
        dict: The filtered search results.

    Raises:
        ValueError: If no API key is provided or found in the environment.
    """
    key = api_key if api_key is not None else os.environ.get("WEBTRANSPOSE_API_KEY")
    if key is None:
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")

    return run_webt_api(
        {"query": query},
        "/v1/search/filter",
        key,
    )
--------------------------------------------------------------------------------
/src/webtranspose/webt_api.py:
--------------------------------------------------------------------------------
1 | import os
2 | from urllib.parse import urljoin
3 |
4 | import requests
5 |
6 |
def run_webt_api(params: dict, api_path: str, api_key: str = None) -> dict:
    """
    Run a WebTranspose API request.

    Args:
        params (dict): The parameters for the API request.
        api_path (str): The API path.
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Returns:
        dict: The JSON response from the API.

    Raises:
        Exception: If the API request fails with a non-200 status code.
    """
    base_url = "https://api.webtranspose.com/"
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    response = requests.post(
        urljoin(base_url, api_path),
        headers={"X-API-Key": api_key},
        json=params,
        timeout=180,
    )
    if response.status_code != 200:
        raise Exception("API request failed with status code: {}".format(response.status_code))
    return response.json()
32 |
--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
1 | """
2 | Tasks for maintaining the project.
3 |
4 | Execute 'invoke --list' for guidance on using Invoke
5 | """
6 | import platform
7 | import webbrowser
8 | from pathlib import Path
9 | from typing import Optional
10 |
11 | from invoke import call, task
12 | from invoke.context import Context
13 | from invoke.runners import Result
14 |
# Project layout anchors, all derived from this file's location.
ROOT_DIR = Path(__file__).parent
DOCS_DIR = ROOT_DIR.joinpath("docs")
DOCS_BUILD_DIR = DOCS_DIR.joinpath("_build")
DOCS_INDEX = DOCS_BUILD_DIR.joinpath("index.html")
# Coverage artifacts produced by coverage.py.
COVERAGE_FILE = ROOT_DIR.joinpath(".coverage")
COVERAGE_DIR = ROOT_DIR.joinpath("htmlcov")
COVERAGE_REPORT = COVERAGE_DIR.joinpath("index.html")
SOURCE_DIR = ROOT_DIR.joinpath("src/webtranspose")
TEST_DIR = ROOT_DIR.joinpath("tests")
# Paths handed to the formatters/linters (isort, black, flakeheaven, mypy).
PYTHON_TARGETS = [
    SOURCE_DIR,
    TEST_DIR,
    ROOT_DIR.joinpath("noxfile.py"),
    Path(__file__),
]
PYTHON_TARGETS_STR = " ".join([str(p) for p in PYTHON_TARGETS])
31 |
32 |
def _run(c: Context, command: str) -> Optional[Result]:
    """Run *command* through the context, using a pty everywhere but Windows."""
    use_pty = platform.system() != "Windows"
    return c.run(command, pty=use_pty)
35 |
36 |
@task()
def clean_build(c):
    # type: (Context) -> None
    """Clean up files from package building."""
    for command in (
        "rm -fr build/",
        "rm -fr dist/",
        "rm -fr .eggs/",
        "find . -name '*.egg-info' -exec rm -fr {} +",
        "find . -name '*.egg' -exec rm -f {} +",
    ):
        _run(c, command)
46 |
47 |
@task()
def clean_python(c):
    # type: (Context) -> None
    """Clean up python file artifacts."""
    for command in (
        "find . -name '*.pyc' -exec rm -f {} +",
        "find . -name '*.pyo' -exec rm -f {} +",
        "find . -name '*~' -exec rm -f {} +",
        "find . -name '__pycache__' -exec rm -fr {} +",
    ):
        _run(c, command)
56 |
57 |
@task()
def clean_tests(c):
    # type: (Context) -> None
    """Clean up files from testing."""
    for command in (
        f"rm -f {COVERAGE_FILE}",
        f"rm -fr {COVERAGE_DIR}",
        "rm -fr .pytest_cache",
    ):
        _run(c, command)
65 |
66 |
@task()
def clean_docs(c):
    # type: (Context) -> None
    """Clean up files from documentation builds."""
    command = f"rm -fr {DOCS_BUILD_DIR}"
    _run(c, command)
72 |
73 |
@task(pre=[clean_build, clean_python, clean_tests, clean_docs])
def clean(c):
    # type: (Context) -> None
    """Run all clean sub-tasks."""
    # Body intentionally empty: the pre-tasks listed above do all the work.
78 |
79 |
@task()
def install_hooks(c):
    # type: (Context) -> None
    """Install pre-commit hooks."""
    command = "poetry run pre-commit install"
    _run(c, command)
85 |
86 |
@task()
def hooks(c):
    # type: (Context) -> None
    """Run pre-commit hooks."""
    command = "poetry run pre-commit run --all-files"
    _run(c, command)
92 |
93 |
@task(name="format", help={"check": "Checks if source is formatted without applying changes"})
def format_(c, check=False):
    # type: (Context, bool) -> None
    """Format code."""
    isort_flags = " ".join(["--check-only", "--diff"] if check else [])
    _run(c, f"poetry run isort {isort_flags} {PYTHON_TARGETS_STR}")
    black_flags = " ".join(["--diff", "--check"] if check else ["--quiet"])
    _run(c, f"poetry run black {black_flags} {PYTHON_TARGETS_STR}")
102 |
103 |
@task()
def flake8(c):
    # type: (Context) -> None
    """Run flake8."""
    command = f"poetry run flakeheaven lint {PYTHON_TARGETS_STR}"
    _run(c, command)
109 |
110 |
@task()
def security(c):
    # type: (Context) -> None
    """Run security related checks."""
    command = (
        "poetry export --with dev --format=requirements.txt --without-hashes | "
        "poetry run safety check --stdin --full-report"
    )
    _run(c, command)
120 |
121 |
@task(pre=[flake8, security, call(format_, check=True)])
def lint(c):
    # type: (Context) -> None
    """Run all linting."""
    # Body intentionally empty: flake8, security, and the format check run as
    # pre-tasks.
126 |
127 |
@task()
def mypy(c):
    # type: (Context) -> None
    """Run mypy."""
    command = f"poetry run mypy {PYTHON_TARGETS_STR}"
    _run(c, command)
133 |
134 |
@task()
def tests(c):
    # type: (Context) -> None
    """Run tests."""
    options = " ".join(["--xdoctest", "--cov", "--cov-report=", "--cov-fail-under=0"])
    _run(c, f"poetry run pytest {options} {TEST_DIR} {SOURCE_DIR}")
141 |
142 |
@task(
    help={
        "fmt": "Build a local report: report, html, json, annotate, html, xml.",
        "open_browser": "Open the coverage report in the web browser (requires --fmt html)",
    }
)
def coverage(c, fmt="report", open_browser=False):
    # type: (Context, str, bool) -> None
    """Create coverage report."""
    # Combine parallel-mode data files first, if any exist.
    has_partial_data = any(Path().glob(".coverage.*"))
    if has_partial_data:
        _run(c, "poetry run coverage combine")
    _run(c, f"poetry run coverage {fmt} -i")
    should_open = fmt == "html" and open_browser
    if should_open:
        webbrowser.open(COVERAGE_REPORT.as_uri())
157 |
158 |
@task(
    help={
        "serve": "Build the docs watching for changes",
        "open_browser": "Open the docs in the web browser",
    }
)
def docs(c, serve=False, open_browser=False):
    # type: (Context, bool, bool) -> None
    """Build documentation."""
    _run(c, f"sphinx-apidoc -o {DOCS_DIR} {SOURCE_DIR}")
    build_command = f"sphinx-build -b html {DOCS_DIR} {DOCS_BUILD_DIR}"
    _run(c, build_command)
    if open_browser:
        webbrowser.open(DOCS_INDEX.absolute().as_uri())
    if serve:
        # Rebuild whenever a source document changes.
        _run(c, f"poetry run watchmedo shell-command -p '*.rst;*.md' -c '{build_command}' -R -D .")
175 |
176 |
@task(
    help={
        "part": "Part of the version to be bumped.",
        "dry_run": "Don't write any files, just pretend. (default: False)",
    }
)
def version(c, part, dry_run=False):
    # type: (Context, str, bool) -> None
    """Bump version."""
    flags = " ".join(["--dry-run"] if dry_run else [])
    _run(c, f"poetry run bump2version {flags} {part}")
188 |
--------------------------------------------------------------------------------
/tests/Untitled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "id": "ae7fef52",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "ename": "ModuleNotFoundError",
11 | "evalue": "No module named 'webtranspose'",
12 | "output_type": "error",
13 | "traceback": [
14 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
15 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
16 | "\u001b[0;32m/var/folders/rh/0zrsw9xd3qnbggwbk10z77380000gn/T/ipykernel_5677/167283409.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mwebtranspose\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcrawl\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
17 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'webtranspose'"
18 | ]
19 | }
20 | ],
21 | "source": [
22 | "from webtranspose import crawl"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "id": "cef46608",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": []
32 | }
33 | ],
34 | "metadata": {
35 | "kernelspec": {
36 | "display_name": "webt",
37 | "language": "python",
38 | "name": "webt"
39 | },
40 | "language_info": {
41 | "codemirror_mode": {
42 | "name": "ipython",
43 | "version": 3
44 | },
45 | "file_extension": ".py",
46 | "mimetype": "text/x-python",
47 | "name": "python",
48 | "nbconvert_exporter": "python",
49 | "pygments_lexer": "ipython3",
50 | "version": "3.9.5"
51 | }
52 | },
53 | "nbformat": 4,
54 | "nbformat_minor": 5
55 | }
56 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Unit test package for webtranspose."""
2 |
--------------------------------------------------------------------------------
/tests/test_webtranspose.py:
--------------------------------------------------------------------------------
1 | """Tests for `webtranspose` module."""
2 | from typing import Generator
3 |
4 | import pytest
5 |
6 | import webtranspose
7 |
8 |
@pytest.fixture
def version() -> Generator[str, None, None]:
    """Yield the installed webtranspose package version."""
    current_version = webtranspose.__version__
    yield current_version
13 |
14 |
15 | def test_version(version: str) -> None:
16 | """Sample pytest test function with the pytest fixture as an argument."""
17 | assert version == "0.1.0"
18 |
--------------------------------------------------------------------------------