├── .bumpversion.cfg
├── .darglint
├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── config.yml
│ └── feature_request.md
├── PULL_REQUEST_TEMPLATE.md
├── dependabot.yml
└── workflows
│ ├── codeql-analysis.yml
│ ├── pre-commit-autoupdate.yml
│ ├── release.yml
│ └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── LICENSE.rst
├── README.md
├── codecov.yml
├── img
└── web-transpose-cover.png
├── noxfile.py
├── poetry.lock
├── pyproject.toml
├── src
└── webtranspose
│ ├── __init__.py
│ ├── chat.py
│ ├── crawl.py
│ ├── openai.py
│ ├── scrape.py
│ ├── search.py
│ └── webt_api.py
├── tasks.py
└── tests
├── Untitled.ipynb
├── __init__.py
└── test_webtranspose.py
/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | commit = True
3 | tag = False
4 | current_version = 0.1.0
5 |
6 | [bumpversion:file:pyproject.toml]
7 | search = version = "{current_version}"
8 | replace = version = "{new_version}"
9 |
10 | [bumpversion:file:src/webtranspose/__init__.py]
11 | search = __version__ = "{current_version}"
12 | replace = __version__ = "{new_version}"
13 |
14 | [bumpversion:file(title):CHANGELOG.md]
15 | search = {#}{#} [Unreleased]
16 | replace = {#}{#} [Unreleased]
17 |
18 | {#}{#} [{new_version}] - {now:%Y-%m-%d}
19 |
20 | [bumpversion:file(links):CHANGELOG.md]
21 | search = [Unreleased]: https://github.com/mike-gee/webtranspose/compare/v{current_version}...HEAD
22 | replace = [Unreleased]: https://github.com/mike-gee/webtranspose/compare/v{new_version}...HEAD
23 | [{new_version}]: https://github.com/mike-gee/webtranspose/compare/v{current_version}...v{new_version}
24 |
--------------------------------------------------------------------------------
/.darglint:
--------------------------------------------------------------------------------
1 | [darglint]
2 | strictness = short
3 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🐛 Bug report
3 | about: Create a report to help us improve
4 | labels: bug
5 | assignees: ''
6 |
7 | ---
8 |
9 | ## Expected Behavior
10 |
11 |
12 | ## Actual Behavior
13 |
14 |
15 | ## Steps to Reproduce the Problem
16 |
17 | 1.
18 | 1.
19 | 1.
20 |
21 | ## Specifications
22 |
23 | - Version:
24 | - Platform:
25 | - Subsystem:
26 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links: []
3 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🚀 Feature request
3 | about: Suggest an idea for this project
4 | labels: enhancement
5 | assignees: ''
6 |
7 | ---
8 |
9 | **Is your feature request related to a problem? Please describe.**
10 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
11 |
12 | **Describe the solution you'd like**
13 | A clear and concise description of what you want to happen.
14 |
15 | **Describe alternatives you've considered**
16 | A clear and concise description of any alternative solutions or features you've considered.
17 |
18 | **Additional context**
19 | Add any other context or screenshots about the feature request here.
20 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Fixes #
2 |
3 | ## Proposed Changes
4 |
5 | -
6 | -
7 | -
8 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: github-actions
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | - package-ecosystem: pip
8 | directory: "/docs"
9 | schedule:
10 | interval: daily
11 | - package-ecosystem: pip
12 | directory: "/"
13 | schedule:
14 | interval: daily
15 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | name: "CodeQL"
2 |
3 | on:
4 | push:
5 | branches:
6 | pull_request:
7 | branches:
8 | schedule:
9 | - cron: '0 6 * * 1'
10 |
11 | jobs:
12 | analyze:
13 | name: Analyze
14 | runs-on: ubuntu-latest
15 | permissions:
16 | actions: read
17 | contents: read
18 | security-events: write
19 |
20 | strategy:
21 | fail-fast: false
22 | matrix:
23 | language: [ 'python' ]
24 |
25 | steps:
26 | - name: Checkout repository
27 | uses: actions/checkout@v3.5.2
28 |
29 | # Initializes the CodeQL tools for scanning.
30 | - name: Initialize CodeQL
31 | uses: github/codeql-action/init@v2
32 | with:
33 | languages: ${{ matrix.language }}
34 |
35 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
36 | # If this step fails, then you should remove it and run the build manually (see below)
37 | - name: Autobuild
38 | uses: github/codeql-action/autobuild@v2
39 |
40 | - name: Perform CodeQL Analysis
41 | uses: github/codeql-action/analyze@v2
42 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit-autoupdate.yml:
--------------------------------------------------------------------------------
1 | name: "Pre-commit autoupdate"
2 |
3 | on:
4 | schedule:
5 | - cron: '0 6 * * 1'
6 | workflow_dispatch:
7 |
8 | jobs:
9 | autoupdate:
10 | name: autoupdate
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/checkout@v3.5.2
14 |
15 | - name: Set up Python 3.8
16 | uses: actions/setup-python@v4.6.1
17 | with:
18 | python-version: 3.8
19 |
20 | - name: Install system deps
21 | shell: bash
22 | run: |
23 | pip install poetry
24 | poetry config virtualenvs.in-project true
25 | poetry install --no-root --only dev --only linters --sync
26 |
27 | - name: Run autoupdate
28 | run: poetry run pre-commit autoupdate
29 |
30 | - name: Run pre-commit
31 | run: poetry run pre-commit run --all-files
32 |
33 | - uses: peter-evans/create-pull-request@v5.0.1
34 | with:
35 | token: ${{ secrets.GITHUB_TOKEN }}
36 | branch: chore-update-pre-commit-hooks
37 | title: Update pre-commit hooks
38 | commit-message: "Update pre-commit hooks"
39 | body: |
40 | # Update pre-commit hooks
41 |
42 | - Update pre-commit hooks to the latest version.
43 | delete-branch: true
44 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 |
2 | name: release
3 |
4 | on:
5 | push:
6 | tags:
7 | - 'v*'
8 |
9 | jobs:
10 | release:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/checkout@v3.5.2
14 |
15 | - name: Set up Python 3.8
16 | uses: actions/setup-python@v4.6.1
17 | with:
18 | python-version: 3.8
19 |
20 | - name: Install system deps
21 | shell: bash
22 | run: |
23 | pip install poetry
24 | poetry config virtualenvs.in-project true
25 |
26 | - name: Build package
27 | run: |
28 | poetry build --ansi
29 |
30 | - name: Publish package on PyPI
31 | uses: pypa/gh-action-pypi-publish@v1.4.2
32 | with:
33 | user: __token__
34 | password: ${{ secrets.PYPI_TOKEN }}
35 |
36 | - name: Publish package on TestPyPI
37 | uses: pypa/gh-action-pypi-publish@v1.4.2
38 | with:
39 | user: __token__
40 | password: ${{ secrets.TEST_PYPI_TOKEN }}
41 | repository_url: https://test.pypi.org/legacy/
42 |
43 |
44 | github_release:
45 | needs: release
46 | name: Create Github Release
47 | runs-on: ubuntu-latest
48 | steps:
49 | - uses: actions/checkout@v3.5.2
50 |
51 | - name: Get version from tag
52 | id: tag_name
53 | shell: bash
54 | run: |
55 | echo ::set-output name=current_version::${GITHUB_REF#refs/tags/v}
56 |
57 | - name: Get Changelog Entry
58 | id: changelog_reader
59 | uses: mindsers/changelog-reader-action@v2.2.2
60 | with:
61 | version: ${{ steps.tag_name.outputs.current_version }}
62 | path: ./CHANGELOG.md
63 |
64 | - name: Create Release
65 | id: create_release
66 | uses: actions/create-release@v1.1.4
67 | env:
68 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
69 | with:
70 | tag_name: ${{ github.ref }}
71 | release_name: Release ${{ github.ref }}
72 | body: ${{ steps.changelog_reader.outputs.changes }}
73 | draft: false
74 | prerelease: false
75 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: tests
2 |
3 | on:
4 | push:
5 | branches:
6 | pull_request:
7 | branches:
8 |
9 | jobs:
10 | linting:
11 | name: Linting
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v3.5.2
15 |
16 | - name: Set up Python 3.8
17 | uses: actions/setup-python@v4.6.1
18 | with:
19 | python-version: 3.8
20 |
21 | - name: Install system deps
22 | shell: bash
23 | run: |
24 | pip install poetry
25 | poetry config virtualenvs.in-project true
26 | poetry install --no-root --only dev --only linters --sync
27 |
28 | - name: Linting
29 | shell: bash
30 | run: poetry run pre-commit run --all-files
31 |
32 | tests:
33 | needs: linting
34 | name: ${{ matrix.os }} / ${{ matrix.python-version }}
35 | runs-on: ${{ matrix.os }}-latest
36 | strategy:
37 | matrix:
38 | os: [Ubuntu, MacOS, Windows]
39 | python-version: ['3.8', '3.9', '3.10', '3.11']
40 | fail-fast: true
41 | steps:
42 | - uses: actions/checkout@v3.5.2
43 |
44 | - name: Set up Python ${{ matrix.python-version }}
45 | uses: actions/setup-python@v4.6.1
46 | with:
47 | python-version: ${{ matrix.python-version }}
48 |
49 | - name: Install system deps
50 | shell: bash
51 | run: |
52 | pip install nox-poetry
53 | pip install poetry
54 | poetry config virtualenvs.in-project true
55 |
56 | - name: Run mypy with nox
57 | shell: bash
58 | run: nox --force-color -s mypy-${{ matrix.python-version }}
59 |
60 | - name: Run tests with nox
61 | shell: bash
62 | run: nox --force-color -s tests-${{ matrix.python-version }}
63 |
64 |       - name: Run security check
65 | if: matrix.python-version == '3.11' && matrix.os == 'Ubuntu'
66 | shell: bash
67 | run: nox --force-color -s security
68 |
69 | - name: Upload coverage data
70 | uses: actions/upload-artifact@v2.2.4
71 | with:
72 | name: coverage-data
73 | path: ".coverage.*"
74 |
75 | coverage:
76 | needs: tests
77 | runs-on: ubuntu-latest
78 | steps:
79 | - uses: actions/checkout@v3.5.2
80 |
81 | - name: Set up Python 3.8
82 | uses: actions/setup-python@v4.6.1
83 | with:
84 | python-version: 3.8
85 |
86 | - name: Install system deps
87 | shell: bash
88 | run: |
89 | pip install nox-poetry
90 | pip install poetry
91 | poetry config virtualenvs.in-project true
92 |
93 | - name: Download coverage data
94 | uses: actions/download-artifact@v2.0.10
95 | with:
96 | name: coverage-data
97 |
98 | - name: Create coverage report
99 | shell: bash
100 | run: |
101 | nox --force-color --session=coverage -- --fmt xml
102 |
103 | - name: Upload coverage report
104 | uses: codecov/codecov-action@v3.1.4
105 | with:
106 | token: ${{ secrets.CODECOV_TOKEN }}
107 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 | *.ipynb
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # pytype
130 | .pytype/
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
135 | # Code editors
136 | .vscode
137 |
138 | # Caches
139 | .flakeheaven_cache
140 |
141 | # Web Transpose
142 | webtranspose-out/
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.4.0
4 | hooks:
5 | - id: check-toml
6 | - id: check-yaml
7 | - id: debug-statements
8 | - id: check-merge-conflict
9 | - id: check-json
10 | - id: end-of-file-fixer
11 | - repo: https://github.com/timothycrosley/isort
12 | rev: 5.12.0
13 | hooks:
14 | - id: isort
15 | - repo: https://github.com/psf/black
16 | rev: 23.3.0
17 | hooks:
18 | - id: black
19 | - repo: local
20 | hooks:
21 | - id: flakeheaven
22 | name: flakeheaven
23 |       description: "`FlakeHeaven` is a Flake8 wrapper to make it cool."
24 | entry: poetry run flakeheaven
25 | args: [lint]
26 | language: system
27 | types: [python]
28 | require_serial: true
29 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | image: latest
5 |
6 | formats: all
7 |
8 | sphinx:
9 | configuration: docs/conf.py
10 |
11 | python:
12 | version: 3.8
13 | install:
14 | - requirements: docs/requirements.txt
15 | - method: pip
16 | path: .
17 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | All notable changes to this project will be documented in this file.
3 |
4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6 |
7 |
8 | ## [Unreleased]
9 |
10 | ## [0.1.0] - 2023-10-21
11 | ### Added
12 | - First release on PyPI.
13 |
14 | [Unreleased]: https://github.com/mike-gee/webtranspose/compare/v0.1.0...HEAD
15 | [0.1.0]: https://github.com/mike-gee/webtranspose/releases/tag/v0.1.0
16 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 |
2 | # Contributor Covenant Code of Conduct
3 |
4 | ## Our Pledge
5 |
6 | We as members, contributors, and leaders pledge to make participation in our
7 | community a harassment-free experience for everyone, regardless of age, body
8 | size, visible or invisible disability, ethnicity, sex characteristics, gender
9 | identity and expression, level of experience, education, socio-economic status,
10 | nationality, personal appearance, race, caste, color, religion, or sexual identity
11 | and orientation.
12 |
13 | We pledge to act and interact in ways that contribute to an open, welcoming,
14 | diverse, inclusive, and healthy community.
15 |
16 | ## Our Standards
17 |
18 | Examples of behavior that contributes to a positive environment for our
19 | community include:
20 |
21 | * Demonstrating empathy and kindness toward other people
22 | * Being respectful of differing opinions, viewpoints, and experiences
23 | * Giving and gracefully accepting constructive feedback
24 | * Accepting responsibility and apologizing to those affected by our mistakes,
25 | and learning from the experience
26 | * Focusing on what is best not just for us as individuals, but for the
27 | overall community
28 |
29 | Examples of unacceptable behavior include:
30 |
31 | * The use of sexualized language or imagery, and sexual attention or
32 | advances of any kind
33 | * Trolling, insulting or derogatory comments, and personal or political attacks
34 | * Public or private harassment
35 | * Publishing others' private information, such as a physical or email
36 | address, without their explicit permission
37 | * Other conduct which could reasonably be considered inappropriate in a
38 | professional setting
39 |
40 | ## Enforcement Responsibilities
41 |
42 | Community leaders are responsible for clarifying and enforcing our standards of
43 | acceptable behavior and will take appropriate and fair corrective action in
44 | response to any behavior that they deem inappropriate, threatening, offensive,
45 | or harmful.
46 |
47 | Community leaders have the right and responsibility to remove, edit, or reject
48 | comments, commits, code, wiki edits, issues, and other contributions that are
49 | not aligned to this Code of Conduct, and will communicate reasons for moderation
50 | decisions when appropriate.
51 |
52 | ## Scope
53 |
54 | This Code of Conduct applies within all community spaces, and also applies when
55 | an individual is officially representing the community in public spaces.
56 | Examples of representing our community include using an official e-mail address,
57 | posting via an official social media account, or acting as an appointed
58 | representative at an online or offline event.
59 |
60 | ## Enforcement
61 |
62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
63 | reported to the community leaders responsible for enforcement at mike@webtranspose.com.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
120 |
121 | Community Impact Guidelines were inspired by
122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
123 |
124 | For answers to common questions about this code of conduct, see the FAQ at
125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available
126 | at [https://www.contributor-covenant.org/translations][translations].
127 |
128 | [homepage]: https://www.contributor-covenant.org
129 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
130 | [Mozilla CoC]: https://github.com/mozilla/diversity
131 | [FAQ]: https://www.contributor-covenant.org/faq
132 | [translations]: https://www.contributor-covenant.org/translations
133 |
--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
1 | Copyright (C) 2023 Vetro Technologies, Inc. (Web Transpose)
2 |
3 | This program is free software: you can redistribute it and/or modify
4 | it under the terms of the GNU Affero General Public License as published by
5 | the Free Software Foundation, either version 3 of the License, or
6 | (at your option) any later version.
7 |
8 | This program is distributed in the hope that it will be useful,
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | GNU Affero General Public License for more details.
12 |
13 | You should have received a copy of the GNU Affero General Public License
14 | along with this program. If not, see <https://www.gnu.org/licenses/>.
15 |
16 |
17 | GNU AFFERO GENERAL PUBLIC LICENSE
18 | Version 3, 19 November 2007
19 |
20 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
21 | Everyone is permitted to copy and distribute verbatim copies
22 | of this license document, but changing it is not allowed.
23 |
24 | Preamble
25 |
26 | The GNU Affero General Public License is a free, copyleft license for
27 | software and other kinds of works, specifically designed to ensure
28 | cooperation with the community in the case of network server software.
29 |
30 | The licenses for most software and other practical works are designed
31 | to take away your freedom to share and change the works. By contrast,
32 | our General Public Licenses are intended to guarantee your freedom to
33 | share and change all versions of a program--to make sure it remains free
34 | software for all its users.
35 |
36 | When we speak of free software, we are referring to freedom, not
37 | price. Our General Public Licenses are designed to make sure that you
38 | have the freedom to distribute copies of free software (and charge for
39 | them if you wish), that you receive source code or can get it if you
40 | want it, that you can change the software or use pieces of it in new
41 | free programs, and that you know you can do these things.
42 |
43 | Developers that use our General Public Licenses protect your rights
44 | with two steps: (1) assert copyright on the software, and (2) offer
45 | you this License which gives you legal permission to copy, distribute
46 | and/or modify the software.
47 |
48 | A secondary benefit of defending all users' freedom is that
49 | improvements made in alternate versions of the program, if they
50 | receive widespread use, become available for other developers to
51 | incorporate. Many developers of free software are heartened and
52 | encouraged by the resulting cooperation. However, in the case of
53 | software used on network servers, this result may fail to come about.
54 | The GNU General Public License permits making a modified version and
55 | letting the public access it on a server without ever releasing its
56 | source code to the public.
57 |
58 | The GNU Affero General Public License is designed specifically to
59 | ensure that, in such cases, the modified source code becomes available
60 | to the community. It requires the operator of a network server to
61 | provide the source code of the modified version running there to the
62 | users of that server. Therefore, public use of a modified version, on
63 | a publicly accessible server, gives the public access to the source
64 | code of the modified version.
65 |
66 | An older license, called the Affero General Public License and
67 | published by Affero, was designed to accomplish similar goals. This is
68 | a different license, not a version of the Affero GPL, but Affero has
69 | released a new version of the Affero GPL which permits relicensing under
70 | this license.
71 |
72 | The precise terms and conditions for copying, distribution and
73 | modification follow.
74 |
75 | TERMS AND CONDITIONS
76 |
77 | 0. Definitions.
78 |
79 | "This License" refers to version 3 of the GNU Affero General Public License.
80 |
81 | "Copyright" also means copyright-like laws that apply to other kinds of
82 | works, such as semiconductor masks.
83 |
84 | "The Program" refers to any copyrightable work licensed under this
85 | License. Each licensee is addressed as "you". "Licensees" and
86 | "recipients" may be individuals or organizations.
87 |
88 | To "modify" a work means to copy from or adapt all or part of the work
89 | in a fashion requiring copyright permission, other than the making of an
90 | exact copy. The resulting work is called a "modified version" of the
91 | earlier work or a work "based on" the earlier work.
92 |
93 | A "covered work" means either the unmodified Program or a work based
94 | on the Program.
95 |
96 | To "propagate" a work means to do anything with it that, without
97 | permission, would make you directly or secondarily liable for
98 | infringement under applicable copyright law, except executing it on a
99 | computer or modifying a private copy. Propagation includes copying,
100 | distribution (with or without modification), making available to the
101 | public, and in some countries other activities as well.
102 |
103 | To "convey" a work means any kind of propagation that enables other
104 | parties to make or receive copies. Mere interaction with a user through
105 | a computer network, with no transfer of a copy, is not conveying.
106 |
107 | An interactive user interface displays "Appropriate Legal Notices"
108 | to the extent that it includes a convenient and prominently visible
109 | feature that (1) displays an appropriate copyright notice, and (2)
110 | tells the user that there is no warranty for the work (except to the
111 | extent that warranties are provided), that licensees may convey the
112 | work under this License, and how to view a copy of this License. If
113 | the interface presents a list of user commands or options, such as a
114 | menu, a prominent item in the list meets this criterion.
115 |
116 | 1. Source Code.
117 |
118 | The "source code" for a work means the preferred form of the work
119 | for making modifications to it. "Object code" means any non-source
120 | form of a work.
121 |
122 | A "Standard Interface" means an interface that either is an official
123 | standard defined by a recognized standards body, or, in the case of
124 | interfaces specified for a particular programming language, one that
125 | is widely used among developers working in that language.
126 |
127 | The "System Libraries" of an executable work include anything, other
128 | than the work as a whole, that (a) is included in the normal form of
129 | packaging a Major Component, but which is not part of that Major
130 | Component, and (b) serves only to enable use of the work with that
131 | Major Component, or to implement a Standard Interface for which an
132 | implementation is available to the public in source code form. A
133 | "Major Component", in this context, means a major essential component
134 | (kernel, window system, and so on) of the specific operating system
135 | (if any) on which the executable work runs, or a compiler used to
136 | produce the work, or an object code interpreter used to run it.
137 |
138 | The "Corresponding Source" for a work in object code form means all
139 | the source code needed to generate, install, and (for an executable
140 | work) run the object code and to modify the work, including scripts to
141 | control those activities. However, it does not include the work's
142 | System Libraries, or general-purpose tools or generally available free
143 | programs which are used unmodified in performing those activities but
144 | which are not part of the work. For example, Corresponding Source
145 | includes interface definition files associated with source files for
146 | the work, and the source code for shared libraries and dynamically
147 | linked subprograms that the work is specifically designed to require,
148 | such as by intimate data communication or control flow between those
149 | subprograms and other parts of the work.
150 |
151 | The Corresponding Source need not include anything that users
152 | can regenerate automatically from other parts of the Corresponding
153 | Source.
154 |
155 | The Corresponding Source for a work in source code form is that
156 | same work.
157 |
158 | 2. Basic Permissions.
159 |
160 | All rights granted under this License are granted for the term of
161 | copyright on the Program, and are irrevocable provided the stated
162 | conditions are met. This License explicitly affirms your unlimited
163 | permission to run the unmodified Program. The output from running a
164 | covered work is covered by this License only if the output, given its
165 | content, constitutes a covered work. This License acknowledges your
166 | rights of fair use or other equivalent, as provided by copyright law.
167 |
168 | You may make, run and propagate covered works that you do not
169 | convey, without conditions so long as your license otherwise remains
170 | in force. You may convey covered works to others for the sole purpose
171 | of having them make modifications exclusively for you, or provide you
172 | with facilities for running those works, provided that you comply with
173 | the terms of this License in conveying all material for which you do
174 | not control copyright. Those thus making or running the covered works
175 | for you must do so exclusively on your behalf, under your direction
176 | and control, on terms that prohibit them from making any copies of
177 | your copyrighted material outside their relationship with you.
178 |
179 | Conveying under any other circumstances is permitted solely under
180 | the conditions stated below. Sublicensing is not allowed; section 10
181 | makes it unnecessary.
182 |
183 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
184 |
185 | No covered work shall be deemed part of an effective technological
186 | measure under any applicable law fulfilling obligations under article
187 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
188 | similar laws prohibiting or restricting circumvention of such
189 | measures.
190 |
191 | When you convey a covered work, you waive any legal power to forbid
192 | circumvention of technological measures to the extent such circumvention
193 | is effected by exercising rights under this License with respect to
194 | the covered work, and you disclaim any intention to limit operation or
195 | modification of the work as a means of enforcing, against the work's
196 | users, your or third parties' legal rights to forbid circumvention of
197 | technological measures.
198 |
199 | 4. Conveying Verbatim Copies.
200 |
201 | You may convey verbatim copies of the Program's source code as you
202 | receive it, in any medium, provided that you conspicuously and
203 | appropriately publish on each copy an appropriate copyright notice;
204 | keep intact all notices stating that this License and any
205 | non-permissive terms added in accord with section 7 apply to the code;
206 | keep intact all notices of the absence of any warranty; and give all
207 | recipients a copy of this License along with the Program.
208 |
209 | You may charge any price or no price for each copy that you convey,
210 | and you may offer support or warranty protection for a fee.
211 |
212 | 5. Conveying Modified Source Versions.
213 |
214 | You may convey a work based on the Program, or the modifications to
215 | produce it from the Program, in the form of source code under the
216 | terms of section 4, provided that you also meet all of these conditions:
217 |
218 | a) The work must carry prominent notices stating that you modified
219 | it, and giving a relevant date.
220 |
221 | b) The work must carry prominent notices stating that it is
222 | released under this License and any conditions added under section
223 | 7. This requirement modifies the requirement in section 4 to
224 | "keep intact all notices".
225 |
226 | c) You must license the entire work, as a whole, under this
227 | License to anyone who comes into possession of a copy. This
228 | License will therefore apply, along with any applicable section 7
229 | additional terms, to the whole of the work, and all its parts,
230 | regardless of how they are packaged. This License gives no
231 | permission to license the work in any other way, but it does not
232 | invalidate such permission if you have separately received it.
233 |
234 | d) If the work has interactive user interfaces, each must display
235 | Appropriate Legal Notices; however, if the Program has interactive
236 | interfaces that do not display Appropriate Legal Notices, your
237 | work need not make them do so.
238 |
239 | A compilation of a covered work with other separate and independent
240 | works, which are not by their nature extensions of the covered work,
241 | and which are not combined with it such as to form a larger program,
242 | in or on a volume of a storage or distribution medium, is called an
243 | "aggregate" if the compilation and its resulting copyright are not
244 | used to limit the access or legal rights of the compilation's users
245 | beyond what the individual works permit. Inclusion of a covered work
246 | in an aggregate does not cause this License to apply to the other
247 | parts of the aggregate.
248 |
249 | 6. Conveying Non-Source Forms.
250 |
251 | You may convey a covered work in object code form under the terms
252 | of sections 4 and 5, provided that you also convey the
253 | machine-readable Corresponding Source under the terms of this License,
254 | in one of these ways:
255 |
256 | a) Convey the object code in, or embodied in, a physical product
257 | (including a physical distribution medium), accompanied by the
258 | Corresponding Source fixed on a durable physical medium
259 | customarily used for software interchange.
260 |
261 | b) Convey the object code in, or embodied in, a physical product
262 | (including a physical distribution medium), accompanied by a
263 | written offer, valid for at least three years and valid for as
264 | long as you offer spare parts or customer support for that product
265 | model, to give anyone who possesses the object code either (1) a
266 | copy of the Corresponding Source for all the software in the
267 | product that is covered by this License, on a durable physical
268 | medium customarily used for software interchange, for a price no
269 | more than your reasonable cost of physically performing this
270 | conveying of source, or (2) access to copy the
271 | Corresponding Source from a network server at no charge.
272 |
273 | c) Convey individual copies of the object code with a copy of the
274 | written offer to provide the Corresponding Source. This
275 | alternative is allowed only occasionally and noncommercially, and
276 | only if you received the object code with such an offer, in accord
277 | with subsection 6b.
278 |
279 | d) Convey the object code by offering access from a designated
280 | place (gratis or for a charge), and offer equivalent access to the
281 | Corresponding Source in the same way through the same place at no
282 | further charge. You need not require recipients to copy the
283 | Corresponding Source along with the object code. If the place to
284 | copy the object code is a network server, the Corresponding Source
285 | may be on a different server (operated by you or a third party)
286 | that supports equivalent copying facilities, provided you maintain
287 | clear directions next to the object code saying where to find the
288 | Corresponding Source. Regardless of what server hosts the
289 | Corresponding Source, you remain obligated to ensure that it is
290 | available for as long as needed to satisfy these requirements.
291 |
292 | e) Convey the object code using peer-to-peer transmission, provided
293 | you inform other peers where the object code and Corresponding
294 | Source of the work are being offered to the general public at no
295 | charge under subsection 6d.
296 |
297 | A separable portion of the object code, whose source code is excluded
298 | from the Corresponding Source as a System Library, need not be
299 | included in conveying the object code work.
300 |
301 | A "User Product" is either (1) a "consumer product", which means any
302 | tangible personal property which is normally used for personal, family,
303 | or household purposes, or (2) anything designed or sold for incorporation
304 | into a dwelling. In determining whether a product is a consumer product,
305 | doubtful cases shall be resolved in favor of coverage. For a particular
306 | product received by a particular user, "normally used" refers to a
307 | typical or common use of that class of product, regardless of the status
308 | of the particular user or of the way in which the particular user
309 | actually uses, or expects or is expected to use, the product. A product
310 | is a consumer product regardless of whether the product has substantial
311 | commercial, industrial or non-consumer uses, unless such uses represent
312 | the only significant mode of use of the product.
313 |
314 | "Installation Information" for a User Product means any methods,
315 | procedures, authorization keys, or other information required to install
316 | and execute modified versions of a covered work in that User Product from
317 | a modified version of its Corresponding Source. The information must
318 | suffice to ensure that the continued functioning of the modified object
319 | code is in no case prevented or interfered with solely because
320 | modification has been made.
321 |
322 | If you convey an object code work under this section in, or with, or
323 | specifically for use in, a User Product, and the conveying occurs as
324 | part of a transaction in which the right of possession and use of the
325 | User Product is transferred to the recipient in perpetuity or for a
326 | fixed term (regardless of how the transaction is characterized), the
327 | Corresponding Source conveyed under this section must be accompanied
328 | by the Installation Information. But this requirement does not apply
329 | if neither you nor any third party retains the ability to install
330 | modified object code on the User Product (for example, the work has
331 | been installed in ROM).
332 |
333 | The requirement to provide Installation Information does not include a
334 | requirement to continue to provide support service, warranty, or updates
335 | for a work that has been modified or installed by the recipient, or for
336 | the User Product in which it has been modified or installed. Access to a
337 | network may be denied when the modification itself materially and
338 | adversely affects the operation of the network or violates the rules and
339 | protocols for communication across the network.
340 |
341 | Corresponding Source conveyed, and Installation Information provided,
342 | in accord with this section must be in a format that is publicly
343 | documented (and with an implementation available to the public in
344 | source code form), and must require no special password or key for
345 | unpacking, reading or copying.
346 |
347 | 7. Additional Terms.
348 |
349 | "Additional permissions" are terms that supplement the terms of this
350 | License by making exceptions from one or more of its conditions.
351 | Additional permissions that are applicable to the entire Program shall
352 | be treated as though they were included in this License, to the extent
353 | that they are valid under applicable law. If additional permissions
354 | apply only to part of the Program, that part may be used separately
355 | under those permissions, but the entire Program remains governed by
356 | this License without regard to the additional permissions.
357 |
358 | When you convey a copy of a covered work, you may at your option
359 | remove any additional permissions from that copy, or from any part of
360 | it. (Additional permissions may be written to require their own
361 | removal in certain cases when you modify the work.) You may place
362 | additional permissions on material, added by you to a covered work,
363 | for which you have or can give appropriate copyright permission.
364 |
365 | Notwithstanding any other provision of this License, for material you
366 | add to a covered work, you may (if authorized by the copyright holders of
367 | that material) supplement the terms of this License with terms:
368 |
369 | a) Disclaiming warranty or limiting liability differently from the
370 | terms of sections 15 and 16 of this License; or
371 |
372 | b) Requiring preservation of specified reasonable legal notices or
373 | author attributions in that material or in the Appropriate Legal
374 | Notices displayed by works containing it; or
375 |
376 | c) Prohibiting misrepresentation of the origin of that material, or
377 | requiring that modified versions of such material be marked in
378 | reasonable ways as different from the original version; or
379 |
380 | d) Limiting the use for publicity purposes of names of licensors or
381 | authors of the material; or
382 |
383 | e) Declining to grant rights under trademark law for use of some
384 | trade names, trademarks, or service marks; or
385 |
386 | f) Requiring indemnification of licensors and authors of that
387 | material by anyone who conveys the material (or modified versions of
388 | it) with contractual assumptions of liability to the recipient, for
389 | any liability that these contractual assumptions directly impose on
390 | those licensors and authors.
391 |
392 | All other non-permissive additional terms are considered "further
393 | restrictions" within the meaning of section 10. If the Program as you
394 | received it, or any part of it, contains a notice stating that it is
395 | governed by this License along with a term that is a further
396 | restriction, you may remove that term. If a license document contains
397 | a further restriction but permits relicensing or conveying under this
398 | License, you may add to a covered work material governed by the terms
399 | of that license document, provided that the further restriction does
400 | not survive such relicensing or conveying.
401 |
402 | If you add terms to a covered work in accord with this section, you
403 | must place, in the relevant source files, a statement of the
404 | additional terms that apply to those files, or a notice indicating
405 | where to find the applicable terms.
406 |
407 | Additional terms, permissive or non-permissive, may be stated in the
408 | form of a separately written license, or stated as exceptions;
409 | the above requirements apply either way.
410 |
411 | 8. Termination.
412 |
413 | You may not propagate or modify a covered work except as expressly
414 | provided under this License. Any attempt otherwise to propagate or
415 | modify it is void, and will automatically terminate your rights under
416 | this License (including any patent licenses granted under the third
417 | paragraph of section 11).
418 |
419 | However, if you cease all violation of this License, then your
420 | license from a particular copyright holder is reinstated (a)
421 | provisionally, unless and until the copyright holder explicitly and
422 | finally terminates your license, and (b) permanently, if the copyright
423 | holder fails to notify you of the violation by some reasonable means
424 | prior to 60 days after the cessation.
425 |
426 | Moreover, your license from a particular copyright holder is
427 | reinstated permanently if the copyright holder notifies you of the
428 | violation by some reasonable means, this is the first time you have
429 | received notice of violation of this License (for any work) from that
430 | copyright holder, and you cure the violation prior to 30 days after
431 | your receipt of the notice.
432 |
433 | Termination of your rights under this section does not terminate the
434 | licenses of parties who have received copies or rights from you under
435 | this License. If your rights have been terminated and not permanently
436 | reinstated, you do not qualify to receive new licenses for the same
437 | material under section 10.
438 |
439 | 9. Acceptance Not Required for Having Copies.
440 |
441 | You are not required to accept this License in order to receive or
442 | run a copy of the Program. Ancillary propagation of a covered work
443 | occurring solely as a consequence of using peer-to-peer transmission
444 | to receive a copy likewise does not require acceptance. However,
445 | nothing other than this License grants you permission to propagate or
446 | modify any covered work. These actions infringe copyright if you do
447 | not accept this License. Therefore, by modifying or propagating a
448 | covered work, you indicate your acceptance of this License to do so.
449 |
450 | 10. Automatic Licensing of Downstream Recipients.
451 |
452 | Each time you convey a covered work, the recipient automatically
453 | receives a license from the original licensors, to run, modify and
454 | propagate that work, subject to this License. You are not responsible
455 | for enforcing compliance by third parties with this License.
456 |
457 | An "entity transaction" is a transaction transferring control of an
458 | organization, or substantially all assets of one, or subdividing an
459 | organization, or merging organizations. If propagation of a covered
460 | work results from an entity transaction, each party to that
461 | transaction who receives a copy of the work also receives whatever
462 | licenses to the work the party's predecessor in interest had or could
463 | give under the previous paragraph, plus a right to possession of the
464 | Corresponding Source of the work from the predecessor in interest, if
465 | the predecessor has it or can get it with reasonable efforts.
466 |
467 | You may not impose any further restrictions on the exercise of the
468 | rights granted or affirmed under this License. For example, you may
469 | not impose a license fee, royalty, or other charge for exercise of
470 | rights granted under this License, and you may not initiate litigation
471 | (including a cross-claim or counterclaim in a lawsuit) alleging that
472 | any patent claim is infringed by making, using, selling, offering for
473 | sale, or importing the Program or any portion of it.
474 |
475 | 11. Patents.
476 |
477 | A "contributor" is a copyright holder who authorizes use under this
478 | License of the Program or a work on which the Program is based. The
479 | work thus licensed is called the contributor's "contributor version".
480 |
481 | A contributor's "essential patent claims" are all patent claims
482 | owned or controlled by the contributor, whether already acquired or
483 | hereafter acquired, that would be infringed by some manner, permitted
484 | by this License, of making, using, or selling its contributor version,
485 | but do not include claims that would be infringed only as a
486 | consequence of further modification of the contributor version. For
487 | purposes of this definition, "control" includes the right to grant
488 | patent sublicenses in a manner consistent with the requirements of
489 | this License.
490 |
491 | Each contributor grants you a non-exclusive, worldwide, royalty-free
492 | patent license under the contributor's essential patent claims, to
493 | make, use, sell, offer for sale, import and otherwise run, modify and
494 | propagate the contents of its contributor version.
495 |
496 | In the following three paragraphs, a "patent license" is any express
497 | agreement or commitment, however denominated, not to enforce a patent
498 | (such as an express permission to practice a patent or covenant not to
499 | sue for patent infringement). To "grant" such a patent license to a
500 | party means to make such an agreement or commitment not to enforce a
501 | patent against the party.
502 |
503 | If you convey a covered work, knowingly relying on a patent license,
504 | and the Corresponding Source of the work is not available for anyone
505 | to copy, free of charge and under the terms of this License, through a
506 | publicly available network server or other readily accessible means,
507 | then you must either (1) cause the Corresponding Source to be so
508 | available, or (2) arrange to deprive yourself of the benefit of the
509 | patent license for this particular work, or (3) arrange, in a manner
510 | consistent with the requirements of this License, to extend the patent
511 | license to downstream recipients. "Knowingly relying" means you have
512 | actual knowledge that, but for the patent license, your conveying the
513 | covered work in a country, or your recipient's use of the covered work
514 | in a country, would infringe one or more identifiable patents in that
515 | country that you have reason to believe are valid.
516 |
517 | If, pursuant to or in connection with a single transaction or
518 | arrangement, you convey, or propagate by procuring conveyance of, a
519 | covered work, and grant a patent license to some of the parties
520 | receiving the covered work authorizing them to use, propagate, modify
521 | or convey a specific copy of the covered work, then the patent license
522 | you grant is automatically extended to all recipients of the covered
523 | work and works based on it.
524 |
525 | A patent license is "discriminatory" if it does not include within
526 | the scope of its coverage, prohibits the exercise of, or is
527 | conditioned on the non-exercise of one or more of the rights that are
528 | specifically granted under this License. You may not convey a covered
529 | work if you are a party to an arrangement with a third party that is
530 | in the business of distributing software, under which you make payment
531 | to the third party based on the extent of your activity of conveying
532 | the work, and under which the third party grants, to any of the
533 | parties who would receive the covered work from you, a discriminatory
534 | patent license (a) in connection with copies of the covered work
535 | conveyed by you (or copies made from those copies), or (b) primarily
536 | for and in connection with specific products or compilations that
537 | contain the covered work, unless you entered into that arrangement,
538 | or that patent license was granted, prior to 28 March 2007.
539 |
540 | Nothing in this License shall be construed as excluding or limiting
541 | any implied license or other defenses to infringement that may
542 | otherwise be available to you under applicable patent law.
543 |
544 | 12. No Surrender of Others' Freedom.
545 |
546 | If conditions are imposed on you (whether by court order, agreement or
547 | otherwise) that contradict the conditions of this License, they do not
548 | excuse you from the conditions of this License. If you cannot convey a
549 | covered work so as to satisfy simultaneously your obligations under this
550 | License and any other pertinent obligations, then as a consequence you may
551 | not convey it at all. For example, if you agree to terms that obligate you
552 | to collect a royalty for further conveying from those to whom you convey
553 | the Program, the only way you could satisfy both those terms and this
554 | License would be to refrain entirely from conveying the Program.
555 |
556 | 13. Remote Network Interaction; Use with the GNU General Public License.
557 |
558 | Notwithstanding any other provision of this License, if you modify the
559 | Program, your modified version must prominently offer all users
560 | interacting with it remotely through a computer network (if your version
561 | supports such interaction) an opportunity to receive the Corresponding
562 | Source of your version by providing access to the Corresponding Source
563 | from a network server at no charge, through some standard or customary
564 | means of facilitating copying of software. This Corresponding Source
565 | shall include the Corresponding Source for any work covered by version 3
566 | of the GNU General Public License that is incorporated pursuant to the
567 | following paragraph.
568 |
569 | Notwithstanding any other provision of this License, you have
570 | permission to link or combine any covered work with a work licensed
571 | under version 3 of the GNU General Public License into a single
572 | combined work, and to convey the resulting work. The terms of this
573 | License will continue to apply to the part which is the covered work,
574 | but the work with which it is combined will remain governed by version
575 | 3 of the GNU General Public License.
576 |
577 | 14. Revised Versions of this License.
578 |
579 | The Free Software Foundation may publish revised and/or new versions of
580 | the GNU Affero General Public License from time to time. Such new versions
581 | will be similar in spirit to the present version, but may differ in detail to
582 | address new problems or concerns.
583 |
584 | Each version is given a distinguishing version number. If the
585 | Program specifies that a certain numbered version of the GNU Affero General
586 | Public License "or any later version" applies to it, you have the
587 | option of following the terms and conditions either of that numbered
588 | version or of any later version published by the Free Software
589 | Foundation. If the Program does not specify a version number of the
590 | GNU Affero General Public License, you may choose any version ever published
591 | by the Free Software Foundation.
592 |
593 | If the Program specifies that a proxy can decide which future
594 | versions of the GNU Affero General Public License can be used, that proxy's
595 | public statement of acceptance of a version permanently authorizes you
596 | to choose that version for the Program.
597 |
598 | Later license versions may give you additional or different
599 | permissions. However, no additional obligations are imposed on any
600 | author or copyright holder as a result of your choosing to follow a
601 | later version.
602 |
603 | 15. Disclaimer of Warranty.
604 |
605 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
606 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
607 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
608 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
609 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
610 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
611 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
612 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
613 |
614 | 16. Limitation of Liability.
615 |
616 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
617 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
618 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
619 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
620 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
621 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
622 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
623 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
624 | SUCH DAMAGES.
625 |
626 | 17. Interpretation of Sections 15 and 16.
627 |
628 | If the disclaimer of warranty and limitation of liability provided
629 | above cannot be given local legal effect according to their terms,
630 | reviewing courts shall apply local law that most closely approximates
631 | an absolute waiver of all civil liability in connection with the
632 | Program, unless a warranty or assumption of liability accompanies a
633 | copy of the Program in return for a fee.
634 |
635 | END OF TERMS AND CONDITIONS
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | Web Transpose
4 |
5 | Web Crawler & AI Web Scraper APIs for building new web experiences.
6 |
7 |
8 |
9 | ```bash
10 | pip install webtranspose
11 | ```
12 |
13 |
24 |
25 |
26 |
27 | Introduction ·
28 | Installation ·
29 | Docs
30 |
31 |
32 |
33 | ## Introduction
34 |
35 | In the near future, **nobody will open websites**. Instead, we will be directly served the information we are seeking. New web experiences will combine the information from many websites into a single, unified experience.
36 |
37 | **Web Transpose** is a collection of API tools that make building these new web experiences simple.
38 |
39 | - [Webᵀ Crawl: Distributed Web Crawler](#crawl)
40 | - [Webᵀ Scrape: AI Web Scraper](#scrape)
41 |
42 |
43 | ### Crawl
44 |
45 | ```python
46 | import webtranspose as webt
47 |
48 | import os
49 | os.environ['WEBTRANSPOSE_API_KEY'] = "YOUR WEBT API KEY"
50 |
51 | crawl = webt.Crawl(
52 | "https://www.example.com",
53 | max_pages=100,
54 | render_js=True,
55 | )
56 | await crawl.crawl() # crawl.queue_crawl() for async
57 | ```
58 |
59 | ### Scrape
60 |
61 | ```python
62 | import webtranspose as webt
63 |
64 | import os
65 | os.environ['WEBTRANSPOSE_API_KEY'] = "YOUR WEBT API KEY"
66 |
67 | schema = {
68 | "Merchant Name": "string",
69 | "Title of Product": "string",
70 | "Product Photo URL": "string",
71 | }
72 |
73 | scraper = webt.Scraper(
74 | schema,
75 | render_js=True,
76 | )
77 | out_json = scraper.scrape("https://www.example.com")
78 | ```
79 |
80 | ## Web Search (AI SERP API)
81 |
82 | ```python
83 | import webtranspose as webt
84 |
85 | import os
86 | os.environ['WEBTRANSPOSE_API_KEY'] = "YOUR WEBT API KEY"
87 |
88 | results = webt.search("what caused the fourth great ninja war?")
89 | # results.keys()
90 | # ['results']
91 |
92 | # AI Filter
93 | results = webt.search_filter("Paul Graham's Blog")
94 | # results.keys()
95 | # ['results', 'filtered_results']
96 | ```
97 |
98 |
99 | ## Installation
100 |
101 | Non-Python Users: [📄 API Docs](https://docs.webtranspose.com).
102 |
103 | This repo contains a local **lite** installation of Web Transpose. This is a good option if you want to run Web Transpose locally on your machine for quick use cases.
104 |
105 | ```shell
106 | pip install webtranspose
107 | ```
108 |
109 | However, if you wish to leverage the full tools of Web Transpose and use them in production, you should add your API key to enable the **full** version.
110 |
111 | ```python
112 | os.environ["WEBTRANSPOSE_API_KEY"] = "YOUR_API_KEY_HERE"
113 | ```
114 |
115 |
116 | ## Enterprise Support
117 |
118 | Web Transpose serves enterprises small and large. We partner with companies for the long term with hands-on support and custom solutions.
119 |
120 | Please email me directly at mike@webtranspose.com for enquiries.
121 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 | status:
3 | project:
4 | default:
5 | target: "100"
6 | patch:
7 | default:
8 | target: "100"
9 | comment:
10 | require_changes: true
11 |
--------------------------------------------------------------------------------
/img/web-transpose-cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mike-gee/webtranspose/46077b4400f72d37b983a6a7ec1bb2f0067f9e3f/img/web-transpose-cover.png
--------------------------------------------------------------------------------
/noxfile.py:
--------------------------------------------------------------------------------
1 | """Nox sessions."""
2 | import platform
3 |
4 | import nox
5 | from nox_poetry import Session, session
6 |
7 | nox.options.sessions = ["tests", "mypy"]
8 | python_versions = ["3.8", "3.9", "3.10", "3.11"]
9 |
10 |
11 | @session(python=python_versions)
12 | def tests(session: Session) -> None:
13 | """Run the test suite."""
14 | session.install(".")
15 | session.install("invoke", "pytest", "xdoctest", "coverage[toml]", "pytest-cov")
16 | try:
17 | session.run(
18 | "inv",
19 | "tests",
20 | env={
21 | "COVERAGE_FILE": f".coverage.{platform.system()}.{platform.python_version()}",
22 | },
23 | )
24 | finally:
25 | if session.interactive:
26 | session.notify("coverage")
27 |
28 |
29 | @session
30 | def coverage(session: Session) -> None:
31 | """Produce the coverage report."""
32 | args = session.posargs if session.posargs and len(session._runner.manifest) == 1 else []
33 | session.install("invoke", "coverage[toml]")
34 | session.run("inv", "coverage", *args)
35 |
36 |
37 | @session(python=python_versions)
38 | def mypy(session: Session) -> None:
39 | """Type-check using mypy."""
40 | session.install(".")
41 | session.install("invoke", "mypy")
42 | session.run("inv", "mypy")
43 |
44 |
45 | @session(python="3.11")
46 | def security(session: Session) -> None:
47 | """Scan dependencies for insecure packages."""
48 | session.install("invoke", "safety")
49 | session.run("inv", "security")
50 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 |
2 | [tool.poetry]
3 | name = "webtranspose"
4 | version = "0.3.2"
5 | description = "Reliable APIs for the website data"
6 | authors = ["Mike Gee <mike@webtranspose.com>"]
7 |
8 | readme = "README.md"
9 | homepage = "https://github.com/mike-gee/webtranspose"
10 | repository = "https://github.com/mike-gee/webtranspose"
11 | documentation = "https://docs.webtranspose.com"
12 | keywords = ["webtranspose"]
13 | classifiers=[
14 | "Development Status :: 2 - Pre-Alpha",
15 | "Intended Audience :: Developers",
16 |
17 | "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
18 |
19 | "Natural Language :: English",
20 | "Programming Language :: Python :: 3",
21 | "Programming Language :: Python :: 3.8",
22 | "Programming Language :: Python :: 3.9",
23 | "Programming Language :: Python :: 3.10",
24 | "Programming Language :: Python :: 3.11",
25 | ]
26 |
27 |
28 | [tool.poetry.urls]
29 | "Bug Tracker" = "https://github.com/mike-gee/webtranspose/issues"
30 |
31 |
32 |
33 |
34 | [tool.poetry.dependencies]
35 | python = "<3.12,>=3.8"
36 | requests = "^2.31.0"
37 | httpx = "^0.25.1"
38 | bs4 = "^0.0.1"
39 | openai = "^1.3.3"
40 | tiktoken = "^0.5.1"
41 | lxml = "^4.9.3"
42 |
43 |
44 | [tool.poetry.group.dev.dependencies]
45 | pre-commit = "^3.3.2"
46 | invoke = "^2.1.2"
47 | bump2version = "^1.0.1"
48 | watchdog = {version = "^3.0.0", extras = ["watchmedo"]}
49 | ipykernel = "^6.25.2"
50 |
51 | [tool.poetry.group.test.dependencies]
52 | pytest = "^7.3.1"
53 | xdoctest = "^1.1.1"
54 | coverage = {version = "^7.2.6", extras = ["toml"]}
55 | pytest-cov = "^4.1.0"
56 |
57 | [tool.poetry.group.format.dependencies]
58 | isort = "^5.12.0"
59 | black = "^23.3.0"
60 |
61 | [tool.poetry.group.linters.dependencies]
62 | flake8 = ">=4.0.1,<5.0.0"
63 | flakeheaven = "^3.3.0"
64 | flake8-builtins = "^2.1.0"
65 | flake8-blind-except = "^0.2.1"
66 | flake8-logging-format = "^0.9.0"
67 | flake8-bugbear = "^23.3.12"
68 | flake8-annotations = "^2.9.1"
69 | flake8-docstrings = "^1.7.0"
70 | flake8-bandit = "^3.0.0"
71 | flake8-broken-line = "^0.6.0"
72 | darglint = "^1.8.1"
73 |
74 | [tool.poetry.group.security.dependencies]
75 | safety = "^2.4.0b1"
76 |
77 | [tool.poetry.group.typing.dependencies]
78 | mypy = "^1.3.0"
79 |
80 | [tool.poetry.group.docs.dependencies]
81 | sphinx = "^7.0.1"
82 | recommonmark = "^0.7.1"
83 |
84 | [tool.coverage.paths]
85 | source = ["src", "*/site-packages"]
86 |
87 | [tool.coverage.run]
88 | branch = true
89 | source = ["webtranspose"]
90 |
91 | [tool.coverage.report]
92 | fail_under = 100
93 | exclude_lines = [
94 | "pragma: no cover",
95 | "def __repr__",
96 | "if self.debug",
97 | "if settings.DEBUG:",
98 | "raise AssertionError",
99 | "raise NotImplementedError",
100 | "if 0:",
101 | "if __name__ == __main__:"
102 | ]
103 | show_missing = true
104 |
105 | [tool.coverage.html]
106 | directory = "htmlcov"
107 |
108 | [tool.flakeheaven]
109 | format = "grouped"
110 | max_line_length = 99
111 | show_source = true
112 | docstring-convention = "google"
113 | extended_default_ignore = []
114 |
115 | [tool.flakeheaven.plugins]
116 | pyflakes = ["+*"]
117 | pycodestyle = ["+*"]
118 | mccabe = ["+*"]
119 | flake8-annotations = ["+*", "-ANN1??", "-ANN401"]
120 | flake8-docstrings = ["+*", "-D212"]
121 | "flake8-*" = ["+*"]
122 | pylint = ["-C????", "-E????", "+F????", "+I????", "-R????", "-W????"]
123 |
124 | [tool.flakeheaven.exceptions."tests/"]
125 | flake8-bandit = ["-S101"]
126 |
127 | [tool.isort]
128 | multi_line_output = 3
129 | include_trailing_comma = true
130 | force_grid_wrap = 0
131 | use_parentheses = true
132 | line_length = 99
133 | known_third_party = ["invoke", "nox", "nox_poetry"]
134 |
135 | [tool.black]
136 | line-length = 99
137 | target-version = ["py38"]
138 |
139 | [tool.mypy]
140 | warn_return_any = true
141 | warn_unused_configs = true
142 |
143 | [[tool.mypy.overrides]]
144 | module = ["pytest.*", "invoke.*", "nox.*", "nox_poetry.*"]
145 | allow_redefinition = false
146 | check_untyped_defs = true
147 | ignore_errors = false
148 | ignore_missing_imports = true
149 | implicit_reexport = true
150 | local_partial_types = true
151 | strict_optional = true
152 | strict_equality = true
153 | no_implicit_optional = true
154 | warn_unused_ignores = true
155 | warn_unreachable = true
156 | warn_no_return = true
157 |
158 | [build-system]
159 | requires = ["poetry>=0.12"]
160 | build-backend = "poetry.masonry.api"
161 |
--------------------------------------------------------------------------------
/src/webtranspose/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level package for webtranspose."""
2 |
3 | __author__ = """Mike Gee"""
4 | __email__ = "mike@webtranspose.com"
5 | __version__ = "0.3.1"
6 |
7 | from .chat import *
8 | from .crawl import *
9 | from .openai import *
10 | from .scrape import *
11 | from .search import *
--------------------------------------------------------------------------------
/src/webtranspose/chat.py:
--------------------------------------------------------------------------------
import logging
import os
from time import sleep
from typing import List, Optional

from .webt_api import run_webt_api
7 |
8 |
class Chatbot:
    """Client-side wrapper around a Web Transpose chatbot."""

    def __init__(
        self,
        url_list: Optional[List[str]] = None,
        name: Optional[str] = None,
        max_pages: int = 100,
        api_key: Optional[str] = None,
        verbose: bool = False,
        chatbot_id: Optional[str] = None,
        _created: bool = False,
    ) -> None:
        """
        Initialize a Chatbot instance.

        :param url_list: A list of URLs to crawl.
        :param name: The name of the chatbot.
        :param max_pages: The maximum number of pages to crawl.
        :param api_key: The API key for accessing the Web Transpose API.
        :param verbose: Whether to enable verbose logging.
        :param chatbot_id: The ID of an existing chatbot.
        :param _created: Whether the chatbot has already been created.
        :raises ValueError: If no API key is given or found in the environment.
        """
        self.api_key = api_key
        if self.api_key is None:
            self.api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

        if self.api_key is None:
            raise ValueError(
                "No Web Transpose API provided. \n\nTo use Chatbots, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com."
            )

        # Bug fix: `url_list: List[str] = []` was a shared mutable default;
        # default to None and build a fresh list per instance instead.
        self.url_list = [] if url_list is None else url_list
        self.name = name
        self.max_pages = max_pages
        self.verbose = verbose
        self.chatbot_id = chatbot_id
        self.created = _created

        if not self.chatbot_id:
            self.create()

    def create(self) -> None:
        """Create the chatbot remotely and block until creation completes."""
        if not self.chatbot_id:
            self._create_chat()
            status = self.status()
            # Poll every 5 seconds until the backend reports completion.
            while status["status"] != "complete":
                if self.verbose:
                    logging.info("Waiting for chat to be created...")
                sleep(5)
                status = self.status()
        else:
            logging.info("Chat already created.")

    def queue_create(self) -> None:
        """Queue chatbot creation without waiting for it to finish."""
        if not self.chatbot_id:
            self._create_chat()
        else:
            logging.info("Chat already created.")

    def _create_chat(self) -> None:
        """Issue the remote create call and record the new chatbot ID."""
        if self.verbose:
            logging.info("Creating chat...")

        if self.chatbot_id is None:
            create_json = {
                "name": self.name,
                "max_pages": self.max_pages,
                "url_list": self.url_list,
            }
            out_json = run_webt_api(create_json, "v1/chat/create", self.api_key)
            self.chatbot_id = out_json["chatbot_id"]

    def query_database(self, query: str, num_records: int = 3) -> list:
        """
        Query the database of the chatbot.

        :param query: The query string.
        :param num_records: The number of records to return.
        :return: The query results.
        """
        if self.verbose:
            logging.info("Querying database...")

        if not self.chatbot_id:
            self.create()

        query_json = {
            "chatbot_id": self.chatbot_id,
            "query": query,
            "num_records": num_records,
        }
        out = run_webt_api(query_json, "v1/chat/database/query", self.api_key)
        return out["results"]

    def status(self) -> dict:
        """
        Get the status of the chatbot.

        :return: The chatbot status payload.
        """
        if self.verbose:
            logging.info("Getting chat...")

        if not self.chatbot_id:
            self.create()

        get_json = {
            "chatbot_id": self.chatbot_id,
        }
        out = run_webt_api(get_json, "v1/chat/get", self.api_key)
        return out["chatbot"]

    def add_urls(self, url_list: list) -> None:
        """
        Add URLs to the chatbot.

        :param url_list: A list of URLs to add.
        """
        if self.verbose:
            logging.info("Querying database...")

        if not self.chatbot_id:
            self.create()

        query_json = {
            "chatbot_id": self.chatbot_id,
            "max_pages": self.max_pages,
            "url_list": url_list,
        }
        run_webt_api(query_json, "v1/chat/urls/add", self.api_key)

    def delete_crawls(self, crawl_id_list: list) -> None:
        """
        Delete crawls from the chatbot.

        :param crawl_id_list: A list of crawl IDs to delete.
        """
        if self.verbose:
            logging.info("Querying database...")

        if not self.chatbot_id:
            self.create()

        query_json = {
            "chatbot_id": self.chatbot_id,
            "crawl_id_list": crawl_id_list,
        }
        run_webt_api(query_json, "v1/chat/crawls/delete", self.api_key)
167 |
168 |
def get_chatbot(chatbot_id: str, api_key: Optional[str] = None) -> Chatbot:
    """
    Fetch an existing chatbot from Web Transpose and wrap it in a Chatbot.

    :param chatbot_id: The ID of the chatbot.
    :param api_key: The API key; falls back to the WEBTRANSPOSE_API_KEY env var.
    :return: The chatbot.
    :raises ValueError: If no API key can be resolved.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
    if api_key is None:
        raise ValueError(
            "No Web Transpose API provided. \n\nTo use Chatbots, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com."
        )
    get_json = {
        "chatbot_id": chatbot_id,
    }
    chat_json = run_webt_api(get_json, "v1/chat/get", api_key)
    chatbot_data = chat_json.get('chatbot', {})
    # Bug fix: propagate the resolved api_key; previously Chatbot re-read the
    # environment and raised when only the explicit argument was provided.
    chatbot = Chatbot(
        chatbot_id=chatbot_data.get('id'),
        name=chatbot_data.get('name'),
        max_pages=chatbot_data.get('num_run', 100),
        api_key=api_key,
        verbose=False,
        _created=True
    )
    return chatbot
--------------------------------------------------------------------------------
/src/webtranspose/crawl.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | import logging
4 | import os
5 | import shutil
6 | import tempfile
7 | import urllib.parse
8 | import uuid
9 | import zipfile
10 | from datetime import datetime
11 | from fnmatch import fnmatch
12 | from typing import Dict, List, Optional, Set
13 | from urllib.parse import urljoin, urlparse, urlunparse
14 |
15 | import httpx
16 | from bs4 import BeautifulSoup
17 |
18 | from .webt_api import run_webt_api
19 |
20 |
21 | class Crawl:
22 | def __init__(
23 | self,
24 | url: str,
25 | allowed_urls: List[str] = [],
26 | banned_urls: List[str] = [],
27 | n_workers: int = 1,
28 | max_pages: int = 15,
29 | render_js: bool = False,
30 | output_dir: str = "webtranspose-out",
31 | verbose: bool = False,
32 | api_key: Optional[str] = None,
33 | _created: bool = False,
34 | ) -> None:
35 | """
36 | Initialize the Crawl object.
37 |
38 | :param url: The base URL to start crawling from.
39 | :param allowed_urls: A list of allowed URLs to crawl.
40 | :param banned_urls: A list of banned URLs to exclude from crawling.
41 | :param n_workers: The number of worker tasks to use for crawling.
42 | :param max_pages: The maximum number of pages to crawl.
43 | :param render_js: Whether to render JavaScript on crawled pages.
44 | :param output_dir: The directory to store the crawled data.
45 | :param verbose: Whether to print verbose logging messages.
46 | :param api_key: The API key to use for webt_api calls.
47 | """
48 | self.api_key = api_key
49 | if self.api_key is None:
50 | self.api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
51 |
52 | self.base_url = url
53 | self.allowed_urls = allowed_urls
54 | self.banned_urls = banned_urls
55 | self.max_pages = max_pages
56 | self.queue = asyncio.Queue()
57 | self.queue.put_nowait(
58 | {
59 | "url": self.base_url,
60 | "parent_urls": [],
61 | }
62 | )
63 | self.output_dir = output_dir
64 | self.visited_urls = {}
65 | self.failed_urls = set()
66 | self.ignored_urls = set()
67 | self.n_workers = n_workers
68 | if not os.path.exists(self.output_dir):
69 | os.makedirs(self.output_dir)
70 | self.created = _created
71 | self.render_js = render_js
72 | self.crawl_id = None
73 | if self.api_key is None:
74 | self.crawl_id = str(uuid.uuid4())
75 | self.verbose = verbose
76 |
77 | api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
78 | if api_key is None and self.api_key is None:
79 | logging.warning(
80 | "No Web Transpose API provided. Lite version in use...\n\nTo run your Web Crawl on the Web Transpose API, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com. Run cheaper with logging and advanced analytics."
81 | )
82 |
83 | @staticmethod
84 | async def crawl_worker(
85 | name: str,
86 | queue: asyncio.Queue,
87 | crawl_id: str,
88 | visited_urls: Dict[str, str],
89 | allowed_urls: List[str],
90 | failed_urls: Set[str],
91 | banned_urls: List[str],
92 | output_dir: str,
93 | base_url: str,
94 | max_pages: int,
95 | leftover_queue: asyncio.Queue,
96 | ignored_queue: asyncio.Queue,
97 | verbose: bool,
98 | ) -> None:
99 | """
100 | Worker function for crawling URLs.
101 |
102 | :param name: The name of the worker.
103 | :param queue: The queue of URLs to crawl.
104 | :param crawl_id: The ID of the crawl.
105 | :param visited_urls: A dictionary of visited URLs and their file paths.
106 | :param allowed_urls: A list of allowed URLs to crawl.
107 | :param banned_urls: A list of banned URLs to exclude from crawling.
108 | :param output_dir: The directory to store the crawled data.
109 | :param base_url: The base URL of the crawl.
110 | :param max_pages: The maximum number of pages to crawl.
111 | :param leftover_queue: The queue for leftover URLs.
112 | :param ignored_queue: The queue for ignored URLs.
113 | :param verbose: Whether to print verbose logging messages.
114 | """
115 |
116 | def _lint_url(url: str) -> str:
117 | """
118 | Lint the given URL by removing the fragment component.
119 |
120 | :param url: The URL to lint.
121 | :return: The linted URL.
122 | """
123 | parsed_url = urlparse(url)
124 | cleaned_url = parsed_url._replace(fragment="")
125 | return urlunparse(cleaned_url)
126 |
127 | if verbose:
128 | logging.info(f"{name}: Starting crawl of {base_url}")
129 | while max_pages is None or len(visited_urls) < max_pages or not queue.empty():
130 | curr_url_data = await queue.get()
131 | curr_url = curr_url_data["url"]
132 | parent_urls = curr_url_data["parent_urls"]
133 | base_url_netloc = urlparse(base_url).netloc
134 | if (
135 | (
136 | (
137 | urlparse(curr_url).netloc == base_url_netloc
138 | and not any(fnmatch(curr_url, banned) for banned in banned_urls)
139 | )
140 | or any(fnmatch(curr_url, allowed) for allowed in allowed_urls)
141 | )
142 | and curr_url not in visited_urls
143 | and len(visited_urls) < max_pages
144 | ):
145 | base_dir = os.path.join(output_dir, base_url_netloc)
146 | if not os.path.exists(base_dir):
147 | os.makedirs(base_dir)
148 | filename = urllib.parse.quote_plus(curr_url).replace("/", "_")
149 | filepath = os.path.join(base_dir, filename) + ".json"
150 | async with httpx.AsyncClient() as client:
151 | try:
152 | page = await client.get(curr_url)
153 | except:
154 | failed_urls.add(curr_url)
155 | queue.task_done()
156 | continue
157 |
158 | page_title = None
159 | page_html = None
160 | page_text = None
161 | try:
162 | page_type = "html"
163 | soup = BeautifulSoup(page.content, "lxml")
164 | page_title = soup.title.string if soup.title else ""
165 | page_html = page.content.decode("utf-8")
166 | page_text = soup.get_text()
167 | child_urls = list(
168 | set(
169 | [
170 | _lint_url(urljoin(base_url, link.get("href")))
171 | for link in soup.find_all(href=True)
172 | ]
173 | )
174 | )
175 | for url in child_urls:
176 | if url.startswith("http"):
177 | queue.put_nowait(
178 | {
179 | "url": url,
180 | "parent_urls": parent_urls + [curr_url],
181 | }
182 | )
183 | except:
184 | child_urls = []
185 | page_type = "other"
186 |
187 | visited_urls[curr_url] = filepath
188 | data = {
189 | "crawl_id": crawl_id,
190 | "url": curr_url,
191 | "type": page_type,
192 | "title": page_title,
193 | "date": datetime.now().isoformat(),
194 | "parent_urls": parent_urls,
195 | "child_urls": child_urls,
196 | "html": page_html,
197 | "text": page_text,
198 | }
199 | with open(filepath, "w") as f:
200 | json.dump(data, f)
201 |
202 | elif curr_url not in visited_urls and (
203 | urlparse(curr_url).netloc == urlparse(base_url).netloc
204 | or any(fnmatch(curr_url, allowed) for allowed in allowed_urls)
205 | ):
206 | leftover_queue.put_nowait(
207 | {
208 | "url": curr_url,
209 | "parent_urls": parent_urls,
210 | }
211 | )
212 |
213 | else:
214 | ignored_queue.put_nowait(curr_url)
215 |
216 | queue.task_done()
217 |
218 | def create_crawl_api(self):
219 | """
220 | Creates a Crawl on https://webtranspose.com
221 | """
222 | if self.verbose:
223 | logging.info(f"Creating crawl of {self.base_url} on Web Transpose...")
224 | create_json = {
225 | "url": self.base_url,
226 | "render_js": self.render_js,
227 | "max_pages": self.max_pages,
228 | "allowed_urls": self.allowed_urls,
229 | "banned_urls": self.banned_urls,
230 | }
231 | out_json = run_webt_api(
232 | create_json,
233 | "v1/crawl/create",
234 | self.api_key,
235 | )
236 | self.crawl_id = out_json["crawl_id"]
237 | self.created = True
238 |
239 | def queue_crawl(self):
240 | """
241 | Resume crawling of Crawl object. Don't wait for it to finish crawling.
242 | """
243 | if self.verbose:
244 | logging.info(f"Starting crawl of {self.base_url} on Web Transpose...")
245 |
246 | if self.api_key is None:
247 | logging.error("Cannot queue a local crawl. Please use the crawl() method.")
248 |
249 | else:
250 | if not self.created:
251 | self.create_crawl_api()
252 | queue_json = {
253 | "crawl_id": self.crawl_id,
254 | }
255 | out = run_webt_api(
256 | queue_json,
257 | "v1/crawl/resume",
258 | self.api_key,
259 | )
260 |
    async def crawl(self):
        """
        Resume crawling of Crawl object.

        Local (lite) crawls run worker tasks in-process until the queue drains;
        cloud crawls are queued on the Web Transpose API and polled until done.

        :return: This Crawl object (cloud mode); local mode returns None.
        """
        if self.verbose:
            logging.info(f"Starting crawl of {self.base_url}...")
        if self.api_key is None:
            leftover_queue = asyncio.Queue()
            ignored_queue = asyncio.Queue()
            tasks = []
            for i in range(self.n_workers):
                task = asyncio.create_task(
                    self.crawl_worker(
                        f"worker-{i}",
                        self.queue,
                        self.crawl_id,
                        self.visited_urls,
                        self.allowed_urls,
                        self.failed_urls,
                        self.banned_urls,
                        self.output_dir,
                        self.base_url,
                        self.max_pages,
                        leftover_queue,
                        ignored_queue,
                        self.verbose,
                    )
                )
                tasks.append(task)

            # Wait until every queued URL is processed, then stop the workers.
            await self.queue.join()
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            # URLs skipped due to the page budget become the queue for a resume.
            self.queue = leftover_queue
            # NOTE(review): reads asyncio.Queue internals (_queue) — fragile.
            self.ignored_urls = list(ignored_queue._queue)
            self.to_metadata()
        else:
            self.queue_crawl()
            status = self.status()
            # Poll until the remote crawl shows any progress at all.
            while status["num_queued"] + status["num_visited"] + status["num_ignored"] == 0:
                await asyncio.sleep(5)
                status = self.status()

            # NOTE(review): this condition can never hold — the loop above only
            # exits once the same sum is non-zero. Likely meant to check
            # visited/queued only; confirm intended behavior.
            if (status["num_failed"] > 0) and (
                status["num_queued"] + status["num_visited"] + status["num_ignored"] == 0
            ):
                raise Exception("The first page crawled failed")

            # Poll until the queue drains or the page budget is reached.
            while status["num_queued"] > 0 and status["num_visited"] < status["max_pages"]:
                await asyncio.sleep(5)
                status = self.status()
            return self
314 |
    def get_queued(self, max_pages: int = 30) -> list:
        """
        Get a list of URLs from the queue.

        Args:
            max_pages (int): The number of URLs to retrieve from the queue. Defaults to 30.

        Returns:
            list: A list of URLs from the queue.
        """
        if self.api_key is None:
            urls = []
            # Drain up to max_pages items, then push them back afterwards.
            for _ in range(max_pages):
                try:
                    url = self.queue.get_nowait()
                    urls.append(url)
                except asyncio.QueueEmpty:
                    break
            # NOTE(review): re-appending puts these items behind any remaining
            # entries, rotating queue order whenever it holds > max_pages items.
            for url in urls:
                self.queue.put_nowait(url)
            return urls
        else:
            if not self.created:
                # Nothing queued remotely yet; only the base URL is pending.
                return [self.base_url]
            queue_json = {
                "crawl_id": self.crawl_id,
                "max_pages": max_pages,
            }
            out_json = run_webt_api(
                queue_json,
                "v1/crawl/get-queue",
                self.api_key,
            )
            return out_json["urls"]
349 |
350 | def set_allowed_urls(self, allowed_urls: list) -> "Crawl":
351 | """
352 | Set the allowed URLs for the crawl.
353 |
354 | Args:
355 | allowed_urls (list): A list of allowed URLs.
356 |
357 | Returns:
358 | self: The Crawl object.
359 | """
360 | self.allowed_urls = allowed_urls
361 | if not self.created:
362 | self.to_metadata()
363 | else:
364 | update_json = {
365 | "crawl_id": self.crawl_id,
366 | "allowed_urls": allowed_urls,
367 | }
368 | run_webt_api(
369 | update_json,
370 | "v1/crawl/set-allowed",
371 | self.api_key,
372 | )
373 | return self
374 |
375 | def set_banned_urls(self, banned_urls: list) -> "Crawl":
376 | """
377 | Set the banned URLs for the crawl.
378 |
379 | Args:
380 | banned_urls (list): A list of banned URLs.
381 |
382 | Returns:
383 | self: The Crawl object.
384 | """
385 | self.banned_urls = banned_urls
386 | if not self.created:
387 | self.to_metadata()
388 | else:
389 | update_json = {
390 | "crawl_id": self.crawl_id,
391 | "banned_urls": banned_urls,
392 | }
393 | run_webt_api(
394 | update_json,
395 | "v1/crawl/set-banned",
396 | self.api_key,
397 | )
398 | return self
399 |
400 | def get_filename(self, url: str) -> str:
401 | """
402 | Get the filename associated with a visited URL.
403 |
404 | Args:
405 | url (str): The visited URL.
406 |
407 | Returns:
408 | str: The filename associated with the visited URL.
409 |
410 | Raises:
411 | ValueError: If the URL is not found in the visited URLs.
412 | """
413 | try:
414 | return self.visited_urls[url]
415 | except KeyError:
416 | raise ValueError(f"URL {url} not found in visited URLs")
417 |
418 | def set_max_pages(self, max_pages: int) -> "Crawl":
419 | """
420 | Set the maximum number of pages to crawl.
421 |
422 | Args:
423 | max_pages (int): The maximum number of pages to crawl.
424 |
425 | Returns:
426 | self: The Crawl object.
427 | """
428 | if not self.created:
429 | self.max_pages = max_pages
430 | self.to_metadata()
431 | else:
432 | max_pages_json = {
433 | "crawl_id": self.crawl_id,
434 | "max_pages": max_pages,
435 | }
436 | run_webt_api(
437 | max_pages_json,
438 | "v1/crawl/set-max-pages",
439 | self.api_key,
440 | )
441 | return self
442 |
443 | def status(self) -> dict:
444 | """
445 | Get the status of the Crawl object.
446 |
447 | Returns:
448 | dict: The status of the Crawl object.
449 | """
450 | if not self.created:
451 | status_json = {
452 | "crawl_id": self.crawl_id,
453 | "loc": "local" if self.api_key is None else "cloud",
454 | "base_url": self.base_url,
455 | "max_pages": self.max_pages,
456 | "num_visited": len(self.visited_urls),
457 | "num_ignored": len(self.ignored_urls),
458 | "num_failed": len(self.failed_urls),
459 | "num_queued": self.queue.qsize(),
460 | "banned_urls": self.banned_urls,
461 | "allowed_urls": self.allowed_urls,
462 | }
463 | status_json["n_workers"] = self.n_workers
464 | return status_json
465 |
466 | status_json = {
467 | "crawl_id": self.crawl_id,
468 | }
469 | crawl_status = run_webt_api(
470 | status_json,
471 | "v1/crawl/get",
472 | self.api_key,
473 | )
474 | crawl_status["loc"] = "cloud"
475 | if self.verbose:
476 | logging.info(f"Status of crawl {self.crawl_id}: {crawl_status}")
477 | return crawl_status
478 |
479 | def get_ignored(self) -> list:
480 | """
481 | Get a list of ignored URLs.
482 |
483 | Returns:
484 | list: A list of ignored URLs.
485 | """
486 | if not self.created:
487 | return list(self.ignored_urls)
488 |
489 | ignored_json = {
490 | "crawl_id": self.crawl_id,
491 | }
492 | out_json = run_webt_api(
493 | ignored_json,
494 | "v1/crawl/get/ignored",
495 | self.api_key,
496 | )
497 | return out_json["pages"]
498 |
499 | def get_failed(self) -> list:
500 | """
501 | Get a list of failed URLs.
502 |
503 | Returns:
504 | list: A list of failed URLs.
505 | """
506 | if not self.created:
507 | return list(self.failed_urls)
508 |
509 | visited_json = {
510 | "crawl_id": self.crawl_id,
511 | }
512 | out_json = run_webt_api(
513 | visited_json,
514 | "v1/crawl/get/failed",
515 | self.api_key,
516 | )
517 | return out_json["pages"]
518 |
519 | def get_visited(self) -> list:
520 | """
521 | Get a list of visited URLs.
522 |
523 | Returns:
524 | list: A list of visited URLs.
525 | """
526 | if not self.created:
527 | return list(self.visited_urls)
528 |
529 | visited_json = {
530 | "crawl_id": self.crawl_id,
531 | }
532 | out_json = run_webt_api(
533 | visited_json,
534 | "v1/crawl/get/visited",
535 | self.api_key,
536 | )
537 | return out_json["pages"]
538 |
539 | def get_banned(self) -> list:
540 | """
541 | Get a list of banned URLs.
542 |
543 | Returns:
544 | list: A list of banned URLs.
545 | """
546 | if not self.created:
547 | return list(self.banned_urls)
548 |
549 | banned_json = {
550 | "crawl_id": self.crawl_id,
551 | }
552 | out_json = run_webt_api(
553 | banned_json,
554 | "v1/crawl/get/banned",
555 | self.api_key,
556 | )
557 | return out_json["pages"]
558 |
    def download(self):
        """
        Download the output of the crawl.

        For cloud crawls, fetches the zipped output via a presigned URL and
        unpacks each page JSON into ``output_dir/<netloc>/`` using the same
        quote_plus filename scheme as local crawls.
        """
        if self.verbose:
            logging.info(f"Downloading crawl of {self.base_url}...")

        if self.created:
            download_json = {
                "crawl_id": self.crawl_id,
            }
            out_json = run_webt_api(
                download_json,
                "v1/crawl/download",
                self.api_key,
            )
            # The API returns a short-lived presigned URL for the zipped crawl.
            presigned_url = out_json["url"]
            with tempfile.TemporaryDirectory() as tmpdir:
                zip_file_path = os.path.join(tmpdir, "temp.zip")
                with open(zip_file_path, "wb") as f:
                    response = httpx.get(presigned_url)
                    f.write(response.content)

                with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
                    zip_ref.extractall(tmpdir)

                # Re-home every extracted page JSON under output_dir/<netloc>/.
                for root, _, files in os.walk(tmpdir):
                    for file in files:
                        if file.endswith(".json"):
                            json_file = os.path.join(root, file)
                            with open(json_file, "r") as f:
                                data = json.load(f)
                            url = data["url"]
                            base_url_netloc = urlparse(self.base_url).netloc
                            base_dir = os.path.join(self.output_dir, base_url_netloc)
                            if not os.path.exists(base_dir):
                                os.makedirs(base_dir)
                            filename = urllib.parse.quote_plus(url).replace("/", "_")
                            filepath = os.path.join(base_dir, filename) + ".json"
                            shutil.move(json_file, filepath)

        logging.info(f"The output of the crawl can be found at: {self.output_dir}")
601 |
602 | def to_metadata(self) -> None:
603 | """
604 | Save the metadata of the Crawl object to a file.
605 | """
606 | if not self.created:
607 | filename = os.path.join(self.output_dir, f"{self.crawl_id}.json")
608 | metadata = {
609 | "crawl_id": self.crawl_id,
610 | "n_workers": self.n_workers,
611 | "base_url": self.base_url,
612 | "max_pages": self.max_pages,
613 | "visited_urls": self.visited_urls,
614 | "ignored_urls": list(self.ignored_urls),
615 | "render_js": self.render_js,
616 | "queue": list(self.queue._queue),
617 | "banned_urls": self.banned_urls,
618 | "allowed_urls": self.allowed_urls,
619 | "output_dir": self.output_dir,
620 | }
621 | with open(filename, "w") as file:
622 | json.dump(metadata, file)
623 |
624 | @staticmethod
625 | def from_metadata(crawl_id: str, output_dir: str = "webtranspose-out") -> "Crawl":
626 | """
627 | Create a Crawl object from metadata stored in a file.
628 |
629 | Args:
630 | crawl_id (str): The ID of the crawl.
631 | output_dir (str, optional): The directory to store the crawled data. Defaults to "webtranspose-out".
632 |
633 | Returns:
634 | Crawl: The Crawl object.
635 | """
636 | filename = os.path.join(output_dir, f"{crawl_id}.json")
637 | with open(filename, "r") as file:
638 | metadata = json.load(file)
639 | crawl = Crawl(
640 | metadata["base_url"],
641 | metadata["allowed_urls"],
642 | metadata["banned_urls"],
643 | metadata["n_workers"],
644 | metadata["max_pages"],
645 | render_js=metadata["render_js"],
646 | output_dir=metadata["output_dir"],
647 | )
648 | crawl.crawl_id = metadata["crawl_id"]
649 | crawl.visited_urls = metadata["visited_urls"]
650 | crawl.ignored_urls = set(metadata["ignored_urls"])
651 | crawl.queue = asyncio.Queue()
652 | for url in metadata["queue"]:
653 | crawl.queue.put_nowait(url)
654 | return crawl
655 |
656 | @staticmethod
657 | def from_cloud(crawl_id: str, api_key: Optional[str] = None) -> "Crawl":
658 | """
659 | Create a Crawl object from metadata stored in the cloud.
660 |
661 | Args:
662 | crawl_id (str): The ID of the crawl.
663 | api_key (str, optional): The API key for accessing the cloud. Defaults to None.
664 |
665 | Returns:
666 | Crawl: The Crawl object.
667 | """
668 | if api_key is None:
669 | api_key = os.environ.get("WEBTRANSPOSE_API_KEY")
670 |
671 | if api_key is not None:
672 | get_json = {
673 | "crawl_id": crawl_id,
674 | }
675 | out_json = run_webt_api(get_json, "v1/crawl/get", api_key)
676 | crawl = Crawl(
677 | out_json["base_url"],
678 | out_json["allowed_urls"],
679 | out_json["banned_urls"],
680 | max_pages=out_json["max_pages"],
681 | render_js=out_json["render_js"],
682 | api_key=api_key,
683 | _created=True,
684 | )
685 | crawl.crawl_id = out_json["crawl_id"]
686 | return crawl
687 |
688 | raise ValueError(
689 | "API key not found. Please set WEBTRANSPOSE_API_KEY environment variable or pass api_key argument."
690 | )
691 |
692 | def __str__(self) -> str:
693 | """
694 | Get a string representation of the Crawl object.
695 |
696 | Returns:
697 | str: The string representation of the Crawl object.
698 | """
699 | status = self.status()
700 | return (
701 | f"WebTransposeCrawl(\n"
702 | f" Crawl ID: {status['crawl_id']}\n"
703 | f" Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n"
704 | f" Base URL: {status['base_url']}\n"
705 | f" Max Pages: {status['max_pages']}\n"
706 | f" Number of Visited URLs: {status['num_visited']}\n"
707 | f" Number of Ignored URLs: {status['num_ignored']}\n"
708 | f" Number of Queued URLs: {status['num_queued']}\n"
709 | f" Number of Failed URLs: {status['num_failed']}\n"
710 | f" Banned URLs: {status['banned_urls']}\n"
711 | f" Allowed URLs: {status['allowed_urls']}"
712 | f")"
713 | )
714 |
715 | def __repr__(self) -> str:
716 | """
717 | Get a string representation of the Crawl object.
718 |
719 | Returns:
720 | str: The string representation of the Crawl object.
721 | """
722 | status = self.status()
723 | return (
724 | f"WebTransposeCrawl(\n"
725 | f" Crawl ID: {status['crawl_id']}\n"
726 | f" Number of Workers: {status['n_workers'] if 'n_workers' in status else 'cloud'}\n"
727 | f" Base URL: {status['base_url']}\n"
728 | f" Max Pages: {status['max_pages']}\n"
729 | f" Number of Visited URLs: {status['num_visited']}\n"
730 | f" Number of Ignored URLs: {status['num_ignored']}\n"
731 | f" Number of Queued URLs: {status['num_queued']}\n"
732 | f" Number of Failed URLs: {status['num_failed']}\n"
733 | f" Banned URLs: {status['banned_urls']}\n"
734 | f" Allowed URLs: {status['allowed_urls']}"
735 | f")"
736 | )
737 |
def get_page(self, url: str) -> dict:
    """
    Get the page data for a given URL.

    Args:
        url (str): The URL of the page.

    Returns:
        dict: The page data, or None if the page cannot be found locally.
    """
    if not self.created:
        # Local crawl: page data is stored as a JSON file whose path is
        # recorded in self.visited_urls.
        try:
            fn = self.visited_urls[url]
            with open(fn, "r") as f:
                return json.load(f)
        except (KeyError, OSError, json.JSONDecodeError):
            # KeyError: URL was never visited; OSError: file missing or
            # unreadable; JSONDecodeError: corrupt metadata file.
            # (Previously a bare `except` that also hid KeyboardInterrupt,
            # and the visited_urls lookup could raise uncaught.)
            logging.error(f"Could not find HTML for URL {url}")
            return None
    else:
        get_json = {
            "crawl_id": self.crawl_id,
            "url": url,
        }
        out_json = run_webt_api(
            get_json,
            "v1/crawl/get-page",
            self.api_key,
        )
        return out_json
767 |
def get_child_urls(self, url: str) -> list:
    """
    Get the child URLs for a given URL.

    Args:
        url (str): The URL.

    Returns:
        list: A list of child URLs, or None if unavailable locally.
    """
    if not self.created:
        # Local crawl: the page's JSON file carries a "child_urls" list.
        try:
            fn = self.visited_urls[url]
            with open(fn, "r") as f:
                data = json.load(f)
            return data["child_urls"]
        except (KeyError, OSError, json.JSONDecodeError):
            # KeyError: URL never visited or "child_urls" absent; OSError:
            # file missing/unreadable; JSONDecodeError: corrupt file.
            # (Previously two bare `except` blocks with a duplicated message.)
            logging.error(f"Could not find child URLs for URL {url}")
            return None
    else:
        get_json = {
            "crawl_id": self.crawl_id,
            "url": url,
        }
        out_json = run_webt_api(
            get_json,
            "v1/crawl/get-child-urls",
            self.api_key,
        )
        return out_json
801 |
def retry_failed_urls(self) -> None:
    """
    Queue failed URLs from this crawl for another attempt.

    Logs an error for crawls that were never created; does nothing when no
    API key is available.
    """
    if not self.created:
        logging.error("Cannot retry failed URLs for un-created crawl.")
        return
    if self.api_key is None:
        return
    run_webt_api(
        {"crawl_id": self.crawl_id},
        "v1/crawl/retry-failed",
        self.api_key,
    )
817 |
818 |
def get_crawl(crawl_id: str, api_key: Optional[str] = None) -> Crawl:
    """
    Get a Crawl object based on the crawl ID.

    Tries the local on-disk metadata first and falls back to the cloud copy.

    Args:
        crawl_id (str): The ID of the crawl.
        api_key (str, optional): The API key. Defaults to None.

    Returns:
        Crawl: The Crawl object.
    """
    try:
        crawl = Crawl.from_metadata(crawl_id)
    except FileNotFoundError:
        crawl = Crawl.from_cloud(crawl_id, api_key=api_key)
    return crawl
834 |
835 |
def list_crawls(loc: str = "cloud", api_key: Optional[str] = None) -> list:
    """
    List all available crawls.

    Args:
        loc (str, optional): Where to look: 'cloud' or 'local'. Defaults to 'cloud'.
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Returns:
        list: Crawl metadata dicts (cloud) or Crawl objects (local).

    Raises:
        ValueError: If loc is neither 'cloud' nor 'local'.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    if api_key is not None and loc == "cloud":
        crawl_list_data = run_webt_api(
            {},
            "v1/crawl/list",
            api_key,
        )
        return crawl_list_data["crawls"]

    if loc == "local" or api_key is None:
        # Local crawls are stored as <crawl_id>.json in the working directory;
        # strip the ".json" extension to recover each crawl id.
        return [
            Crawl.from_metadata(filename[:-5])
            for filename in os.listdir(".")
            if filename.endswith(".json")
        ]

    # Previously this path fell through and silently returned None.
    raise ValueError(f"Unknown crawl location: {loc!r} (expected 'cloud' or 'local')")
864 |
865 |
def retry_failed(crawl_id: str, api_key: Optional[str] = None) -> None:
    """
    Queue failed URLs from a crawl.

    Args:
        crawl_id (str): The ID of the crawl.
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Raises:
        ValueError: If no API key is provided or found in the environment.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    if api_key is None:
        # Previously a missing key made this a silent no-op; fail loudly
        # instead, matching search() and get_scraper() in this package.
        raise ValueError(
            "Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables."
        )

    run_webt_api(
        {"crawl_id": crawl_id},
        "v1/crawl/retry-failed",
        api_key,
    )
886 |
--------------------------------------------------------------------------------
/src/webtranspose/openai.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | import openai
5 | import tiktoken
6 |
7 |
class OpenAIScraper:
    """Fallback scraper that extracts schema-shaped data from HTML via OpenAI function calling."""

    def __init__(
        self,
        chunk_size: int = 2500,
        overlap_size: int = 100,
    ):
        """
        Initialize the OpenAIScraper.

        Args:
            chunk_size (int, optional): Tokens per chunk of text to process. Defaults to 2500.
            overlap_size (int, optional): Tokens shared between consecutive chunks. Defaults to 100.
        """
        self.api_key = os.environ.get("OPENAI_API_KEY")
        self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        self.chunk_size = chunk_size
        self.overlap_size = overlap_size

    @staticmethod
    def process_html(
        text: str, chunk_size: int, overlap_size: int, encoding: "tiktoken.Encoding"
    ) -> list:
        """
        Split text into overlapping token chunks.

        Args:
            text (str): The HTML text to process.
            chunk_size (int): The size of each chunk, in tokens.
            overlap_size (int): Tokens shared between consecutive chunks.
            encoding (tiktoken.Encoding): The tokenizer used to encode/decode.

        Returns:
            list: A list of decoded text chunks.

        Raises:
            ValueError: If overlap_size >= chunk_size (the loop would not advance).
        """
        encoded = encoding.encode(text)
        if overlap_size >= chunk_size:
            raise ValueError("Overlap size should be less than chunk size.")
        chunks = []
        idx = 0
        while idx < len(encoded):
            end_idx = idx + chunk_size
            chunks.append(encoded[idx:end_idx])
            # Advance by (chunk_size - overlap_size) so chunks overlap.
            idx = end_idx - overlap_size
        decoded_chunks = [encoding.decode(chunk) for chunk in chunks]
        return decoded_chunks

    def scrape(self, html: str, schema: dict) -> dict:
        """
        Scrape the HTML text using the provided schema.

        Args:
            html (str): The HTML text to scrape.
            schema (dict): The schema to use for scraping.

        Returns:
            dict: The scraped data.
        """
        processed_schema = self.transform_schema(schema)
        schema_keys = ", ".join(processed_schema.keys())
        out_data = {}

        for sub_html in self.process_html(html, self.chunk_size, self.overlap_size, self.encoding):
            # 2500 tokens is a hard-coded cutoff for switching to the
            # larger-context model, independent of self.chunk_size.
            model = "gpt-3.5-turbo-0613"
            if len(self.encoding.encode(sub_html)) > 2500:
                model = "gpt-3.5-turbo-16k"

            response = openai.ChatCompletion.create(
                model=model,
                temperature=0,
                messages=[{"role": "user", "content": sub_html}],
                functions=[
                    {
                        "name": "extract_info",
                        "description": f"Extract the {schema_keys} from the website text if any exist. Empty if not found.",
                        "parameters": {
                            "type": "object",
                            "properties": processed_schema,
                            "required": list(processed_schema.keys()),
                        },
                    },
                ],
            )
            out = response["choices"][0]["message"]

            if "function_call" in out:
                args = json.loads(out["function_call"]["arguments"])

                for k in args.keys():
                    if k in processed_schema:
                        if processed_schema[k]["type"] == "array":
                            # Array values accumulate across chunks.
                            if k not in out_data:
                                out_data[k] = []
                            out_data[k] += args[k]
                        else:
                            # Scalar values: first hit wins; drop the key from
                            # the schema so later chunks stop asking for it.
                            out_data[k] = args[k]
                            del processed_schema[k]
                    elif k not in out_data:
                        out_data[k] = None

        return out_data

    def transform_schema(self, schema: dict) -> dict:
        """
        Transform the schema into the format required by OpenAI.

        Args:
            schema (dict): The schema to transform.

        Returns:
            dict: The transformed schema.

        Raises:
            ValueError: If a list value in the schema is empty (no enum
                options to offer). Previously a bare Exception.
        """
        openai_type_map = {
            "str": "string",
            "int": "number",
            "bool": "boolean",
        }

        properties = {}
        for key, value in schema.items():
            if isinstance(value, dict):
                if "type" in value and value["type"] == "array":
                    # Nested list-of-objects schema: recurse into the items.
                    properties[key] = {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": self.transform_schema(value["items"]),
                        },
                        "required": list(value["items"].keys()),
                    }
                elif "type" in value:
                    # Already in OpenAI property form; pass through unchanged.
                    properties[key] = value
                else:
                    properties[key] = self.transform_schema(value)
            elif isinstance(value, list):
                # A plain list is treated as an enum of allowed values.
                try:
                    properties[key] = {
                        "type": openai_type_map[type(value[0]).__name__],
                        "enum": value,
                        "description": key,
                    }
                except IndexError:
                    raise ValueError(f"Empty list for key {key}")
            else:
                # A bare string like "string" becomes a simple typed property.
                properties[key] = {
                    "type": value,
                    "description": key,
                }

        return properties
157 |
--------------------------------------------------------------------------------
/src/webtranspose/scrape.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import re
5 | import uuid
6 |
7 | import requests
8 | from bs4 import BeautifulSoup
9 |
10 | from .openai import OpenAIScraper
11 | from .webt_api import run_webt_api
12 |
13 |
class Scraper:
    """AI web scraper: uses the Web Transpose API when a key is available, else a local OpenAI fallback."""

    def __init__(
        self,
        schema: dict,
        scraper_id: str = None,
        name: str = None,
        render_js: bool = False,
        verbose: bool = False,
        scraper: "OpenAIScraper" = None,
        api_key: str = None,
        proxy: str = None,
        _created: bool = False,
    ):
        """
        Initialize the Scraper object.

        Args:
            schema (dict): The schema for scraping.
            scraper_id (str, optional): The ID of the scraper. Defaults to None (random UUID).
            name (str, optional): The name of the scraper. Defaults to None ("New Scraper").
            render_js (bool, optional): Whether to render JavaScript. Defaults to False.
            verbose (bool, optional): Whether to print verbose output. Defaults to False.
            scraper (OpenAIScraper, optional): The local fallback scraper. Defaults to None.
            api_key (str, optional): The API key. Defaults to None.
            proxy (str, optional): The proxy. Defaults to None.
            _created (bool, optional): Whether the scraper already exists remotely. Defaults to False.
        """
        self.api_key = api_key
        if self.api_key is None:
            self.api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

        self.name = name if name is not None else "New Scraper"
        self.schema = schema
        self.verbose = verbose
        self.scraper = scraper
        self.render_js = render_js
        self.scraper_id = scraper_id
        self.proxy = proxy
        if self.scraper is None:
            self.scraper = OpenAIScraper()
        if self.scraper_id is None:
            self.scraper_id = str(uuid.uuid4())
        self.created = _created

        if os.environ.get("WEBTRANSPOSE_API_KEY") is None and self.api_key is None:
            logging.warning(
                "No Web Transpose API provided. Lite version in use...\n\nTo run the actual WebT AI Web Scraper the Web Transpose API, set the WEBTRANSPOSE_API_KEY from https://webtranspose.com. Run cheaper with logging and advanced analytics."
            )

    def __repr__(self) -> str:
        """
        Get a string representation of the Scraper object.

        Returns:
            str: The string representation of the Scraper object.
        """
        status = self.status()
        schema = json.dumps(status["schema"], indent=4)
        return (
            f"WebTransposeScraper(\n"
            f"  Status ID: {status['scraper_id']}\n"
            f"  Name: {status['name']}\n"
            f"  Render JS: {status['render_js']}\n"
            f"  Schema: {schema}\n"
            f")"
        )

    def __str__(self) -> str:
        """
        Get a string representation of the Scraper object.

        Delegates to __repr__ (the two were previously duplicated).

        Returns:
            str: The string representation of the Scraper object.
        """
        return self.__repr__()

    def create_scraper_api(self):
        """
        Creates a Scraper on https://webtranspose.com
        """
        if self.verbose:
            # Was an f-string with no placeholders; plain literal, same text.
            logging.info("Creating AI Web Scraper on Web Transpose...")

        create_json = {
            "name": self.name,
            "schema": self.schema,
            "render_js": self.render_js,
            "proxy": self.proxy,
        }
        out_json = run_webt_api(
            create_json,
            "/v1/scraper/create",
            self.api_key,
        )
        self.scraper_id = out_json["scraper_id"]
        self.created = True

    def scrape(self, url=None, html=None, timeout=30):
        """
        Scrape the data from a given URL or HTML.

        Args:
            url (str, optional): The URL to scrape. Defaults to None.
            html (str, optional): The HTML to scrape. Defaults to None.
            timeout (int, optional): The timeout for the request. Defaults to 30.

        Returns:
            dict: The scraped data.

        Raises:
            ValueError: If neither URL nor HTML is provided.
        """
        if self.verbose:
            logging.info(f"Running Scraper({self.name}) on {url}...")

        if self.api_key is None:
            # Lite mode: fetch and scrape locally with the OpenAI fallback.
            if url is not None:
                response = requests.get(url, timeout=timeout)
                soup = BeautifulSoup(response.content, "html.parser")
                body = soup.body
                # Collapse all whitespace runs; raw string fixes the invalid
                # "\s" escape sequence in the original pattern.
                html = re.sub(r"\s+", " ", str(body)).strip()

            if html is None:
                raise ValueError("Must provide either a url or html.")

            return self.scraper.scrape(
                html,
                self.schema,
            )
        else:
            if not self.created:
                self.create_scraper_api()

            scrape_json = {
                "scraper_id": self.scraper_id,
                "url": url,
                "html": html,
                "proxy": self.proxy,
            }
            out_json = run_webt_api(
                scrape_json,
                "/v1/scraper/scrape",
                self.api_key,
            )
            return out_json

    def status(self):
        """
        Get the status of the Scraper.

        Returns:
            dict: The status of the Scraper (local state when no API key is
            set or the scraper was never created remotely; otherwise the
            cloud record).
        """
        if self.api_key is None or not self.created:
            return {
                "scraper_id": self.scraper_id,
                "name": self.name,
                "verbose": self.verbose,
                "render_js": self.render_js,
                "schema": self.schema,
                "proxy": self.proxy,
            }
        else:
            get_json = {
                "scraper_id": self.scraper_id,
            }
            out_api = run_webt_api(
                get_json,
                "/v1/scraper/get",
                self.api_key,
            )
            scraper = out_api["scraper"]
            return {
                "scraper_id": scraper["id"],
                "name": scraper["name"],
                "verbose": self.verbose,
                "render_js": scraper["render_js"],
                "schema": scraper["schema"],
                "proxy": scraper["proxy"],
            }
206 |
207 |
def get_scraper(scraper_id, api_key: str = None):
    """
    Get a Scraper object based on the scraper ID.

    Args:
        scraper_id (str): The ID of the scraper.
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Returns:
        Scraper: The Scraper object.

    Raises:
        ValueError: If api_key is not provided.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    if api_key is None:
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")

    out_json = run_webt_api(
        {"scraper_id": scraper_id},
        "/v1/scraper/get",
        api_key,
    )
    scraper = out_json["scraper"]
    return Scraper(
        scraper_id=scraper["id"],
        name=scraper["name"],
        schema=scraper["schema"],
        render_js=scraper["render_js"],
        api_key=api_key,
        proxy=scraper["proxy"],
        _created=True,
    )
246 |
247 |
def list_scrapers(api_key: str = None):
    """
    List all available scrapers.

    Args:
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Returns:
        list: A list of Scrapers.

    Raises:
        ValueError: If api_key is not provided.
    """
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    if api_key is None:
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")

    out_json = run_webt_api(
        {},
        "/v1/scraper/list",
        api_key,
    )
    return out_json["scrapers"]
273 |
--------------------------------------------------------------------------------
/src/webtranspose/search.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 |
4 | from .webt_api import run_webt_api
5 |
def search(query, api_key=None) -> dict:
    """
    Search for a query using the Web Transpose API.

    Args:
        query (str): The query to search for.
        api_key (str, optional): The API key to use for authentication. Defaults to None.

    Returns:
        dict: The search results.

    Raises:
        ValueError: If no API key is provided or found in the environment.
    """
    key = api_key if api_key is not None else os.environ.get("WEBTRANSPOSE_API_KEY")
    if key is None:
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")

    return run_webt_api(
        {"query": query},
        "/v1/search",
        key,
    )
31 |
32 |
def search_filter(query, api_key=None) -> dict:
    """
    Search for a query using the Web Transpose API with filtering.

    Args:
        query (str): The query to search for.
        api_key (str, optional): The API key to use for authentication. Defaults to None.

    Returns:
        dict: The filtered search results.

    Raises:
        ValueError: If no API key is provided or found in the environment.
    """
    key = api_key if api_key is not None else os.environ.get("WEBTRANSPOSE_API_KEY")
    if key is None:
        raise ValueError("Must provide api_key or set WEBTRANSPOSE_API_KEY in environment variables.")

    return run_webt_api(
        {"query": query},
        "/v1/search/filter",
        key,
    )
--------------------------------------------------------------------------------
/src/webtranspose/webt_api.py:
--------------------------------------------------------------------------------
1 | import os
2 | from urllib.parse import urljoin
3 |
4 | import requests
5 |
6 |
def run_webt_api(params: dict, api_path: str, api_key: str = None) -> dict:
    """
    Run a WebTranspose API request.

    Args:
        params (dict): The parameters for the API request.
        api_path (str): The API path.
        api_key (str, optional): The API key. Defaults to None (falls back to
            the WEBTRANSPOSE_API_KEY environment variable).

    Returns:
        dict: The JSON response from the API.

    Raises:
        Exception: If the API request fails with a non-200 status code.
    """
    base_url = "https://api.webtranspose.com/"
    if api_key is None:
        api_key = os.environ.get("WEBTRANSPOSE_API_KEY")

    response = requests.post(
        urljoin(base_url, api_path),
        headers={"X-API-Key": api_key},
        json=params,
        timeout=180,
    )
    if response.status_code != 200:
        raise Exception("API request failed with status code: {}".format(response.status_code))
    return response.json()
32 |
--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
1 | """
2 | Tasks for maintaining the project.
3 |
4 | Execute 'invoke --list' for guidance on using Invoke
5 | """
6 | import platform
7 | import webbrowser
8 | from pathlib import Path
9 | from typing import Optional
10 |
11 | from invoke import call, task
12 | from invoke.context import Context
13 | from invoke.runners import Result
14 |
# Project layout anchors, all derived from this file's location.
ROOT_DIR = Path(__file__).parent
DOCS_DIR = ROOT_DIR.joinpath("docs")
DOCS_BUILD_DIR = DOCS_DIR.joinpath("_build")
DOCS_INDEX = DOCS_BUILD_DIR.joinpath("index.html")
# Coverage artifacts produced by coverage.py.
COVERAGE_FILE = ROOT_DIR.joinpath(".coverage")
COVERAGE_DIR = ROOT_DIR.joinpath("htmlcov")
COVERAGE_REPORT = COVERAGE_DIR.joinpath("index.html")
SOURCE_DIR = ROOT_DIR.joinpath("src/webtranspose")
TEST_DIR = ROOT_DIR.joinpath("tests")
# Paths handed to the formatters/linters (isort, black, flakeheaven, mypy).
PYTHON_TARGETS = [
    SOURCE_DIR,
    TEST_DIR,
    ROOT_DIR.joinpath("noxfile.py"),
    Path(__file__),
]
PYTHON_TARGETS_STR = " ".join([str(p) for p in PYTHON_TARGETS])
31 |
32 |
def _run(c: Context, command: str) -> Optional[Result]:
    """Run *command* through the context, using a pty everywhere but Windows."""
    use_pty = platform.system() != "Windows"
    return c.run(command, pty=use_pty)
35 |
36 |
@task()
def clean_build(c):
    # type: (Context) -> None
    """Clean up files from package building."""
    for command in (
        "rm -fr build/",
        "rm -fr dist/",
        "rm -fr .eggs/",
        "find . -name '*.egg-info' -exec rm -fr {} +",
        "find . -name '*.egg' -exec rm -f {} +",
    ):
        _run(c, command)
46 |
47 |
@task()
def clean_python(c):
    # type: (Context) -> None
    """Clean up python file artifacts."""
    for command in (
        "find . -name '*.pyc' -exec rm -f {} +",
        "find . -name '*.pyo' -exec rm -f {} +",
        "find . -name '*~' -exec rm -f {} +",
        "find . -name '__pycache__' -exec rm -fr {} +",
    ):
        _run(c, command)
56 |
57 |
@task()
def clean_tests(c):
    # type: (Context) -> None
    """Clean up files from testing."""
    for command in (
        f"rm -f {COVERAGE_FILE}",
        f"rm -fr {COVERAGE_DIR}",
        "rm -fr .pytest_cache",
    ):
        _run(c, command)
65 |
66 |
@task()
def clean_docs(c):
    # type: (Context) -> None
    """Clean up files from documentation builds."""
    command = f"rm -fr {DOCS_BUILD_DIR}"
    _run(c, command)
72 |
73 |
@task(pre=[clean_build, clean_python, clean_tests, clean_docs])
def clean(c):
    # type: (Context) -> None
    """Run all clean sub-tasks."""
    # Body intentionally empty: the pre-tasks listed above do all the work.
78 |
79 |
@task()
def install_hooks(c):
    # type: (Context) -> None
    """Install pre-commit hooks."""
    command = "poetry run pre-commit install"
    _run(c, command)
85 |
86 |
@task()
def hooks(c):
    # type: (Context) -> None
    """Run pre-commit hooks."""
    command = "poetry run pre-commit run --all-files"
    _run(c, command)
92 |
93 |
@task(name="format", help={"check": "Checks if source is formatted without applying changes"})
def format_(c, check=False):
    # type: (Context, bool) -> None
    """Format code."""
    isort_flags = " ".join(["--check-only", "--diff"] if check else [])
    _run(c, f"poetry run isort {isort_flags} {PYTHON_TARGETS_STR}")
    black_flags = " ".join(["--diff", "--check"] if check else ["--quiet"])
    _run(c, f"poetry run black {black_flags} {PYTHON_TARGETS_STR}")
102 |
103 |
@task()
def flake8(c):
    # type: (Context) -> None
    """Run flake8."""
    command = f"poetry run flakeheaven lint {PYTHON_TARGETS_STR}"
    _run(c, command)
109 |
110 |
@task()
def security(c):
    # type: (Context) -> None
    """Run security related checks."""
    command = (
        "poetry export --with dev --format=requirements.txt --without-hashes | "
        "poetry run safety check --stdin --full-report"
    )
    _run(c, command)
120 |
121 |
@task(pre=[flake8, security, call(format_, check=True)])
def lint(c):
    # type: (Context) -> None
    """Run all linting."""
    # Body intentionally empty: flake8, security, and the format check run as
    # pre-tasks.
126 |
127 |
@task()
def mypy(c):
    # type: (Context) -> None
    """Run mypy."""
    command = f"poetry run mypy {PYTHON_TARGETS_STR}"
    _run(c, command)
133 |
134 |
@task()
def tests(c):
    # type: (Context) -> None
    """Run tests."""
    options = " ".join(["--xdoctest", "--cov", "--cov-report=", "--cov-fail-under=0"])
    _run(c, f"poetry run pytest {options} {TEST_DIR} {SOURCE_DIR}")
141 |
142 |
@task(
    help={
        "fmt": "Build a local report: report, html, json, annotate, html, xml.",
        "open_browser": "Open the coverage report in the web browser (requires --fmt html)",
    }
)
def coverage(c, fmt="report", open_browser=False):
    # type: (Context, str, bool) -> None
    """Create coverage report."""
    # Combine parallel-mode data files first, if any exist.
    has_partial_data = any(Path().glob(".coverage.*"))
    if has_partial_data:
        _run(c, "poetry run coverage combine")
    _run(c, f"poetry run coverage {fmt} -i")
    should_open = fmt == "html" and open_browser
    if should_open:
        webbrowser.open(COVERAGE_REPORT.as_uri())
157 |
158 |
@task(
    help={
        "serve": "Build the docs watching for changes",
        "open_browser": "Open the docs in the web browser",
    }
)
def docs(c, serve=False, open_browser=False):
    # type: (Context, bool, bool) -> None
    """Build documentation."""
    _run(c, f"sphinx-apidoc -o {DOCS_DIR} {SOURCE_DIR}")
    build_command = f"sphinx-build -b html {DOCS_DIR} {DOCS_BUILD_DIR}"
    _run(c, build_command)
    if open_browser:
        webbrowser.open(DOCS_INDEX.absolute().as_uri())
    if serve:
        # Rebuild whenever a source document changes.
        _run(c, f"poetry run watchmedo shell-command -p '*.rst;*.md' -c '{build_command}' -R -D .")
175 |
176 |
@task(
    help={
        "part": "Part of the version to be bumped.",
        "dry_run": "Don't write any files, just pretend. (default: False)",
    }
)
def version(c, part, dry_run=False):
    # type: (Context, str, bool) -> None
    """Bump version."""
    flags = " ".join(["--dry-run"] if dry_run else [])
    _run(c, f"poetry run bump2version {flags} {part}")
188 |
--------------------------------------------------------------------------------
/tests/Untitled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "id": "ae7fef52",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "ename": "ModuleNotFoundError",
11 | "evalue": "No module named 'webtranspose'",
12 | "output_type": "error",
13 | "traceback": [
14 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
15 | "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
16 | "\u001b[0;32m/var/folders/rh/0zrsw9xd3qnbggwbk10z77380000gn/T/ipykernel_5677/167283409.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mwebtranspose\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcrawl\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
17 | "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'webtranspose'"
18 | ]
19 | }
20 | ],
21 | "source": [
22 | "from webtranspose import crawl"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "id": "cef46608",
29 | "metadata": {},
30 | "outputs": [],
31 | "source": []
32 | }
33 | ],
34 | "metadata": {
35 | "kernelspec": {
36 | "display_name": "webt",
37 | "language": "python",
38 | "name": "webt"
39 | },
40 | "language_info": {
41 | "codemirror_mode": {
42 | "name": "ipython",
43 | "version": 3
44 | },
45 | "file_extension": ".py",
46 | "mimetype": "text/x-python",
47 | "name": "python",
48 | "nbconvert_exporter": "python",
49 | "pygments_lexer": "ipython3",
50 | "version": "3.9.5"
51 | }
52 | },
53 | "nbformat": 4,
54 | "nbformat_minor": 5
55 | }
56 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Unit test package for webtranspose."""
2 |
--------------------------------------------------------------------------------
/tests/test_webtranspose.py:
--------------------------------------------------------------------------------
1 | """Tests for `webtranspose` module."""
2 | from typing import Generator
3 |
4 | import pytest
5 |
6 | import webtranspose
7 |
8 |
@pytest.fixture
def version() -> Generator[str, None, None]:
    """Yield the installed webtranspose package version."""
    current_version = webtranspose.__version__
    yield current_version
13 |
14 |
15 | def test_version(version: str) -> None:
16 | """Sample pytest test function with the pytest fixture as an argument."""
17 | assert version == "0.1.0"
18 |
--------------------------------------------------------------------------------