├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── build.yml │ ├── publish-dev.yml │ ├── publish.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── Makefile ├── README.md ├── Screenshot.png ├── pyproject.toml ├── pyrightconfig.json ├── ruff.toml ├── src └── par_scrape │ ├── __init__.py │ ├── __main__.py │ ├── crawl.py │ ├── enums.py │ ├── extraction_prompt.md │ ├── py.typed │ ├── scrape_data.py │ └── utils.py └── uv.lock /.gitattributes: -------------------------------------------------------------------------------- 1 | # Autodetect text files 2 | * text=auto 3 | 4 | # ...Unless the name matches the following 5 | # overriding patterns 6 | 7 | # Definitively text files 8 | *.txt text 9 | *.json text 10 | *.js text 11 | *.ts text 12 | .env text 13 | .env-* text 14 | *.sh text 15 | *.sql text 16 | *.yml text 17 | *.py text 18 | *.js text 19 | *.ts text 20 | *.ini text 21 | *.jq text 22 | Dockerfile text 23 | Dockerfile.* text 24 | makefile text 25 | makefile.* text 26 | Makefile text 27 | Makefile.* text 28 | 29 | # Ensure those won't be messed up with 30 | *.jpg binary 31 | *.gif binary 32 | *.png binary 33 | 34 | # force line endings to be lf so db container does not blow up 35 | **/*.sh text eol=lf 36 | **/*.sql text eol=lf 37 | **/.env text eol=lf 38 | **/.env-* text eol=lf 39 | **/Dockerfile text eol=lf 40 | **/Dockerfile.* text eol=lf 41 | **/*.py text eol=lf 42 | **/*.js text eol=lf 43 | **/*.ts text eol=lf 44 | **/*.jq text eol=lf 45 | **/*.json text eol=lf 46 | **/*.yml text eol=lf 47 | **/Makefile text eol=lf 48 | **/Makefile.* text eol=lf 49 | **/makefile text eol=lf 50 | **/makefile.* text eol=lf 51 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test 🐍 distribution 📦 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - 'v*.*.*' 9 | 10 | jobs: 11 | build: 12 | name: Build distribution 📦 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Install uv 18 | uses: astral-sh/setup-uv@v4 19 | with: 20 | enable-cache: true 21 | 22 | - name: Set up Python 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: "3.11" 26 | 27 | - name: Install dependencies 28 | run: | 29 | uv sync --all-extras --dev 30 | 31 | - name: Lint and Typecheck 32 | run: | 33 | make checkall 34 | 35 | - name: Package 36 | run: make package 37 | 38 | - name: Cache build artifacts 39 | uses: actions/cache@v4 40 | with: 41 | path: dist 42 | key: ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}-${{ github.sha }} 43 | 44 | - name: Store the distribution packages 45 | uses: actions/upload-artifact@v4 46 | with: 47 | name: python-package-distributions-ubuntu-latest-3.11-x64 48 | path: dist/ 49 | 50 | tag-version: 51 | runs-on: ubuntu-latest 52 | permissions: 53 | contents: write # Grant write access to the repository 54 | needs: 55 | - build # Wait for all build jobs to complete 56 | if: ${{ !startsWith(github.ref, 'refs/tags/v') }} 57 | steps: 58 | - name: Checkout repository 59 | uses: actions/checkout@v4 60 | 61 | - name: Install uv 62 | uses: astral-sh/setup-uv@v4 63 | with: 64 | enable-cache: true 65 | 66 | - name: Set up Python 67 | uses: actions/setup-python@v5 68 | with: 69 | python-version: '3.11' 70 | 71 | - name: Install dependencies 72 | run: | 73 | uv sync --all-extras --dev 74 | 75 | - name: Get version from __init__.py 76 | id: get_version 77 | run: | 78 | version=$(uv run python -c "from src.par_scrape import __version__; print(__version__)") 79 | echo "Raw version output: $version" 80 | echo "VERSION=$version" >> $GITHUB_ENV 81 | 82 | - name: Check version 83 | id: check_version 84 | run: | 85 | echo "Version in env: ${{ env.VERSION }}" 86 | if [ -z "${{ env.VERSION }}" ]; then 87 | echo "Error: VERSION is empty" 88 | exit 1 89 | fi 90 | 91 | - name: Configure Git 92 | run: | 93 | git config --global user.name "${{ github.actor }}" 94 | git config --global user.email "${{ github.event.pusher.email }}" 95 | 96 | - name: App VERSION 97 | run: echo "VERSION is ${{ env.VERSION }}" 98 | 99 | - name: Fetch all tags 100 | run: git fetch --tags 101 | 102 | - name: Check if tag exists 103 | id: check_tag 104 | run: | 105 | TAG_EXISTS=$(git tag --list "v${{ env.VERSION }}") 106 | if [ -z "$TAG_EXISTS" ]; then 107 | echo "TAG_EXISTS=false" >> $GITHUB_ENV 108 | else 109 | echo "TAG_EXISTS=true" >> $GITHUB_ENV 110 | fi 111 | 112 | - name: Delete existing tag locally and remotely 113 | if: env.TAG_EXISTS == 'true' 114 | env: 115 | VERSION: ${{ env.VERSION }} 116 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 117 | run: | 118 | git tag -d "v$VERSION" 119 | git push --delete origin "v$VERSION" 120 | 121 | - name: Create new 
tag 122 | env: 123 | VERSION: ${{ env.VERSION }} 124 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 125 | run: | 126 | if ! git tag -a "v$VERSION" -m "Version $VERSION"; then 127 | echo "Failed to create tag" 128 | exit 1 129 | fi 130 | if ! git push origin "v$VERSION"; then 131 | echo "Failed to push tag" 132 | exit 1 133 | fi 134 | -------------------------------------------------------------------------------- /.github/workflows/publish-dev.yml: -------------------------------------------------------------------------------- 1 | name: Publish 🐍 📦 to TestPyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | publish-to-testpypi: 8 | if: github.event_name == 'workflow_dispatch' # Only allow manual triggers 9 | name: Publish 🐍 distribution 📦 to TestPyPI 10 | runs-on: ubuntu-latest 11 | environment: 12 | name: testpypi 13 | url: https://test.pypi.org/p/par_scrape 14 | permissions: 15 | id-token: write # IMPORTANT: mandatory for trusted publishing 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | 21 | - name: Install uv 22 | uses: astral-sh/setup-uv@v3 23 | with: 24 | enable-cache: true 25 | 26 | - name: Set up Python 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: '3.11' 30 | 31 | - name: Install dependencies 32 | run: | 33 | uv sync --all-extras --dev 34 | 35 | - name: Get version from __init__.py 36 | id: get_version 37 | run: | 38 | version=$(uv run python -c "from src.par_scrape import __version__; print(__version__)") 39 | echo "Raw version output: $version" 40 | echo "VERSION=$version" >> $GITHUB_ENV 41 | 42 | - name: Check version 43 | id: check_version 44 | run: | 45 | echo "Version in env: ${{ env.VERSION }}" 46 | if [ -z "${{ env.VERSION }}" ]; then 47 | echo "Error: VERSION is empty" 48 | exit 1 49 | fi 50 | 51 | - name: Restore cached build artifacts 52 | uses: actions/cache@v4 53 | with: 54 | path: dist 55 | key: ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}-${{ github.sha }} 56 | fail-on-cache-miss: true 57 | 58 | - name: Publish distribution 📦 to TestPyPI 59 | uses: pypa/gh-action-pypi-publish@release/v1 60 | with: 61 | repository-url: https://test.pypi.org/legacy/ 62 | skip-existing: true 63 | 64 | - name: Discord notification 65 | env: 66 | DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} 67 | uses: Ilshidur/action-discord@master 68 | with: 69 | args: 'The project {{ EVENT_PAYLOAD.repository.full_name }} ${{ env.VERSION }} has been published to TestPyPI.' 
70 | continue-on-error: true 71 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 🐍 📦 to PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | types: [ published ] 7 | 8 | jobs: 9 | publish-to-pypi: 10 | if: github.event_name == 'workflow_dispatch' # Only allow manual triggers 11 | name: Publish 🐍 distribution 📦 to PyPI 12 | runs-on: ubuntu-latest 13 | environment: 14 | name: pypi 15 | url: https://pypi.org/p/par_scrape 16 | permissions: 17 | id-token: write # IMPORTANT: mandatory for trusted publishing 18 | 19 | steps: 20 | - name: Checkout repository 21 | uses: actions/checkout@v4 22 | 23 | - name: Install uv 24 | uses: astral-sh/setup-uv@v3 25 | with: 26 | enable-cache: true 27 | 28 | - name: Set up Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: '3.11' 32 | 33 | - name: Install dependencies 34 | run: | 35 | uv sync --all-extras --dev 36 | 37 | - name: Get version from __init__.py 38 | id: get_version 39 | run: | 40 | version=$(uv run python -c "from src.par_scrape import __version__; print(__version__)") 41 | echo "Raw version output: $version" 42 | echo "VERSION=$version" >> $GITHUB_ENV 43 | 44 | - name: Check version 45 | id: check_version 46 | run: | 47 | echo "Version in env: ${{ env.VERSION }}" 48 | if [ -z "${{ env.VERSION }}" ]; then 49 | echo "Error: VERSION is empty" 50 | exit 1 51 | fi 52 | 53 | - name: Restore cached build artifacts 54 | uses: actions/cache@v4 55 | with: 56 | path: dist 57 | key: ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}-${{ github.sha }} 58 | restore-keys: | 59 | ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}- 60 | ${{ runner.os }}-3.11-x64-${{ env.VERSION }}- 61 | ${{ runner.os }}-3.11-x64- 62 | fail-on-cache-miss: true 63 | 64 | - name: Publish distribution 📦 to PyPI 65 | uses: pypa/gh-action-pypi-publish@release/v1 66 | 67 | - name: Discord notification 68 | env: 69 | DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} 70 | uses: Ilshidur/action-discord@master 71 | with: 72 | args: 'The project {{ EVENT_PAYLOAD.repository.full_name }} ${{ env.VERSION }} has been published to PyPI.' 
73 | continue-on-error: true 74 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 🐍 distribution 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | github-release: 8 | name: Create GitHub Release 9 | runs-on: ubuntu-latest 10 | permissions: 11 | contents: write 12 | id-token: write 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v4 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: '3.11' 22 | 23 | - name: Install uv 24 | uses: astral-sh/setup-uv@v4 25 | 26 | - name: Install dependencies 27 | run: | 28 | uv sync 29 | 30 | - name: Get version from __init__.py 31 | id: get_version 32 | run: | 33 | version=$(uv run python -c "from src.par_scrape import __version__; print(__version__)") 34 | echo "Raw version output: $version" 35 | echo "VERSION=$version" >> $GITHUB_ENV 36 | 37 | - name: Check version 38 | id: check_version 39 | run: | 40 | echo "Version in env: ${{ env.VERSION }}" 41 | if [ -z "${{ env.VERSION }}" ]; then 42 | echo "Error: VERSION is empty" 43 | exit 1 44 | fi 45 | 46 | - name: Restore cached build artifacts 47 | uses: actions/cache@v4 48 | with: 49 | path: dist 50 | key: ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}-${{ github.sha }} 51 | restore-keys: | 52 | ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}- 53 | ${{ runner.os }}-3.11-x64- 54 | 55 | - name: Sign the dists with Sigstore 56 | uses: sigstore/gh-action-sigstore-python@v3.0.0 57 | with: 58 | inputs: >- 59 | ./dist/*.tar.gz 60 | ./dist/*.whl 61 | 62 | - name: Create GitHub Release 63 | env: 64 | GITHUB_TOKEN: ${{ github.token }} 65 | run: | 66 | gh release create \ 67 | 'release-v${{ env.VERSION }}' \ 68 | --repo '${{ github.repository }}' \ 69 | --generate-notes \ 70 | --latest 71 | 72 | - name: Upload artifact signatures to GitHub Release 73 | env: 74 | GITHUB_TOKEN: ${{ github.token }} 75 | run: | 76 | gh release upload \ 77 | 'release-v${{ env.VERSION }}' dist/** \ 78 | --repo '${{ github.repository }}' 79 | 80 | - name: Discord notification 81 | env: 82 | DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} 83 | uses: Ilshidur/action-discord@master 84 | with: 85 | args: 'A new release ${{ github.ref_name }} has been created for {{ EVENT_PAYLOAD.repository.full_name }}.' 
86 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### PythonVanilla template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # Installer logs 30 | pip-log.txt 31 | pip-delete-this-directory.txt 32 | 33 | # Unit test / coverage reports 34 | htmlcov/ 35 | .tox/ 36 | .nox/ 37 | .coverage 38 | .coverage.* 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | *.cover 43 | *.py,cover 44 | .hypothesis/ 45 | .pytest_cache/ 46 | cover/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # pyenv 53 | # For a library or package, you might want to ignore these files since the code is 54 | # intended to run in multiple environments; otherwise, check them in: 55 | # .python-version 56 | 57 | # pipenv 58 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 59 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 60 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 61 | # install all needed dependencies. 62 | #Pipfile.lock 63 | 64 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 65 | __pypackages__/ 66 | 67 | .aider* 68 | **/venv 69 | **/.venv 70 | **/.env 71 | **/.idea 72 | /config.json 73 | /output/ 74 | /.DS_Store 75 | /.ruff_cache/ 76 | /src/par_scrape/pages/ 77 | /src/par_scrape/jobs.sqlite 78 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_stages: [pre-commit, pre-push] 2 | default_language_version: 3 | python: python3.11 4 | fail_fast: false 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.6.0 8 | hooks: 9 | - id: check-merge-conflict 10 | - id: detect-private-key 11 | - id: end-of-file-fixer 12 | - id: mixed-line-ending 13 | - id: trailing-whitespace 14 | args: [--markdown-linebreak-ext=md] 15 | - id: check-docstring-first 16 | - id: check-toml 17 | - id: check-yaml 18 | - id: check-json 19 | - id: pretty-format-json 20 | args: [--autofix, --no-sort-keys] 21 | exclude: tests(/\w*)*/functional/|tests/input|tests(/.*)+/conftest.py|doc/data/messages|tests(/\w*)*data/|Pipfile.lock|output/.* 22 | 23 | - repo: local 24 | hooks: 25 | - id: pyright 26 | name: pyright 27 | entry: make 28 | language: system 29 | pass_filenames: false 30 | args: 31 | [typecheck] 32 | exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/|output/.* 33 | 34 | - repo: local 35 | hooks: 36 | - id: format 37 | name: format 38 | entry: make 39 | language: system 40 | pass_filenames: false 41 | args: 42 | [format] 43 | exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/|output/.* 44 | 45 | - repo: local 46 | hooks: 47 | - id: lint 48 | name: lint 49 | entry: make 50 | language: system 51 | pass_filenames: false 52 | args: 53 | [lint] 54 | exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/|output/.* 55 | 
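A minimal sketch of enabling these hooks locally, assuming the dev dependencies (which include pre-commit) have been installed with `uv sync`:

```bash
# Install the git hook scripts defined in .pre-commit-config.yaml
uv run pre-commit install
# Run every hook against the full working tree once
uv run pre-commit run --all-files
```

The Makefile's `pre-commit` target wraps the same `run --all-files` invocation.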
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Paul Robello 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Common make values. 3 | lib := par_scrape 4 | run := uv run 5 | python := $(run) python 6 | ruff := $(run) ruff 7 | pyright := $(run) pyright 8 | build := uv build 9 | 10 | #export UV_LINK_MODE=copy 11 | export PIPENV_VERBOSITY=-1 12 | ############################################################################## 13 | # Run the app. 14 | .PHONY: run 15 | run: # Run the app 16 | $(run) $(lib) "$(ARG1)" "$(ARG2)" "$(ARG3)" "$(ARG4)" "$(ARG5)" "$(ARG6)" "$(ARG7)" "$(ARG8)" "$(ARG9)" 17 | 18 | .PHONY: app_help 19 | app_help: # Show app help 20 | $(run) $(lib) --help 21 | 22 | 23 | ############################################################################## 24 | .PHONY: uv-lock 25 | uv-lock: 26 | uv lock 27 | 28 | .PHONY: uv-sync 29 | uv-sync: 30 | uv sync 31 | 32 | .PHONY: setup 33 | setup: uv-lock uv-sync # use this for first time run 34 | 35 | .PHONY: resetup 36 | resetup: remove-venv setup # Recreate the virtual environment from scratch 37 | 38 | .PHONY: remove-venv 39 | remove-venv: # Remove the virtual environment 40 | rm -rf .venv 41 | 42 | .PHONY: depsupdate 43 | depsupdate: # Update all dependencies 44 | uv sync -U 45 | 46 | .PHONY: depsshow 47 | depsshow: # Show the dependency graph 48 | uv tree 49 | 50 | .PHONY: shell 51 | shell: # Start shell inside of .venv 52 | $(run) bash 53 | ############################################################################## 54 | # Checking/testing/linting/etc. 55 | .PHONY: format 56 | format: # Reformat the code with ruff. 
57 | $(ruff) format src/$(lib) 58 | 59 | .PHONY: lint 60 | lint: # Run ruff over the library 61 | $(ruff) check src/$(lib) --fix 62 | 63 | .PHONY: typecheck 64 | typecheck: # Perform static type checks with pyright 65 | $(pyright) 66 | 67 | .PHONY: typecheck-stats 68 | typecheck-stats: # Perform static type checks with pyright and print stats 69 | $(pyright) --stats 70 | 71 | .PHONY: checkall 72 | checkall: format lint typecheck # Check all the things 73 | 74 | .PHONY: pre-commit # run pre-commit checks on all files 75 | pre-commit: 76 | pre-commit run --all-files 77 | 78 | .PHONY: pre-commit-update # run pre-commit and update hooks 79 | pre-commit-update: 80 | pre-commit autoupdate 81 | 82 | ############################################################################## 83 | # Package/publish. 84 | .PHONY: package 85 | package: clean # Package the library 86 | $(build) 87 | 88 | .PHONY: spackage 89 | spackage: # Create a source package for the library 90 | $(build) --sdist 91 | 92 | .PHONY: test-publish 93 | test-publish: package # Upload to testpypi 94 | $(publish) upload --index testpypi --check-url 95 | 96 | .PHONY: publish 97 | publish: package # Upload to pypi 98 | $(publish) upload --check-url 99 | ############################################################################## 100 | # Utility. 101 | 102 | .PHONY: repl 103 | repl: # Start a Python REPL 104 | $(python) 105 | 106 | .PHONY: clean 107 | clean: # Clean the build directories 108 | rm -rf build dist $(lib).egg-info 109 | 110 | .PHONY: help 111 | help: # Display this help 112 | @grep -Eh "^[a-z]+:.+# " $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.+# "}; {printf "%-20s %s\n", $$1, $$2}' 113 | 114 | ############################################################################## 115 | # Housekeeping tasks. 116 | .PHONY: housekeeping 117 | housekeeping: # Perform some git housekeeping 118 | git fsck 119 | git gc --aggressive 120 | git remote update --prune 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PAR Scrape 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/par_scrape)](https://pypi.org/project/par_scrape/) 4 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/par_scrape.svg)](https://pypi.org/project/par_scrape/) 5 | ![Runs on Linux | MacOS | Windows](https://img.shields.io/badge/runs%20on-Linux%20%7C%20MacOS%20%7C%20Windows-blue) 6 | ![Arch x86-63 | ARM | AppleSilicon](https://img.shields.io/badge/arch-x86--64%20%7C%20ARM%20%7C%20AppleSilicon-blue) 7 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/par_scrape) 8 | 9 | ![PyPI - License](https://img.shields.io/pypi/l/par_scrape) 10 | 11 | PAR Scrape is a versatile web scraping tool with options for Selenium or Playwright, featuring AI-powered data extraction and formatting. 
12 | 13 | [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://buymeacoffee.com/probello3) 14 | 15 | ## Screenshots 16 | ![PAR Scrape Screenshot](https://raw.githubusercontent.com/paulrobello/par_scrape/main/Screenshot.png) 17 | 18 | ## Features 19 | 20 | - Web scraping using Playwright or Selenium 21 | - AI-powered data extraction and formatting 22 | - Can be used to crawl and extract clean markdown without AI 23 | - Supports multiple output formats (JSON, Excel, CSV, Markdown) 24 | - Customizable field extraction 25 | - Token usage and cost estimation 26 | - Prompt cache for Anthropic provider 27 | - Uses my [PAR AI Core](https://github.com/paulrobello/par_ai_core) 28 | 29 | 30 | ## Known Issues 31 | - Selenium silent mode on Windows still shows a message about websockets. There is no simple way to get rid of this. 32 | - Providers other than OpenAI are hit-and-miss depending on provider / model / data being extracted. 33 | 34 | ## Prompt Cache 35 | - OpenAI will auto cache prompts that are over 1024 tokens. 36 | - Anthropic will only cache prompts if you specify the --prompt-cache flag. Because cache writes cost more, only enable this if you intend to run multiple scrape jobs against the same URL. The cache also goes stale within a couple of minutes, so to reduce cost run your jobs as close together as possible. 37 | 38 | ## How it works 39 | - Data is fetched from the site using either Selenium or Playwright 40 | - HTML is converted to clean markdown 41 | - If you specify an output format other than markdown, the following steps kick in: 42 | - A pydantic model is constructed from the fields you specify 43 | - The markdown is sent to the AI provider with the pydantic model as the required output 44 | - The structured output is saved in the specified formats 45 | - If crawling mode is enabled, this process is repeated for each page in the queue until the specified max number of pages is reached 46 | 47 | ## Site Crawling 48 | 49 | Crawling currently comes in 3 modes: 50 | - Single page, which is the default 51 | - Single level, which will crawl all links on the first page and add them to the queue. Links from any pages after the first are not added to the queue 52 | - Domain, which will crawl all links on all pages as long as they belong to the same top level domain (TLD). 53 | - Paginated will be added soon 54 | 55 | Crawling progress is stored in a SQLite database and all pages are tagged with the run name, which can be specified with the --run-name / -n flag. 56 | You can resume a crawl by specifying the same run name again. 57 | The option `--scrape-max-parallel` / `-P` can be used to increase the scraping speed by running multiple scrapes in parallel. 58 | The option `--crawl-batch-size` / `-b` should be set at least as high as the scrape max parallel option to ensure that the queue is always full. 59 | The option `--crawl-max-pages` / `-M` can be used to limit the total number of pages crawled in a single run. 60 | 61 | ## Prerequisites 62 | 63 | To install PAR Scrape, make sure you have Python 3.11 installed. 64 | 65 | ### [uv](https://pypi.org/project/uv/) is recommended 66 | 67 | #### Linux and Mac 68 | ```bash 69 | curl -LsSf https://astral.sh/uv/install.sh | sh 70 | ``` 71 | 72 | #### Windows 73 | ```bash 74 | powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" 75 | ``` 76 | 77 | ## Installation 78 | 79 | 80 | ### Installation From Source 81 | 82 | To install from source, follow these steps: 83 | 84 | 1.
Clone the repository: 85 | ```bash 86 | git clone https://github.com/paulrobello/par_scrape.git 87 | cd par_scrape 88 | ``` 89 | 90 | 2. Install the package dependencies using uv: 91 | ```bash 92 | uv sync 93 | ``` 94 | ### Installation From PyPI 95 | 96 | To install PAR Scrape from PyPI, run either of the following commands: 97 | 98 | ```bash 99 | uv tool install par_scrape 100 | ``` 101 | 102 | ```bash 103 | pipx install par_scrape 104 | ``` 105 | ### Playwright Installation 106 | To use Playwright as the scraper, you must install it and its browsers using the following commands: 107 | 108 | ```bash 109 | uv tool install playwright 110 | playwright install chromium 111 | ``` 112 | 113 | ## Usage 114 | 115 | To use PAR Scrape, run it from the command line with various options. Basic examples are shown below. 116 | Ensure you have the API key for your AI provider in your environment; an example of exporting a key is shown after the key list below. 117 | You can also store your API keys in the file `~/.par_scrape.env` as follows: 118 | ```shell 119 | # AI API KEYS 120 | OPENAI_API_KEY= 121 | ANTHROPIC_API_KEY= 122 | GROQ_API_KEY= 123 | XAI_API_KEY= 124 | GOOGLE_API_KEY= 125 | MISTRAL_API_KEY= 126 | GITHUB_TOKEN= 127 | OPENROUTER_API_KEY= 128 | DEEPSEEK_API_KEY= 129 | # Used by Bedrock 130 | AWS_PROFILE= 131 | AWS_ACCESS_KEY_ID= 132 | AWS_SECRET_ACCESS_KEY= 133 | 134 | 135 | 136 | ### Tracing (optional) 137 | LANGCHAIN_TRACING_V2=false 138 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com 139 | LANGCHAIN_API_KEY= 140 | LANGCHAIN_PROJECT=par_scrape 141 | ``` 142 | 143 | ### AI API KEYS 144 | 145 | * ANTHROPIC_API_KEY is required for Anthropic. Get a key from https://console.anthropic.com/ 146 | * OPENAI_API_KEY is required for OpenAI. Get a key from https://platform.openai.com/account/api-keys 147 | * GITHUB_TOKEN is required for GitHub Models. Get a free key from https://github.com/marketplace/models 148 | * GOOGLE_API_KEY is required for Google Models. Get a free key from https://console.cloud.google.com 149 | * XAI_API_KEY is required for XAI. Get a free key from https://x.ai/api 150 | * GROQ_API_KEY is required for Groq. Get a free key from https://console.groq.com/ 151 | * MISTRAL_API_KEY is required for Mistral. Get a free key from https://console.mistral.ai/ 152 | * OPENROUTER_API_KEY is required for OpenRouter. Get a key from https://openrouter.ai/ 153 | * DEEPSEEK_API_KEY is required for Deepseek. Get a key from https://platform.deepseek.com/ 154 | * AWS_PROFILE or AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are used for Bedrock authentication. The environment must 155 | already be authenticated with AWS. 156 | * No key is required for Ollama, LlamaCpp, or LiteLLM.
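If you prefer not to create `~/.par_scrape.env`, a key can simply be exported in the shell before running. A minimal sketch using OpenAI (the key value is a placeholder):

```bash
# Provide the key for this shell session only (placeholder value)
export OPENAI_API_KEY="your-key-here"
# Any non-markdown output format (json here) triggers the AI extraction step
par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" -O json
```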
157 | 158 | 159 | ### Open AI Compatible Providers 160 | 161 | If a specific provider is not listed but has an OpenAI-compatible endpoint, you can use the following combination of variables: 162 | * PARAI_AI_PROVIDER=OpenAI 163 | * PARAI_MODEL=Your selected model 164 | * PARAI_AI_BASE_URL=The provider's OpenAI-compatible endpoint URL 165 | 166 | ### Running from source 167 | ```bash 168 | uv run par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" -f "Cache Price" --model gpt-4o-mini --display-output md 169 | ``` 170 | 171 | ### Running if installed from PyPI 172 | ```bash 173 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" -f "Cache Price" --model gpt-4o-mini --display-output md 174 | ``` 175 | 176 | ### Options 177 | ``` 178 | --url -u TEXT URL to scrape [default: https://openai.com/api/pricing/] 179 | --output-format -O [md|json|csv|excel] Output format for the scraped data [default: md] 180 | --fields -f TEXT Fields to extract from the webpage 181 | [default: Model, Pricing Input, Pricing Output, Cache Price] 182 | --scraper -s [selenium|playwright] Scraper to use: 'selenium' or 'playwright' [default: playwright] 183 | --retries -r INTEGER Retry attempts for failed scrapes [default: 3] 184 | --scrape-max-parallel -P INTEGER Max parallel fetch requests [default: 1] 185 | --wait-type -w [none|pause|sleep|idle|selector|text] Method to use for page content load waiting [default: sleep] 186 | --wait-selector -i TEXT Selector or text to use for page content load waiting. [default: None] 187 | --headless -h Run in headless mode (for Selenium) 188 | --sleep-time -t INTEGER Time to sleep before scrolling (in seconds) [default: 2] 189 | --ai-provider -a [Ollama|LlamaCpp|OpenRouter|OpenAI|Gemini|Github|XAI|Anthropic| 190 | Groq|Mistral|Deepseek|LiteLLM|Bedrock] AI provider to use for processing [default: OpenAI] 191 | --model -m TEXT AI model to use for processing. If not specified, a default model will be used. [default: None] 192 | --ai-base-url -b TEXT Override the base URL for the AI provider. [default: None] 193 | --prompt-cache Enable prompt cache for Anthropic provider 194 | --reasoning-effort [low|medium|high] Reasoning effort level to use for o1 and o3 models. [default: None] 195 | --reasoning-budget INTEGER Maximum context size for reasoning. [default: None] 196 | --display-output -d [none|plain|md|csv|json] Display output in terminal (md, csv, or json) [default: None] 197 | --output-folder -o PATH Specify the location of the output folder [default: output] 198 | --silent -q Run in silent mode, suppressing output 199 | --run-name -n TEXT Specify a name for this run.
Can be used to resume a crawl. Defaults to YYYYmmdd_HHMMSS 200 | --pricing -p [none|price|details] Enable pricing summary display [default: details] 201 | --cleanup -c [none|before|after|both] How to handle cleanup of output folder [default: none] 202 | --extraction-prompt -e PATH Path to the extraction prompt file [default: None] 203 | --crawl-type -C [single_page|single_level|domain] Enable crawling mode [default: single_page] 204 | --crawl-max-pages -M INTEGER Maximum number of pages to crawl this session [default: 100] 205 | --crawl-batch-size -B INTEGER Maximum number of pages to load from the queue at once [default: 1] 206 | --respect-rate-limits Whether to use domain-specific rate limiting [default: True] 207 | --respect-robots Whether to respect robots.txt 208 | --crawl-delay INTEGER Default delay in seconds between requests to the same domain [default: 1] 209 | --version -v 210 | --help Show this message and exit. 211 | ``` 212 | 213 | ### Examples 214 | 215 | * Basic usage with default options: 216 | ```bash 217 | par_scrape --url "https://openai.com/api/pricing/" -f "Model" -f "Pricing Input" -f "Pricing Output" -O json -O csv --pricing details --display-output csv 218 | ``` 219 | * Using Playwright, displaying JSON output and waiting for the text gpt-4o to be in the page before continuing: 220 | ```bash 221 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" --scraper playwright -O json -O csv -d json --pricing details -w text -i gpt-4o 222 | ``` 223 | * Specifying a custom model and output folder: 224 | ```bash 225 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" --model gpt-4 --output-folder ./custom_output -O json -O csv --pricing details -w text -i gpt-4o 226 | ``` 227 | * Running in silent mode with a custom run name: 228 | ```bash 229 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" --silent --run-name my_custom_run --pricing details -O json -O csv -w text -i gpt-4o 230 | ``` 231 | * Using the cleanup option to remove the output folder after scraping: 232 | ```bash 233 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" --cleanup after --pricing details -O json -O csv 234 | ``` 235 | * Using the pause wait type to wait for user input before scrolling: 236 | ```bash 237 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" -w pause --pricing details -O json -O csv 238 | ``` 239 | * Using the Anthropic provider with prompt cache enabled and a detailed pricing breakdown: 240 | ```bash 241 | par_scrape -a Anthropic --prompt-cache -d csv -p details -f "Title" -f "Description" -f "Price" -f "Cache Price" -O json -O csv 242 | ``` 243 | 244 | * Crawling single level and only outputting markdown (no LLM or cost): 245 | ```bash 246 | par_scrape --url "https://openai.com/api/pricing/" -O md --crawl-batch-size 5 --scrape-max-parallel 5 --crawl-type single_level 247 | ``` 248 | 249 | 250 | ## Roadmap 251 | - API Server 252 | - More crawling options 253 | - Paginated Listing crawling 254 | 255 | 256 | ## What's New 257 | - Version 0.7.0 258 | - Major overhaul and fixing of crawling features.
259 | - Added --respect-robots flag to check robots.txt before scraping 260 | - Added --respect-rate-limits to respect rate limits for domains 261 | - Added --reasoning-effort and --reasoning-budget for o1/o3 and Sonnet 3.7 262 | - Updated dependencies 263 | - Version 0.6.1 264 | - Updated ai-core 265 | - Version 0.6.0 266 | - Fixed bug where images were being stripped from markdown output 267 | - Now uses par_ai_core for URL fetching and markdown conversion 268 | - New Features: 269 | - BREAKING CHANGES: 270 | - New option to specify desired output formats `-O` which defaults to markdown only, which does not require AI 271 | - BEHAVIOR CHANGES: 272 | - Now retries 3 times on failed scrapes 273 | - Basic site crawling 274 | - Retry failed fetches 275 | - HTTP authentication 276 | - Proxy settings 277 | - Updated system prompt for better results 278 | - Version 0.5.1 279 | - Update ai-core and dependencies 280 | - Now supports Deepseek, XAI and LiteLLM 281 | - Better pricing data 282 | - Version 0.5.0 283 | - Update ai-core and dependencies 284 | - Now supports OpenRouter 285 | - Version 0.4.9 286 | - Updated to use new par-ai-core 287 | - Now supports LlamaCPP and XAI Grok 288 | - Better cost tracking 289 | - Updated pricing data 290 | - Better error handling 291 | - Now supports Python 3.10 292 | - Version 0.4.8: 293 | - Added Anthropic prompt cache option. 294 | - Version 0.4.7: 295 | - BREAKING CHANGE: --pricing CLI option now takes a string value of 'details', 'cost', or 'none'. 296 | - Added pool of user agents that gets randomly pulled from. 297 | - Updated pricing data. 298 | - Pricing token capture and compute now much more accurate. 299 | - Version 0.4.6: 300 | - Minor bug fixes. 301 | - Updated pricing data. 302 | - Added support for Amazon Bedrock 303 | - Removed some unnecessary dependencies. 304 | - Code cleanup. 305 | - Version 0.4.5: 306 | - Added new option --wait-type that allows you to specify the type of wait to use such as pause, sleep, idle, text or selector. 307 | - Removed --pause option as it is no longer needed with the --wait-type option. 308 | - Playwright scraping now honors the headless mode. 309 | - Playwright is now the default scraper as it is much faster. 310 | - Version 0.4.4: 311 | - Better Playwright scraping. 312 | - Version 0.4.3: 313 | - Added option to override the base URL for the AI provider. 314 | - Version 0.4.2: 315 | - The url parameter can now point to a local rawData_*.md file for easier testing of different models without having to re-fetch the data. 316 | - Added ability to specify a file with the extraction prompt. 317 | - Tweaked extraction prompt to work with Groq and Anthropic. Google still does not work. 318 | - Removed need for ~/.par-scrape-config.json 319 | - Version 0.4.1: 320 | - Minor bug fixes for pricing summary. 321 | - Default model for Google changed to "gemini-1.5-pro-exp-0827" which is free and usually works well. 322 | - Version 0.4.0: 323 | - Added support for Anthropic, Google, Groq, and Ollama. (Not well tested with any providers other than OpenAI) 324 | - Added flag for displaying pricing summary. Defaults to False. 325 | - Added pricing data for Anthropic. 326 | - Better error handling for LLM calls. 327 | - Updated cleanup flag to handle both before and after cleanup. Removed --remove-output-folder flag. 328 | - Version 0.3.1: 329 | - Added pause and sleep-time options to control the browser and scraping delays. 330 | - Default headless mode to False so you can interact with the browser.
331 | - Version 0.3.0: 332 | - Fixed location of config.json file. 333 | 334 | ## Contributing 335 | 336 | Contributions are welcome! Please feel free to submit a Pull Request. 337 | 338 | ## License 339 | 340 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 341 | 342 | ## Author 343 | 344 | Paul Robello - probello@gmail.com 345 | -------------------------------------------------------------------------------- /Screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paulrobello/par_scrape/56294b0f0e86434033fd0d1a1ae54900ae0f4585/Screenshot.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "par_scrape" 3 | dynamic = [ 4 | "version", 5 | ] 6 | description = "A versatile web scraping tool with options for Selenium or Playwright, featuring OpenAI-powered data extraction and formatting." 7 | url = "https://github.com/paulrobello/par_scrape" 8 | readme = "README.md" 9 | requires-python = ">=3.10" 10 | authors = [ 11 | { name = "Paul Robello", email = "probello@gmail.com" }, 12 | ] 13 | maintainers = [ 14 | { name = "Paul Robello", email = "probello@gmail.com" }, 15 | ] 16 | classifiers = [ 17 | "License :: OSI Approved :: MIT License", 18 | "Environment :: Console", 19 | "Development Status :: 4 - Beta", 20 | "Intended Audience :: Developers", 21 | "Intended Audience :: End Users/Desktop", 22 | "Intended Audience :: Other Audience", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.10", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | "Operating System :: MacOS", 28 | "Operating System :: Microsoft :: Windows :: Windows 10", 29 | "Operating System :: Microsoft :: Windows :: Windows 11", 30 | "Operating System :: POSIX :: Linux", 31 | "Topic :: Internet :: WWW/HTTP :: Browsers", 32 | "Topic :: Software Development :: Libraries :: Python Modules", 33 | "Topic :: Text Processing :: Markup :: HTML", 34 | "Typing :: Typed", 35 | ] 36 | keywords = [ 37 | "web scraping", 38 | "data extraction", 39 | "selenium", 40 | "playwright", 41 | "openai", 42 | "anthropic", 43 | "xai", 44 | "openrouter", 45 | "groq", 46 | "ollama", 47 | "llamacpp", 48 | ] 49 | dependencies = [ 50 | "beautifulsoup4>=4.13.3", 51 | "pandas>=2.2.3", 52 | "pydantic>=2.10.6", 53 | "python-dotenv>=1.0.1", 54 | "rich>=13.9.4", 55 | "typer>=0.15.2", 56 | "openpyxl>=3.1.5", 57 | "tabulate>=0.9.0", 58 | "par-ai-core>=0.1.24", 59 | "fastapi>=0.115.11", 60 | "tldextract>=5.1.3", 61 | "strenum>=0.4.15", 62 | ] 63 | packages = [ 64 | "src/par_scrape", 65 | ] 66 | 67 | [project.license] 68 | file = "LICENSE" 69 | 70 | [project.urls] 71 | Homepage = "https://github.com/paulrobello/par_scrape" 72 | Documentation = "https://github.com/paulrobello/par_scrape/blob/main/README.md" 73 | Repository = "https://github.com/paulrobello/par_scrape" 74 | Issues = "https://github.com/paulrobello/par_scrape/issues" 75 | Discussions = "https://github.com/paulrobello/par_scrape/discussions" 76 | Wiki = "https://github.com/paulrobello/par_scrape/wiki" 77 | 78 | [project.scripts] 79 | par_scrape = "par_scrape.__main__:app" 80 | 81 | [build-system] 82 | requires = [ 83 | "hatchling", 84 | ] 85 | build-backend = "hatchling.build" 86 | 87 | [dependency-groups] 88 | dev = [ 89 | "build>=1.2.1", 90 | 
"pyright>=1.1.379", 91 | "ruff>=0.9.6", 92 | "pre-commit>=4.1.0", 93 | ] 94 | 95 | [tool.hatch.version] 96 | path = "src/par_scrape/__init__.py" 97 | 98 | [tool.hatch.metadata] 99 | allow-direct-references = true 100 | 101 | [tool.hatch.build.targets.wheel] 102 | packages = [ 103 | "src/par_scrape", 104 | ] 105 | include = [ 106 | "py.typed", 107 | "**/*.py", 108 | "**/*.html", 109 | "**/*.gif", 110 | "**/*.jpg", 111 | "**/*.png", 112 | "**/*.md", 113 | ] 114 | 115 | [tool.hatch.build.targets.sdist] 116 | include = [ 117 | "src/par_scrape", 118 | "LICENSE", 119 | "README.md", 120 | "extraction_prompt.md", 121 | "pyproject.toml", 122 | ] 123 | exclude = [ 124 | "*.pyc", 125 | "__pycache__", 126 | "*.so", 127 | "*.dylib", 128 | ] 129 | -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "include": [ 3 | "src/**/*.py" 4 | ], 5 | "exclude": [ 6 | "**/node_modules", 7 | "**/__pycache__", 8 | "**/output" 9 | ], 10 | "ignore": [ 11 | "**/.venv" 12 | ], 13 | "defineConstant": { 14 | "DEBUG": true 15 | }, 16 | "venvPath": ".", 17 | "venv": ".venv", 18 | "reportMissingImports": true, 19 | "reportMissingTypeStubs": false, 20 | "pythonVersion": "3.10", 21 | "typeCheckingMode": "basic" 22 | } 23 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | # Exclude a variety of commonly ignored directories. 2 | exclude = [ 3 | ".bzr", 4 | ".direnv", 5 | ".eggs", 6 | ".git", 7 | ".git-rewrite", 8 | ".hg", 9 | ".ipynb_checkpoints", 10 | ".mypy_cache", 11 | ".nox", 12 | ".pants.d", 13 | ".pyenv", 14 | ".pytest_cache", 15 | ".pytype", 16 | ".ruff_cache", 17 | ".svn", 18 | ".tox", 19 | ".venv", 20 | ".vscode", 21 | "__pypackages__", 22 | "_build", 23 | "buck-out", 24 | "build", 25 | "dist", 26 | "node_modules", 27 | "site-packages", 28 | "venv", 29 | ] 30 | 31 | # Same as Black. 32 | line-length = 120 33 | indent-width = 4 34 | 35 | # Assume Python 3.10 36 | target-version = "py310" 37 | 38 | [lint] 39 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 40 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 41 | # McCabe complexity (`C901`) by default. 42 | select = ["E4", "E5", "E7", "E9", "F", "W", "UP", "I"] 43 | ignore = ["E501"] 44 | 45 | # Allow fix for all enabled rules (when `--fix`) is provided. 46 | fixable = ["ALL"] 47 | unfixable = [] 48 | 49 | # Allow unused variables when underscore-prefixed. 50 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 51 | 52 | pydocstyle.convention = "google" 53 | 54 | 55 | [format] 56 | # Like Black, use double quotes for strings. 57 | quote-style = "double" 58 | 59 | # Like Black, indent with spaces, rather than tabs. 60 | indent-style = "space" 61 | 62 | # Like Black, respect magic trailing commas. 63 | skip-magic-trailing-comma = false 64 | 65 | # Like Black, automatically detect the appropriate line ending. 66 | line-ending = "auto" 67 | 68 | # Enable auto-formatting of code examples in docstrings. Markdown, 69 | # reStructuredText code/literal blocks and doctests are all supported. 70 | # 71 | # This is currently disabled by default, but it is planned for this 72 | # to be opt-out in the future. 73 | docstring-code-format = true 74 | 75 | # Set the line length limit used when formatting code snippets in 76 | # docstrings. 
77 | # 78 | # This only has an effect when the `docstring-code-format` setting is 79 | # enabled. 80 | docstring-code-line-length = "dynamic" 81 | 82 | [lint.isort] 83 | combine-as-imports = true 84 | -------------------------------------------------------------------------------- /src/par_scrape/__init__.py: -------------------------------------------------------------------------------- 1 | """PAR Scrape - A versatile web scraping tool.""" 2 | 3 | from __future__ import annotations 4 | 5 | import os 6 | 7 | __author__ = "Paul Robello" 8 | __copyright__ = "Copyright 2024, Paul Robello" 9 | __credits__ = ["Paul Robello"] 10 | __maintainer__ = "Paul Robello" 11 | __email__ = "probello@gmail.com" 12 | __version__ = "0.7.0" 13 | __licence__ = "MIT" 14 | __application_title__ = "PAR Scrape" 15 | __application_binary__ = "par_scrape" 16 | 17 | os.environ["USER_AGENT"] = f"{__application_title__} {__version__}" 18 | 19 | __all__: list[str] = [ 20 | "__author__", 21 | "__copyright__", 22 | "__credits__", 23 | "__maintainer__", 24 | "__email__", 25 | "__version__", 26 | "__licence__", 27 | "__application_title__", 28 | "__application_binary__", 29 | ] 30 | -------------------------------------------------------------------------------- /src/par_scrape/__main__.py: -------------------------------------------------------------------------------- 1 | """Main entry point for par_scrape.""" 2 | 3 | import os 4 | import shutil 5 | import sqlite3 6 | import time 7 | from contextlib import nullcontext 8 | from datetime import datetime 9 | from pathlib import Path 10 | from typing import Annotated 11 | from urllib.parse import urlparse 12 | from uuid import uuid4 13 | 14 | import typer 15 | from dotenv import load_dotenv 16 | from par_ai_core.llm_config import LlmConfig, ReasoningEffort 17 | from par_ai_core.llm_providers import ( 18 | LlmProvider, 19 | provider_default_models, 20 | provider_env_key_names, 21 | ) 22 | from par_ai_core.output_utils import DisplayOutputFormat, display_formatted_output 23 | from par_ai_core.par_logging import console_out 24 | from par_ai_core.pricing_lookup import PricingDisplay, show_llm_cost 25 | from par_ai_core.provider_cb_info import get_parai_callback 26 | from par_ai_core.web_tools import ScraperChoice, ScraperWaitType, fetch_url, html_to_markdown 27 | from rich.panel import Panel 28 | from rich.text import Text 29 | 30 | from par_scrape import __application_title__, __version__ 31 | from par_scrape.crawl import ( 32 | DB_PATH, 33 | CrawlType, 34 | ErrorType, 35 | PageStatus, 36 | add_to_queue, 37 | clean_url_of_ticket_id, 38 | extract_links, 39 | get_next_urls, 40 | get_queue_stats, 41 | get_url_output_folder, 42 | init_db, 43 | mark_complete, 44 | mark_error, 45 | set_crawl_delay, 46 | ) 47 | from par_scrape.enums import CleanupType, OutputFormat 48 | from par_scrape.scrape_data import ( 49 | create_container_model, 50 | create_dynamic_model, 51 | format_data, 52 | save_formatted_data, 53 | save_raw_data, 54 | ) 55 | 56 | old_env_path = Path("~/.par-scrape.env").expanduser() 57 | new_env_path = Path("~/.par_scrape.env").expanduser() 58 | 59 | if old_env_path.exists(): 60 | if new_env_path.exists(): 61 | old_env_path.unlink() 62 | else: 63 | console_out.print(f"[bold yellow]Renaming {old_env_path} to {new_env_path}") 64 | old_env_path.rename(new_env_path) 65 | 66 | # Load the .env file from the project folder 67 | load_dotenv(dotenv_path=".env") 68 | # Load the new .env file from the users home folder 69 | load_dotenv(dotenv_path=new_env_path) 70 | 71 | # Initialize 
Typer app 72 | app = typer.Typer(help="Web scraping tool with options for Selenium or Playwright") 73 | 74 | 75 | def version_callback(value: bool) -> None: 76 | """Print version and exit.""" 77 | if value: 78 | print(f"{__application_title__}: {__version__}") 79 | raise typer.Exit() 80 | 81 | 82 | @app.command() 83 | def main( 84 | url: Annotated[str, typer.Option("--url", "-u", help="URL to scrape")] = "https://openai.com/api/pricing/", 85 | output_format: Annotated[ 86 | list[OutputFormat], 87 | typer.Option("--output-format", "-O", help="Output format for the scraped data"), 88 | ] = [OutputFormat.MARKDOWN], 89 | fields: Annotated[ 90 | list[str], 91 | typer.Option("--fields", "-f", help="Fields to extract from the webpage"), 92 | ] = ["Model", "Pricing Input", "Pricing Output", "Cache Price"], 93 | scraper: Annotated[ 94 | ScraperChoice, 95 | typer.Option( 96 | "--scraper", 97 | "-s", 98 | help="Scraper to use: 'selenium' or 'playwright'", 99 | case_sensitive=False, 100 | ), 101 | ] = ScraperChoice.PLAYWRIGHT, 102 | scrape_retries: Annotated[ 103 | int, 104 | typer.Option("--retries", "-r", help="Retry attempts for failed scrapes"), 105 | ] = 3, 106 | scrape_max_parallel: Annotated[ 107 | int, 108 | typer.Option("--scrape-max-parallel", "-P", help="Max parallel fetch requests"), 109 | ] = 1, 110 | wait_type: Annotated[ 111 | ScraperWaitType, 112 | typer.Option( 113 | "--wait-type", 114 | "-w", 115 | help="Method to use for page content load waiting", 116 | case_sensitive=False, 117 | ), 118 | ] = ScraperWaitType.SLEEP, 119 | wait_selector: Annotated[ 120 | str | None, 121 | typer.Option( 122 | "--wait-selector", 123 | "-i", 124 | help="Selector or text to use for page content load waiting.", 125 | ), 126 | ] = None, 127 | headless: Annotated[ 128 | bool, 129 | typer.Option("--headless", "-h", help="Run in headless mode (for Selenium)"), 130 | ] = False, 131 | sleep_time: Annotated[ 132 | int, 133 | typer.Option("--sleep-time", "-t", help="Time to sleep before scrolling (in seconds)"), 134 | ] = 2, 135 | ai_provider: Annotated[ 136 | LlmProvider, 137 | typer.Option("--ai-provider", "-a", help="AI provider to use for processing"), 138 | ] = LlmProvider.OPENAI, 139 | model: Annotated[ 140 | str | None, 141 | typer.Option( 142 | "--model", 143 | "-m", 144 | help="AI model to use for processing. 
If not specified, a default model will be used.", 145 | ), 146 | ] = None, 147 | ai_base_url: Annotated[ 148 | str | None, 149 | typer.Option( 150 | "--ai-base-url", 151 | "-b", 152 | help="Override the base URL for the AI provider.", 153 | ), 154 | ] = None, 155 | prompt_cache: Annotated[ 156 | bool, 157 | typer.Option("--prompt-cache", help="Enable prompt cache for Anthropic provider"), 158 | ] = False, 159 | reasoning_effort: Annotated[ 160 | ReasoningEffort | None, 161 | typer.Option( 162 | "--reasoning-effort", 163 | help="Reasoning effort level to use for o1 and o3 models.", 164 | ), 165 | ] = None, 166 | reasoning_budget: Annotated[ 167 | int | None, 168 | typer.Option( 169 | "--reasoning-budget", 170 | help="Maximum context size for reasoning.", 171 | ), 172 | ] = None, 173 | display_output: Annotated[ 174 | DisplayOutputFormat | None, 175 | typer.Option( 176 | "--display-output", 177 | "-d", 178 | help="Display output in terminal (md, csv, or json)", 179 | ), 180 | ] = None, 181 | output_folder: Annotated[ 182 | Path, 183 | typer.Option("--output-folder", "-o", help="Specify the location of the output folder"), 184 | ] = Path("./output"), 185 | silent: Annotated[ 186 | bool, 187 | typer.Option("--silent", "-q", help="Run in silent mode, suppressing output"), 188 | ] = False, 189 | run_name: Annotated[ 190 | str, 191 | typer.Option( 192 | "--run-name", 193 | "-n", 194 | help="Specify a name for this run. Can be used to resume a crawl Defaults to YYYYmmdd_HHMMSS", 195 | ), 196 | ] = "", 197 | pricing: Annotated[ 198 | PricingDisplay, 199 | typer.Option("--pricing", "-p", help="Enable pricing summary display"), 200 | ] = PricingDisplay.DETAILS, 201 | cleanup: Annotated[ 202 | CleanupType, 203 | typer.Option("--cleanup", "-c", help="How to handle cleanup of output folder"), 204 | ] = CleanupType.NONE, 205 | extraction_prompt: Annotated[ 206 | Path | None, 207 | typer.Option("--extraction-prompt", "-e", help="Path to the extraction prompt file"), 208 | ] = None, 209 | crawl_type: Annotated[ 210 | CrawlType, 211 | typer.Option( 212 | "--crawl-type", 213 | "-C", 214 | help="Enable crawling mode", 215 | case_sensitive=False, 216 | ), 217 | ] = CrawlType.SINGLE_PAGE, 218 | crawl_max_pages: Annotated[ 219 | int, 220 | typer.Option("--crawl-max-pages", "-M", help="Maximum number of pages to crawl this session"), 221 | ] = 100, 222 | crawl_batch_size: Annotated[ 223 | int, 224 | typer.Option("--crawl-batch-size", "-B", help="Maximum number of pages to load from the queue at once"), 225 | ] = 1, 226 | respect_rate_limits: Annotated[ 227 | bool, 228 | typer.Option("--respect-rate-limits", help="Whether to use domain-specific rate limiting"), 229 | ] = True, 230 | respect_robots: Annotated[ 231 | bool, 232 | typer.Option("--respect-robots", help="Whether to respect robots.txt"), 233 | ] = False, 234 | crawl_delay: Annotated[ 235 | int, 236 | typer.Option("--crawl-delay", help="Default delay in seconds between requests to the same domain"), 237 | ] = 1, 238 | version: Annotated[ # pylint: disable=unused-argument 239 | bool | None, 240 | typer.Option("--version", "-v", callback=version_callback, is_eager=True), 241 | ] = None, 242 | ): 243 | """ 244 | Scrape and optionally crawl / extract data from a website. 245 | 246 | AI is only used if an output format other than md is specified. 247 | 248 | Crawl types: 249 | 250 | - single_page: Only scrape the specified URL. 251 | 252 | - single_level: Scrape the specified URL and all links on that page that are have the same top level domain. 
253 | 254 | - domain: Scrape the specified URL and all links and their pages on that page that are have the same domain. 255 | """ 256 | 257 | if display_output and display_output not in output_format: 258 | console_out.print( 259 | f"[bold red]Display output format '{display_output}' is not in the specified output formats.[/bold red]" 260 | ) 261 | raise typer.Exit(1) 262 | 263 | outputs_needing_llm = [OutputFormat.JSON, OutputFormat.CSV, OutputFormat.EXCEL] 264 | llm_needed = any(format in output_format for format in outputs_needing_llm) 265 | if llm_needed: 266 | if not model: 267 | model = provider_default_models[ai_provider] 268 | 269 | if ai_provider not in [LlmProvider.OLLAMA, LlmProvider.BEDROCK, LlmProvider.LITELLM]: 270 | key_name = provider_env_key_names[ai_provider] 271 | if not os.environ.get(key_name): 272 | console_out.print(f"[bold red]{key_name} environment variable not set. Exiting...[/bold red]") 273 | raise typer.Exit(1) 274 | 275 | if prompt_cache and ai_provider != LlmProvider.ANTHROPIC: 276 | console_out.print( 277 | "[bold red]Prompt cache flag is only available for Anthropic provider. Exiting...[/bold red]" 278 | ) 279 | raise typer.Exit(1) 280 | 281 | console_out.print("[bold cyan]Creating llm config and dynamic models...") 282 | llm_config = LlmConfig( 283 | provider=ai_provider, 284 | model_name=model, 285 | temperature=0, 286 | base_url=ai_base_url, 287 | reasoning_effort=reasoning_effort, 288 | reasoning_budget=reasoning_budget, 289 | ) 290 | dynamic_extraction_model = create_dynamic_model(fields) 291 | dynamic_model_container = create_container_model(dynamic_extraction_model) 292 | 293 | console_out.print( 294 | Panel.fit( 295 | Text.assemble( 296 | ("AI Provider: ", "cyan"), 297 | (f"{ai_provider.value}", "green"), 298 | "\n", 299 | ("Model: ", "cyan"), 300 | (f"{model}", "green"), 301 | "\n", 302 | ("AI Provider Base URL: ", "cyan"), 303 | (f"{ai_base_url or 'default'}", "green"), 304 | "\n", 305 | ("Prompt Cache: ", "cyan"), 306 | (f"{prompt_cache}", "green"), 307 | "\n", 308 | ("Fields to extract: ", "cyan"), 309 | (", ".join(fields), "green"), 310 | "\n", 311 | ("Pricing Display: ", "cyan"), 312 | (f"{pricing.value}", "green"), 313 | ), 314 | title="[bold]AI Configuration", 315 | border_style="bold", 316 | ) 317 | ) 318 | else: 319 | llm_config = None 320 | dynamic_model_container = None 321 | 322 | # Generate run_name if not provided 323 | if not run_name: 324 | run_name = datetime.now().strftime("%Y%m%d_%H%M%S") 325 | else: 326 | # Ensure run_name is filesystem-friendly 327 | run_name = "".join(c for c in run_name if c.isalnum() or c in ("-", "_")) 328 | if not run_name: 329 | run_name = str(uuid4()) 330 | 331 | url = url.rstrip("/") 332 | console_out.print( 333 | Panel.fit( 334 | Text.assemble( 335 | ("Primary URL: ", "cyan"), 336 | (f"{url}", "green"), 337 | "\n", 338 | ("Scraper: ", "cyan"), 339 | (f"{scraper}", "green"), 340 | "\n", 341 | ("Scrape Max Parallel: ", "cyan"), 342 | (f"{scrape_max_parallel}", "green"), 343 | "\n", 344 | ("Retries: ", "cyan"), 345 | ( 346 | f"{scrape_retries}", 347 | "green", 348 | ), 349 | "\n", 350 | ("Crawl Type: ", "cyan"), 351 | (f"{crawl_type.value}", "green"), 352 | "\n", 353 | ("Crawl Batch Size: ", "cyan"), 354 | (f"{crawl_batch_size}", "green"), 355 | "\n", 356 | ("Respect Rate Limits: ", "cyan"), 357 | (f"{respect_rate_limits}", "green"), 358 | "\n", 359 | ("Default Crawl Delay: ", "cyan"), 360 | (f"{crawl_delay} seconds", "green"), 361 | "\n", 362 | ("Output Format: ", "cyan"), 363 | (", 
".join([f"{format.value}" for format in output_format]), "green"), 364 | "\n", 365 | ("Max Pages: ", "cyan"), 366 | (f"{crawl_max_pages}", "green"), 367 | "\n", 368 | ("Headless: ", "cyan"), 369 | (f"{headless}", "green"), 370 | "\n", 371 | ("Wait Type: ", "cyan"), 372 | (f"{wait_type.value}", "green"), 373 | "\n", 374 | ("Wait Selector: ", "cyan"), 375 | ( 376 | f"{wait_selector if wait_type in (ScraperWaitType.SELECTOR, ScraperWaitType.TEXT) else 'N/A'}", 377 | "green", 378 | ), 379 | "\n", 380 | ("Sleep Time: ", "cyan"), 381 | ( 382 | f"{sleep_time} seconds", 383 | "green", 384 | ), 385 | "\n", 386 | ("Display output: ", "cyan"), 387 | (f"{display_output or 'None'}", "green"), 388 | "\n", 389 | ("Silent mode: ", "cyan"), 390 | (f"{silent}", "green"), 391 | "\n", 392 | ("Cleanup: ", "cyan"), 393 | (f"{cleanup}", "green"), 394 | ), 395 | title="[bold]Scraping Configuration", 396 | border_style="bold", 397 | ) 398 | ) 399 | 400 | with console_out.capture() if silent else nullcontext(): 401 | if cleanup in [CleanupType.BEFORE, CleanupType.BOTH]: 402 | if os.path.exists(output_folder): 403 | shutil.rmtree(output_folder) 404 | console_out.print(f"[bold green]Removed existing output folder: {output_folder}[/bold green]") 405 | try: 406 | init_db() 407 | add_to_queue(run_name, [url]) 408 | 409 | with get_parai_callback(show_pricing=pricing if llm_needed else PricingDisplay.NONE) as cb: 410 | with console_out.status("[bold green]Starting fetch loop...") as status: 411 | start_time = time.time() 412 | num_pages: int = 0 413 | base_output_folder = Path("./output") 414 | # Set initial crawl delay for all domains 415 | if respect_rate_limits and crawl_delay > 1: 416 | with sqlite3.connect(DB_PATH) as conn: 417 | conn.execute("UPDATE domain_rate_limit SET crawl_delay = ?", (crawl_delay,)) 418 | 419 | while num_pages < crawl_max_pages: 420 | # Get queue statistics 421 | queue_stats = get_queue_stats(run_name) 422 | queued = queue_stats.get(PageStatus.QUEUED.value, 0) 423 | completed = queue_stats.get(PageStatus.COMPLETED.value, 0) 424 | errors = queue_stats.get(PageStatus.ERROR.value, 0) 425 | active = queue_stats.get(PageStatus.ACTIVE.value, 0) 426 | 427 | status.update( 428 | f"[bold cyan]Queue status: " 429 | f"[yellow]{queued}[/yellow] queued, " 430 | f"[green]{completed}[/green] completed, " 431 | f"[red]{errors}[/red] errors, " 432 | f"[blue]{active}[/blue] active" 433 | ) 434 | 435 | urls = get_next_urls( 436 | run_name, crawl_batch_size, scrape_retries, respect_rate_limits=respect_rate_limits 437 | ) 438 | 439 | if not urls: 440 | # Check if there are any active URLs that might complete 441 | if active > 0: 442 | console_out.print(f"[yellow]Waiting for {active} active URLs to complete...[/yellow]") 443 | time.sleep(2) # Give a small delay to avoid tight loop 444 | continue 445 | else: 446 | break 447 | num_pages += len(urls) 448 | 449 | try: 450 | raw_htmls = fetch_url( 451 | urls, 452 | fetch_using=scraper.value, 453 | max_parallel=scrape_max_parallel, 454 | sleep_time=sleep_time, 455 | wait_type=wait_type, 456 | wait_selector=wait_selector, 457 | headless=headless, 458 | verbose=True, 459 | console=console_out, 460 | ) 461 | if not raw_htmls: 462 | raise ValueError("No data was fetched") 463 | 464 | if len(raw_htmls) != len(urls): 465 | raise ValueError(f"Mismatch between URLs {len(urls)} and fetched data {len(raw_htmls)}") 466 | url_data = zip(urls, raw_htmls) 467 | for current_url, raw_html in url_data: 468 | try: 469 | console_out.print(f"[green]{current_url}") 470 | 471 | # Use an even 
more aggressive approach to avoid nesting 472 | # 1. Completely clean the URL of any run_name occurrences 473 | cleaned_url = clean_url_of_ticket_id(current_url, run_name) 474 | 475 | url_output_folder = get_url_output_folder(base_output_folder, run_name, cleaned_url) 476 | 477 | # 2. Print for debugging 478 | console_out.print(f"[blue]Output folder: {url_output_folder}[/blue]") 479 | # Create necessary directories 480 | if llm_needed: 481 | url_output_folder.mkdir(parents=True, exist_ok=True) 482 | else: 483 | url_output_folder.parent.mkdir(parents=True, exist_ok=True) 484 | # console_out.print(f"[green]{url_output_folder}") 485 | 486 | if not raw_html: 487 | raise ValueError("No data was fetched") 488 | 489 | # console_out.print(f"cu:{current_url} -- u:{url}") 490 | 491 | if ( 492 | crawl_type == CrawlType.SINGLE_LEVEL and current_url == url 493 | ) or crawl_type == CrawlType.DOMAIN: 494 | # Extract links, respecting robots.txt 495 | page_links = extract_links( 496 | current_url, 497 | raw_html, 498 | crawl_type, 499 | respect_robots=respect_robots, 500 | console=console_out, 501 | ticket_id=run_name, 502 | ) 503 | 504 | # Calculate the current page depth 505 | current_depth = 0 506 | with sqlite3.connect(DB_PATH) as conn: 507 | row = conn.execute( 508 | "SELECT depth FROM scrape WHERE ticket_id = ? AND url = ?", 509 | (run_name, current_url), 510 | ).fetchone() 511 | if row: 512 | current_depth = row[0] 513 | 514 | # Add extracted links to queue with incremented depth 515 | if page_links: 516 | console_out.print(f"[cyan]Found {len(page_links)} links on {current_url}") 517 | add_to_queue(run_name, page_links, current_depth + 1) 518 | # break 519 | status.update("[bold cyan]Converting HTML to Markdown...") 520 | markdown = html_to_markdown(raw_html, url=current_url, include_images=True) 521 | if not markdown: 522 | raise ValueError("Markdown data is empty") 523 | 524 | # Save raw data 525 | status.update("[bold cyan]Saving raw data...") 526 | raw_output_path = save_raw_data(markdown, url_output_folder) 527 | 528 | if "Application error" in markdown: 529 | raise ValueError("Application error encountered.") 530 | 531 | if llm_needed: 532 | status.update("[bold cyan]Extracting data with LLM...") 533 | assert dynamic_model_container and llm_config 534 | formatted_data = format_data( 535 | data=markdown, 536 | dynamic_listings_container=dynamic_model_container, 537 | llm_config=llm_config, 538 | prompt_cache=prompt_cache, 539 | extraction_prompt=extraction_prompt, 540 | ) 541 | if not formatted_data: 542 | raise ValueError("No data was found by the LLM.") 543 | 544 | # Save formatted data 545 | status.update("[bold cyan]Saving extracted data...") 546 | _, file_paths = save_formatted_data( 547 | formatted_data=formatted_data, 548 | run_name=run_name, 549 | output_folder=url_output_folder, 550 | output_formats=output_format, 551 | ) 552 | else: 553 | file_paths = {} 554 | if OutputFormat.MARKDOWN not in file_paths: 555 | file_paths[OutputFormat.MARKDOWN] = raw_output_path 556 | 557 | mark_complete( 558 | run_name, 559 | current_url, 560 | raw_file_path=raw_output_path, 561 | file_paths=file_paths, 562 | ) 563 | 564 | # Display output if requested 565 | if display_output: 566 | if display_output.value in file_paths: 567 | content = file_paths[display_output.value].read_text() 568 | display_formatted_output(content, display_output, console_out) 569 | else: 570 | console_out.print( 571 | f"[bold red]Invalid output type: {display_output.value}[/bold red]" 572 | ) 573 | if llm_needed: 574 | 
console_out.print("Current session price:") 575 | show_llm_cost( 576 | cb.usage_metadata, show_pricing=PricingDisplay.PRICE, console=console_out 577 | ) 578 | 579 | console_out.print( 580 | Panel.fit( 581 | "\n".join( 582 | set([str(p) for p in file_paths.values()] + [str(raw_output_path)]) 583 | ), 584 | title="Files", 585 | ) 586 | ) 587 | except Exception as e: 588 | # Classify error type 589 | error_type = ErrorType.OTHER 590 | error_msg = str(e) 591 | 592 | if "timeout" in error_msg.lower() or "timed out" in error_msg.lower(): 593 | error_type = ErrorType.TIMEOUT 594 | elif "network" in error_msg.lower() or "connection" in error_msg.lower(): 595 | error_type = ErrorType.NETWORK 596 | elif "robots.txt" in error_msg.lower() or "disallowed" in error_msg.lower(): 597 | error_type = ErrorType.ROBOTS_DISALLOWED 598 | elif "html" in error_msg.lower() or "parse" in error_msg.lower(): 599 | error_type = ErrorType.PARSING 600 | elif "url" in error_msg.lower() or "scheme" in error_msg.lower(): 601 | error_type = ErrorType.INVALID_URL 602 | 603 | mark_error(run_name, current_url, error_msg, error_type) 604 | console_out.print( 605 | f"[bold red]URL processing error ([yellow]{error_type.value}[/yellow]):[/bold red][blue]{current_url}[/blue] {error_msg}" 606 | ) 607 | 608 | # Adjust rate limits on network errors 609 | if error_type == ErrorType.NETWORK or error_type == ErrorType.TIMEOUT: 610 | domain = urlparse(current_url).netloc 611 | current_delay = 1 612 | with sqlite3.connect(DB_PATH) as conn: 613 | row = conn.execute( 614 | "SELECT crawl_delay FROM domain_rate_limit WHERE domain = ?", (domain,) 615 | ).fetchone() 616 | if row: 617 | current_delay = row[0] 618 | 619 | # Increase delay for this domain (max 30 seconds) 620 | new_delay = min(current_delay * 2, 30) 621 | set_crawl_delay(domain, new_delay) 622 | console_out.print( 623 | f"[yellow]Increased rate limit for {domain} to {new_delay} seconds[/yellow]" 624 | ) 625 | except Exception as e: 626 | # Determine error type 627 | error_type = ErrorType.OTHER 628 | error_msg = str(e) 629 | 630 | if "timeout" in error_msg.lower() or "timed out" in error_msg.lower(): 631 | error_type = ErrorType.TIMEOUT 632 | elif "network" in error_msg.lower() or "connection" in error_msg.lower(): 633 | error_type = ErrorType.NETWORK 634 | 635 | for current_url in urls: 636 | mark_error(run_name, current_url, error_msg, error_type) 637 | 638 | console_out.print( 639 | f"[bold red]A fetch error occurred ([yellow]{error_type.value}[/yellow]):[/bold red] {error_msg}" 640 | ) 641 | # end while num_pages < crawl_max_pages 642 | duration = time.time() - start_time 643 | console_out.print( 644 | Panel.fit( 645 | f"Pages {num_pages} in {duration:.1f} seconds. {num_pages / duration:.1f} pages per second." 
646 | ) 647 | ) 648 | if llm_needed: 649 | console_out.print("Grand total:") 650 | 651 | # end queue_status 652 | # end get_parai_callback 653 | except Exception as e: 654 | console_out.print(f"[bold red]A general error occurred:[/bold red] {str(e)}") 655 | finally: 656 | if cleanup in [CleanupType.BOTH, CleanupType.AFTER]: 657 | with console_out.status("[bold yellow]Cleaning up..."): 658 | if os.path.exists(output_folder): 659 | shutil.rmtree(output_folder) 660 | console_out.print( 661 | f"[bold green]Removed output folder and its contents: {output_folder}[/bold green]" 662 | ) 663 | 664 | 665 | if __name__ == "__main__": 666 | app() 667 | -------------------------------------------------------------------------------- /src/par_scrape/crawl.py: -------------------------------------------------------------------------------- 1 | """Web crawling functionality for par_scrape.""" 2 | 3 | import sqlite3 4 | import time 5 | import urllib.robotparser 6 | from collections.abc import Iterable 7 | from enum import Enum 8 | from pathlib import Path 9 | from urllib.parse import urljoin, urlparse 10 | 11 | from bs4 import BeautifulSoup 12 | from par_ai_core.web_tools import normalize_url 13 | from rich.console import Console 14 | 15 | from par_scrape.enums import OutputFormat 16 | 17 | 18 | def clean_url_of_ticket_id(url: str, ticket_id: str) -> str: 19 | """ 20 | Clean a URL of any occurrences of the ticket_id to prevent nesting issues. 21 | 22 | Args: 23 | url: The URL to clean 24 | ticket_id: The ticket_id to remove from the URL 25 | 26 | Returns: 27 | str: The cleaned URL 28 | """ 29 | # Skip if URL is not valid 30 | if not is_valid_url(url): 31 | return url 32 | 33 | # Parse the URL 34 | parsed = urlparse(url) 35 | 36 | # Clean the path of ticket_id - aggressively remove ALL instances 37 | path_parts = parsed.path.split("/") 38 | cleaned_parts = [] 39 | 40 | for part in path_parts: 41 | # Skip empty parts and parts that match ticket_id 42 | if part != "" and part != ticket_id: 43 | cleaned_parts.append(part) 44 | 45 | # Rebuild path with cleaned parts 46 | cleaned_path = "/" + "/".join(cleaned_parts) 47 | 48 | # Also clean query parameters if they contain the ticket_id 49 | query = parsed.query 50 | if ticket_id in query: 51 | query_pairs = query.split("&") 52 | cleaned_query_pairs = [] 53 | 54 | for pair in query_pairs: 55 | if ticket_id not in pair: 56 | cleaned_query_pairs.append(pair) 57 | 58 | query = "&".join(cleaned_query_pairs) 59 | 60 | # Rebuild the URL with cleaned path and query 61 | cleaned_parsed = parsed._replace(path=cleaned_path, query=query) 62 | cleaned_url = cleaned_parsed.geturl() 63 | 64 | return cleaned_url 65 | 66 | 67 | # from tldextract import tldextract 68 | 69 | BASE_PATH = Path("~/.par_scrape").expanduser() 70 | # BASE_PATH = Path(__file__).parent # debug path 71 | DB_PATH = BASE_PATH / "jobs.sqlite" 72 | # PAGES_BASE = BASE_PATH / "pages" 73 | 74 | # Global dictionary to store robots.txt parsers by domain 75 | ROBOTS_PARSERS: dict[str, urllib.robotparser.RobotFileParser] = {} 76 | # Set of excluded URL patterns (common non-content URLs) 77 | EXCLUDED_URL_PATTERNS = { 78 | "/login", 79 | "/logout", 80 | "/signin", 81 | "/signout", 82 | "/register", 83 | "/password", 84 | "/cart", 85 | "/checkout", 86 | "/search", 87 | "/cdn-cgi/", 88 | "/wp-admin/", 89 | "/wp-login.php", 90 | "/favicon.ico", 91 | "/sitemap.xml", 92 | "/robots.txt", 93 | "/feed", 94 | "/rss", 95 | "/comments", 96 | } 97 | # Default user agent for robots.txt 98 | DEFAULT_USER_AGENT = "par-scrape/1.0 
(+https://github.com/paulrobello/par_scrape)" 99 | 100 | 101 | class CrawlType(str, Enum): 102 | """Types of web crawling strategies.""" 103 | 104 | SINGLE_PAGE = "single_page" 105 | SINGLE_LEVEL = "single_level" 106 | DOMAIN = "domain" 107 | # PAGINATED = "paginated" 108 | 109 | 110 | class PageStatus(str, Enum): 111 | """Status flags for pages in the crawl queue.""" 112 | 113 | QUEUED = "queued" 114 | ACTIVE = "active" 115 | COMPLETED = "completed" 116 | ERROR = "error" 117 | 118 | 119 | class ErrorType(str, Enum): 120 | """Types of errors that can occur during crawling.""" 121 | 122 | NETWORK = "network" 123 | PARSING = "parsing" 124 | ROBOTS_DISALLOWED = "robots_disallowed" 125 | INVALID_URL = "invalid_url" 126 | TIMEOUT = "timeout" 127 | OTHER = "other" 128 | 129 | 130 | def is_valid_url(url: str) -> bool: 131 | """ 132 | Validate if a URL is properly formatted and has a supported scheme. 133 | 134 | Args: 135 | url: The URL to validate 136 | 137 | Returns: 138 | bool: True if the URL is valid, False otherwise 139 | """ 140 | try: 141 | parsed = urlparse(url) 142 | return all([parsed.scheme in ("http", "https"), parsed.netloc]) 143 | except Exception: 144 | return False 145 | 146 | 147 | def get_url_output_folder(output_path: Path, ticket_id: str, url: str) -> Path: 148 | """ 149 | Get storage folder based on URL and ticket_id. 150 | 151 | Args: 152 | output_path: Base path for output files 153 | ticket_id: Unique identifier for the crawl job 154 | url: The URL being processed 155 | 156 | Returns: 157 | Path: The folder path where output for this URL should be stored 158 | """ 159 | # 1. Start with an absolute base folder - always use "./output" 160 | base_folder = output_path 161 | 162 | # 2. Add ticket_id once and only once 163 | run_folder = base_folder / ticket_id 164 | 165 | # 3. Parse the URL without any ticket_id contamination 166 | parsed_url = urlparse(url) 167 | domain = parsed_url.netloc.split(":")[0] # Remove port if present 168 | 169 | # 4. Get path components and aggressively filter out ticket_id 170 | raw_path = parsed_url.path.strip("/") 171 | 172 | # 5. If there's no path, just use the domain 173 | if not raw_path: 174 | return run_folder / domain 175 | 176 | # 6. Create a sanitized path by removing any ticket_id occurrences 177 | # and converting slashes to double underscores 178 | path_parts = raw_path.split("/") 179 | clean_parts = [] 180 | 181 | for part in path_parts: 182 | if part != ticket_id and part != "": 183 | clean_parts.append(part) 184 | 185 | sanitized_path = "__".join(clean_parts) 186 | 187 | # 7. Final path: ./output/ticket_id/domain/sanitized_path 188 | if sanitized_path: 189 | return run_folder / domain / sanitized_path 190 | else: 191 | return run_folder / domain 192 | 193 | 194 | def check_robots_txt(url: str, user_agent: str = DEFAULT_USER_AGENT) -> bool: 195 | """ 196 | Check if a URL is allowed by the site's robots.txt. 
197 | 198 | Args: 199 | url: The URL to check 200 | user_agent: User agent to use for robots.txt checking 201 | 202 | Returns: 203 | bool: True if the URL is allowed, False if disallowed 204 | """ 205 | try: 206 | parsed_url = urlparse(url) 207 | domain = parsed_url.netloc 208 | 209 | # Get or create a robot parser for this domain 210 | if domain not in ROBOTS_PARSERS: 211 | rp = urllib.robotparser.RobotFileParser() 212 | robots_url = f"{parsed_url.scheme}://{domain}/robots.txt" 213 | rp.set_url(robots_url) 214 | try: 215 | rp.read() 216 | ROBOTS_PARSERS[domain] = rp 217 | except Exception: 218 | # If we can't read robots.txt, assume everything is allowed 219 | return True 220 | 221 | # Check if URL is allowed 222 | return ROBOTS_PARSERS[domain].can_fetch(user_agent, url) 223 | except Exception: 224 | # On any failure, default to allowing the URL 225 | return True 226 | 227 | 228 | def should_exclude_url(url: str) -> bool: 229 | """ 230 | Check if a URL should be excluded based on common patterns. 231 | 232 | Args: 233 | url: The URL to check 234 | 235 | Returns: 236 | bool: True if the URL should be excluded, False otherwise 237 | """ 238 | parsed = urlparse(url) 239 | path = parsed.path.lower() 240 | 241 | # Check for file extensions that aren't likely to be content pages 242 | if path.endswith( 243 | (".jpg", ".jpeg", ".png", ".gif", ".pdf", ".zip", ".tar.gz", ".css", ".js", ".ico", ".xml", ".json") 244 | ): 245 | return True 246 | 247 | # Check for excluded patterns 248 | for pattern in EXCLUDED_URL_PATTERNS: 249 | if pattern in path: 250 | return True 251 | 252 | # URL seems fine 253 | return False 254 | 255 | 256 | def extract_links( 257 | base_url: str, 258 | html: str, 259 | crawl_type: CrawlType, 260 | respect_robots: bool = False, 261 | console: Console | None = None, 262 | ticket_id: str = "", 263 | ) -> list[str]: 264 | """ 265 | Extract links from HTML based on crawl type. 
266 | 267 | Args: 268 | base_url: The URL of the page being processed 269 | html: HTML content of the page 270 | crawl_type: Type of crawling to perform 271 | respect_robots: Whether to respect robots.txt 272 | console: Optional console for logging 273 | ticket_id: Optional ticket_id to clean from extracted URLs 274 | 275 | Returns: 276 | list[str]: List of normalized URLs to crawl next 277 | """ 278 | if crawl_type == CrawlType.SINGLE_PAGE: 279 | return [] 280 | 281 | try: 282 | soup = BeautifulSoup(html, "html.parser") 283 | links: set[str] = set() 284 | base_parsed = urlparse(base_url) 285 | 286 | # Find all link elements 287 | for link in soup.find_all("a", href=True): 288 | try: 289 | # We're using find_all with href=True, so we know href exists 290 | # Use type: ignore to bypass type checker for BeautifulSoup 291 | href = str(link["href"]) # type: ignore 292 | if not href or href.startswith(("javascript:", "mailto:", "tel:")): 293 | continue 294 | 295 | # Build absolute URL 296 | full_url = urljoin(base_url, href) 297 | 298 | # Validate the URL 299 | if not is_valid_url(full_url): 300 | continue 301 | 302 | parsed = urlparse(full_url) 303 | 304 | # Skip fragment-only URLs (same page anchors) 305 | if parsed.netloc == base_parsed.netloc and not parsed.path and parsed.fragment: 306 | continue 307 | 308 | # Apply crawl type filtering 309 | if ( 310 | crawl_type == CrawlType.SINGLE_LEVEL or crawl_type == CrawlType.DOMAIN 311 | ) and parsed.netloc == base_parsed.netloc: 312 | # Clean the URL of any ticket_id occurrences first to prevent nesting 313 | if ticket_id: 314 | full_url = clean_url_of_ticket_id(full_url, ticket_id) 315 | 316 | normalized_url = normalize_url(full_url) 317 | 318 | # Skip URLs that match common exclusion patterns 319 | if should_exclude_url(normalized_url): 320 | continue 321 | 322 | # Check robots.txt 323 | if respect_robots and not check_robots_txt(normalized_url): 324 | if console: 325 | console.print(f"[yellow]Skipping disallowed URL: {normalized_url}[/yellow]") 326 | continue 327 | 328 | links.add(normalized_url) 329 | # PAGINATED crawl type implementation would go here 330 | except Exception as e: 331 | if console: 332 | console.print(f"[red]Error processing link: {str(e)}[/red]") 333 | continue 334 | 335 | return list(links) 336 | except Exception as e: 337 | if console: 338 | console.print(f"[red]Error extracting links: {str(e)}[/red]") 339 | return [] 340 | 341 | 342 | def init_db() -> None: 343 | """ 344 | Initialize database with required tables. 345 | 346 | Creates the database if it doesn't exist and ensures the schema is up-to-date. 347 | Checks for version table and removes incompatible databases. 
348 | """ 349 | # Current database schema version 350 | CURRENT_DB_VERSION = 1 351 | 352 | DB_PATH.parent.mkdir(parents=True, exist_ok=True) 353 | 354 | # Check if database exists and if it has our version table 355 | if DB_PATH.exists(): 356 | try: 357 | with sqlite3.connect(DB_PATH) as conn: 358 | # Check if db_version table exists 359 | cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='db_version'") 360 | if not cursor.fetchone(): 361 | # No version table, remove the incompatible database 362 | conn.close() 363 | DB_PATH.unlink() 364 | print(f"Removed incompatible database at {DB_PATH}") 365 | except sqlite3.Error: 366 | # If any error occurs, assume the database is corrupted or incompatible 367 | DB_PATH.unlink() 368 | print(f"Removed corrupted database at {DB_PATH}") 369 | 370 | with sqlite3.connect(DB_PATH) as conn: 371 | # Enable foreign keys 372 | conn.execute("PRAGMA foreign_keys = ON") 373 | 374 | # Create version tracking table first 375 | conn.execute(""" 376 | CREATE TABLE IF NOT EXISTS db_version ( 377 | version INTEGER PRIMARY KEY, 378 | created_at INTEGER DEFAULT (strftime('%s','now')), 379 | description TEXT 380 | ) 381 | """) 382 | 383 | # Check current version 384 | cursor = conn.execute("SELECT version FROM db_version ORDER BY version DESC LIMIT 1") 385 | row = cursor.fetchone() 386 | db_version = row[0] if row else 0 387 | 388 | # If database is outdated, update schema as needed 389 | if db_version < CURRENT_DB_VERSION: 390 | # Create the main scrape table with enhanced fields 391 | conn.execute(""" 392 | CREATE TABLE IF NOT EXISTS scrape ( 393 | ticket_id TEXT, 394 | url TEXT, 395 | status TEXT CHECK(status IN ('queued', 'active', 'completed', 'error')) NOT NULL, 396 | error_type TEXT, 397 | error_msg TEXT, 398 | raw_file_path TEXT, 399 | md_file_path TEXT, 400 | json_file_path TEXT, 401 | csv_file_path TEXT, 402 | excel_file_path TEXT, 403 | scraped INTEGER, 404 | queued_at INTEGER DEFAULT (strftime('%s','now')), 405 | last_processed_at INTEGER, 406 | attempts INTEGER DEFAULT 0, 407 | cost FLOAT, 408 | domain TEXT, 409 | depth INTEGER DEFAULT 0, 410 | PRIMARY KEY (ticket_id, url) 411 | ) 412 | """) 413 | 414 | # Create domain rate limiting table 415 | conn.execute(""" 416 | CREATE TABLE IF NOT EXISTS domain_rate_limit ( 417 | domain TEXT PRIMARY KEY, 418 | last_access INTEGER, 419 | crawl_delay INTEGER DEFAULT 1 420 | ) 421 | """) 422 | 423 | # Create an index on status for faster querying 424 | conn.execute(""" 425 | CREATE INDEX IF NOT EXISTS idx_status ON scrape(status, ticket_id) 426 | """) 427 | 428 | # Create an index on domain for faster rate limit lookups 429 | conn.execute(""" 430 | CREATE INDEX IF NOT EXISTS idx_domain ON scrape(domain) 431 | """) 432 | 433 | # Update version information 434 | conn.execute( 435 | """ 436 | INSERT INTO db_version (version, description) 437 | VALUES (?, ?) 438 | """, 439 | (CURRENT_DB_VERSION, "Initial schema with scrape and domain_rate_limit tables"), 440 | ) 441 | 442 | 443 | def get_queue_stats(ticket_id: str) -> dict[str, int]: 444 | """ 445 | Get statistics about the queue for a ticket. 446 | 447 | Args: 448 | ticket_id: Unique identifier for the crawl job 449 | 450 | Returns: 451 | dict: Dictionary with counts of items in each status 452 | """ 453 | with sqlite3.connect(DB_PATH) as conn: 454 | stats = {} 455 | for status in PageStatus: 456 | row = conn.execute( 457 | """ 458 | SELECT COUNT(*) FROM scrape 459 | WHERE ticket_id = ? AND status = ? 
460 | """, 461 | (ticket_id, status.value), 462 | ).fetchone() 463 | stats[status.value] = row[0] if row else 0 464 | return stats 465 | 466 | 467 | def get_queue_size(ticket_id: str) -> int: 468 | """ 469 | Get the number of URLs in the queue for a ticket. 470 | 471 | Args: 472 | ticket_id: Unique identifier for the crawl job 473 | 474 | Returns: 475 | int: Number of URLs in queued status 476 | """ 477 | with sqlite3.connect(DB_PATH) as conn: 478 | row = conn.execute( 479 | """ 480 | SELECT COUNT(*) FROM scrape 481 | WHERE ticket_id = ? AND status = ? 482 | """, 483 | (ticket_id, PageStatus.QUEUED.value), 484 | ).fetchone() 485 | return row[0] if row else 0 486 | 487 | 488 | def add_to_queue(ticket_id: str, urls: Iterable[str], depth: int = 0) -> None: 489 | """ 490 | Add URLs to queue if they don't already exist. 491 | 492 | Args: 493 | ticket_id: Unique identifier for the crawl job 494 | urls: Collection of URLs to add to the queue 495 | depth: Crawl depth of these URLs (default: 0 for starting URLs) 496 | """ 497 | with sqlite3.connect(DB_PATH) as conn: 498 | for url in urls: 499 | # Skip invalid URLs 500 | if not is_valid_url(url): 501 | continue 502 | 503 | # Clean URL of any ticket_id occurrences to prevent nesting 504 | url = clean_url_of_ticket_id(url, ticket_id) 505 | 506 | # Normalize URL before adding 507 | url = normalize_url(url.rstrip("/")) 508 | parsed = urlparse(url) 509 | domain = parsed.netloc 510 | 511 | # Insert new URL or ignore if it exists 512 | conn.execute( 513 | """ 514 | INSERT OR IGNORE INTO scrape 515 | (ticket_id, url, status, domain, depth, queued_at) 516 | VALUES (?, ?, ?, ?, ?, strftime('%s','now')) 517 | """, 518 | (ticket_id, url, PageStatus.QUEUED.value, domain, depth), 519 | ) 520 | 521 | # Reset error status if re-adding 522 | conn.execute( 523 | """ 524 | UPDATE scrape 525 | SET status = ?, error_msg = NULL, error_type = NULL 526 | WHERE ticket_id = ? AND url = ? AND status = ? 527 | """, 528 | (PageStatus.QUEUED.value, ticket_id, url, PageStatus.ERROR.value), 529 | ) 530 | 531 | # Ensure domain exists in rate limit table 532 | conn.execute( 533 | """ 534 | INSERT OR IGNORE INTO domain_rate_limit (domain, last_access, crawl_delay) 535 | VALUES (?, 0, 1) 536 | """, 537 | (domain,), 538 | ) 539 | 540 | 541 | def get_next_urls( 542 | ticket_id: str, crawl_batch_size: int = 1, scrape_retries: int = 3, respect_rate_limits: bool = True 543 | ) -> list[str]: 544 | """ 545 | Get next batch of URLs to process from the queue, respecting rate limits. 546 | 547 | Args: 548 | ticket_id: Unique identifier for the crawl job 549 | crawl_batch_size: Maximum number of URLs to return 550 | scrape_retries: Maximum number of retry attempts for failed URLs 551 | respect_rate_limits: Whether to respect per-domain rate limits 552 | 553 | Returns: 554 | list[str]: List of URLs to process next 555 | """ 556 | current_time = int(time.time()) 557 | urls = [] 558 | domains_used = set() 559 | 560 | with sqlite3.connect(DB_PATH) as conn: 561 | # Query includes URLs from each domain respecting rate limits 562 | if respect_rate_limits: 563 | # First find eligible domains that respect rate limits 564 | rows = conn.execute( 565 | """ 566 | SELECT s.url, s.domain, d.last_access, d.crawl_delay 567 | FROM scrape s 568 | JOIN domain_rate_limit d ON s.domain = d.domain 569 | WHERE s.ticket_id = ? 570 | AND (s.status = ? OR (s.status = ? 
AND s.attempts < ?)) 571 | ORDER BY d.last_access ASC 572 | """, 573 | (ticket_id, PageStatus.QUEUED.value, PageStatus.ERROR.value, scrape_retries), 574 | ).fetchall() 575 | 576 | # Process each row, respecting rate limits 577 | for row in rows: 578 | url, domain, last_access, crawl_delay = row 579 | 580 | # Skip if we already have a URL from this domain in the batch 581 | if domain in domains_used: 582 | continue 583 | 584 | # Skip if rate limit not elapsed 585 | if last_access > 0 and current_time - last_access < crawl_delay: 586 | continue 587 | 588 | # Add URL to batch 589 | urls.append(url) 590 | domains_used.add(domain) 591 | 592 | # Update last access time for this domain 593 | conn.execute( 594 | """ 595 | UPDATE domain_rate_limit 596 | SET last_access = ? 597 | WHERE domain = ? 598 | """, 599 | (current_time, domain), 600 | ) 601 | 602 | # Stop if we have enough URLs 603 | if len(urls) >= crawl_batch_size: 604 | break 605 | else: 606 | # Simple version that doesn't respect rate limits 607 | rows = conn.execute( 608 | """ 609 | SELECT url FROM scrape 610 | WHERE ticket_id = ? AND (status = ? OR (status = ? AND attempts < ?)) 611 | LIMIT ? 612 | """, 613 | (ticket_id, PageStatus.QUEUED.value, PageStatus.ERROR.value, scrape_retries, crawl_batch_size), 614 | ).fetchall() 615 | urls = [row[0] for row in rows] 616 | 617 | # Mark selected URLs as active 618 | if urls: 619 | placeholders = ", ".join("?" for _ in urls) 620 | conn.execute( 621 | f""" 622 | UPDATE scrape 623 | SET status = ?, attempts = attempts + 1, last_processed_at = strftime('%s','now') 624 | WHERE ticket_id = ? AND url IN ({placeholders}) 625 | """, 626 | [PageStatus.ACTIVE.value, ticket_id] + urls, 627 | ) 628 | 629 | return urls 630 | 631 | 632 | def set_crawl_delay(domain: str, delay_seconds: int) -> None: 633 | """ 634 | Set the crawl delay for a specific domain. 635 | 636 | Args: 637 | domain: Domain to set rate limit for 638 | delay_seconds: Minimum seconds between requests to this domain 639 | """ 640 | with sqlite3.connect(DB_PATH) as conn: 641 | conn.execute( 642 | """ 643 | INSERT OR REPLACE INTO domain_rate_limit (domain, last_access, crawl_delay) 644 | VALUES (?, (SELECT last_access FROM domain_rate_limit WHERE domain = ?), ?) 645 | """, 646 | (domain, domain, delay_seconds), 647 | ) 648 | 649 | 650 | def mark_complete( 651 | ticket_id: str, url: str, *, raw_file_path: Path, file_paths: dict[OutputFormat, Path], cost: float = 0.0 652 | ) -> None: 653 | """ 654 | Mark URL as successfully scraped. 655 | 656 | Args: 657 | ticket_id: Unique identifier for the crawl job 658 | url: URL that was successfully processed 659 | raw_file_path: Path to the raw output file 660 | file_paths: Dictionary mapping output formats to file paths 661 | cost: Cost of processing this URL (if applicable) 662 | """ 663 | with sqlite3.connect(DB_PATH) as conn: 664 | conn.execute( 665 | """ 666 | UPDATE scrape 667 | SET status = ?, scraped = strftime('%s','now'), error_msg = null, error_type = null, 668 | raw_file_path = ?, md_file_path = ?, json_file_path = ?, csv_file_path = ?, excel_file_path = ?, 669 | cost = ?, last_processed_at = strftime('%s','now') 670 | WHERE ticket_id = ? AND url = ? 
671 | """, 672 | ( 673 | PageStatus.COMPLETED.value, 674 | str(raw_file_path), 675 | str(file_paths[OutputFormat.MARKDOWN]) if OutputFormat.MARKDOWN in file_paths else None, 676 | str(file_paths[OutputFormat.JSON]) if OutputFormat.JSON in file_paths else None, 677 | str(file_paths[OutputFormat.CSV]) if OutputFormat.CSV in file_paths else None, 678 | str(file_paths[OutputFormat.EXCEL]) if OutputFormat.EXCEL in file_paths else None, 679 | cost, 680 | ticket_id, 681 | url.rstrip("/"), 682 | ), 683 | ) 684 | 685 | 686 | def mark_error( 687 | ticket_id: str, url: str, error_msg: str, error_type: ErrorType = ErrorType.OTHER, cost: float = 0.0 688 | ) -> None: 689 | """ 690 | Mark URL as failed with error message and type. 691 | 692 | Args: 693 | ticket_id: Unique identifier for the crawl job 694 | url: URL that failed processing 695 | error_msg: Error message describing the failure 696 | error_type: Type of error that occurred 697 | cost: Cost of processing this URL (if applicable) 698 | """ 699 | with sqlite3.connect(DB_PATH) as conn: 700 | conn.execute( 701 | """ 702 | UPDATE scrape 703 | SET status = ?, error_msg = ?, error_type = ?, cost = ?, last_processed_at = strftime('%s','now') 704 | WHERE ticket_id = ? AND url = ? 705 | """, 706 | (PageStatus.ERROR.value, error_msg[:255], error_type.value, cost, ticket_id, url.rstrip("/")), 707 | ) 708 | -------------------------------------------------------------------------------- /src/par_scrape/enums.py: -------------------------------------------------------------------------------- 1 | """Enum for scraper choices.""" 2 | 3 | from strenum import StrEnum 4 | 5 | 6 | class CleanupType(StrEnum): 7 | """Enum for cleanup choices.""" 8 | 9 | NONE = "none" 10 | BEFORE = "before" 11 | AFTER = "after" 12 | BOTH = "both" 13 | 14 | 15 | class OutputFormat(StrEnum): 16 | """Enum for output formats.""" 17 | 18 | MARKDOWN = "md" 19 | JSON = "json" 20 | CSV = "csv" 21 | EXCEL = "excel" 22 | -------------------------------------------------------------------------------- /src/par_scrape/extraction_prompt.md: -------------------------------------------------------------------------------- 1 | ROLE: You are an intelligent text extraction and conversion assistant. 2 | TASK: Extract structured information from the user provided text into the format required to call DynamicListingsContainer. 3 | Ensure you include all data points in the output. 4 | If you encounter cases where you can't find the data for a specific field use an empty string "". 5 | You *MUST* call the `DynamicListingsContainer` function with the extracted data. 
6 | -------------------------------------------------------------------------------- /src/par_scrape/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paulrobello/par_scrape/56294b0f0e86434033fd0d1a1ae54900ae0f4585/src/par_scrape/py.typed -------------------------------------------------------------------------------- /src/par_scrape/scrape_data.py: -------------------------------------------------------------------------------- 1 | """Scrape data from Web.""" 2 | 3 | import json 4 | import os 5 | from pathlib import Path 6 | 7 | import pandas as pd 8 | from langchain_anthropic import ChatAnthropic 9 | from par_ai_core.llm_config import LlmConfig, llm_run_manager 10 | from par_ai_core.par_logging import console_out 11 | from pydantic import BaseModel, ConfigDict, create_model 12 | from rich.panel import Panel 13 | 14 | from par_scrape.enums import OutputFormat 15 | 16 | 17 | def save_raw_data(raw_data: str, output_base: Path) -> Path: 18 | """ 19 | Save raw data to a file. 20 | 21 | Args: 22 | raw_data (str): The raw data to save. 23 | output_base (str): The folder or base file_name to save the file in. Defaults to 'output'. 24 | 25 | Returns: 26 | Path: The path to the saved file. 27 | """ 28 | if output_base.is_dir(): 29 | # Use a simple filename without ticket_id since the path already has it 30 | raw_output_path = output_base / "raw_data.md" 31 | else: 32 | # For non-directory paths, just append -raw 33 | raw_output_path = Path(str(output_base) + "-raw.md") 34 | raw_output_path.write_text(raw_data) 35 | console_out.print(Panel(f"Raw data saved to [bold green]{raw_output_path}[/bold green]")) 36 | return raw_output_path 37 | 38 | 39 | def create_dynamic_model(field_names: list[str]) -> type[BaseModel]: 40 | """ 41 | Dynamically creates a Pydantic model based on provided fields. 42 | 43 | Args: 44 | field_names (List[str]): A list of names of the fields to extract from the markdown. 45 | 46 | Returns: 47 | Type[BaseModel]: A dynamically created Pydantic model. 48 | """ 49 | # Create field definitions using aliases for Field parameters 50 | field_definitions = {field: (str, ...) for field in field_names} 51 | # Dynamically create the model with all fields 52 | dynamic_listing_model = create_model( 53 | "DynamicListingModel", 54 | **field_definitions, # type: ignore 55 | ) # type: ignore 56 | dynamic_listing_model.model_config = ConfigDict(arbitrary_types_allowed=True) 57 | return dynamic_listing_model 58 | 59 | 60 | def create_container_model(dynamic_model: type[BaseModel]) -> type[BaseModel]: 61 | """ 62 | Create a container model that holds a list of the given listing model. 63 | 64 | Args: 65 | dynamic_model (Type[BaseModel]): The Pydantic model for individual listings. 66 | 67 | Returns: 68 | Type[BaseModel]: A container model for a list of listings. 69 | """ 70 | return create_model("DynamicListingsContainer", listings=(list[dynamic_model], ...)) 71 | 72 | 73 | # pylint: disable=too-many-positional-arguments 74 | def format_data( 75 | *, 76 | data: str, 77 | dynamic_listings_container: type[BaseModel], 78 | llm_config: LlmConfig, 79 | prompt_cache: bool = False, 80 | extraction_prompt: Path | None = None, 81 | ) -> BaseModel: 82 | """ 83 | Format data using the specified AI provider's API. 84 | 85 | Args: 86 | data (str): The input data to format. 87 | dynamic_listings_container (Type[BaseModel]): The Pydantic model to use for parsing. 88 | llm_config (LlmConfig): The configuration for the AI provider. 
89 | prompt_cache (bool): Whether to use prompt caching. 90 | extraction_prompt (Path): Path to the extraction prompt file. 91 | 92 | Returns: 93 | BaseModel: The Extracted data as a Pydantic model instance. 94 | """ 95 | if not extraction_prompt: 96 | extraction_prompt = Path(__file__).parent / "extraction_prompt.md" 97 | try: 98 | system_message = extraction_prompt.read_text(encoding="utf-8") 99 | except FileNotFoundError: 100 | console_out.print(f"[bold red]Extraction prompt file not found: {extraction_prompt}[/bold red]") 101 | raise 102 | 103 | user_message = f"Extract the following information from the provided text:\nPage content:\n\n{data}" 104 | 105 | try: 106 | chat_model = llm_config.build_chat_model() 107 | 108 | structure_model = chat_model.with_structured_output( 109 | dynamic_listings_container # , include_raw=True 110 | ) 111 | history = [ 112 | ("system", system_message), 113 | ( 114 | "user", 115 | [{"type": "text", "text": user_message}], 116 | ), 117 | ] 118 | 119 | if prompt_cache and isinstance(chat_model, ChatAnthropic): 120 | history[1][1][0]["cache_control"] = {"type": "ephemeral"} # type: ignore 121 | 122 | data = structure_model.invoke(history, config=llm_run_manager.get_runnable_config(chat_model.name)) # type: ignore 123 | if isinstance(data, BaseModel): 124 | return data 125 | console_out.print(data) 126 | raise ValueError("Error in API call. Did not return a Pydantic BaseModel") 127 | except Exception as e: # pylint: disable=broad-exception-caught 128 | console_out.print(f"[bold red]Error in API call or parsing response:[/bold red] {str(e)}") 129 | return dynamic_listings_container(listings=[]) 130 | 131 | 132 | def save_formatted_data( 133 | *, formatted_data: BaseModel, output_formats: list[OutputFormat], run_name: str, output_folder: Path 134 | ) -> tuple[pd.DataFrame | None, dict[OutputFormat, Path]]: 135 | """ 136 | Save Extracted data to JSON, Excel, CSV, and Markdown files. 137 | 138 | Note: run_name should only be used for logging/reference, not for directory creation 139 | since directories should already include run_name once via get_url_output_folder. 140 | 141 | Args: 142 | formatted_data (BaseModel): The Extracted data to save. 143 | output_formats (List[OutputFormat]): The desired output format. 144 | run_name (str): The run name used for logging purposes only. 145 | output_folder (Path): The folder to save the files in. 146 | 147 | Returns: 148 | Tuple[pd.DataFrame | None, Dict[OutputFormat, Path]]: The DataFrame created from the Extracted data and a dictionary of 149 | file paths, or None and an empty dict if an error occurred. 
150 | """ 151 | file_paths: dict[OutputFormat, Path] = {} 152 | # Ensure the output folder exists 153 | os.makedirs(output_folder, exist_ok=True) 154 | 155 | # Prepare Extracted data as a dictionary 156 | formatted_data_dict = formatted_data.model_dump() 157 | 158 | if OutputFormat.JSON in output_formats: 159 | # Save the Extracted data as JSON without adding run_name to the filename 160 | # as the run_name is already part of the folder structure 161 | json_output_path = output_folder / "extracted_data.json" 162 | json_output_path.write_text(json.dumps(formatted_data_dict, indent=4), encoding="utf-8") 163 | 164 | console_out.print(Panel(f"Extracted data saved to JSON at [bold green]{json_output_path}[/bold green]")) 165 | file_paths[OutputFormat.JSON] = json_output_path 166 | 167 | # Prepare data for DataFrame 168 | if isinstance(formatted_data_dict, dict): 169 | # If the data is a dictionary containing lists, assume these lists are records 170 | data_for_df = next(iter(formatted_data_dict.values())) if len(formatted_data_dict) == 1 else formatted_data_dict 171 | elif isinstance(formatted_data_dict, list): 172 | data_for_df = formatted_data_dict 173 | else: 174 | raise ValueError("Extracted data is neither a dictionary nor a list, cannot convert to DataFrame") 175 | 176 | # Create DataFrame 177 | try: 178 | df = pd.DataFrame(data_for_df) 179 | 180 | if df.empty: 181 | raise ValueError("DataFrame is empty, cannot save to files") 182 | 183 | if OutputFormat.EXCEL in output_formats: 184 | try: 185 | # Don't include run_name in filename since it's already in the path 186 | excel_output_path = output_folder / "extracted_data.xlsx" 187 | df.to_excel(excel_output_path, index=False) 188 | console_out.print(Panel(f"Excel data saved to [bold green]{excel_output_path}[/bold green]")) 189 | file_paths[OutputFormat.EXCEL] = excel_output_path 190 | except Exception as e: 191 | console_out.print("[bold red]Error: Saving Excel failed[/bold red]") 192 | console_out.print(e) 193 | 194 | if OutputFormat.CSV in output_formats: 195 | try: 196 | # Don't include run_name in filename since it's already in the path 197 | csv_output_path = output_folder / "extracted_data.csv" 198 | df.to_csv(csv_output_path, index=False) 199 | console_out.print(Panel(f"CSV data saved to [bold green]{csv_output_path}[/bold green]")) 200 | file_paths[OutputFormat.CSV] = csv_output_path 201 | except Exception as e: 202 | console_out.print("[bold red]Error: Saving CSV failed[/bold red]") 203 | console_out.print(e) 204 | 205 | if OutputFormat.MARKDOWN in output_formats: 206 | try: 207 | # Don't include run_name in filename since it's already in the path 208 | markdown_output_path = output_folder / "extracted_data.md" 209 | markdown_output_path.write_text(df.to_markdown(index=False) or "", encoding="utf-8") 210 | console_out.print(Panel(f"Markdown table saved to [bold green]{markdown_output_path}[/bold green]")) 211 | file_paths[OutputFormat.MARKDOWN] = markdown_output_path 212 | except Exception as e: 213 | console_out.print("[bold red]Error: Saving Markdown table failed[/bold red]") 214 | console_out.print(e) 215 | return df, file_paths 216 | except Exception as e: 217 | console_out.print(f"[bold red]Error creating DataFrame or saving files:[/bold red] {str(e)}") 218 | return None, {} 219 | -------------------------------------------------------------------------------- /src/par_scrape/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for par_scrape.""" 2 | 
--------------------------------------------------------------------------------
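The short script below is a hypothetical driver sketch, not a file from the repository: it shows how the queue helpers in crawl.py and the dynamic-model helpers in scrape_data.py can be combined outside the Typer CLI. It assumes par_scrape is installed and importable, and fake_fetch() is a placeholder standing in for the real fetch_url() helper that __main__.py uses.

"""Hypothetical driver sketch -- not part of the repository."""

from pathlib import Path

from par_scrape.crawl import (
    ErrorType,
    add_to_queue,
    get_next_urls,
    get_queue_stats,
    get_url_output_folder,
    init_db,
    mark_complete,
    mark_error,
)
from par_scrape.enums import OutputFormat
from par_scrape.scrape_data import create_container_model, create_dynamic_model, save_raw_data


def fake_fetch(url: str) -> str:
    """Stand-in fetch that returns markdown instead of scraping the live page."""
    return f"# Example page\n\nContent for {url}\n"


def main() -> None:
    run_name = "example_run"
    init_db()
    add_to_queue(run_name, ["https://example.com/"])

    # Build the same kind of dynamic Pydantic models the CLI creates from its fields option.
    listing_model = create_dynamic_model(["title", "price"])
    container_model = create_container_model(listing_model)
    print(container_model.__name__)  # DynamicListingsContainer

    # Drain the queue one URL at a time, marking each page complete or errored.
    while urls := get_next_urls(run_name, crawl_batch_size=1):
        for url in urls:
            try:
                markdown = fake_fetch(url)
                out_dir = get_url_output_folder(Path("./output"), run_name, url)
                out_dir.mkdir(parents=True, exist_ok=True)
                raw_path = save_raw_data(markdown, out_dir)
                mark_complete(run_name, url, raw_file_path=raw_path, file_paths={OutputFormat.MARKDOWN: raw_path})
            except Exception as exc:
                mark_error(run_name, url, str(exc), ErrorType.NETWORK)

    print(get_queue_stats(run_name))


if __name__ == "__main__":
    main()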