├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── build.yml │ ├── publish-dev.yml │ ├── publish.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── Makefile ├── README.md ├── Screenshot.png ├── pyproject.toml ├── pyrightconfig.json ├── ruff.toml ├── src └── par_scrape │ ├── __init__.py │ ├── __main__.py │ ├── crawl.py │ ├── enums.py │ ├── extraction_prompt.md │ ├── py.typed │ ├── scrape_data.py │ └── utils.py └── uv.lock /.gitattributes: -------------------------------------------------------------------------------- 1 | # Autodetect text files 2 | * text=auto 3 | 4 | # ...Unless the name matches the following 5 | # overriding patterns 6 | 7 | # Definitively text files 8 | *.txt text 9 | *.json text 10 | *.js text 11 | *.ts text 12 | .env text 13 | .env-* text 14 | *.sh text 15 | *.sql text 16 | *.yml text 17 | *.py text 18 | *.js text 19 | *.ts text 20 | *.ini text 21 | *.jq text 22 | Dockerfile text 23 | Dockerfile.* text 24 | makefile text 25 | makefile.* text 26 | Makefile text 27 | Makefile.* text 28 | 29 | # Ensure those won't be messed up with 30 | *.jpg binary 31 | *.gif binary 32 | *.png binary 33 | 34 | # force line endings to be lf so db container does not blow up 35 | **/*.sh text eol=lf 36 | **/*.sql text eol=lf 37 | **/.env text eol=lf 38 | **/.env-* text eol=lf 39 | **/Dockerfile text eol=lf 40 | **/Dockerfile.* text eol=lf 41 | **/*.py text eol=lf 42 | **/*.js text eol=lf 43 | **/*.ts text eol=lf 44 | **/*.jq text eol=lf 45 | **/*.json text eol=lf 46 | **/*.yml text eol=lf 47 | **/Makefile text eol=lf 48 | **/Makefile.* text eol=lf 49 | **/makefile text eol=lf 50 | **/makefile.* text eol=lf 51 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build and Test 🐍 distribution 📦 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - 'v*.*.*' 9 | 10 | jobs: 11 | build: 12 | name: Build distribution 📦 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Install uv 18 | uses: astral-sh/setup-uv@v4 19 | with: 20 | enable-cache: true 21 | 22 | - name: Set up Python 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: "3.11" 26 | 27 | - name: Install dependencies 28 | run: | 29 | uv sync --all-extras --dev 30 | 31 | - name: Lint and Typecheck 32 | run: | 33 | make checkall 34 | 35 | - name: Package 36 | run: make package 37 | 38 | - name: Cache build artifacts 39 | uses: actions/cache@v4 40 | with: 41 | path: dist 42 | key: ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}-${{ github.sha }} 43 | 44 | - name: Store the distribution packages 45 | uses: actions/upload-artifact@v4 46 | with: 47 | name: python-package-distributions-ubuntu-latest-3.11-x64 48 | path: dist/ 49 | 50 | tag-version: 51 | runs-on: ubuntu-latest 52 | permissions: 53 | contents: write # Grant write access to the repository 54 | needs: 55 | - build # Wait for all build jobs to complete 56 | if: ${{ !startsWith(github.ref, 'refs/tags/v') }} 57 | steps: 58 | - name: Checkout repository 59 | uses: actions/checkout@v4 60 | 61 | - name: Install uv 62 | uses: astral-sh/setup-uv@v4 63 | with: 64 | enable-cache: true 65 | 66 | - name: Set up Python 67 | uses: actions/setup-python@v5 68 | with: 69 | python-version: '3.11' 70 | 71 | - name: Install dependencies 72 | run: | 73 | uv sync --all-extras --dev 74 | 75 | - name: Get version from __init__.py 76 | id: get_version 77 | run: | 78 | version=$(uv run python -c "from src.par_scrape import __version__; print(__version__)") 79 | echo "Raw version output: $version" 80 | echo "VERSION=$version" >> $GITHUB_ENV 81 | 82 | - name: Check version 83 | id: check_version 84 | run: | 85 | echo "Version in env: ${{ env.VERSION }}" 86 | if [ -z "${{ env.VERSION }}" ]; then 87 | echo "Error: VERSION is empty" 88 | exit 1 89 | fi 90 | 91 | - name: Configure Git 92 | run: | 93 | git config --global user.name "${{ github.actor }}" 94 | git config --global user.email "${{ github.event.pusher.email }}" 95 | 96 | - name: App VERSION 97 | run: echo "VERSION is ${{ env.VERSION }}" 98 | 99 | - name: Fetch all tags 100 | run: git fetch --tags 101 | 102 | - name: Check if tag exists 103 | id: check_tag 104 | run: | 105 | TAG_EXISTS=$(git tag --list "v${{ env.VERSION }}") 106 | if [ -z "$TAG_EXISTS" ]; then 107 | echo "TAG_EXISTS=false" >> $GITHUB_ENV 108 | else 109 | echo "TAG_EXISTS=true" >> $GITHUB_ENV 110 | fi 111 | 112 | - name: Delete existing tag locally and remotely 113 | if: env.TAG_EXISTS == 'true' 114 | env: 115 | VERSION: ${{ env.VERSION }} 116 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 117 | run: | 118 | git tag -d "v$VERSION" 119 | git push --delete origin "v$VERSION" 120 | 121 | - name: Create new 
tag 122 | env: 123 | VERSION: ${{ env.VERSION }} 124 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 125 | run: | 126 | if ! git tag -a "v$VERSION" -m "Version $VERSION"; then 127 | echo "Failed to create tag" 128 | exit 1 129 | fi 130 | if ! git push origin "v$VERSION"; then 131 | echo "Failed to push tag" 132 | exit 1 133 | fi 134 | -------------------------------------------------------------------------------- /.github/workflows/publish-dev.yml: -------------------------------------------------------------------------------- 1 | name: Publish 🐍 📦 to TestPyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | publish-to-testpypi: 8 | if: github.event_name == 'workflow_dispatch' # Only allow manual triggers 9 | name: Publish 🐍 distribution 📦 to TestPyPI 10 | runs-on: ubuntu-latest 11 | environment: 12 | name: testpypi 13 | url: https://test.pypi.org/p/par_scrape 14 | permissions: 15 | id-token: write # IMPORTANT: mandatory for trusted publishing 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | 21 | - name: Install uv 22 | uses: astral-sh/setup-uv@v3 23 | with: 24 | enable-cache: true 25 | 26 | - name: Set up Python 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: '3.11' 30 | 31 | - name: Install dependencies 32 | run: | 33 | uv sync --all-extras --dev 34 | 35 | - name: Get version from __init__.py 36 | id: get_version 37 | run: | 38 | version=$(uv run python -c "from src.par_scrape import __version__; print(__version__)") 39 | echo "Raw version output: $version" 40 | echo "VERSION=$version" >> $GITHUB_ENV 41 | 42 | - name: Check version 43 | id: check_version 44 | run: | 45 | echo "Version in env: ${{ env.VERSION }}" 46 | if [ -z "${{ env.VERSION }}" ]; then 47 | echo "Error: VERSION is empty" 48 | exit 1 49 | fi 50 | 51 | - name: Restore cached build artifacts 52 | uses: actions/cache@v4 53 | with: 54 | path: dist 55 | key: ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}-${{ github.sha }} 56 | fail-on-cache-miss: true 57 | 58 | - name: Publish distribution 📦 to TestPyPI 59 | uses: pypa/gh-action-pypi-publish@release/v1 60 | with: 61 | repository-url: https://test.pypi.org/legacy/ 62 | skip-existing: true 63 | 64 | - name: Discord notification 65 | env: 66 | DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} 67 | uses: Ilshidur/action-discord@master 68 | with: 69 | args: 'The project {{ EVENT_PAYLOAD.repository.full_name }} ${{ env.VERSION }} has been published to TestPyPI.' 
70 | continue-on-error: true 71 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 🐍 📦 to PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | types: [ published ] 7 | 8 | jobs: 9 | publish-to-pypi: 10 | if: github.event_name == 'workflow_dispatch' # Only allow manual triggers 11 | name: Publish 🐍 distribution 📦 to PyPI 12 | runs-on: ubuntu-latest 13 | environment: 14 | name: pypi 15 | url: https://pypi.org/p/par_scrape 16 | permissions: 17 | id-token: write # IMPORTANT: mandatory for trusted publishing 18 | 19 | steps: 20 | - name: Checkout repository 21 | uses: actions/checkout@v4 22 | 23 | - name: Install uv 24 | uses: astral-sh/setup-uv@v3 25 | with: 26 | enable-cache: true 27 | 28 | - name: Set up Python 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: '3.11' 32 | 33 | - name: Install dependencies 34 | run: | 35 | uv sync --all-extras --dev 36 | 37 | - name: Get version from __init__.py 38 | id: get_version 39 | run: | 40 | version=$(uv run python -c "from src.par_scrape import __version__; print(__version__)") 41 | echo "Raw version output: $version" 42 | echo "VERSION=$version" >> $GITHUB_ENV 43 | 44 | - name: Check version 45 | id: check_version 46 | run: | 47 | echo "Version in env: ${{ env.VERSION }}" 48 | if [ -z "${{ env.VERSION }}" ]; then 49 | echo "Error: VERSION is empty" 50 | exit 1 51 | fi 52 | 53 | - name: Restore cached build artifacts 54 | uses: actions/cache@v4 55 | with: 56 | path: dist 57 | key: ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}-${{ github.sha }} 58 | restore-keys: | 59 | ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}- 60 | ${{ runner.os }}-3.11-x64-${{ env.VERSION }}- 61 | ${{ runner.os }}-3.11-x64- 62 | fail-on-cache-miss: true 63 | 64 | - name: Publish distribution 📦 to PyPI 65 | uses: pypa/gh-action-pypi-publish@release/v1 66 | 67 | - name: Discord notification 68 | env: 69 | DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} 70 | uses: Ilshidur/action-discord@master 71 | with: 72 | args: 'The project {{ EVENT_PAYLOAD.repository.full_name }} ${{ env.VERSION }} has been published to PyPI.' 
73 | continue-on-error: true 74 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 🐍 distribution 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | github-release: 8 | name: Create GitHub Release 9 | runs-on: ubuntu-latest 10 | permissions: 11 | contents: write 12 | id-token: write 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v4 17 | 18 | - name: Set up Python 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: '3.11' 22 | 23 | - name: Install uv 24 | uses: astral-sh/setup-uv@v4 25 | 26 | - name: Install dependencies 27 | run: | 28 | uv sync 29 | 30 | - name: Get version from __init__.py 31 | id: get_version 32 | run: | 33 | version=$(uv run python -c "from src.par_scrape import __version__; print(__version__)") 34 | echo "Raw version output: $version" 35 | echo "VERSION=$version" >> $GITHUB_ENV 36 | 37 | - name: Check version 38 | id: check_version 39 | run: | 40 | echo "Version in env: ${{ env.VERSION }}" 41 | if [ -z "${{ env.VERSION }}" ]; then 42 | echo "Error: VERSION is empty" 43 | exit 1 44 | fi 45 | 46 | - name: Restore cached build artifacts 47 | uses: actions/cache@v4 48 | with: 49 | path: dist 50 | key: ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}-${{ github.sha }} 51 | restore-keys: | 52 | ${{ runner.os }}-3.11-x64-${{ hashFiles('**/pyproject.toml') }}- 53 | ${{ runner.os }}-3.11-x64- 54 | 55 | - name: Sign the dists with Sigstore 56 | uses: sigstore/gh-action-sigstore-python@v3.0.0 57 | with: 58 | inputs: >- 59 | ./dist/*.tar.gz 60 | ./dist/*.whl 61 | 62 | - name: Create GitHub Release 63 | env: 64 | GITHUB_TOKEN: ${{ github.token }} 65 | run: | 66 | gh release create \ 67 | 'release-v${{ env.VERSION }}' \ 68 | --repo '${{ github.repository }}' \ 69 | --generate-notes \ 70 | --latest 71 | 72 | - name: Upload artifact signatures to GitHub Release 73 | env: 74 | GITHUB_TOKEN: ${{ github.token }} 75 | run: | 76 | gh release upload \ 77 | 'release-v${{ env.VERSION }}' dist/** \ 78 | --repo '${{ github.repository }}' 79 | 80 | - name: Discord notification 81 | env: 82 | DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK }} 83 | uses: Ilshidur/action-discord@master 84 | with: 85 | args: 'A new release ${{ github.ref_name }} has been created for {{ EVENT_PAYLOAD.repository.full_name }}.' 
86 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### PythonVanilla template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # Installer logs 30 | pip-log.txt 31 | pip-delete-this-directory.txt 32 | 33 | # Unit test / coverage reports 34 | htmlcov/ 35 | .tox/ 36 | .nox/ 37 | .coverage 38 | .coverage.* 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | *.cover 43 | *.py,cover 44 | .hypothesis/ 45 | .pytest_cache/ 46 | cover/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # pyenv 53 | # For a library or package, you might want to ignore these files since the code is 54 | # intended to run in multiple environments; otherwise, check them in: 55 | # .python-version 56 | 57 | # pipenv 58 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 59 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 60 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 61 | # install all needed dependencies. 62 | #Pipfile.lock 63 | 64 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 65 | __pypackages__/ 66 | 67 | .aider* 68 | **/venv 69 | **/.venv 70 | **/.env 71 | **/.idea 72 | /config.json 73 | /output/ 74 | /.DS_Store 75 | /.ruff_cache/ 76 | /src/par_scrape/pages/ 77 | /src/par_scrape/jobs.sqlite 78 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_stages: [pre-commit, pre-push] 2 | default_language_version: 3 | python: python3.11 4 | fail_fast: false 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.6.0 8 | hooks: 9 | - id: check-merge-conflict 10 | - id: detect-private-key 11 | - id: end-of-file-fixer 12 | - id: mixed-line-ending 13 | - id: trailing-whitespace 14 | args: [--markdown-linebreak-ext=md] 15 | - id: check-docstring-first 16 | - id: check-toml 17 | - id: check-yaml 18 | - id: check-json 19 | - id: pretty-format-json 20 | args: [--autofix, --no-sort-keys] 21 | exclude: tests(/\w*)*/functional/|tests/input|tests(/.*)+/conftest.py|doc/data/messages|tests(/\w*)*data/|Pipfile.lock|output/.* 22 | 23 | - repo: local 24 | hooks: 25 | - id: pyright 26 | name: pyright 27 | entry: make 28 | language: system 29 | pass_filenames: false 30 | args: 31 | [typecheck] 32 | exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/|output/.* 33 | 34 | - repo: local 35 | hooks: 36 | - id: format 37 | name: format 38 | entry: make 39 | language: system 40 | pass_filenames: false 41 | args: 42 | [format] 43 | exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/|output/.* 44 | 45 | - repo: local 46 | hooks: 47 | - id: lint 48 | name: lint 49 | entry: make 50 | language: system 51 | pass_filenames: false 52 | args: 53 | [lint] 54 | exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/|output/.* 55 | 
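A minimal sketch of enabling these hooks locally, assuming the dev dependencies (which include pre-commit) have been installed with `uv sync`:

```bash
# Install the git hook scripts defined in .pre-commit-config.yaml
uv run pre-commit install
# Run every hook against the full working tree once
uv run pre-commit run --all-files
```

The Makefile's `pre-commit` target wraps the same `run --all-files` invocation.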
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Paul Robello 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Common make values. 3 | lib := par_scrape 4 | run := uv run 5 | python := $(run) python 6 | ruff := $(run) ruff 7 | pyright := $(run) pyright 8 | build := uv build 9 | 10 | #export UV_LINK_MODE=copy 11 | export PIPENV_VERBOSITY=-1 12 | ############################################################################## 13 | # Run the app. 14 | .PHONY: run 15 | run: # Run the app 16 | $(run) $(lib) "$(ARG1)" "$(ARG2)" "$(ARG3)" "$(ARG4)" "$(ARG5)" "$(ARG6)" "$(ARG7)" "$(ARG8)" "$(ARG9)" 17 | 18 | .PHONY: app_help 19 | app_help: # Show app help 20 | $(run) $(lib) --help 21 | 22 | 23 | ############################################################################## 24 | .PHONY: uv-lock 25 | uv-lock: 26 | uv lock 27 | 28 | .PHONY: uv-sync 29 | uv-sync: 30 | uv sync 31 | 32 | .PHONY: setup 33 | setup: uv-lock uv-sync # use this for first time run 34 | 35 | .PHONY: resetup 36 | resetup: remove-venv setup # Recreate the virtual environment from scratch 37 | 38 | .PHONY: remove-venv 39 | remove-venv: # Remove the virtual environment 40 | rm -rf .venv 41 | 42 | .PHONY: depsupdate 43 | depsupdate: # Update all dependencies 44 | uv sync -U 45 | 46 | .PHONY: depsshow 47 | depsshow: # Show the dependency graph 48 | uv tree 49 | 50 | .PHONY: shell 51 | shell: # Start shell inside of .venv 52 | $(run) bash 53 | ############################################################################## 54 | # Checking/testing/linting/etc. 55 | .PHONY: format 56 | format: # Reformat the code with ruff. 
57 | $(ruff) format src/$(lib) 58 | 59 | .PHONY: lint 60 | lint: # Run ruff over the library 61 | $(ruff) check src/$(lib) --fix 62 | 63 | .PHONY: typecheck 64 | typecheck: # Perform static type checks with pyright 65 | $(pyright) 66 | 67 | .PHONY: typecheck-stats 68 | typecheck-stats: # Perform static type checks with pyright and print stats 69 | $(pyright) --stats 70 | 71 | .PHONY: checkall 72 | checkall: format lint typecheck # Check all the things 73 | 74 | .PHONY: pre-commit # run pre-commit checks on all files 75 | pre-commit: 76 | pre-commit run --all-files 77 | 78 | .PHONY: pre-commit-update # run pre-commit and update hooks 79 | pre-commit-update: 80 | pre-commit autoupdate 81 | 82 | ############################################################################## 83 | # Package/publish. 84 | .PHONY: package 85 | package: clean # Package the library 86 | $(build) 87 | 88 | .PHONY: spackage 89 | spackage: # Create a source package for the library 90 | $(build) --sdist 91 | 92 | .PHONY: test-publish 93 | test-publish: package # Upload to testpypi 94 | $(publish) upload --index testpypi --check-url 95 | 96 | .PHONY: publish 97 | publish: package # Upload to pypi 98 | $(publish) upload --check-url 99 | ############################################################################## 100 | # Utility. 101 | 102 | .PHONY: repl 103 | repl: # Start a Python REPL 104 | $(python) 105 | 106 | .PHONY: clean 107 | clean: # Clean the build directories 108 | rm -rf build dist $(lib).egg-info 109 | 110 | .PHONY: help 111 | help: # Display this help 112 | @grep -Eh "^[a-z]+:.+# " $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.+# "}; {printf "%-20s %s\n", $$1, $$2}' 113 | 114 | ############################################################################## 115 | # Housekeeping tasks. 116 | .PHONY: housekeeping 117 | housekeeping: # Perform some git housekeeping 118 | git fsck 119 | git gc --aggressive 120 | git remote update --prune 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PAR Scrape 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/par_scrape)](https://pypi.org/project/par_scrape/) 4 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/par_scrape.svg)](https://pypi.org/project/par_scrape/) 5 | ![Runs on Linux | MacOS | Windows](https://img.shields.io/badge/runs%20on-Linux%20%7C%20MacOS%20%7C%20Windows-blue) 6 | ![Arch x86-63 | ARM | AppleSilicon](https://img.shields.io/badge/arch-x86--64%20%7C%20ARM%20%7C%20AppleSilicon-blue) 7 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/par_scrape) 8 | 9 | ![PyPI - License](https://img.shields.io/pypi/l/par_scrape) 10 | 11 | PAR Scrape is a versatile web scraping tool with options for Selenium or Playwright, featuring AI-powered data extraction and formatting. 
12 | 13 | [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://buymeacoffee.com/probello3) 14 | 15 | ## Screenshots 16 | ![PAR Scrape Screenshot](https://raw.githubusercontent.com/paulrobello/par_scrape/main/Screenshot.png) 17 | 18 | ## Features 19 | 20 | - Web scraping using Playwright or Selenium 21 | - AI-powered data extraction and formatting 22 | - Can be used to crawl and extract clean markdown without AI 23 | - Supports multiple output formats (JSON, Excel, CSV, Markdown) 24 | - Customizable field extraction 25 | - Token usage and cost estimation 26 | - Prompt cache for Anthropic provider 27 | - Uses my [PAR AI Core](https://github.com/paulrobello/par_ai_core) 28 | 29 | 30 | ## Known Issues 31 | - Selenium silent mode on Windows still shows a message about websockets. There is no simple way to get rid of this. 32 | - Providers other than OpenAI are hit-and-miss depending on provider / model / data being extracted. 33 | 34 | ## Prompt Cache 35 | - OpenAI will auto cache prompts that are over 1024 tokens. 36 | - Anthropic will only cache prompts if you specify the --prompt-cache flag. Because cache writes cost more, only enable this if you intend to run multiple scrape jobs against the same URL. The cache also goes stale within a couple of minutes, so to reduce cost run your jobs as close together as possible. 37 | 38 | ## How it works 39 | - Data is fetched from the site using either Selenium or Playwright 40 | - HTML is converted to clean markdown 41 | - If you specify an output format other than markdown, the following steps kick in: 42 | - A pydantic model is constructed from the fields you specify 43 | - The markdown is sent to the AI provider with the pydantic model as the required output 44 | - The structured output is saved in the specified formats 45 | - If crawling mode is enabled, this process is repeated for each page in the queue until the specified max number of pages is reached 46 | 47 | ## Site Crawling 48 | 49 | Crawling currently comes in 3 modes: 50 | - Single page, which is the default 51 | - Single level, which will crawl all links on the first page and add them to the queue. Links from any pages after the first are not added to the queue 52 | - Domain, which will crawl all links on all pages as long as they belong to the same top level domain (TLD). 53 | - Paginated will be added soon 54 | 55 | Crawling progress is stored in a SQLite database and all pages are tagged with the run name, which can be specified with the --run-name / -n flag. 56 | You can resume a crawl by specifying the same run name again. 57 | The option `--scrape-max-parallel` / `-P` can be used to increase the scraping speed by running multiple scrapes in parallel. 58 | The option `--crawl-batch-size` / `-b` should be set at least as high as the scrape max parallel option to ensure that the queue is always full. 59 | The option `--crawl-max-pages` / `-M` can be used to limit the total number of pages crawled in a single run. 60 | 61 | ## Prerequisites 62 | 63 | To install PAR Scrape, make sure you have Python 3.11 installed. 64 | 65 | ### [uv](https://pypi.org/project/uv/) is recommended 66 | 67 | #### Linux and Mac 68 | ```bash 69 | curl -LsSf https://astral.sh/uv/install.sh | sh 70 | ``` 71 | 72 | #### Windows 73 | ```bash 74 | powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex" 75 | ``` 76 | 77 | ## Installation 78 | 79 | 80 | ### Installation From Source 81 | 82 | To install from source, follow these steps: 83 | 84 | 1.
Clone the repository: 85 | ```bash 86 | git clone https://github.com/paulrobello/par_scrape.git 87 | cd par_scrape 88 | ``` 89 | 90 | 2. Install the package dependencies using uv: 91 | ```bash 92 | uv sync 93 | ``` 94 | ### Installation From PyPI 95 | 96 | To install PAR Scrape from PyPI, run either of the following commands: 97 | 98 | ```bash 99 | uv tool install par_scrape 100 | ``` 101 | 102 | ```bash 103 | pipx install par_scrape 104 | ``` 105 | ### Playwright Installation 106 | To use Playwright as the scraper, you must install it and its browsers using the following commands: 107 | 108 | ```bash 109 | uv tool install playwright 110 | playwright install chromium 111 | ``` 112 | 113 | ## Usage 114 | 115 | To use PAR Scrape, run it from the command line with various options. Basic examples are shown below. 116 | Ensure you have the API key for your AI provider in your environment; an example of exporting a key is shown after the key list below. 117 | You can also store your API keys in the file `~/.par_scrape.env` as follows: 118 | ```shell 119 | # AI API KEYS 120 | OPENAI_API_KEY= 121 | ANTHROPIC_API_KEY= 122 | GROQ_API_KEY= 123 | XAI_API_KEY= 124 | GOOGLE_API_KEY= 125 | MISTRAL_API_KEY= 126 | GITHUB_TOKEN= 127 | OPENROUTER_API_KEY= 128 | DEEPSEEK_API_KEY= 129 | # Used by Bedrock 130 | AWS_PROFILE= 131 | AWS_ACCESS_KEY_ID= 132 | AWS_SECRET_ACCESS_KEY= 133 | 134 | 135 | 136 | ### Tracing (optional) 137 | LANGCHAIN_TRACING_V2=false 138 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com 139 | LANGCHAIN_API_KEY= 140 | LANGCHAIN_PROJECT=par_scrape 141 | ``` 142 | 143 | ### AI API KEYS 144 | 145 | * ANTHROPIC_API_KEY is required for Anthropic. Get a key from https://console.anthropic.com/ 146 | * OPENAI_API_KEY is required for OpenAI. Get a key from https://platform.openai.com/account/api-keys 147 | * GITHUB_TOKEN is required for GitHub Models. Get a free key from https://github.com/marketplace/models 148 | * GOOGLE_API_KEY is required for Google Models. Get a free key from https://console.cloud.google.com 149 | * XAI_API_KEY is required for XAI. Get a free key from https://x.ai/api 150 | * GROQ_API_KEY is required for Groq. Get a free key from https://console.groq.com/ 151 | * MISTRAL_API_KEY is required for Mistral. Get a free key from https://console.mistral.ai/ 152 | * OPENROUTER_API_KEY is required for OpenRouter. Get a key from https://openrouter.ai/ 153 | * DEEPSEEK_API_KEY is required for Deepseek. Get a key from https://platform.deepseek.com/ 154 | * AWS_PROFILE or AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are used for Bedrock authentication. The environment must 155 | already be authenticated with AWS. 156 | * No key is required for Ollama, LlamaCpp, or LiteLLM.
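If you prefer not to create `~/.par_scrape.env`, a key can simply be exported in the shell before running. A minimal sketch using OpenAI (the key value is a placeholder):

```bash
# Provide the key for this shell session only (placeholder value)
export OPENAI_API_KEY="your-key-here"
# Any non-markdown output format (json here) triggers the AI extraction step
par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" -O json
```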
157 | 158 | 159 | ### Open AI Compatible Providers 160 | 161 | If a specific provider is not listed but has an OpenAI-compatible endpoint, you can use the following combination of variables: 162 | * PARAI_AI_PROVIDER=OpenAI 163 | * PARAI_MODEL=Your selected model 164 | * PARAI_AI_BASE_URL=The provider's OpenAI-compatible endpoint URL 165 | 166 | ### Running from source 167 | ```bash 168 | uv run par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" -f "Cache Price" --model gpt-4o-mini --display-output md 169 | ``` 170 | 171 | ### Running if installed from PyPI 172 | ```bash 173 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" -f "Cache Price" --model gpt-4o-mini --display-output md 174 | ``` 175 | 176 | ### Options 177 | ``` 178 | --url -u TEXT URL to scrape [default: https://openai.com/api/pricing/] 179 | --output-format -O [md|json|csv|excel] Output format for the scraped data [default: md] 180 | --fields -f TEXT Fields to extract from the webpage 181 | [default: Model, Pricing Input, Pricing Output, Cache Price] 182 | --scraper -s [selenium|playwright] Scraper to use: 'selenium' or 'playwright' [default: playwright] 183 | --retries -r INTEGER Retry attempts for failed scrapes [default: 3] 184 | --scrape-max-parallel -P INTEGER Max parallel fetch requests [default: 1] 185 | --wait-type -w [none|pause|sleep|idle|selector|text] Method to use for page content load waiting [default: sleep] 186 | --wait-selector -i TEXT Selector or text to use for page content load waiting. [default: None] 187 | --headless -h Run in headless mode (for Selenium) 188 | --sleep-time -t INTEGER Time to sleep before scrolling (in seconds) [default: 2] 189 | --ai-provider -a [Ollama|LlamaCpp|OpenRouter|OpenAI|Gemini|Github|XAI|Anthropic| 190 | Groq|Mistral|Deepseek|LiteLLM|Bedrock] AI provider to use for processing [default: OpenAI] 191 | --model -m TEXT AI model to use for processing. If not specified, a default model will be used. [default: None] 192 | --ai-base-url -b TEXT Override the base URL for the AI provider. [default: None] 193 | --prompt-cache Enable prompt cache for Anthropic provider 194 | --reasoning-effort [low|medium|high] Reasoning effort level to use for o1 and o3 models. [default: None] 195 | --reasoning-budget INTEGER Maximum context size for reasoning. [default: None] 196 | --display-output -d [none|plain|md|csv|json] Display output in terminal (md, csv, or json) [default: None] 197 | --output-folder -o PATH Specify the location of the output folder [default: output] 198 | --silent -q Run in silent mode, suppressing output 199 | --run-name -n TEXT Specify a name for this run.
Can be used to resume a crawl. Defaults to YYYYmmdd_HHMMSS 200 | --pricing -p [none|price|details] Enable pricing summary display [default: details] 201 | --cleanup -c [none|before|after|both] How to handle cleanup of output folder [default: none] 202 | --extraction-prompt -e PATH Path to the extraction prompt file [default: None] 203 | --crawl-type -C [single_page|single_level|domain] Enable crawling mode [default: single_page] 204 | --crawl-max-pages -M INTEGER Maximum number of pages to crawl this session [default: 100] 205 | --crawl-batch-size -B INTEGER Maximum number of pages to load from the queue at once [default: 1] 206 | --respect-rate-limits Whether to use domain-specific rate limiting [default: True] 207 | --respect-robots Whether to respect robots.txt 208 | --crawl-delay INTEGER Default delay in seconds between requests to the same domain [default: 1] 209 | --version -v 210 | --help Show this message and exit. 211 | ``` 212 | 213 | ### Examples 214 | 215 | * Basic usage with default options: 216 | ```bash 217 | par_scrape --url "https://openai.com/api/pricing/" -f "Model" -f "Pricing Input" -f "Pricing Output" -O json -O csv --pricing details --display-output csv 218 | ``` 219 | * Using Playwright, displaying JSON output and waiting for the text gpt-4o to be in the page before continuing: 220 | ```bash 221 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" --scraper playwright -O json -O csv -d json --pricing details -w text -i gpt-4o 222 | ``` 223 | * Specifying a custom model and output folder: 224 | ```bash 225 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" --model gpt-4 --output-folder ./custom_output -O json -O csv --pricing details -w text -i gpt-4o 226 | ``` 227 | * Running in silent mode with a custom run name: 228 | ```bash 229 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" --silent --run-name my_custom_run --pricing details -O json -O csv -w text -i gpt-4o 230 | ``` 231 | * Using the cleanup option to remove the output folder after scraping: 232 | ```bash 233 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" --cleanup after --pricing details -O json -O csv 234 | ``` 235 | * Using the pause wait type to wait for user input before scrolling: 236 | ```bash 237 | par_scrape --url "https://openai.com/api/pricing/" -f "Title" -f "Description" -f "Price" -w pause --pricing details -O json -O csv 238 | ``` 239 | * Using the Anthropic provider with prompt cache enabled and a detailed pricing breakdown: 240 | ```bash 241 | par_scrape -a Anthropic --prompt-cache -d csv -p details -f "Title" -f "Description" -f "Price" -f "Cache Price" -O json -O csv 242 | ``` 243 | 244 | * Crawling single level and only outputting markdown (no LLM or cost): 245 | ```bash 246 | par_scrape --url "https://openai.com/api/pricing/" -O md --crawl-batch-size 5 --scrape-max-parallel 5 --crawl-type single_level 247 | ``` 248 | 249 | 250 | ## Roadmap 251 | - API Server 252 | - More crawling options 253 | - Paginated Listing crawling 254 | 255 | 256 | ## What's New 257 | - Version 0.7.0 258 | - Major overhaul and fixing of crawling features.
259 | - Added --respect-robots flag to check robots.txt before scraping 260 | - Added --respect-rate-limits to respect rate limits for domains 261 | - Added --reasoning-effort and --reasoning-budget for o1/o3 and Sonnet 3.7 262 | - Updated dependencies 263 | - Version 0.6.1 264 | - Updated ai-core 265 | - Version 0.6.0 266 | - Fixed bug where images were being stripped from markdown output 267 | - Now uses par_ai_core for URL fetching and markdown conversion 268 | - New Features: 269 | - BREAKING CHANGES: 270 | - New option to specify desired output formats `-O` which defaults to markdown only, which does not require AI 271 | - BEHAVIOR CHANGES: 272 | - Now retries 3 times on failed scrapes 273 | - Basic site crawling 274 | - Retry failed fetches 275 | - HTTP authentication 276 | - Proxy settings 277 | - Updated system prompt for better results 278 | - Version 0.5.1 279 | - Update ai-core and dependencies 280 | - Now supports Deepseek, XAI and LiteLLM 281 | - Better pricing data 282 | - Version 0.5.0 283 | - Update ai-core and dependencies 284 | - Now supports OpenRouter 285 | - Version 0.4.9 286 | - Updated to use new par-ai-core 287 | - Now supports LlamaCPP and XAI Grok 288 | - Better cost tracking 289 | - Updated pricing data 290 | - Better error handling 291 | - Now supports Python 3.10 292 | - Version 0.4.8: 293 | - Added Anthropic prompt cache option. 294 | - Version 0.4.7: 295 | - BREAKING CHANGE: --pricing CLI option now takes a string value of 'details', 'cost', or 'none'. 296 | - Added pool of user agents that gets randomly pulled from. 297 | - Updated pricing data. 298 | - Pricing token capture and compute now much more accurate. 299 | - Version 0.4.6: 300 | - Minor bug fixes. 301 | - Updated pricing data. 302 | - Added support for Amazon Bedrock 303 | - Removed some unnecessary dependencies. 304 | - Code cleanup. 305 | - Version 0.4.5: 306 | - Added new option --wait-type that allows you to specify the type of wait to use such as pause, sleep, idle, text or selector. 307 | - Removed --pause option as it is no longer needed with the --wait-type option. 308 | - Playwright scraping now honors the headless mode. 309 | - Playwright is now the default scraper as it is much faster. 310 | - Version 0.4.4: 311 | - Better Playwright scraping. 312 | - Version 0.4.3: 313 | - Added option to override the base URL for the AI provider. 314 | - Version 0.4.2: 315 | - The url parameter can now point to a local rawData_*.md file for easier testing of different models without having to re-fetch the data. 316 | - Added ability to specify a file with the extraction prompt. 317 | - Tweaked extraction prompt to work with Groq and Anthropic. Google still does not work. 318 | - Removed need for ~/.par-scrape-config.json 319 | - Version 0.4.1: 320 | - Minor bug fixes for pricing summary. 321 | - Default model for Google changed to "gemini-1.5-pro-exp-0827" which is free and usually works well. 322 | - Version 0.4.0: 323 | - Added support for Anthropic, Google, Groq, and Ollama. (Not well tested with any providers other than OpenAI) 324 | - Added flag for displaying pricing summary. Defaults to False. 325 | - Added pricing data for Anthropic. 326 | - Better error handling for LLM calls. 327 | - Updated cleanup flag to handle both before and after cleanup. Removed --remove-output-folder flag. 328 | - Version 0.3.1: 329 | - Added pause and sleep-time options to control the browser and scraping delays. 330 | - Default headless mode to False so you can interact with the browser.
331 | - Version 0.3.0: 332 | - Fixed location of config.json file. 333 | 334 | ## Contributing 335 | 336 | Contributions are welcome! Please feel free to submit a Pull Request. 337 | 338 | ## License 339 | 340 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 341 | 342 | ## Author 343 | 344 | Paul Robello - probello@gmail.com 345 | -------------------------------------------------------------------------------- /Screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paulrobello/par_scrape/56294b0f0e86434033fd0d1a1ae54900ae0f4585/Screenshot.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "par_scrape" 3 | dynamic = [ 4 | "version", 5 | ] 6 | description = "A versatile web scraping tool with options for Selenium or Playwright, featuring OpenAI-powered data extraction and formatting." 7 | url = "https://github.com/paulrobello/par_scrape" 8 | readme = "README.md" 9 | requires-python = ">=3.10" 10 | authors = [ 11 | { name = "Paul Robello", email = "probello@gmail.com" }, 12 | ] 13 | maintainers = [ 14 | { name = "Paul Robello", email = "probello@gmail.com" }, 15 | ] 16 | classifiers = [ 17 | "License :: OSI Approved :: MIT License", 18 | "Environment :: Console", 19 | "Development Status :: 4 - Beta", 20 | "Intended Audience :: Developers", 21 | "Intended Audience :: End Users/Desktop", 22 | "Intended Audience :: Other Audience", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.10", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | "Operating System :: MacOS", 28 | "Operating System :: Microsoft :: Windows :: Windows 10", 29 | "Operating System :: Microsoft :: Windows :: Windows 11", 30 | "Operating System :: POSIX :: Linux", 31 | "Topic :: Internet :: WWW/HTTP :: Browsers", 32 | "Topic :: Software Development :: Libraries :: Python Modules", 33 | "Topic :: Text Processing :: Markup :: HTML", 34 | "Typing :: Typed", 35 | ] 36 | keywords = [ 37 | "web scraping", 38 | "data extraction", 39 | "selenium", 40 | "playwright", 41 | "openai", 42 | "anthropic", 43 | "xai", 44 | "openrouter", 45 | "groq", 46 | "ollama", 47 | "llamacpp", 48 | ] 49 | dependencies = [ 50 | "beautifulsoup4>=4.13.3", 51 | "pandas>=2.2.3", 52 | "pydantic>=2.10.6", 53 | "python-dotenv>=1.0.1", 54 | "rich>=13.9.4", 55 | "typer>=0.15.2", 56 | "openpyxl>=3.1.5", 57 | "tabulate>=0.9.0", 58 | "par-ai-core>=0.1.24", 59 | "fastapi>=0.115.11", 60 | "tldextract>=5.1.3", 61 | "strenum>=0.4.15", 62 | ] 63 | packages = [ 64 | "src/par_scrape", 65 | ] 66 | 67 | [project.license] 68 | file = "LICENSE" 69 | 70 | [project.urls] 71 | Homepage = "https://github.com/paulrobello/par_scrape" 72 | Documentation = "https://github.com/paulrobello/par_scrape/blob/main/README.md" 73 | Repository = "https://github.com/paulrobello/par_scrape" 74 | Issues = "https://github.com/paulrobello/par_scrape/issues" 75 | Discussions = "https://github.com/paulrobello/par_scrape/discussions" 76 | Wiki = "https://github.com/paulrobello/par_scrape/wiki" 77 | 78 | [project.scripts] 79 | par_scrape = "par_scrape.__main__:app" 80 | 81 | [build-system] 82 | requires = [ 83 | "hatchling", 84 | ] 85 | build-backend = "hatchling.build" 86 | 87 | [dependency-groups] 88 | dev = [ 89 | "build>=1.2.1", 90 | 
"pyright>=1.1.379", 91 | "ruff>=0.9.6", 92 | "pre-commit>=4.1.0", 93 | ] 94 | 95 | [tool.hatch.version] 96 | path = "src/par_scrape/__init__.py" 97 | 98 | [tool.hatch.metadata] 99 | allow-direct-references = true 100 | 101 | [tool.hatch.build.targets.wheel] 102 | packages = [ 103 | "src/par_scrape", 104 | ] 105 | include = [ 106 | "py.typed", 107 | "**/*.py", 108 | "**/*.html", 109 | "**/*.gif", 110 | "**/*.jpg", 111 | "**/*.png", 112 | "**/*.md", 113 | ] 114 | 115 | [tool.hatch.build.targets.sdist] 116 | include = [ 117 | "src/par_scrape", 118 | "LICENSE", 119 | "README.md", 120 | "extraction_prompt.md", 121 | "pyproject.toml", 122 | ] 123 | exclude = [ 124 | "*.pyc", 125 | "__pycache__", 126 | "*.so", 127 | "*.dylib", 128 | ] 129 | -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "include": [ 3 | "src/**/*.py" 4 | ], 5 | "exclude": [ 6 | "**/node_modules", 7 | "**/__pycache__", 8 | "**/output" 9 | ], 10 | "ignore": [ 11 | "**/.venv" 12 | ], 13 | "defineConstant": { 14 | "DEBUG": true 15 | }, 16 | "venvPath": ".", 17 | "venv": ".venv", 18 | "reportMissingImports": true, 19 | "reportMissingTypeStubs": false, 20 | "pythonVersion": "3.10", 21 | "typeCheckingMode": "basic" 22 | } 23 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | # Exclude a variety of commonly ignored directories. 2 | exclude = [ 3 | ".bzr", 4 | ".direnv", 5 | ".eggs", 6 | ".git", 7 | ".git-rewrite", 8 | ".hg", 9 | ".ipynb_checkpoints", 10 | ".mypy_cache", 11 | ".nox", 12 | ".pants.d", 13 | ".pyenv", 14 | ".pytest_cache", 15 | ".pytype", 16 | ".ruff_cache", 17 | ".svn", 18 | ".tox", 19 | ".venv", 20 | ".vscode", 21 | "__pypackages__", 22 | "_build", 23 | "buck-out", 24 | "build", 25 | "dist", 26 | "node_modules", 27 | "site-packages", 28 | "venv", 29 | ] 30 | 31 | # Same as Black. 32 | line-length = 120 33 | indent-width = 4 34 | 35 | # Assume Python 3.10 36 | target-version = "py310" 37 | 38 | [lint] 39 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 40 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 41 | # McCabe complexity (`C901`) by default. 42 | select = ["E4", "E5", "E7", "E9", "F", "W", "UP", "I"] 43 | ignore = ["E501"] 44 | 45 | # Allow fix for all enabled rules (when `--fix`) is provided. 46 | fixable = ["ALL"] 47 | unfixable = [] 48 | 49 | # Allow unused variables when underscore-prefixed. 50 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 51 | 52 | pydocstyle.convention = "google" 53 | 54 | 55 | [format] 56 | # Like Black, use double quotes for strings. 57 | quote-style = "double" 58 | 59 | # Like Black, indent with spaces, rather than tabs. 60 | indent-style = "space" 61 | 62 | # Like Black, respect magic trailing commas. 63 | skip-magic-trailing-comma = false 64 | 65 | # Like Black, automatically detect the appropriate line ending. 66 | line-ending = "auto" 67 | 68 | # Enable auto-formatting of code examples in docstrings. Markdown, 69 | # reStructuredText code/literal blocks and doctests are all supported. 70 | # 71 | # This is currently disabled by default, but it is planned for this 72 | # to be opt-out in the future. 73 | docstring-code-format = true 74 | 75 | # Set the line length limit used when formatting code snippets in 76 | # docstrings. 
77 | # 78 | # This only has an effect when the `docstring-code-format` setting is 79 | # enabled. 80 | docstring-code-line-length = "dynamic" 81 | 82 | [lint.isort] 83 | combine-as-imports = true 84 | -------------------------------------------------------------------------------- /src/par_scrape/__init__.py: -------------------------------------------------------------------------------- 1 | """PAR Scrape - A versatile web scraping tool.""" 2 | 3 | from __future__ import annotations 4 | 5 | import os 6 | 7 | __author__ = "Paul Robello" 8 | __copyright__ = "Copyright 2024, Paul Robello" 9 | __credits__ = ["Paul Robello"] 10 | __maintainer__ = "Paul Robello" 11 | __email__ = "probello@gmail.com" 12 | __version__ = "0.7.0" 13 | __licence__ = "MIT" 14 | __application_title__ = "PAR Scrape" 15 | __application_binary__ = "par_scrape" 16 | 17 | os.environ["USER_AGENT"] = f"{__application_title__} {__version__}" 18 | 19 | __all__: list[str] = [ 20 | "__author__", 21 | "__copyright__", 22 | "__credits__", 23 | "__maintainer__", 24 | "__email__", 25 | "__version__", 26 | "__licence__", 27 | "__application_title__", 28 | "__application_binary__", 29 | ] 30 | -------------------------------------------------------------------------------- /src/par_scrape/__main__.py: -------------------------------------------------------------------------------- 1 | """Main entry point for par_scrape.""" 2 | 3 | import os 4 | import shutil 5 | import sqlite3 6 | import time 7 | from contextlib import nullcontext 8 | from datetime import datetime 9 | from pathlib import Path 10 | from typing import Annotated 11 | from urllib.parse import urlparse 12 | from uuid import uuid4 13 | 14 | import typer 15 | from dotenv import load_dotenv 16 | from par_ai_core.llm_config import LlmConfig, ReasoningEffort 17 | from par_ai_core.llm_providers import ( 18 | LlmProvider, 19 | provider_default_models, 20 | provider_env_key_names, 21 | ) 22 | from par_ai_core.output_utils import DisplayOutputFormat, display_formatted_output 23 | from par_ai_core.par_logging import console_out 24 | from par_ai_core.pricing_lookup import PricingDisplay, show_llm_cost 25 | from par_ai_core.provider_cb_info import get_parai_callback 26 | from par_ai_core.web_tools import ScraperChoice, ScraperWaitType, fetch_url, html_to_markdown 27 | from rich.panel import Panel 28 | from rich.text import Text 29 | 30 | from par_scrape import __application_title__, __version__ 31 | from par_scrape.crawl import ( 32 | DB_PATH, 33 | CrawlType, 34 | ErrorType, 35 | PageStatus, 36 | add_to_queue, 37 | clean_url_of_ticket_id, 38 | extract_links, 39 | get_next_urls, 40 | get_queue_stats, 41 | get_url_output_folder, 42 | init_db, 43 | mark_complete, 44 | mark_error, 45 | set_crawl_delay, 46 | ) 47 | from par_scrape.enums import CleanupType, OutputFormat 48 | from par_scrape.scrape_data import ( 49 | create_container_model, 50 | create_dynamic_model, 51 | format_data, 52 | save_formatted_data, 53 | save_raw_data, 54 | ) 55 | 56 | old_env_path = Path("~/.par-scrape.env").expanduser() 57 | new_env_path = Path("~/.par_scrape.env").expanduser() 58 | 59 | if old_env_path.exists(): 60 | if new_env_path.exists(): 61 | old_env_path.unlink() 62 | else: 63 | console_out.print(f"[bold yellow]Renaming {old_env_path} to {new_env_path}") 64 | old_env_path.rename(new_env_path) 65 | 66 | # Load the .env file from the project folder 67 | load_dotenv(dotenv_path=".env") 68 | # Load the new .env file from the users home folder 69 | load_dotenv(dotenv_path=new_env_path) 70 | 71 | # Initialize 
Typer app 72 | app = typer.Typer(help="Web scraping tool with options for Selenium or Playwright") 73 | 74 | 75 | def version_callback(value: bool) -> None: 76 | """Print version and exit.""" 77 | if value: 78 | print(f"{__application_title__}: {__version__}") 79 | raise typer.Exit() 80 | 81 | 82 | @app.command() 83 | def main( 84 | url: Annotated[str, typer.Option("--url", "-u", help="URL to scrape")] = "https://openai.com/api/pricing/", 85 | output_format: Annotated[ 86 | list[OutputFormat], 87 | typer.Option("--output-format", "-O", help="Output format for the scraped data"), 88 | ] = [OutputFormat.MARKDOWN], 89 | fields: Annotated[ 90 | list[str], 91 | typer.Option("--fields", "-f", help="Fields to extract from the webpage"), 92 | ] = ["Model", "Pricing Input", "Pricing Output", "Cache Price"], 93 | scraper: Annotated[ 94 | ScraperChoice, 95 | typer.Option( 96 | "--scraper", 97 | "-s", 98 | help="Scraper to use: 'selenium' or 'playwright'", 99 | case_sensitive=False, 100 | ), 101 | ] = ScraperChoice.PLAYWRIGHT, 102 | scrape_retries: Annotated[ 103 | int, 104 | typer.Option("--retries", "-r", help="Retry attempts for failed scrapes"), 105 | ] = 3, 106 | scrape_max_parallel: Annotated[ 107 | int, 108 | typer.Option("--scrape-max-parallel", "-P", help="Max parallel fetch requests"), 109 | ] = 1, 110 | wait_type: Annotated[ 111 | ScraperWaitType, 112 | typer.Option( 113 | "--wait-type", 114 | "-w", 115 | help="Method to use for page content load waiting", 116 | case_sensitive=False, 117 | ), 118 | ] = ScraperWaitType.SLEEP, 119 | wait_selector: Annotated[ 120 | str | None, 121 | typer.Option( 122 | "--wait-selector", 123 | "-i", 124 | help="Selector or text to use for page content load waiting.", 125 | ), 126 | ] = None, 127 | headless: Annotated[ 128 | bool, 129 | typer.Option("--headless", "-h", help="Run in headless mode (for Selenium)"), 130 | ] = False, 131 | sleep_time: Annotated[ 132 | int, 133 | typer.Option("--sleep-time", "-t", help="Time to sleep before scrolling (in seconds)"), 134 | ] = 2, 135 | ai_provider: Annotated[ 136 | LlmProvider, 137 | typer.Option("--ai-provider", "-a", help="AI provider to use for processing"), 138 | ] = LlmProvider.OPENAI, 139 | model: Annotated[ 140 | str | None, 141 | typer.Option( 142 | "--model", 143 | "-m", 144 | help="AI model to use for processing. 
If not specified, a default model will be used.", 145 | ), 146 | ] = None, 147 | ai_base_url: Annotated[ 148 | str | None, 149 | typer.Option( 150 | "--ai-base-url", 151 | "-b", 152 | help="Override the base URL for the AI provider.", 153 | ), 154 | ] = None, 155 | prompt_cache: Annotated[ 156 | bool, 157 | typer.Option("--prompt-cache", help="Enable prompt cache for Anthropic provider"), 158 | ] = False, 159 | reasoning_effort: Annotated[ 160 | ReasoningEffort | None, 161 | typer.Option( 162 | "--reasoning-effort", 163 | help="Reasoning effort level to use for o1 and o3 models.", 164 | ), 165 | ] = None, 166 | reasoning_budget: Annotated[ 167 | int | None, 168 | typer.Option( 169 | "--reasoning-budget", 170 | help="Maximum context size for reasoning.", 171 | ), 172 | ] = None, 173 | display_output: Annotated[ 174 | DisplayOutputFormat | None, 175 | typer.Option( 176 | "--display-output", 177 | "-d", 178 | help="Display output in terminal (md, csv, or json)", 179 | ), 180 | ] = None, 181 | output_folder: Annotated[ 182 | Path, 183 | typer.Option("--output-folder", "-o", help="Specify the location of the output folder"), 184 | ] = Path("./output"), 185 | silent: Annotated[ 186 | bool, 187 | typer.Option("--silent", "-q", help="Run in silent mode, suppressing output"), 188 | ] = False, 189 | run_name: Annotated[ 190 | str, 191 | typer.Option( 192 | "--run-name", 193 | "-n", 194 | help="Specify a name for this run. Can be used to resume a crawl Defaults to YYYYmmdd_HHMMSS", 195 | ), 196 | ] = "", 197 | pricing: Annotated[ 198 | PricingDisplay, 199 | typer.Option("--pricing", "-p", help="Enable pricing summary display"), 200 | ] = PricingDisplay.DETAILS, 201 | cleanup: Annotated[ 202 | CleanupType, 203 | typer.Option("--cleanup", "-c", help="How to handle cleanup of output folder"), 204 | ] = CleanupType.NONE, 205 | extraction_prompt: Annotated[ 206 | Path | None, 207 | typer.Option("--extraction-prompt", "-e", help="Path to the extraction prompt file"), 208 | ] = None, 209 | crawl_type: Annotated[ 210 | CrawlType, 211 | typer.Option( 212 | "--crawl-type", 213 | "-C", 214 | help="Enable crawling mode", 215 | case_sensitive=False, 216 | ), 217 | ] = CrawlType.SINGLE_PAGE, 218 | crawl_max_pages: Annotated[ 219 | int, 220 | typer.Option("--crawl-max-pages", "-M", help="Maximum number of pages to crawl this session"), 221 | ] = 100, 222 | crawl_batch_size: Annotated[ 223 | int, 224 | typer.Option("--crawl-batch-size", "-B", help="Maximum number of pages to load from the queue at once"), 225 | ] = 1, 226 | respect_rate_limits: Annotated[ 227 | bool, 228 | typer.Option("--respect-rate-limits", help="Whether to use domain-specific rate limiting"), 229 | ] = True, 230 | respect_robots: Annotated[ 231 | bool, 232 | typer.Option("--respect-robots", help="Whether to respect robots.txt"), 233 | ] = False, 234 | crawl_delay: Annotated[ 235 | int, 236 | typer.Option("--crawl-delay", help="Default delay in seconds between requests to the same domain"), 237 | ] = 1, 238 | version: Annotated[ # pylint: disable=unused-argument 239 | bool | None, 240 | typer.Option("--version", "-v", callback=version_callback, is_eager=True), 241 | ] = None, 242 | ): 243 | """ 244 | Scrape and optionally crawl / extract data from a website. 245 | 246 | AI is only used if an output format other than md is specified. 247 | 248 | Crawl types: 249 | 250 | - single_page: Only scrape the specified URL. 251 | 252 | - single_level: Scrape the specified URL and all links on that page that are have the same top level domain. 
253 | 254 | - domain: Scrape the specified URL and all links and their pages on that page that are have the same domain. 255 | """ 256 | 257 | if display_output and display_output not in output_format: 258 | console_out.print( 259 | f"[bold red]Display output format '{display_output}' is not in the specified output formats.[/bold red]" 260 | ) 261 | raise typer.Exit(1) 262 | 263 | outputs_needing_llm = [OutputFormat.JSON, OutputFormat.CSV, OutputFormat.EXCEL] 264 | llm_needed = any(format in output_format for format in outputs_needing_llm) 265 | if llm_needed: 266 | if not model: 267 | model = provider_default_models[ai_provider] 268 | 269 | if ai_provider not in [LlmProvider.OLLAMA, LlmProvider.BEDROCK, LlmProvider.LITELLM]: 270 | key_name = provider_env_key_names[ai_provider] 271 | if not os.environ.get(key_name): 272 | console_out.print(f"[bold red]{key_name} environment variable not set. Exiting...[/bold red]") 273 | raise typer.Exit(1) 274 | 275 | if prompt_cache and ai_provider != LlmProvider.ANTHROPIC: 276 | console_out.print( 277 | "[bold red]Prompt cache flag is only available for Anthropic provider. Exiting...[/bold red]" 278 | ) 279 | raise typer.Exit(1) 280 | 281 | console_out.print("[bold cyan]Creating llm config and dynamic models...") 282 | llm_config = LlmConfig( 283 | provider=ai_provider, 284 | model_name=model, 285 | temperature=0, 286 | base_url=ai_base_url, 287 | reasoning_effort=reasoning_effort, 288 | reasoning_budget=reasoning_budget, 289 | ) 290 | dynamic_extraction_model = create_dynamic_model(fields) 291 | dynamic_model_container = create_container_model(dynamic_extraction_model) 292 | 293 | console_out.print( 294 | Panel.fit( 295 | Text.assemble( 296 | ("AI Provider: ", "cyan"), 297 | (f"{ai_provider.value}", "green"), 298 | "\n", 299 | ("Model: ", "cyan"), 300 | (f"{model}", "green"), 301 | "\n", 302 | ("AI Provider Base URL: ", "cyan"), 303 | (f"{ai_base_url or 'default'}", "green"), 304 | "\n", 305 | ("Prompt Cache: ", "cyan"), 306 | (f"{prompt_cache}", "green"), 307 | "\n", 308 | ("Fields to extract: ", "cyan"), 309 | (", ".join(fields), "green"), 310 | "\n", 311 | ("Pricing Display: ", "cyan"), 312 | (f"{pricing.value}", "green"), 313 | ), 314 | title="[bold]AI Configuration", 315 | border_style="bold", 316 | ) 317 | ) 318 | else: 319 | llm_config = None 320 | dynamic_model_container = None 321 | 322 | # Generate run_name if not provided 323 | if not run_name: 324 | run_name = datetime.now().strftime("%Y%m%d_%H%M%S") 325 | else: 326 | # Ensure run_name is filesystem-friendly 327 | run_name = "".join(c for c in run_name if c.isalnum() or c in ("-", "_")) 328 | if not run_name: 329 | run_name = str(uuid4()) 330 | 331 | url = url.rstrip("/") 332 | console_out.print( 333 | Panel.fit( 334 | Text.assemble( 335 | ("Primary URL: ", "cyan"), 336 | (f"{url}", "green"), 337 | "\n", 338 | ("Scraper: ", "cyan"), 339 | (f"{scraper}", "green"), 340 | "\n", 341 | ("Scrape Max Parallel: ", "cyan"), 342 | (f"{scrape_max_parallel}", "green"), 343 | "\n", 344 | ("Retries: ", "cyan"), 345 | ( 346 | f"{scrape_retries}", 347 | "green", 348 | ), 349 | "\n", 350 | ("Crawl Type: ", "cyan"), 351 | (f"{crawl_type.value}", "green"), 352 | "\n", 353 | ("Crawl Batch Size: ", "cyan"), 354 | (f"{crawl_batch_size}", "green"), 355 | "\n", 356 | ("Respect Rate Limits: ", "cyan"), 357 | (f"{respect_rate_limits}", "green"), 358 | "\n", 359 | ("Default Crawl Delay: ", "cyan"), 360 | (f"{crawl_delay} seconds", "green"), 361 | "\n", 362 | ("Output Format: ", "cyan"), 363 | (", 
".join([f"{format.value}" for format in output_format]), "green"), 364 | "\n", 365 | ("Max Pages: ", "cyan"), 366 | (f"{crawl_max_pages}", "green"), 367 | "\n", 368 | ("Headless: ", "cyan"), 369 | (f"{headless}", "green"), 370 | "\n", 371 | ("Wait Type: ", "cyan"), 372 | (f"{wait_type.value}", "green"), 373 | "\n", 374 | ("Wait Selector: ", "cyan"), 375 | ( 376 | f"{wait_selector if wait_type in (ScraperWaitType.SELECTOR, ScraperWaitType.TEXT) else 'N/A'}", 377 | "green", 378 | ), 379 | "\n", 380 | ("Sleep Time: ", "cyan"), 381 | ( 382 | f"{sleep_time} seconds", 383 | "green", 384 | ), 385 | "\n", 386 | ("Display output: ", "cyan"), 387 | (f"{display_output or 'None'}", "green"), 388 | "\n", 389 | ("Silent mode: ", "cyan"), 390 | (f"{silent}", "green"), 391 | "\n", 392 | ("Cleanup: ", "cyan"), 393 | (f"{cleanup}", "green"), 394 | ), 395 | title="[bold]Scraping Configuration", 396 | border_style="bold", 397 | ) 398 | ) 399 | 400 | with console_out.capture() if silent else nullcontext(): 401 | if cleanup in [CleanupType.BEFORE, CleanupType.BOTH]: 402 | if os.path.exists(output_folder): 403 | shutil.rmtree(output_folder) 404 | console_out.print(f"[bold green]Removed existing output folder: {output_folder}[/bold green]") 405 | try: 406 | init_db() 407 | add_to_queue(run_name, [url]) 408 | 409 | with get_parai_callback(show_pricing=pricing if llm_needed else PricingDisplay.NONE) as cb: 410 | with console_out.status("[bold green]Starting fetch loop...") as status: 411 | start_time = time.time() 412 | num_pages: int = 0 413 | base_output_folder = Path("./output") 414 | # Set initial crawl delay for all domains 415 | if respect_rate_limits and crawl_delay > 1: 416 | with sqlite3.connect(DB_PATH) as conn: 417 | conn.execute("UPDATE domain_rate_limit SET crawl_delay = ?", (crawl_delay,)) 418 | 419 | while num_pages < crawl_max_pages: 420 | # Get queue statistics 421 | queue_stats = get_queue_stats(run_name) 422 | queued = queue_stats.get(PageStatus.QUEUED.value, 0) 423 | completed = queue_stats.get(PageStatus.COMPLETED.value, 0) 424 | errors = queue_stats.get(PageStatus.ERROR.value, 0) 425 | active = queue_stats.get(PageStatus.ACTIVE.value, 0) 426 | 427 | status.update( 428 | f"[bold cyan]Queue status: " 429 | f"[yellow]{queued}[/yellow] queued, " 430 | f"[green]{completed}[/green] completed, " 431 | f"[red]{errors}[/red] errors, " 432 | f"[blue]{active}[/blue] active" 433 | ) 434 | 435 | urls = get_next_urls( 436 | run_name, crawl_batch_size, scrape_retries, respect_rate_limits=respect_rate_limits 437 | ) 438 | 439 | if not urls: 440 | # Check if there are any active URLs that might complete 441 | if active > 0: 442 | console_out.print(f"[yellow]Waiting for {active} active URLs to complete...[/yellow]") 443 | time.sleep(2) # Give a small delay to avoid tight loop 444 | continue 445 | else: 446 | break 447 | num_pages += len(urls) 448 | 449 | try: 450 | raw_htmls = fetch_url( 451 | urls, 452 | fetch_using=scraper.value, 453 | max_parallel=scrape_max_parallel, 454 | sleep_time=sleep_time, 455 | wait_type=wait_type, 456 | wait_selector=wait_selector, 457 | headless=headless, 458 | verbose=True, 459 | console=console_out, 460 | ) 461 | if not raw_htmls: 462 | raise ValueError("No data was fetched") 463 | 464 | if len(raw_htmls) != len(urls): 465 | raise ValueError(f"Mismatch between URLs {len(urls)} and fetched data {len(raw_htmls)}") 466 | url_data = zip(urls, raw_htmls) 467 | for current_url, raw_html in url_data: 468 | try: 469 | console_out.print(f"[green]{current_url}") 470 | 471 | # Use an even 
more aggressive approach to avoid nesting 472 | # 1. Completely clean the URL of any run_name occurrences 473 | cleaned_url = clean_url_of_ticket_id(current_url, run_name) 474 | 475 | url_output_folder = get_url_output_folder(base_output_folder, run_name, cleaned_url) 476 | 477 | # 2. Print for debugging 478 | console_out.print(f"[blue]Output folder: {url_output_folder}[/blue]") 479 | # Create necessary directories 480 | if llm_needed: 481 | url_output_folder.mkdir(parents=True, exist_ok=True) 482 | else: 483 | url_output_folder.parent.mkdir(parents=True, exist_ok=True) 484 | # console_out.print(f"[green]{url_output_folder}") 485 | 486 | if not raw_html: 487 | raise ValueError("No data was fetched") 488 | 489 | # console_out.print(f"cu:{current_url} -- u:{url}") 490 | 491 | if ( 492 | crawl_type == CrawlType.SINGLE_LEVEL and current_url == url 493 | ) or crawl_type == CrawlType.DOMAIN: 494 | # Extract links, respecting robots.txt 495 | page_links = extract_links( 496 | current_url, 497 | raw_html, 498 | crawl_type, 499 | respect_robots=respect_robots, 500 | console=console_out, 501 | ticket_id=run_name, 502 | ) 503 | 504 | # Calculate the current page depth 505 | current_depth = 0 506 | with sqlite3.connect(DB_PATH) as conn: 507 | row = conn.execute( 508 | "SELECT depth FROM scrape WHERE ticket_id = ? AND url = ?", 509 | (run_name, current_url), 510 | ).fetchone() 511 | if row: 512 | current_depth = row[0] 513 | 514 | # Add extracted links to queue with incremented depth 515 | if page_links: 516 | console_out.print(f"[cyan]Found {len(page_links)} links on {current_url}") 517 | add_to_queue(run_name, page_links, current_depth + 1) 518 | # break 519 | status.update("[bold cyan]Converting HTML to Markdown...") 520 | markdown = html_to_markdown(raw_html, url=current_url, include_images=True) 521 | if not markdown: 522 | raise ValueError("Markdown data is empty") 523 | 524 | # Save raw data 525 | status.update("[bold cyan]Saving raw data...") 526 | raw_output_path = save_raw_data(markdown, url_output_folder) 527 | 528 | if "Application error" in markdown: 529 | raise ValueError("Application error encountered.") 530 | 531 | if llm_needed: 532 | status.update("[bold cyan]Extracting data with LLM...") 533 | assert dynamic_model_container and llm_config 534 | formatted_data = format_data( 535 | data=markdown, 536 | dynamic_listings_container=dynamic_model_container, 537 | llm_config=llm_config, 538 | prompt_cache=prompt_cache, 539 | extraction_prompt=extraction_prompt, 540 | ) 541 | if not formatted_data: 542 | raise ValueError("No data was found by the LLM.") 543 | 544 | # Save formatted data 545 | status.update("[bold cyan]Saving extracted data...") 546 | _, file_paths = save_formatted_data( 547 | formatted_data=formatted_data, 548 | run_name=run_name, 549 | output_folder=url_output_folder, 550 | output_formats=output_format, 551 | ) 552 | else: 553 | file_paths = {} 554 | if OutputFormat.MARKDOWN not in file_paths: 555 | file_paths[OutputFormat.MARKDOWN] = raw_output_path 556 | 557 | mark_complete( 558 | run_name, 559 | current_url, 560 | raw_file_path=raw_output_path, 561 | file_paths=file_paths, 562 | ) 563 | 564 | # Display output if requested 565 | if display_output: 566 | if display_output.value in file_paths: 567 | content = file_paths[display_output.value].read_text() 568 | display_formatted_output(content, display_output, console_out) 569 | else: 570 | console_out.print( 571 | f"[bold red]Invalid output type: {display_output.value}[/bold red]" 572 | ) 573 | if llm_needed: 574 | 
console_out.print("Current session price:") 575 | show_llm_cost( 576 | cb.usage_metadata, show_pricing=PricingDisplay.PRICE, console=console_out 577 | ) 578 | 579 | console_out.print( 580 | Panel.fit( 581 | "\n".join( 582 | set([str(p) for p in file_paths.values()] + [str(raw_output_path)]) 583 | ), 584 | title="Files", 585 | ) 586 | ) 587 | except Exception as e: 588 | # Classify error type 589 | error_type = ErrorType.OTHER 590 | error_msg = str(e) 591 | 592 | if "timeout" in error_msg.lower() or "timed out" in error_msg.lower(): 593 | error_type = ErrorType.TIMEOUT 594 | elif "network" in error_msg.lower() or "connection" in error_msg.lower(): 595 | error_type = ErrorType.NETWORK 596 | elif "robots.txt" in error_msg.lower() or "disallowed" in error_msg.lower(): 597 | error_type = ErrorType.ROBOTS_DISALLOWED 598 | elif "html" in error_msg.lower() or "parse" in error_msg.lower(): 599 | error_type = ErrorType.PARSING 600 | elif "url" in error_msg.lower() or "scheme" in error_msg.lower(): 601 | error_type = ErrorType.INVALID_URL 602 | 603 | mark_error(run_name, current_url, error_msg, error_type) 604 | console_out.print( 605 | f"[bold red]URL processing error ([yellow]{error_type.value}[/yellow]):[/bold red][blue]{current_url}[/blue] {error_msg}" 606 | ) 607 | 608 | # Adjust rate limits on network errors 609 | if error_type == ErrorType.NETWORK or error_type == ErrorType.TIMEOUT: 610 | domain = urlparse(current_url).netloc 611 | current_delay = 1 612 | with sqlite3.connect(DB_PATH) as conn: 613 | row = conn.execute( 614 | "SELECT crawl_delay FROM domain_rate_limit WHERE domain = ?", (domain,) 615 | ).fetchone() 616 | if row: 617 | current_delay = row[0] 618 | 619 | # Increase delay for this domain (max 30 seconds) 620 | new_delay = min(current_delay * 2, 30) 621 | set_crawl_delay(domain, new_delay) 622 | console_out.print( 623 | f"[yellow]Increased rate limit for {domain} to {new_delay} seconds[/yellow]" 624 | ) 625 | except Exception as e: 626 | # Determine error type 627 | error_type = ErrorType.OTHER 628 | error_msg = str(e) 629 | 630 | if "timeout" in error_msg.lower() or "timed out" in error_msg.lower(): 631 | error_type = ErrorType.TIMEOUT 632 | elif "network" in error_msg.lower() or "connection" in error_msg.lower(): 633 | error_type = ErrorType.NETWORK 634 | 635 | for current_url in urls: 636 | mark_error(run_name, current_url, error_msg, error_type) 637 | 638 | console_out.print( 639 | f"[bold red]A fetch error occurred ([yellow]{error_type.value}[/yellow]):[/bold red] {error_msg}" 640 | ) 641 | # end while num_pages < crawl_max_pages 642 | duration = time.time() - start_time 643 | console_out.print( 644 | Panel.fit( 645 | f"Pages {num_pages} in {duration:.1f} seconds. {num_pages / duration:.1f} pages per second." 
646 | ) 647 | ) 648 | if llm_needed: 649 | console_out.print("Grand total:") 650 | 651 | # end queue_status 652 | # end get_parai_callback 653 | except Exception as e: 654 | console_out.print(f"[bold red]A general error occurred:[/bold red] {str(e)}") 655 | finally: 656 | if cleanup in [CleanupType.BOTH, CleanupType.AFTER]: 657 | with console_out.status("[bold yellow]Cleaning up..."): 658 | if os.path.exists(output_folder): 659 | shutil.rmtree(output_folder) 660 | console_out.print( 661 | f"[bold green]Removed output folder and its contents: {output_folder}[/bold green]" 662 | ) 663 | 664 | 665 | if __name__ == "__main__": 666 | app() 667 | -------------------------------------------------------------------------------- /src/par_scrape/crawl.py: -------------------------------------------------------------------------------- 1 | """Web crawling functionality for par_scrape.""" 2 | 3 | import sqlite3 4 | import time 5 | import urllib.robotparser 6 | from collections.abc import Iterable 7 | from enum import Enum 8 | from pathlib import Path 9 | from urllib.parse import urljoin, urlparse 10 | 11 | from bs4 import BeautifulSoup 12 | from par_ai_core.web_tools import normalize_url 13 | from rich.console import Console 14 | 15 | from par_scrape.enums import OutputFormat 16 | 17 | 18 | def clean_url_of_ticket_id(url: str, ticket_id: str) -> str: 19 | """ 20 | Clean a URL of any occurrences of the ticket_id to prevent nesting issues. 21 | 22 | Args: 23 | url: The URL to clean 24 | ticket_id: The ticket_id to remove from the URL 25 | 26 | Returns: 27 | str: The cleaned URL 28 | """ 29 | # Skip if URL is not valid 30 | if not is_valid_url(url): 31 | return url 32 | 33 | # Parse the URL 34 | parsed = urlparse(url) 35 | 36 | # Clean the path of ticket_id - aggressively remove ALL instances 37 | path_parts = parsed.path.split("/") 38 | cleaned_parts = [] 39 | 40 | for part in path_parts: 41 | # Skip empty parts and parts that match ticket_id 42 | if part != "" and part != ticket_id: 43 | cleaned_parts.append(part) 44 | 45 | # Rebuild path with cleaned parts 46 | cleaned_path = "/" + "/".join(cleaned_parts) 47 | 48 | # Also clean query parameters if they contain the ticket_id 49 | query = parsed.query 50 | if ticket_id in query: 51 | query_pairs = query.split("&") 52 | cleaned_query_pairs = [] 53 | 54 | for pair in query_pairs: 55 | if ticket_id not in pair: 56 | cleaned_query_pairs.append(pair) 57 | 58 | query = "&".join(cleaned_query_pairs) 59 | 60 | # Rebuild the URL with cleaned path and query 61 | cleaned_parsed = parsed._replace(path=cleaned_path, query=query) 62 | cleaned_url = cleaned_parsed.geturl() 63 | 64 | return cleaned_url 65 | 66 | 67 | # from tldextract import tldextract 68 | 69 | BASE_PATH = Path("~/.par_scrape").expanduser() 70 | # BASE_PATH = Path(__file__).parent # debug path 71 | DB_PATH = BASE_PATH / "jobs.sqlite" 72 | # PAGES_BASE = BASE_PATH / "pages" 73 | 74 | # Global dictionary to store robots.txt parsers by domain 75 | ROBOTS_PARSERS: dict[str, urllib.robotparser.RobotFileParser] = {} 76 | # Set of excluded URL patterns (common non-content URLs) 77 | EXCLUDED_URL_PATTERNS = { 78 | "/login", 79 | "/logout", 80 | "/signin", 81 | "/signout", 82 | "/register", 83 | "/password", 84 | "/cart", 85 | "/checkout", 86 | "/search", 87 | "/cdn-cgi/", 88 | "/wp-admin/", 89 | "/wp-login.php", 90 | "/favicon.ico", 91 | "/sitemap.xml", 92 | "/robots.txt", 93 | "/feed", 94 | "/rss", 95 | "/comments", 96 | } 97 | # Default user agent for robots.txt 98 | DEFAULT_USER_AGENT = "par-scrape/1.0 
(+https://github.com/paulrobello/par_scrape)" 99 | 100 | 101 | class CrawlType(str, Enum): 102 | """Types of web crawling strategies.""" 103 | 104 | SINGLE_PAGE = "single_page" 105 | SINGLE_LEVEL = "single_level" 106 | DOMAIN = "domain" 107 | # PAGINATED = "paginated" 108 | 109 | 110 | class PageStatus(str, Enum): 111 | """Status flags for pages in the crawl queue.""" 112 | 113 | QUEUED = "queued" 114 | ACTIVE = "active" 115 | COMPLETED = "completed" 116 | ERROR = "error" 117 | 118 | 119 | class ErrorType(str, Enum): 120 | """Types of errors that can occur during crawling.""" 121 | 122 | NETWORK = "network" 123 | PARSING = "parsing" 124 | ROBOTS_DISALLOWED = "robots_disallowed" 125 | INVALID_URL = "invalid_url" 126 | TIMEOUT = "timeout" 127 | OTHER = "other" 128 | 129 | 130 | def is_valid_url(url: str) -> bool: 131 | """ 132 | Validate if a URL is properly formatted and has a supported scheme. 133 | 134 | Args: 135 | url: The URL to validate 136 | 137 | Returns: 138 | bool: True if the URL is valid, False otherwise 139 | """ 140 | try: 141 | parsed = urlparse(url) 142 | return all([parsed.scheme in ("http", "https"), parsed.netloc]) 143 | except Exception: 144 | return False 145 | 146 | 147 | def get_url_output_folder(output_path: Path, ticket_id: str, url: str) -> Path: 148 | """ 149 | Get storage folder based on URL and ticket_id. 150 | 151 | Args: 152 | output_path: Base path for output files 153 | ticket_id: Unique identifier for the crawl job 154 | url: The URL being processed 155 | 156 | Returns: 157 | Path: The folder path where output for this URL should be stored 158 | """ 159 | # 1. Start with an absolute base folder - always use "./output" 160 | base_folder = output_path 161 | 162 | # 2. Add ticket_id once and only once 163 | run_folder = base_folder / ticket_id 164 | 165 | # 3. Parse the URL without any ticket_id contamination 166 | parsed_url = urlparse(url) 167 | domain = parsed_url.netloc.split(":")[0] # Remove port if present 168 | 169 | # 4. Get path components and aggressively filter out ticket_id 170 | raw_path = parsed_url.path.strip("/") 171 | 172 | # 5. If there's no path, just use the domain 173 | if not raw_path: 174 | return run_folder / domain 175 | 176 | # 6. Create a sanitized path by removing any ticket_id occurrences 177 | # and converting slashes to double underscores 178 | path_parts = raw_path.split("/") 179 | clean_parts = [] 180 | 181 | for part in path_parts: 182 | if part != ticket_id and part != "": 183 | clean_parts.append(part) 184 | 185 | sanitized_path = "__".join(clean_parts) 186 | 187 | # 7. Final path: ./output/ticket_id/domain/sanitized_path 188 | if sanitized_path: 189 | return run_folder / domain / sanitized_path 190 | else: 191 | return run_folder / domain 192 | 193 | 194 | def check_robots_txt(url: str, user_agent: str = DEFAULT_USER_AGENT) -> bool: 195 | """ 196 | Check if a URL is allowed by the site's robots.txt. 
197 | 198 | Args: 199 | url: The URL to check 200 | user_agent: User agent to use for robots.txt checking 201 | 202 | Returns: 203 | bool: True if the URL is allowed, False if disallowed 204 | """ 205 | try: 206 | parsed_url = urlparse(url) 207 | domain = parsed_url.netloc 208 | 209 | # Get or create a robot parser for this domain 210 | if domain not in ROBOTS_PARSERS: 211 | rp = urllib.robotparser.RobotFileParser() 212 | robots_url = f"{parsed_url.scheme}://{domain}/robots.txt" 213 | rp.set_url(robots_url) 214 | try: 215 | rp.read() 216 | ROBOTS_PARSERS[domain] = rp 217 | except Exception: 218 | # If we can't read robots.txt, assume everything is allowed 219 | return True 220 | 221 | # Check if URL is allowed 222 | return ROBOTS_PARSERS[domain].can_fetch(user_agent, url) 223 | except Exception: 224 | # On any failure, default to allowing the URL 225 | return True 226 | 227 | 228 | def should_exclude_url(url: str) -> bool: 229 | """ 230 | Check if a URL should be excluded based on common patterns. 231 | 232 | Args: 233 | url: The URL to check 234 | 235 | Returns: 236 | bool: True if the URL should be excluded, False otherwise 237 | """ 238 | parsed = urlparse(url) 239 | path = parsed.path.lower() 240 | 241 | # Check for file extensions that aren't likely to be content pages 242 | if path.endswith( 243 | (".jpg", ".jpeg", ".png", ".gif", ".pdf", ".zip", ".tar.gz", ".css", ".js", ".ico", ".xml", ".json") 244 | ): 245 | return True 246 | 247 | # Check for excluded patterns 248 | for pattern in EXCLUDED_URL_PATTERNS: 249 | if pattern in path: 250 | return True 251 | 252 | # URL seems fine 253 | return False 254 | 255 | 256 | def extract_links( 257 | base_url: str, 258 | html: str, 259 | crawl_type: CrawlType, 260 | respect_robots: bool = False, 261 | console: Console | None = None, 262 | ticket_id: str = "", 263 | ) -> list[str]: 264 | """ 265 | Extract links from HTML based on crawl type. 
266 | 267 | Args: 268 | base_url: The URL of the page being processed 269 | html: HTML content of the page 270 | crawl_type: Type of crawling to perform 271 | respect_robots: Whether to respect robots.txt 272 | console: Optional console for logging 273 | ticket_id: Optional ticket_id to clean from extracted URLs 274 | 275 | Returns: 276 | list[str]: List of normalized URLs to crawl next 277 | """ 278 | if crawl_type == CrawlType.SINGLE_PAGE: 279 | return [] 280 | 281 | try: 282 | soup = BeautifulSoup(html, "html.parser") 283 | links: set[str] = set() 284 | base_parsed = urlparse(base_url) 285 | 286 | # Find all link elements 287 | for link in soup.find_all("a", href=True): 288 | try: 289 | # We're using find_all with href=True, so we know href exists 290 | # Use type: ignore to bypass type checker for BeautifulSoup 291 | href = str(link["href"]) # type: ignore 292 | if not href or href.startswith(("javascript:", "mailto:", "tel:")): 293 | continue 294 | 295 | # Build absolute URL 296 | full_url = urljoin(base_url, href) 297 | 298 | # Validate the URL 299 | if not is_valid_url(full_url): 300 | continue 301 | 302 | parsed = urlparse(full_url) 303 | 304 | # Skip fragment-only URLs (same page anchors) 305 | if parsed.netloc == base_parsed.netloc and not parsed.path and parsed.fragment: 306 | continue 307 | 308 | # Apply crawl type filtering 309 | if ( 310 | crawl_type == CrawlType.SINGLE_LEVEL or crawl_type == CrawlType.DOMAIN 311 | ) and parsed.netloc == base_parsed.netloc: 312 | # Clean the URL of any ticket_id occurrences first to prevent nesting 313 | if ticket_id: 314 | full_url = clean_url_of_ticket_id(full_url, ticket_id) 315 | 316 | normalized_url = normalize_url(full_url) 317 | 318 | # Skip URLs that match common exclusion patterns 319 | if should_exclude_url(normalized_url): 320 | continue 321 | 322 | # Check robots.txt 323 | if respect_robots and not check_robots_txt(normalized_url): 324 | if console: 325 | console.print(f"[yellow]Skipping disallowed URL: {normalized_url}[/yellow]") 326 | continue 327 | 328 | links.add(normalized_url) 329 | # PAGINATED crawl type implementation would go here 330 | except Exception as e: 331 | if console: 332 | console.print(f"[red]Error processing link: {str(e)}[/red]") 333 | continue 334 | 335 | return list(links) 336 | except Exception as e: 337 | if console: 338 | console.print(f"[red]Error extracting links: {str(e)}[/red]") 339 | return [] 340 | 341 | 342 | def init_db() -> None: 343 | """ 344 | Initialize database with required tables. 345 | 346 | Creates the database if it doesn't exist and ensures the schema is up-to-date. 347 | Checks for version table and removes incompatible databases. 
348 | """ 349 | # Current database schema version 350 | CURRENT_DB_VERSION = 1 351 | 352 | DB_PATH.parent.mkdir(parents=True, exist_ok=True) 353 | 354 | # Check if database exists and if it has our version table 355 | if DB_PATH.exists(): 356 | try: 357 | with sqlite3.connect(DB_PATH) as conn: 358 | # Check if db_version table exists 359 | cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='db_version'") 360 | if not cursor.fetchone(): 361 | # No version table, remove the incompatible database 362 | conn.close() 363 | DB_PATH.unlink() 364 | print(f"Removed incompatible database at {DB_PATH}") 365 | except sqlite3.Error: 366 | # If any error occurs, assume the database is corrupted or incompatible 367 | DB_PATH.unlink() 368 | print(f"Removed corrupted database at {DB_PATH}") 369 | 370 | with sqlite3.connect(DB_PATH) as conn: 371 | # Enable foreign keys 372 | conn.execute("PRAGMA foreign_keys = ON") 373 | 374 | # Create version tracking table first 375 | conn.execute(""" 376 | CREATE TABLE IF NOT EXISTS db_version ( 377 | version INTEGER PRIMARY KEY, 378 | created_at INTEGER DEFAULT (strftime('%s','now')), 379 | description TEXT 380 | ) 381 | """) 382 | 383 | # Check current version 384 | cursor = conn.execute("SELECT version FROM db_version ORDER BY version DESC LIMIT 1") 385 | row = cursor.fetchone() 386 | db_version = row[0] if row else 0 387 | 388 | # If database is outdated, update schema as needed 389 | if db_version < CURRENT_DB_VERSION: 390 | # Create the main scrape table with enhanced fields 391 | conn.execute(""" 392 | CREATE TABLE IF NOT EXISTS scrape ( 393 | ticket_id TEXT, 394 | url TEXT, 395 | status TEXT CHECK(status IN ('queued', 'active', 'completed', 'error')) NOT NULL, 396 | error_type TEXT, 397 | error_msg TEXT, 398 | raw_file_path TEXT, 399 | md_file_path TEXT, 400 | json_file_path TEXT, 401 | csv_file_path TEXT, 402 | excel_file_path TEXT, 403 | scraped INTEGER, 404 | queued_at INTEGER DEFAULT (strftime('%s','now')), 405 | last_processed_at INTEGER, 406 | attempts INTEGER DEFAULT 0, 407 | cost FLOAT, 408 | domain TEXT, 409 | depth INTEGER DEFAULT 0, 410 | PRIMARY KEY (ticket_id, url) 411 | ) 412 | """) 413 | 414 | # Create domain rate limiting table 415 | conn.execute(""" 416 | CREATE TABLE IF NOT EXISTS domain_rate_limit ( 417 | domain TEXT PRIMARY KEY, 418 | last_access INTEGER, 419 | crawl_delay INTEGER DEFAULT 1 420 | ) 421 | """) 422 | 423 | # Create an index on status for faster querying 424 | conn.execute(""" 425 | CREATE INDEX IF NOT EXISTS idx_status ON scrape(status, ticket_id) 426 | """) 427 | 428 | # Create an index on domain for faster rate limit lookups 429 | conn.execute(""" 430 | CREATE INDEX IF NOT EXISTS idx_domain ON scrape(domain) 431 | """) 432 | 433 | # Update version information 434 | conn.execute( 435 | """ 436 | INSERT INTO db_version (version, description) 437 | VALUES (?, ?) 438 | """, 439 | (CURRENT_DB_VERSION, "Initial schema with scrape and domain_rate_limit tables"), 440 | ) 441 | 442 | 443 | def get_queue_stats(ticket_id: str) -> dict[str, int]: 444 | """ 445 | Get statistics about the queue for a ticket. 446 | 447 | Args: 448 | ticket_id: Unique identifier for the crawl job 449 | 450 | Returns: 451 | dict: Dictionary with counts of items in each status 452 | """ 453 | with sqlite3.connect(DB_PATH) as conn: 454 | stats = {} 455 | for status in PageStatus: 456 | row = conn.execute( 457 | """ 458 | SELECT COUNT(*) FROM scrape 459 | WHERE ticket_id = ? AND status = ? 
460 | """, 461 | (ticket_id, status.value), 462 | ).fetchone() 463 | stats[status.value] = row[0] if row else 0 464 | return stats 465 | 466 | 467 | def get_queue_size(ticket_id: str) -> int: 468 | """ 469 | Get the number of URLs in the queue for a ticket. 470 | 471 | Args: 472 | ticket_id: Unique identifier for the crawl job 473 | 474 | Returns: 475 | int: Number of URLs in queued status 476 | """ 477 | with sqlite3.connect(DB_PATH) as conn: 478 | row = conn.execute( 479 | """ 480 | SELECT COUNT(*) FROM scrape 481 | WHERE ticket_id = ? AND status = ? 482 | """, 483 | (ticket_id, PageStatus.QUEUED.value), 484 | ).fetchone() 485 | return row[0] if row else 0 486 | 487 | 488 | def add_to_queue(ticket_id: str, urls: Iterable[str], depth: int = 0) -> None: 489 | """ 490 | Add URLs to queue if they don't already exist. 491 | 492 | Args: 493 | ticket_id: Unique identifier for the crawl job 494 | urls: Collection of URLs to add to the queue 495 | depth: Crawl depth of these URLs (default: 0 for starting URLs) 496 | """ 497 | with sqlite3.connect(DB_PATH) as conn: 498 | for url in urls: 499 | # Skip invalid URLs 500 | if not is_valid_url(url): 501 | continue 502 | 503 | # Clean URL of any ticket_id occurrences to prevent nesting 504 | url = clean_url_of_ticket_id(url, ticket_id) 505 | 506 | # Normalize URL before adding 507 | url = normalize_url(url.rstrip("/")) 508 | parsed = urlparse(url) 509 | domain = parsed.netloc 510 | 511 | # Insert new URL or ignore if it exists 512 | conn.execute( 513 | """ 514 | INSERT OR IGNORE INTO scrape 515 | (ticket_id, url, status, domain, depth, queued_at) 516 | VALUES (?, ?, ?, ?, ?, strftime('%s','now')) 517 | """, 518 | (ticket_id, url, PageStatus.QUEUED.value, domain, depth), 519 | ) 520 | 521 | # Reset error status if re-adding 522 | conn.execute( 523 | """ 524 | UPDATE scrape 525 | SET status = ?, error_msg = NULL, error_type = NULL 526 | WHERE ticket_id = ? AND url = ? AND status = ? 527 | """, 528 | (PageStatus.QUEUED.value, ticket_id, url, PageStatus.ERROR.value), 529 | ) 530 | 531 | # Ensure domain exists in rate limit table 532 | conn.execute( 533 | """ 534 | INSERT OR IGNORE INTO domain_rate_limit (domain, last_access, crawl_delay) 535 | VALUES (?, 0, 1) 536 | """, 537 | (domain,), 538 | ) 539 | 540 | 541 | def get_next_urls( 542 | ticket_id: str, crawl_batch_size: int = 1, scrape_retries: int = 3, respect_rate_limits: bool = True 543 | ) -> list[str]: 544 | """ 545 | Get next batch of URLs to process from the queue, respecting rate limits. 546 | 547 | Args: 548 | ticket_id: Unique identifier for the crawl job 549 | crawl_batch_size: Maximum number of URLs to return 550 | scrape_retries: Maximum number of retry attempts for failed URLs 551 | respect_rate_limits: Whether to respect per-domain rate limits 552 | 553 | Returns: 554 | list[str]: List of URLs to process next 555 | """ 556 | current_time = int(time.time()) 557 | urls = [] 558 | domains_used = set() 559 | 560 | with sqlite3.connect(DB_PATH) as conn: 561 | # Query includes URLs from each domain respecting rate limits 562 | if respect_rate_limits: 563 | # First find eligible domains that respect rate limits 564 | rows = conn.execute( 565 | """ 566 | SELECT s.url, s.domain, d.last_access, d.crawl_delay 567 | FROM scrape s 568 | JOIN domain_rate_limit d ON s.domain = d.domain 569 | WHERE s.ticket_id = ? 570 | AND (s.status = ? OR (s.status = ? 
AND s.attempts < ?)) 571 | ORDER BY d.last_access ASC 572 | """, 573 | (ticket_id, PageStatus.QUEUED.value, PageStatus.ERROR.value, scrape_retries), 574 | ).fetchall() 575 | 576 | # Process each row, respecting rate limits 577 | for row in rows: 578 | url, domain, last_access, crawl_delay = row 579 | 580 | # Skip if we already have a URL from this domain in the batch 581 | if domain in domains_used: 582 | continue 583 | 584 | # Skip if rate limit not elapsed 585 | if last_access > 0 and current_time - last_access < crawl_delay: 586 | continue 587 | 588 | # Add URL to batch 589 | urls.append(url) 590 | domains_used.add(domain) 591 | 592 | # Update last access time for this domain 593 | conn.execute( 594 | """ 595 | UPDATE domain_rate_limit 596 | SET last_access = ? 597 | WHERE domain = ? 598 | """, 599 | (current_time, domain), 600 | ) 601 | 602 | # Stop if we have enough URLs 603 | if len(urls) >= crawl_batch_size: 604 | break 605 | else: 606 | # Simple version that doesn't respect rate limits 607 | rows = conn.execute( 608 | """ 609 | SELECT url FROM scrape 610 | WHERE ticket_id = ? AND (status = ? OR (status = ? AND attempts < ?)) 611 | LIMIT ? 612 | """, 613 | (ticket_id, PageStatus.QUEUED.value, PageStatus.ERROR.value, scrape_retries, crawl_batch_size), 614 | ).fetchall() 615 | urls = [row[0] for row in rows] 616 | 617 | # Mark selected URLs as active 618 | if urls: 619 | placeholders = ", ".join("?" for _ in urls) 620 | conn.execute( 621 | f""" 622 | UPDATE scrape 623 | SET status = ?, attempts = attempts + 1, last_processed_at = strftime('%s','now') 624 | WHERE ticket_id = ? AND url IN ({placeholders}) 625 | """, 626 | [PageStatus.ACTIVE.value, ticket_id] + urls, 627 | ) 628 | 629 | return urls 630 | 631 | 632 | def set_crawl_delay(domain: str, delay_seconds: int) -> None: 633 | """ 634 | Set the crawl delay for a specific domain. 635 | 636 | Args: 637 | domain: Domain to set rate limit for 638 | delay_seconds: Minimum seconds between requests to this domain 639 | """ 640 | with sqlite3.connect(DB_PATH) as conn: 641 | conn.execute( 642 | """ 643 | INSERT OR REPLACE INTO domain_rate_limit (domain, last_access, crawl_delay) 644 | VALUES (?, (SELECT last_access FROM domain_rate_limit WHERE domain = ?), ?) 645 | """, 646 | (domain, domain, delay_seconds), 647 | ) 648 | 649 | 650 | def mark_complete( 651 | ticket_id: str, url: str, *, raw_file_path: Path, file_paths: dict[OutputFormat, Path], cost: float = 0.0 652 | ) -> None: 653 | """ 654 | Mark URL as successfully scraped. 655 | 656 | Args: 657 | ticket_id: Unique identifier for the crawl job 658 | url: URL that was successfully processed 659 | raw_file_path: Path to the raw output file 660 | file_paths: Dictionary mapping output formats to file paths 661 | cost: Cost of processing this URL (if applicable) 662 | """ 663 | with sqlite3.connect(DB_PATH) as conn: 664 | conn.execute( 665 | """ 666 | UPDATE scrape 667 | SET status = ?, scraped = strftime('%s','now'), error_msg = null, error_type = null, 668 | raw_file_path = ?, md_file_path = ?, json_file_path = ?, csv_file_path = ?, excel_file_path = ?, 669 | cost = ?, last_processed_at = strftime('%s','now') 670 | WHERE ticket_id = ? AND url = ? 
671 | """, 672 | ( 673 | PageStatus.COMPLETED.value, 674 | str(raw_file_path), 675 | str(file_paths[OutputFormat.MARKDOWN]) if OutputFormat.MARKDOWN in file_paths else None, 676 | str(file_paths[OutputFormat.JSON]) if OutputFormat.JSON in file_paths else None, 677 | str(file_paths[OutputFormat.CSV]) if OutputFormat.CSV in file_paths else None, 678 | str(file_paths[OutputFormat.EXCEL]) if OutputFormat.EXCEL in file_paths else None, 679 | cost, 680 | ticket_id, 681 | url.rstrip("/"), 682 | ), 683 | ) 684 | 685 | 686 | def mark_error( 687 | ticket_id: str, url: str, error_msg: str, error_type: ErrorType = ErrorType.OTHER, cost: float = 0.0 688 | ) -> None: 689 | """ 690 | Mark URL as failed with error message and type. 691 | 692 | Args: 693 | ticket_id: Unique identifier for the crawl job 694 | url: URL that failed processing 695 | error_msg: Error message describing the failure 696 | error_type: Type of error that occurred 697 | cost: Cost of processing this URL (if applicable) 698 | """ 699 | with sqlite3.connect(DB_PATH) as conn: 700 | conn.execute( 701 | """ 702 | UPDATE scrape 703 | SET status = ?, error_msg = ?, error_type = ?, cost = ?, last_processed_at = strftime('%s','now') 704 | WHERE ticket_id = ? AND url = ? 705 | """, 706 | (PageStatus.ERROR.value, error_msg[:255], error_type.value, cost, ticket_id, url.rstrip("/")), 707 | ) 708 | -------------------------------------------------------------------------------- /src/par_scrape/enums.py: -------------------------------------------------------------------------------- 1 | """Enum for scraper choices.""" 2 | 3 | from strenum import StrEnum 4 | 5 | 6 | class CleanupType(StrEnum): 7 | """Enum for cleanup choices.""" 8 | 9 | NONE = "none" 10 | BEFORE = "before" 11 | AFTER = "after" 12 | BOTH = "both" 13 | 14 | 15 | class OutputFormat(StrEnum): 16 | """Enum for output formats.""" 17 | 18 | MARKDOWN = "md" 19 | JSON = "json" 20 | CSV = "csv" 21 | EXCEL = "excel" 22 | -------------------------------------------------------------------------------- /src/par_scrape/extraction_prompt.md: -------------------------------------------------------------------------------- 1 | ROLE: You are an intelligent text extraction and conversion assistant. 2 | TASK: Extract structured information from the user provided text into the format required to call DynamicListingsContainer. 3 | Ensure you include all data points in the output. 4 | If you encounter cases where you can't find the data for a specific field use an empty string "". 5 | You *MUST* call the `DynamicListingsContainer` function with the extracted data. 
6 | -------------------------------------------------------------------------------- /src/par_scrape/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paulrobello/par_scrape/56294b0f0e86434033fd0d1a1ae54900ae0f4585/src/par_scrape/py.typed -------------------------------------------------------------------------------- /src/par_scrape/scrape_data.py: -------------------------------------------------------------------------------- 1 | """Scrape data from Web.""" 2 | 3 | import json 4 | import os 5 | from pathlib import Path 6 | 7 | import pandas as pd 8 | from langchain_anthropic import ChatAnthropic 9 | from par_ai_core.llm_config import LlmConfig, llm_run_manager 10 | from par_ai_core.par_logging import console_out 11 | from pydantic import BaseModel, ConfigDict, create_model 12 | from rich.panel import Panel 13 | 14 | from par_scrape.enums import OutputFormat 15 | 16 | 17 | def save_raw_data(raw_data: str, output_base: Path) -> Path: 18 | """ 19 | Save raw data to a file. 20 | 21 | Args: 22 | raw_data (str): The raw data to save. 23 | output_base (str): The folder or base file_name to save the file in. Defaults to 'output'. 24 | 25 | Returns: 26 | Path: The path to the saved file. 27 | """ 28 | if output_base.is_dir(): 29 | # Use a simple filename without ticket_id since the path already has it 30 | raw_output_path = output_base / "raw_data.md" 31 | else: 32 | # For non-directory paths, just append -raw 33 | raw_output_path = Path(str(output_base) + "-raw.md") 34 | raw_output_path.write_text(raw_data) 35 | console_out.print(Panel(f"Raw data saved to [bold green]{raw_output_path}[/bold green]")) 36 | return raw_output_path 37 | 38 | 39 | def create_dynamic_model(field_names: list[str]) -> type[BaseModel]: 40 | """ 41 | Dynamically creates a Pydantic model based on provided fields. 42 | 43 | Args: 44 | field_names (List[str]): A list of names of the fields to extract from the markdown. 45 | 46 | Returns: 47 | Type[BaseModel]: A dynamically created Pydantic model. 48 | """ 49 | # Create field definitions using aliases for Field parameters 50 | field_definitions = {field: (str, ...) for field in field_names} 51 | # Dynamically create the model with all fields 52 | dynamic_listing_model = create_model( 53 | "DynamicListingModel", 54 | **field_definitions, # type: ignore 55 | ) # type: ignore 56 | dynamic_listing_model.model_config = ConfigDict(arbitrary_types_allowed=True) 57 | return dynamic_listing_model 58 | 59 | 60 | def create_container_model(dynamic_model: type[BaseModel]) -> type[BaseModel]: 61 | """ 62 | Create a container model that holds a list of the given listing model. 63 | 64 | Args: 65 | dynamic_model (Type[BaseModel]): The Pydantic model for individual listings. 66 | 67 | Returns: 68 | Type[BaseModel]: A container model for a list of listings. 69 | """ 70 | return create_model("DynamicListingsContainer", listings=(list[dynamic_model], ...)) 71 | 72 | 73 | # pylint: disable=too-many-positional-arguments 74 | def format_data( 75 | *, 76 | data: str, 77 | dynamic_listings_container: type[BaseModel], 78 | llm_config: LlmConfig, 79 | prompt_cache: bool = False, 80 | extraction_prompt: Path | None = None, 81 | ) -> BaseModel: 82 | """ 83 | Format data using the specified AI provider's API. 84 | 85 | Args: 86 | data (str): The input data to format. 87 | dynamic_listings_container (Type[BaseModel]): The Pydantic model to use for parsing. 88 | llm_config (LlmConfig): The configuration for the AI provider. 
89 | prompt_cache (bool): Whether to use prompt caching. 90 | extraction_prompt (Path): Path to the extraction prompt file. 91 | 92 | Returns: 93 | BaseModel: The Extracted data as a Pydantic model instance. 94 | """ 95 | if not extraction_prompt: 96 | extraction_prompt = Path(__file__).parent / "extraction_prompt.md" 97 | try: 98 | system_message = extraction_prompt.read_text(encoding="utf-8") 99 | except FileNotFoundError: 100 | console_out.print(f"[bold red]Extraction prompt file not found: {extraction_prompt}[/bold red]") 101 | raise 102 | 103 | user_message = f"Extract the following information from the provided text:\nPage content:\n\n{data}" 104 | 105 | try: 106 | chat_model = llm_config.build_chat_model() 107 | 108 | structure_model = chat_model.with_structured_output( 109 | dynamic_listings_container # , include_raw=True 110 | ) 111 | history = [ 112 | ("system", system_message), 113 | ( 114 | "user", 115 | [{"type": "text", "text": user_message}], 116 | ), 117 | ] 118 | 119 | if prompt_cache and isinstance(chat_model, ChatAnthropic): 120 | history[1][1][0]["cache_control"] = {"type": "ephemeral"} # type: ignore 121 | 122 | data = structure_model.invoke(history, config=llm_run_manager.get_runnable_config(chat_model.name)) # type: ignore 123 | if isinstance(data, BaseModel): 124 | return data 125 | console_out.print(data) 126 | raise ValueError("Error in API call. Did not return a Pydantic BaseModel") 127 | except Exception as e: # pylint: disable=broad-exception-caught 128 | console_out.print(f"[bold red]Error in API call or parsing response:[/bold red] {str(e)}") 129 | return dynamic_listings_container(listings=[]) 130 | 131 | 132 | def save_formatted_data( 133 | *, formatted_data: BaseModel, output_formats: list[OutputFormat], run_name: str, output_folder: Path 134 | ) -> tuple[pd.DataFrame | None, dict[OutputFormat, Path]]: 135 | """ 136 | Save Extracted data to JSON, Excel, CSV, and Markdown files. 137 | 138 | Note: run_name should only be used for logging/reference, not for directory creation 139 | since directories should already include run_name once via get_url_output_folder. 140 | 141 | Args: 142 | formatted_data (BaseModel): The Extracted data to save. 143 | output_formats (List[OutputFormat]): The desired output format. 144 | run_name (str): The run name used for logging purposes only. 145 | output_folder (Path): The folder to save the files in. 146 | 147 | Returns: 148 | Tuple[pd.DataFrame | None, Dict[OutputFormat, Path]]: The DataFrame created from the Extracted data and a dictionary of 149 | file paths, or None and an empty dict if an error occurred. 
150 | """ 151 | file_paths: dict[OutputFormat, Path] = {} 152 | # Ensure the output folder exists 153 | os.makedirs(output_folder, exist_ok=True) 154 | 155 | # Prepare Extracted data as a dictionary 156 | formatted_data_dict = formatted_data.model_dump() 157 | 158 | if OutputFormat.JSON in output_formats: 159 | # Save the Extracted data as JSON without adding run_name to the filename 160 | # as the run_name is already part of the folder structure 161 | json_output_path = output_folder / "extracted_data.json" 162 | json_output_path.write_text(json.dumps(formatted_data_dict, indent=4), encoding="utf-8") 163 | 164 | console_out.print(Panel(f"Extracted data saved to JSON at [bold green]{json_output_path}[/bold green]")) 165 | file_paths[OutputFormat.JSON] = json_output_path 166 | 167 | # Prepare data for DataFrame 168 | if isinstance(formatted_data_dict, dict): 169 | # If the data is a dictionary containing lists, assume these lists are records 170 | data_for_df = next(iter(formatted_data_dict.values())) if len(formatted_data_dict) == 1 else formatted_data_dict 171 | elif isinstance(formatted_data_dict, list): 172 | data_for_df = formatted_data_dict 173 | else: 174 | raise ValueError("Extracted data is neither a dictionary nor a list, cannot convert to DataFrame") 175 | 176 | # Create DataFrame 177 | try: 178 | df = pd.DataFrame(data_for_df) 179 | 180 | if df.empty: 181 | raise ValueError("DataFrame is empty, cannot save to files") 182 | 183 | if OutputFormat.EXCEL in output_formats: 184 | try: 185 | # Don't include run_name in filename since it's already in the path 186 | excel_output_path = output_folder / "extracted_data.xlsx" 187 | df.to_excel(excel_output_path, index=False) 188 | console_out.print(Panel(f"Excel data saved to [bold green]{excel_output_path}[/bold green]")) 189 | file_paths[OutputFormat.EXCEL] = excel_output_path 190 | except Exception as e: 191 | console_out.print("[bold red]Error: Saving Excel failed[/bold red]") 192 | console_out.print(e) 193 | 194 | if OutputFormat.CSV in output_formats: 195 | try: 196 | # Don't include run_name in filename since it's already in the path 197 | csv_output_path = output_folder / "extracted_data.csv" 198 | df.to_csv(csv_output_path, index=False) 199 | console_out.print(Panel(f"CSV data saved to [bold green]{csv_output_path}[/bold green]")) 200 | file_paths[OutputFormat.CSV] = csv_output_path 201 | except Exception as e: 202 | console_out.print("[bold red]Error: Saving CSV failed[/bold red]") 203 | console_out.print(e) 204 | 205 | if OutputFormat.MARKDOWN in output_formats: 206 | try: 207 | # Don't include run_name in filename since it's already in the path 208 | markdown_output_path = output_folder / "extracted_data.md" 209 | markdown_output_path.write_text(df.to_markdown(index=False) or "", encoding="utf-8") 210 | console_out.print(Panel(f"Markdown table saved to [bold green]{markdown_output_path}[/bold green]")) 211 | file_paths[OutputFormat.MARKDOWN] = markdown_output_path 212 | except Exception as e: 213 | console_out.print("[bold red]Error: Saving Markdown table failed[/bold red]") 214 | console_out.print(e) 215 | return df, file_paths 216 | except Exception as e: 217 | console_out.print(f"[bold red]Error creating DataFrame or saving files:[/bold red] {str(e)}") 218 | return None, {} 219 | -------------------------------------------------------------------------------- /src/par_scrape/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for par_scrape.""" 2 | 
--------------------------------------------------------------------------------
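The short script below is a hypothetical driver sketch, not a file from the repository: it shows how the queue helpers in crawl.py and the dynamic-model helpers in scrape_data.py can be combined outside the Typer CLI. It assumes par_scrape is installed and importable, and fake_fetch() is a placeholder standing in for the real fetch_url() helper that __main__.py uses.

"""Hypothetical driver sketch -- not part of the repository."""

from pathlib import Path

from par_scrape.crawl import (
    ErrorType,
    add_to_queue,
    get_next_urls,
    get_queue_stats,
    get_url_output_folder,
    init_db,
    mark_complete,
    mark_error,
)
from par_scrape.enums import OutputFormat
from par_scrape.scrape_data import create_container_model, create_dynamic_model, save_raw_data


def fake_fetch(url: str) -> str:
    """Stand-in fetch that returns markdown instead of scraping the live page."""
    return f"# Example page\n\nContent for {url}\n"


def main() -> None:
    run_name = "example_run"
    init_db()
    add_to_queue(run_name, ["https://example.com/"])

    # Build the same kind of dynamic Pydantic models the CLI creates from its fields option.
    listing_model = create_dynamic_model(["title", "price"])
    container_model = create_container_model(listing_model)
    print(container_model.__name__)  # DynamicListingsContainer

    # Drain the queue one URL at a time, marking each page complete or errored.
    while urls := get_next_urls(run_name, crawl_batch_size=1):
        for url in urls:
            try:
                markdown = fake_fetch(url)
                out_dir = get_url_output_folder(Path("./output"), run_name, url)
                out_dir.mkdir(parents=True, exist_ok=True)
                raw_path = save_raw_data(markdown, out_dir)
                mark_complete(run_name, url, raw_file_path=raw_path, file_paths={OutputFormat.MARKDOWN: raw_path})
            except Exception as exc:
                mark_error(run_name, url, str(exc), ErrorType.NETWORK)

    print(get_queue_stats(run_name))


if __name__ == "__main__":
    main()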