├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── docker-build-push.yml │ └── pypi-publish.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.docker.md ├── README.md ├── pyproject.toml ├── pyseoanalyzer ├── __init__.py ├── __main__.py ├── analyzer.py ├── http.py ├── llm_analyst.py ├── page.py ├── stopwords.py ├── templates │ └── index.html └── website.py ├── requirements.txt ├── test.py └── tests ├── __init__.py ├── test_analyzer.py ├── test_http.py ├── test_llm_analyst.py └── test_page.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .env 2 | .vscode 3 | .github 4 | .pytest_cache 5 | .git 6 | .dockerignore 7 | .gitignore 8 | *.pyc 9 | env/ 10 | venv/ 11 | */__pycache__/* 12 | tests/ 13 | Dockerfile 14 | *.pyc -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/docker-build-push.yml: -------------------------------------------------------------------------------- 1 | name: Manual Docker Build and Push 2 | 3 | on: 4 | workflow_dispatch: # Allows manual triggering 5 | 6 | # Add permissions for pushing packages and OIDC token 7 | permissions: 8 | contents: read 9 | packages: write # Needed to push container images 10 | id-token: write # Needed for signing/attestations 11 | 12 | jobs: 13 | build-and-push: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Log in to Docker Hub 20 | uses: docker/login-action@v3 21 | with: 22 | username: ${{ secrets.DOCKERHUB_USERNAME }} 23 | password: ${{ secrets.DOCKERHUB_TOKEN }} 24 | 25 | # Install the cosign tool 26 | # https://github.com/sigstore/cosign-installer 27 | - name: Install cosign 28 | uses: sigstore/cosign-installer@v3.5.0 29 | with: 30 | cosign-release: 'v2.2.4' 31 | 32 | # Setup Docker buildx 33 | # https://github.com/docker/build-push-action/issues/461 34 | - name: Setup Docker buildx 35 | uses: docker/setup-buildx-action@v3 36 | 37 | # Extract metadata (tags, labels) for Docker 38 | # https://github.com/docker/metadata-action 39 | - name: Extract Docker metadata 40 | id: meta 41 | uses: docker/metadata-action@v5 42 | with: 43 | images: sethblack/python-seo-analyzer # Your Docker Hub image 44 | tags: | 45 | # Add short SHA tag based on the Git commit, disable automatic latest promotion 46 | type=sha,format=short,flavor=latest=false 47 | # Explicitly add the 'latest' tag for all manual runs 48 | type=raw,value=latest,enable=true 49 | 50 | # Build and push Docker image with attestation 51 | # https://github.com/docker/build-push-action 52 | - name: Build and push Docker image 53 | id: build-and-push # Add id to reference outputs 54 | uses: docker/build-push-action@v5 55 | with: 56 | context: . 57 | push: true 58 | tags: ${{ steps.meta.outputs.tags }} # Use tags from metadata 59 | labels: ${{ steps.meta.outputs.labels }} # Use labels from metadata 60 | # Attestations for provenance and SBOM 61 | # Correct format: type=,= 62 | attests: | 63 | type=provenance,builder-id=${{ github.workflow }}/${{ github.job_id }} 64 | type=sbom,scan-mode=local,scan-args=--exclude=./tests 65 | 66 | # Sign the resulting Docker image digest. 67 | # https://github.com/sigstore/cosign 68 | - name: Sign the published Docker image 69 | env: 70 | # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-repository-for-the-build 71 | COSIGN_EXPERIMENTAL: "true" 72 | # This step uses the identity token to provision an ephemeral certificate 73 | # against the sigstore community Fulcio instance. 
74 | run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign --yes {}@${{ steps.build-and-push.outputs.digest }} 75 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: Manual PyPI Publish 2 | 3 | on: 4 | workflow_dispatch: # Allows manual triggering 5 | 6 | permissions: 7 | contents: read # Needed to checkout the repository 8 | 9 | jobs: 10 | publish-to-pypi: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.x' # Use an appropriate Python version 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install build twine 25 | 26 | - name: Build package 27 | run: python -m build 28 | 29 | - name: Publish package to PyPI 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | with: 32 | user: __token__ 33 | password: ${{ secrets.PYPI_API_TOKEN }} 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # I don't want the python virtual env in github! 2 | venv 3 | env 4 | .env 5 | 6 | # nor visual 7 | .vscode 8 | 9 | *.py[cod] 10 | 11 | # C extensions 12 | *.so 13 | */__pychache__/* 14 | 15 | # Packages 16 | *.egg 17 | *.egg-info 18 | dist 19 | build 20 | eggs 21 | parts 22 | bin 23 | var 24 | sdist 25 | develop-eggs 26 | .installed.cfg 27 | lib 28 | lib64 29 | 30 | # Installer logs 31 | pip-log.txt 32 | 33 | # Unit test / coverage reports 34 | .coverage 35 | .tox 36 | nosetests.xml 37 | 38 | # Translations 39 | *.mo 40 | 41 | # Mr Developer 42 | .mr.developer.cfg 43 | .project 44 | .pydevproject 45 | 46 | # Output directory 47 | output/ 48 | build/ 49 | 50 | .DS_Store 51 | 52 | # ipython 53 | *.ipynb 54 | .ipynb_checkpoints/* 55 | 56 | 57 | # PyCharm 58 | .idea/* 59 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at seth@sethserver.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.13.2-bookworm 2 | 3 | RUN apt-get update -y && apt-get upgrade -y 4 | 5 | RUN pip3 install --upgrade pip 6 | RUN pip3 install uv 7 | 8 | COPY ./requirements.txt /python-seo-analyzer/ 9 | 10 | RUN uv pip install --system --verbose --requirement /python-seo-analyzer/requirements.txt 11 | RUN uv cache clean --verbose 12 | 13 | COPY . /python-seo-analyzer 14 | 15 | # Create a non-root user 16 | RUN groupadd -r appgroup && useradd --no-log-init -r -g appgroup appuser 17 | 18 | # Set ownership of the app directory 19 | RUN chown -R appuser:appgroup /python-seo-analyzer 20 | 21 | # Switch back to root to install the package system-wide 22 | USER root 23 | RUN python3 -m pip install /python-seo-analyzer 24 | 25 | # Switch back to the non-root user 26 | USER appuser 27 | 28 | WORKDIR /app 29 | 30 | ENTRYPOINT ["python-seo-analyzer"] 31 | CMD ["--version"] 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2012-2025 Seth Black. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. The name of Seth Black may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 21 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 24 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | graft seoanalyzer/templates 4 | 5 | -------------------------------------------------------------------------------- /README.docker.md: -------------------------------------------------------------------------------- 1 | # Docker Usage for Python SEO Analyzer 2 | 3 | This document provides instructions on how to build and run the `python-seo-analyzer` tool using Docker. 4 | 5 | ## Overview 6 | 7 | The Docker image provides a self-contained environment to run the `python-seo-analyzer` command-line tool without needing to install Python or dependencies directly on your host system. 8 | 9 | The image is based on `python:3.13.2-bookworm` and includes all necessary dependencies specified in `requirements.txt`. 10 | 11 | ## Building the Image (Optional) 12 | 13 | While pre-built images might be available (e.g., via GitHub Packages), you can build the image locally using the provided `Dockerfile`: 14 | 15 | ```bash 16 | docker build -t python-seo-analyzer . 17 | ``` 18 | 19 | ## Running the Container 20 | 21 | The container is configured to run the `python-seo-analyzer` command directly. You pass the command-line arguments for the tool after the image name. The official image is available at `sethblack/python-seo-analyzer:latest`. 22 | 23 | **Default Command (Show Version):** 24 | 25 | If you run the container without any arguments, it executes the default command (`--version`): 26 | 27 | ```bash 28 | docker run --rm sethblack/python-seo-analyzer:latest 29 | ``` 30 | *(Note: The examples below use `sethblack/python-seo-analyzer:latest`. If you built the image locally with a different tag, replace the image name accordingly.)* 31 | 32 | **Analyzing a Website:** 33 | 34 | To analyze a website, provide the site URL as the main argument: 35 | 36 | ```bash 37 | # Analyze a site and output JSON (default) 38 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com 39 | 40 | # Analyze a site and output HTML 41 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com -f html > analysis_report.html 42 | 43 | # Analyze a site using a sitemap 44 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com -s https://example.com/sitemap.xml 45 | 46 | # Analyze with heading analysis enabled 47 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com --analyze-headings 48 | 49 | # Analyze without following internal links 50 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com --no-follow-links 51 | 52 | # Analyze with LLM analysis (requires appropriate environment variables for the LLM provider, e.g., ANTHROPIC_API_KEY) 53 | # You'll need to pass environment variables using the -e flag 54 | docker run --rm -e ANTHROPIC_API_KEY=your_api_key sethblack/python-seo-analyzer:latest https://example.com --run-llm-analysis 55 | ``` 56 | 57 | ## Command-Line Arguments 58 | 59 | The `python-seo-analyzer` tool accepts the following arguments when run via Docker: 60 | 61 | * `site`: (Required) The URL of the website you want to analyze. 62 | * `-s`, `--sitemap`: URL of the sitemap to seed the crawler with. 63 | * `-f`, `--output-format`: Output format. Choices: `json` (default), `html`. 64 | * `--analyze-headings`: Enable analysis of heading tags (h1-h6). Default: `False`. 
65 | * `--analyze-extra-tags`: Enable analysis of other additional tags. Default: `False`. 66 | * `--no-follow-links`: Disable following internal links during the crawl. By default, the crawler *does* follow internal links. Use this flag to prevent that behavior. 67 | * `--run-llm-analysis`: Run Large Language Model (LLM) analysis on the content. Requires API keys to be configured via environment variables (e.g., `ANTHROPIC_API_KEY`). Default: `False`. 68 | * `--version`: Display the tool's version and exit. (This is the default command if no other arguments are provided). 69 | 70 | ## Examples 71 | 72 | **Analyze `sethserver.com` and save the output as HTML:** 73 | 74 | ```bash 75 | docker run --rm sethblack/python-seo-analyzer:latest https://sethserver.com -f html > sethserver_report.html 76 | ``` 77 | 78 | **Analyze `github.com` using its sitemap and output JSON:** 79 | 80 | ```bash 81 | docker run --rm sethblack/python-seo-analyzer:latest https://github.com -s https://github.com/sitemap.xml 82 | ``` 83 | 84 | **Analyze `example.com` with heading analysis but without following internal links:** 85 | 86 | ```bash 87 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com --analyze-headings --no-follow-links 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Python SEO and GEO Analyzer 2 | =========================== 3 | 4 | [![PyPI version](https://badge.fury.io/py/pyseoanalyzer.svg)](https://badge.fury.io/py/pyseoanalyzer) 5 | [![Docker Pulls](https://img.shields.io/docker/pulls/sethblack/python-seo-analyzer.svg)](https://hub.docker.com/r/sethblack/python-seo-analyzer) 6 | 7 | A modern SEO and GEO (Generative AI Engine Optimization or better AI Search Optimization) analysis tool that combines technical optimization and authentic human value. Beyond traditional site crawling and structure analysis, it uses AI to evaluate content's expertise signals, conversational engagement, and cross-platform presence. It helps you maintain strong technical foundations while ensuring your site demonstrates genuine authority and value to real users. 8 | 9 | The AI features were heavily influenced by the clickbait-titled SEL article [A 13-point roadmap for thriving in the age of AI search](https://searchengineland.com/seo-roadmap-ai-search-449199). 10 | 11 | Note About Python 12 | ----------------- 13 | 14 | I've written quite a bit about the speed of Python and how there are very specific use cases where it isn't the best choice. I feel like crawling websites is definitely one of those cases. I wrote this tool in Python around 2010 to solve the very specific need of crawling some small HTML-only websites for startups I was working at. I'm excited to see how much it has grown and how many people are using it. I feel like Python SEO Analyzer is acceptable for most smaller use cases, but if you are looking for something better, I've built a much faster and more comprehensive tool [Black SEO Analyzer](https://github.com/sethblack/black-seo-analyzer). 15 | 16 | -Seth 17 | 18 | Installation 19 | ------------ 20 | 21 | ### PIP 22 | 23 | ``` 24 | pip install pyseoanalyzer 25 | ``` 26 | 27 | ### Docker 28 | 29 | #### Using the Pre-built Image from Docker Hub 30 | 31 | The easiest way to use the Docker image is to pull it directly from [Docker Hub](https://hub.docker.com/r/sethblack/python-seo-analyzer). 
32 | 33 | ```bash 34 | # Pull the latest image 35 | docker pull sethblack/python-seo-analyzer:latest 36 | 37 | # Run the analyzer (replace example.com with the target URL) 38 | # The --rm flag automatically removes the container when it exits 39 | docker run --rm sethblack/python-seo-analyzer http://example.com/ 40 | 41 | # Run with specific arguments (e.g., sitemap and HTML output) 42 | # Note: If the sitemap is local, you'll need to mount it (see mounting example below) 43 | docker run --rm sethblack/python-seo-analyzer http://example.com/ --sitemap /path/inside/container/sitemap.xml --output-format html 44 | 45 | # Run with AI analysis (requires ANTHROPIC_API_KEY) 46 | # Replace "your_api_key_here" with your actual Anthropic API key 47 | docker run --rm -e ANTHROPIC_API_KEY="your_api_key_here" sethblack/python-seo-analyzer http://example.com/ --run-llm-analysis 48 | 49 | # Save HTML output to your local machine 50 | # This mounts the current directory (.) into /app/output inside the container. 51 | # The output file 'results.html' will be saved in your current directory. 52 | # The tool outputs JSON by default to stdout, so we redirect it for HTML. 53 | # Since the image's ENTRYPOINT is the analyzer itself, we override it with /bin/sh. 54 | # We need a shell inside the container to handle the redirection. 55 | docker run --rm -v "$(pwd):/app/output" --entrypoint /bin/sh sethblack/python-seo-analyzer -c "python-seo-analyzer http://example.com/ --output-format html > /app/output/results.html" 56 | # Note for Windows CMD users: Use %cd% instead of $(pwd) 57 | # docker run --rm -v "%cd%:/app/output" --entrypoint /bin/sh sethblack/python-seo-analyzer -c "python-seo-analyzer http://example.com/ --output-format html > /app/output/results.html" 58 | # Note for Windows PowerShell users: Use ${pwd} instead of $(pwd) 59 | # docker run --rm -v "${pwd}:/app/output" --entrypoint /bin/sh sethblack/python-seo-analyzer -c "python-seo-analyzer http://example.com/ --output-format html > /app/output/results.html" 60 | 61 | 62 | # Mount a local sitemap file 63 | # This mounts 'local-sitemap.xml' from the current directory to '/app/sitemap.xml' inside the container 64 | docker run --rm -v "$(pwd)/local-sitemap.xml:/app/sitemap.xml" sethblack/python-seo-analyzer http://example.com/ --sitemap /app/sitemap.xml 65 | # Adjust paths and Windows commands as needed (see volume mounting example above) 66 | 67 | ``` 68 | 69 | #### Building the Image Locally 70 | 71 | You can also build the Docker image yourself from the source code. Make sure you have Docker installed and running. 72 | 73 | ```bash 74 | # Clone the repository (if you haven't already) 75 | # git clone https://github.com/sethblack/python-seo-analyzer.git 76 | # cd python-seo-analyzer 77 | 78 | # Build the Docker image (tag it as 'my-seo-analyzer' for easy reference) 79 | docker build -t my-seo-analyzer . 80 | 81 | # Run the locally built image 82 | docker run --rm my-seo-analyzer http://example.com/ 83 | 84 | # Run with AI analysis using the locally built image 85 | docker run --rm -e ANTHROPIC_API_KEY="your_api_key_here" my-seo-analyzer http://example.com/ --run-llm-analysis 86 | 87 | # Run with HTML output saved locally using the built image 88 | docker run --rm -v "$(pwd):/app/output" --entrypoint /bin/sh my-seo-analyzer -c "python-seo-analyzer http://example.com/ --output-format html > /app/output/results.html" 89 | # Adjust Windows commands as needed (see volume mounting example above) 90 | ``` 91 | 92 | Command-line Usage 93 | ------------------ 94 | 95 | If you run without a sitemap, it will start crawling at the homepage.
96 | 97 | ```sh 98 | python-seo-analyzer http://www.domain.com/ 99 | ``` 100 | 101 | Or you can specify the path to a sitemap to seed the list of URLs to scan. 102 | 103 | ```sh 104 | python-seo-analyzer http://www.domain.com/ --sitemap path/to/sitemap.xml 105 | ``` 106 | 107 | HTML output can be generated from the analysis instead of JSON. 108 | 109 | ```sh 110 | python-seo-analyzer http://www.domain.com/ --output-format html 111 | ``` 112 | 113 | API 114 | --- 115 | 116 | The `analyze` function returns a dictionary with the results of the crawl. 117 | 118 | ```python 119 | from pyseoanalyzer import analyze 120 | 121 | output = analyze(site, sitemap) 122 | 123 | print(output) 124 | ``` 125 | 126 | To analyze heading tags (h1-h6) and other additional tags as well, pass the following options to the `analyze` function: 127 | ```python 128 | from pyseoanalyzer import analyze 129 | 130 | output = analyze(site, sitemap, analyze_headings=True, analyze_extra_tags=True) 131 | 132 | print(output) 133 | ``` 134 | 135 | By default, the `analyze` function analyzes all the existing inner links as well, which can be time-consuming. 136 | This default behaviour can be changed to analyze only the provided URL by passing the following option to the `analyze` function: 137 | ```python 138 | from pyseoanalyzer import analyze 139 | 140 | output = analyze(site, sitemap, follow_links=False) 141 | 142 | print(output) 143 | ``` 144 | 145 | Alternatively, you can run the analysis as a Python module. 146 | 147 | ```sh 148 | python -m pyseoanalyzer https://www.sethserver.com/ -f html > results.html 149 | ``` 150 | 151 | AI Optimization 152 | --------------- 153 | 154 | The first pass of AI optimization features uses Anthropic's `claude-3-sonnet-20240229` model to evaluate the content of the site. You will need an API key from [Anthropic](https://www.anthropic.com/) to use this feature. The API key needs to be set as the environment variable `ANTHROPIC_API_KEY`. I recommend using a `.env` file to set this variable. Once the API key is set, the AI optimization features can be enabled with the `--run-llm-analysis` flag. 155 | 156 | Notes 157 | ----- 158 | 159 | If you get `requests.exceptions.SSLError` at either the command line or via the Python API, try using: 160 | - http://www.foo.bar 161 | 162 | **instead** of: 163 | 164 | - https://www.foo.bar 165 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "pyseoanalyzer" 7 | version = "2025.4.3" 8 | authors = [ 9 | {name = "Seth Black", email = "sblack@sethserver.com"}, 10 | ] 11 | dependencies = [ 12 | "beautifulsoup4==4.13.3", 13 | "certifi==2025.1.31", 14 | "Jinja2==3.1.6", 15 | "langchain==0.3.22", 16 | "langchain-anthropic==0.3.10", 17 | "lxml==5.3.1", 18 | "MarkupSafe==3.0.2", 19 | "python-dotenv==1.1.0", 20 | "trafilatura==2.0.0", 21 | "urllib3==2.3.0", 22 | ] 23 | requires-python = ">= 3.8" 24 | description = "An SEO tool that analyzes the structure of a site, crawls the site, counts words in the body of the site, and warns of any technical SEO issues."
25 | readme = "README.md" 26 | license = {file = "LICENSE"} 27 | keywords = [ 28 | "search engine optimization", 29 | "seo", 30 | "website parser", 31 | "crawler", 32 | "scraper", 33 | "site analyzer", 34 | "site parser", 35 | "site crawler", 36 | ] 37 | classifiers = [ 38 | "Development Status :: 5 - Production/Stable", 39 | "Programming Language :: Python", 40 | "Programming Language :: Python :: 3", 41 | "Programming Language :: Python :: 3 :: Only", 42 | "Environment :: Console", 43 | "Intended Audience :: Developers", 44 | "License :: OSI Approved :: BSD License", 45 | "Operating System :: OS Independent", 46 | "Topic :: Internet :: WWW/HTTP :: Indexing/Search", 47 | "Topic :: Software Development :: Libraries :: Python Modules", 48 | "Topic :: Text Processing", 49 | "Topic :: Internet :: WWW/HTTP", 50 | ] 51 | 52 | [project.scripts] 53 | python-seo-analyzer = "pyseoanalyzer.__main__:main" 54 | 55 | [project.urls] 56 | Homepage = "https://github.com/sethblack/python-seo-analyzer" 57 | Repository = "https://github.com/sethblack/python-seo-analyzer.git" 58 | Issues = "https://github.com/sethblack/python-seo-analyzer/issues" 59 | -------------------------------------------------------------------------------- /pyseoanalyzer/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | 5 | # Use importlib.metadata (available in Python 3.8+) to get the version 6 | # defined in pyproject.toml. This avoids duplicating the version string. 7 | if sys.version_info >= (3, 8): 8 | from importlib import metadata 9 | else: 10 | # Fallback for Python < 3.8 (requires importlib-metadata backport) 11 | # Consider adding 'importlib-metadata; python_version < "3.8"' to dependencies 12 | # if you need to support older Python versions. 13 | import importlib_metadata as metadata 14 | 15 | try: 16 | # __package__ refers to the package name ('pyseoanalyzer') 17 | __version__ = metadata.version(__package__) 18 | except metadata.PackageNotFoundError: 19 | # Fallback if the package is not installed (e.g., when running from source) 20 | # You might want to handle this differently, e.g., raise an error 21 | # or read from a VERSION file. For now, setting it to unknown. 22 | __version__ = "0.0.0-unknown" 23 | 24 | 25 | from .analyzer import analyze 26 | -------------------------------------------------------------------------------- /pyseoanalyzer/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import inspect 5 | import json 6 | import os 7 | import sys 8 | 9 | from .analyzer import analyze 10 | from . import __version__ 11 | 12 | 13 | def main(): 14 | module_path = os.path.dirname(inspect.getfile(analyze)) 15 | arg_parser = argparse.ArgumentParser( 16 | description="Analyze SEO aspects of a website." 17 | ) 18 | arg_parser.add_argument( 19 | "--version", action="version", version=f"%(prog)s {__version__}" 20 | ) 21 | arg_parser.add_argument("site", help="URL of the site you are wanting to analyze.") 22 | arg_parser.add_argument( 23 | "-s", "--sitemap", help="URL of the sitemap to seed the crawler with." 
24 | ) 25 | arg_parser.add_argument( 26 | "-f", 27 | "--output-format", 28 | help="Output format.", 29 | choices=[ 30 | "json", 31 | "html", 32 | ], 33 | default="json", 34 | ) 35 | 36 | arg_parser.add_argument( 37 | "--analyze-headings", 38 | default=False, 39 | action="store_true", 40 | help="Analyze heading tags (h1-h6).", 41 | ) 42 | arg_parser.add_argument( 43 | "--analyze-extra-tags", 44 | default=False, 45 | action="store_true", 46 | help="Analyze other extra additional tags.", 47 | ) 48 | arg_parser.add_argument( 49 | "--no-follow-links", 50 | default=True, 51 | action="store_false", 52 | help="Analyze all the existing inner links as well (might be time consuming).", 53 | ) 54 | arg_parser.add_argument( 55 | "--run-llm-analysis", 56 | default=False, 57 | action="store_true", 58 | help="Run LLM analysis on the content.", 59 | ) 60 | 61 | args = arg_parser.parse_args() 62 | 63 | output = analyze( 64 | args.site, 65 | args.sitemap, 66 | analyze_headings=args.analyze_headings, 67 | analyze_extra_tags=args.analyze_extra_tags, 68 | follow_links=args.no_follow_links, 69 | run_llm_analysis=args.run_llm_analysis, 70 | ) 71 | 72 | if args.output_format == "html": 73 | from jinja2 import Environment 74 | from jinja2 import FileSystemLoader 75 | 76 | env = Environment( 77 | loader=FileSystemLoader(os.path.join(module_path, "templates")) 78 | ) 79 | template = env.get_template("index.html") 80 | output_from_parsed_template = template.render(result=output) 81 | print(output_from_parsed_template) 82 | elif args.output_format == "json": 83 | print(json.dumps(output, indent=4, separators=(",", ": "))) 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /pyseoanalyzer/analyzer.py: -------------------------------------------------------------------------------- 1 | import time 2 | from operator import itemgetter 3 | from .website import Website 4 | 5 | 6 | def calc_total_time(start_time): 7 | return time.time() - start_time 8 | 9 | 10 | def analyze( 11 | url, 12 | sitemap_url=None, 13 | analyze_headings=False, 14 | analyze_extra_tags=False, 15 | follow_links=True, 16 | run_llm_analysis=False, 17 | ): 18 | start_time = time.time() 19 | 20 | output = { 21 | "pages": [], 22 | "keywords": [], 23 | "errors": [], 24 | "total_time": 0, # Initialize to 0 before calculation 25 | } 26 | 27 | site = Website( 28 | base_url=url, 29 | sitemap=sitemap_url, 30 | analyze_headings=analyze_headings, 31 | analyze_extra_tags=analyze_extra_tags, 32 | follow_links=follow_links, 33 | run_llm_analysis=run_llm_analysis, 34 | ) 35 | 36 | site.crawl() 37 | 38 | for p in site.crawled_pages: 39 | output["pages"].append(p.as_dict()) 40 | 41 | output["duplicate_pages"] = [ 42 | list(site.content_hashes[p]) 43 | for p in site.content_hashes 44 | if len(site.content_hashes[p]) > 1 45 | ] 46 | 47 | sorted_words = sorted(site.wordcount.items(), key=itemgetter(1), reverse=True) 48 | sorted_bigrams = sorted(site.bigrams.items(), key=itemgetter(1), reverse=True) 49 | sorted_trigrams = sorted(site.trigrams.items(), key=itemgetter(1), reverse=True) 50 | 51 | output["keywords"] = [] 52 | 53 | for w in sorted_words: 54 | if w[1] > 4: 55 | output["keywords"].append( 56 | { 57 | "word": w[0], 58 | "count": w[1], 59 | } 60 | ) 61 | 62 | for w, v in sorted_bigrams: 63 | if v > 4: 64 | output["keywords"].append( 65 | { 66 | "word": w, 67 | "count": v, 68 | } 69 | ) 70 | 71 | for w, v in sorted_trigrams: 72 | if v > 4: 73 | output["keywords"].append( 74 | { 75 | 
"word": w, 76 | "count": v, 77 | } 78 | ) 79 | 80 | # Sort one last time... 81 | output["keywords"] = sorted( 82 | output["keywords"], key=itemgetter("count"), reverse=True 83 | ) 84 | 85 | output["total_time"] = calc_total_time(start_time) 86 | 87 | return output 88 | -------------------------------------------------------------------------------- /pyseoanalyzer/http.py: -------------------------------------------------------------------------------- 1 | import certifi 2 | from urllib3 import PoolManager 3 | from urllib3 import Timeout 4 | 5 | 6 | class Http: 7 | def __init__(self): 8 | user_agent = {"User-Agent": "Mozilla/5.0"} 9 | 10 | self.http = PoolManager( 11 | timeout=Timeout(connect=2.0, read=7.0), 12 | cert_reqs="CERT_REQUIRED", 13 | ca_certs=certifi.where(), 14 | headers=user_agent, 15 | ) 16 | 17 | def get(self, url): 18 | return self.http.request("GET", url) 19 | 20 | 21 | http = Http() 22 | -------------------------------------------------------------------------------- /pyseoanalyzer/llm_analyst.py: -------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | from langchain_anthropic import ChatAnthropic 3 | from langchain.prompts import PromptTemplate 4 | from langchain.schema.runnable import RunnablePassthrough 5 | from langchain.output_parsers import PydanticOutputParser 6 | from pydantic import BaseModel, Field 7 | from typing import Dict, List, Optional 8 | 9 | import asyncio 10 | import json 11 | import os 12 | 13 | load_dotenv() 14 | 15 | 16 | # Pydantic models for structured output 17 | class EntityAnalysis(BaseModel): 18 | entity_assessment: str = Field( 19 | description="Detailed analysis of entity optimization" 20 | ) 21 | knowledge_panel_readiness: int = Field(description="Score from 0-100") 22 | key_improvements: List[str] = Field(description="Top 3 improvements needed") 23 | 24 | 25 | class CredibilityAnalysis(BaseModel): 26 | credibility_assessment: str = Field(description="Overall credibility analysis") 27 | neeat_scores: Dict[str, int] = Field( 28 | description="Individual N-E-E-A-T-T component scores" 29 | ) 30 | trust_signals: List[str] = Field(description="Identified trust signals") 31 | 32 | 33 | class ConversationAnalysis(BaseModel): 34 | conversation_readiness: str = Field(description="Overall assessment") 35 | query_patterns: List[str] = Field(description="Identified query patterns") 36 | engagement_score: int = Field(description="Score from 0-100") 37 | gaps: List[str] = Field(description="Identified conversational gaps") 38 | 39 | 40 | class PlatformPresence(BaseModel): 41 | platform_coverage: Dict[str, str] = Field( 42 | description="Coverage analysis per platform" 43 | ) 44 | visibility_scores: Dict[str, int] = Field(description="Scores per platform type") 45 | optimization_opportunities: List[str] = Field(description="List of opportunities") 46 | 47 | 48 | class SEORecommendations(BaseModel): 49 | strategic_recommendations: List[str] = Field( 50 | description="Major strategic recommendations" 51 | ) 52 | quick_wins: List[str] = Field(description="Immediate action items") 53 | long_term_strategy: List[str] = Field(description="Long-term strategic goals") 54 | priority_matrix: Dict[str, str] = Field( 55 | description="Priority matrix by impact/effort" 56 | ) 57 | 58 | 59 | class LLMSEOEnhancer: 60 | def __init__(self): 61 | self.llm = ChatAnthropic( 62 | model="claude-3-sonnet-20240229", 63 | anthropic_api_key=os.environ.get("ANTHROPIC_API_KEY"), 64 | temperature=0, 65 | timeout=30, 66 | 
max_retries=3, 67 | ) 68 | self._setup_chains() 69 | 70 | def _setup_chains(self): 71 | """Setup modern LangChain runnable sequences using pipe syntax""" 72 | # Entity Analysis Chain 73 | entity_parser = PydanticOutputParser(pydantic_object=EntityAnalysis) 74 | 75 | entity_prompt = PromptTemplate.from_template( 76 | """Analyze these SEO elements for entity optimization: 77 | 1. Entity understanding (Knowledge Panel readiness) 78 | 2. Brand credibility signals (N-E-E-A-T-T principles) 79 | 3. Entity relationships and mentions 80 | 4. Topic entity connections 81 | 5. Schema markup effectiveness 82 | 83 | Data to analyze: 84 | {seo_data} 85 | 86 | {format_instructions} 87 | 88 | Only return your ouput in JSON format. Do not include any explanations any other text. 89 | """ 90 | ) 91 | 92 | self.entity_chain = ( 93 | { 94 | "seo_data": RunnablePassthrough(), 95 | "format_instructions": lambda _: entity_parser.get_format_instructions(), 96 | } 97 | | entity_prompt 98 | | self.llm 99 | | entity_parser 100 | ) 101 | 102 | # Credibility Analysis Chain 103 | credibility_parser = PydanticOutputParser(pydantic_object=CredibilityAnalysis) 104 | 105 | credibility_prompt = PromptTemplate.from_template( 106 | """Evaluate these credibility aspects: 107 | 1. N-E-E-A-T-T signals 108 | 2. Entity understanding and validation 109 | 3. Content creator credentials 110 | 4. Publisher authority 111 | 5. Topic expertise signals 112 | 113 | Data to analyze: 114 | {seo_data} 115 | 116 | {format_instructions} 117 | 118 | Only return your ouput in JSON format. Do not include any explanations any other text. 119 | """ 120 | ) 121 | 122 | self.credibility_chain = ( 123 | { 124 | "seo_data": RunnablePassthrough(), 125 | "format_instructions": lambda _: credibility_parser.get_format_instructions(), 126 | } 127 | | credibility_prompt 128 | | self.llm 129 | | credibility_parser 130 | ) 131 | 132 | # Conversation Analysis Chain 133 | conversation_parser = PydanticOutputParser(pydantic_object=ConversationAnalysis) 134 | 135 | conversation_prompt = PromptTemplate.from_template( 136 | """Analyze content for conversational search readiness: 137 | 1. Query pattern matching 138 | 2. Intent coverage across funnel 139 | 3. Natural language understanding 140 | 4. Follow-up content availability 141 | 5. Conversational triggers 142 | 143 | Data to analyze: 144 | {seo_data} 145 | 146 | {format_instructions} 147 | 148 | Only return your ouput in JSON format. Do not include any explanations any other text. 149 | """ 150 | ) 151 | 152 | self.conversation_chain = ( 153 | { 154 | "seo_data": RunnablePassthrough(), 155 | "format_instructions": lambda _: conversation_parser.get_format_instructions(), 156 | } 157 | | conversation_prompt 158 | | self.llm 159 | | conversation_parser 160 | ) 161 | 162 | # Platform Presence Chain 163 | platform_parser = PydanticOutputParser(pydantic_object=PlatformPresence) 164 | 165 | platform_prompt = PromptTemplate.from_template( 166 | """Analyze presence across different platforms: 167 | 1. Search engines (Google, Bing) 168 | 2. Knowledge graphs 169 | 3. AI platforms (ChatGPT, Bard) 170 | 4. Social platforms 171 | 5. Industry-specific platforms 172 | 173 | Data to analyze: 174 | {seo_data} 175 | 176 | {format_instructions} 177 | 178 | Only return your ouput in JSON format. Do not include any explanations any other text. 
179 | """ 180 | ) 181 | 182 | self.platform_chain = ( 183 | { 184 | "seo_data": RunnablePassthrough(), 185 | "format_instructions": lambda _: platform_parser.get_format_instructions(), 186 | } 187 | | platform_prompt 188 | | self.llm 189 | | platform_parser 190 | ) 191 | 192 | # Recommendations Chain 193 | recommendations_parser = PydanticOutputParser( 194 | pydantic_object=SEORecommendations 195 | ) 196 | 197 | recommendations_prompt = PromptTemplate.from_template( 198 | """Based on this complete analysis, provide strategic recommendations: 199 | 1. Entity optimization strategy 200 | 2. Content strategy across platforms 201 | 3. Credibility building actions 202 | 4. Conversational optimization 203 | 5. Cross-platform presence improvement 204 | 205 | Analysis results: 206 | {analysis_results} 207 | 208 | {format_instructions} 209 | 210 | Only return your ouput in JSON format. Do not include any explanations any other text. 211 | """ 212 | ) 213 | 214 | self.recommendations_chain = ( 215 | { 216 | "analysis_results": RunnablePassthrough(), 217 | "format_instructions": lambda _: recommendations_parser.get_format_instructions(), 218 | } 219 | | recommendations_prompt 220 | | self.llm 221 | | recommendations_parser 222 | ) 223 | 224 | async def enhance_seo_analysis(self, seo_data: Dict) -> Dict: 225 | """ 226 | Enhanced SEO analysis using modern LangChain patterns 227 | """ 228 | # Convert seo_data to string for prompt insertion 229 | seo_data_str = json.dumps(seo_data, indent=2) 230 | 231 | # Run analysis chains in parallel 232 | entity_results, credibility_results, conversation_results, platform_results = ( 233 | await asyncio.gather( 234 | self.entity_chain.ainvoke(seo_data_str), 235 | self.credibility_chain.ainvoke(seo_data_str), 236 | self.conversation_chain.ainvoke(seo_data_str), 237 | self.platform_chain.ainvoke(seo_data_str), 238 | ) 239 | ) 240 | 241 | # Combine analyses 242 | combined_analysis = { 243 | "entity_analysis": entity_results.model_dump(), 244 | "credibility_analysis": credibility_results.model_dump(), 245 | "conversation_analysis": conversation_results.model_dump(), 246 | "cross_platform_presence": platform_results.model_dump(), 247 | } 248 | 249 | # Generate final recommendations 250 | recommendations = await self.recommendations_chain.ainvoke( 251 | json.dumps(combined_analysis, indent=2) 252 | ) 253 | 254 | # Combine all results 255 | final_results = { 256 | **seo_data, 257 | **combined_analysis, 258 | "recommendations": recommendations.model_dump(), 259 | } 260 | 261 | return self._format_output(final_results) 262 | 263 | def _format_output(self, raw_analysis: Dict) -> Dict: 264 | """Format analysis results into a clean, structured output""" 265 | return { 266 | "summary": { 267 | "entity_score": raw_analysis["entity_analysis"][ 268 | "knowledge_panel_readiness" 269 | ], 270 | "credibility_score": sum( 271 | raw_analysis["credibility_analysis"]["neeat_scores"].values() 272 | ) 273 | / 6, 274 | "conversation_score": raw_analysis["conversation_analysis"][ 275 | "engagement_score" 276 | ], 277 | "platform_score": sum( 278 | raw_analysis["cross_platform_presence"][ 279 | "visibility_scores" 280 | ].values() 281 | ) 282 | / len(raw_analysis["cross_platform_presence"]["visibility_scores"]), 283 | }, 284 | "detailed_analysis": raw_analysis, 285 | "quick_wins": raw_analysis["recommendations"]["quick_wins"], 286 | "strategic_recommendations": raw_analysis["recommendations"][ 287 | "strategic_recommendations" 288 | ], 289 | } 290 | 291 | 292 | # Example usage with async support 
293 | async def enhanced_modern_analyze( 294 | site: str, sitemap: Optional[str] = None, api_key: Optional[str] = None, **kwargs 295 | ): 296 | """ 297 | Enhanced analysis incorporating modern SEO principles using LangChain 298 | """ 299 | from pyseoanalyzer import analyze 300 | 301 | # Run original analysis 302 | original_results = analyze(site, sitemap, **kwargs) 303 | 304 | # Enhance with modern SEO analysis if API key provided 305 | if api_key: 306 | enhancer = LLMSEOEnhancer() 307 | # enhance_seo_analysis() already returns formatted output, so pass it through as-is 308 | return await enhancer.enhance_seo_analysis(original_results) 309 | 310 | return original_results 311 | -------------------------------------------------------------------------------- /pyseoanalyzer/page.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import hashlib 3 | import json 4 | import lxml.html as lh 5 | import os 6 | import re 7 | import trafilatura 8 | 9 | from bs4 import BeautifulSoup 10 | from collections import Counter 11 | from string import punctuation 12 | from urllib.parse import urlsplit 13 | from urllib3.exceptions import HTTPError 14 | 15 | from .http import http 16 | from .llm_analyst import LLMSEOEnhancer 17 | from .stopwords import ENGLISH_STOP_WORDS 18 | 19 | TOKEN_REGEX = re.compile(r"(?u)\b\w\w+\b") 20 | 21 | HEADING_TAGS_XPATHS = { 22 | "h1": "//h1", 23 | "h2": "//h2", 24 | "h3": "//h3", 25 | "h4": "//h4", 26 | "h5": "//h5", 27 | "h6": "//h6", 28 | } 29 | 30 | ADDITIONAL_TAGS_XPATHS = { 31 | "title": "//title/text()", 32 | "meta_desc": '//meta[@name="description"]/@content', 33 | "viewport": '//meta[@name="viewport"]/@content', 34 | "charset": "//meta[@charset]/@charset", 35 | "canonical": '//link[@rel="canonical"]/@href', 36 | "alt_href": '//link[@rel="alternate"]/@href', 37 | "alt_hreflang": '//link[@rel="alternate"]/@hreflang', 38 | "og_title": '//meta[@property="og:title"]/@content', 39 | "og_desc": '//meta[@property="og:description"]/@content', 40 | "og_url": '//meta[@property="og:url"]/@content', 41 | "og_image": '//meta[@property="og:image"]/@content', 42 | } 43 | 44 | IMAGE_EXTENSIONS = set( 45 | [ 46 | ".img", 47 | ".png", 48 | ".jpg", 49 | ".jpeg", 50 | ".gif", 51 | ".bmp", 52 | ".svg", 53 | ".webp", 54 | ".avif", 55 | ] 56 | ) 57 | 58 | 59 | class Page: 60 | """ 61 | Container for each page and the core analyzer.
62 | """ 63 | 64 | def __init__( 65 | self, 66 | url="", 67 | base_domain="", 68 | analyze_headings=False, 69 | analyze_extra_tags=False, 70 | encoding="utf-8", 71 | run_llm_analysis=False, 72 | ): 73 | """ 74 | Variables go here, *not* outside of __init__ 75 | """ 76 | 77 | self.base_domain = urlsplit(base_domain) 78 | self.parsed_url = urlsplit(url) 79 | self.url = url 80 | self.analyze_headings = analyze_headings 81 | self.analyze_extra_tags = analyze_extra_tags 82 | self.encoding = encoding 83 | self.run_llm_analysis = run_llm_analysis 84 | self.title: str = "" 85 | self.author: str = "" 86 | self.description: str = "" 87 | self.hostname: str = "" 88 | self.sitename: str 89 | self.date: str 90 | self.keywords = {} 91 | self.warnings = [] 92 | self.translation = bytes.maketrans( 93 | punctuation.encode(encoding), str(" " * len(punctuation)).encode(encoding) 94 | ) 95 | self.links = [] 96 | self.total_word_count = 0 97 | self.wordcount = Counter() 98 | self.bigrams = Counter() 99 | self.trigrams = Counter() 100 | self.stem_to_word = {} 101 | self.content: str = None 102 | self.content_hash: str = None 103 | 104 | if run_llm_analysis: 105 | self.llm_analysis = {} 106 | 107 | if analyze_headings: 108 | self.headings = {} 109 | 110 | if analyze_extra_tags: 111 | self.additional_info = {} 112 | 113 | def as_dict(self): 114 | """ 115 | Returns a dictionary that can be printed 116 | """ 117 | 118 | context = { 119 | "url": self.url, 120 | "title": self.title, 121 | "description": self.description, 122 | "author": self.author, 123 | "hostname": self.hostname, 124 | "sitename": self.sitename, 125 | "date": self.date, 126 | "word_count": self.total_word_count, 127 | "keywords": self.sort_freq_dist(self.keywords, limit=5), 128 | "bigrams": self.bigrams, 129 | "trigrams": self.trigrams, 130 | "warnings": self.warnings, 131 | "content_hash": self.content_hash, 132 | } 133 | 134 | if self.analyze_headings: 135 | context["headings"] = self.headings 136 | 137 | if self.analyze_extra_tags: 138 | context["additional_info"] = self.additional_info 139 | 140 | if self.run_llm_analysis: 141 | context["llm_analysis"] = self.llm_analysis 142 | 143 | return context 144 | 145 | def analyze_heading_tags(self, bs): 146 | """ 147 | Analyze the heading tags and populate the headings 148 | """ 149 | 150 | try: 151 | dom = lh.fromstring(str(bs)) 152 | except ValueError as _: 153 | dom = lh.fromstring(bs.encode(self.encoding)) 154 | for tag, xpath in HEADING_TAGS_XPATHS.items(): 155 | value = [heading.text_content() for heading in dom.xpath(xpath)] 156 | if value: 157 | self.headings.update({tag: value}) 158 | 159 | def analyze_additional_tags(self, bs): 160 | """ 161 | Analyze additional tags and populate the additional info 162 | """ 163 | 164 | try: 165 | dom = lh.fromstring(str(bs)) 166 | except ValueError as _: 167 | dom = lh.fromstring(bs.encode(self.encoding)) 168 | for tag, xpath in ADDITIONAL_TAGS_XPATHS.items(): 169 | value = dom.xpath(xpath) 170 | if value: 171 | self.additional_info.update({tag: value}) 172 | 173 | def analyze(self, raw_html=None): 174 | """ 175 | Analyze the page and populate the warnings list 176 | """ 177 | 178 | if not raw_html: 179 | valid_prefixes = [] 180 | 181 | # only allow http:// https:// and // 182 | for s in [ 183 | "http://", 184 | "https://", 185 | "//", 186 | ]: 187 | valid_prefixes.append(self.url.startswith(s)) 188 | 189 | if True not in valid_prefixes: 190 | self.warn(f"{self.url} does not appear to have a valid protocol.") 191 | return 192 | 193 | if 
self.url.startswith("//"): 194 | self.url = f"{self.base_domain.scheme}:{self.url}" 195 | 196 | if self.parsed_url.netloc != self.base_domain.netloc: 197 | self.warn(f"{self.url} is not part of {self.base_domain.netloc}.") 198 | return 199 | 200 | try: 201 | page = http.get(self.url) 202 | except HTTPError as e: 203 | self.warn(f"Returned {e}") 204 | return 205 | 206 | encoding = "utf8" 207 | 208 | if "content-type" in page.headers: 209 | encoding = page.headers["content-type"].split("charset=")[-1] 210 | 211 | if encoding.lower() not in ("text/html", "text/plain", self.encoding): 212 | self.warn(f"Can not read {encoding}") 213 | return 214 | else: 215 | raw_html = page.data.decode(self.encoding) 216 | 217 | self.content_hash = hashlib.sha1(raw_html.encode(self.encoding)).hexdigest() 218 | 219 | # Use trafilatura to extract metadata 220 | metadata = trafilatura.extract_metadata( 221 | filecontent=raw_html, 222 | default_url=self.url, 223 | extensive=True, 224 | ) 225 | 226 | # I want to grab values from this even if they don't exist 227 | metadata_dict = metadata.as_dict() if metadata else {} 228 | 229 | # Helper function to get value or default to "" if None or 'None' 230 | def get_meta_value(key): 231 | value = metadata_dict.get(key) 232 | return "" if value is None or value == "None" else value 233 | 234 | # Ensure fields are strings, defaulting to "" if None or 'None' 235 | self.title = get_meta_value("title") 236 | self.author = get_meta_value("author") 237 | self.description = get_meta_value("description") 238 | self.hostname = get_meta_value("hostname") 239 | self.sitename = get_meta_value("sitename") 240 | self.date = get_meta_value("date") 241 | metadata_keywords = get_meta_value("keywords") 242 | 243 | if len(metadata_keywords) > 0: 244 | self.warn( 245 | f"Keywords should be avoided as they are a spam indicator and no longer used by Search Engines" 246 | ) 247 | 248 | # use trafilatura to extract the content 249 | content = trafilatura.extract( 250 | raw_html, 251 | include_links=True, 252 | include_formatting=False, 253 | include_tables=True, 254 | include_images=True, 255 | output_format="json", 256 | ) 257 | 258 | self.content = json.loads(content) if content else None 259 | 260 | # remove HTML comments, they screw with BeautifulSoup 261 | html_without_comments = re.sub(r"<!--.*?-->", r"", raw_html, flags=re.DOTALL) 262 | 263 | # use BeautifulSoup to parse the more nuanced tags 264 | soup_lower = BeautifulSoup(html_without_comments.lower(), "html.parser") 265 | soup_unmodified = BeautifulSoup(html_without_comments, "html.parser") 266 | 267 | self.process_text(self.content["text"]) 268 | 269 | self.analyze_title() 270 | self.analyze_description() 271 | self.analyze_og(soup_lower) 272 | self.analyze_a_tags(soup_unmodified) 273 | self.analyze_img_tags(soup_lower) 274 | self.analyze_h1_tags(soup_lower) 275 | 276 | if self.analyze_headings: 277 | self.analyze_heading_tags(soup_unmodified) 278 | 279 | if self.analyze_extra_tags: 280 | self.analyze_additional_tags(soup_unmodified) 281 | 282 | if self.run_llm_analysis: 283 | self.llm_analysis = self.use_llm_analyzer() 284 | 285 | return True 286 | 287 | def use_llm_analyzer(self): 288 | """ 289 | Use the LLM analyzer to enhance the SEO analysis 290 | """ 291 | 292 | llm_enhancer = LLMSEOEnhancer() 293 | return asyncio.run(llm_enhancer.enhance_seo_analysis(self.content)) 294 | 295 | def word_list_freq_dist(self, wordlist): 296 | freq = [wordlist.count(w) for w in wordlist] 297 | return dict(zip(wordlist, freq)) 298 | 299 | def sort_freq_dist(self,
freqdist, limit=1): 300 | aux = [ 301 | (freqdist[key], self.stem_to_word[key]) 302 | for key in freqdist 303 | if freqdist[key] >= limit 304 | ] 305 | aux.sort() 306 | aux.reverse() 307 | return aux 308 | 309 | def raw_tokenize(self, rawtext): 310 | return TOKEN_REGEX.findall(rawtext.lower()) 311 | 312 | def tokenize(self, rawtext): 313 | return [ 314 | word 315 | for word in TOKEN_REGEX.findall(rawtext.lower()) 316 | if word not in ENGLISH_STOP_WORDS 317 | ] 318 | 319 | def getngrams(self, D, n=2): 320 | return zip(*[D[i:] for i in range(n)]) 321 | 322 | def process_text(self, page_text): 323 | tokens = self.tokenize(page_text) 324 | raw_tokens = self.raw_tokenize(page_text) 325 | self.total_word_count = len(raw_tokens) 326 | 327 | bigrams = self.getngrams(raw_tokens, 2) 328 | 329 | for ng in bigrams: 330 | vt = " ".join(ng) 331 | self.bigrams[vt] += 1 332 | 333 | trigrams = self.getngrams(raw_tokens, 3) 334 | 335 | for ng in trigrams: 336 | vt = " ".join(ng) 337 | self.trigrams[vt] += 1 338 | 339 | freq_dist = self.word_list_freq_dist(tokens) 340 | 341 | for word in freq_dist: 342 | cnt = freq_dist[word] 343 | 344 | if word not in self.stem_to_word: 345 | self.stem_to_word[word] = word 346 | 347 | if word in self.wordcount: 348 | self.wordcount[word] += cnt 349 | else: 350 | self.wordcount[word] = cnt 351 | 352 | if word in self.keywords: 353 | self.keywords[word] += cnt 354 | else: 355 | self.keywords[word] = cnt 356 | 357 | def analyze_og(self, bs): 358 | """ 359 | Validate open graph tags 360 | """ 361 | og_title = bs.findAll("meta", attrs={"property": "og:title"}) 362 | og_description = bs.findAll("meta", attrs={"property": "og:description"}) 363 | og_image = bs.findAll("meta", attrs={"property": "og:image"}) 364 | 365 | if len(og_title) == 0: 366 | self.warn("Missing og:title") 367 | 368 | if len(og_description) == 0: 369 | self.warn("Missing og:description") 370 | 371 | if len(og_image) == 0: 372 | self.warn("Missing og:image") 373 | 374 | def analyze_title(self): 375 | """ 376 | Validate the title 377 | """ 378 | 379 | # getting lazy, create a local variable so save having to 380 | # type self.x a billion times 381 | t = self.title 382 | 383 | # calculate the length of the title once 384 | length = len(t) 385 | 386 | if length == 0: 387 | self.warn("Missing title tag") 388 | return 389 | elif length < 10: 390 | self.warn("Title tag is too short (less than 10 characters): {0}".format(t)) 391 | elif length > 70: 392 | self.warn("Title tag is too long (more than 70 characters): {0}".format(t)) 393 | 394 | def analyze_description(self): 395 | """ 396 | Validate the description 397 | """ 398 | 399 | # getting lazy, create a local variable so save having to 400 | # type self.x a billion times 401 | d = self.description 402 | 403 | # calculate the length of the description once 404 | length = len(d) 405 | 406 | if length == 0: 407 | self.warn("Missing description") 408 | return 409 | elif length < 140: 410 | self.warn( 411 | "Description is too short (less than 140 characters): {0}".format(d) 412 | ) 413 | elif length > 255: 414 | self.warn( 415 | "Description is too long (more than 255 characters): {0}".format(d) 416 | ) 417 | 418 | def visible_tags(self, element): 419 | if element.parent.name in ["style", "script", "[document]"]: 420 | return False 421 | 422 | return True 423 | 424 | def analyze_img_tags(self, bs): 425 | """ 426 | Verifies that each img has an alt and title 427 | """ 428 | images = bs.find_all("img") 429 | 430 | for image in images: 431 | src = "" 432 | if "src" in 
image: 433 | src = image["src"] 434 | elif "data-src" in image: 435 | src = image["data-src"] 436 | else: 437 | src = image 438 | 439 | if len(image.get("alt", "")) == 0: 440 | self.warn("Image missing alt tag: {0}".format(src)) 441 | 442 | def analyze_h1_tags(self, bs): 443 | """ 444 | Make sure each page has at least one H1 tag 445 | """ 446 | htags = bs.find_all("h1") 447 | 448 | if len(htags) == 0: 449 | self.warn("Each page should have at least one h1 tag") 450 | 451 | def analyze_a_tags(self, bs): 452 | """ 453 | Add any new links (that we didn't find in the sitemap) 454 | """ 455 | anchors = bs.find_all("a", href=True) 456 | 457 | for tag in anchors: 458 | tag_href = tag["href"] 459 | tag_text = tag.text.lower().strip() 460 | 461 | if len(tag.get("title", "")) == 0: 462 | self.warn("Anchor missing title tag: {0}".format(tag_href)) 463 | 464 | if tag_text in ["click here", "page", "article"]: 465 | self.warn("Anchor text contains generic text: {0}".format(tag_text)) 466 | 467 | if self.base_domain.netloc not in tag_href and ":" in tag_href: 468 | continue 469 | 470 | modified_url = self.rel_to_abs_url(tag_href) 471 | 472 | url_filename, url_file_extension = os.path.splitext(modified_url) 473 | 474 | # ignore links to images 475 | if url_file_extension in IMAGE_EXTENSIONS: 476 | continue 477 | 478 | # remove hash links to all urls 479 | if "#" in modified_url: 480 | modified_url = modified_url[: modified_url.rindex("#")] 481 | 482 | self.links.append(modified_url) 483 | 484 | def rel_to_abs_url(self, link): 485 | if ":" in link: 486 | return link 487 | 488 | relative_path = link 489 | domain = self.base_domain.netloc 490 | 491 | if domain[-1] == "/": 492 | domain = domain[:-1] 493 | 494 | if len(relative_path) > 0 and relative_path[0] == "?": 495 | if "?" in self.url: 496 | return f'{self.url[:self.url.index("?")]}{relative_path}' 497 | 498 | return f"{self.url}{relative_path}" 499 | 500 | if len(relative_path) > 0 and relative_path[0] != "/": 501 | relative_path = f"/{relative_path}" 502 | 503 | return f"{self.base_domain.scheme}://{domain}{relative_path}" 504 | 505 | def warn(self, warning): 506 | self.warnings.append(warning) 507 | -------------------------------------------------------------------------------- /pyseoanalyzer/stopwords.py: -------------------------------------------------------------------------------- 1 | # This list of English stop words is taken from the "Glasgow Information 2 | # Retrieval Group". 
The original list can be found at 3 | # http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words 4 | ENGLISH_STOP_WORDS = frozenset( 5 | [ 6 | "a", 7 | "about", 8 | "above", 9 | "across", 10 | "after", 11 | "afterwards", 12 | "again", 13 | "against", 14 | "all", 15 | "almost", 16 | "alone", 17 | "along", 18 | "already", 19 | "also", 20 | "although", 21 | "always", 22 | "am", 23 | "among", 24 | "amongst", 25 | "amoungst", 26 | "amount", 27 | "an", 28 | "and", 29 | "another", 30 | "any", 31 | "anyhow", 32 | "anyone", 33 | "anything", 34 | "anyway", 35 | "anywhere", 36 | "are", 37 | "around", 38 | "as", 39 | "at", 40 | "back", 41 | "be", 42 | "became", 43 | "because", 44 | "become", 45 | "becomes", 46 | "becoming", 47 | "been", 48 | "before", 49 | "beforehand", 50 | "behind", 51 | "being", 52 | "below", 53 | "beside", 54 | "besides", 55 | "between", 56 | "beyond", 57 | "bill", 58 | "both", 59 | "bottom", 60 | "but", 61 | "by", 62 | "call", 63 | "can", 64 | "cannot", 65 | "cant", 66 | "co", 67 | "con", 68 | "could", 69 | "couldnt", 70 | "cry", 71 | "de", 72 | "describe", 73 | "detail", 74 | "do", 75 | "done", 76 | "down", 77 | "due", 78 | "during", 79 | "each", 80 | "eg", 81 | "eight", 82 | "either", 83 | "eleven", 84 | "else", 85 | "elsewhere", 86 | "empty", 87 | "enough", 88 | "etc", 89 | "even", 90 | "ever", 91 | "every", 92 | "everyone", 93 | "everything", 94 | "everywhere", 95 | "except", 96 | "few", 97 | "fifteen", 98 | "fify", 99 | "fill", 100 | "find", 101 | "fire", 102 | "first", 103 | "five", 104 | "for", 105 | "former", 106 | "formerly", 107 | "forty", 108 | "found", 109 | "four", 110 | "from", 111 | "front", 112 | "full", 113 | "further", 114 | "get", 115 | "give", 116 | "go", 117 | "had", 118 | "has", 119 | "hasnt", 120 | "have", 121 | "he", 122 | "hence", 123 | "her", 124 | "here", 125 | "hereafter", 126 | "hereby", 127 | "herein", 128 | "hereupon", 129 | "hers", 130 | "herself", 131 | "him", 132 | "himself", 133 | "his", 134 | "how", 135 | "however", 136 | "hundred", 137 | "i", 138 | "ie", 139 | "if", 140 | "in", 141 | "inc", 142 | "indeed", 143 | "interest", 144 | "into", 145 | "is", 146 | "it", 147 | "its", 148 | "itself", 149 | "keep", 150 | "last", 151 | "latter", 152 | "latterly", 153 | "least", 154 | "less", 155 | "ltd", 156 | "made", 157 | "many", 158 | "may", 159 | "me", 160 | "meanwhile", 161 | "might", 162 | "mill", 163 | "mine", 164 | "more", 165 | "moreover", 166 | "most", 167 | "mostly", 168 | "move", 169 | "much", 170 | "must", 171 | "my", 172 | "myself", 173 | "name", 174 | "namely", 175 | "neither", 176 | "never", 177 | "nevertheless", 178 | "next", 179 | "nine", 180 | "no", 181 | "nobody", 182 | "none", 183 | "noone", 184 | "nor", 185 | "not", 186 | "nothing", 187 | "now", 188 | "nowhere", 189 | "of", 190 | "off", 191 | "often", 192 | "on", 193 | "once", 194 | "one", 195 | "only", 196 | "onto", 197 | "or", 198 | "other", 199 | "others", 200 | "otherwise", 201 | "our", 202 | "ours", 203 | "ourselves", 204 | "out", 205 | "over", 206 | "own", 207 | "part", 208 | "per", 209 | "perhaps", 210 | "please", 211 | "put", 212 | "rather", 213 | "re", 214 | "same", 215 | "see", 216 | "seem", 217 | "seemed", 218 | "seeming", 219 | "seems", 220 | "serious", 221 | "several", 222 | "she", 223 | "should", 224 | "show", 225 | "side", 226 | "since", 227 | "sincere", 228 | "six", 229 | "sixty", 230 | "so", 231 | "some", 232 | "somehow", 233 | "someone", 234 | "something", 235 | "sometime", 236 | "sometimes", 237 | "somewhere", 238 | "still", 239 | "such", 240 | "system", 241 | 
"take", 242 | "ten", 243 | "than", 244 | "that", 245 | "the", 246 | "their", 247 | "them", 248 | "themselves", 249 | "then", 250 | "thence", 251 | "there", 252 | "thereafter", 253 | "thereby", 254 | "therefore", 255 | "therein", 256 | "thereupon", 257 | "these", 258 | "they", 259 | "third", 260 | "this", 261 | "those", 262 | "though", 263 | "three", 264 | "through", 265 | "throughout", 266 | "thru", 267 | "thus", 268 | "to", 269 | "together", 270 | "too", 271 | "top", 272 | "toward", 273 | "towards", 274 | "twelve", 275 | "twenty", 276 | "two", 277 | "un", 278 | "under", 279 | "until", 280 | "up", 281 | "upon", 282 | "us", 283 | "very", 284 | "via", 285 | "was", 286 | "we", 287 | "well", 288 | "were", 289 | "what", 290 | "whatever", 291 | "when", 292 | "whence", 293 | "whenever", 294 | "where", 295 | "whereafter", 296 | "whereas", 297 | "whereby", 298 | "wherein", 299 | "whereupon", 300 | "wherever", 301 | "whether", 302 | "which", 303 | "while", 304 | "whither", 305 | "who", 306 | "whoever", 307 | "whole", 308 | "whom", 309 | "whose", 310 | "why", 311 | "will", 312 | "with", 313 | "within", 314 | "without", 315 | "would", 316 | "yet", 317 | "you", 318 | "your", 319 | "yours", 320 | "yourself", 321 | "yourselves", 322 | ] 323 | ) 324 | -------------------------------------------------------------------------------- /pyseoanalyzer/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | seo analysis 7 | 8 | 9 | 10 | 11 | 12 | 20 | 21 | 22 |
[index.html template body: renders the analyzer's `result` dict (see the rendering sketch below). If result['fatal_error'] is set it shows: "There was a fatal error when trying to format the output file as a webpage. Confirm that there is an output.json file in the directory. If not, you can reference the python-seo-analyzer documentation." Otherwise it renders "total time: {{result['total_time']|round(2)}} seconds" and a "go to:" navigation block; an "errors:" section listing {% for e in result['errors'] %} • {{e}} {% endfor %}; a "page analysis:" table (columns: page, word count, number of notices) with one expandable row per page showing {{page['url']}}, {{page['word_count']}} and {{page["warnings"]|length}}, then {{page["title"]}}, {{page["description"]}} and {% for err in page["warnings"] %} • {{err|e}} {% endfor %}; and a "keyword analysis:" table (columns: keywords, count) built from {% for key in result['keywords']%} {{ key['word'] }} {{ key['count'] }} {% endfor %}. Each section is wrapped in its matching {% if %} / {% endif %} guard.]
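For orientation, a minimal sketch of how this template is typically fed, mirroring test.py later in this listing; it assumes the packaged analyze() entry point and the template lookup used there, and the target URL and output filenames are illustrative only.

import inspect
import json
import os

from jinja2 import Environment, FileSystemLoader
from pyseoanalyzer import analyze

# Crawl and analyze; `result` is the dict the template reads
# (result['pages'], result['errors'], result['keywords'], result['total_time']).
result = analyze("https://www.sethserver.com/", follow_links=False)

# Locate the packaged templates directory the same way test.py does.
module_path = os.path.dirname(inspect.getfile(analyze))
env = Environment(loader=FileSystemLoader(os.path.join(module_path, "templates")))
html = env.get_template("index.html").render(result=result)

# Write the HTML report and a JSON copy (filenames are illustrative).
with open("seo_report.html", "w", encoding="utf-8") as f:
    f.write(html)
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=4)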
109 | 110 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /pyseoanalyzer/website.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from urllib.parse import urlsplit 3 | from xml.dom import minidom 4 | import socket 5 | 6 | from .http import http 7 | from .page import Page 8 | 9 | 10 | class Website: 11 | def __init__( 12 | self, 13 | base_url, 14 | sitemap, 15 | analyze_headings=True, 16 | analyze_extra_tags=False, 17 | follow_links=False, 18 | run_llm_analysis=False, 19 | ): 20 | self.base_url = base_url 21 | self.sitemap = sitemap 22 | self.analyze_headings = analyze_headings 23 | self.analyze_extra_tags = analyze_extra_tags 24 | self.follow_links = follow_links 25 | self.run_llm_analysis = run_llm_analysis 26 | self.crawled_pages = [] 27 | self.crawled_urls = set() 28 | self.page_queue = [] 29 | self.wordcount = Counter() 30 | self.bigrams = Counter() 31 | self.trigrams = Counter() 32 | self.content_hashes = defaultdict(set) 33 | 34 | def check_dns(self, url_to_check): 35 | try: 36 | o = urlsplit(url_to_check) 37 | socket.gethostbyname_ex(o.hostname) 38 | return True 39 | except (socket.herror, socket.gaierror): 40 | return False 41 | 42 | def get_text_from_xml(self, nodelist): 43 | """ 44 | Stolen from the minidom documentation 45 | """ 46 | return "".join( 47 | node.data for node in nodelist if node.nodeType == node.TEXT_NODE 48 | ) 49 | 50 | def crawl(self): 51 | try: 52 | if self.sitemap: 53 | page = http.get(self.sitemap) 54 | if self.sitemap.endswith("xml"): 55 | xmldoc = minidom.parseString(page.data.decode("utf-8")) 56 | sitemap_urls = xmldoc.getElementsByTagName("loc") 57 | for url in sitemap_urls: 58 | self.page_queue.append(self.get_text_from_xml(url.childNodes)) 59 | elif self.sitemap.endswith("txt"): 60 | sitemap_urls = page.data.decode("utf-8").split("\n") 61 | for url in sitemap_urls: 62 | self.page_queue.append(url) 63 | 64 | self.page_queue.append(self.base_url) 65 | 66 | for url in self.page_queue: 67 | if url in self.crawled_urls: 68 | continue 69 | 70 | page = Page( 71 | url=url, 72 | base_domain=self.base_url, 73 | analyze_headings=self.analyze_headings, 74 | analyze_extra_tags=self.analyze_extra_tags, 75 | run_llm_analysis=self.run_llm_analysis, 76 | ) 77 | 78 | if page.parsed_url.netloc != page.base_domain.netloc: 79 | continue 80 | 81 | # Analyze the page and check if successful 82 | analysis_successful = page.analyze() 83 | 84 | # Only process and add the page if analysis completed 85 | if analysis_successful: 86 | self.content_hashes[page.content_hash].add(page.url) 87 | self.wordcount.update(page.wordcount) 88 | self.bigrams.update(page.bigrams) 89 | self.trigrams.update(page.trigrams) 90 | 91 | # Only add links if following is enabled and analysis was successful 92 | if self.follow_links: 93 | self.page_queue.extend(page.links) 94 | 95 | self.crawled_pages.append(page) 96 | self.crawled_urls.add(page.url) 97 | 98 | # Stop after the first page if not following links, regardless of analysis success 99 | if not self.follow_links: 100 | break 101 | except Exception as e: 102 | print(f"Error occurred during crawling: {e}") 103 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.13.3 2 | certifi==2025.1.31 3 | Jinja2==3.1.6 4 | langchain==0.3.22 5 | langchain-anthropic==0.3.10 
6 | lxml==5.3.1 7 | MarkupSafe==3.0.2 8 | pytest==8.3.2 # Added for testing 9 | pytest-mock==3.14.0 # Added for testing 10 | python-dotenv==1.1.0 11 | trafilatura==2.0.0 12 | urllib3==2.3.0 13 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import inspect 3 | import json 4 | import os 5 | 6 | from jinja2 import Environment 7 | from jinja2 import FileSystemLoader 8 | from pyseoanalyzer import analyze 9 | 10 | 11 | module_path = os.path.dirname(inspect.getfile(analyze)) 12 | 13 | arg_parser = argparse.ArgumentParser() 14 | 15 | arg_parser.add_argument("site", help="URL of the site you are wanting to analyze.") 16 | arg_parser.add_argument( 17 | "-s", "--sitemap", help="URL of the sitemap to seed the crawler with." 18 | ) 19 | arg_parser.add_argument( 20 | "-f", 21 | "--output-format", 22 | help="Output format.", 23 | choices=[ 24 | "json", 25 | "html", 26 | ], 27 | default="json", 28 | ) 29 | arg_parser.add_argument( 30 | "-d", 31 | "--disk", 32 | help="save to disk", 33 | choices=[ 34 | "y", 35 | "n", 36 | ], 37 | default="y", 38 | ) 39 | 40 | args = arg_parser.parse_args() 41 | 42 | output = analyze(args.site, args.sitemap) 43 | 44 | if args.output_format == "html": 45 | from jinja2 import Environment 46 | from jinja2 import FileSystemLoader 47 | 48 | env = Environment(loader=FileSystemLoader(os.path.join(module_path, "templates"))) 49 | template = env.get_template("index.html") 50 | output_from_parsed_template = template.render(result=output) 51 | if args.disk == "y": 52 | with open("test.html", "w", encoding="utf-8") as text_file: 53 | text_file.write(output_from_parsed_template) 54 | else: 55 | print(output_from_parsed_template) 56 | elif args.output_format == "json": 57 | if args.disk == "y": 58 | with open("test.json", "w", encoding="utf-8") as text_file: 59 | text_file.write(json.dumps(output, indent=4, separators=(",", ": "))) 60 | else: 61 | print(json.dumps(output, indent=4, separators=(",", ": "))) 62 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sethblack/python-seo-analyzer/27bb52303747dcb767f51853c2701153d0c4b6b7/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_analyzer.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pytest 3 | from unittest.mock import patch, MagicMock 4 | from pyseoanalyzer.analyzer import analyze, calc_total_time 5 | 6 | 7 | # --- Test calc_total_time --- 8 | 9 | 10 | def test_calc_total_time(): 11 | start_time = time.time() 12 | # Simulate some time passing 13 | time.sleep(0.01) 14 | elapsed_time = calc_total_time(start_time) 15 | # Check if the elapsed time is roughly correct (allow for some variance) 16 | assert 0.005 < elapsed_time < 0.05 17 | 18 | 19 | # --- Test analyze function --- 20 | 21 | 22 | # Helper function to create a mock Page object 23 | def create_mock_page(url, title, description, word_count, content_hash): 24 | page = MagicMock() 25 | page.url = url 26 | page.content_hash = content_hash 27 | page.as_dict.return_value = { 28 | "url": url, 29 | "title": title, 30 | "description": description, 31 | "word_count": word_count, 32 | # Add other fields as needed by as_dict() if tests evolve 33 | } 34 | 
return page 35 | 36 | 37 | # Basic test using mocking 38 | @patch("pyseoanalyzer.analyzer.Website") 39 | def test_analyze_basic(MockWebsite): 40 | # --- Setup Mock --- 41 | mock_site_instance = MockWebsite.return_value 42 | mock_page1 = create_mock_page( 43 | "http://example.com", "Page 1", "Desc 1", 100, "hash1" 44 | ) 45 | mock_site_instance.crawled_pages = [mock_page1] 46 | mock_site_instance.content_hashes = {"hash1": ["http://example.com"]} 47 | mock_site_instance.wordcount = {"word": 5, "test": 6} 48 | mock_site_instance.bigrams = {("bigram", "test"): 5} 49 | mock_site_instance.trigrams = {("trigram", "test", "word"): 5} 50 | 51 | # --- Run analyze --- 52 | output = analyze("http://example.com", follow_links=False) 53 | 54 | # --- Assertions --- 55 | # Check Website constructor call 56 | MockWebsite.assert_called_once_with( 57 | base_url="http://example.com", 58 | sitemap=None, 59 | analyze_headings=False, 60 | analyze_extra_tags=False, 61 | follow_links=False, 62 | run_llm_analysis=False, 63 | ) 64 | # Check crawl was called 65 | mock_site_instance.crawl.assert_called_once() 66 | 67 | # Check output structure and basic content 68 | assert len(output["pages"]) == 1 69 | assert output["pages"][0]["url"] == "http://example.com" 70 | assert output["pages"][0]["title"] == "Page 1" 71 | assert output["pages"][0]["description"] == "Desc 1" 72 | assert output["pages"][0]["word_count"] == 100 73 | # assert output["errors"] == [] # Errors usually come from crawl, harder to test here 74 | assert output["duplicate_pages"] == [] # Only one page 75 | 76 | # Check keywords (counts > 4) 77 | assert len(output["keywords"]) == 4 78 | assert {"word": "test", "count": 6} in output["keywords"] 79 | assert {"word": "word", "count": 5} in output["keywords"] 80 | assert {"word": ("bigram", "test"), "count": 5} in output["keywords"] 81 | assert {"word": ("trigram", "test", "word"), "count": 5} in output["keywords"] 82 | 83 | # Check total time calculation 84 | assert "total_time" in output 85 | assert output["total_time"] > 0 86 | 87 | 88 | # Add more tests below for different scenarios (duplicates, arguments, etc.) 
89 | # For example: 90 | 91 | 92 | @patch("pyseoanalyzer.analyzer.Website") 93 | def test_analyze_duplicates(MockWebsite): 94 | # --- Setup Mock --- 95 | mock_site_instance = MockWebsite.return_value 96 | mock_page1 = create_mock_page( 97 | "http://example.com/page1", "Page 1", "Desc", 100, "hash_dup" 98 | ) 99 | mock_page2 = create_mock_page( 100 | "http://example.com/page2", "Page 2", "Desc", 150, "hash_dup" 101 | ) # Same hash 102 | mock_page3 = create_mock_page( 103 | "http://example.com/page3", "Page 3", "Desc", 200, "hash_unique" 104 | ) 105 | mock_site_instance.crawled_pages = [mock_page1, mock_page2, mock_page3] 106 | mock_site_instance.content_hashes = { 107 | "hash_dup": ["http://example.com/page1", "http://example.com/page2"], 108 | "hash_unique": ["http://example.com/page3"], 109 | } 110 | mock_site_instance.wordcount = {} 111 | mock_site_instance.bigrams = {} 112 | mock_site_instance.trigrams = {} 113 | 114 | # --- Run analyze --- 115 | output = analyze("http://example.com") # Default follow_links=True 116 | 117 | # --- Assertions --- 118 | MockWebsite.assert_called_once_with( 119 | base_url="http://example.com", 120 | sitemap=None, 121 | analyze_headings=False, 122 | analyze_extra_tags=False, 123 | follow_links=True, # Check default 124 | run_llm_analysis=False, 125 | ) 126 | mock_site_instance.crawl.assert_called_once() 127 | 128 | assert len(output["pages"]) == 3 129 | assert len(output["duplicate_pages"]) == 1 130 | # Convert to sets for order-independent comparison 131 | assert set(output["duplicate_pages"][0]) == { 132 | "http://example.com/page1", 133 | "http://example.com/page2", 134 | } 135 | assert output["keywords"] == [] 136 | 137 | 138 | @patch("pyseoanalyzer.analyzer.Website") 139 | def test_analyze_arguments_passthrough(MockWebsite): 140 | # --- Setup Mock --- 141 | mock_site_instance = MockWebsite.return_value 142 | mock_site_instance.crawled_pages = [] 143 | mock_site_instance.content_hashes = {} 144 | mock_site_instance.wordcount = {} 145 | mock_site_instance.bigrams = {} 146 | mock_site_instance.trigrams = {} 147 | 148 | # --- Run analyze with specific arguments --- 149 | analyze( 150 | "http://example.com", 151 | sitemap_url="http://example.com/sitemap.xml", 152 | analyze_headings=True, 153 | analyze_extra_tags=True, 154 | follow_links=False, 155 | run_llm_analysis=True, 156 | ) 157 | 158 | # --- Assertions --- 159 | # Check Website constructor call reflects arguments 160 | MockWebsite.assert_called_once_with( 161 | base_url="http://example.com", 162 | sitemap="http://example.com/sitemap.xml", 163 | analyze_headings=True, 164 | analyze_extra_tags=True, 165 | follow_links=False, 166 | run_llm_analysis=True, 167 | ) 168 | mock_site_instance.crawl.assert_called_once() 169 | 170 | 171 | @patch("pyseoanalyzer.analyzer.Website") 172 | def test_analyze_keyword_filtering(MockWebsite): 173 | # --- Setup Mock --- 174 | mock_site_instance = MockWebsite.return_value 175 | mock_site_instance.crawled_pages = [] 176 | mock_site_instance.content_hashes = {} 177 | # Include counts <= 4 178 | mock_site_instance.wordcount = {"high": 10, "medium": 5, "low": 4, "verylow": 3} 179 | mock_site_instance.bigrams = {("bi", "high"): 6, ("bi", "low"): 4} 180 | mock_site_instance.trigrams = {("tri", "high", "a"): 5, ("tri", "low", "b"): 3} 181 | 182 | # --- Run analyze --- 183 | output = analyze("http://example.com") 184 | 185 | # --- Assertions --- 186 | assert len(output["keywords"]) == 4 # Only counts > 4 should be included 187 | words_in_keywords = {kw["word"] for kw in 
output["keywords"]} 188 | assert "high" in words_in_keywords 189 | assert "medium" in words_in_keywords 190 | assert ("bi", "high") in words_in_keywords 191 | assert ("tri", "high", "a") in words_in_keywords 192 | assert "low" not in words_in_keywords 193 | assert "verylow" not in words_in_keywords 194 | assert ("bi", "low") not in words_in_keywords 195 | assert ("tri", "low", "b") not in words_in_keywords 196 | 197 | # Check sorting (descending by count) 198 | counts = [kw["count"] for kw in output["keywords"]] 199 | assert counts == sorted(counts, reverse=True) 200 | -------------------------------------------------------------------------------- /tests/test_http.py: -------------------------------------------------------------------------------- 1 | from pyseoanalyzer import http 2 | 3 | 4 | def test_http(): 5 | assert http.http.get("https://www.sethserver.com/tests/utf8.html") 6 | -------------------------------------------------------------------------------- /tests/test_llm_analyst.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyseoanalyzer.llm_analyst import LLMSEOEnhancer 3 | from langchain_anthropic import ChatAnthropic 4 | from langchain.chains import LLMChain 5 | from langchain.prompts import PromptTemplate 6 | import json 7 | 8 | 9 | @pytest.fixture 10 | def seo_data(): 11 | return { 12 | "title": "Test Title", 13 | "description": "Test Description", 14 | "keywords": ["test", "seo"], 15 | "content": "This is a test content.", 16 | } 17 | 18 | 19 | def test_init(): 20 | enhancer = LLMSEOEnhancer() 21 | assert isinstance(enhancer.llm, ChatAnthropic) 22 | assert enhancer.llm.model == "claude-3-sonnet-20240229" 23 | assert enhancer.llm.temperature == 0 24 | 25 | 26 | @pytest.mark.asyncio 27 | async def test_enhance_seo_analysis(seo_data): 28 | enhancer = LLMSEOEnhancer() 29 | result = await enhancer.enhance_seo_analysis(seo_data) 30 | 31 | assert "summary" in result 32 | 33 | assert "entity_analysis" in result["detailed_analysis"] 34 | assert "credibility_analysis" in result["detailed_analysis"] 35 | assert "conversation_analysis" in result["detailed_analysis"] 36 | assert "cross_platform_presence" in result["detailed_analysis"] 37 | assert "recommendations" in result["detailed_analysis"] 38 | -------------------------------------------------------------------------------- /tests/test_page.py: -------------------------------------------------------------------------------- 1 | from pyseoanalyzer import page 2 | 3 | 4 | def test_page_init(): 5 | p = page.Page( 6 | url="https://www.sethserver.com/sitemap.xml", 7 | base_domain="https://www.sethserver.com/", 8 | ) 9 | 10 | assert p.base_domain.scheme == "https" 11 | assert p.base_domain.netloc == "www.sethserver.com" 12 | assert p.base_domain.path == "/" 13 | 14 | assert p.url == "https://www.sethserver.com/sitemap.xml" 15 | 16 | assert p.title == "" 17 | assert p.description == "" 18 | assert p.keywords == {} 19 | assert p.warnings == [] 20 | assert p.links == [] 21 | 22 | 23 | def test_analyze(): 24 | p = page.Page( 25 | url="https://www.sethserver.com/", base_domain="https://www.sethserver.com/" 26 | ) 27 | 28 | assert p.analyze() 29 | 30 | assert "seth" in p.title.lower() 31 | 32 | 33 | def test_analyze_with_llm(): 34 | p = page.Page( 35 | url="https://www.sethserver.com/", 36 | base_domain="https://www.sethserver.com/", 37 | run_llm_analysis=True, 38 | ) 39 | 40 | assert p.analyze() 41 | 42 | assert "seth" in p.title.lower() 43 | assert "summary" in p.llm_analysis 
44 | --------------------------------------------------------------------------------
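The tests above exercise the Anthropic-backed LLM path end to end. As a reference, here is a minimal sketch of the same flow outside pytest; it is an assumption-laden sketch rather than documented API: the URL is illustrative, the dictionary keys come from the assertions in tests/test_page.py and tests/test_llm_analyst.py, and the ANTHROPIC_API_KEY check reflects langchain-anthropic's usual environment-based configuration rather than anything shown in llm_analyst.py here.

import asyncio
import os

from pyseoanalyzer.llm_analyst import LLMSEOEnhancer
from pyseoanalyzer.page import Page

# langchain-anthropic normally reads the API key from the environment,
# so fail fast if it is missing (assumed variable name).
assert os.environ.get("ANTHROPIC_API_KEY"), "set ANTHROPIC_API_KEY first"

# Option 1: let Page drive extraction plus LLM analysis, as tests/test_page.py does.
page = Page(
    url="https://www.sethserver.com/",
    base_domain="https://www.sethserver.com/",
    run_llm_analysis=True,
)
page.analyze()
print(page.llm_analysis["summary"])

# Option 2: run the enhancer directly on already-extracted content,
# mirroring tests/test_llm_analyst.py; enhance_seo_analysis() is a coroutine.
enhancer = LLMSEOEnhancer()
enhanced = asyncio.run(enhancer.enhance_seo_analysis(page.content))
print(enhanced["detailed_analysis"]["recommendations"])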