├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── docker-build-push.yml │ └── pypi-publish.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.docker.md ├── README.md ├── pyproject.toml ├── pyseoanalyzer ├── __init__.py ├── __main__.py ├── analyzer.py ├── http.py ├── llm_analyst.py ├── page.py ├── stopwords.py ├── templates │ └── index.html └── website.py ├── requirements.txt ├── test.py └── tests ├── __init__.py ├── test_analyzer.py ├── test_http.py ├── test_llm_analyst.py └── test_page.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .env 2 | .vscode 3 | .github 4 | .pytest_cache 5 | .git 6 | .dockerignore 7 | .gitignore 8 | *.pyc 9 | env/ 10 | venv/ 11 | */__pycache__/* 12 | tests/ 13 | Dockerfile 14 | *.pyc -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/docker-build-push.yml: -------------------------------------------------------------------------------- 1 | name: Manual Docker Build and Push 2 | 3 | on: 4 | workflow_dispatch: # Allows manual triggering 5 | 6 | # Add permissions for pushing packages and OIDC token 7 | permissions: 8 | contents: read 9 | packages: write # Needed to push container images 10 | id-token: write # Needed for signing/attestations 11 | 12 | jobs: 13 | build-and-push: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Log in to Docker Hub 20 | uses: docker/login-action@v3 21 | with: 22 | username: ${{ secrets.DOCKERHUB_USERNAME }} 23 | password: ${{ secrets.DOCKERHUB_TOKEN }} 24 | 25 | # Install the cosign tool 26 | # https://github.com/sigstore/cosign-installer 27 | - name: Install cosign 28 | uses: sigstore/cosign-installer@v3.5.0 29 | with: 30 | cosign-release: 'v2.2.4' 31 | 32 | # Setup Docker buildx 33 | # https://github.com/docker/build-push-action/issues/461 34 | - name: Setup Docker buildx 35 | uses: docker/setup-buildx-action@v3 36 | 37 | # Extract metadata (tags, labels) for Docker 38 | # https://github.com/docker/metadata-action 39 | - name: Extract Docker metadata 40 | id: meta 41 | uses: docker/metadata-action@v5 42 | with: 43 | images: sethblack/python-seo-analyzer # Your Docker Hub image 44 | tags: | 45 | # Add short SHA tag based on the Git commit, disable automatic latest promotion 46 | type=sha,format=short,flavor=latest=false 47 | # Explicitly add the 'latest' tag for all manual runs 48 | type=raw,value=latest,enable=true 49 | 50 | # Build and push Docker image with attestation 51 | # https://github.com/docker/build-push-action 52 | - name: Build and push Docker image 53 | id: build-and-push # Add id to reference outputs 54 | uses: docker/build-push-action@v5 55 | with: 56 | context: . 57 | push: true 58 | tags: ${{ steps.meta.outputs.tags }} # Use tags from metadata 59 | labels: ${{ steps.meta.outputs.labels }} # Use labels from metadata 60 | # Attestations for provenance and SBOM 61 | # Correct format: type=,= 62 | attests: | 63 | type=provenance,builder-id=${{ github.workflow }}/${{ github.job_id }} 64 | type=sbom,scan-mode=local,scan-args=--exclude=./tests 65 | 66 | # Sign the resulting Docker image digest. 67 | # https://github.com/sigstore/cosign 68 | - name: Sign the published Docker image 69 | env: 70 | # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-repository-for-the-build 71 | COSIGN_EXPERIMENTAL: "true" 72 | # This step uses the identity token to provision an ephemeral certificate 73 | # against the sigstore community Fulcio instance. 
74 | run: echo "${{ steps.meta.outputs.tags }}" | xargs -I {} cosign sign --yes {}@${{ steps.build-and-push.outputs.digest }} 75 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: Manual PyPI Publish 2 | 3 | on: 4 | workflow_dispatch: # Allows manual triggering 5 | 6 | permissions: 7 | contents: read # Needed to checkout the repository 8 | 9 | jobs: 10 | publish-to-pypi: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.x' # Use an appropriate Python version 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install build twine 25 | 26 | - name: Build package 27 | run: python -m build 28 | 29 | - name: Publish package to PyPI 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | with: 32 | user: __token__ 33 | password: ${{ secrets.PYPI_API_TOKEN }} 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # I don't want the python virtual env in github! 2 | venv 3 | env 4 | .env 5 | 6 | # nor visual 7 | .vscode 8 | 9 | *.py[cod] 10 | 11 | # C extensions 12 | *.so 13 | */__pychache__/* 14 | 15 | # Packages 16 | *.egg 17 | *.egg-info 18 | dist 19 | build 20 | eggs 21 | parts 22 | bin 23 | var 24 | sdist 25 | develop-eggs 26 | .installed.cfg 27 | lib 28 | lib64 29 | 30 | # Installer logs 31 | pip-log.txt 32 | 33 | # Unit test / coverage reports 34 | .coverage 35 | .tox 36 | nosetests.xml 37 | 38 | # Translations 39 | *.mo 40 | 41 | # Mr Developer 42 | .mr.developer.cfg 43 | .project 44 | .pydevproject 45 | 46 | # Output directory 47 | output/ 48 | build/ 49 | 50 | .DS_Store 51 | 52 | # ipython 53 | *.ipynb 54 | .ipynb_checkpoints/* 55 | 56 | 57 | # PyCharm 58 | .idea/* 59 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at seth@sethserver.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.13.2-bookworm 2 | 3 | RUN apt-get update -y && apt-get upgrade -y 4 | 5 | RUN pip3 install --upgrade pip 6 | RUN pip3 install uv 7 | 8 | COPY ./requirements.txt /python-seo-analyzer/ 9 | 10 | RUN uv pip install --system --verbose --requirement /python-seo-analyzer/requirements.txt 11 | RUN uv cache clean --verbose 12 | 13 | COPY . /python-seo-analyzer 14 | 15 | # Create a non-root user 16 | RUN groupadd -r appgroup && useradd --no-log-init -r -g appgroup appuser 17 | 18 | # Set ownership of the app directory 19 | RUN chown -R appuser:appgroup /python-seo-analyzer 20 | 21 | # Switch back to root to install the package system-wide 22 | USER root 23 | RUN python3 -m pip install /python-seo-analyzer 24 | 25 | # Switch back to the non-root user 26 | USER appuser 27 | 28 | WORKDIR /app 29 | 30 | ENTRYPOINT ["python-seo-analyzer"] 31 | CMD ["--version"] 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2012-2025 Seth Black. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. The name of Seth Black may not be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 21 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 24 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | graft seoanalyzer/templates 4 | 5 | -------------------------------------------------------------------------------- /README.docker.md: -------------------------------------------------------------------------------- 1 | # Docker Usage for Python SEO Analyzer 2 | 3 | This document provides instructions on how to build and run the `python-seo-analyzer` tool using Docker. 4 | 5 | ## Overview 6 | 7 | The Docker image provides a self-contained environment to run the `python-seo-analyzer` command-line tool without needing to install Python or dependencies directly on your host system. 8 | 9 | The image is based on `python:3.13.2-bookworm` and includes all necessary dependencies specified in `requirements.txt`. 10 | 11 | ## Building the Image (Optional) 12 | 13 | While pre-built images might be available (e.g., via GitHub Packages), you can build the image locally using the provided `Dockerfile`: 14 | 15 | ```bash 16 | docker build -t python-seo-analyzer . 17 | ``` 18 | 19 | ## Running the Container 20 | 21 | The container is configured to run the `python-seo-analyzer` command directly. You pass the command-line arguments for the tool after the image name. The official image is available at `sethblack/python-seo-analyzer:latest`. 22 | 23 | **Default Command (Show Version):** 24 | 25 | If you run the container without any arguments, it executes the default command (`--version`): 26 | 27 | ```bash 28 | docker run --rm sethblack/python-seo-analyzer:latest 29 | ``` 30 | *(Note: The examples below use `sethblack/python-seo-analyzer:latest`. If you built the image locally with a different tag, replace the image name accordingly.)* 31 | 32 | **Analyzing a Website:** 33 | 34 | To analyze a website, provide the site URL as the main argument: 35 | 36 | ```bash 37 | # Analyze a site and output JSON (default) 38 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com 39 | 40 | # Analyze a site and output HTML 41 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com -f html > analysis_report.html 42 | 43 | # Analyze a site using a sitemap 44 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com -s https://example.com/sitemap.xml 45 | 46 | # Analyze with heading analysis enabled 47 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com --analyze-headings 48 | 49 | # Analyze without following internal links 50 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com --no-follow-links 51 | 52 | # Analyze with LLM analysis (requires appropriate environment variables for the LLM provider, e.g., ANTHROPIC_API_KEY) 53 | # You'll need to pass environment variables using the -e flag 54 | docker run --rm -e ANTHROPIC_API_KEY=your_api_key sethblack/python-seo-analyzer:latest https://example.com --run-llm-analysis 55 | ``` 56 | 57 | ## Command-Line Arguments 58 | 59 | The `python-seo-analyzer` tool accepts the following arguments when run via Docker: 60 | 61 | * `site`: (Required) The URL of the website you want to analyze. 62 | * `-s`, `--sitemap`: URL of the sitemap to seed the crawler with. 63 | * `-f`, `--output-format`: Output format. Choices: `json` (default), `html`. 64 | * `--analyze-headings`: Enable analysis of heading tags (h1-h6). Default: `False`. 
65 | * `--analyze-extra-tags`: Enable analysis of other additional tags. Default: `False`. 66 | * `--no-follow-links`: Disable following internal links during the crawl. By default, the crawler *does* follow internal links. Use this flag to prevent that behavior. 67 | * `--run-llm-analysis`: Run Large Language Model (LLM) analysis on the content. Requires API keys to be configured via environment variables (e.g., `ANTHROPIC_API_KEY`). Default: `False`. 68 | * `--version`: Display the tool's version and exit. (This is the default command if no other arguments are provided). 69 | 70 | ## Examples 71 | 72 | **Analyze `sethserver.com` and save the output as HTML:** 73 | 74 | ```bash 75 | docker run --rm sethblack/python-seo-analyzer:latest https://sethserver.com -f html > sethserver_report.html 76 | ``` 77 | 78 | **Analyze `github.com` using its sitemap and output JSON:** 79 | 80 | ```bash 81 | docker run --rm sethblack/python-seo-analyzer:latest https://github.com -s https://github.com/sitemap.xml 82 | ``` 83 | 84 | **Analyze `example.com` with heading analysis but without following internal links:** 85 | 86 | ```bash 87 | docker run --rm sethblack/python-seo-analyzer:latest https://example.com --analyze-headings --no-follow-links 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Python SEO and GEO Analyzer 2 | =========================== 3 | 4 | [![PyPI version](https://badge.fury.io/py/pyseoanalyzer.svg)](https://badge.fury.io/py/pyseoanalyzer) 5 | [![Docker Pulls](https://img.shields.io/docker/pulls/sethblack/python-seo-analyzer.svg)](https://hub.docker.com/r/sethblack/python-seo-analyzer) 6 | 7 | A modern SEO and GEO (Generative AI Engine Optimization or better AI Search Optimization) analysis tool that combines technical optimization and authentic human value. Beyond traditional site crawling and structure analysis, it uses AI to evaluate content's expertise signals, conversational engagement, and cross-platform presence. It helps you maintain strong technical foundations while ensuring your site demonstrates genuine authority and value to real users. 8 | 9 | The AI features were heavily influenced by the clickbait-titled SEL article [A 13-point roadmap for thriving in the age of AI search](https://searchengineland.com/seo-roadmap-ai-search-449199). 10 | 11 | Note About Python 12 | ----------------- 13 | 14 | I've written quite a bit about the speed of Python and how there are very specific use cases where it isn't the best choice. I feel like crawling websites is definitely one of those cases. I wrote this tool in Python around 2010 to solve the very specific need of crawling some small HTML-only websites for startups I was working at. I'm excited to see how much it has grown and how many people are using it. I feel like Python SEO Analyzer is acceptable for most smaller use cases, but if you are looking for something better, I've built a much faster and more comprehensive tool [Black SEO Analyzer](https://github.com/sethblack/black-seo-analyzer). 15 | 16 | -Seth 17 | 18 | Installation 19 | ------------ 20 | 21 | ### PIP 22 | 23 | ``` 24 | pip install pyseoanalyzer 25 | ``` 26 | 27 | ### Docker 28 | 29 | #### Using the Pre-built Image from Docker Hub 30 | 31 | The easiest way to use the Docker image is to pull it directly from [Docker Hub](https://hub.docker.com/r/sethblack/python-seo-analyzer). 
32 | 33 | ```bash 34 | # Pull the latest image 35 | docker pull sethblack/python-seo-analyzer:latest 36 | 37 | # Run the analyzer (replace example.com with the target URL) 38 | # The --rm flag automatically removes the container when it exits 39 | docker run --rm sethblack/python-seo-analyzer http://example.com/ 40 | 41 | # Run with specific arguments (e.g., sitemap and HTML output) 42 | # Note: If the sitemap is local, you'll need to mount it (see mounting example below) 43 | docker run --rm sethblack/python-seo-analyzer http://example.com/ --sitemap /path/inside/container/sitemap.xml --output-format html 44 | 45 | # Run with AI analysis (requires ANTHROPIC_API_KEY) 46 | # Replace "your_api_key_here" with your actual Anthropic API key 47 | docker run --rm -e ANTHROPIC_API_KEY="your_api_key_here" sethblack/python-seo-analyzer http://example.com/ --run-llm-analysis 48 | 49 | # Save HTML output to your local machine 50 | # This mounts the current directory (.) into /app/output inside the container. 51 | # The output file 'results.html' will be saved in your current directory. 52 | # The tool outputs JSON by default to stdout, so we redirect it for HTML. 53 | # Since the image's ENTRYPOINT is the analyzer itself, we override it with /bin/sh. 54 | # We need a shell inside the container to handle the redirection. 55 | docker run --rm -v "$(pwd):/app/output" --entrypoint /bin/sh sethblack/python-seo-analyzer -c "python-seo-analyzer http://example.com/ --output-format html > /app/output/results.html" 56 | # Note for Windows CMD users: Use %cd% instead of $(pwd) 57 | # docker run --rm -v "%cd%:/app/output" --entrypoint /bin/sh sethblack/python-seo-analyzer -c "python-seo-analyzer http://example.com/ --output-format html > /app/output/results.html" 58 | # Note for Windows PowerShell users: Use ${pwd} instead of $(pwd) 59 | # docker run --rm -v "${pwd}:/app/output" --entrypoint /bin/sh sethblack/python-seo-analyzer -c "python-seo-analyzer http://example.com/ --output-format html > /app/output/results.html" 60 | 61 | 62 | # Mount a local sitemap file 63 | # This mounts 'local-sitemap.xml' from the current directory to '/app/sitemap.xml' inside the container 64 | docker run --rm -v "$(pwd)/local-sitemap.xml:/app/sitemap.xml" sethblack/python-seo-analyzer http://example.com/ --sitemap /app/sitemap.xml 65 | # Adjust paths and Windows commands as needed (see volume mounting example above) 66 | 67 | ``` 68 | 69 | #### Building the Image Locally 70 | 71 | You can also build the Docker image yourself from the source code. Make sure you have Docker installed and running. 72 | 73 | ```bash 74 | # Clone the repository (if you haven't already) 75 | # git clone https://github.com/sethblack/python-seo-analyzer.git 76 | # cd python-seo-analyzer 77 | 78 | # Build the Docker image (tag it as 'my-seo-analyzer' for easy reference) 79 | docker build -t my-seo-analyzer . 80 | 81 | # Run the locally built image 82 | docker run --rm my-seo-analyzer http://example.com/ 83 | 84 | # Run with AI analysis using the locally built image 85 | docker run --rm -e ANTHROPIC_API_KEY="your_api_key_here" my-seo-analyzer http://example.com/ --run-llm-analysis 86 | 87 | # Run with HTML output saved locally using the built image 88 | docker run --rm -v "$(pwd):/app/output" --entrypoint /bin/sh my-seo-analyzer -c "python-seo-analyzer http://example.com/ --output-format html > /app/output/results.html" 89 | # Adjust Windows commands as needed (see volume mounting example above) 90 | ``` 91 | 92 | Command-line Usage 93 | ------------------ 94 | 95 | If you run without a sitemap, it will start crawling at the homepage.
96 | 97 | ```sh 98 | python-seo-analyzer http://www.domain.com/ 99 | ``` 100 | 101 | Or you can specify the path to a sitemap to seed the list of URLs to scan. 102 | 103 | ```sh 104 | python-seo-analyzer http://www.domain.com/ --sitemap path/to/sitemap.xml 105 | ``` 106 | 107 | HTML output can be generated from the analysis instead of JSON. 108 | 109 | ```sh 110 | python-seo-analyzer http://www.domain.com/ --output-format html 111 | ``` 112 | 113 | API 114 | --- 115 | 116 | The `analyze` function returns a dictionary with the results of the crawl. 117 | 118 | ```python 119 | from pyseoanalyzer import analyze 120 | 121 | output = analyze(site, sitemap) 122 | 123 | print(output) 124 | ``` 125 | 126 | To analyze heading tags (h1-h6) and other additional tags as well, pass the following options to the `analyze` function: 127 | ```python 128 | from pyseoanalyzer import analyze 129 | 130 | output = analyze(site, sitemap, analyze_headings=True, analyze_extra_tags=True) 131 | 132 | print(output) 133 | ``` 134 | 135 | By default, the `analyze` function analyzes all the existing inner links as well, which can be time-consuming. 136 | This default behaviour can be changed to analyze only the provided URL by passing the following option to the `analyze` function: 137 | ```python 138 | from pyseoanalyzer import analyze 139 | 140 | output = analyze(site, sitemap, follow_links=False) 141 | 142 | print(output) 143 | ``` 144 | 145 | Alternatively, you can run the analysis as a Python module. 146 | 147 | ```sh 148 | python -m pyseoanalyzer https://www.sethserver.com/ -f html > results.html 149 | ``` 150 | 151 | AI Optimization 152 | --------------- 153 | 154 | The first pass of AI optimization features uses Anthropic's `claude-3-sonnet-20240229` model to evaluate the content of the site. You will need an API key from [Anthropic](https://www.anthropic.com/) to use this feature. The API key needs to be set as the environment variable `ANTHROPIC_API_KEY`. I recommend using a `.env` file to set this variable. Once the API key is set, the AI optimization features can be enabled with the `--run-llm-analysis` flag. 155 | 156 | Notes 157 | ----- 158 | 159 | If you get `requests.exceptions.SSLError` at either the command line or via the Python API, try using: 160 | - http://www.foo.bar 161 | 162 | **instead** of: 163 | 164 | - https://www.foo.bar 165 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "pyseoanalyzer" 7 | version = "2025.4.3" 8 | authors = [ 9 | {name = "Seth Black", email = "sblack@sethserver.com"}, 10 | ] 11 | dependencies = [ 12 | "beautifulsoup4==4.13.3", 13 | "certifi==2025.1.31", 14 | "Jinja2==3.1.6", 15 | "langchain==0.3.22", 16 | "langchain-anthropic==0.3.10", 17 | "lxml==5.3.1", 18 | "MarkupSafe==3.0.2", 19 | "python-dotenv==1.1.0", 20 | "trafilatura==2.0.0", 21 | "urllib3==2.3.0", 22 | ] 23 | requires-python = ">= 3.8" 24 | description = "An SEO tool that analyzes the structure of a site, crawls the site, counts words in the body of the site, and warns of any technical SEO issues."
25 | readme = "README.md" 26 | license = {file = "LICENSE"} 27 | keywords = [ 28 | "search engine optimization", 29 | "seo", 30 | "website parser", 31 | "crawler", 32 | "scraper", 33 | "site analyzer", 34 | "site parser", 35 | "site crawler", 36 | ] 37 | classifiers = [ 38 | "Development Status :: 5 - Production/Stable", 39 | "Programming Language :: Python", 40 | "Programming Language :: Python :: 3", 41 | "Programming Language :: Python :: 3 :: Only", 42 | "Environment :: Console", 43 | "Intended Audience :: Developers", 44 | "License :: OSI Approved :: BSD License", 45 | "Operating System :: OS Independent", 46 | "Topic :: Internet :: WWW/HTTP :: Indexing/Search", 47 | "Topic :: Software Development :: Libraries :: Python Modules", 48 | "Topic :: Text Processing", 49 | "Topic :: Internet :: WWW/HTTP", 50 | ] 51 | 52 | [project.scripts] 53 | python-seo-analyzer = "pyseoanalyzer.__main__:main" 54 | 55 | [project.urls] 56 | Homepage = "https://github.com/sethblack/python-seo-analyzer" 57 | Repository = "https://github.com/sethblack/python-seo-analyzer.git" 58 | Issues = "https://github.com/sethblack/python-seo-analyzer/issues" 59 | -------------------------------------------------------------------------------- /pyseoanalyzer/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | 5 | # Use importlib.metadata (available in Python 3.8+) to get the version 6 | # defined in pyproject.toml. This avoids duplicating the version string. 7 | if sys.version_info >= (3, 8): 8 | from importlib import metadata 9 | else: 10 | # Fallback for Python < 3.8 (requires importlib-metadata backport) 11 | # Consider adding 'importlib-metadata; python_version < "3.8"' to dependencies 12 | # if you need to support older Python versions. 13 | import importlib_metadata as metadata 14 | 15 | try: 16 | # __package__ refers to the package name ('pyseoanalyzer') 17 | __version__ = metadata.version(__package__) 18 | except metadata.PackageNotFoundError: 19 | # Fallback if the package is not installed (e.g., when running from source) 20 | # You might want to handle this differently, e.g., raise an error 21 | # or read from a VERSION file. For now, setting it to unknown. 22 | __version__ = "0.0.0-unknown" 23 | 24 | 25 | from .analyzer import analyze 26 | -------------------------------------------------------------------------------- /pyseoanalyzer/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import inspect 5 | import json 6 | import os 7 | import sys 8 | 9 | from .analyzer import analyze 10 | from . import __version__ 11 | 12 | 13 | def main(): 14 | module_path = os.path.dirname(inspect.getfile(analyze)) 15 | arg_parser = argparse.ArgumentParser( 16 | description="Analyze SEO aspects of a website." 17 | ) 18 | arg_parser.add_argument( 19 | "--version", action="version", version=f"%(prog)s {__version__}" 20 | ) 21 | arg_parser.add_argument("site", help="URL of the site you are wanting to analyze.") 22 | arg_parser.add_argument( 23 | "-s", "--sitemap", help="URL of the sitemap to seed the crawler with." 
24 | ) 25 | arg_parser.add_argument( 26 | "-f", 27 | "--output-format", 28 | help="Output format.", 29 | choices=[ 30 | "json", 31 | "html", 32 | ], 33 | default="json", 34 | ) 35 | 36 | arg_parser.add_argument( 37 | "--analyze-headings", 38 | default=False, 39 | action="store_true", 40 | help="Analyze heading tags (h1-h6).", 41 | ) 42 | arg_parser.add_argument( 43 | "--analyze-extra-tags", 44 | default=False, 45 | action="store_true", 46 | help="Analyze other extra additional tags.", 47 | ) 48 | arg_parser.add_argument( 49 | "--no-follow-links", 50 | default=True, 51 | action="store_false", 52 | help="Analyze all the existing inner links as well (might be time consuming).", 53 | ) 54 | arg_parser.add_argument( 55 | "--run-llm-analysis", 56 | default=False, 57 | action="store_true", 58 | help="Run LLM analysis on the content.", 59 | ) 60 | 61 | args = arg_parser.parse_args() 62 | 63 | output = analyze( 64 | args.site, 65 | args.sitemap, 66 | analyze_headings=args.analyze_headings, 67 | analyze_extra_tags=args.analyze_extra_tags, 68 | follow_links=args.no_follow_links, 69 | run_llm_analysis=args.run_llm_analysis, 70 | ) 71 | 72 | if args.output_format == "html": 73 | from jinja2 import Environment 74 | from jinja2 import FileSystemLoader 75 | 76 | env = Environment( 77 | loader=FileSystemLoader(os.path.join(module_path, "templates")) 78 | ) 79 | template = env.get_template("index.html") 80 | output_from_parsed_template = template.render(result=output) 81 | print(output_from_parsed_template) 82 | elif args.output_format == "json": 83 | print(json.dumps(output, indent=4, separators=(",", ": "))) 84 | 85 | 86 | if __name__ == "__main__": 87 | main() 88 | -------------------------------------------------------------------------------- /pyseoanalyzer/analyzer.py: -------------------------------------------------------------------------------- 1 | import time 2 | from operator import itemgetter 3 | from .website import Website 4 | 5 | 6 | def calc_total_time(start_time): 7 | return time.time() - start_time 8 | 9 | 10 | def analyze( 11 | url, 12 | sitemap_url=None, 13 | analyze_headings=False, 14 | analyze_extra_tags=False, 15 | follow_links=True, 16 | run_llm_analysis=False, 17 | ): 18 | start_time = time.time() 19 | 20 | output = { 21 | "pages": [], 22 | "keywords": [], 23 | "errors": [], 24 | "total_time": 0, # Initialize to 0 before calculation 25 | } 26 | 27 | site = Website( 28 | base_url=url, 29 | sitemap=sitemap_url, 30 | analyze_headings=analyze_headings, 31 | analyze_extra_tags=analyze_extra_tags, 32 | follow_links=follow_links, 33 | run_llm_analysis=run_llm_analysis, 34 | ) 35 | 36 | site.crawl() 37 | 38 | for p in site.crawled_pages: 39 | output["pages"].append(p.as_dict()) 40 | 41 | output["duplicate_pages"] = [ 42 | list(site.content_hashes[p]) 43 | for p in site.content_hashes 44 | if len(site.content_hashes[p]) > 1 45 | ] 46 | 47 | sorted_words = sorted(site.wordcount.items(), key=itemgetter(1), reverse=True) 48 | sorted_bigrams = sorted(site.bigrams.items(), key=itemgetter(1), reverse=True) 49 | sorted_trigrams = sorted(site.trigrams.items(), key=itemgetter(1), reverse=True) 50 | 51 | output["keywords"] = [] 52 | 53 | for w in sorted_words: 54 | if w[1] > 4: 55 | output["keywords"].append( 56 | { 57 | "word": w[0], 58 | "count": w[1], 59 | } 60 | ) 61 | 62 | for w, v in sorted_bigrams: 63 | if v > 4: 64 | output["keywords"].append( 65 | { 66 | "word": w, 67 | "count": v, 68 | } 69 | ) 70 | 71 | for w, v in sorted_trigrams: 72 | if v > 4: 73 | output["keywords"].append( 74 | { 75 | 
"word": w, 76 | "count": v, 77 | } 78 | ) 79 | 80 | # Sort one last time... 81 | output["keywords"] = sorted( 82 | output["keywords"], key=itemgetter("count"), reverse=True 83 | ) 84 | 85 | output["total_time"] = calc_total_time(start_time) 86 | 87 | return output 88 | -------------------------------------------------------------------------------- /pyseoanalyzer/http.py: -------------------------------------------------------------------------------- 1 | import certifi 2 | from urllib3 import PoolManager 3 | from urllib3 import Timeout 4 | 5 | 6 | class Http: 7 | def __init__(self): 8 | user_agent = {"User-Agent": "Mozilla/5.0"} 9 | 10 | self.http = PoolManager( 11 | timeout=Timeout(connect=2.0, read=7.0), 12 | cert_reqs="CERT_REQUIRED", 13 | ca_certs=certifi.where(), 14 | headers=user_agent, 15 | ) 16 | 17 | def get(self, url): 18 | return self.http.request("GET", url) 19 | 20 | 21 | http = Http() 22 | -------------------------------------------------------------------------------- /pyseoanalyzer/llm_analyst.py: -------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | from langchain_anthropic import ChatAnthropic 3 | from langchain.prompts import PromptTemplate 4 | from langchain.schema.runnable import RunnablePassthrough 5 | from langchain.output_parsers import PydanticOutputParser 6 | from pydantic import BaseModel, Field 7 | from typing import Dict, List, Optional 8 | 9 | import asyncio 10 | import json 11 | import os 12 | 13 | load_dotenv() 14 | 15 | 16 | # Pydantic models for structured output 17 | class EntityAnalysis(BaseModel): 18 | entity_assessment: str = Field( 19 | description="Detailed analysis of entity optimization" 20 | ) 21 | knowledge_panel_readiness: int = Field(description="Score from 0-100") 22 | key_improvements: List[str] = Field(description="Top 3 improvements needed") 23 | 24 | 25 | class CredibilityAnalysis(BaseModel): 26 | credibility_assessment: str = Field(description="Overall credibility analysis") 27 | neeat_scores: Dict[str, int] = Field( 28 | description="Individual N-E-E-A-T-T component scores" 29 | ) 30 | trust_signals: List[str] = Field(description="Identified trust signals") 31 | 32 | 33 | class ConversationAnalysis(BaseModel): 34 | conversation_readiness: str = Field(description="Overall assessment") 35 | query_patterns: List[str] = Field(description="Identified query patterns") 36 | engagement_score: int = Field(description="Score from 0-100") 37 | gaps: List[str] = Field(description="Identified conversational gaps") 38 | 39 | 40 | class PlatformPresence(BaseModel): 41 | platform_coverage: Dict[str, str] = Field( 42 | description="Coverage analysis per platform" 43 | ) 44 | visibility_scores: Dict[str, int] = Field(description="Scores per platform type") 45 | optimization_opportunities: List[str] = Field(description="List of opportunities") 46 | 47 | 48 | class SEORecommendations(BaseModel): 49 | strategic_recommendations: List[str] = Field( 50 | description="Major strategic recommendations" 51 | ) 52 | quick_wins: List[str] = Field(description="Immediate action items") 53 | long_term_strategy: List[str] = Field(description="Long-term strategic goals") 54 | priority_matrix: Dict[str, str] = Field( 55 | description="Priority matrix by impact/effort" 56 | ) 57 | 58 | 59 | class LLMSEOEnhancer: 60 | def __init__(self): 61 | self.llm = ChatAnthropic( 62 | model="claude-3-sonnet-20240229", 63 | anthropic_api_key=os.environ.get("ANTHROPIC_API_KEY"), 64 | temperature=0, 65 | timeout=30, 66 | 
max_retries=3, 67 | ) 68 | self._setup_chains() 69 | 70 | def _setup_chains(self): 71 | """Setup modern LangChain runnable sequences using pipe syntax""" 72 | # Entity Analysis Chain 73 | entity_parser = PydanticOutputParser(pydantic_object=EntityAnalysis) 74 | 75 | entity_prompt = PromptTemplate.from_template( 76 | """Analyze these SEO elements for entity optimization: 77 | 1. Entity understanding (Knowledge Panel readiness) 78 | 2. Brand credibility signals (N-E-E-A-T-T principles) 79 | 3. Entity relationships and mentions 80 | 4. Topic entity connections 81 | 5. Schema markup effectiveness 82 | 83 | Data to analyze: 84 | {seo_data} 85 | 86 | {format_instructions} 87 | 88 | Only return your ouput in JSON format. Do not include any explanations any other text. 89 | """ 90 | ) 91 | 92 | self.entity_chain = ( 93 | { 94 | "seo_data": RunnablePassthrough(), 95 | "format_instructions": lambda _: entity_parser.get_format_instructions(), 96 | } 97 | | entity_prompt 98 | | self.llm 99 | | entity_parser 100 | ) 101 | 102 | # Credibility Analysis Chain 103 | credibility_parser = PydanticOutputParser(pydantic_object=CredibilityAnalysis) 104 | 105 | credibility_prompt = PromptTemplate.from_template( 106 | """Evaluate these credibility aspects: 107 | 1. N-E-E-A-T-T signals 108 | 2. Entity understanding and validation 109 | 3. Content creator credentials 110 | 4. Publisher authority 111 | 5. Topic expertise signals 112 | 113 | Data to analyze: 114 | {seo_data} 115 | 116 | {format_instructions} 117 | 118 | Only return your ouput in JSON format. Do not include any explanations any other text. 119 | """ 120 | ) 121 | 122 | self.credibility_chain = ( 123 | { 124 | "seo_data": RunnablePassthrough(), 125 | "format_instructions": lambda _: credibility_parser.get_format_instructions(), 126 | } 127 | | credibility_prompt 128 | | self.llm 129 | | credibility_parser 130 | ) 131 | 132 | # Conversation Analysis Chain 133 | conversation_parser = PydanticOutputParser(pydantic_object=ConversationAnalysis) 134 | 135 | conversation_prompt = PromptTemplate.from_template( 136 | """Analyze content for conversational search readiness: 137 | 1. Query pattern matching 138 | 2. Intent coverage across funnel 139 | 3. Natural language understanding 140 | 4. Follow-up content availability 141 | 5. Conversational triggers 142 | 143 | Data to analyze: 144 | {seo_data} 145 | 146 | {format_instructions} 147 | 148 | Only return your ouput in JSON format. Do not include any explanations any other text. 149 | """ 150 | ) 151 | 152 | self.conversation_chain = ( 153 | { 154 | "seo_data": RunnablePassthrough(), 155 | "format_instructions": lambda _: conversation_parser.get_format_instructions(), 156 | } 157 | | conversation_prompt 158 | | self.llm 159 | | conversation_parser 160 | ) 161 | 162 | # Platform Presence Chain 163 | platform_parser = PydanticOutputParser(pydantic_object=PlatformPresence) 164 | 165 | platform_prompt = PromptTemplate.from_template( 166 | """Analyze presence across different platforms: 167 | 1. Search engines (Google, Bing) 168 | 2. Knowledge graphs 169 | 3. AI platforms (ChatGPT, Bard) 170 | 4. Social platforms 171 | 5. Industry-specific platforms 172 | 173 | Data to analyze: 174 | {seo_data} 175 | 176 | {format_instructions} 177 | 178 | Only return your ouput in JSON format. Do not include any explanations any other text. 
179 | """ 180 | ) 181 | 182 | self.platform_chain = ( 183 | { 184 | "seo_data": RunnablePassthrough(), 185 | "format_instructions": lambda _: platform_parser.get_format_instructions(), 186 | } 187 | | platform_prompt 188 | | self.llm 189 | | platform_parser 190 | ) 191 | 192 | # Recommendations Chain 193 | recommendations_parser = PydanticOutputParser( 194 | pydantic_object=SEORecommendations 195 | ) 196 | 197 | recommendations_prompt = PromptTemplate.from_template( 198 | """Based on this complete analysis, provide strategic recommendations: 199 | 1. Entity optimization strategy 200 | 2. Content strategy across platforms 201 | 3. Credibility building actions 202 | 4. Conversational optimization 203 | 5. Cross-platform presence improvement 204 | 205 | Analysis results: 206 | {analysis_results} 207 | 208 | {format_instructions} 209 | 210 | Only return your ouput in JSON format. Do not include any explanations any other text. 211 | """ 212 | ) 213 | 214 | self.recommendations_chain = ( 215 | { 216 | "analysis_results": RunnablePassthrough(), 217 | "format_instructions": lambda _: recommendations_parser.get_format_instructions(), 218 | } 219 | | recommendations_prompt 220 | | self.llm 221 | | recommendations_parser 222 | ) 223 | 224 | async def enhance_seo_analysis(self, seo_data: Dict) -> Dict: 225 | """ 226 | Enhanced SEO analysis using modern LangChain patterns 227 | """ 228 | # Convert seo_data to string for prompt insertion 229 | seo_data_str = json.dumps(seo_data, indent=2) 230 | 231 | # Run analysis chains in parallel 232 | entity_results, credibility_results, conversation_results, platform_results = ( 233 | await asyncio.gather( 234 | self.entity_chain.ainvoke(seo_data_str), 235 | self.credibility_chain.ainvoke(seo_data_str), 236 | self.conversation_chain.ainvoke(seo_data_str), 237 | self.platform_chain.ainvoke(seo_data_str), 238 | ) 239 | ) 240 | 241 | # Combine analyses 242 | combined_analysis = { 243 | "entity_analysis": entity_results.model_dump(), 244 | "credibility_analysis": credibility_results.model_dump(), 245 | "conversation_analysis": conversation_results.model_dump(), 246 | "cross_platform_presence": platform_results.model_dump(), 247 | } 248 | 249 | # Generate final recommendations 250 | recommendations = await self.recommendations_chain.ainvoke( 251 | json.dumps(combined_analysis, indent=2) 252 | ) 253 | 254 | # Combine all results 255 | final_results = { 256 | **seo_data, 257 | **combined_analysis, 258 | "recommendations": recommendations.model_dump(), 259 | } 260 | 261 | return self._format_output(final_results) 262 | 263 | def _format_output(self, raw_analysis: Dict) -> Dict: 264 | """Format analysis results into a clean, structured output""" 265 | return { 266 | "summary": { 267 | "entity_score": raw_analysis["entity_analysis"][ 268 | "knowledge_panel_readiness" 269 | ], 270 | "credibility_score": sum( 271 | raw_analysis["credibility_analysis"]["neeat_scores"].values() 272 | ) 273 | / 6, 274 | "conversation_score": raw_analysis["conversation_analysis"][ 275 | "engagement_score" 276 | ], 277 | "platform_score": sum( 278 | raw_analysis["cross_platform_presence"][ 279 | "visibility_scores" 280 | ].values() 281 | ) 282 | / len(raw_analysis["cross_platform_presence"]["visibility_scores"]), 283 | }, 284 | "detailed_analysis": raw_analysis, 285 | "quick_wins": raw_analysis["recommendations"]["quick_wins"], 286 | "strategic_recommendations": raw_analysis["recommendations"][ 287 | "strategic_recommendations" 288 | ], 289 | } 290 | 291 | 292 | # Example usage with async support 
293 | async def enhanced_modern_analyze( 294 | site: str, sitemap: Optional[str] = None, api_key: Optional[str] = None, **kwargs 295 | ): 296 | """ 297 | Enhanced analysis incorporating modern SEO principles using LangChain 298 | """ 299 | from pyseoanalyzer import analyze 300 | 301 | # Run original analysis 302 | original_results = analyze(site, sitemap, **kwargs) 303 | 304 | # Enhance with modern SEO analysis if API key provided 305 | if api_key: 306 | enhancer = LLMSEOEnhancer() 307 | # enhance_seo_analysis() already returns formatted output, so pass it through as-is 308 | return await enhancer.enhance_seo_analysis(original_results) 309 | 310 | return original_results 311 | -------------------------------------------------------------------------------- /pyseoanalyzer/page.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import hashlib 3 | import json 4 | import lxml.html as lh 5 | import os 6 | import re 7 | import trafilatura 8 | 9 | from bs4 import BeautifulSoup 10 | from collections import Counter 11 | from string import punctuation 12 | from urllib.parse import urlsplit 13 | from urllib3.exceptions import HTTPError 14 | 15 | from .http import http 16 | from .llm_analyst import LLMSEOEnhancer 17 | from .stopwords import ENGLISH_STOP_WORDS 18 | 19 | TOKEN_REGEX = re.compile(r"(?u)\b\w\w+\b") 20 | 21 | HEADING_TAGS_XPATHS = { 22 | "h1": "//h1", 23 | "h2": "//h2", 24 | "h3": "//h3", 25 | "h4": "//h4", 26 | "h5": "//h5", 27 | "h6": "//h6", 28 | } 29 | 30 | ADDITIONAL_TAGS_XPATHS = { 31 | "title": "//title/text()", 32 | "meta_desc": '//meta[@name="description"]/@content', 33 | "viewport": '//meta[@name="viewport"]/@content', 34 | "charset": "//meta[@charset]/@charset", 35 | "canonical": '//link[@rel="canonical"]/@href', 36 | "alt_href": '//link[@rel="alternate"]/@href', 37 | "alt_hreflang": '//link[@rel="alternate"]/@hreflang', 38 | "og_title": '//meta[@property="og:title"]/@content', 39 | "og_desc": '//meta[@property="og:description"]/@content', 40 | "og_url": '//meta[@property="og:url"]/@content', 41 | "og_image": '//meta[@property="og:image"]/@content', 42 | } 43 | 44 | IMAGE_EXTENSIONS = set( 45 | [ 46 | ".img", 47 | ".png", 48 | ".jpg", 49 | ".jpeg", 50 | ".gif", 51 | ".bmp", 52 | ".svg", 53 | ".webp", 54 | ".avif", 55 | ] 56 | ) 57 | 58 | 59 | class Page: 60 | """ 61 | Container for each page and the core analyzer.
62 | """ 63 | 64 | def __init__( 65 | self, 66 | url="", 67 | base_domain="", 68 | analyze_headings=False, 69 | analyze_extra_tags=False, 70 | encoding="utf-8", 71 | run_llm_analysis=False, 72 | ): 73 | """ 74 | Variables go here, *not* outside of __init__ 75 | """ 76 | 77 | self.base_domain = urlsplit(base_domain) 78 | self.parsed_url = urlsplit(url) 79 | self.url = url 80 | self.analyze_headings = analyze_headings 81 | self.analyze_extra_tags = analyze_extra_tags 82 | self.encoding = encoding 83 | self.run_llm_analysis = run_llm_analysis 84 | self.title: str = "" 85 | self.author: str = "" 86 | self.description: str = "" 87 | self.hostname: str = "" 88 | self.sitename: str 89 | self.date: str 90 | self.keywords = {} 91 | self.warnings = [] 92 | self.translation = bytes.maketrans( 93 | punctuation.encode(encoding), str(" " * len(punctuation)).encode(encoding) 94 | ) 95 | self.links = [] 96 | self.total_word_count = 0 97 | self.wordcount = Counter() 98 | self.bigrams = Counter() 99 | self.trigrams = Counter() 100 | self.stem_to_word = {} 101 | self.content: str = None 102 | self.content_hash: str = None 103 | 104 | if run_llm_analysis: 105 | self.llm_analysis = {} 106 | 107 | if analyze_headings: 108 | self.headings = {} 109 | 110 | if analyze_extra_tags: 111 | self.additional_info = {} 112 | 113 | def as_dict(self): 114 | """ 115 | Returns a dictionary that can be printed 116 | """ 117 | 118 | context = { 119 | "url": self.url, 120 | "title": self.title, 121 | "description": self.description, 122 | "author": self.author, 123 | "hostname": self.hostname, 124 | "sitename": self.sitename, 125 | "date": self.date, 126 | "word_count": self.total_word_count, 127 | "keywords": self.sort_freq_dist(self.keywords, limit=5), 128 | "bigrams": self.bigrams, 129 | "trigrams": self.trigrams, 130 | "warnings": self.warnings, 131 | "content_hash": self.content_hash, 132 | } 133 | 134 | if self.analyze_headings: 135 | context["headings"] = self.headings 136 | 137 | if self.analyze_extra_tags: 138 | context["additional_info"] = self.additional_info 139 | 140 | if self.run_llm_analysis: 141 | context["llm_analysis"] = self.llm_analysis 142 | 143 | return context 144 | 145 | def analyze_heading_tags(self, bs): 146 | """ 147 | Analyze the heading tags and populate the headings 148 | """ 149 | 150 | try: 151 | dom = lh.fromstring(str(bs)) 152 | except ValueError as _: 153 | dom = lh.fromstring(bs.encode(self.encoding)) 154 | for tag, xpath in HEADING_TAGS_XPATHS.items(): 155 | value = [heading.text_content() for heading in dom.xpath(xpath)] 156 | if value: 157 | self.headings.update({tag: value}) 158 | 159 | def analyze_additional_tags(self, bs): 160 | """ 161 | Analyze additional tags and populate the additional info 162 | """ 163 | 164 | try: 165 | dom = lh.fromstring(str(bs)) 166 | except ValueError as _: 167 | dom = lh.fromstring(bs.encode(self.encoding)) 168 | for tag, xpath in ADDITIONAL_TAGS_XPATHS.items(): 169 | value = dom.xpath(xpath) 170 | if value: 171 | self.additional_info.update({tag: value}) 172 | 173 | def analyze(self, raw_html=None): 174 | """ 175 | Analyze the page and populate the warnings list 176 | """ 177 | 178 | if not raw_html: 179 | valid_prefixes = [] 180 | 181 | # only allow http:// https:// and // 182 | for s in [ 183 | "http://", 184 | "https://", 185 | "//", 186 | ]: 187 | valid_prefixes.append(self.url.startswith(s)) 188 | 189 | if True not in valid_prefixes: 190 | self.warn(f"{self.url} does not appear to have a valid protocol.") 191 | return 192 | 193 | if 
self.url.startswith("//"): 194 | self.url = f"{self.base_domain.scheme}:{self.url}" 195 | 196 | if self.parsed_url.netloc != self.base_domain.netloc: 197 | self.warn(f"{self.url} is not part of {self.base_domain.netloc}.") 198 | return 199 | 200 | try: 201 | page = http.get(self.url) 202 | except HTTPError as e: 203 | self.warn(f"Returned {e}") 204 | return 205 | 206 | encoding = "utf8" 207 | 208 | if "content-type" in page.headers: 209 | encoding = page.headers["content-type"].split("charset=")[-1] 210 | 211 | if encoding.lower() not in ("text/html", "text/plain", self.encoding): 212 | self.warn(f"Can not read {encoding}") 213 | return 214 | else: 215 | raw_html = page.data.decode(self.encoding) 216 | 217 | self.content_hash = hashlib.sha1(raw_html.encode(self.encoding)).hexdigest() 218 | 219 | # Use trafilatura to extract metadata 220 | metadata = trafilatura.extract_metadata( 221 | filecontent=raw_html, 222 | default_url=self.url, 223 | extensive=True, 224 | ) 225 | 226 | # I want to grab values from this even if they don't exist 227 | metadata_dict = metadata.as_dict() if metadata else {} 228 | 229 | # Helper function to get value or default to "" if None or 'None' 230 | def get_meta_value(key): 231 | value = metadata_dict.get(key) 232 | return "" if value is None or value == "None" else value 233 | 234 | # Ensure fields are strings, defaulting to "" if None or 'None' 235 | self.title = get_meta_value("title") 236 | self.author = get_meta_value("author") 237 | self.description = get_meta_value("description") 238 | self.hostname = get_meta_value("hostname") 239 | self.sitename = get_meta_value("sitename") 240 | self.date = get_meta_value("date") 241 | metadata_keywords = get_meta_value("keywords") 242 | 243 | if len(metadata_keywords) > 0: 244 | self.warn( 245 | f"Keywords should be avoided as they are a spam indicator and no longer used by Search Engines" 246 | ) 247 | 248 | # use trafilatura to extract the content 249 | content = trafilatura.extract( 250 | raw_html, 251 | include_links=True, 252 | include_formatting=False, 253 | include_tables=True, 254 | include_images=True, 255 | output_format="json", 256 | ) 257 | 258 | self.content = json.loads(content) if content else None 259 | 260 | # remove HTML comments, they screw with BeautifulSoup 261 | html_without_comments = re.sub(r"<!--.*?-->", r"", raw_html, flags=re.DOTALL) 262 | 263 | # use BeautifulSoup to parse the more nuanced tags 264 | soup_lower = BeautifulSoup(html_without_comments.lower(), "html.parser") 265 | soup_unmodified = BeautifulSoup(html_without_comments, "html.parser") 266 | 267 | self.process_text(self.content["text"]) 268 | 269 | self.analyze_title() 270 | self.analyze_description() 271 | self.analyze_og(soup_lower) 272 | self.analyze_a_tags(soup_unmodified) 273 | self.analyze_img_tags(soup_lower) 274 | self.analyze_h1_tags(soup_lower) 275 | 276 | if self.analyze_headings: 277 | self.analyze_heading_tags(soup_unmodified) 278 | 279 | if self.analyze_extra_tags: 280 | self.analyze_additional_tags(soup_unmodified) 281 | 282 | if self.run_llm_analysis: 283 | self.llm_analysis = self.use_llm_analyzer() 284 | 285 | return True 286 | 287 | def use_llm_analyzer(self): 288 | """ 289 | Use the LLM analyzer to enhance the SEO analysis 290 | """ 291 | 292 | llm_enhancer = LLMSEOEnhancer() 293 | return asyncio.run(llm_enhancer.enhance_seo_analysis(self.content)) 294 | 295 | def word_list_freq_dist(self, wordlist): 296 | freq = [wordlist.count(w) for w in wordlist] 297 | return dict(zip(wordlist, freq)) 298 | 299 | def sort_freq_dist(self,
freqdist, limit=1): 300 | aux = [ 301 | (freqdist[key], self.stem_to_word[key]) 302 | for key in freqdist 303 | if freqdist[key] >= limit 304 | ] 305 | aux.sort() 306 | aux.reverse() 307 | return aux 308 | 309 | def raw_tokenize(self, rawtext): 310 | return TOKEN_REGEX.findall(rawtext.lower()) 311 | 312 | def tokenize(self, rawtext): 313 | return [ 314 | word 315 | for word in TOKEN_REGEX.findall(rawtext.lower()) 316 | if word not in ENGLISH_STOP_WORDS 317 | ] 318 | 319 | def getngrams(self, D, n=2): 320 | return zip(*[D[i:] for i in range(n)]) 321 | 322 | def process_text(self, page_text): 323 | tokens = self.tokenize(page_text) 324 | raw_tokens = self.raw_tokenize(page_text) 325 | self.total_word_count = len(raw_tokens) 326 | 327 | bigrams = self.getngrams(raw_tokens, 2) 328 | 329 | for ng in bigrams: 330 | vt = " ".join(ng) 331 | self.bigrams[vt] += 1 332 | 333 | trigrams = self.getngrams(raw_tokens, 3) 334 | 335 | for ng in trigrams: 336 | vt = " ".join(ng) 337 | self.trigrams[vt] += 1 338 | 339 | freq_dist = self.word_list_freq_dist(tokens) 340 | 341 | for word in freq_dist: 342 | cnt = freq_dist[word] 343 | 344 | if word not in self.stem_to_word: 345 | self.stem_to_word[word] = word 346 | 347 | if word in self.wordcount: 348 | self.wordcount[word] += cnt 349 | else: 350 | self.wordcount[word] = cnt 351 | 352 | if word in self.keywords: 353 | self.keywords[word] += cnt 354 | else: 355 | self.keywords[word] = cnt 356 | 357 | def analyze_og(self, bs): 358 | """ 359 | Validate open graph tags 360 | """ 361 | og_title = bs.findAll("meta", attrs={"property": "og:title"}) 362 | og_description = bs.findAll("meta", attrs={"property": "og:description"}) 363 | og_image = bs.findAll("meta", attrs={"property": "og:image"}) 364 | 365 | if len(og_title) == 0: 366 | self.warn("Missing og:title") 367 | 368 | if len(og_description) == 0: 369 | self.warn("Missing og:description") 370 | 371 | if len(og_image) == 0: 372 | self.warn("Missing og:image") 373 | 374 | def analyze_title(self): 375 | """ 376 | Validate the title 377 | """ 378 | 379 | # getting lazy, create a local variable so save having to 380 | # type self.x a billion times 381 | t = self.title 382 | 383 | # calculate the length of the title once 384 | length = len(t) 385 | 386 | if length == 0: 387 | self.warn("Missing title tag") 388 | return 389 | elif length < 10: 390 | self.warn("Title tag is too short (less than 10 characters): {0}".format(t)) 391 | elif length > 70: 392 | self.warn("Title tag is too long (more than 70 characters): {0}".format(t)) 393 | 394 | def analyze_description(self): 395 | """ 396 | Validate the description 397 | """ 398 | 399 | # getting lazy, create a local variable so save having to 400 | # type self.x a billion times 401 | d = self.description 402 | 403 | # calculate the length of the description once 404 | length = len(d) 405 | 406 | if length == 0: 407 | self.warn("Missing description") 408 | return 409 | elif length < 140: 410 | self.warn( 411 | "Description is too short (less than 140 characters): {0}".format(d) 412 | ) 413 | elif length > 255: 414 | self.warn( 415 | "Description is too long (more than 255 characters): {0}".format(d) 416 | ) 417 | 418 | def visible_tags(self, element): 419 | if element.parent.name in ["style", "script", "[document]"]: 420 | return False 421 | 422 | return True 423 | 424 | def analyze_img_tags(self, bs): 425 | """ 426 | Verifies that each img has an alt and title 427 | """ 428 | images = bs.find_all("img") 429 | 430 | for image in images: 431 | src = "" 432 | if "src" in 
image: 433 | src = image["src"] 434 | elif "data-src" in image: 435 | src = image["data-src"] 436 | else: 437 | src = image 438 | 439 | if len(image.get("alt", "")) == 0: 440 | self.warn("Image missing alt tag: {0}".format(src)) 441 | 442 | def analyze_h1_tags(self, bs): 443 | """ 444 | Make sure each page has at least one H1 tag 445 | """ 446 | htags = bs.find_all("h1") 447 | 448 | if len(htags) == 0: 449 | self.warn("Each page should have at least one h1 tag") 450 | 451 | def analyze_a_tags(self, bs): 452 | """ 453 | Add any new links (that we didn't find in the sitemap) 454 | """ 455 | anchors = bs.find_all("a", href=True) 456 | 457 | for tag in anchors: 458 | tag_href = tag["href"] 459 | tag_text = tag.text.lower().strip() 460 | 461 | if len(tag.get("title", "")) == 0: 462 | self.warn("Anchor missing title tag: {0}".format(tag_href)) 463 | 464 | if tag_text in ["click here", "page", "article"]: 465 | self.warn("Anchor text contains generic text: {0}".format(tag_text)) 466 | 467 | if self.base_domain.netloc not in tag_href and ":" in tag_href: 468 | continue 469 | 470 | modified_url = self.rel_to_abs_url(tag_href) 471 | 472 | url_filename, url_file_extension = os.path.splitext(modified_url) 473 | 474 | # ignore links to images 475 | if url_file_extension in IMAGE_EXTENSIONS: 476 | continue 477 | 478 | # remove hash links to all urls 479 | if "#" in modified_url: 480 | modified_url = modified_url[: modified_url.rindex("#")] 481 | 482 | self.links.append(modified_url) 483 | 484 | def rel_to_abs_url(self, link): 485 | if ":" in link: 486 | return link 487 | 488 | relative_path = link 489 | domain = self.base_domain.netloc 490 | 491 | if domain[-1] == "/": 492 | domain = domain[:-1] 493 | 494 | if len(relative_path) > 0 and relative_path[0] == "?": 495 | if "?" in self.url: 496 | return f'{self.url[:self.url.index("?")]}{relative_path}' 497 | 498 | return f"{self.url}{relative_path}" 499 | 500 | if len(relative_path) > 0 and relative_path[0] != "/": 501 | relative_path = f"/{relative_path}" 502 | 503 | return f"{self.base_domain.scheme}://{domain}{relative_path}" 504 | 505 | def warn(self, warning): 506 | self.warnings.append(warning) 507 | -------------------------------------------------------------------------------- /pyseoanalyzer/stopwords.py: -------------------------------------------------------------------------------- 1 | # This list of English stop words is taken from the "Glasgow Information 2 | # Retrieval Group". 
The original list can be found at 3 | # http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words 4 | ENGLISH_STOP_WORDS = frozenset( 5 | [ 6 | "a", 7 | "about", 8 | "above", 9 | "across", 10 | "after", 11 | "afterwards", 12 | "again", 13 | "against", 14 | "all", 15 | "almost", 16 | "alone", 17 | "along", 18 | "already", 19 | "also", 20 | "although", 21 | "always", 22 | "am", 23 | "among", 24 | "amongst", 25 | "amoungst", 26 | "amount", 27 | "an", 28 | "and", 29 | "another", 30 | "any", 31 | "anyhow", 32 | "anyone", 33 | "anything", 34 | "anyway", 35 | "anywhere", 36 | "are", 37 | "around", 38 | "as", 39 | "at", 40 | "back", 41 | "be", 42 | "became", 43 | "because", 44 | "become", 45 | "becomes", 46 | "becoming", 47 | "been", 48 | "before", 49 | "beforehand", 50 | "behind", 51 | "being", 52 | "below", 53 | "beside", 54 | "besides", 55 | "between", 56 | "beyond", 57 | "bill", 58 | "both", 59 | "bottom", 60 | "but", 61 | "by", 62 | "call", 63 | "can", 64 | "cannot", 65 | "cant", 66 | "co", 67 | "con", 68 | "could", 69 | "couldnt", 70 | "cry", 71 | "de", 72 | "describe", 73 | "detail", 74 | "do", 75 | "done", 76 | "down", 77 | "due", 78 | "during", 79 | "each", 80 | "eg", 81 | "eight", 82 | "either", 83 | "eleven", 84 | "else", 85 | "elsewhere", 86 | "empty", 87 | "enough", 88 | "etc", 89 | "even", 90 | "ever", 91 | "every", 92 | "everyone", 93 | "everything", 94 | "everywhere", 95 | "except", 96 | "few", 97 | "fifteen", 98 | "fify", 99 | "fill", 100 | "find", 101 | "fire", 102 | "first", 103 | "five", 104 | "for", 105 | "former", 106 | "formerly", 107 | "forty", 108 | "found", 109 | "four", 110 | "from", 111 | "front", 112 | "full", 113 | "further", 114 | "get", 115 | "give", 116 | "go", 117 | "had", 118 | "has", 119 | "hasnt", 120 | "have", 121 | "he", 122 | "hence", 123 | "her", 124 | "here", 125 | "hereafter", 126 | "hereby", 127 | "herein", 128 | "hereupon", 129 | "hers", 130 | "herself", 131 | "him", 132 | "himself", 133 | "his", 134 | "how", 135 | "however", 136 | "hundred", 137 | "i", 138 | "ie", 139 | "if", 140 | "in", 141 | "inc", 142 | "indeed", 143 | "interest", 144 | "into", 145 | "is", 146 | "it", 147 | "its", 148 | "itself", 149 | "keep", 150 | "last", 151 | "latter", 152 | "latterly", 153 | "least", 154 | "less", 155 | "ltd", 156 | "made", 157 | "many", 158 | "may", 159 | "me", 160 | "meanwhile", 161 | "might", 162 | "mill", 163 | "mine", 164 | "more", 165 | "moreover", 166 | "most", 167 | "mostly", 168 | "move", 169 | "much", 170 | "must", 171 | "my", 172 | "myself", 173 | "name", 174 | "namely", 175 | "neither", 176 | "never", 177 | "nevertheless", 178 | "next", 179 | "nine", 180 | "no", 181 | "nobody", 182 | "none", 183 | "noone", 184 | "nor", 185 | "not", 186 | "nothing", 187 | "now", 188 | "nowhere", 189 | "of", 190 | "off", 191 | "often", 192 | "on", 193 | "once", 194 | "one", 195 | "only", 196 | "onto", 197 | "or", 198 | "other", 199 | "others", 200 | "otherwise", 201 | "our", 202 | "ours", 203 | "ourselves", 204 | "out", 205 | "over", 206 | "own", 207 | "part", 208 | "per", 209 | "perhaps", 210 | "please", 211 | "put", 212 | "rather", 213 | "re", 214 | "same", 215 | "see", 216 | "seem", 217 | "seemed", 218 | "seeming", 219 | "seems", 220 | "serious", 221 | "several", 222 | "she", 223 | "should", 224 | "show", 225 | "side", 226 | "since", 227 | "sincere", 228 | "six", 229 | "sixty", 230 | "so", 231 | "some", 232 | "somehow", 233 | "someone", 234 | "something", 235 | "sometime", 236 | "sometimes", 237 | "somewhere", 238 | "still", 239 | "such", 240 | "system", 241 | 
"take", 242 | "ten", 243 | "than", 244 | "that", 245 | "the", 246 | "their", 247 | "them", 248 | "themselves", 249 | "then", 250 | "thence", 251 | "there", 252 | "thereafter", 253 | "thereby", 254 | "therefore", 255 | "therein", 256 | "thereupon", 257 | "these", 258 | "they", 259 | "third", 260 | "this", 261 | "those", 262 | "though", 263 | "three", 264 | "through", 265 | "throughout", 266 | "thru", 267 | "thus", 268 | "to", 269 | "together", 270 | "too", 271 | "top", 272 | "toward", 273 | "towards", 274 | "twelve", 275 | "twenty", 276 | "two", 277 | "un", 278 | "under", 279 | "until", 280 | "up", 281 | "upon", 282 | "us", 283 | "very", 284 | "via", 285 | "was", 286 | "we", 287 | "well", 288 | "were", 289 | "what", 290 | "whatever", 291 | "when", 292 | "whence", 293 | "whenever", 294 | "where", 295 | "whereafter", 296 | "whereas", 297 | "whereby", 298 | "wherein", 299 | "whereupon", 300 | "wherever", 301 | "whether", 302 | "which", 303 | "while", 304 | "whither", 305 | "who", 306 | "whoever", 307 | "whole", 308 | "whom", 309 | "whose", 310 | "why", 311 | "will", 312 | "with", 313 | "within", 314 | "without", 315 | "would", 316 | "yet", 317 | "you", 318 | "your", 319 | "yours", 320 | "yourself", 321 | "yourselves", 322 | ] 323 | ) 324 | -------------------------------------------------------------------------------- /pyseoanalyzer/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | seo analysis 7 | 8 | 9 | 10 | 11 | 12 | 20 | 21 | 22 |
[index.html template body: renders the analyzer's `result` dict (see the rendering sketch below). If result['fatal_error'] is set it shows: "There was a fatal error when trying to format the output file as a webpage. Confirm that there is an output.json file in the directory. If not, you can reference the python-seo-analyzer documentation." Otherwise it renders "total time: {{result['total_time']|round(2)}} seconds" and a "go to:" navigation block; an "errors:" section listing {% for e in result['errors'] %} • {{e}} {% endfor %}; a "page analysis:" table (columns: page, word count, number of notices) with one expandable row per page showing {{page['url']}}, {{page['word_count']}} and {{page["warnings"]|length}}, then {{page["title"]}}, {{page["description"]}} and {% for err in page["warnings"] %} • {{err|e}} {% endfor %}; and a "keyword analysis:" table (columns: keywords, count) built from {% for key in result['keywords']%} {{ key['word'] }} {{ key['count'] }} {% endfor %}. Each section is wrapped in its matching {% if %} / {% endif %} guard.]
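For orientation, a minimal sketch of how this template is typically fed, mirroring test.py later in this listing; it assumes the packaged analyze() entry point and the template lookup used there, and the target URL and output filenames are illustrative only.

import inspect
import json
import os

from jinja2 import Environment, FileSystemLoader
from pyseoanalyzer import analyze

# Crawl and analyze; `result` is the dict the template reads
# (result['pages'], result['errors'], result['keywords'], result['total_time']).
result = analyze("https://www.sethserver.com/", follow_links=False)

# Locate the packaged templates directory the same way test.py does.
module_path = os.path.dirname(inspect.getfile(analyze))
env = Environment(loader=FileSystemLoader(os.path.join(module_path, "templates")))
html = env.get_template("index.html").render(result=result)

# Write the HTML report and a JSON copy (filenames are illustrative).
with open("seo_report.html", "w", encoding="utf-8") as f:
    f.write(html)
with open("output.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=4)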
109 | 110 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /pyseoanalyzer/website.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from urllib.parse import urlsplit 3 | from xml.dom import minidom 4 | import socket 5 | 6 | from .http import http 7 | from .page import Page 8 | 9 | 10 | class Website: 11 | def __init__( 12 | self, 13 | base_url, 14 | sitemap, 15 | analyze_headings=True, 16 | analyze_extra_tags=False, 17 | follow_links=False, 18 | run_llm_analysis=False, 19 | ): 20 | self.base_url = base_url 21 | self.sitemap = sitemap 22 | self.analyze_headings = analyze_headings 23 | self.analyze_extra_tags = analyze_extra_tags 24 | self.follow_links = follow_links 25 | self.run_llm_analysis = run_llm_analysis 26 | self.crawled_pages = [] 27 | self.crawled_urls = set() 28 | self.page_queue = [] 29 | self.wordcount = Counter() 30 | self.bigrams = Counter() 31 | self.trigrams = Counter() 32 | self.content_hashes = defaultdict(set) 33 | 34 | def check_dns(self, url_to_check): 35 | try: 36 | o = urlsplit(url_to_check) 37 | socket.gethostbyname_ex(o.hostname) 38 | return True 39 | except (socket.herror, socket.gaierror): 40 | return False 41 | 42 | def get_text_from_xml(self, nodelist): 43 | """ 44 | Stolen from the minidom documentation 45 | """ 46 | return "".join( 47 | node.data for node in nodelist if node.nodeType == node.TEXT_NODE 48 | ) 49 | 50 | def crawl(self): 51 | try: 52 | if self.sitemap: 53 | page = http.get(self.sitemap) 54 | if self.sitemap.endswith("xml"): 55 | xmldoc = minidom.parseString(page.data.decode("utf-8")) 56 | sitemap_urls = xmldoc.getElementsByTagName("loc") 57 | for url in sitemap_urls: 58 | self.page_queue.append(self.get_text_from_xml(url.childNodes)) 59 | elif self.sitemap.endswith("txt"): 60 | sitemap_urls = page.data.decode("utf-8").split("\n") 61 | for url in sitemap_urls: 62 | self.page_queue.append(url) 63 | 64 | self.page_queue.append(self.base_url) 65 | 66 | for url in self.page_queue: 67 | if url in self.crawled_urls: 68 | continue 69 | 70 | page = Page( 71 | url=url, 72 | base_domain=self.base_url, 73 | analyze_headings=self.analyze_headings, 74 | analyze_extra_tags=self.analyze_extra_tags, 75 | run_llm_analysis=self.run_llm_analysis, 76 | ) 77 | 78 | if page.parsed_url.netloc != page.base_domain.netloc: 79 | continue 80 | 81 | # Analyze the page and check if successful 82 | analysis_successful = page.analyze() 83 | 84 | # Only process and add the page if analysis completed 85 | if analysis_successful: 86 | self.content_hashes[page.content_hash].add(page.url) 87 | self.wordcount.update(page.wordcount) 88 | self.bigrams.update(page.bigrams) 89 | self.trigrams.update(page.trigrams) 90 | 91 | # Only add links if following is enabled and analysis was successful 92 | if self.follow_links: 93 | self.page_queue.extend(page.links) 94 | 95 | self.crawled_pages.append(page) 96 | self.crawled_urls.add(page.url) 97 | 98 | # Stop after the first page if not following links, regardless of analysis success 99 | if not self.follow_links: 100 | break 101 | except Exception as e: 102 | print(f"Error occurred during crawling: {e}") 103 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.13.3 2 | certifi==2025.1.31 3 | Jinja2==3.1.6 4 | langchain==0.3.22 5 | langchain-anthropic==0.3.10 
6 | lxml==5.3.1 7 | MarkupSafe==3.0.2 8 | pytest==8.3.2 # Added for testing 9 | pytest-mock==3.14.0 # Added for testing 10 | python-dotenv==1.1.0 11 | trafilatura==2.0.0 12 | urllib3==2.3.0 13 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import inspect 3 | import json 4 | import os 5 | 6 | from jinja2 import Environment 7 | from jinja2 import FileSystemLoader 8 | from pyseoanalyzer import analyze 9 | 10 | 11 | module_path = os.path.dirname(inspect.getfile(analyze)) 12 | 13 | arg_parser = argparse.ArgumentParser() 14 | 15 | arg_parser.add_argument("site", help="URL of the site you are wanting to analyze.") 16 | arg_parser.add_argument( 17 | "-s", "--sitemap", help="URL of the sitemap to seed the crawler with." 18 | ) 19 | arg_parser.add_argument( 20 | "-f", 21 | "--output-format", 22 | help="Output format.", 23 | choices=[ 24 | "json", 25 | "html", 26 | ], 27 | default="json", 28 | ) 29 | arg_parser.add_argument( 30 | "-d", 31 | "--disk", 32 | help="save to disk", 33 | choices=[ 34 | "y", 35 | "n", 36 | ], 37 | default="y", 38 | ) 39 | 40 | args = arg_parser.parse_args() 41 | 42 | output = analyze(args.site, args.sitemap) 43 | 44 | if args.output_format == "html": 45 | from jinja2 import Environment 46 | from jinja2 import FileSystemLoader 47 | 48 | env = Environment(loader=FileSystemLoader(os.path.join(module_path, "templates"))) 49 | template = env.get_template("index.html") 50 | output_from_parsed_template = template.render(result=output) 51 | if args.disk == "y": 52 | with open("test.html", "w", encoding="utf-8") as text_file: 53 | text_file.write(output_from_parsed_template) 54 | else: 55 | print(output_from_parsed_template) 56 | elif args.output_format == "json": 57 | if args.disk == "y": 58 | with open("test.json", "w", encoding="utf-8") as text_file: 59 | text_file.write(json.dumps(output, indent=4, separators=(",", ": "))) 60 | else: 61 | print(json.dumps(output, indent=4, separators=(",", ": "))) 62 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sethblack/python-seo-analyzer/27bb52303747dcb767f51853c2701153d0c4b6b7/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_analyzer.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pytest 3 | from unittest.mock import patch, MagicMock 4 | from pyseoanalyzer.analyzer import analyze, calc_total_time 5 | 6 | 7 | # --- Test calc_total_time --- 8 | 9 | 10 | def test_calc_total_time(): 11 | start_time = time.time() 12 | # Simulate some time passing 13 | time.sleep(0.01) 14 | elapsed_time = calc_total_time(start_time) 15 | # Check if the elapsed time is roughly correct (allow for some variance) 16 | assert 0.005 < elapsed_time < 0.05 17 | 18 | 19 | # --- Test analyze function --- 20 | 21 | 22 | # Helper function to create a mock Page object 23 | def create_mock_page(url, title, description, word_count, content_hash): 24 | page = MagicMock() 25 | page.url = url 26 | page.content_hash = content_hash 27 | page.as_dict.return_value = { 28 | "url": url, 29 | "title": title, 30 | "description": description, 31 | "word_count": word_count, 32 | # Add other fields as needed by as_dict() if tests evolve 33 | } 34 | 
return page 35 | 36 | 37 | # Basic test using mocking 38 | @patch("pyseoanalyzer.analyzer.Website") 39 | def test_analyze_basic(MockWebsite): 40 | # --- Setup Mock --- 41 | mock_site_instance = MockWebsite.return_value 42 | mock_page1 = create_mock_page( 43 | "http://example.com", "Page 1", "Desc 1", 100, "hash1" 44 | ) 45 | mock_site_instance.crawled_pages = [mock_page1] 46 | mock_site_instance.content_hashes = {"hash1": ["http://example.com"]} 47 | mock_site_instance.wordcount = {"word": 5, "test": 6} 48 | mock_site_instance.bigrams = {("bigram", "test"): 5} 49 | mock_site_instance.trigrams = {("trigram", "test", "word"): 5} 50 | 51 | # --- Run analyze --- 52 | output = analyze("http://example.com", follow_links=False) 53 | 54 | # --- Assertions --- 55 | # Check Website constructor call 56 | MockWebsite.assert_called_once_with( 57 | base_url="http://example.com", 58 | sitemap=None, 59 | analyze_headings=False, 60 | analyze_extra_tags=False, 61 | follow_links=False, 62 | run_llm_analysis=False, 63 | ) 64 | # Check crawl was called 65 | mock_site_instance.crawl.assert_called_once() 66 | 67 | # Check output structure and basic content 68 | assert len(output["pages"]) == 1 69 | assert output["pages"][0]["url"] == "http://example.com" 70 | assert output["pages"][0]["title"] == "Page 1" 71 | assert output["pages"][0]["description"] == "Desc 1" 72 | assert output["pages"][0]["word_count"] == 100 73 | # assert output["errors"] == [] # Errors usually come from crawl, harder to test here 74 | assert output["duplicate_pages"] == [] # Only one page 75 | 76 | # Check keywords (counts > 4) 77 | assert len(output["keywords"]) == 4 78 | assert {"word": "test", "count": 6} in output["keywords"] 79 | assert {"word": "word", "count": 5} in output["keywords"] 80 | assert {"word": ("bigram", "test"), "count": 5} in output["keywords"] 81 | assert {"word": ("trigram", "test", "word"), "count": 5} in output["keywords"] 82 | 83 | # Check total time calculation 84 | assert "total_time" in output 85 | assert output["total_time"] > 0 86 | 87 | 88 | # Add more tests below for different scenarios (duplicates, arguments, etc.) 
89 | # For example: 90 | 91 | 92 | @patch("pyseoanalyzer.analyzer.Website") 93 | def test_analyze_duplicates(MockWebsite): 94 | # --- Setup Mock --- 95 | mock_site_instance = MockWebsite.return_value 96 | mock_page1 = create_mock_page( 97 | "http://example.com/page1", "Page 1", "Desc", 100, "hash_dup" 98 | ) 99 | mock_page2 = create_mock_page( 100 | "http://example.com/page2", "Page 2", "Desc", 150, "hash_dup" 101 | ) # Same hash 102 | mock_page3 = create_mock_page( 103 | "http://example.com/page3", "Page 3", "Desc", 200, "hash_unique" 104 | ) 105 | mock_site_instance.crawled_pages = [mock_page1, mock_page2, mock_page3] 106 | mock_site_instance.content_hashes = { 107 | "hash_dup": ["http://example.com/page1", "http://example.com/page2"], 108 | "hash_unique": ["http://example.com/page3"], 109 | } 110 | mock_site_instance.wordcount = {} 111 | mock_site_instance.bigrams = {} 112 | mock_site_instance.trigrams = {} 113 | 114 | # --- Run analyze --- 115 | output = analyze("http://example.com") # Default follow_links=True 116 | 117 | # --- Assertions --- 118 | MockWebsite.assert_called_once_with( 119 | base_url="http://example.com", 120 | sitemap=None, 121 | analyze_headings=False, 122 | analyze_extra_tags=False, 123 | follow_links=True, # Check default 124 | run_llm_analysis=False, 125 | ) 126 | mock_site_instance.crawl.assert_called_once() 127 | 128 | assert len(output["pages"]) == 3 129 | assert len(output["duplicate_pages"]) == 1 130 | # Convert to sets for order-independent comparison 131 | assert set(output["duplicate_pages"][0]) == { 132 | "http://example.com/page1", 133 | "http://example.com/page2", 134 | } 135 | assert output["keywords"] == [] 136 | 137 | 138 | @patch("pyseoanalyzer.analyzer.Website") 139 | def test_analyze_arguments_passthrough(MockWebsite): 140 | # --- Setup Mock --- 141 | mock_site_instance = MockWebsite.return_value 142 | mock_site_instance.crawled_pages = [] 143 | mock_site_instance.content_hashes = {} 144 | mock_site_instance.wordcount = {} 145 | mock_site_instance.bigrams = {} 146 | mock_site_instance.trigrams = {} 147 | 148 | # --- Run analyze with specific arguments --- 149 | analyze( 150 | "http://example.com", 151 | sitemap_url="http://example.com/sitemap.xml", 152 | analyze_headings=True, 153 | analyze_extra_tags=True, 154 | follow_links=False, 155 | run_llm_analysis=True, 156 | ) 157 | 158 | # --- Assertions --- 159 | # Check Website constructor call reflects arguments 160 | MockWebsite.assert_called_once_with( 161 | base_url="http://example.com", 162 | sitemap="http://example.com/sitemap.xml", 163 | analyze_headings=True, 164 | analyze_extra_tags=True, 165 | follow_links=False, 166 | run_llm_analysis=True, 167 | ) 168 | mock_site_instance.crawl.assert_called_once() 169 | 170 | 171 | @patch("pyseoanalyzer.analyzer.Website") 172 | def test_analyze_keyword_filtering(MockWebsite): 173 | # --- Setup Mock --- 174 | mock_site_instance = MockWebsite.return_value 175 | mock_site_instance.crawled_pages = [] 176 | mock_site_instance.content_hashes = {} 177 | # Include counts <= 4 178 | mock_site_instance.wordcount = {"high": 10, "medium": 5, "low": 4, "verylow": 3} 179 | mock_site_instance.bigrams = {("bi", "high"): 6, ("bi", "low"): 4} 180 | mock_site_instance.trigrams = {("tri", "high", "a"): 5, ("tri", "low", "b"): 3} 181 | 182 | # --- Run analyze --- 183 | output = analyze("http://example.com") 184 | 185 | # --- Assertions --- 186 | assert len(output["keywords"]) == 4 # Only counts > 4 should be included 187 | words_in_keywords = {kw["word"] for kw in 
output["keywords"]} 188 | assert "high" in words_in_keywords 189 | assert "medium" in words_in_keywords 190 | assert ("bi", "high") in words_in_keywords 191 | assert ("tri", "high", "a") in words_in_keywords 192 | assert "low" not in words_in_keywords 193 | assert "verylow" not in words_in_keywords 194 | assert ("bi", "low") not in words_in_keywords 195 | assert ("tri", "low", "b") not in words_in_keywords 196 | 197 | # Check sorting (descending by count) 198 | counts = [kw["count"] for kw in output["keywords"]] 199 | assert counts == sorted(counts, reverse=True) 200 | -------------------------------------------------------------------------------- /tests/test_http.py: -------------------------------------------------------------------------------- 1 | from pyseoanalyzer import http 2 | 3 | 4 | def test_http(): 5 | assert http.http.get("https://www.sethserver.com/tests/utf8.html") 6 | -------------------------------------------------------------------------------- /tests/test_llm_analyst.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pyseoanalyzer.llm_analyst import LLMSEOEnhancer 3 | from langchain_anthropic import ChatAnthropic 4 | from langchain.chains import LLMChain 5 | from langchain.prompts import PromptTemplate 6 | import json 7 | 8 | 9 | @pytest.fixture 10 | def seo_data(): 11 | return { 12 | "title": "Test Title", 13 | "description": "Test Description", 14 | "keywords": ["test", "seo"], 15 | "content": "This is a test content.", 16 | } 17 | 18 | 19 | def test_init(): 20 | enhancer = LLMSEOEnhancer() 21 | assert isinstance(enhancer.llm, ChatAnthropic) 22 | assert enhancer.llm.model == "claude-3-sonnet-20240229" 23 | assert enhancer.llm.temperature == 0 24 | 25 | 26 | @pytest.mark.asyncio 27 | async def test_enhance_seo_analysis(seo_data): 28 | enhancer = LLMSEOEnhancer() 29 | result = await enhancer.enhance_seo_analysis(seo_data) 30 | 31 | assert "summary" in result 32 | 33 | assert "entity_analysis" in result["detailed_analysis"] 34 | assert "credibility_analysis" in result["detailed_analysis"] 35 | assert "conversation_analysis" in result["detailed_analysis"] 36 | assert "cross_platform_presence" in result["detailed_analysis"] 37 | assert "recommendations" in result["detailed_analysis"] 38 | -------------------------------------------------------------------------------- /tests/test_page.py: -------------------------------------------------------------------------------- 1 | from pyseoanalyzer import page 2 | 3 | 4 | def test_page_init(): 5 | p = page.Page( 6 | url="https://www.sethserver.com/sitemap.xml", 7 | base_domain="https://www.sethserver.com/", 8 | ) 9 | 10 | assert p.base_domain.scheme == "https" 11 | assert p.base_domain.netloc == "www.sethserver.com" 12 | assert p.base_domain.path == "/" 13 | 14 | assert p.url == "https://www.sethserver.com/sitemap.xml" 15 | 16 | assert p.title == "" 17 | assert p.description == "" 18 | assert p.keywords == {} 19 | assert p.warnings == [] 20 | assert p.links == [] 21 | 22 | 23 | def test_analyze(): 24 | p = page.Page( 25 | url="https://www.sethserver.com/", base_domain="https://www.sethserver.com/" 26 | ) 27 | 28 | assert p.analyze() 29 | 30 | assert "seth" in p.title.lower() 31 | 32 | 33 | def test_analyze_with_llm(): 34 | p = page.Page( 35 | url="https://www.sethserver.com/", 36 | base_domain="https://www.sethserver.com/", 37 | run_llm_analysis=True, 38 | ) 39 | 40 | assert p.analyze() 41 | 42 | assert "seth" in p.title.lower() 43 | assert "summary" in p.llm_analysis 
44 | --------------------------------------------------------------------------------
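The tests above exercise the Anthropic-backed LLM path end to end. As a reference, here is a minimal sketch of the same flow outside pytest; it is an assumption-laden sketch rather than documented API: the URL is illustrative, the dictionary keys come from the assertions in tests/test_page.py and tests/test_llm_analyst.py, and the ANTHROPIC_API_KEY check reflects langchain-anthropic's usual environment-based configuration rather than anything shown in llm_analyst.py here.

import asyncio
import os

from pyseoanalyzer.llm_analyst import LLMSEOEnhancer
from pyseoanalyzer.page import Page

# langchain-anthropic normally reads the API key from the environment,
# so fail fast if it is missing (assumed variable name).
assert os.environ.get("ANTHROPIC_API_KEY"), "set ANTHROPIC_API_KEY first"

# Option 1: let Page drive extraction plus LLM analysis, as tests/test_page.py does.
page = Page(
    url="https://www.sethserver.com/",
    base_domain="https://www.sethserver.com/",
    run_llm_analysis=True,
)
page.analyze()
print(page.llm_analysis["summary"])

# Option 2: run the enhancer directly on already-extracted content,
# mirroring tests/test_llm_analyst.py; enhance_seo_analysis() is a coroutine.
enhancer = LLMSEOEnhancer()
enhanced = asyncio.run(enhancer.enhance_seo_analysis(page.content))
print(enhanced["detailed_analysis"]["recommendations"])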