├── report_template.md ├── .gitignore ├── screenshot.png ├── .github ├── FUNDING.yml └── workflows │ └── create-report.yml ├── requirements.txt ├── pyproject.toml ├── config.yaml ├── .dockerignore ├── Dockerfile ├── LICENSE ├── .env.example ├── checks ├── check_content_type_headers.py ├── check_amp_compatibility.py ├── check_cdn.py ├── check_cookie_duration.py ├── check_browser_compatibility.py ├── check_floc.py ├── check_cookie_flags.py ├── check_cms_used.py ├── check_clientside_rendering.py ├── check_brotli_compression.py ├── check_pagespeed_performances.py ├── check_cookie_policy.py ├── check_open_graph_protocol.py ├── check_alt_tags.py ├── check_broken_links.py ├── check_mobile_friendly.py ├── check_xss_protection.py ├── check_internationalization.py ├── check_redirects.py ├── check_hsts.py ├── check_server_response_time.py ├── check_robot_txt.py ├── check_privacy_protected_whois.py ├── check_email_domain.py ├── check_mixed_content.py ├── check_website_load_time.py ├── check_asset_minification.py ├── check_subresource_integrity.py ├── check_domain_expiration.py ├── check_redirect_chains.py ├── check_ssl_cipher_strength.py ├── check_url_canonicalization.py ├── check_dnssec.py ├── check_favicon.py ├── check_domainsblacklists_blacklist.py ├── check_security_headers.py ├── check_semantic_markup.py ├── check_subdomain_enumeration.py ├── check_rate_limiting.py ├── check_domain_breach.py ├── check_ad_and_tracking.py ├── check_ssl_cert.py ├── check_cors_headers.py ├── check_third_party_resources.py ├── check_third_party_requests.py ├── check_sitemap.py └── check_privacy_exposure.py ├── project_description.md ├── debug_scheduler.py ├── CHANGELOG.md ├── docker-compose.yml ├── scheduler.py └── usage.md /report_template.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .pytest_cache 3 | tests/ 4 | *.log 5 | codeflash.yaml -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabriziosalmi/websites-monitor/HEAD/screenshot.png -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [fabriziosalmi] 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | python-whois 3 | dnspython 4 | beautifulsoup4 5 | selenium 6 | pyyaml 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.codeflash] 2 | # All paths are relative to this pyproject.toml's directory. 
3 | module-root = "checks" 4 | tests-root = "tests" 5 | test-framework = "pytest" 6 | ignore-paths = [] 7 | formatter-cmds = ["disabled"] 8 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | websites: 2 | - audiolibri.org 3 | - example.com 4 | output_file: README.md 5 | max_workers: 2 6 | timeout: 30 7 | report_template: report_template.md 8 | github_workflow_badge: https://github.com/fabriziosalmi/websites-monitor/actions/workflows/create-report.yml/badge.svg 9 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Docker ignore file to reduce build context size 2 | 3 | # Git 4 | .git 5 | .gitignore 6 | .gitattributes 7 | 8 | # Docker files 9 | Dockerfile 10 | docker-compose.yml 11 | .dockerignore 12 | 13 | # Environment files 14 | .env 15 | .env.local 16 | .env.example 17 | 18 | # Logs 19 | *.log 20 | logs/ 21 | monitor.log 22 | 23 | # Cache 24 | __pycache__/ 25 | *.pyc 26 | *.pyo 27 | *.pyd 28 | .Python 29 | .pytest_cache/ 30 | .coverage 31 | .tox/ 32 | 33 | # IDE 34 | .vscode/ 35 | .idea/ 36 | *.swp 37 | *.swo 38 | *~ 39 | 40 | # OS 41 | .DS_Store 42 | Thumbs.db 43 | 44 | # Reports (will be mounted as volume) 45 | README.md.backup 46 | reports/ 47 | 48 | # Documentation 49 | docs/ 50 | *.md 51 | 52 | # Tests 53 | tests/ 54 | test_*.py 55 | 56 | # Node modules (if any) 57 | node_modules/ 58 | npm-debug.log* 59 | 60 | # Backup files 61 | *.backup 62 | *.bak 63 | *.tmp 64 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Website Monitor - Dockerfile 2 | FROM python:3.11-slim 3 | 4 | # Set working directory 5 | WORKDIR /app 6 | 7 | # Install system dependencies 8 | RUN apt-get update && apt-get install -y \ 9 | curl \ 10 | dnsutils \ 11 | whois \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | # Copy requirements first for better caching 15 | COPY requirements.txt . 16 | COPY pyproject.toml . 17 | 18 | # Install Python dependencies 19 | RUN pip install --no-cache-dir -r requirements.txt 20 | 21 | # Install additional API dependencies 22 | RUN pip install --no-cache-dir fastapi uvicorn[standard] pydantic 23 | 24 | # Copy application code 25 | COPY . . 
26 | 27 | # Create necessary directories 28 | RUN mkdir -p /app/logs /app/reports 29 | 30 | # Set environment variables 31 | ENV PYTHONPATH=/app 32 | ENV PYTHONUNBUFFERED=1 33 | 34 | # Health check 35 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 36 | CMD curl -f http://localhost:8000/health || exit 1 37 | 38 | # Expose API port 39 | EXPOSE 8000 40 | 41 | # Default command - can be overridden in docker-compose 42 | CMD ["python", "api.py"] 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Fabrizio Salmi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Website Monitor - Environment Variables 2 | # Copy this file to .env and modify values as needed 3 | 4 | # =================================== 5 | # API Configuration 6 | # =================================== 7 | API_HOST=0.0.0.0 8 | API_PORT=8000 9 | 10 | # =================================== 11 | # Google PageSpeed Insights 12 | # =================================== 13 | # Get your API key from: https://developers.google.com/speed/docs/insights/v5/get-started 14 | PAGESPEED_API_KEY=your_pagespeed_api_key_here 15 | 16 | # =================================== 17 | # Monitoring Configuration 18 | # =================================== 19 | # Monitoring interval for scheduler (in seconds) 20 | # 3600 = 1 hour, 86400 = 24 hours 21 | MONITOR_INTERVAL=3600 22 | 23 | # =================================== 24 | # Database Configuration (Production) 25 | # =================================== 26 | # Used when running with docker-compose --profile production 27 | DB_PASSWORD=secure_password_change_me 28 | POSTGRES_DB=website_monitor 29 | POSTGRES_USER=monitor_user 30 | 31 | # =================================== 32 | # Redis Configuration (Production) 33 | # =================================== 34 | # Used when running with docker-compose --profile production 35 | REDIS_URL=redis://redis:6379/0 36 | 37 | # =================================== 38 | # SSL Configuration (Production) 39 | # =================================== 40 | # Paths for SSL certificates when using nginx 41 | SSL_CERT_PATH=/etc/nginx/ssl/cert.pem 42 | SSL_KEY_PATH=/etc/nginx/ssl/key.pem 43 | 44 | # =================================== 45 | # Logging Configuration 46 | # =================================== 47 | # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL 48 | LOG_LEVEL=INFO 49 | -------------------------------------------------------------------------------- /checks/check_content_type_headers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | 4 | def check_content_type_headers(website): 5 | """ 6 | Checks if the 'Content-Type' header of the website is set to 'text/html' 7 | and has a character encoding specified. 8 | 9 | Args: 10 | website (str): The website URL to check. 
11 | 12 | Returns: 13 | str: 14 | - "🟢" if the header is properly set 15 | - "🔴" if the header is not properly set 16 | - "⚪" if an error occurs 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'ContentTypeChecker/1.0' 24 | } 25 | 26 | try: 27 | # Method 1: Check Content-Type header directly 28 | response = requests.get(website, headers=headers, timeout=10) 29 | response.raise_for_status() 30 | content_type = response.headers.get('Content-Type', '') 31 | 32 | # Check for both 'text/html' and a character encoding 33 | if 'text/html' in content_type.lower() and 'charset=' in content_type.lower(): 34 | print(f"Content-Type is correctly set for {website}.") 35 | return "🟢" 36 | else: 37 | print(f"Content-Type is not properly set for {website}.") 38 | return "🔴" 39 | 40 | except (Timeout, HTTPError, RequestException) as e: 41 | print(f"Request error occurred while checking Content-Type headers for {website}: {e}") 42 | return "⚪" 43 | except Exception as e: 44 | print(f"An unexpected error occurred while checking Content-Type headers for {website}: {e}") 45 | return "⚪" 46 | -------------------------------------------------------------------------------- /project_description.md: -------------------------------------------------------------------------------- 1 | ## Project Description 2 | 3 | Website Monitor is a comprehensive website monitoring framework designed to continuously monitor various aspects of websites including security, performance, SEO compliance, and accessibility. 4 | 5 | ### Key Features 6 | 7 | - **Automated Monitoring**: Runs scheduled checks via GitHub Actions (daily by default) 8 | - **Web Interface**: Interactive HTML interface for real-time testing and analysis 9 | - **REST API**: Full-featured API for integration with other tools and services 10 | - **Comprehensive Checks**: 53+ different checks across 7 categories 11 | - **Multiple Deployment Options**: Local, Docker, or GitHub Actions 12 | - **Detailed Reporting**: Automatic markdown report generation with results 13 | 14 | ### Use Cases 15 | 16 | - **Website Health Monitoring**: Track the status and health of multiple websites 17 | - **Security Auditing**: Identify security issues like missing SSL, weak headers, or vulnerabilities 18 | - **Performance Tracking**: Monitor load times, PageSpeed scores, and optimization opportunities 19 | - **SEO Compliance**: Ensure proper sitemaps, robots.txt, meta tags, and structured data 20 | - **Accessibility Testing**: Verify WCAG compliance and mobile-friendliness 21 | - **DevOps Integration**: Integrate with CI/CD pipelines for continuous monitoring 22 | 23 | ### Architecture 24 | 25 | The project is built with Python and uses: 26 | - **FastAPI**: For the web interface and REST API 27 | - **Selenium**: For browser-based checks 28 | - **Multiple specialized libraries**: For DNS, SSL, and other specific checks 29 | - **Docker**: For containerized deployment 30 | - **GitHub Actions**: For automated scheduled monitoring 31 | 32 | ### Project Status 33 | 34 | ![Static Badge](https://img.shields.io/badge/project_status-active-green?style=for-the-badge&logo=github) 35 | 36 | The project is actively maintained and welcomes contributions. See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to contribute. 
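For readers who want to drive the REST API directly rather than through the web interface, here is a minimal sketch using the `requests` library (already listed in requirements.txt). The `/health` endpoint, the `POST /monitor` endpoint, and the default port 8000 are taken from the Dockerfile healthcheck, docker-compose configuration, and CHANGELOG in this repository; the request payload shape (`{"url": ...}`) and the response format are assumptions for illustration, not a documented contract, so consult the interactive docs at `/api/docs` for the authoritative schema.

```python
# Minimal sketch: calling the Website Monitor REST API from Python.
# Assumes the API container is running locally on the default port 8000.
# The payload/response shapes below are illustrative assumptions.
import requests

API_BASE = "http://localhost:8000"  # default API_HOST/API_PORT from .env.example

# Liveness probe (same endpoint the Docker HEALTHCHECK uses)
health = requests.get(f"{API_BASE}/health", timeout=10)
print("health:", health.status_code)

# Ask the API to run its checks against one site
response = requests.post(
    f"{API_BASE}/monitor",
    json={"url": "audiolibri.org"},  # payload shape assumed for illustration
    timeout=300,  # a full run of 50+ checks can take a while
)
response.raise_for_status()
print(response.json())
```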
37 | -------------------------------------------------------------------------------- /checks/check_amp_compatibility.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | from bs4 import BeautifulSoup 4 | from bs4 import FeatureNotFound 5 | 6 | def check_amp_compatibility(website): 7 | """ 8 | Check if the website has AMP compatibility. 9 | 10 | Args: 11 | website (str): URL of the website to be checked. 12 | 13 | Returns: 14 | str: "🟢" if AMP compatible, "🔴" if not AMP compatible, "⚪" if error occurs 15 | """ 16 | # Ensure the website starts with 'http://' or 'https://' 17 | if not website.startswith(('http://', 'https://')): 18 | website = f"https://{website}" 19 | 20 | headers = { 21 | 'User-Agent': 'AMPChecker/1.0' 22 | } 23 | 24 | try: 25 | response = requests.get(website, headers=headers, timeout=10) 26 | response.raise_for_status() 27 | html_content = response.text 28 | 29 | # Prefer lxml parser for better performance, fallback to html.parser 30 | try: 31 | soup = BeautifulSoup(html_content, 'lxml') 32 | except FeatureNotFound: 33 | soup = BeautifulSoup(html_content, 'html.parser') 34 | 35 | # Check for AMP markers ('⚡' or 'amp' attribute) on the <html> tag 36 | amp_html = soup.find('html', attrs={'⚡': True}) or soup.find('html', attrs={'amp': True}) 37 | 38 | # Check for required AMP script 39 | amp_script = soup.find('script', src="https://cdn.ampproject.org/v0.js") 40 | 41 | # Check for AMP-specific components like link to AMP version 42 | amp_link = soup.find('link', rel="amphtml") 43 | 44 | # Validate AMP criteria 45 | if html_content.lower().startswith('<!doctype html') and (amp_html or amp_link) and amp_script: 46 | return "🟢" 47 | 48 | return "🔴" 49 | 50 | except (Timeout, HTTPError, RequestException) as e: 51 | print(f"Request error occurred while checking AMP compatibility for {website}: {e}") 52 | return "⚪" 53 | except Exception as e: 54 | print(f"An error occurred while checking AMP compatibility for {website}: {e}") 55 | return "⚪" 56 | -------------------------------------------------------------------------------- /debug_scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Debug version of the scheduler to identify issues 4 | """ 5 | 6 | import time 7 | import subprocess 8 | import os 9 | import sys 10 | from datetime import datetime 11 | 12 | print("=== Debug Scheduler Starting ===") 13 | print(f"Python version: {sys.version}") 14 | print(f"Working directory: {os.getcwd()}") 15 | print(f"Files in directory: {os.listdir('.')}") 16 | print(f"Logs directory exists: {os.path.exists('/app/logs')}") 17 | 18 | # Test creating logs directory 19 | try: 20 | os.makedirs('/app/logs', exist_ok=True) 21 | print("✓ Logs directory created/exists") 22 | except Exception as e: 23 | print(f"✗ Error creating logs directory: {e}") 24 | 25 | # Test if main.py exists and is readable 26 | if os.path.exists('main.py'): 27 | print("✓ main.py exists") 28 | try: 29 | with open('main.py', 'r') as f: 30 | content = f.read() 31 | print(f"✓ main.py is readable ({len(content)} characters)") 32 | except Exception as e: 33 | print(f"✗ Error reading main.py: {e}") 34 | else: 35 | print("✗ main.py does not exist") 36 | 37 | print("=== Testing subprocess call ===") 38 | try: 39 | result = subprocess.run( 40 | ['python', '--version'], 41 | capture_output=True, 42 | text=True, 43 | timeout=10 44 | ) 45 | print(f"✓ Subprocess test successful: 
{result.stdout.strip()}") 46 | except Exception as e: 47 | print(f"✗ Subprocess test failed: {e}") 48 | 49 | print("=== Testing main.py execution ===") 50 | try: 51 | # Try to run main.py with a short timeout 52 | result = subprocess.run( 53 | ['python', 'main.py', '--help'], 54 | capture_output=True, 55 | text=True, 56 | timeout=5 57 | ) 58 | print(f"✓ main.py --help exit code: {result.returncode}") 59 | if result.stdout: 60 | print(f"stdout: {result.stdout[:200]}...") 61 | if result.stderr: 62 | print(f"stderr: {result.stderr[:200]}...") 63 | except subprocess.TimeoutExpired: 64 | print("⚠ main.py --help timed out") 65 | except Exception as e: 66 | print(f"✗ Error running main.py --help: {e}") 67 | 68 | print("=== Debug complete ===") 69 | -------------------------------------------------------------------------------- /checks/check_cdn.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from typing import Optional 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def check_cdn(website: str) -> str: 8 | """ 9 | Checks if a website is using a CDN by inspecting headers and other indicators. 10 | 11 | Args: 12 | website (str): The URL of the website to check. 13 | 14 | Returns: 15 | str: 16 | - "🟢" if a CDN is detected. 17 | - "🔴" if no CDN is detected. 18 | - "⚪" if an error occurs. 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'CDNChecker/1.0' 26 | } 27 | 28 | try: 29 | response = requests.get(website, headers=headers, stream=True, timeout=10) 30 | response.raise_for_status() 31 | 32 | # Check server header for CDN indicators 33 | server_header = response.headers.get('server', '').lower() 34 | cdn_indicators = ['cloudflare', 'akamai', 'fastly', 'amazon', 'cdn', 'stackpath', 'keycdn', 'maxcdn'] 35 | 36 | if any(indicator in server_header for indicator in cdn_indicators): 37 | logger.info(f"CDN detected in server header for {website}.") 38 | return "🟢" 39 | 40 | # Check other headers that might indicate CDN usage 41 | cdn_headers = [ 42 | 'cf-ray', # Cloudflare 43 | 'x-served-by', # Fastly 44 | 'x-cache', # Various CDNs 45 | 'x-edge-location', # AWS CloudFront 46 | 'x-akamai-transformed' # Akamai 47 | ] 48 | 49 | if any(header in response.headers for header in cdn_headers): 50 | logger.info(f"CDN detected in headers for {website}.") 51 | return "🟢" 52 | 53 | logger.info(f"No CDN detected for {website}.") 54 | return "🔴" 55 | 56 | except requests.exceptions.RequestException as e: 57 | logger.error(f"Request error during CDN check for {website}: {e}") 58 | return "⚪" 59 | except Exception as e: 60 | logger.error(f"An unexpected error occurred during the CDN check for {website}: {e}") 61 | return "⚪" 62 | -------------------------------------------------------------------------------- /checks/check_cookie_duration.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from datetime import datetime 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | def check_cookie_duration(website): 6 | """ 7 | Ensure that session cookies set by the website don't have an overly long duration. 8 | 9 | Args: 10 | website (str): URL of the website to be checked. 
11 | 12 | Returns: 13 | str: 14 | - "🔴" if any cookie has an overly long duration 15 | - "🟢" if all cookies have an acceptable duration 16 | - "⚪" if an error occurs 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'CookieDurationChecker/1.0' 24 | } 25 | 26 | try: 27 | # Perform the request to get cookies 28 | response = requests.get(website, headers=headers, timeout=10) 29 | response.raise_for_status() 30 | 31 | long_duration_cookies = 0 32 | max_duration_seconds = 604800 # 7 days in seconds 33 | 34 | for cookie in response.cookies: 35 | # Check if cookie has expiration (session cookies don't have expiration) 36 | if cookie.expires: 37 | try: 38 | # Convert expires timestamp to datetime and calculate duration 39 | expires_datetime = datetime.fromtimestamp(cookie.expires) 40 | delta = expires_datetime - datetime.now() 41 | if delta.total_seconds() > max_duration_seconds: 42 | long_duration_cookies += 1 43 | print(f"Cookie '{cookie.name}' has long duration: {delta.days} days") 44 | except (ValueError, OSError) as e: 45 | print(f"Error parsing cookie expiration for '{cookie.name}': {e}") 46 | continue 47 | 48 | # Return based on the count of long-duration cookies 49 | if long_duration_cookies > 0: 50 | print(f"Found {long_duration_cookies} cookies with long duration on {website}.") 51 | return "🔴" 52 | print(f"All cookies have an acceptable duration on {website}.") 53 | return "🟢" 54 | 55 | except (Timeout, HTTPError, RequestException) as e: 56 | print(f"Request error occurred while checking cookie duration for {website}: {e}") 57 | return "⚪" 58 | except Exception as e: 59 | print(f"An unexpected error occurred while checking cookie duration for {website}: {e}") 60 | return "⚪" 61 | -------------------------------------------------------------------------------- /.github/workflows/create-report.yml: -------------------------------------------------------------------------------- 1 | name: Create report 2 | 3 | on: 4 | schedule: 5 | - cron: '0 4 * * *' # Runs daily at 4 AM 6 | workflow_dispatch: 7 | 8 | jobs: 9 | create-report: 10 | runs-on: ubuntu-latest 11 | outputs: 12 | status: ${{ steps.set-output.outputs.status }} 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v4 16 | 17 | - name: Setup Python 3.x 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.x' 21 | 22 | - name: Install dependencies 23 | run: | 24 | pip install requests python-whois dnspython beautifulsoup4 selenium pyyaml lxml 25 | 26 | - name: Reinstall dependencies 27 | run: | 28 | pip install --upgrade pip 29 | pip uninstall whois -y 30 | pip install python-whois 31 | 32 | - name: Set Chrome Version 33 | id: chrome_version 34 | run: | 35 | CHROME_VERSION=$(curl -s "https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json" | jq -r '.channels.Stable.version') 36 | echo "CHROME_VERSION=$CHROME_VERSION" >> $GITHUB_OUTPUT 37 | 38 | - name: Setup Chrome and Chromedriver 39 | uses: browser-actions/setup-chrome@v1 40 | with: 41 | chrome-version: ${{ steps.chrome_version.outputs.CHROME_VERSION }} 42 | install-chromedriver: true 43 | 44 | - name: Verify Chrome and Chromedriver Installation 45 | run: | 46 | google-chrome --version 47 | chromedriver --version 48 | CHROME_MAJOR=$(google-chrome --version | cut -d ' ' -f 3 | cut -d '.' 
-f 1) 49 | DRIVER_MAJOR=$(chromedriver --version | cut -d ' ' -f 2 | cut -d '.' -f 1) 50 | if [[ "$CHROME_MAJOR" != "$DRIVER_MAJOR" ]]; then 51 | echo "Mismatch between Chrome and Chromedriver versions!" 52 | fi 53 | 54 | - name: Run Website Tests 55 | id: test 56 | env: 57 | PAGESPEED_API_KEY: ${{ secrets.PAGESPEED_API_KEY }} 58 | run: python main.py 59 | continue-on-error: true 60 | 61 | - name: Commit and push report 62 | run: | 63 | git config --local user.email "action@github.com" 64 | git config --local user.name "GitHub Action" 65 | git add report.md 66 | git commit -m "Add generated report" -a || echo "No changes to commit" 67 | git push || echo "No changes to push" 68 | 69 | - name: Fail if error encountered 70 | if: steps.set-output.outputs.status == 'error' 71 | run: exit 1 72 | -------------------------------------------------------------------------------- /checks/check_browser_compatibility.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.common.exceptions import WebDriverException 3 | 4 | def check_browser_compatibility(website): 5 | """ 6 | Check if the website is compatible with different browsers. 7 | 8 | Args: 9 | website (str): The URL of the website to be checked. 10 | 11 | Returns: 12 | str: 13 | - "🟢" if the website is compatible with all tested browsers 14 | - "🔴" if the website is not compatible with any browser or if an error occurs 15 | """ 16 | # Ensure the website starts with 'http://' or 'https://' 17 | if not website.startswith(('http://', 'https://')): 18 | website = f"https://{website}" 19 | 20 | # List of drivers to test compatibility 21 | driver_configs = [ 22 | ("Chrome", webdriver.Chrome, webdriver.ChromeOptions), 23 | ("Firefox", webdriver.Firefox, webdriver.FirefoxOptions), 24 | ] 25 | 26 | compatible_browsers = 0 27 | total_browsers = len(driver_configs) 28 | 29 | for browser_name, driver_class, options_class in driver_configs: 30 | driver = None 31 | try: 32 | # Set up options for each driver 33 | browser_options = options_class() 34 | browser_options.add_argument('--headless') # Run in headless mode 35 | browser_options.add_argument('--no-sandbox') 36 | browser_options.add_argument('--disable-dev-shm-usage') 37 | 38 | driver = driver_class(options=browser_options) 39 | driver.set_page_load_timeout(10) # Set timeout for page load 40 | 41 | driver.get(website) 42 | 43 | # Basic check: Ensure that the page loads successfully and has content 44 | if driver.title and len(driver.page_source) > 100: 45 | print(f"Website {website} is compatible with {browser_name}.") 46 | compatible_browsers += 1 47 | else: 48 | print(f"Compatibility issue found with {browser_name} for {website}.") 49 | 50 | except WebDriverException as e: 51 | print(f"Error occurred while testing {browser_name} for {website}: {e}") 52 | except Exception as e: 53 | print(f"Unexpected error with {browser_name} for {website}: {e}") 54 | finally: 55 | if driver: 56 | try: 57 | driver.quit() 58 | except: 59 | pass # Ignore cleanup errors 60 | 61 | # Determine result based on browser compatibility 62 | if compatible_browsers == total_browsers: 63 | return "🟢" 64 | elif compatible_browsers > 0: 65 | return "🟠" # Partially compatible 66 | else: 67 | return "🔴" 68 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to the Website Monitor 
project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [Unreleased] 9 | 10 | ### Added 11 | - Comprehensive CONTRIBUTING.md with development guidelines 12 | - MIT LICENSE file 13 | - Detailed troubleshooting section in README 14 | - Project structure documentation 15 | - FAQ section answering common questions 16 | - Quick Links navigation in README 17 | - Badges for license, Python version, and Docker 18 | - Comprehensive API usage examples with multiple scenarios 19 | - Detailed check categories reference table 20 | - Enhanced status indicators documentation 21 | 22 | ### Changed 23 | - Improved GitHub Actions setup instructions with step-by-step guide 24 | - Enhanced .env.example with better organization and comments 25 | - Updated project_description.md with architecture details 26 | - Expanded usage.md with better structure 27 | - Improved DOCKER.md with corrected API paths 28 | 29 | ### Fixed 30 | - Corrected API endpoint from POST /check to POST /monitor 31 | - Fixed API documentation paths from /docs to /api/docs 32 | - Fixed output_file default in documentation (report.md, not README.md) 33 | - Updated config.yaml to include all documented options 34 | - Corrected documentation paths throughout 35 | 36 | ## [1.3.0] - 2024 37 | 38 | ### Added 39 | - FastAPI web interface with interactive UI 40 | - REST API with 53+ security, performance, and compliance checks 41 | - Docker and Docker Compose support 42 | - Comprehensive check categories: 43 | - Security & Protection (10 checks) 44 | - Performance & Speed (8 checks) 45 | - SEO & Content (9 checks) 46 | - Domain & DNS (7 checks) 47 | - Privacy & Tracking (10 checks) 48 | - Accessibility & Mobile (5 checks) 49 | - Technical & Infrastructure (4 checks) 50 | 51 | ### Changed 52 | - Migrated from basic script to full FastAPI application 53 | - Improved error handling and logging 54 | - Enhanced check organization by category 55 | 56 | ## [1.0.0] - Initial Release 57 | 58 | ### Added 59 | - Basic website monitoring functionality 60 | - GitHub Actions integration for automated daily checks 61 | - Markdown report generation 62 | - Core security and performance checks 63 | 64 | --- 65 | 66 | ## Version History 67 | 68 | - **1.3.0**: Current version with full API and web interface 69 | - **1.0.0**: Initial release with basic monitoring 70 | 71 | [Unreleased]: https://github.com/fabriziosalmi/websites-monitor/compare/v1.3.0...HEAD 72 | [1.3.0]: https://github.com/fabriziosalmi/websites-monitor/releases/tag/v1.3.0 73 | [1.0.0]: https://github.com/fabriziosalmi/websites-monitor/releases/tag/v1.0.0 74 | -------------------------------------------------------------------------------- /checks/check_floc.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urlparse 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def check_floc(website: str) -> str: 9 | """ 10 | Check if the website has opted out of FLoC (Federated Learning of Cohorts). 11 | 12 | Args: 13 | website (str): URL of the website to be checked. 
14 | 15 | Returns: 16 | str: 17 | - "🟢" if the site has opted out of FLoC 18 | - "🔴" if it has not opted out 19 | - "⚪" if an error occurred 20 | """ 21 | # Input validation and URL normalization 22 | if not website: 23 | logger.error("Website URL is required") 24 | return "⚪" 25 | 26 | if not website.startswith(('http://', 'https://')): 27 | website = f"https://{website}" 28 | 29 | try: 30 | parsed_url = urlparse(website) 31 | if not parsed_url.netloc: 32 | logger.error(f"Invalid URL format: {website}") 33 | return "⚪" 34 | except Exception as e: 35 | logger.error(f"URL parsing error for {website}: {e}") 36 | return "⚪" 37 | 38 | headers = { 39 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' 40 | } 41 | 42 | try: 43 | # Perform the HTTP request with timeout 44 | response = requests.get(website, headers=headers, timeout=15) 45 | response.raise_for_status() 46 | 47 | # Enhanced detection patterns 48 | permissions_policy = response.headers.get('Permissions-Policy', '').lower() 49 | 50 | # Check for FLoC opt-out in Permissions-Policy header 51 | if 'interest-cohort=()' in permissions_policy: 52 | logger.info(f"FLoC opt-out detected via Permissions-Policy for {website}") 53 | return "🟢" 54 | 55 | # Fallback: Check for older Feature-Policy header 56 | feature_policy = response.headers.get('Feature-Policy', '').lower() 57 | if 'interest-cohort' in feature_policy and "'none'" in feature_policy: 58 | logger.info(f"FLoC opt-out detected via Feature-Policy for {website}") 59 | return "🟢" 60 | 61 | logger.warning(f"No FLoC opt-out detected for {website}") 62 | return "🔴" 63 | 64 | except (Timeout, HTTPError) as e: 65 | logger.error(f"HTTP error while checking FLoC for {website}: {e}") 66 | return "⚪" 67 | except RequestException as e: 68 | logger.error(f"Request error while checking FLoC for {website}: {e}") 69 | return "⚪" 70 | except Exception as e: 71 | logger.error(f"Unexpected error while checking FLoC for {website}: {e}") 72 | return "⚪" 73 | -------------------------------------------------------------------------------- /checks/check_cookie_flags.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | 4 | def check_cookie_flags(website): 5 | """ 6 | Check if all cookies set by the website have the Secure and HttpOnly flags. 7 | 8 | Args: 9 | website (str): URL of the website to be checked. 
10 | 11 | Returns: 12 | str: 13 | - "🟢" if all cookies have Secure and HttpOnly flags 14 | - "🟠" if some cookies have Secure and HttpOnly flags, but not all 15 | - "🔴" if no cookies have both Secure and HttpOnly flags 16 | - "⚪" if an error occurs 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'CookieFlagChecker/1.0' 24 | } 25 | 26 | try: 27 | response = requests.get(website, headers=headers, timeout=10) 28 | response.raise_for_status() 29 | 30 | # Check if any cookies are set using the response.cookies object 31 | if not response.cookies: 32 | print(f"No cookies found for {website}.") 33 | return "🟢" # No cookies means no security issue 34 | 35 | # Flags for checking Secure and HttpOnly 36 | all_secure_http_only = True 37 | any_secure_http_only = False 38 | total_cookies = len(response.cookies) 39 | 40 | # Check each cookie's security attributes 41 | for cookie in response.cookies: 42 | has_secure = cookie.secure 43 | has_httponly = hasattr(cookie, '_rest') and cookie._rest.get('HttpOnly') is not None 44 | 45 | if has_secure and has_httponly: 46 | any_secure_http_only = True 47 | else: 48 | all_secure_http_only = False 49 | print(f"Cookie '{cookie.name}' missing security flags: Secure={has_secure}, HttpOnly={has_httponly}") 50 | 51 | # Determine the result based on the flags 52 | if all_secure_http_only and total_cookies > 0: 53 | print(f"All cookies have Secure and HttpOnly flags for {website}.") 54 | return "🟢" 55 | elif any_secure_http_only: 56 | print(f"Some cookies have Secure and HttpOnly flags, but not all for {website}.") 57 | return "🟠" 58 | else: 59 | print(f"No cookies have both Secure and HttpOnly flags for {website}.") 60 | return "🔴" 61 | 62 | except (Timeout, HTTPError, RequestException) as e: 63 | print(f"Request error occurred while checking cookie flags for {website}: {e}") 64 | return "⚪" 65 | except Exception as e: 66 | print(f"An unexpected error occurred while checking cookie flags for {website}: {e}") 67 | return "⚪" 68 | -------------------------------------------------------------------------------- /checks/check_cms_used.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | from bs4 import BeautifulSoup 4 | 5 | def check_cms_used(website): 6 | """ 7 | Checks which CMS (if any) is used by a website based on certain telltale patterns in its content. 8 | 9 | Args: 10 | website (str): The website URL to check. 
11 | 12 | Returns: 13 | str: 14 | - "🟢 (CMS Name)" if a CMS is detected 15 | - "🔴" if no CMS is detected 16 | - "⚪" if an error occurs 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'CMSChecker/1.0' 24 | } 25 | 26 | cms_patterns = { 27 | "WordPress": ["wp-", "wp-content", "wp-includes", "wp-json", "xmlrpc.php"], 28 | "Drupal": ["Drupal", "sites/default/files", "drupal.js"], 29 | "Joomla": ["Joomla", "/templates/joomla/", "index.php?option=com_"], 30 | "Wix": ["wix.com", "wix-public", "wixstatic"], 31 | "Squarespace": ["squarespace.com", "static.squarespace.com"], 32 | "Shopify": ["shopify", "cdn.shopify.com"], 33 | "Magento": ["Magento", "mage/", "static/version", "skin/frontend"] 34 | } 35 | 36 | try: 37 | # Method 1: Direct HTML content analysis 38 | response = requests.get(website, headers=headers, timeout=10) 39 | response.raise_for_status() 40 | content = response.text 41 | 42 | # Search for CMS-specific patterns in the website content 43 | for cms, patterns in cms_patterns.items(): 44 | if any(pattern in content for pattern in patterns): 45 | print(f"Detected CMS: {cms} for {website}.") 46 | return f"🟢 ({cms})" 47 | 48 | # Method 2: Additional heuristic checks with BeautifulSoup 49 | soup = BeautifulSoup(content, 'html.parser') 50 | 51 | # Check for meta tags or generator information that might indicate a CMS 52 | meta_generator = soup.find('meta', attrs={'name': 'generator'}) 53 | if meta_generator and meta_generator.get('content'): 54 | generator_content = meta_generator['content'].lower() 55 | for cms in cms_patterns: 56 | if cms.lower() in generator_content: 57 | print(f"Detected CMS via meta tag: {cms} for {website}.") 58 | return f"🟢 ({cms})" 59 | 60 | print(f"No CMS detected for {website}.") 61 | return "🔴" 62 | 63 | except (Timeout, HTTPError, RequestException) as e: 64 | print(f"Request error occurred while checking CMS for {website}: {e}") 65 | return "⚪" 66 | except Exception as e: 67 | print(f"An unexpected error occurred while checking CMS for {website}: {e}") 68 | return "⚪" 69 | -------------------------------------------------------------------------------- /checks/check_clientside_rendering.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | def check_clientside_rendering(website, threshold=10): 6 | """ 7 | Checks if a website relies heavily on client-side rendering by counting the number of script tags and other indicators. 8 | 9 | Args: 10 | website (str): The website URL to check. 11 | threshold (int): The threshold above which the number of scripts indicates heavy client-side rendering. 
12 | 13 | Returns: 14 | str: 15 | - "🟢" if the number of scripts is below the threshold 16 | - "🟠" if it's close to the threshold 17 | - "🔴" if above the threshold 18 | - "⚪" if an error occurs 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'ClientSideRenderingChecker/1.0' 26 | } 27 | 28 | try: 29 | # Method 1: Check number of script tags 30 | response = requests.get(website, headers=headers, timeout=10) 31 | response.raise_for_status() 32 | soup = BeautifulSoup(response.content, 'html.parser') 33 | scripts = soup.find_all('script') 34 | 35 | num_scripts = len(scripts) 36 | 37 | # Method 2: Additional check for specific JavaScript libraries and frameworks 38 | # Check for common client-side frameworks that are heavy on client-side rendering 39 | frameworks = ['react', 'angular', 'vue', 'next', 'nuxt', 'svelte', 'ember', 'backbone'] 40 | framework_detected = False 41 | 42 | for script in scripts: 43 | src = script.get('src', '').lower() 44 | content = (script.string or '').lower() 45 | if any(framework in src or framework in content for framework in frameworks): 46 | framework_detected = True 47 | break 48 | 49 | # Determine result based on number of script tags and framework detection 50 | if num_scripts > threshold or framework_detected: 51 | print(f"Heavy client-side rendering detected for {website}.") 52 | return "🔴" 53 | elif threshold - 3 <= num_scripts <= threshold: 54 | print(f"Moderate client-side rendering detected for {website}.") 55 | return "🟠" 56 | else: 57 | print(f"Minimal client-side rendering detected for {website}.") 58 | return "🟢" 59 | 60 | except (Timeout, HTTPError, RequestException) as e: 61 | print(f"Request error occurred while checking client-side rendering for {website}: {e}") 62 | return "⚪" 63 | except Exception as e: 64 | print(f"An unexpected error occurred while checking client-side rendering for {website}: {e}") 65 | return "⚪" 66 | -------------------------------------------------------------------------------- /checks/check_brotli_compression.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | 4 | def check_brotli_compression(website): 5 | """ 6 | Check if the website supports Brotli compression. 7 | 8 | Args: 9 | website (str): The URL of the website to be checked. 
10 | 11 | Returns: 12 | str: 13 | - "🟢" if Brotli compression is enabled 14 | - "🔴" if Brotli compression is not enabled 15 | - "⚪" if an error occurs 16 | """ 17 | # Ensure the website starts with 'http://' or 'https://' 18 | if not website.startswith(('http://', 'https://')): 19 | website = f"https://{website}" 20 | 21 | headers = { 22 | "Accept-Encoding": "gzip, deflate, br", 23 | "User-Agent": "BrotliCompressionChecker/1.0" 24 | } 25 | 26 | try: 27 | # Method 1: Direct HTTP Request with Brotli Accept-Encoding Header 28 | response = requests.get(website, headers=headers, timeout=10) 29 | response.raise_for_status() 30 | 31 | # Check if the response indicates Brotli compression 32 | if 'br' in response.headers.get('Content-Encoding', ''): 33 | print(f"Brotli compression is enabled for {website}.") 34 | return "🟢" 35 | else: 36 | print(f"Brotli compression is not enabled for {website}.") 37 | return "🔴" 38 | 39 | except (Timeout, HTTPError, RequestException) as e: 40 | print(f"Request error occurred while checking Brotli compression for {website}: {e}") 41 | 42 | # Method 2: Alternative Check via Content-Length Comparison (Fallback) 43 | try: 44 | headers_gzip = { 45 | "Accept-Encoding": "gzip, deflate", 46 | "User-Agent": "BrotliCompressionChecker/1.0" 47 | } 48 | headers_brotli = { 49 | "Accept-Encoding": "br", 50 | "User-Agent": "BrotliCompressionChecker/1.0" 51 | } 52 | 53 | # Request with Gzip/Deflate encoding 54 | response_gzip = requests.get(website, headers=headers_gzip, timeout=10) 55 | response_gzip.raise_for_status() 56 | 57 | # Request with Brotli encoding 58 | response_brotli = requests.get(website, headers=headers_brotli, timeout=10) 59 | response_brotli.raise_for_status() 60 | 61 | # Check if Brotli encoding is actually used in response 62 | if 'br' in response_brotli.headers.get('Content-Encoding', ''): 63 | print(f"Brotli compression is enabled for {website} (fallback method).") 64 | return "🟢" 65 | else: 66 | print(f"Brotli compression is not enabled for {website} (fallback method).") 67 | return "🔴" 68 | 69 | except Exception as e: 70 | print(f"Error during fallback Brotli check for {website}: {e}") 71 | return "⚪" 72 | 73 | return "⚪" 74 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Website Monitor - Docker Compose Configuration 2 | 3 | services: 4 | # Main Website Monitor API Service 5 | website-monitor-api: 6 | build: . 7 | container_name: website-monitor-api 8 | ports: 9 | - "8000:8000" 10 | volumes: 11 | - ./config.yaml:/app/config.yaml:ro 12 | - ./reports:/app/reports 13 | - ./logs:/app/logs 14 | environment: 15 | - PYTHONUNBUFFERED=1 16 | - API_HOST=0.0.0.0 17 | - API_PORT=8000 18 | - PAGESPEED_API_KEY=${PAGESPEED_API_KEY:-} 19 | command: ["python", "api.py"] 20 | restart: unless-stopped 21 | healthcheck: 22 | test: ["CMD", "curl", "-f", "http://localhost:8000/health"] 23 | interval: 30s 24 | timeout: 10s 25 | retries: 3 26 | start_period: 40s 27 | networks: 28 | - website-monitor-network 29 | 30 | # Scheduled Monitor Service (runs checks periodically) 31 | website-monitor-scheduler: 32 | build: . 
33 | container_name: website-monitor-scheduler 34 | volumes: 35 | - ./config.yaml:/app/config.yaml:ro 36 | - ./reports:/app/reports 37 | - ./logs:/app/logs 38 | - ./README.md:/app/README.md 39 | - ./report_template.md:/app/report_template.md:ro 40 | environment: 41 | - PYTHONUNBUFFERED=1 42 | - PAGESPEED_API_KEY=${PAGESPEED_API_KEY:-} 43 | - MONITOR_INTERVAL=${MONITOR_INTERVAL:-3600} # Default: 1 hour 44 | command: ["python", "scheduler.py"] 45 | restart: unless-stopped 46 | depends_on: 47 | - website-monitor-api 48 | networks: 49 | - website-monitor-network 50 | 51 | # Optional: Nginx reverse proxy for production 52 | nginx: 53 | image: nginx:alpine 54 | container_name: website-monitor-nginx 55 | ports: 56 | - "80:80" 57 | - "443:443" 58 | volumes: 59 | - ./docker/nginx.conf:/etc/nginx/nginx.conf:ro 60 | - ./docker/ssl:/etc/nginx/ssl:ro 61 | depends_on: 62 | - website-monitor-api 63 | restart: unless-stopped 64 | profiles: 65 | - production 66 | 67 | # Optional: Redis for caching and task queues 68 | redis: 69 | image: redis:7-alpine 70 | container_name: website-monitor-redis 71 | ports: 72 | - "6379:6379" 73 | volumes: 74 | - redis_data:/data 75 | restart: unless-stopped 76 | profiles: 77 | - production 78 | 79 | # Optional: PostgreSQL for storing monitoring results 80 | postgres: 81 | image: postgres:15-alpine 82 | container_name: website-monitor-db 83 | environment: 84 | POSTGRES_DB: website_monitor 85 | POSTGRES_USER: monitor_user 86 | POSTGRES_PASSWORD: ${DB_PASSWORD:-secure_password_change_me} 87 | ports: 88 | - "5432:5432" 89 | volumes: 90 | - postgres_data:/var/lib/postgresql/data 91 | - ./docker/init.sql:/docker-entrypoint-initdb.d/init.sql:ro 92 | restart: unless-stopped 93 | profiles: 94 | - production 95 | 96 | volumes: 97 | redis_data: 98 | postgres_data: 99 | 100 | networks: 101 | website-monitor-network: 102 | driver: bridge 103 | -------------------------------------------------------------------------------- /checks/check_pagespeed_performances.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, HTTPError, Timeout 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_pagespeed_performances(website: str, api_key: str = None) -> str: 10 | """ 11 | Checks the PageSpeed Insights performance score for a website with enhanced error handling. 12 | 13 | Args: 14 | website (str): The URL of the website to be checked. 15 | api_key (str, optional): The Google PageSpeed Insights API key. Defaults to None. 16 | 17 | Returns: 18 | str: 19 | - An integer representing the PageSpeed score if successful. 20 | - "⚪" if any errors occur during the check or if no API key was provided. 
21 | """ 22 | # Input validation and URL normalization 23 | if not website or not isinstance(website, str): 24 | logger.error(f"Invalid website input: {website}") 25 | return "⚪" 26 | 27 | if not api_key: 28 | logger.error("No API key provided for PageSpeed check.") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | if not website.startswith(('http://', 'https://')): 33 | website = f"https://{website}" 34 | 35 | try: 36 | # Enhanced API call with better parameters 37 | pagespeed_url = f"https://www.googleapis.com/pagespeedonline/v5/runPagespeed" 38 | params = { 39 | 'url': website, 40 | 'key': api_key, 41 | 'category': 'performance', 42 | 'strategy': 'mobile' # Default to mobile strategy 43 | } 44 | 45 | response = requests.get(pagespeed_url, params=params, timeout=30) 46 | response.raise_for_status() 47 | data = response.json() 48 | 49 | # Enhanced data extraction 50 | lighthouse_result = data.get("lighthouseResult", {}) 51 | performance_category = lighthouse_result.get("categories", {}).get("performance", {}) 52 | score = performance_category.get("score") 53 | 54 | if score is not None: 55 | score_percentage = int(score * 100) 56 | logger.info(f"PageSpeed score for {website} is {score_percentage}.") 57 | return str(score_percentage) 58 | 59 | logger.warning(f"PageSpeed score not found for {website}.") 60 | return "⚪" 61 | 62 | except Timeout: 63 | logger.error(f"Timeout occurred while fetching PageSpeed data for {website}") 64 | return "⚪" 65 | except HTTPError as http_err: 66 | logger.error(f"HTTP error occurred while fetching PageSpeed data for {website}: {http_err}") 67 | return "⚪" 68 | except RequestException as req_err: 69 | logger.error(f"Request error occurred while fetching PageSpeed data for {website}: {req_err}") 70 | return "⚪" 71 | except ValueError as json_err: 72 | logger.error(f"JSON parsing error occurred while fetching PageSpeed data for {website}: {json_err}") 73 | return "⚪" 74 | except Exception as e: 75 | logger.error(f"An unexpected error occurred while checking PageSpeed data for {website}: {e}") 76 | return "⚪" 77 | -------------------------------------------------------------------------------- /checks/check_cookie_policy.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | def check_cookie_policy(website): 6 | """ 7 | Verify if the website has a cookie policy and it's accessible to users. 8 | 9 | Args: 10 | website (str): URL of the website to be checked. 
11 | 12 | Returns: 13 | str: 14 | - "🟢" if a cookie policy is found and accessible 15 | - "🔴" if no cookie policy is found or if it's inaccessible 16 | - "⚪" for any errors 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'CookiePolicyChecker/1.0' 24 | } 25 | 26 | try: 27 | # Method 1: Direct page analysis for cookie policy 28 | response = requests.get(website, headers=headers, timeout=10) 29 | response.raise_for_status() 30 | 31 | soup = BeautifulSoup(response.text, 'html.parser') 32 | 33 | # Common keywords associated with cookie policies 34 | keywords = ["cookie policy", "cookie statement", "use of cookies", "privacy policy"] 35 | 36 | # Check for the presence of these keywords in anchor tags (links) 37 | anchors = soup.find_all('a', string=lambda text: text and any(keyword in text.lower() for keyword in keywords)) 38 | if anchors: 39 | print(f"Cookie policy found in links for {website}.") 40 | return "🟢" 41 | 42 | # If not found in links, check if any of the keywords are present in the page's text 43 | page_text = soup.get_text().lower() 44 | if any(keyword in page_text for keyword in keywords): 45 | print(f"Cookie policy text found on the page for {website}.") 46 | return "🟢" 47 | 48 | print(f"No cookie policy found for {website}.") 49 | return "🔴" 50 | 51 | except (Timeout, HTTPError, RequestException) as e: 52 | print(f"Request error occurred while checking cookie policy for {website}: {e}") 53 | 54 | # Method 2: Check for common cookie policy URLs (Fallback) 55 | try: 56 | common_paths = ["/cookie-policy", "/cookies", "/privacy-policy", "/legal/cookies", "/legal/privacy-policy"] 57 | for path in common_paths: 58 | try: 59 | policy_response = requests.get(f"{website.rstrip('/')}{path}", headers=headers, timeout=5) 60 | if policy_response.status_code == 200: 61 | print(f"Cookie policy found at {website.rstrip('/')}{path}.") 62 | return "🟢" 63 | except (Timeout, HTTPError, RequestException): 64 | continue 65 | 66 | print(f"No cookie policy found for {website} (fallback method).") 67 | return "🔴" 68 | 69 | except Exception as e: 70 | print(f"Error during fallback cookie policy check for {website}: {e}") 71 | return "⚪" 72 | except Exception as e: 73 | print(f"An unexpected error occurred while checking cookie policy for {website}: {e}") 74 | return "⚪" 75 | -------------------------------------------------------------------------------- /checks/check_open_graph_protocol.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from requests.exceptions import RequestException, HTTPError, Timeout 5 | 6 | # Configure logging 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_open_graph_protocol(website: str) -> str: 11 | """ 12 | Check a given website for the presence of essential Open Graph Protocol meta tags with enhanced validation. 13 | 14 | Args: 15 | website (str): The URL of the website to be checked. 16 | 17 | Returns: 18 | str: 19 | - "🟢" if essential Open Graph Protocol meta tags are found. 20 | - "🔴" if essential Open Graph Protocol meta tags are missing. 21 | - "⚪" for any errors. 
22 | """ 23 | # Input validation and URL normalization 24 | if not website or not isinstance(website, str): 25 | logger.error(f"Invalid website input: {website}") 26 | return "⚪" 27 | 28 | website = website.strip() 29 | if not website.startswith(('http://', 'https://')): 30 | website = f"https://{website}" 31 | 32 | headers = { 33 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 34 | } 35 | 36 | try: 37 | # Make a request to the website 38 | response = requests.get(website, headers=headers, timeout=15) 39 | response.raise_for_status() 40 | 41 | # Parse the HTML content using BeautifulSoup 42 | soup = BeautifulSoup(response.content, 'html.parser') 43 | 44 | # List of essential Open Graph tags 45 | essential_tags = {'og:title', 'og:type', 'og:image', 'og:url'} 46 | recommended_tags = {'og:description', 'og:site_name', 'og:locale'} 47 | 48 | # Extract all Open Graph meta tags 49 | meta_tags = soup.find_all('meta', property=lambda x: x and x.startswith('og:')) 50 | 51 | # Extract the properties of found meta tags 52 | found_tags = {tag['property'] for tag in meta_tags if tag.has_attr('property') and tag.get('content')} 53 | 54 | logger.info(f"Open Graph analysis for {website}: {len(found_tags)} tags found") 55 | logger.debug(f"Found OG tags: {found_tags}") 56 | 57 | # Check if all essential tags are present 58 | missing_essential = essential_tags - found_tags 59 | found_recommended = recommended_tags.intersection(found_tags) 60 | 61 | if not missing_essential: 62 | logger.info(f"All essential Open Graph tags found for {website}.") 63 | if len(found_recommended) >= 2: 64 | return "🟢" # Has essential + recommended tags 65 | return "🟢" # Has essential tags 66 | else: 67 | logger.warning(f"Missing essential Open Graph tags for {website}: {missing_essential}") 68 | return "🔴" 69 | 70 | except (Timeout, HTTPError, RequestException) as e: 71 | logger.warning(f"Request error occurred while checking Open Graph Protocol tags on {website}: {e}") 72 | return "⚪" 73 | except Exception as e: 74 | logger.error(f"An unexpected error occurred while checking Open Graph Protocol tags on {website}: {e}") 75 | return "⚪" 76 | -------------------------------------------------------------------------------- /checks/check_alt_tags.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from bs4 import BeautifulSoup 5 | 6 | def check_alt_tags(website): 7 | """ 8 | Check if all the images on the website have alt tags. 9 | 10 | Args: 11 | website (str): The URL of the website to be checked. 
12 | 13 | Returns: 14 | str: 15 | - "🔴" if no image has an alt tag 16 | - "🟠" if some images have alt tags and one or more doesn't 17 | - "🟢" if all images have alt tags 18 | - "⚪" if an error occurs 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'AltTagChecker/1.0' 26 | } 27 | 28 | try: 29 | # Method 1: Direct HTML content analysis using BeautifulSoup 30 | response = requests.get(website, headers=headers, timeout=10) 31 | response.raise_for_status() # Raise an error for HTTP issues 32 | soup = BeautifulSoup(response.text, 'lxml') 33 | 34 | # Find all images and count those with and without alt tags 35 | images = soup.find_all('img') 36 | total_images = len(images) 37 | images_with_alt = sum(1 for img in images if img.get('alt') and img.get('alt').strip()) 38 | 39 | # Determine the result based on the alt tag analysis 40 | if total_images == 0: 41 | print(f"No images found on {website}.") 42 | return "🟢" # No images, hence all images (none) have alt tags by definition 43 | elif images_with_alt == 0: 44 | print(f"No images with alt tags found on {website}.") 45 | return "🔴" 46 | elif images_with_alt < total_images: 47 | print(f"{total_images - images_with_alt} images without alt tags found on {website}.") 48 | return "🟠" 49 | else: 50 | return "🟢" 51 | 52 | except (Timeout, HTTPError, RequestException) as e: 53 | print(f"Request error occurred while checking alt tags for {website}: {e}") 54 | 55 | # Method 2: Alternative Heuristic Check via Meta Tags (Fallback) 56 | try: 57 | # Try to get the response again for fallback analysis 58 | response = requests.get(website, headers=headers, timeout=10) 59 | response.raise_for_status() 60 | soup = BeautifulSoup(response.text, 'html.parser') 61 | 62 | # Look for meta tags that could indicate a focus on accessibility 63 | accessibility_tags = soup.find_all('meta', {'name': re.compile(r'(description|keywords|viewport)', re.IGNORECASE)}) 64 | 65 | # Heuristic: If the website uses meta tags commonly associated with accessibility 66 | if accessibility_tags: 67 | print(f"Some meta tags found that might indicate a focus on accessibility on {website}.") 68 | return "🟠" 69 | 70 | return "🔴" # Assume no focus on accessibility if no relevant meta tags found 71 | 72 | except Exception as e: 73 | print(f"Error during heuristic check for alt tags for {website}: {e}") 74 | return "⚪" 75 | 76 | return "⚪" 77 | -------------------------------------------------------------------------------- /checks/check_broken_links.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urljoin, urlparse 5 | 6 | def check_broken_links(website): 7 | """ 8 | Check for broken links on the provided website. 9 | 10 | Args: 11 | website (str): The URL of the website to be checked. 
12 | 13 | Returns: 14 | str: 15 | - "🟢" if no broken links are found 16 | - "🟠" if some broken links are found 17 | - "🔴" if all links are broken 18 | - "⚪" if an error occurs 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'BrokenLinkChecker/1.0' 26 | } 27 | 28 | checked_links = set() # To avoid checking the same URL twice 29 | broken_link_count = 0 30 | total_links = 0 31 | max_links_to_check = 20 # Limit to avoid excessive requests 32 | 33 | try: 34 | # Method 1: Direct HTML content analysis using BeautifulSoup 35 | response = requests.get(website, headers=headers, timeout=10) 36 | response.raise_for_status() 37 | soup = BeautifulSoup(response.text, 'lxml') 38 | 39 | # Find all anchor tags with href attributes 40 | links = soup.find_all('a', href=True) 41 | 42 | for link in links[:max_links_to_check]: # Limit number of links to check 43 | href = link.get('href') 44 | 45 | # Skip anchor links, JavaScript calls, and mailto links 46 | if href.startswith(('#', 'javascript:', 'mailto:')): 47 | continue 48 | 49 | # Convert relative URLs to absolute URLs 50 | full_url = urljoin(website, href) 51 | 52 | # Skip already checked links 53 | if full_url in checked_links: 54 | continue 55 | 56 | checked_links.add(full_url) 57 | 58 | try: 59 | # Check the status of the link 60 | link_response = requests.get(full_url, headers=headers, allow_redirects=True, timeout=5) 61 | if 400 <= link_response.status_code < 600: 62 | print(f"Broken link found: {full_url} (Status: {link_response.status_code})") 63 | broken_link_count += 1 64 | 65 | except (Timeout, HTTPError, RequestException) as e: 66 | print(f"Error while checking link: {full_url}: {e}") 67 | broken_link_count += 1 68 | 69 | total_links += 1 70 | 71 | # Determine the result based on the broken link analysis 72 | if total_links == 0: 73 | print("No valid links found on the website.") 74 | return "⚪" 75 | elif broken_link_count == 0: 76 | return "🟢" 77 | elif broken_link_count < total_links: 78 | return "🟠" 79 | else: 80 | return "🔴" 81 | 82 | except (Timeout, HTTPError, RequestException) as e: 83 | print(f"Request error occurred while checking broken links for {website}: {e}") 84 | return "⚪" 85 | except Exception as e: 86 | print(f"An unexpected error occurred while checking broken links for {website}: {e}") 87 | return "⚪" 88 | -------------------------------------------------------------------------------- /checks/check_mobile_friendly.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import logging 4 | from requests.exceptions import RequestException, HTTPError 5 | from urllib.parse import urlparse 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_mobile_friendly(website: str, api_key: str) -> str: 10 | """ 11 | Check if the given website is mobile-friendly using the Google Mobile-Friendly Test API. 12 | 13 | Args: 14 | website (str): The URL of the website to be checked. 15 | api_key (str): The API key for accessing the Google Mobile-Friendly Test API. 16 | 17 | Returns: 18 | str: 19 | - "🟢" if the website is mobile-friendly. 20 | - "🔴" if the website is not mobile-friendly. 21 | - "⚪" for any errors. 
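        Example (illustrative sketch, not from the original docs; "YOUR_API_KEY" is a placeholder):
            result = check_mobile_friendly("example.com", api_key="YOUR_API_KEY")
            # result is one of "🟢", "🔴" or "⚪"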
22 | """ 23 | # Input validation and URL normalization 24 | if not website or not api_key: 25 | logger.error("Website URL and API key are required") 26 | return "⚪" 27 | 28 | # Normalize URL 29 | if not website.startswith(('http://', 'https://')): 30 | website = f"https://{website}" 31 | 32 | try: 33 | parsed_url = urlparse(website) 34 | if not parsed_url.netloc: 35 | logger.error(f"Invalid URL format: {website}") 36 | return "⚪" 37 | except Exception as e: 38 | logger.error(f"URL parsing error for {website}: {e}") 39 | return "⚪" 40 | 41 | api_url = f"https://searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run?key={api_key}" 42 | payload = {"url": website} 43 | headers = {'Content-Type': 'application/json'} 44 | 45 | try: 46 | # Make a POST request to the Google API 47 | response = requests.post(api_url, headers=headers, json=payload, timeout=30) 48 | response.raise_for_status() 49 | 50 | # Parse the response JSON 51 | result = response.json() 52 | 53 | # Enhanced detection patterns 54 | mobile_friendliness = result.get('mobileFriendliness', '').upper() 55 | 56 | if mobile_friendliness == 'MOBILE_FRIENDLY': 57 | logger.info(f"Website {website} is mobile-friendly") 58 | return "🟢" 59 | elif mobile_friendliness == 'NOT_MOBILE_FRIENDLY': 60 | logger.warning(f"Website {website} is not mobile-friendly") 61 | return "🔴" 62 | else: 63 | logger.error(f"Unexpected mobile friendliness status: {mobile_friendliness}") 64 | return "⚪" 65 | 66 | except requests.HTTPError as e: 67 | if e.response.status_code == 429: 68 | logger.error(f"API rate limit exceeded for {website}") 69 | elif e.response.status_code == 403: 70 | logger.error(f"API key invalid or insufficient permissions for {website}") 71 | else: 72 | logger.error(f"HTTP error {e.response.status_code} while checking {website}: {e}") 73 | return "⚪" 74 | except requests.RequestException as e: 75 | logger.error(f"Request error while checking mobile-friendliness for {website}: {e}") 76 | return "⚪" 77 | except (KeyError, json.JSONDecodeError) as e: 78 | logger.error(f"Invalid API response format for {website}: {e}") 79 | return "⚪" 80 | except Exception as e: 81 | logger.error(f"Unexpected error while checking mobile-friendliness for {website}: {e}") 82 | return "⚪" 83 | -------------------------------------------------------------------------------- /checks/check_xss_protection.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from typing import Optional 4 | from requests.exceptions import RequestException, Timeout, HTTPError 5 | 6 | # Configure logging 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_xss_protection(website: str, timeout_seconds: Optional[int] = 10) -> str: 11 | """ 12 | Check if the X-XSS-Protection header is present in the HTTP response headers of a website. 13 | 14 | Args: 15 | website (str): The URL of the website to be checked. 16 | timeout_seconds (int, optional): Timeout for the HTTP request in seconds. Default is 10 seconds. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if X-XSS-Protection header is present and properly configured. 21 | - "🟠" if header is present but with suboptimal configuration. 22 | - "🔴" if X-XSS-Protection header is absent. 23 | - "⚪" for any errors or non-success HTTP responses. 
24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪" 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Make request with proper timeout and error handling 40 | response = requests.get(website, headers=headers, timeout=timeout_seconds) 41 | response.raise_for_status() 42 | 43 | # Check X-XSS-Protection header 44 | xss_protection = response.headers.get('X-XSS-Protection', '').lower() 45 | 46 | if xss_protection: 47 | logger.info(f"X-XSS-Protection header found for {website}: {xss_protection}") 48 | 49 | # Enhanced validation of header value 50 | if '1; mode=block' in xss_protection: 51 | return "🟢" # Optimal configuration 52 | elif xss_protection.startswith('1'): 53 | return "🟠" # Present but not optimal 54 | else: 55 | return "🔴" # Present but disabled (0) 56 | else: 57 | # Check for Content-Security-Policy as alternative protection 58 | csp_header = response.headers.get('Content-Security-Policy', '') 59 | if csp_header and 'unsafe-inline' not in csp_header.lower(): 60 | logger.info(f"No X-XSS-Protection but CSP found for {website}") 61 | return "🟠" # CSP provides some XSS protection 62 | 63 | logger.warning(f"X-XSS-Protection header missing for {website}") 64 | return "🔴" 65 | 66 | except Timeout: 67 | logger.warning(f"Timeout occurred while checking XSS protection for {website}") 68 | return "⚪" 69 | except HTTPError as e: 70 | logger.warning(f"HTTP error for {website}: {e}") 71 | return "⚪" 72 | except RequestException as e: 73 | logger.warning(f"Request error for {website}: {e}") 74 | return "⚪" 75 | except Exception as e: 76 | logger.error(f"Unexpected error for {website}: {e}") 77 | return "⚪" 78 | -------------------------------------------------------------------------------- /checks/check_internationalization.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urlparse 5 | import re 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_internationalization(website: str) -> str: 10 | """ 11 | Checks if a website has implemented internationalization (i18n) using the lang attribute. 12 | 13 | Args: 14 | website (str): The URL of the website to check. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if i18n is detected 19 | - "🟡" if partial i18n is detected 20 | - "⚪" if i18n is not detected or an error occurred. 
21 | """ 22 | # Input validation and URL normalization 23 | if not website: 24 | logger.error("Website URL is required") 25 | return "⚪" 26 | 27 | if not website.startswith(('http://', 'https://')): 28 | website = f"https://{website}" 29 | 30 | try: 31 | parsed_url = urlparse(website) 32 | if not parsed_url.netloc: 33 | logger.error(f"Invalid URL format: {website}") 34 | return "⚪" 35 | except Exception as e: 36 | logger.error(f"URL parsing error for {website}: {e}") 37 | return "⚪" 38 | 39 | try: 40 | response = requests.get(website, timeout=15) 41 | response.raise_for_status() 42 | soup = BeautifulSoup(response.content, "html.parser") 43 | 44 | # Enhanced detection patterns 45 | i18n_indicators = [] 46 | 47 | # Check HTML lang attribute 48 | html_tag = soup.find("html") 49 | if html_tag and html_tag.has_attr("lang"): 50 | lang_value = html_tag.get("lang", "").strip() 51 | if lang_value and len(lang_value) >= 2: 52 | i18n_indicators.append(f"HTML lang attribute: {lang_value}") 53 | 54 | # Check for hreflang attributes in link tags 55 | hreflang_links = soup.find_all("link", attrs={"hreflang": True}) 56 | if hreflang_links: 57 | i18n_indicators.append(f"hreflang links: {len(hreflang_links)} found") 58 | 59 | # Check for language-specific meta tags 60 | lang_meta = soup.find_all("meta", attrs={"http-equiv": "content-language"}) 61 | if lang_meta: 62 | i18n_indicators.append("Content-Language meta tag found") 63 | 64 | # Check for common i18n URL patterns 65 | if re.search(r'/[a-z]{2}(?:-[A-Z]{2})?/', website): 66 | i18n_indicators.append("Language code in URL pattern") 67 | 68 | # Improved scoring and categorization 69 | if len(i18n_indicators) >= 2: 70 | logger.info(f"Strong internationalization detected for {website}: {', '.join(i18n_indicators)}") 71 | return "🟢" 72 | elif len(i18n_indicators) == 1: 73 | logger.info(f"Basic internationalization detected for {website}: {i18n_indicators[0]}") 74 | return "🟡" 75 | else: 76 | logger.info(f"No internationalization detected for {website}") 77 | return "⚪" 78 | 79 | except requests.exceptions.RequestException as e: 80 | logger.error(f"Request error while checking internationalization for {website}: {e}") 81 | return "⚪" 82 | except Exception as e: 83 | logger.error(f"Unexpected error while checking internationalization for {website}: {e}") 84 | return "⚪" 85 | -------------------------------------------------------------------------------- /checks/check_redirects.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_redirects(website: str) -> str: 10 | """ 11 | Verify if a website using HTTP redirects to its HTTPS counterpart with enhanced security analysis. 12 | 13 | Args: 14 | website (str): The URL (without protocol) of the website to check. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if the site redirects from HTTP to HTTPS securely. 19 | - "🟠" if redirect exists but has minor security issues. 20 | - "🔴" if it does not redirect from HTTP to HTTPS or has security issues. 21 | - "⚪" in case of an error. 
22 | """ 23 | # Input validation and URL normalization 24 | if not website or not isinstance(website, str): 25 | logger.error(f"Invalid website input: {website}") 26 | return "⚪" 27 | 28 | website = website.strip() 29 | if website.startswith(('http://', 'https://')): 30 | from urllib.parse import urlparse 31 | parsed = urlparse(website) 32 | website = parsed.netloc 33 | 34 | headers = { 35 | "User-Agent": "HTTPtoHTTPSRedirectChecker/2.0" 36 | } 37 | 38 | try: 39 | # Make an HTTP request to the site and prevent automatic redirects 40 | response = requests.get(f"http://{website}", headers=headers, allow_redirects=False, timeout=15) 41 | redirect_location = response.headers.get('Location', '') 42 | 43 | # Enhanced redirect analysis 44 | if response.status_code in [301, 302, 303, 307, 308] and redirect_location: 45 | logger.debug(f"Redirect detected: {response.status_code} -> {redirect_location}") 46 | 47 | # Check if redirect is to HTTPS 48 | if redirect_location.startswith(f"https://{website}"): 49 | # Check for permanent redirect (301, 308) - more secure 50 | if response.status_code in [301, 308]: 51 | logger.info(f"Website {website} has secure permanent redirect to HTTPS") 52 | return "🟢" 53 | else: 54 | logger.info(f"Website {website} redirects to HTTPS but uses temporary redirect") 55 | return "🟠" 56 | elif redirect_location.startswith('https://'): 57 | # Redirects to HTTPS but different domain 58 | logger.warning(f"Website {website} redirects to different HTTPS domain: {redirect_location}") 59 | return "🟠" 60 | else: 61 | # Redirects but not to HTTPS 62 | logger.warning(f"Website {website} redirects but not to HTTPS: {redirect_location}") 63 | return "🔴" 64 | else: 65 | # No redirect or invalid redirect 66 | logger.warning(f"Website {website} does not redirect from HTTP to HTTPS") 67 | return "🔴" 68 | 69 | except (Timeout, HTTPError) as e: 70 | logger.warning(f"HTTP/Timeout error while checking redirects for {website}: {e}") 71 | return "⚪" 72 | except RequestException as e: 73 | logger.warning(f"Request error while checking redirects for {website}: {e}") 74 | return "⚪" 75 | except Exception as e: 76 | logger.error(f"Unexpected error while checking redirects for {website}: {e}") 77 | return "⚪" 78 | -------------------------------------------------------------------------------- /checks/check_hsts.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urlparse 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def check_hsts(website: str) -> str: 9 | """ 10 | Check if the website implements HTTP Strict Transport Security (HSTS). 11 | 12 | Args: 13 | website (str): URL of the website to be checked. 14 | 15 | Returns: 16 | str: 17 | - "🟢" if the site has HSTS enabled with good configuration. 18 | - "🟡" if HSTS is enabled but with suboptimal configuration. 19 | - "🔴" if the site does not have HSTS enabled. 20 | - "⚪" if an error occurred during the check. 
21 | """ 22 | # Input validation and URL normalization 23 | if not website: 24 | logger.error("Website URL is required") 25 | return "⚪" 26 | 27 | if not website.startswith(('http://', 'https://')): 28 | website = f"https://{website}" 29 | 30 | try: 31 | parsed_url = urlparse(website) 32 | if not parsed_url.netloc: 33 | logger.error(f"Invalid URL format: {website}") 34 | return "⚪" 35 | except Exception as e: 36 | logger.error(f"URL parsing error for {website}: {e}") 37 | return "⚪" 38 | 39 | headers = { 40 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36" 41 | } 42 | 43 | try: 44 | # Make a request to the website 45 | response = requests.get(website, headers=headers, timeout=15) 46 | response.raise_for_status() 47 | 48 | # Enhanced detection patterns 49 | hsts_header = response.headers.get('Strict-Transport-Security', '') 50 | 51 | if not hsts_header: 52 | logger.warning(f"No HSTS header found for {website}") 53 | return "🔴" 54 | 55 | # Improved scoring and categorization 56 | hsts_lower = hsts_header.lower() 57 | max_age_match = None 58 | 59 | # Extract max-age value 60 | import re 61 | max_age_pattern = re.search(r'max-age=(\d+)', hsts_lower) 62 | if max_age_pattern: 63 | max_age = int(max_age_pattern.group(1)) 64 | 65 | # Check for security best practices 66 | has_include_subdomains = 'includesubdomains' in hsts_lower 67 | has_preload = 'preload' in hsts_lower 68 | 69 | # Categorize based on configuration quality 70 | if max_age >= 31536000 and has_include_subdomains: # 1 year or more with subdomains 71 | logger.info(f"Strong HSTS configuration for {website}: max-age={max_age}, includeSubDomains={has_include_subdomains}, preload={has_preload}") 72 | return "🟢" 73 | elif max_age >= 86400: # At least 1 day 74 | logger.info(f"Basic HSTS configuration for {website}: max-age={max_age}, includeSubDomains={has_include_subdomains}") 75 | return "🟡" 76 | else: 77 | logger.warning(f"Weak HSTS configuration for {website}: max-age too low ({max_age})") 78 | return "🟡" 79 | else: 80 | logger.warning(f"Invalid HSTS header format for {website}: {hsts_header}") 81 | return "🟡" 82 | 83 | except requests.RequestException as e: 84 | logger.error(f"Request error while checking HSTS for {website}: {e}") 85 | return "⚪" 86 | except Exception as e: 87 | logger.error(f"Unexpected error while checking HSTS for {website}: {e}") 88 | return "⚪" 89 | -------------------------------------------------------------------------------- /checks/check_server_response_time.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import statistics 4 | import logging 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_server_response_time(website: str, num_attempts: int = 3) -> str: 12 | """ 13 | Measure the server's response time with multiple attempts for accuracy. 14 | 15 | Args: 16 | website (str): URL of the website to be checked. 17 | num_attempts (int): Number of attempts to measure response time. 
18 | 19 | Returns: 20 | str: 21 | - "🟢" if the response time is excellent (under 0.5 seconds) 22 | - "🟠" if the response time is moderate (between 0.5 and 2 seconds) 23 | - "🔴" if the response time is slow (2 seconds or more) 24 | - "⚪" if an error occurs or the server does not respond in time 25 | """ 26 | # Input validation and URL normalization 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | if not website.startswith(('http://', 'https://')): 33 | website = f"https://{website}" 34 | 35 | headers = { 36 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 37 | } 38 | 39 | response_times = [] 40 | 41 | try: 42 | # Perform multiple measurements for accuracy 43 | for attempt in range(num_attempts): 44 | start_time = time.perf_counter() 45 | 46 | # Make the request and measure time to first byte 47 | response = requests.get(website, headers=headers, timeout=15, stream=True) 48 | 49 | # Time to first byte 50 | ttfb = time.perf_counter() - start_time 51 | response.raise_for_status() 52 | 53 | response_times.append(ttfb) 54 | logger.debug(f"Attempt {attempt + 1} for {website}: {ttfb:.3f}s") 55 | 56 | # Small delay between attempts 57 | if attempt < num_attempts - 1: 58 | time.sleep(0.5) 59 | 60 | # Calculate statistics 61 | avg_time = statistics.mean(response_times) 62 | median_time = statistics.median(response_times) 63 | min_time = min(response_times) 64 | max_time = max(response_times) 65 | 66 | logger.info(f"Response time stats for {website} - Avg: {avg_time:.3f}s, Median: {median_time:.3f}s, Range: {min_time:.3f}s-{max_time:.3f}s") 67 | 68 | # Enhanced categorization based on average response time 69 | if avg_time < 0.2: 70 | logger.info(f"Website {website} responded excellently: {avg_time:.3f}s average") 71 | return "🟢" 72 | elif avg_time < 0.5: 73 | logger.info(f"Website {website} responded very well: {avg_time:.3f}s average") 74 | return "🟢" 75 | elif avg_time < 2.0: 76 | logger.info(f"Website {website} responded moderately: {avg_time:.3f}s average") 77 | return "🟠" 78 | else: 79 | logger.warning(f"Website {website} responded slowly: {avg_time:.3f}s average") 80 | return "🔴" 81 | 82 | except Timeout: 83 | logger.warning(f"Timeout occurred while checking response time for {website}") 84 | return "🔴" 85 | except HTTPError as e: 86 | logger.warning(f"HTTP error for {website}: {e}") 87 | return "⚪" 88 | except RequestException as e: 89 | logger.warning(f"Request error for {website}: {e}") 90 | return "⚪" 91 | except Exception as e: 92 | logger.error(f"Unexpected error for {website}: {e}") 93 | return "⚪" 94 | -------------------------------------------------------------------------------- /checks/check_robot_txt.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urljoin 5 | 6 | # Configure logging 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_robot_txt(website): 11 | """ 12 | Verify the presence and basic validity of a robots.txt file on a website. 13 | 14 | Args: 15 | - website (str): The URL (without protocol) of the website to check. 16 | 17 | Returns: 18 | - str: "🟢" if the site has a valid robots.txt file, "🔴" otherwise, and 19 | "⚪" in case of an error. 
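    Example (illustrative): a minimal but well-formed file such as
        User-agent: *
        Disallow:
        Sitemap: https://example.com/sitemap.xml
    scores 6/6 in the heuristic below and is reported as "🟢".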
20 | """ 21 | # Input validation and URL normalization 22 | if not website or not isinstance(website, str): 23 | logger.error(f"Invalid website input: {website}") 24 | return "⚪" 25 | 26 | website = website.strip() 27 | if not website.startswith(('http://', 'https://')): 28 | website = f"https://{website}" 29 | 30 | headers = { 31 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 32 | } 33 | 34 | try: 35 | # Perform the HTTP request with a timeout 36 | robots_url = urljoin(website, '/robots.txt') 37 | response = requests.get(robots_url, headers=headers, timeout=15) 38 | response.raise_for_status() 39 | 40 | # Enhanced validation of robots.txt content 41 | content = response.text.lower() 42 | lines = [line.strip() for line in content.split('\n') if line.strip()] 43 | 44 | # Check for essential robots.txt directives 45 | has_user_agent = any(line.startswith('user-agent:') for line in lines) 46 | has_disallow = any(line.startswith('disallow:') for line in lines) 47 | has_allow = any(line.startswith('allow:') for line in lines) 48 | has_sitemap = any(line.startswith('sitemap:') for line in lines) 49 | 50 | # Additional validation checks 51 | valid_directives = {'user-agent:', 'disallow:', 'allow:', 'crawl-delay:', 'sitemap:', 'host:'} 52 | unknown_directives = [] 53 | 54 | for line in lines: 55 | if ':' in line and not line.startswith('#'): 56 | directive = line.split(':')[0] + ':' 57 | if directive not in valid_directives: 58 | unknown_directives.append(directive) 59 | 60 | # Scoring system for robots.txt quality 61 | score = 0 62 | if has_user_agent: 63 | score += 2 64 | if has_disallow or has_allow: 65 | score += 2 66 | if has_sitemap: 67 | score += 1 68 | if not unknown_directives: 69 | score += 1 70 | 71 | logger.info(f"Robots.txt analysis for {website}: score {score}/6, sitemaps: {has_sitemap}") 72 | 73 | if unknown_directives: 74 | logger.warning(f"Unknown directives found: {unknown_directives}") 75 | 76 | if score >= 4: 77 | logger.info(f"Valid and comprehensive robots.txt found for {website}") 78 | return "🟢" 79 | elif score >= 2: 80 | logger.info(f"Basic robots.txt found for {website}") 81 | return "🟢" 82 | else: 83 | logger.warning(f"Poor quality robots.txt found for {website}") 84 | return "🔴" 85 | 86 | except (Timeout, HTTPError) as e: 87 | logger.warning(f"HTTP/Timeout error while checking robots.txt for {website}: {e}") 88 | return "⚪" 89 | except RequestException as e: 90 | logger.warning(f"Request error while checking robots.txt for {website}: {e}") 91 | return "⚪" 92 | except Exception as e: 93 | logger.error(f"Unexpected error while checking robots.txt for {website}: {e}") 94 | return "⚪" 95 | -------------------------------------------------------------------------------- /checks/check_privacy_protected_whois.py: -------------------------------------------------------------------------------- 1 | import whois 2 | import logging 3 | from whois.parser import PywhoisError 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_privacy_protected_whois(domain: str) -> str: 10 | """ 11 | Check if a domain's WHOIS information indicates that it is privacy-protected with enhanced detection. 12 | 13 | Args: 14 | domain (str): The domain to check. 15 | 16 | Returns: 17 | str: "🟢" if the domain's WHOIS information is privacy-protected, "🔴" otherwise, 18 | "⚪" if an error occurred. 
19 | """ 20 | # Input validation 21 | if not domain or not isinstance(domain, str): 22 | logger.error(f"Invalid domain input: {domain}") 23 | return "⚪" 24 | 25 | domain = domain.strip() 26 | 27 | # Remove protocol if present 28 | if domain.startswith(('http://', 'https://')): 29 | from urllib.parse import urlparse 30 | parsed = urlparse(domain) 31 | domain = parsed.netloc 32 | 33 | try: 34 | # Fetch WHOIS data for the domain 35 | whois_data = whois.whois(domain) 36 | 37 | # Enhanced privacy indicators 38 | privacy_indicators = [ 39 | 'privacy', 'protected', 'redacted', 'whoisguard', 'domains by proxy', 40 | 'anonymous', 'contact privacy', 'whois privacy', 'perfect privacy', 41 | 'data protected', 'private registration', 'domain privacy', 42 | 'namecheap', 'godaddy privacy', 'cloudflare', 'proxy protection', 43 | 'withheld', 'not disclosed', 'see privacy policy' 44 | ] 45 | 46 | # Enhanced fields to check with more comprehensive coverage 47 | fields_to_check = [ 48 | 'registrar', 'tech_email', 'admin_email', 'registrant_email', 49 | 'org', 'name', 'address', 'registrant_name', 'admin_name', 'tech_name', 50 | 'registrant_org', 'admin_org', 'tech_org', 'emails' 51 | ] 52 | 53 | privacy_score = 0 54 | total_checks = 0 55 | 56 | # Check for privacy indicators in relevant WHOIS fields 57 | for field in fields_to_check: 58 | field_value = whois_data.get(field, '') 59 | 60 | if field_value: 61 | total_checks += 1 62 | field_str = str(field_value).lower() 63 | 64 | if any(indicator in field_str for indicator in privacy_indicators): 65 | privacy_score += 1 66 | logger.debug(f"Privacy indicator found in {field}: {field_value}") 67 | 68 | # Additional checks for redacted information 69 | if whois_data: 70 | # Check if critical information is redacted 71 | critical_fields = ['registrant_name', 'admin_email', 'tech_email'] 72 | redacted_count = 0 73 | 74 | for field in critical_fields: 75 | value = whois_data.get(field, '') 76 | if not value or 'redacted' in str(value).lower() or 'withheld' in str(value).lower(): 77 | redacted_count += 1 78 | 79 | if redacted_count >= 2: 80 | privacy_score += 2 81 | 82 | logger.info(f"Privacy analysis for {domain}: score {privacy_score}/{total_checks + 2}") 83 | 84 | # Determine result based on privacy score 85 | if privacy_score > 0: 86 | logger.info(f"Privacy protection detected for {domain}") 87 | return "🟢" 88 | else: 89 | logger.warning(f"No privacy protection detected for {domain}") 90 | return "🔴" 91 | 92 | except PywhoisError as e: 93 | logger.warning(f"WHOIS command failed for {domain}: {e}") 94 | return "⚪" 95 | except Exception as e: 96 | logger.error(f"Unexpected error while checking privacy-protected WHOIS for {domain}: {e}") 97 | return "⚪" 98 | -------------------------------------------------------------------------------- /checks/check_email_domain.py: -------------------------------------------------------------------------------- 1 | import dns.resolver 2 | import logging 3 | from dns.resolver import NXDOMAIN, NoAnswer, NoNameservers, Timeout 4 | import re 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def check_email_domain(email_domain: str) -> str: 9 | """ 10 | Check if an email domain has an SPF (Sender Policy Framework) record. 11 | 12 | Args: 13 | email_domain (str): The domain of the email to be checked. 14 | 15 | Returns: 16 | str: 17 | - "🟢" if a strong SPF record is found. 18 | - "🟡" if a basic SPF record is found. 19 | - "🔴" if no SPF record is found. 20 | - "⚪" for any other errors or issues. 
21 | """ 22 | # Input validation 23 | if not email_domain: 24 | logger.error("Email domain is required") 25 | return "⚪" 26 | 27 | # Normalize domain (remove protocol, www, etc.) 28 | email_domain = email_domain.lower().strip() 29 | email_domain = re.sub(r'^https?://', '', email_domain) 30 | email_domain = re.sub(r'^www\.', '', email_domain) 31 | email_domain = email_domain.split('/')[0] # Remove path if present 32 | 33 | # Validate domain format 34 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]*\.[a-zA-Z]{2,}$', email_domain): 35 | logger.error(f"Invalid domain format: {email_domain}") 36 | return "⚪" 37 | 38 | try: 39 | # Query DNS TXT records for the given email domain 40 | answers = dns.resolver.resolve(email_domain, 'TXT', timeout=10) 41 | 42 | # Enhanced detection patterns 43 | spf_records = [] 44 | for rdata in answers: 45 | txt_record = str(rdata).strip('"') 46 | if txt_record.startswith("v=spf1"): 47 | spf_records.append(txt_record) 48 | 49 | if not spf_records: 50 | logger.warning(f"No SPF record found for {email_domain}") 51 | return "🔴" 52 | 53 | if len(spf_records) > 1: 54 | logger.warning(f"Multiple SPF records found for {email_domain} - this may cause issues") 55 | 56 | # Analyze SPF record quality 57 | spf_record = spf_records[0] 58 | logger.info(f"SPF record found for {email_domain}: {spf_record}") 59 | 60 | # Improved scoring and categorization 61 | strong_indicators = [ 62 | '-all', # Hard fail 63 | 'include:', # Include mechanism 64 | 'mx', # MX mechanism 65 | ] 66 | 67 | weak_indicators = [ 68 | '~all', # Soft fail 69 | '?all', # Neutral 70 | '+all', # Pass all (very permissive) 71 | ] 72 | 73 | strong_score = sum(1 for indicator in strong_indicators if indicator in spf_record) 74 | weak_score = sum(1 for indicator in weak_indicators if indicator in spf_record) 75 | 76 | if strong_score >= 2 and '-all' in spf_record: 77 | logger.info(f"Strong SPF configuration for {email_domain}") 78 | return "🟢" 79 | elif strong_score >= 1 or ('~all' in spf_record): 80 | logger.info(f"Basic SPF configuration for {email_domain}") 81 | return "🟡" 82 | else: 83 | logger.warning(f"Weak SPF configuration for {email_domain}") 84 | return "🟡" 85 | 86 | except NXDOMAIN: 87 | logger.error(f"Domain {email_domain} does not exist") 88 | return "⚪" 89 | except NoAnswer: 90 | logger.warning(f"Domain {email_domain} does not have TXT records") 91 | return "🔴" 92 | except NoNameservers: 93 | logger.error(f"No nameservers found for domain {email_domain}") 94 | return "⚪" 95 | except Timeout: 96 | logger.error(f"DNS query for {email_domain} timed out") 97 | return "⚪" 98 | except Exception as e: 99 | logger.error(f"Unexpected error while checking email domain {email_domain}: {e}") 100 | return "⚪" 101 | -------------------------------------------------------------------------------- /checks/check_mixed_content.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import logging 4 | from requests.exceptions import RequestException, HTTPError 5 | from urllib.parse import urlparse, urljoin 6 | import re 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_mixed_content(website: str) -> str: 11 | """ 12 | Check a given website for mixed content issues by searching for resources loaded over HTTP. 13 | 14 | Args: 15 | website (str): The URL of the website to be checked. 16 | 17 | Returns: 18 | str: 19 | - "🟢" if no mixed content is found. 20 | - "🔴" if mixed content is found. 
21 | - "⚪" for any errors. 22 | """ 23 | # Input validation and URL normalization 24 | if not website: 25 | logger.error("Website URL is required") 26 | return "⚪" 27 | 28 | if not website.startswith(('http://', 'https://')): 29 | website = f"https://{website}" 30 | 31 | try: 32 | parsed_url = urlparse(website) 33 | if not parsed_url.netloc: 34 | logger.error(f"Invalid URL format: {website}") 35 | return "⚪" 36 | except Exception as e: 37 | logger.error(f"URL parsing error for {website}: {e}") 38 | return "⚪" 39 | 40 | headers = { 41 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 42 | } 43 | 44 | try: 45 | # Make a request to the website 46 | response = requests.get(website, headers=headers, timeout=15) 47 | response.raise_for_status() 48 | 49 | # Parse the HTML content using BeautifulSoup 50 | soup = BeautifulSoup(response.content, 'html.parser') 51 | 52 | # Enhanced detection patterns - check multiple attributes and elements 53 | mixed_content_found = [] 54 | 55 | # Check src attributes (img, script, iframe, etc.) 56 | elements_with_src = soup.find_all(attrs={'src': True}) 57 | for element in elements_with_src: 58 | src = element.get('src', '') 59 | if src.startswith('http://'): 60 | mixed_content_found.append(f"{element.name}[src]: {src}") 61 | 62 | # Check href attributes (link, a tags) 63 | elements_with_href = soup.find_all(attrs={'href': True}) 64 | for element in elements_with_href: 65 | href = element.get('href', '') 66 | if href.startswith('http://') and element.name in ['link']: # Focus on resource links 67 | mixed_content_found.append(f"{element.name}[href]: {href}") 68 | 69 | # Check CSS url() patterns in style attributes and tags 70 | style_elements = soup.find_all(['style']) + soup.find_all(attrs={'style': True}) 71 | for element in style_elements: 72 | style_content = element.get('style', '') if element.has_attr('style') else element.get_text() 73 | if style_content: 74 | http_urls = re.findall(r'url\(["\']?(http://[^"\')\s]+)["\']?\)', style_content) 75 | for url in http_urls: 76 | mixed_content_found.append(f"CSS url(): {url}") 77 | 78 | # Check if there is any mixed content 79 | if mixed_content_found: 80 | logger.warning(f"Mixed content found on {website}: {len(mixed_content_found)} instances") 81 | for content in mixed_content_found[:5]: # Log first 5 instances 82 | logger.warning(f" - {content}") 83 | return "🔴" 84 | else: 85 | logger.info(f"No mixed content found on {website}") 86 | return "🟢" 87 | 88 | except HTTPError as e: 89 | logger.error(f"HTTP error {e.response.status_code} while checking mixed content on {website}: {e}") 90 | return "⚪" 91 | except RequestException as e: 92 | logger.error(f"Request error while checking mixed content on {website}: {e}") 93 | return "⚪" 94 | except Exception as e: 95 | logger.error(f"Unexpected error while checking mixed content on {website}: {e}") 96 | return "⚪" 97 | -------------------------------------------------------------------------------- /checks/check_website_load_time.py: -------------------------------------------------------------------------------- 1 | import time 2 | import statistics 3 | import requests 4 | import logging 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_website_load_time(website: str, num_attempts: int = 3) -> str: 12 | """ 13 | Check the load time of the 
given website with multiple measurements for accuracy. 14 | 15 | Args: 16 | website (str): The URL of the website to be checked. 17 | num_attempts (int): Number of attempts to measure load time for accuracy. 18 | 19 | Returns: 20 | str: 21 | - "🟢" if average load time is under 2 seconds 22 | - "🟠" if average load time is between 2 and 4 seconds 23 | - "🔴" if average load time is over 4 seconds 24 | - "⚪" in case of any errors or timeouts 25 | """ 26 | # Input validation and URL normalization 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | if not website.startswith(('http://', 'https://')): 33 | website = f"https://{website}" 34 | 35 | headers = { 36 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 37 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 38 | 'Accept-Language': 'en-US,en;q=0.5', 39 | 'Accept-Encoding': 'gzip, deflate, br', 40 | 'Cache-Control': 'no-cache', 41 | 'Pragma': 'no-cache' 42 | } 43 | 44 | load_times = [] 45 | 46 | try: 47 | # Perform multiple measurements for accuracy 48 | for attempt in range(num_attempts): 49 | start_time = time.perf_counter() 50 | 51 | # Perform the request with enhanced monitoring 52 | response = requests.get( 53 | website, 54 | headers=headers, 55 | timeout=15, 56 | allow_redirects=True, 57 | stream=False 58 | ) 59 | response.raise_for_status() 60 | 61 | # Calculate elapsed time 62 | elapsed_time = time.perf_counter() - start_time 63 | load_times.append(elapsed_time) 64 | 65 | logger.debug(f"Attempt {attempt + 1} for {website}: {elapsed_time:.3f}s") 66 | 67 | # Small delay between attempts to avoid overwhelming the server 68 | if attempt < num_attempts - 1: 69 | time.sleep(0.5) 70 | 71 | # Calculate statistics 72 | avg_time = statistics.mean(load_times) 73 | median_time = statistics.median(load_times) 74 | min_time = min(load_times) 75 | max_time = max(load_times) 76 | 77 | logger.info(f"Load time stats for {website} - Avg: {avg_time:.3f}s, Median: {median_time:.3f}s, Range: {min_time:.3f}s-{max_time:.3f}s") 78 | 79 | # Enhanced categorization based on average time 80 | if avg_time < 1.0: 81 | logger.info(f"Website {website} loaded very fast: {avg_time:.2f}s average") 82 | return "🟢" 83 | elif avg_time < 2.0: 84 | logger.info(f"Website {website} loaded fast: {avg_time:.2f}s average") 85 | return "🟢" 86 | elif avg_time < 4.0: 87 | logger.info(f"Website {website} loaded moderately: {avg_time:.2f}s average") 88 | return "🟠" 89 | else: 90 | logger.warning(f"Website {website} loaded slowly: {avg_time:.2f}s average") 91 | return "🔴" 92 | 93 | except Timeout: 94 | logger.warning(f"Timeout occurred while checking load time for {website}") 95 | return "🔴" # Timeout is effectively a slow load time 96 | except HTTPError as e: 97 | logger.warning(f"HTTP error for {website}: {e}") 98 | return "⚪" 99 | except RequestException as e: 100 | logger.warning(f"Request error for {website}: {e}") 101 | return "⚪" 102 | except Exception as e: 103 | logger.error(f"Unexpected error for {website}: {e}") 104 | return "⚪" 105 | -------------------------------------------------------------------------------- /checks/check_asset_minification.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | from bs4 import 
BeautifulSoup 4 | import re 5 | 6 | def check_asset_minification(website): 7 | """ 8 | Check if the website's CSS/JS assets are minified. 9 | 10 | Args: 11 | website (str): URL of the website to be checked. 12 | 13 | Returns: 14 | str: 15 | - "🟢" if all assets are minified 16 | - "🟠" if some assets are minified and others are not 17 | - "🔴" if none of the assets are minified 18 | - "⚪" if an error occurs or no assets to check 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'AssetMinificationChecker/1.0' 26 | } 27 | 28 | try: 29 | # First, get the website content to extract asset links 30 | response = requests.get(website, headers=headers, timeout=10) 31 | response.raise_for_status() 32 | 33 | soup = BeautifulSoup(response.text, 'lxml') 34 | 35 | # Extract CSS and JS links 36 | css_links = [link.get('href') for link in soup.find_all('link', rel='stylesheet') if link.get('href')] 37 | js_links = [script.get('src') for script in soup.find_all('script', src=True) if script.get('src')] 38 | 39 | # Convert relative URLs to absolute 40 | from urllib.parse import urljoin 41 | website_links = [] 42 | for link in css_links + js_links: 43 | if link.startswith(('http://', 'https://')): 44 | website_links.append(link) 45 | else: 46 | website_links.append(urljoin(website, link)) 47 | 48 | minified_count = 0 49 | total_assets = 0 50 | 51 | for link in website_links: 52 | try: 53 | # Method 1: Check content and minification status 54 | asset_response = requests.get(link, headers=headers, timeout=10) 55 | asset_response.raise_for_status() 56 | 57 | # Check if the content type is either CSS or JavaScript 58 | content_type = asset_response.headers.get('Content-Type', '').lower() 59 | if 'text/css' in content_type or 'javascript' in content_type: 60 | total_assets += 1 61 | content = asset_response.text 62 | 63 | # Check for minification indicators 64 | # Minified files typically have very long lines and no whitespace 65 | lines = content.splitlines() 66 | avg_line_length = sum(len(line) for line in lines) / max(len(lines), 1) 67 | has_comments = '//' in content or '/*' in content 68 | has_excessive_whitespace = re.search(r'\n\s*\n\s*\n', content) 69 | 70 | # Heuristic: likely minified if average line length is high and no comments/whitespace 71 | if avg_line_length > 200 and not has_comments and not has_excessive_whitespace: 72 | minified_count += 1 73 | else: 74 | print(f"Asset at {link} appears not to be minified.") 75 | 76 | except (Timeout, HTTPError, RequestException) as e: 77 | print(f"Error while fetching content from {link}: {e}") 78 | continue 79 | 80 | # Determine the result based on the minification analysis 81 | if total_assets == 0: 82 | print(f"No CSS/JS assets found on {website}.") 83 | return "⚪" 84 | elif minified_count == 0: 85 | print("None of the assets are minified.") 86 | return "🔴" 87 | elif minified_count < total_assets: 88 | print(f"Some assets are minified, others are not. 
Minified: {minified_count}, Total: {total_assets}") 89 | return "🟠" 90 | else: 91 | print("All assets are minified.") 92 | return "🟢" 93 | 94 | except (Timeout, HTTPError, RequestException) as e: 95 | print(f"Request error occurred while checking asset minification for {website}: {e}") 96 | return "⚪" 97 | except Exception as e: 98 | print(f"An unexpected error occurred while checking asset minification for {website}: {e}") 99 | return "⚪" 100 | -------------------------------------------------------------------------------- /checks/check_subresource_integrity.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from typing import Tuple 5 | from requests.exceptions import RequestException 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_subresource_integrity(website: str) -> Tuple[str, int]: 12 | """ 13 | Check if the given website uses Subresource Integrity (SRI) by analyzing external resources. 14 | 15 | Args: 16 | website (str): The URL of the website to be analyzed. 17 | 18 | Returns: 19 | tuple: A status symbol and a count of external resources with SRI. 20 | - "🟢" if most external resources have SRI protection. 21 | - "🟠" if some external resources have SRI protection. 22 | - "🔴" if no or few external resources have SRI protection. 23 | - "⚪" if an error occurs. 24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪", 0 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Fetch website content 40 | response = requests.get(website, headers=headers, timeout=15) 41 | response.raise_for_status() 42 | 43 | # Parse HTML content 44 | soup = BeautifulSoup(response.content, 'lxml') 45 | 46 | # Find all external resources that should have SRI 47 | external_resources = [] 48 | sri_protected_resources = [] 49 | 50 | # Check script tags with external sources 51 | for script in soup.find_all('script', src=True): 52 | src = script.get('src') 53 | if src and (src.startswith(('http://', 'https://')) or src.startswith('//')): 54 | external_resources.append(('script', src)) 55 | if script.get('integrity'): 56 | sri_protected_resources.append(('script', src, script.get('integrity'))) 57 | 58 | # Check link tags (stylesheets, fonts, etc.) 
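        # Note: BeautifulSoup can return 'rel' as either a list or a plain string,
        # so the loop below normalizes it before testing for stylesheet/preload links
        # that should carry an integrity attribute.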
59 | for link in soup.find_all('link', href=True): 60 | href = link.get('href') 61 | rel = link.get('rel', []) 62 | if isinstance(rel, str): 63 | rel = [rel] 64 | 65 | # Focus on stylesheets and preload resources 66 | if href and (href.startswith(('http://', 'https://')) or href.startswith('//')) and \ 67 | any(r in rel for r in ['stylesheet', 'preload']): 68 | external_resources.append(('link', href)) 69 | if link.get('integrity'): 70 | sri_protected_resources.append(('link', href, link.get('integrity'))) 71 | 72 | total_external = len(external_resources) 73 | total_sri_protected = len(sri_protected_resources) 74 | 75 | logger.info(f"SRI analysis for {website}: {total_sri_protected}/{total_external} external resources have SRI") 76 | 77 | if total_sri_protected > 0: 78 | logger.debug(f"SRI-protected resources: {[r[1] for r in sri_protected_resources]}") 79 | 80 | # Determine result based on SRI coverage 81 | if total_external == 0: 82 | logger.info(f"No external resources found for {website}") 83 | return "🟢", 0 84 | 85 | sri_coverage = total_sri_protected / total_external 86 | 87 | if sri_coverage >= 0.8: # 80% or more have SRI 88 | logger.info(f"Excellent SRI coverage ({sri_coverage:.1%}) for {website}") 89 | return "🟢", total_sri_protected 90 | elif sri_coverage >= 0.4: # 40% or more have SRI 91 | logger.warning(f"Moderate SRI coverage ({sri_coverage:.1%}) for {website}") 92 | return "🟠", total_sri_protected 93 | else: # Less than 40% have SRI 94 | logger.warning(f"Poor SRI coverage ({sri_coverage:.1%}) for {website}") 95 | return "🔴", total_sri_protected 96 | 97 | except RequestException as e: 98 | logger.warning(f"Request error for {website}: {e}") 99 | return "⚪", 0 100 | except Exception as e: 101 | logger.error(f"Unexpected error for {website}: {e}") 102 | return "⚪", 0 103 | -------------------------------------------------------------------------------- /checks/check_domain_expiration.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import whois 3 | import logging 4 | import re 5 | from urllib.parse import urlparse 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_domain_expiration(domain: str) -> str: 10 | """ 11 | Check the expiration date of a domain. 12 | 13 | Args: 14 | domain (str): The domain name to be checked. 15 | 16 | Returns: 17 | str: 18 | - "🟢 (X days left)" if the domain has more than 90 days to expire. 19 | - "🟡 (X days left)" if the domain has between 30 to 90 days to expire. 20 | - "🟠 (X days left)" if the domain has between 15 to 30 days to expire. 21 | - "🔴 (X days left)" if the domain has less than 15 days to expire. 22 | - "⚪" for other errors. 
23 | """ 24 | # Input validation and normalization 25 | if not domain: 26 | logger.error("Domain is required") 27 | return "⚪" 28 | 29 | # Normalize domain 30 | domain = domain.lower().strip() 31 | domain = re.sub(r'^https?://', '', domain) 32 | domain = re.sub(r'^www\.', '', domain) 33 | domain = domain.split('/')[0] # Remove path if present 34 | domain = domain.split(':')[0] # Remove port if present 35 | 36 | # Validate domain format 37 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9.-]*[a-zA-Z0-9]$', domain): 38 | logger.error(f"Invalid domain format: {domain}") 39 | return "⚪" 40 | 41 | def get_days_to_expire(exp_date): 42 | """Calculate the days remaining for expiration.""" 43 | if not exp_date: 44 | return None 45 | 46 | # Handle list of dates (some registrars return multiple dates) 47 | if isinstance(exp_date, list): 48 | # Use the earliest expiration date 49 | exp_date = min(exp_date) 50 | 51 | if isinstance(exp_date, str): 52 | try: 53 | # Try to parse string dates 54 | exp_date = datetime.strptime(exp_date, '%Y-%m-%d %H:%M:%S') 55 | except ValueError: 56 | try: 57 | exp_date = datetime.strptime(exp_date, '%Y-%m-%d') 58 | except ValueError: 59 | return None 60 | 61 | return (exp_date - datetime.now()).days 62 | 63 | try: 64 | # Fetch WHOIS data for the domain with timeout 65 | logger.info(f"Fetching WHOIS data for {domain}") 66 | w = whois.whois(domain) 67 | 68 | if not w: 69 | logger.error(f"No WHOIS data returned for {domain}") 70 | return "⚪" 71 | 72 | # Enhanced detection patterns 73 | expiration_date = w.expiration_date 74 | creation_date = w.creation_date 75 | 76 | days_to_expire = get_days_to_expire(expiration_date) 77 | 78 | if days_to_expire is None: 79 | logger.error(f"Could not retrieve or parse expiration date for {domain}") 80 | return "⚪" 81 | 82 | # Log additional domain information 83 | if creation_date: 84 | creation_days = get_days_to_expire(creation_date) 85 | if creation_days: 86 | domain_age = abs(creation_days) 87 | logger.info(f"Domain {domain} is {domain_age} days old") 88 | 89 | # Improved scoring and categorization 90 | if days_to_expire < 0: 91 | logger.critical(f"Domain {domain} has already expired {abs(days_to_expire)} days ago!") 92 | return f"🔴 (expired {abs(days_to_expire)} days ago)" 93 | elif days_to_expire < 15: 94 | logger.critical(f"Domain {domain} expires in {days_to_expire} days - URGENT!") 95 | return f"🔴 ({days_to_expire} days left)" 96 | elif days_to_expire < 30: 97 | logger.warning(f"Domain {domain} expires in {days_to_expire} days - action needed soon") 98 | return f"🟠 ({days_to_expire} days left)" 99 | elif days_to_expire < 90: 100 | logger.info(f"Domain {domain} expires in {days_to_expire} days - consider renewal") 101 | return f"🟡 ({days_to_expire} days left)" 102 | else: 103 | logger.info(f"Domain {domain} expires in {days_to_expire} days - safe") 104 | return f"🟢 ({days_to_expire} days left)" 105 | 106 | except whois.parser.PywhoisError as e: 107 | logger.error(f"WHOIS parsing error for {domain}: {e}") 108 | return "⚪" 109 | except Exception as e: 110 | logger.error(f"Unexpected error while checking domain expiration for {domain}: {e}") 111 | return "⚪" 112 | -------------------------------------------------------------------------------- /checks/check_redirect_chains.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urljoin 5 | 6 | # Configure logging 7 | 
logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_redirect_chains(website: str) -> str: 11 | """ 12 | Check the number of redirects that a website triggers with enhanced security analysis. 13 | 14 | Args: 15 | website (str): The URL of the website to check. 16 | 17 | Returns: 18 | str: 19 | - "🟢" if no redirects or optimal redirect pattern. 20 | - "🟠" if there's one redirect or acceptable chain. 21 | - "🔴" if multiple redirects or security issues. 22 | - "⚪" in case of an error. 23 | """ 24 | # Input validation and URL normalization 25 | if not website or not isinstance(website, str): 26 | logger.error(f"Invalid website input: {website}") 27 | return "⚪" 28 | 29 | website = website.strip() 30 | if not website.startswith(('http://', 'https://')): 31 | website = f"https://{website}" 32 | 33 | headers = { 34 | "User-Agent": "RedirectChainChecker/2.0" 35 | } 36 | 37 | try: 38 | redirect_count = 0 39 | redirect_chain = [] 40 | current_url = website 41 | visited_urls = set() 42 | max_redirects = 10 # Prevent infinite loops 43 | 44 | while redirect_count < max_redirects: 45 | # Prevent redirect loops 46 | if current_url in visited_urls: 47 | logger.warning(f"Redirect loop detected for {website}") 48 | return "🔴" 49 | 50 | visited_urls.add(current_url) 51 | response = requests.get(current_url, headers=headers, allow_redirects=False, timeout=15) 52 | 53 | # Check if there's a redirect 54 | if response.status_code in [301, 302, 303, 307, 308]: 55 | redirect_location = response.headers.get('location', '') 56 | if not redirect_location: 57 | logger.warning(f"Empty redirect location for {current_url}") 58 | break 59 | 60 | redirect_count += 1 61 | redirect_chain.append({ 62 | 'from': current_url, 63 | 'to': redirect_location, 64 | 'status': response.status_code 65 | }) 66 | 67 | # Handle relative URLs 68 | if not redirect_location.startswith(('http://', 'https://')): 69 | redirect_location = urljoin(current_url, redirect_location) 70 | 71 | current_url = redirect_location 72 | logger.debug(f"Redirect {redirect_count}: {response.status_code} -> {redirect_location}") 73 | else: 74 | # No more redirects 75 | break 76 | 77 | logger.info(f"Redirect analysis for {website}: {redirect_count} redirects found") 78 | 79 | if redirect_chain: 80 | logger.debug(f"Redirect chain: {redirect_chain}") 81 | 82 | # Enhanced evaluation 83 | if redirect_count == 0: 84 | logger.info(f"No redirects found for {website}") 85 | return "🟢" 86 | elif redirect_count == 1: 87 | # Check if it's a good redirect (HTTP to HTTPS) 88 | if (redirect_chain[0]['from'].startswith('http://') and 89 | redirect_chain[0]['to'].startswith('https://') and 90 | redirect_chain[0]['status'] in [301, 308]): 91 | logger.info(f"Single secure redirect found for {website}") 92 | return "🟢" 93 | else: 94 | logger.info(f"Single redirect found for {website}") 95 | return "🟠" 96 | elif redirect_count <= 3: 97 | logger.warning(f"Multiple redirects ({redirect_count}) detected for {website}") 98 | return "🟠" 99 | else: 100 | logger.warning(f"Excessive redirects ({redirect_count}) detected for {website}") 101 | return "🔴" 102 | 103 | except (Timeout, HTTPError) as e: 104 | logger.warning(f"HTTP/Timeout error while checking redirect chains for {website}: {e}") 105 | return "⚪" 106 | except RequestException as e: 107 | logger.warning(f"Request error while checking redirect chains for {website}: {e}") 108 | return "⚪" 109 | except Exception as e: 110 | logger.error(f"Unexpected error while checking redirect chains for 
{website}: {e}") 111 | return "⚪" 112 | -------------------------------------------------------------------------------- /scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Website Monitor Scheduler 4 | Runs the monitoring checks at regular intervals in Docker environment. 5 | """ 6 | 7 | import time 8 | import subprocess 9 | import os 10 | import logging 11 | import signal 12 | import sys 13 | from datetime import datetime 14 | 15 | # Configure logging 16 | logging.basicConfig( 17 | level=logging.INFO, 18 | format='%(asctime)s - %(levelname)s - %(message)s', 19 | handlers=[ 20 | logging.StreamHandler(sys.stdout), 21 | logging.FileHandler('/app/logs/scheduler.log') 22 | ] 23 | ) 24 | logger = logging.getLogger(__name__) 25 | 26 | class MonitorScheduler: 27 | def __init__(self): 28 | self.interval = int(os.getenv('MONITOR_INTERVAL', 3600)) # Default: 1 hour 29 | self.running = True 30 | 31 | # Set up signal handlers for graceful shutdown 32 | signal.signal(signal.SIGTERM, self.handle_signal) 33 | signal.signal(signal.SIGINT, self.handle_signal) 34 | 35 | def handle_signal(self, signum, frame): 36 | """Handle shutdown signals gracefully.""" 37 | logger.info(f"Received signal {signum}, shutting down gracefully...") 38 | self.running = False 39 | 40 | def run_monitoring(self): 41 | """Execute the monitoring script.""" 42 | try: 43 | logger.info('Starting website monitoring check...') 44 | start_time = datetime.now() 45 | 46 | # Run the main monitoring script 47 | result = subprocess.run( 48 | ['python', 'main.py'], 49 | capture_output=True, 50 | text=True, 51 | timeout=1800 # 30 minute timeout 52 | ) 53 | 54 | end_time = datetime.now() 55 | execution_time = (end_time - start_time).total_seconds() 56 | 57 | if result.returncode == 0: 58 | logger.info(f'Monitoring completed successfully in {execution_time:.2f} seconds') 59 | if result.stdout: 60 | logger.debug(f'Output: {result.stdout}') 61 | else: 62 | logger.error(f'Monitoring failed with exit code {result.returncode}') 63 | if result.stderr: 64 | logger.error(f'Error output: {result.stderr}') 65 | if result.stdout: 66 | logger.info(f'Standard output: {result.stdout}') 67 | 68 | except subprocess.TimeoutExpired: 69 | logger.error('Monitoring timed out after 30 minutes') 70 | except Exception as e: 71 | logger.error(f'Error running monitoring: {e}') 72 | 73 | def start(self): 74 | """Start the scheduler main loop.""" 75 | logger.info(f'🚀 Starting Website Monitor Scheduler') 76 | logger.info(f'📅 Monitoring interval: {self.interval} seconds ({self.interval/3600:.1f} hours)') 77 | logger.info(f'📁 Working directory: {os.getcwd()}') 78 | logger.info(f'🐍 Python version: {sys.version}') 79 | 80 | # Run initial monitoring check 81 | logger.info('Running initial monitoring check...') 82 | self.run_monitoring() 83 | 84 | # Main scheduling loop 85 | while self.running: 86 | try: 87 | logger.info(f'⏰ Waiting {self.interval} seconds until next monitoring run...') 88 | 89 | # Sleep in small intervals to allow for graceful shutdown 90 | sleep_remaining = self.interval 91 | while sleep_remaining > 0 and self.running: 92 | sleep_time = min(60, sleep_remaining) # Sleep max 60 seconds at a time 93 | time.sleep(sleep_time) 94 | sleep_remaining -= sleep_time 95 | 96 | if self.running: 97 | self.run_monitoring() 98 | 99 | except KeyboardInterrupt: 100 | logger.info('Scheduler interrupted by user') 101 | break 102 | except Exception as e: 103 | logger.error(f'Unexpected error in 
scheduler: {e}') 104 | time.sleep(60) # Wait a minute before trying again 105 | 106 | logger.info('📊 Website Monitor Scheduler stopped') 107 | 108 | def main(): 109 | """Main entry point for the scheduler.""" 110 | # Ensure logs directory exists 111 | os.makedirs('/app/logs', exist_ok=True) 112 | 113 | try: 114 | scheduler = MonitorScheduler() 115 | scheduler.start() 116 | except Exception as e: 117 | logger.error(f'Failed to start scheduler: {e}') 118 | sys.exit(1) 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /checks/check_ssl_cipher_strength.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import ssl 3 | import logging 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | # Updated cipher classifications based on current security standards 10 | STRONG_CIPHERS = { 11 | 'ECDHE-RSA-AES128-GCM-SHA256', 'ECDHE-RSA-AES256-GCM-SHA384', 12 | 'ECDHE-ECDSA-AES128-GCM-SHA256', 'ECDHE-ECDSA-AES256-GCM-SHA384', 13 | 'TLS_AES_128_GCM_SHA256', 'TLS_AES_256_GCM_SHA384', 14 | 'TLS_CHACHA20_POLY1305_SHA256', 'ECDHE-RSA-CHACHA20-POLY1305', 15 | 'ECDHE-ECDSA-CHACHA20-POLY1305' 16 | } 17 | 18 | MODERATE_CIPHERS = { 19 | 'ECDHE-RSA-AES128-SHA', 'ECDHE-RSA-AES256-SHA', 20 | 'ECDHE-ECDSA-AES128-SHA', 'ECDHE-ECDSA-AES256-SHA', 21 | 'ECDHE-RSA-AES128-SHA256', 'ECDHE-RSA-AES256-SHA384' 22 | } 23 | 24 | WEAK_CIPHERS = { 25 | 'RC4', 'DES', '3DES', 'MD5', 'SHA1', 'NULL' 26 | } 27 | 28 | def check_ssl_cipher_strength(website: str) -> str: 29 | """ 30 | Check the strength of the SSL/TLS cipher suite of the website with enhanced analysis. 31 | 32 | Args: 33 | website (str): URL of the website to be checked. 
34 | 35 | Returns: 36 | str: 37 | - "🟢" if the cipher strength is strong 38 | - "🟠" if the cipher strength is moderate 39 | - "🔴" if the cipher strength is weak 40 | - "⚪" for any errors 41 | """ 42 | # Input validation and hostname extraction 43 | if not website or not isinstance(website, str): 44 | logger.error(f"Invalid website input: {website}") 45 | return "⚪" 46 | 47 | website = website.strip() 48 | 49 | # Extract hostname from URL 50 | if website.startswith(('http://', 'https://')): 51 | hostname = website.split('//')[1].split('/')[0].split(':')[0] 52 | else: 53 | hostname = website.split('/')[0].split(':')[0] 54 | 55 | try: 56 | # Create enhanced SSL context 57 | context = ssl.create_default_context() 58 | context.check_hostname = True 59 | context.verify_mode = ssl.CERT_REQUIRED 60 | 61 | # Create connection with timeout 62 | with socket.create_connection((hostname, 443), timeout=15) as sock: 63 | with context.wrap_socket(sock, server_hostname=hostname) as ssock: 64 | # Get comprehensive SSL information 65 | cipher_info = ssock.cipher() 66 | protocol_version = ssock.version() 67 | cert = ssock.getpeercert() 68 | 69 | if not cipher_info: 70 | logger.warning(f"No cipher information available for {hostname}") 71 | return "⚪" 72 | 73 | cipher_name = cipher_info[0] 74 | cipher_protocol = cipher_info[1] 75 | cipher_bits = cipher_info[2] 76 | 77 | logger.info(f"SSL analysis for {hostname}: {cipher_name}, {protocol_version}, {cipher_bits} bits") 78 | 79 | # Enhanced cipher strength analysis 80 | cipher_upper = cipher_name.upper() 81 | 82 | # Check for weak indicators first 83 | if any(weak in cipher_upper for weak in WEAK_CIPHERS): 84 | logger.warning(f"Weak cipher components detected: {cipher_name}") 85 | return "🔴" 86 | 87 | # Check protocol version 88 | if protocol_version in ['TLSv1.3']: 89 | logger.info(f"Excellent protocol version: {protocol_version}") 90 | return "🟢" 91 | elif protocol_version in ['TLSv1.2']: 92 | # For TLS 1.2, check specific cipher 93 | if cipher_name in STRONG_CIPHERS: 94 | logger.info(f"Strong cipher with TLS 1.2: {cipher_name}") 95 | return "🟢" 96 | elif cipher_name in MODERATE_CIPHERS: 97 | logger.info(f"Moderate cipher with TLS 1.2: {cipher_name}") 98 | return "🟠" 99 | else: 100 | logger.warning(f"Unknown/weak cipher with TLS 1.2: {cipher_name}") 101 | return "🔴" 102 | elif protocol_version in ['TLSv1.1', 'TLSv1']: 103 | logger.warning(f"Outdated protocol version: {protocol_version}") 104 | return "🔴" 105 | else: 106 | logger.warning(f"Unknown protocol version: {protocol_version}") 107 | return "🔴" 108 | 109 | except socket.timeout: 110 | logger.warning(f"Connection timeout for {hostname}") 111 | return "⚪" 112 | except ssl.SSLError as ssl_err: 113 | logger.warning(f"SSL error for {hostname}: {ssl_err}") 114 | return "⚪" 115 | except socket.error as sock_err: 116 | logger.warning(f"Socket error for {hostname}: {sock_err}") 117 | return "⚪" 118 | except Exception as e: 119 | logger.error(f"Unexpected error for {hostname}: {e}") 120 | return "⚪" 121 | -------------------------------------------------------------------------------- /checks/check_url_canonicalization.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urlparse, urljoin, urlunparse 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 
10 | 11 | def check_url_canonicalization(website: str) -> str: 12 | """ 13 | Check if the given website uses a canonical link element to avoid potential duplicate content issues. 14 | 15 | Args: 16 | website (str): The URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if a correct canonical link element is found. 21 | - "🟠" if canonical link exists but has minor issues. 22 | - "🔴" if no canonical link or major issues found. 23 | - "⚪" on errors. 24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪" 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Make request with proper error handling 40 | response = requests.get(website, headers=headers, timeout=15) 41 | response.raise_for_status() 42 | 43 | # Parse HTML content 44 | soup = BeautifulSoup(response.text, 'html.parser') 45 | canonical_tags = soup.find_all('link', {'rel': 'canonical'}) 46 | 47 | if not canonical_tags: 48 | logger.warning(f"No canonical link found for {website}") 49 | return "🔴" 50 | 51 | if len(canonical_tags) > 1: 52 | logger.warning(f"Multiple canonical links found for {website}") 53 | return "🟠" # Multiple canonicals can be problematic 54 | 55 | canonical_tag = canonical_tags[0] 56 | canonical_href = canonical_tag.get('href') 57 | 58 | if not canonical_href: 59 | logger.warning(f"Empty canonical href for {website}") 60 | return "🔴" 61 | 62 | # Normalize URLs for comparison 63 | def normalize_url(url): 64 | parsed = urlparse(url) 65 | # Remove fragment, normalize path 66 | normalized = urlunparse(( 67 | parsed.scheme.lower(), 68 | parsed.netloc.lower(), 69 | parsed.path.rstrip('/') or '/', 70 | parsed.params, 71 | parsed.query, 72 | '' # Remove fragment 73 | )) 74 | return normalized 75 | 76 | # Handle relative canonical URLs 77 | if canonical_href.startswith(('http://', 'https://')): 78 | canonical_url = canonical_href 79 | else: 80 | canonical_url = urljoin(website, canonical_href) 81 | 82 | normalized_website = normalize_url(website) 83 | normalized_canonical = normalize_url(canonical_url) 84 | 85 | logger.info(f"Canonical analysis for {website}: canonical={canonical_url}") 86 | 87 | # Enhanced validation 88 | if normalized_canonical == normalized_website: 89 | logger.info(f"Perfect canonical match for {website}") 90 | return "🟢" 91 | 92 | # Check if canonical points to a valid variation (e.g., with/without www) 93 | website_parsed = urlparse(normalized_website) 94 | canonical_parsed = urlparse(normalized_canonical) 95 | 96 | if (website_parsed.netloc.replace('www.', '') == canonical_parsed.netloc.replace('www.', '') and 97 | website_parsed.path == canonical_parsed.path): 98 | logger.info(f"Canonical points to valid domain variation for {website}") 99 | return "🟢" 100 | 101 | # Check if it's the same domain but different path (might be intentional) 102 | if website_parsed.netloc == canonical_parsed.netloc: 103 | logger.warning(f"Canonical points to different path on same domain for {website}") 104 | return "🟠" 105 | 106 | logger.warning(f"Canonical points to different domain for {website}") 107 | return "🔴" 108 | 109 | except (Timeout, HTTPError) as e: 110 | logger.warning(f"HTTP/Timeout error for {website}: 
{e}") 111 | return "⚪" 112 | except RequestException as e: 113 | logger.warning(f"Request error for {website}: {e}") 114 | return "⚪" 115 | except Exception as e: 116 | logger.error(f"Unexpected error for {website}: {e}") 117 | return "⚪" 118 | -------------------------------------------------------------------------------- /checks/check_dnssec.py: -------------------------------------------------------------------------------- 1 | import dns.resolver 2 | import dns.dnssec 3 | import dns.query 4 | import dns.name 5 | import dns.rdatatype 6 | import logging 7 | import re 8 | from urllib.parse import urlparse 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def check_dnssec(domain: str) -> str: 13 | """ 14 | Check if a domain supports DNSSEC (Domain Name System Security Extensions). 15 | 16 | Args: 17 | domain (str): The domain name to be checked. 18 | 19 | Returns: 20 | str: 21 | - "🟢" if the domain supports DNSSEC properly. 22 | - "🟡" if DNSSEC is partially configured. 23 | - "🔴" if the domain does not support DNSSEC or there's a DNSSEC-related error. 24 | - "⚪" for other errors. 25 | """ 26 | # Input validation and normalization 27 | if not domain: 28 | logger.error("Domain is required") 29 | return "⚪" 30 | 31 | # Normalize domain 32 | domain = domain.lower().strip() 33 | domain = re.sub(r'^https?://', '', domain) 34 | domain = re.sub(r'^www\.', '', domain) 35 | domain = domain.split('/')[0] # Remove path if present 36 | domain = domain.split(':')[0] # Remove port if present 37 | 38 | # Validate domain format 39 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9.-]*[a-zA-Z0-9]$', domain): 40 | logger.error(f"Invalid domain format: {domain}") 41 | return "⚪" 42 | 43 | try: 44 | # Convert domain to DNS name object 45 | domain_name = dns.name.from_text(domain) 46 | 47 | # Enhanced detection patterns 48 | dnssec_indicators = [] 49 | 50 | # Check for DNSKEY records 51 | try: 52 | dnskey_query = dns.resolver.resolve(domain_name, 'DNSKEY', tcp=True) 53 | if dnskey_query: 54 | dnssec_indicators.append("DNSKEY records found") 55 | logger.info(f"Found {len(dnskey_query)} DNSKEY records for {domain}") 56 | except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN): 57 | logger.warning(f"No DNSKEY records found for {domain}") 58 | 59 | # Check for DS records in parent zone 60 | try: 61 | ds_query = dns.resolver.resolve(domain_name, 'DS', tcp=True) 62 | if ds_query: 63 | dnssec_indicators.append("DS records found") 64 | logger.info(f"Found {len(ds_query)} DS records for {domain}") 65 | except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN): 66 | logger.warning(f"No DS records found for {domain}") 67 | 68 | # Check for RRSIG records (signature records) 69 | try: 70 | rrsig_query = dns.resolver.resolve(domain_name, 'RRSIG', tcp=True) 71 | if rrsig_query: 72 | dnssec_indicators.append("RRSIG records found") 73 | logger.info(f"Found RRSIG records for {domain}") 74 | except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN): 75 | logger.warning(f"No RRSIG records found for {domain}") 76 | 77 | # Fallback mechanism - check for any DNSSEC records 78 | if not dnssec_indicators: 79 | # Try checking A record with DNSSEC validation 80 | try: 81 | resolver = dns.resolver.Resolver() 82 | resolver.use_edns(0, dns.flags.DO, 4096) # Enable DNSSEC 83 | a_query = resolver.resolve(domain_name, 'A') 84 | # If we get here, DNS works but DNSSEC might not be configured 85 | logger.info(f"DNS resolution works for {domain} but no DNSSEC indicators found") 86 | except Exception: 87 | pass 88 | 89 | # Improved scoring and categorization 
90 | if len(dnssec_indicators) >= 2: 91 | logger.info(f"Strong DNSSEC configuration for {domain}: {', '.join(dnssec_indicators)}") 92 | return "🟢" 93 | elif len(dnssec_indicators) == 1: 94 | logger.warning(f"Partial DNSSEC configuration for {domain}: {dnssec_indicators[0]}") 95 | return "🟡" 96 | else: 97 | logger.warning(f"No DNSSEC configuration found for {domain}") 98 | return "🔴" 99 | 100 | except dns.resolver.NoAnswer: 101 | logger.warning(f"No DNS answer received for {domain} - domain might not exist or have DNS issues") 102 | return "⚪" 103 | except dns.resolver.NoNameservers: 104 | logger.error(f"No name servers available for {domain}") 105 | return "⚪" 106 | except dns.resolver.NXDOMAIN: 107 | logger.error(f"Domain {domain} does not exist") 108 | return "⚪" 109 | except dns.resolver.Timeout: 110 | logger.error(f"DNS request timeout while checking DNSSEC for {domain}") 111 | return "⚪" 112 | except dns.dnssec.ValidationFailure as e: 113 | logger.error(f"DNSSEC validation failure for {domain}: {e}") 114 | return "🔴" 115 | except Exception as e: 116 | logger.error(f"Unexpected error while checking DNSSEC for {domain}: {e}") 117 | return "⚪" 118 | -------------------------------------------------------------------------------- /checks/check_favicon.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from requests.exceptions import RequestException, HTTPError 5 | from urllib.parse import urlparse, urljoin 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_favicon(website: str) -> str: 10 | """ 11 | Check if the website has a valid favicon. 12 | 13 | Args: 14 | website (str): URL of the website to be checked. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if a valid favicon is found. 19 | - "🔴" if no valid favicon is found. 20 | - "⚪" if an error occurred during the check. 21 | """ 22 | # Input validation and URL normalization 23 | if not website: 24 | logger.error("Website URL is required") 25 | return "⚪" 26 | 27 | if not website.startswith(('http://', 'https://')): 28 | website = f"https://{website}" 29 | 30 | try: 31 | parsed_url = urlparse(website) 32 | if not parsed_url.netloc: 33 | logger.error(f"Invalid URL format: {website}") 34 | return "⚪" 35 | base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" 36 | except Exception as e: 37 | logger.error(f"URL parsing error for {website}: {e}") 38 | return "⚪" 39 | 40 | headers = { 41 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' 42 | } 43 | 44 | def check_favicon_url(url): 45 | """Helper function to check if a favicon URL is valid""" 46 | try: 47 | response = requests.head(url, headers=headers, timeout=10, allow_redirects=True) 48 | return response.status_code == 200 49 | except: 50 | try: 51 | response = requests.get(url, headers=headers, timeout=10, stream=True) 52 | return response.status_code == 200 and len(response.content) > 0 53 | except: 54 | return False 55 | 56 | try: 57 | # Enhanced detection patterns - multiple fallback mechanisms 58 | favicon_candidates = [] 59 | 60 | # 1. Check default favicon.ico location 61 | default_favicon = f"{base_url}/favicon.ico" 62 | if check_favicon_url(default_favicon): 63 | logger.info(f"Favicon found at default location: {default_favicon}") 64 | return "🟢" 65 | favicon_candidates.append(default_favicon) 66 | 67 | # 2. 
Parse HTML for favicon references 68 | try: 69 | response = requests.get(website, headers=headers, timeout=15) 70 | response.raise_for_status() 71 | soup = BeautifulSoup(response.text, 'html.parser') 72 | 73 | # Look for various favicon link types 74 | favicon_rels = ['icon', 'shortcut icon', 'apple-touch-icon', 'apple-touch-icon-precomposed'] 75 | 76 | for rel in favicon_rels: 77 | icons = soup.find_all('link', rel=lambda x: x and rel in x.lower() if x else False) 78 | for icon in icons: 79 | href = icon.get('href') 80 | if not href: 81 | continue 82 | 83 | # Normalize URL 84 | if href.startswith('//'): 85 | favicon_url = f"{parsed_url.scheme}:{href}" 86 | elif href.startswith('/'): 87 | favicon_url = f"{base_url}{href}" 88 | elif not href.startswith(('http://', 'https://')): 89 | favicon_url = urljoin(website, href) 90 | else: 91 | favicon_url = href 92 | 93 | favicon_candidates.append(favicon_url) 94 | 95 | if check_favicon_url(favicon_url): 96 | logger.info(f"Favicon found via HTML link tag: {favicon_url}") 97 | return "🟢" 98 | 99 | except Exception as e: 100 | logger.warning(f"Error parsing HTML for favicon on {website}: {e}") 101 | 102 | # 3. Try common alternative locations 103 | common_paths = ['/apple-touch-icon.png', '/icon.png', '/favicon.png'] 104 | for path in common_paths: 105 | favicon_url = f"{base_url}{path}" 106 | favicon_candidates.append(favicon_url) 107 | if check_favicon_url(favicon_url): 108 | logger.info(f"Favicon found at common location: {favicon_url}") 109 | return "🟢" 110 | 111 | logger.warning(f"No valid favicon found for {website}. Checked {len(set(favicon_candidates))} locations") 112 | return "🔴" 113 | 114 | except (HTTPError, RequestException) as e: 115 | logger.error(f"Request error while checking favicon for {website}: {e}") 116 | return "⚪" 117 | except Exception as e: 118 | logger.error(f"Unexpected error while checking favicon for {website}: {e}") 119 | return "⚪" 120 | -------------------------------------------------------------------------------- /checks/check_domainsblacklists_blacklist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urlparse 5 | import re 6 | import hashlib 7 | import time 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | # Simple cache to avoid repeated downloads 12 | _blacklist_cache = { 13 | 'data': None, 14 | 'timestamp': 0, 15 | 'ttl': 3600 # Cache for 1 hour 16 | } 17 | 18 | def check_domainsblacklists_blacklist(domain: str) -> str: 19 | """ 20 | Check if a domain is present in a large blacklist file hosted online. 21 | 22 | Args: 23 | domain (str): The domain to check against the blacklist. 
24 | 25 | Returns: 26 | str: 27 | - "🔴" if the domain is found in the blacklist 28 | - "🟢" if the domain is not found in the blacklist 29 | - "⚪" if an error occurs 30 | """ 31 | # Input validation and normalization 32 | if not domain: 33 | logger.error("Domain is required") 34 | return "⚪" 35 | 36 | # Normalize domain 37 | domain = domain.lower().strip() 38 | domain = re.sub(r'^https?://', '', domain) 39 | domain = re.sub(r'^www\.', '', domain) 40 | domain = domain.split('/')[0] # Remove path if present 41 | domain = domain.split(':')[0] # Remove port if present 42 | 43 | # Validate domain format 44 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9.-]*[a-zA-Z0-9]$', domain): 45 | logger.error(f"Invalid domain format: {domain}") 46 | return "⚪" 47 | 48 | url = "https://github.com/fabriziosalmi/blacklists/releases/download/latest/blacklist.txt" 49 | 50 | headers = { 51 | 'User-Agent': 'DomainBlacklistChecker/2.0', 52 | 'Accept-Encoding': 'gzip, deflate' 53 | } 54 | 55 | try: 56 | # Check cache first for performance optimization 57 | current_time = time.time() 58 | if _blacklist_cache['data'] and (current_time - _blacklist_cache['timestamp'] < _blacklist_cache['ttl']): 59 | logger.debug("Using cached blacklist data") 60 | blacklist_set = _blacklist_cache['data'] 61 | else: 62 | logger.info("Downloading fresh blacklist data") 63 | # Stream the response to handle large files efficiently 64 | response = requests.get(url, headers=headers, stream=True, timeout=60) 65 | response.raise_for_status() 66 | 67 | # Build a set for O(1) lookup performance 68 | blacklist_set = set() 69 | line_count = 0 70 | 71 | for line in response.iter_lines(decode_unicode=True): 72 | # Ensure line is a string and handle properly 73 | if line and isinstance(line, str): 74 | if not line.startswith('#'): # Skip comments 75 | cleaned_line = line.strip().lower() 76 | if cleaned_line: 77 | blacklist_set.add(cleaned_line) 78 | line_count += 1 79 | elif line and isinstance(line, bytes): 80 | # Handle bytes if somehow we get them 81 | line_str = line.decode('utf-8', errors='ignore') 82 | if not line_str.startswith('#'): 83 | cleaned_line = line_str.strip().lower() 84 | if cleaned_line: 85 | blacklist_set.add(cleaned_line) 86 | line_count += 1 87 | 88 | # Update cache 89 | _blacklist_cache['data'] = blacklist_set 90 | _blacklist_cache['timestamp'] = current_time 91 | 92 | logger.info(f"Loaded {line_count} domains into blacklist") 93 | 94 | # Enhanced detection patterns - check domain and subdomains 95 | domains_to_check = [domain] 96 | 97 | # Add parent domains for subdomain checking 98 | parts = domain.split('.') 99 | for i in range(1, len(parts)): 100 | parent_domain = '.'.join(parts[i:]) 101 | if len(parent_domain) > 3: # Avoid checking TLDs 102 | domains_to_check.append(parent_domain) 103 | 104 | # Check all domain variants 105 | for check_domain in domains_to_check: 106 | if check_domain in blacklist_set: 107 | logger.warning(f"Domain {check_domain} found in blacklist (original: {domain})") 108 | return "🔴" 109 | 110 | logger.info(f"Domain {domain} not found in blacklist") 111 | return "🟢" 112 | 113 | except (Timeout, HTTPError) as e: 114 | logger.error(f"HTTP error while checking domain {domain} against blacklist: {e}") 115 | return "⚪" 116 | except RequestException as e: 117 | logger.error(f"Request error while checking domain {domain} against blacklist: {e}") 118 | return "⚪" 119 | except Exception as e: 120 | logger.error(f"Unexpected error while checking domain {domain} against blacklist: {e}") 121 | return "⚪" 122 | 
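A minimal usage sketch for the check functions dumped above (an illustration, not a file in the repository): it assumes the scripts are run from the repository root so that the checks directory is importable as a package, and it simply prints the emoji status each check returns for a single domain taken from config.yaml.

# Hypothetical driver snippet; the package-style imports below assume the
# checks/ directory is on the import path (e.g., running from the repo root).
from checks.check_redirect_chains import check_redirect_chains
from checks.check_dnssec import check_dnssec
from checks.check_domainsblacklists_blacklist import check_domainsblacklists_blacklist

domain = "example.com"  # one of the sites listed in config.yaml

# Each check returns a status emoji: "🟢" (pass), "🟠"/"🟡" (warning),
# "🔴" (fail), or "⚪" (error), as documented in the docstrings above.
print("Redirect chains:", check_redirect_chains(domain))
print("DNSSEC:", check_dnssec(domain))
print("Blacklist:", check_domainsblacklists_blacklist(domain))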
-------------------------------------------------------------------------------- /checks/check_security_headers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_security_headers(website: str) -> str: 10 | """ 11 | Check for the presence and correct implementation of recommended security headers on a website. 12 | 13 | Args: 14 | website (str): URL of the website to be checked. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if all recommended headers are properly implemented. 19 | - "🟠" if headers are present but not all are ideally implemented. 20 | - "🔴" if some recommended headers are missing. 21 | - "⚪" for any errors. 22 | """ 23 | # Input validation and URL normalization 24 | if not website or not isinstance(website, str): 25 | logger.error(f"Invalid website input: {website}") 26 | return "⚪" 27 | 28 | website = website.strip() 29 | if not website.startswith(('http://', 'https://')): 30 | website = f"https://{website}" 31 | 32 | headers = { 33 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 34 | } 35 | 36 | # Enhanced recommended security headers with scoring 37 | security_headers = { 38 | 'X-Content-Type-Options': {'expected': 'nosniff', 'weight': 2}, 39 | 'X-XSS-Protection': {'expected': '1; mode=block', 'weight': 2}, 40 | 'Strict-Transport-Security': {'expected': None, 'weight': 3}, 41 | 'Content-Security-Policy': {'expected': None, 'weight': 3}, 42 | 'Referrer-Policy': {'expected': None, 'weight': 1}, 43 | 'Permissions-Policy': {'expected': None, 'weight': 1}, 44 | 'X-Frame-Options': {'expected': ['DENY', 'SAMEORIGIN'], 'weight': 2} 45 | } 46 | 47 | try: 48 | # Make request with proper error handling 49 | response = requests.get(website, headers=headers, timeout=15) 50 | response.raise_for_status() 51 | 52 | # Analyze security headers 53 | total_score = 0 54 | max_score = sum(header_info['weight'] for header_info in security_headers.values()) 55 | issues = [] 56 | 57 | for header, config in security_headers.items(): 58 | header_value = response.headers.get(header) 59 | expected = config['expected'] 60 | weight = config['weight'] 61 | 62 | if header_value: 63 | if expected is None: 64 | # Header present, that's good enough 65 | total_score += weight 66 | logger.debug(f"Security header {header} present: {header_value}") 67 | elif isinstance(expected, list): 68 | # Check if value is in expected list 69 | if any(exp in header_value for exp in expected): 70 | total_score += weight 71 | else: 72 | issues.append(f"{header} has unexpected value: {header_value}") 73 | total_score += weight * 0.5 # Partial credit 74 | elif expected.lower() in header_value.lower(): 75 | total_score += weight 76 | else: 77 | issues.append(f"{header} has non-ideal value: {header_value} (expected: {expected})") 78 | total_score += weight * 0.5 # Partial credit 79 | else: 80 | issues.append(f"Missing security header: {header}") 81 | 82 | # Check for information disclosure headers 83 | revealing_headers = { 84 | 'Server', 'X-Powered-By', 'X-AspNet-Version', 'X-Generator' 85 | } 86 | found_revealing = revealing_headers.intersection(response.headers.keys()) 87 | 88 | if found_revealing: 89 | issues.append(f"Information disclosure headers found: {', 
'.join(found_revealing)}") 90 | total_score -= 1 # Penalty for revealing headers 91 | 92 | # Calculate security score percentage 93 | security_score = max(0, total_score / max_score) 94 | 95 | logger.info(f"Security headers analysis for {website}: {security_score:.2f} score ({total_score}/{max_score})") 96 | 97 | if issues: 98 | logger.warning(f"Security issues found: {issues}") 99 | 100 | # Determine result based on security score 101 | if security_score >= 0.9: 102 | return "🟢" 103 | elif security_score >= 0.6: 104 | return "🟠" 105 | else: 106 | return "🔴" 107 | 108 | except (Timeout, HTTPError) as e: 109 | logger.warning(f"HTTP/Timeout error for {website}: {e}") 110 | return "⚪" 111 | except RequestException as e: 112 | logger.warning(f"Request error for {website}: {e}") 113 | return "⚪" 114 | except Exception as e: 115 | logger.error(f"Unexpected error for {website}: {e}") 116 | return "⚪" 117 | -------------------------------------------------------------------------------- /checks/check_semantic_markup.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from bs4 import FeatureNotFound 5 | from requests.exceptions import RequestException 6 | import json 7 | import re 8 | 9 | # Configure logging 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | def check_semantic_markup(website): 14 | """ 15 | Check if the website contains semantic markup in the form of JSON-LD, Microdata, or RDFa. 16 | 17 | Args: 18 | website (str): The URL of the website to be checked. 19 | 20 | Returns: 21 | str: 22 | - "🟢" if comprehensive semantic markup is found 23 | - "🟠" if some semantic markup is found 24 | - "🔴" if no semantic markup is found 25 | - "⚪" if an error occurs 26 | """ 27 | # Input validation and URL normalization 28 | if not website or not isinstance(website, str): 29 | logger.error(f"Invalid website input: {website}") 30 | return "⚪" 31 | 32 | website = website.strip() 33 | if not website.startswith(('http://', 'https://')): 34 | website = f"https://{website}" 35 | 36 | headers = { 37 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 38 | } 39 | 40 | try: 41 | # Fetch website content 42 | response = requests.get(website, headers=headers, timeout=15) 43 | response.raise_for_status() 44 | html_content = response.text 45 | 46 | # Parse HTML content 47 | try: 48 | soup = BeautifulSoup(html_content, 'lxml') 49 | except FeatureNotFound: 50 | soup = BeautifulSoup(html_content, 'html.parser') 51 | 52 | markup_score = 0 53 | markup_types = [] 54 | 55 | # Method 1: Check for JSON-LD semantic markup 56 | json_ld_scripts = soup.find_all('script', type="application/ld+json") 57 | if json_ld_scripts: 58 | valid_json_ld = 0 59 | for script in json_ld_scripts: 60 | try: 61 | json_data = json.loads(script.string or '{}') 62 | if json_data and '@context' in json_data: 63 | valid_json_ld += 1 64 | logger.debug(f"Valid JSON-LD found: {json_data.get('@type', 'Unknown type')}") 65 | except (json.JSONDecodeError, AttributeError): 66 | continue 67 | 68 | if valid_json_ld > 0: 69 | markup_score += 3 # JSON-LD gets highest score 70 | markup_types.append(f"JSON-LD ({valid_json_ld} items)") 71 | 72 | # Method 2: Check for Microdata semantic markup 73 | microdata_elements = soup.find_all(attrs={"itemscope": True}) 74 | if microdata_elements: 75 | microdata_with_type = [elem for elem in 
microdata_elements if elem.get('itemtype')] 76 | if microdata_with_type: 77 | markup_score += 2 78 | markup_types.append(f"Microdata ({len(microdata_with_type)} items)") 79 | 80 | # Method 3: Check for RDFa semantic markup 81 | rdfa_vocab = soup.find_all(attrs={"vocab": True}) 82 | rdfa_typeof = soup.find_all(attrs={"typeof": True}) 83 | rdfa_property = soup.find_all(attrs={"property": True}) 84 | 85 | if rdfa_vocab or rdfa_typeof or rdfa_property: 86 | markup_score += 1 87 | rdfa_count = len(rdfa_vocab) + len(rdfa_typeof) + len(rdfa_property) 88 | markup_types.append(f"RDFa ({rdfa_count} attributes)") 89 | 90 | # Method 4: Check for Open Graph and Twitter Card markup 91 | og_tags = soup.find_all('meta', property=lambda x: x and x.startswith('og:')) 92 | twitter_tags = soup.find_all('meta', attrs={'name': lambda x: x and x.startswith('twitter:')}) 93 | 94 | if og_tags: 95 | markup_score += 1 96 | markup_types.append(f"Open Graph ({len(og_tags)} tags)") 97 | 98 | if twitter_tags: 99 | markup_score += 1 100 | markup_types.append(f"Twitter Cards ({len(twitter_tags)} tags)") 101 | 102 | # Method 5: Check for Schema.org patterns in class names 103 | schema_classes = soup.find_all(class_=re.compile(r'schema|hcard|vcard|geo|adr', re.IGNORECASE)) 104 | if schema_classes: 105 | markup_score += 1 106 | markup_types.append(f"Schema classes ({len(schema_classes)} elements)") 107 | 108 | logger.info(f"Semantic markup analysis for {website}: Score {markup_score}, Types: {', '.join(markup_types)}") 109 | 110 | # Determine result based on markup score and types 111 | if markup_score >= 4: 112 | return "🟢" # Comprehensive semantic markup 113 | elif markup_score >= 2: 114 | return "🟠" # Some semantic markup 115 | else: 116 | return "🔴" # No or minimal semantic markup 117 | 118 | except RequestException as e: 119 | logger.warning(f"Request error for {website}: {e}") 120 | return "⚪" 121 | except Exception as e: 122 | logger.error(f"Unexpected error for {website}: {e}") 123 | return "⚪" 124 | -------------------------------------------------------------------------------- /checks/check_subdomain_enumeration.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from urllib.parse import urlparse 4 | from requests.exceptions import RequestException, Timeout, HTTPError 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | import time 7 | 8 | # Configure logging 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | def check_subdomain_enumeration(website: str) -> tuple: 13 | """ 14 | Check for the existence of common subdomains for a given website with enhanced security analysis. 15 | 16 | Args: 17 | website (str): The main domain of the website to be checked. 18 | 19 | Returns: 20 | tuple: A status symbol and a list of discovered subdomains. 21 | - "🟢" if no potentially risky subdomains were discovered. 22 | - "🟠" if some subdomains were found but appear safe. 23 | - "🔴" if risky subdomains were found. 24 | - "⚪" for unexpected errors. 
25 | """ 26 | # Input validation and URL normalization 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪", [] 30 | 31 | # Extract domain from URL if full URL provided 32 | if website.startswith(('http://', 'https://')): 33 | parsed = urlparse(website) 34 | domain = parsed.netloc 35 | else: 36 | domain = website.strip() 37 | 38 | # Enhanced subdomain list with security-focused subdomains 39 | SUBDOMAINS = [ 40 | # Common subdomains 41 | "www", "api", "dev", "test", "staging", "mail", "blog", "shop", "admin", 42 | # Development/staging subdomains (potentially risky) 43 | "development", "stage", "beta", "alpha", "demo", "sandbox", 44 | # Infrastructure subdomains 45 | "cdn", "static", "assets", "media", "files", 46 | # Potentially sensitive subdomains 47 | "backup", "old", "legacy", "archive", "temp", "tmp", 48 | # Service subdomains 49 | "ftp", "ssh", "vpn", "remote", "portal" 50 | ] 51 | 52 | # Categorize subdomains by risk level 53 | RISKY_SUBDOMAINS = { 54 | "dev", "test", "staging", "development", "stage", "beta", "alpha", 55 | "demo", "sandbox", "backup", "old", "legacy", "archive", "temp", "tmp" 56 | } 57 | 58 | discovered_subdomains = [] 59 | risky_subdomains = [] 60 | 61 | headers = { 62 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 63 | } 64 | 65 | def check_subdomain(subdomain): 66 | """Helper function to check individual subdomain.""" 67 | subdomain_url = f"https://{subdomain}.{domain}" 68 | try: 69 | response = requests.get(subdomain_url, headers=headers, timeout=10, allow_redirects=True) 70 | if response.status_code == 200: 71 | logger.debug(f"Discovered subdomain: {subdomain_url}") 72 | return subdomain_url, subdomain in RISKY_SUBDOMAINS 73 | return None, False 74 | except (Timeout, HTTPError, RequestException): 75 | return None, False 76 | except Exception as e: 77 | logger.debug(f"Error checking {subdomain_url}: {e}") 78 | return None, False 79 | 80 | try: 81 | # Use ThreadPoolExecutor for concurrent subdomain checking 82 | with ThreadPoolExecutor(max_workers=10) as executor: 83 | # Submit all subdomain checks 84 | future_to_subdomain = { 85 | executor.submit(check_subdomain, sub): sub 86 | for sub in SUBDOMAINS 87 | } 88 | 89 | # Process results as they complete 90 | for future in as_completed(future_to_subdomain, timeout=60): 91 | subdomain = future_to_subdomain[future] 92 | try: 93 | result, is_risky = future.result() 94 | if result: 95 | discovered_subdomains.append(result) 96 | if is_risky: 97 | risky_subdomains.append(result) 98 | except Exception as e: 99 | logger.debug(f"Error processing result for {subdomain}: {e}") 100 | continue 101 | 102 | # Enhanced result analysis 103 | total_discovered = len(discovered_subdomains) 104 | total_risky = len(risky_subdomains) 105 | 106 | logger.info(f"Subdomain enumeration for {domain}: {total_discovered} discovered, {total_risky} potentially risky") 107 | 108 | if total_risky > 0: 109 | logger.warning(f"Risky subdomains found: {risky_subdomains}") 110 | return "🔴", discovered_subdomains 111 | elif total_discovered > 5: 112 | logger.warning(f"Multiple subdomains discovered for {domain}, potential attack surface") 113 | return "🟠", discovered_subdomains 114 | elif total_discovered > 0: 115 | logger.info(f"Few subdomains discovered for {domain}") 116 | return "🟠", discovered_subdomains 117 | else: 118 | logger.info(f"No subdomains discovered for {domain}") 119 | return "🟢", 
[] 120 | 121 | except Exception as e: 122 | logger.error(f"Unexpected error during subdomain enumeration for {domain}: {e}") 123 | return "⚪", [] 124 | -------------------------------------------------------------------------------- /checks/check_rate_limiting.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import logging 4 | from urllib.parse import urlparse, urlunparse 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def normalize_url(website): 12 | """ 13 | Normalize the website URL, ensuring it has a scheme. 14 | 15 | Args: 16 | - website (str): The URL of the website to normalize. 17 | 18 | Returns: 19 | - str: The normalized URL. 20 | """ 21 | if not website or not isinstance(website, str): 22 | raise ValueError("Invalid website input") 23 | 24 | website = website.strip() 25 | parsed_url = urlparse(website) 26 | 27 | if not parsed_url.scheme: 28 | normalized_url = urlunparse(('https', website, '', '', '', '')) 29 | else: 30 | normalized_url = website 31 | return normalized_url 32 | 33 | def check_rate_limiting(website: str, num_requests: int = 5, delay: float = 0.3, 34 | user_agent: str = "RateLimitChecker/2.0", threshold: int = 2) -> str: 35 | """ 36 | Checks for rate limiting using enhanced detection with varied delays and request patterns. 37 | 38 | Args: 39 | website (str): The URL of the website to check. 40 | num_requests (int): Number of requests to send for testing. 41 | delay (float): Initial delay in seconds between requests. 42 | user_agent (str): Custom User-Agent string for the requests. 43 | threshold (int): The maximum number of successful requests before assuming no rate limiting. 44 | 45 | Returns: 46 | str: "🟢", "🔴", or "⚪" based on the detection status. 
47 | """ 48 | headers = { 49 | "User-Agent": user_agent 50 | } 51 | 52 | # Normalize the URL 53 | try: 54 | website = normalize_url(website) 55 | except Exception as e: 56 | logger.error(f"Invalid URL format: {e}") 57 | return "⚪" 58 | 59 | status_codes = [] 60 | response_times = [] 61 | success_count = 0 62 | rate_limit_detected = False 63 | 64 | try: 65 | for i in range(num_requests): 66 | start_time = time.perf_counter() 67 | 68 | try: 69 | response = requests.get(website, headers=headers, timeout=15) 70 | end_time = time.perf_counter() 71 | 72 | response_time = end_time - start_time 73 | response_times.append(response_time) 74 | status_codes.append(response.status_code) 75 | 76 | # Check for rate limiting indicators 77 | if response.status_code == 429: 78 | logger.info(f"Rate limiting detected (429) for {website} after {i + 1} requests") 79 | rate_limit_detected = True 80 | break 81 | elif response.status_code in [503, 502, 504]: 82 | logger.warning(f"Server overload detected ({response.status_code}) for {website}") 83 | # Continue to see if it's consistent 84 | 85 | # Check for rate limiting headers 86 | rate_limit_headers = ['X-RateLimit-Limit', 'X-RateLimit-Remaining', 'Retry-After'] 87 | if any(header in response.headers for header in rate_limit_headers): 88 | logger.info(f"Rate limiting headers detected for {website}") 89 | rate_limit_detected = True 90 | break 91 | 92 | if response.status_code in [200, 201, 202, 203, 204, 205, 206]: 93 | success_count += 1 94 | 95 | # Adaptive delay based on response time 96 | elapsed_time = end_time - start_time 97 | adaptive_delay = max(delay, elapsed_time * 0.5) 98 | time_to_sleep = max(0, adaptive_delay - elapsed_time) 99 | 100 | if i < num_requests - 1: 101 | time.sleep(time_to_sleep) 102 | 103 | except (Timeout, HTTPError) as e: 104 | logger.debug(f"Request {i + 1} failed for {website}: {e}") 105 | status_codes.append(0) # Indicate failure 106 | time.sleep(delay * 2) # Longer delay after failure 107 | 108 | # Enhanced analysis 109 | avg_response_time = sum(response_times) / len(response_times) if response_times else 0 110 | logger.info(f"Rate limiting analysis for {website}: {success_count}/{num_requests} successful, " 111 | f"avg response time: {avg_response_time:.3f}s") 112 | logger.debug(f"Status codes: {status_codes}") 113 | 114 | if rate_limit_detected: 115 | logger.info(f"Rate limiting detected for {website}") 116 | return "🟢" 117 | elif success_count < threshold: 118 | logger.info(f"Possible rate limiting detected for {website} (low success rate)") 119 | return "🟢" 120 | else: 121 | logger.info(f"No rate limiting detected for {website}") 122 | return "🔴" 123 | 124 | except RequestException as e: 125 | logger.error(f"Request error while checking rate limiting for {website}: {e}") 126 | return "⚪" 127 | except Exception as e: 128 | logger.error(f"Unexpected error while checking rate limiting for {website}: {e}") 129 | return "⚪" 130 | -------------------------------------------------------------------------------- /checks/check_domain_breach.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, HTTPError 4 | from urllib.parse import urlparse 5 | import json 6 | import re 7 | import time 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | # Simple rate limiting cache 12 | _rate_limit_cache = { 13 | 'last_request': 0, 14 | 'min_interval': 1.5 # Respect HIBP rate limits 15 | } 16 | 17 | def 
check_domain_breach(website: str) -> str: 18 | """ 19 | Check if a domain has been found in any known data breaches using the Have I Been Pwned API. 20 | 21 | Args: 22 | website (str): The domain name to be checked. 23 | 24 | Returns: 25 | str: 26 | - "🟢" if no breaches are found. 27 | - "🟡" if the domain is found in old/resolved breaches. 28 | - "🔴" if the domain is found in recent/active breaches. 29 | - "⚪" if any errors occurred or if the breach check could not be completed. 30 | """ 31 | # Input validation and normalization 32 | if not website: 33 | logger.error("Website URL is required") 34 | return "⚪" 35 | 36 | # Normalize domain 37 | website = website.lower().strip() 38 | website = re.sub(r'^https?://', '', website) 39 | website = re.sub(r'^www\.', '', website) 40 | website = website.split('/')[0] # Remove path if present 41 | website = website.split(':')[0] # Remove port if present 42 | 43 | # Validate domain format 44 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9.-]*[a-zA-Z0-9]$', website): 45 | logger.error(f"Invalid domain format: {website}") 46 | return "⚪" 47 | 48 | # Performance optimization - rate limiting 49 | current_time = time.time() 50 | time_since_last = current_time - _rate_limit_cache['last_request'] 51 | if time_since_last < _rate_limit_cache['min_interval']: 52 | time.sleep(_rate_limit_cache['min_interval'] - time_since_last) 53 | 54 | _rate_limit_cache['last_request'] = time.time() 55 | 56 | try: 57 | # Enhanced API usage - check breaches endpoint 58 | url = f"https://haveibeenpwned.com/api/v3/breaches" 59 | headers = { 60 | "hibp-api-version": "3", 61 | "User-Agent": "WebsiteMonitor/1.0" 62 | } 63 | 64 | response = requests.get(url, headers=headers, timeout=15) 65 | response.raise_for_status() 66 | 67 | if response.status_code == 200: 68 | all_breaches = response.json() 69 | 70 | # Enhanced detection patterns - check for domain-related breaches 71 | domain_breaches = [] 72 | recent_breaches = [] 73 | 74 | for breach in all_breaches: 75 | breach_domain = breach.get('Domain', '') 76 | breach_name = breach.get('Name', '').lower() 77 | breach_date = breach.get('BreachDate', '') 78 | 79 | # Check if breach is related to the domain 80 | if (website in breach_domain.lower() or 81 | website in breach_name or 82 | breach_domain.lower().endswith(website)): 83 | 84 | domain_breaches.append(breach) 85 | 86 | # Check if breach is recent (within last 2 years) 87 | try: 88 | from datetime import datetime 89 | breach_datetime = datetime.strptime(breach_date, '%Y-%m-%d') 90 | days_ago = (datetime.now() - breach_datetime).days 91 | if days_ago <= 730: # 2 years 92 | recent_breaches.append(breach) 93 | except: 94 | pass 95 | 96 | # Improved scoring and categorization 97 | if recent_breaches: 98 | logger.critical(f"Domain {website} found in {len(recent_breaches)} recent breaches") 99 | for breach in recent_breaches[:3]: # Log first 3 recent breaches 100 | logger.critical(f" - {breach.get('Name')} ({breach.get('BreachDate')}): {breach.get('Description', '')[:100]}...") 101 | return "🔴" 102 | elif domain_breaches: 103 | logger.warning(f"Domain {website} found in {len(domain_breaches)} older breaches") 104 | for breach in domain_breaches[:2]: # Log first 2 older breaches 105 | logger.warning(f" - {breach.get('Name')} ({breach.get('BreachDate')})") 106 | return "🟡" 107 | else: 108 | logger.info(f"Domain {website} not found in any known breaches") 109 | return "🟢" 110 | 111 | except requests.exceptions.HTTPError as e: 112 | if e.response.status_code == 401: 113 | logger.error(f"API 
authentication failed for {website} - API key may be required") 114 | elif e.response.status_code == 429: 115 | logger.error(f"Rate limit exceeded while checking breaches for {website}") 116 | elif e.response.status_code == 404: 117 | logger.info(f"No breach data found for {website}") 118 | return "🟢" 119 | else: 120 | logger.error(f"HTTP error {e.response.status_code} while checking breaches for {website}: {e}") 121 | return "⚪" 122 | except RequestException as e: 123 | logger.error(f"Request error while checking breaches for {website}: {e}") 124 | return "⚪" 125 | except json.JSONDecodeError as e: 126 | logger.error(f"Invalid JSON response while checking breaches for {website}: {e}") 127 | return "⚪" 128 | except Exception as e: 129 | logger.error(f"Unexpected error while checking breaches for {website}: {e}") 130 | return "⚪" 131 | -------------------------------------------------------------------------------- /checks/check_ad_and_tracking.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from bs4 import BeautifulSoup 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_ad_and_tracking(website): 12 | """ 13 | Check if the website is using Google Analytics, AdsbyGoogle, or other common ad/tracking scripts. 14 | 15 | Args: 16 | website (str): URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🔴" if both Google Analytics and AdsbyGoogle are present 21 | - "🟠" if only Google Analytics is present 22 | - "🟡" if other ad/tracking scripts are detected 23 | - "🟢" if neither are present 24 | - "⚪" if an error occurs 25 | """ 26 | # Input validation and URL normalization 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | if not website.startswith(('http://', 'https://')): 33 | website = f"https://{website}" 34 | 35 | headers = { 36 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 37 | } 38 | 39 | # Enhanced ad/tracking services patterns 40 | tracking_patterns = { 41 | 'google_analytics': [ 42 | r'www\.google-analytics\.com/analytics\.js', 43 | r'www\.googletagmanager\.com/gtag/js', 44 | r'gtag\(', 45 | r'GoogleAnalyticsObject', 46 | r'ga\(', 47 | ], 48 | 'google_ads': [ 49 | r'pagead2\.googlesyndication\.com/pagead/js/adsbygoogle\.js', 50 | r'googlesyndication\.com', 51 | r'adsbygoogle', 52 | ], 53 | 'facebook': [ 54 | r'connect\.facebook\.net', 55 | r'fbevents\.js', 56 | r'facebook\.com/tr', 57 | ], 58 | 'other_tracking': [ 59 | r'cdn\.branch\.io', 60 | r'pixel\.quantserve\.com', 61 | r'bat\.bing\.com', 62 | r'cdn\.taboola\.com', 63 | r'tracker\.cleverbridge\.com', 64 | r'hotjar\.com', 65 | r'fullstory\.com', 66 | r'mixpanel\.com', 67 | r'segment\.io', 68 | r'amplitude\.com', 69 | ] 70 | } 71 | 72 | try: 73 | # Enhanced content analysis with retry mechanism 74 | response = requests.get(website, headers=headers, timeout=15) 75 | response.raise_for_status() 76 | content = response.text.lower() 77 | 78 | # Score-based detection system 79 | detection_score = { 80 | 'google_analytics': 0, 81 | 'google_ads': 0, 82 | 'facebook': 0, 83 | 'other_tracking': 0 84 | } 85 | 86 | # Check patterns in content 87 | for category, patterns in 
tracking_patterns.items(): 88 | for pattern in patterns: 89 | if re.search(pattern, content, re.IGNORECASE): 90 | detection_score[category] += 1 91 | logger.debug(f"Found {category} pattern: {pattern}") 92 | 93 | # Enhanced BeautifulSoup analysis 94 | soup = BeautifulSoup(response.text, 'lxml') 95 | 96 | # Check script tags 97 | scripts = soup.find_all('script', src=True) 98 | for script in scripts: 99 | src = script.get('src', '').lower() 100 | for category, patterns in tracking_patterns.items(): 101 | for pattern in patterns: 102 | if re.search(pattern, src): 103 | detection_score[category] += 1 104 | 105 | # Check inline scripts 106 | inline_scripts = soup.find_all('script') 107 | for script in inline_scripts: 108 | if script.string: 109 | script_content = script.string.lower() 110 | for category, patterns in tracking_patterns.items(): 111 | for pattern in patterns: 112 | if re.search(pattern, script_content): 113 | detection_score[category] += 1 114 | 115 | # Determine result based on weighted scoring 116 | has_google_analytics = detection_score['google_analytics'] > 0 117 | has_google_ads = detection_score['google_ads'] > 0 118 | has_other_tracking = (detection_score['facebook'] + detection_score['other_tracking']) > 0 119 | 120 | logger.info(f"Tracking detection scores for {website}: {detection_score}") 121 | 122 | if has_google_analytics and has_google_ads: 123 | return "🔴" 124 | elif has_google_analytics: 125 | return "🟠" 126 | elif has_other_tracking: 127 | return "🟡" 128 | else: 129 | return "🟢" 130 | 131 | except (Timeout, HTTPError, RequestException) as e: 132 | logger.warning(f"Request error for {website}: {e}") 133 | 134 | # Enhanced fallback with basic pattern matching 135 | try: 136 | response = requests.get(website, headers=headers, timeout=10) 137 | response.raise_for_status() 138 | 139 | # Simple pattern matching as fallback 140 | content = response.text.lower() 141 | if any(pattern in content for patterns in tracking_patterns.values() for pattern in patterns[:2]): 142 | return "🟡" 143 | return "🟢" 144 | 145 | except Exception as e: 146 | logger.error(f"Fallback failed for {website}: {e}") 147 | return "⚪" 148 | except Exception as e: 149 | logger.error(f"Unexpected error for {website}: {e}") 150 | return "⚪" 151 | -------------------------------------------------------------------------------- /checks/check_ssl_cert.py: -------------------------------------------------------------------------------- 1 | import ssl 2 | import socket 3 | import logging 4 | from datetime import datetime, timezone 5 | from urllib.parse import urlparse 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_ssl_cert(website: str, port: int = 443) -> str: 12 | """ 13 | Check the SSL certificate of a given website for comprehensive security analysis. 14 | 15 | Args: 16 | website (str): The hostname or URL to check. 17 | port (int, optional): The port number. Defaults to 443 (standard HTTPS port). 18 | 19 | Returns: 20 | str: 21 | - "🟢 (X days left)" if the certificate is valid and secure with more than 30 days left. 22 | - "🟠 (X days left)" if the certificate is valid but has 30 days or fewer left, or minor issues. 23 | - "🔴" if the certificate is expired, invalid, or has security issues. 24 | - "⚪" if an error occurs during the check. 
25 | """ 26 | # Input validation and hostname extraction 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | 33 | # Extract hostname from URL if provided 34 | if website.startswith(('http://', 'https://')): 35 | parsed = urlparse(website) 36 | host = parsed.netloc.split(':')[0] 37 | else: 38 | host = website.split(':')[0] 39 | 40 | # Create enhanced SSL context 41 | context = ssl.create_default_context() 42 | context.check_hostname = True 43 | context.verify_mode = ssl.CERT_REQUIRED 44 | 45 | try: 46 | # Create connection with timeout 47 | with socket.create_connection((host, port), timeout=15) as conn: 48 | with context.wrap_socket(conn, server_hostname=host) as sock: 49 | cert = sock.getpeercert() 50 | cert_der = sock.getpeercert(binary_form=True) 51 | 52 | # Get protocol and cipher information 53 | protocol_version = sock.version() 54 | cipher_info = sock.cipher() 55 | 56 | # Extract certificate information 57 | subject = dict(x[0] for x in cert['subject']) 58 | issuer = dict(x[0] for x in cert['issuer']) 59 | 60 | # Parse certificate dates (handling timezone-aware parsing) 61 | not_before = datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y %Z") 62 | not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y %Z") 63 | 64 | # Calculate days to expiration 65 | now = datetime.utcnow() 66 | days_to_expire = (not_after - now).days 67 | days_since_issued = (now - not_before).days 68 | 69 | # Security analysis 70 | security_issues = [] 71 | 72 | # Check certificate validity period 73 | if days_to_expire <= 0: 74 | logger.error(f"SSL certificate for {host} is expired") 75 | return "🔴" 76 | 77 | # Check for short validity periods (potential security issue) 78 | cert_lifetime_days = (not_after - not_before).days 79 | if cert_lifetime_days > 825: # More than ~2 years (old CA practice) 80 | security_issues.append("Long certificate lifetime") 81 | 82 | # Check signature algorithm 83 | if 'sha1' in cert.get('signatureAlgorithm', '').lower(): 84 | security_issues.append("Weak signature algorithm (SHA-1)") 85 | 86 | # Check key size (for RSA certificates) 87 | public_key_info = cert.get('subjectPublicKeyInfo', {}) 88 | 89 | # Check SAN (Subject Alternative Names) 90 | san_list = [] 91 | for san in cert.get('subjectAltName', []): 92 | if san[0] == 'DNS': 93 | san_list.append(san[1]) 94 | 95 | # Verify hostname is in certificate 96 | hostname_verified = ( 97 | subject.get('commonName') == host or 98 | host in san_list or 99 | any(san.replace('*.', '') in host for san in san_list if san.startswith('*.')) 100 | ) 101 | 102 | if not hostname_verified: 103 | security_issues.append("Hostname not in certificate") 104 | 105 | # Check certificate chain and issuer 106 | if 'Let\'s Encrypt' in issuer.get('organizationName', ''): 107 | logger.debug(f"Let's Encrypt certificate for {host}") 108 | 109 | logger.info(f"SSL certificate analysis for {host}: {days_to_expire} days left, {len(security_issues)} issues") 110 | 111 | if security_issues: 112 | logger.warning(f"Security issues found: {security_issues}") 113 | 114 | # Determine result based on analysis 115 | if security_issues: 116 | if days_to_expire <= 7: 117 | return "🔴" 118 | elif days_to_expire <= 30: 119 | return f"🔴 ({days_to_expire} days left)" 120 | else: 121 | return f"🟠 ({days_to_expire} days left)" 122 | elif days_to_expire <= 7: 123 | return "🔴" 124 | elif days_to_expire <= 30: 125 | return f"🟠 ({days_to_expire} days left)" 
126 | else: 127 | return f"🟢 ({days_to_expire} days left)" 128 | 129 | except ssl.SSLError as ssl_err: 130 | logger.error(f"SSL error for {host}:{port}: {ssl_err}") 131 | return "🔴" 132 | except socket.timeout: 133 | logger.warning(f"Connection timeout for {host}:{port}") 134 | return "⚪" 135 | except socket.error as sock_err: 136 | logger.warning(f"Socket error for {host}:{port}: {sock_err}") 137 | return "⚪" 138 | except Exception as e: 139 | logger.error(f"Unexpected error for {host}:{port}: {e}") 140 | return "⚪" 141 | -------------------------------------------------------------------------------- /checks/check_cors_headers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, HTTPError 4 | from urllib.parse import urlparse 5 | import re 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_cors_headers(website: str) -> str: 10 | """ 11 | Checks the CORS policy of the given website. 12 | 13 | Args: 14 | website (str): The website URL to check. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if the CORS policy is secure and properly configured. 19 | - "🟡" if CORS is configured but with potential security concerns. 20 | - "🔴" if the CORS policy is insecure or misconfigured. 21 | - "⚪" if an error occurred during checking. 22 | """ 23 | # Input validation and URL normalization 24 | if not website: 25 | logger.error("Website URL is required") 26 | return "⚪" 27 | 28 | if not website.startswith(('http://', 'https://')): 29 | website = f"https://{website}" 30 | 31 | try: 32 | parsed_url = urlparse(website) 33 | if not parsed_url.netloc: 34 | logger.error(f"Invalid URL format: {website}") 35 | return "⚪" 36 | except Exception as e: 37 | logger.error(f"URL parsing error for {website}: {e}") 38 | return "⚪" 39 | 40 | headers = { 41 | 'User-Agent': 'CORSPolicyChecker/2.0', 42 | 'Origin': 'https://example.com' # Test with a different origin 43 | } 44 | 45 | try: 46 | # Enhanced detection patterns - check multiple endpoints and methods 47 | endpoints_to_check = [ 48 | website, 49 | f"{website}/api", 50 | f"{website}/api/v1", 51 | f"{website}/graphql" 52 | ] 53 | 54 | cors_findings = [] 55 | 56 | for endpoint in endpoints_to_check: 57 | try: 58 | # Check OPTIONS request (preflight) 59 | options_response = requests.options(endpoint, headers=headers, timeout=10) 60 | 61 | # Check GET request for CORS headers 62 | get_response = requests.get(endpoint, headers=headers, timeout=10) 63 | 64 | for response in [options_response, get_response]: 65 | if response.status_code < 400: # Only check successful responses 66 | cors_analysis = analyze_cors_headers(response.headers, endpoint) 67 | if cors_analysis: 68 | cors_findings.append(cors_analysis) 69 | 70 | except RequestException: 71 | continue # Skip endpoints that don't respond 72 | 73 | if not cors_findings: 74 | logger.info(f"No CORS headers found for {website} - may not support CORS") 75 | return "🟢" 76 | 77 | # Improved scoring and categorization 78 | critical_issues = [f for f in cors_findings if f["risk"] == "critical"] 79 | high_issues = [f for f in cors_findings if f["risk"] == "high"] 80 | medium_issues = [f for f in cors_findings if f["risk"] == "medium"] 81 | 82 | # Log findings 83 | for finding in cors_findings[:5]: # Log first 5 findings 84 | level = logger.critical if finding["risk"] == "critical" else logger.warning 85 | level(f"CORS issue on {finding['endpoint']}: {finding['issue']}") 86 | 87 | if critical_issues: 88 
| logger.critical(f"Critical CORS vulnerabilities found on {website}") 89 | return "🔴" 90 | elif high_issues or len(medium_issues) >= 2: 91 | logger.warning(f"CORS security concerns found on {website}") 92 | return "🟡" 93 | elif medium_issues: 94 | logger.info(f"Minor CORS configuration issues found on {website}") 95 | return "🟡" 96 | else: 97 | logger.info(f"CORS policy appears secure for {website}") 98 | return "🟢" 99 | 100 | except (HTTPError, RequestException) as e: 101 | logger.error(f"Request error while checking CORS headers for {website}: {e}") 102 | return "⚪" 103 | except Exception as e: 104 | logger.error(f"Unexpected error while checking CORS headers for {website}: {e}") 105 | return "⚪" 106 | 107 | def analyze_cors_headers(headers, endpoint): 108 | """Helper function to analyze CORS headers for security issues""" 109 | cors_origin = headers.get('Access-Control-Allow-Origin', '') 110 | cors_credentials = headers.get('Access-Control-Allow-Credentials', '').lower() 111 | cors_methods = headers.get('Access-Control-Allow-Methods', '') 112 | cors_headers_allowed = headers.get('Access-Control-Allow-Headers', '') 113 | 114 | # Critical security issues 115 | if cors_origin == '*' and cors_credentials == 'true': 116 | return { 117 | "endpoint": endpoint, 118 | "issue": "Wildcard origin with credentials allowed - critical security risk", 119 | "risk": "critical" 120 | } 121 | 122 | # High risk issues 123 | if cors_origin == '*': 124 | return { 125 | "endpoint": endpoint, 126 | "issue": "Wildcard CORS origin allows all domains", 127 | "risk": "high" 128 | } 129 | 130 | # Medium risk issues 131 | if 'DELETE' in cors_methods.upper() and cors_credentials == 'true': 132 | return { 133 | "endpoint": endpoint, 134 | "issue": "DELETE method allowed with credentials", 135 | "risk": "medium" 136 | } 137 | 138 | if '*' in cors_headers_allowed: 139 | return { 140 | "endpoint": endpoint, 141 | "issue": "Wildcard headers allowed", 142 | "risk": "medium" 143 | } 144 | 145 | return None 146 | -------------------------------------------------------------------------------- /checks/check_third_party_resources.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from urllib.parse import urlparse 4 | from bs4 import BeautifulSoup 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_third_party_resources(website: str) -> str: 12 | """ 13 | Check for third-party resources loaded by the website with enhanced analysis. 14 | 15 | Args: 16 | website (str): URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if the number of third-party resources is minimal (0-3 domains). 21 | - "🟠" if there is a moderate number of third-party resources (4-8 domains). 22 | - "🔴" if there is a high number of third-party resources (9+ domains). 23 | - "⚪" for any errors. 
24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪" 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Fetch content with proper error handling 40 | response = requests.get(website, headers=headers, timeout=15) 41 | response.raise_for_status() 42 | 43 | # Parse main domain 44 | main_domain = urlparse(website).netloc.lower() 45 | main_domain_parts = main_domain.split('.') 46 | if len(main_domain_parts) >= 2: 47 | root_domain = '.'.join(main_domain_parts[-2:]) 48 | else: 49 | root_domain = main_domain 50 | 51 | # Parse HTML content 52 | soup = BeautifulSoup(response.text, 'html.parser') 53 | 54 | # Track third-party domains and resource types 55 | third_party_domains = set() 56 | resource_types = { 57 | 'scripts': 0, 58 | 'stylesheets': 0, 59 | 'images': 0, 60 | 'fonts': 0, 61 | 'other': 0 62 | } 63 | 64 | # Enhanced resource detection 65 | resource_selectors = [ 66 | ('script', 'src', 'scripts'), 67 | ('link', 'href', 'stylesheets'), 68 | ('img', 'src', 'images'), 69 | ('source', 'src', 'images'), 70 | ('iframe', 'src', 'other'), 71 | ('embed', 'src', 'other'), 72 | ('object', 'data', 'other') 73 | ] 74 | 75 | for tag_name, attr_name, resource_type in resource_selectors: 76 | for tag in soup.find_all(tag_name): 77 | resource_url = tag.get(attr_name) 78 | if resource_url: 79 | parsed_url = urlparse(resource_url) 80 | domain = parsed_url.netloc.lower() 81 | 82 | if domain and domain != main_domain: 83 | # Check if it's a subdomain of the main domain 84 | domain_parts = domain.split('.') 85 | if len(domain_parts) >= 2: 86 | domain_root = '.'.join(domain_parts[-2:]) 87 | if domain_root != root_domain: 88 | third_party_domains.add(domain) 89 | resource_types[resource_type] += 1 90 | logger.debug(f"Third-party {resource_type}: {resource_url}") 91 | 92 | # Check for font resources specifically 93 | for link in soup.find_all('link', rel=lambda x: x and 'font' in str(x).lower()): 94 | href = link.get('href') 95 | if href: 96 | parsed_url = urlparse(href) 97 | domain = parsed_url.netloc.lower() 98 | if domain and domain != main_domain: 99 | domain_parts = domain.split('.') 100 | if len(domain_parts) >= 2: 101 | domain_root = '.'.join(domain_parts[-2:]) 102 | if domain_root != root_domain: 103 | third_party_domains.add(domain) 104 | resource_types['fonts'] += 1 105 | 106 | third_party_count = len(third_party_domains) 107 | total_resources = sum(resource_types.values()) 108 | 109 | logger.info(f"Third-party analysis for {website}: {third_party_count} domains, {total_resources} resources") 110 | logger.debug(f"Resource breakdown: {resource_types}") 111 | logger.debug(f"Third-party domains: {list(third_party_domains)}") 112 | 113 | # Enhanced scoring based on domain count and resource distribution 114 | if third_party_count == 0: 115 | logger.info(f"No third-party resources detected for {website}") 116 | return "🟢" 117 | elif third_party_count <= 3: 118 | logger.info(f"Minimal third-party resources ({third_party_count} domains) for {website}") 119 | return "🟢" 120 | elif third_party_count <= 8: 121 | logger.warning(f"Moderate third-party resources ({third_party_count} domains) for {website}") 122 | return "🟠" 123 | 
else: 124 | logger.warning(f"High number of third-party resources ({third_party_count} domains) for {website}") 125 | return "🔴" 126 | 127 | except (Timeout, HTTPError) as e: 128 | logger.warning(f"HTTP/Timeout error for {website}: {e}") 129 | return "⚪" 130 | except RequestException as e: 131 | logger.warning(f"Request error for {website}: {e}") 132 | return "⚪" 133 | except Exception as e: 134 | logger.error(f"Unexpected error for {website}: {e}") 135 | return "⚪" 136 | -------------------------------------------------------------------------------- /checks/check_third_party_requests.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from urllib.parse import urlparse 4 | from bs4 import BeautifulSoup 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_third_party_requests(website: str) -> str: 12 | """ 13 | Monitor the number of third-party requests made by a website with enhanced analysis. 14 | 15 | Args: 16 | website (str): URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if the number of third-party requests is minimal (0-20). 21 | - "🟠" if there is a moderate number of third-party requests (21-50). 22 | - "🔴" if the website makes a high number of third-party requests (51+). 23 | - "⚪" for any errors. 24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪" 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Send HTTP GET request with enhanced error handling 40 | response = requests.get(website, headers=headers, timeout=15) 41 | response.raise_for_status() 42 | 43 | # Parse the main domain from the website URL 44 | parsed_url = urlparse(website) 45 | main_domain = parsed_url.netloc.lower() 46 | main_domain_parts = main_domain.split('.') 47 | if len(main_domain_parts) >= 2: 48 | root_domain = '.'.join(main_domain_parts[-2:]) 49 | else: 50 | root_domain = main_domain 51 | 52 | # Parse HTML content using BeautifulSoup 53 | soup = BeautifulSoup(response.text, 'lxml') 54 | 55 | # Track third-party requests by category 56 | third_party_requests = 0 57 | request_categories = { 58 | 'scripts': 0, 59 | 'stylesheets': 0, 60 | 'images': 0, 61 | 'fonts': 0, 62 | 'iframes': 0, 63 | 'other': 0 64 | } 65 | 66 | third_party_domains = set() 67 | 68 | # Enhanced resource detection with categorization 69 | resource_selectors = [ 70 | ('script', 'src', 'scripts'), 71 | ('link', 'href', 'stylesheets'), 72 | ('img', 'src', 'images'), 73 | ('source', 'src', 'images'), 74 | ('iframe', 'src', 'iframes'), 75 | ('embed', 'src', 'other'), 76 | ('object', 'data', 'other'), 77 | ('video', 'src', 'other'), 78 | ('audio', 'src', 'other') 79 | ] 80 | 81 | for tag_name, attr_name, category in resource_selectors: 82 | for tag in soup.find_all(tag_name): 83 | resource_url = tag.get(attr_name) 84 | if resource_url and resource_url.startswith(('http://', 'https://')): 85 | parsed_resource = urlparse(resource_url) 86 | domain = parsed_resource.netloc.lower() 87 | 88 | if domain and domain != 
main_domain: 89 | # Check if it's a subdomain of the main domain 90 | domain_parts = domain.split('.') 91 | if len(domain_parts) >= 2: 92 | domain_root = '.'.join(domain_parts[-2:]) 93 | if domain_root != root_domain: 94 | third_party_requests += 1 95 | third_party_domains.add(domain) 96 | request_categories[category] += 1 97 | logger.debug(f"Third-party {category}: {resource_url}") 98 | 99 | # Check for font resources specifically 100 | for link in soup.find_all('link', rel=lambda x: x and any(font_rel in str(x).lower() for font_rel in ['font', 'preload'])): 101 | href = link.get('href') 102 | if href and href.startswith(('http://', 'https://')): 103 | parsed_href = urlparse(href) 104 | domain = parsed_href.netloc.lower() 105 | if domain and domain != main_domain: 106 | domain_parts = domain.split('.') 107 | if len(domain_parts) >= 2: 108 | domain_root = '.'.join(domain_parts[-2:]) 109 | if domain_root != root_domain: 110 | third_party_requests += 1 111 | third_party_domains.add(domain) 112 | request_categories['fonts'] += 1 113 | 114 | logger.info(f"Third-party analysis for {website}: {third_party_requests} requests across {len(third_party_domains)} domains") 115 | logger.debug(f"Request breakdown: {request_categories}") 116 | logger.debug(f"Third-party domains: {list(third_party_domains)}") 117 | 118 | # Enhanced threshold-based evaluation 119 | if third_party_requests <= 20: 120 | logger.info(f"Minimal third-party requests ({third_party_requests}) for {website}") 121 | return "🟢" 122 | elif third_party_requests <= 50: 123 | logger.warning(f"Moderate third-party requests ({third_party_requests}) for {website}") 124 | return "🟠" 125 | else: 126 | logger.warning(f"High number of third-party requests ({third_party_requests}) for {website}") 127 | return "🔴" 128 | 129 | except (Timeout, HTTPError) as e: 130 | logger.warning(f"HTTP/Timeout error for {website}: {e}") 131 | return "⚪" 132 | except RequestException as e: 133 | logger.warning(f"Request error for {website}: {e}") 134 | return "⚪" 135 | except Exception as e: 136 | logger.error(f"Unexpected error for {website}: {e}") 137 | return "⚪" 138 | -------------------------------------------------------------------------------- /checks/check_sitemap.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urljoin, urlparse 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_sitemap(website): 12 | """ 13 | Check if the provided website has a sitemap.xml with enhanced validation. 14 | 15 | Args: 16 | website (str): The URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if a valid sitemap is found. 21 | - "🔴" if no sitemap is found or if there's a request-related error. 22 | - "⚪" for any other unexpected errors. 
23 | """ 24 | # Input validation and URL normalization 25 | if not website or not isinstance(website, str): 26 | logger.error(f"Invalid website input: {website}") 27 | return "⚪" 28 | 29 | website = website.strip() 30 | if not website.startswith(('http://', 'https://')): 31 | website = f"https://{website}" 32 | 33 | # Enhanced sitemap paths with more comprehensive patterns 34 | sitemap_paths = [ 35 | '/sitemap.xml', # Default location 36 | '/sitemap_index.xml', # Index file for multiple sitemaps 37 | '/sitemap/sitemap.xml', # Common alternative path 38 | '/sitemap1.xml', # Numbered sitemap 39 | '/sitemap-index.xml', # Alternative index naming 40 | '/sitemap/sitemap-index.xml', # Nested alternative 41 | '/sitemap_index.xml.gz', # Compressed sitemap 42 | '/sitemaps.xml', # Plural variant 43 | '/site-map.xml', # Hyphenated variant 44 | '/robots.txt' # Check robots.txt for sitemap reference 45 | ] 46 | 47 | headers = { 48 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 49 | } 50 | 51 | try: 52 | session = requests.Session() 53 | session.headers.update(headers) 54 | 55 | # Method 1: Check common sitemap paths 56 | for path in sitemap_paths[:-1]: # Exclude robots.txt for now 57 | try: 58 | sitemap_url = urljoin(website, path) 59 | response = session.get(sitemap_url, timeout=15) 60 | 61 | if response.status_code == 200: 62 | content = response.text.lower() 63 | 64 | # Enhanced validation of sitemap content 65 | if any(indicator in content for indicator in ['', '']): 66 | logger.info(f"Valid sitemap found at {sitemap_url}") 67 | return "🟢" 68 | 69 | except (Timeout, HTTPError, RequestException): 70 | continue 71 | 72 | # Method 2: Check robots.txt for sitemap references 73 | try: 74 | robots_url = urljoin(website, '/robots.txt') 75 | robots_response = session.get(robots_url, timeout=10) 76 | 77 | if robots_response.status_code == 200: 78 | robots_content = robots_response.text.lower() 79 | if 'sitemap:' in robots_content: 80 | # Extract sitemap URLs from robots.txt 81 | import re 82 | sitemap_matches = re.findall(r'sitemap:\s*(.+)', robots_content, re.IGNORECASE) 83 | 84 | for sitemap_url in sitemap_matches: 85 | sitemap_url = sitemap_url.strip() 86 | try: 87 | sitemap_response = session.get(sitemap_url, timeout=10) 88 | if sitemap_response.status_code == 200: 89 | content = sitemap_response.text.lower() 90 | if any(indicator in content for indicator in ['', '']): 91 | logger.info(f"Valid sitemap found via robots.txt: {sitemap_url}") 92 | return "🟢" 93 | except (Timeout, HTTPError, RequestException): 94 | continue 95 | except (Timeout, HTTPError, RequestException): 96 | pass 97 | 98 | # Method 3: Check HTML for sitemap links 99 | try: 100 | main_response = session.get(website, timeout=15) 101 | if main_response.status_code == 200: 102 | soup = BeautifulSoup(main_response.text, 'html.parser') 103 | 104 | # Look for sitemap links in HTML 105 | sitemap_links = soup.find_all('a', href=lambda x: x and 'sitemap' in x.lower()) 106 | for link in sitemap_links: 107 | href = link.get('href') 108 | if href: 109 | sitemap_url = urljoin(website, href) 110 | try: 111 | sitemap_response = session.get(sitemap_url, timeout=10) 112 | if sitemap_response.status_code == 200: 113 | content = sitemap_response.text.lower() 114 | if any(indicator in content for indicator in [' "Secrets and variables" -> "Actions" in your GitHub repository. 72 | * Add a new repository secret named `PAGESPEED_API_KEY` and paste your API key as the value. 
73 | 5. **Create the `report_template.md` File:** 74 | * Create a new file called `report_template.md` in the root of your repository if it doesn't exist. 75 | * Add a default template to generate the report, for example: 76 | ```markdown 77 | # Websites Monitor 78 | ``` 79 | 6. **Commit All Changes:** 80 | * Commit and push the changes to your forked repository to trigger the initial report generation. 81 | 82 | ### How the Monitoring Works 83 | 84 | - **Daily Execution:** The `create-report.yml` GitHub Action workflow is scheduled to run daily. 85 | - **Website Checks:** The workflow executes the `main.py` script, which performs all the checks on the websites specified in `config.yaml`. 86 | - **Report Generation:** The `main.py` script automatically generates the report in the `README.md` file using the `report_template.md` as a base. 87 | - **Automatic Updates:** The `README.md` file will be automatically updated with the latest check results after each successful run of the workflow. 88 | 89 | ## Configuration Options 90 | 91 | The `config.yaml` file allows for various configurations: 92 | 93 | - `websites`: List of URLs to monitor (required) 94 | - `output_file`: The output filename of the generated report (default: `report.md`) 95 | - `max_workers`: Number of concurrent tasks when performing the checks (default: 4) 96 | - `timeout`: Default timeout in seconds for the checks (default: 30) 97 | - `report_template`: The filename of the report template (default: `report_template.md`) 98 | - `github_workflow_badge`: The GitHub workflow badge URL 99 | - `pagespeed_api_key`: Google PageSpeed Insights API key (can also be set as environment variable `PAGESPEED_API_KEY`) 100 | 101 | ### Example Configuration 102 | 103 | ```yaml 104 | websites: 105 | - audiolibri.org 106 | - example.com 107 | - mywebsite.com 108 | output_file: report.md 109 | max_workers: 2 110 | timeout: 30 111 | report_template: report_template.md 112 | github_workflow_badge: https://github.com/fabriziosalmi/websites-monitor/actions/workflows/create-report.yml/badge.svg 113 | ``` 114 | 115 | **Note**: When using GitHub Actions, you can override the `output_file` to `README.md` to update your repository's README automatically. 116 | 117 | ## Customizing Checks 118 | 119 | You can modify existing checks or add new ones by editing the files in the `checks` directory and then adding the check to the `WebsiteMonitor` class in `main.py`. Ensure your new check functions follow the same format, returning an emoji indicating status (🟢, 🔴, or ⚪). 120 | 121 | ## Understanding the Output 122 | 123 | The generated report in `README.md` includes a table with a row for each website, and the results for each check in each column. 124 | 125 | - 🟢: The check is successful. 126 | - 🔴: The check failed. 127 | - 🟡: The check returned a warning or requires attention. 128 | - ⚪: An error occurred during the check, or the check was not completed. 129 | 130 | ## Support 131 | 132 | For any issues or suggestions regarding this project, feel free to open an issue on GitHub. 
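To make the check format described under "Customizing Checks" concrete, here is a minimal sketch of what a new check module could look like. The file name `checks/check_x_frame_options.py`, the function name, and the header it inspects are illustrative assumptions rather than part of this repository; only the calling convention (a website string in, a status emoji out) mirrors the existing checks.

```python
# checks/check_x_frame_options.py -- hypothetical example, not part of the repository
import logging

import requests
from requests.exceptions import RequestException

logger = logging.getLogger(__name__)

def check_x_frame_options(website: str) -> str:
    """
    Check whether the website sends an X-Frame-Options header.

    Returns:
        str: "🟢" if the header is present, "🔴" if it is missing, "⚪" on errors.
    """
    # Normalize the URL the same way the existing checks do.
    if not website.startswith(('http://', 'https://')):
        website = f"https://{website}"
    try:
        response = requests.get(website, timeout=15)
        response.raise_for_status()
        if response.headers.get('X-Frame-Options'):
            return "🟢"
        return "🔴"
    except RequestException as e:
        logger.warning(f"Request error for {website}: {e}")
        return "⚪"
```

After adding a module like this, the function would still need to be registered in the `WebsiteMonitor` class in `main.py` (as described above) so that it appears as a column in the generated report.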
133 | 134 | --- 135 | -------------------------------------------------------------------------------- /checks/check_privacy_exposure.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import logging 4 | from requests.exceptions import RequestException, Timeout, HTTPError 5 | from bs4 import BeautifulSoup 6 | from urllib.parse import urljoin 7 | 8 | # Configure logging 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | def check_privacy_exposure(website: str) -> str: 13 | """ 14 | Check a given website for potential exposure of sensitive or private data with enhanced detection. 15 | 16 | Args: 17 | website (str): The URL of the website to be checked. 18 | 19 | Returns: 20 | str: "🟢" if no patterns of sensitive data are found, "🔴" otherwise, "⚪" if an error occurred. 21 | """ 22 | # Input validation and URL normalization 23 | if not website or not isinstance(website, str): 24 | logger.error(f"Invalid website input: {website}") 25 | return "⚪" 26 | 27 | website = website.strip() 28 | if not website.startswith(('http://', 'https://')): 29 | website = f"https://{website}" 30 | 31 | headers = { 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 33 | } 34 | 35 | # Enhanced patterns to detect sensitive data exposure 36 | sensitive_data_patterns = [ 37 | r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', # Email addresses 38 | r'\b\d{10,11}\b', # Phone numbers (10-11 digits) 39 | r'(\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b)', # Credit Card numbers 40 | r'(\b\d{3}[- ]?\d{2}[- ]?\d{4}\b)', # Social Security Numbers 41 | r'("AWS_ACCESS_KEY_ID"|"aws_secret_access_key"|"AKIA[0-9A-Z]{16}")', # AWS Access Keys 42 | r'("-----BEGIN PRIVATE KEY-----")', # Private key exposure 43 | r'("api_key"|"apikey"):\s*["\'][^"\']+["\']', # API keys 44 | r'("password"|"passwd"):\s*["\'][^"\']+["\']', # Passwords 45 | r'("secret"|"token"):\s*["\'][^"\']+["\']', # Secrets/tokens 46 | r'\b[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\b', # IP addresses 47 | ] 48 | 49 | exposure_score = 0 50 | 51 | try: 52 | # Method 1: Direct HTML content analysis 53 | response = requests.get(website, headers=headers, timeout=15) 54 | response.raise_for_status() 55 | 56 | # Check for sensitive data patterns in the HTML content 57 | for pattern in sensitive_data_patterns: 58 | matches = re.findall(pattern, response.text, re.IGNORECASE) 59 | if matches: 60 | exposure_score += len(matches) 61 | logger.warning(f"Sensitive data pattern found: {pattern[:30]}... 
({len(matches)} matches)") 62 | 63 | # Method 2: Meta tags and scripts analysis 64 | soup = BeautifulSoup(response.text, 'html.parser') 65 | 66 | # Check meta tags for privacy leaks 67 | meta_tags = soup.find_all('meta', {'name': re.compile(r'(description|keywords|author)', re.IGNORECASE)}) 68 | for tag in meta_tags: 69 | content = tag.get('content', '') 70 | for pattern in sensitive_data_patterns: 71 | if re.search(pattern, content, re.IGNORECASE): 72 | exposure_score += 1 73 | logger.warning(f"Sensitive data in meta tag: {tag}") 74 | 75 | # Check script tags for exposed data 76 | script_tags = soup.find_all('script') 77 | for script in script_tags: 78 | if script.string: 79 | for pattern in sensitive_data_patterns: 80 | matches = re.findall(pattern, script.string, re.IGNORECASE) 81 | if matches: 82 | exposure_score += len(matches) 83 | logger.warning(f"Sensitive data in script tag: {len(matches)} matches") 84 | 85 | # Method 3: Check for accidental exposure of configuration files 86 | sensitive_paths = [ 87 | '/.env', '/config.json', '/settings.py', '/config.php', 88 | '/wp-config.php', '/.git/config', '/.htaccess', '/.aws/credentials', 89 | '/database.yml', '/.env.local', '/.env.production', '/secrets.json', 90 | '/app.config', '/web.config', '/.npmrc', '/.dockerenv' 91 | ] 92 | 93 | for path in sensitive_paths: 94 | try: 95 | file_url = urljoin(website, path) 96 | file_response = requests.get(file_url, headers=headers, timeout=5) 97 | 98 | if file_response.status_code == 200: 99 | for pattern in sensitive_data_patterns: 100 | matches = re.findall(pattern, file_response.text, re.IGNORECASE) 101 | if matches: 102 | exposure_score += len(matches) * 2 # Higher weight for config files 103 | logger.error(f"Sensitive data in config file {path}: {len(matches)} matches") 104 | 105 | except (Timeout, HTTPError): 106 | continue # Ignore timeouts and HTTP errors for these paths 107 | 108 | # Method 4: Check common backup/temp file patterns 109 | backup_patterns = [ 110 | '/backup.sql', '/dump.sql', '/database.bak', '/site.zip', 111 | '/backup.zip', '/old_site.tar.gz', '/backup.tar' 112 | ] 113 | 114 | for backup_path in backup_patterns: 115 | try: 116 | backup_url = urljoin(website, backup_path) 117 | backup_response = requests.get(backup_url, headers=headers, timeout=5) 118 | 119 | if backup_response.status_code == 200: 120 | exposure_score += 5 # High weight for accessible backup files 121 | logger.error(f"Accessible backup file found: {backup_path}") 122 | 123 | except (Timeout, HTTPError): 124 | continue 125 | 126 | logger.info(f"Privacy exposure analysis for {website}: exposure score {exposure_score}") 127 | 128 | if exposure_score == 0: 129 | return "🟢" 130 | else: 131 | logger.warning(f"Privacy exposure detected for {website}: score {exposure_score}") 132 | return "🔴" 133 | 134 | except (Timeout, HTTPError) as e: 135 | logger.warning(f"HTTP/Timeout error while checking privacy exposure for {website}: {e}") 136 | return "⚪" 137 | except RequestException as e: 138 | logger.warning(f"Request error while checking privacy exposure for {website}: {e}") 139 | return "⚪" 140 | except Exception as e: 141 | logger.error(f"Unexpected error while checking privacy exposure for {website}: {e}") 142 | return "⚪" 143 | --------------------------------------------------------------------------------
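As a closing usage illustration for the check modules shown above, the sketch below runs a handful of them against one site and assembles a markdown-style row of status emojis. It assumes the `checks` directory is importable as a package and that each function follows the shared signature (a `website: str` argument in, one of "🟢", "🟡", "🟠", "🔴", "⚪" out); the real aggregation and report rendering live in `main.py` and `report_template.md`, which are not reproduced here.

```python
# Hypothetical driver sketch -- not part of the repository.
# Assumes the checks/ directory is importable as a package (for example, by running
# from the repository root with it on PYTHONPATH).
from checks.check_cors_headers import check_cors_headers
from checks.check_third_party_requests import check_third_party_requests
from checks.check_sitemap import check_sitemap
from checks.check_privacy_exposure import check_privacy_exposure

# Each entry pairs a column label with a check function of signature (str) -> str.
CHECKS = [
    ("CORS", check_cors_headers),
    ("3rd-Party Requests", check_third_party_requests),
    ("Sitemap", check_sitemap),
    ("Privacy Exposure", check_privacy_exposure),
]

def run_checks(site: str) -> str:
    """Return one markdown table row: the site followed by one emoji per check."""
    results = [check(site) for _, check in CHECKS]
    return f"| {site} | " + " | ".join(results) + " |"

if __name__ == "__main__":
    header = "| Site | " + " | ".join(name for name, _ in CHECKS) + " |"
    print(header)
    print(run_checks("example.com"))
```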