├── report_template.md ├── .gitignore ├── screenshot.png ├── .github ├── FUNDING.yml └── workflows │ └── create-report.yml ├── requirements.txt ├── pyproject.toml ├── config.yaml ├── .dockerignore ├── Dockerfile ├── LICENSE ├── .env.example ├── checks ├── check_content_type_headers.py ├── check_amp_compatibility.py ├── check_cdn.py ├── check_cookie_duration.py ├── check_browser_compatibility.py ├── check_floc.py ├── check_cookie_flags.py ├── check_cms_used.py ├── check_clientside_rendering.py ├── check_brotli_compression.py ├── check_pagespeed_performances.py ├── check_cookie_policy.py ├── check_open_graph_protocol.py ├── check_alt_tags.py ├── check_broken_links.py ├── check_mobile_friendly.py ├── check_xss_protection.py ├── check_internationalization.py ├── check_redirects.py ├── check_hsts.py ├── check_server_response_time.py ├── check_robot_txt.py ├── check_privacy_protected_whois.py ├── check_email_domain.py ├── check_mixed_content.py ├── check_website_load_time.py ├── check_asset_minification.py ├── check_subresource_integrity.py ├── check_domain_expiration.py ├── check_redirect_chains.py ├── check_ssl_cipher_strength.py ├── check_url_canonicalization.py ├── check_dnssec.py ├── check_favicon.py ├── check_domainsblacklists_blacklist.py ├── check_security_headers.py ├── check_semantic_markup.py ├── check_subdomain_enumeration.py ├── check_rate_limiting.py ├── check_domain_breach.py ├── check_ad_and_tracking.py ├── check_ssl_cert.py ├── check_cors_headers.py ├── check_third_party_resources.py ├── check_third_party_requests.py ├── check_sitemap.py └── check_privacy_exposure.py ├── project_description.md ├── debug_scheduler.py ├── CHANGELOG.md ├── docker-compose.yml ├── scheduler.py └── usage.md /report_template.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .pytest_cache 3 | tests/ 4 | *.log 5 | codeflash.yaml -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fabriziosalmi/websites-monitor/HEAD/screenshot.png -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [fabriziosalmi] 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | python-whois 3 | dnspython 4 | beautifulsoup4 5 | selenium 6 | pyyaml 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.codeflash] 2 | # All paths are relative to this pyproject.toml's directory. 
3 | module-root = "checks" 4 | tests-root = "tests" 5 | test-framework = "pytest" 6 | ignore-paths = [] 7 | formatter-cmds = ["disabled"] 8 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | websites: 2 | - audiolibri.org 3 | - example.com 4 | output_file: README.md 5 | max_workers: 2 6 | timeout: 30 7 | report_template: report_template.md 8 | github_workflow_badge: https://github.com/fabriziosalmi/websites-monitor/actions/workflows/create-report.yml/badge.svg 9 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Docker ignore file to reduce build context size 2 | 3 | # Git 4 | .git 5 | .gitignore 6 | .gitattributes 7 | 8 | # Docker files 9 | Dockerfile 10 | docker-compose.yml 11 | .dockerignore 12 | 13 | # Environment files 14 | .env 15 | .env.local 16 | .env.example 17 | 18 | # Logs 19 | *.log 20 | logs/ 21 | monitor.log 22 | 23 | # Cache 24 | __pycache__/ 25 | *.pyc 26 | *.pyo 27 | *.pyd 28 | .Python 29 | .pytest_cache/ 30 | .coverage 31 | .tox/ 32 | 33 | # IDE 34 | .vscode/ 35 | .idea/ 36 | *.swp 37 | *.swo 38 | *~ 39 | 40 | # OS 41 | .DS_Store 42 | Thumbs.db 43 | 44 | # Reports (will be mounted as volume) 45 | README.md.backup 46 | reports/ 47 | 48 | # Documentation 49 | docs/ 50 | *.md 51 | 52 | # Tests 53 | tests/ 54 | test_*.py 55 | 56 | # Node modules (if any) 57 | node_modules/ 58 | npm-debug.log* 59 | 60 | # Backup files 61 | *.backup 62 | *.bak 63 | *.tmp 64 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Website Monitor - Dockerfile 2 | FROM python:3.11-slim 3 | 4 | # Set working directory 5 | WORKDIR /app 6 | 7 | # Install system dependencies 8 | RUN apt-get update && apt-get install -y \ 9 | curl \ 10 | dnsutils \ 11 | whois \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | # Copy requirements first for better caching 15 | COPY requirements.txt . 16 | COPY pyproject.toml . 17 | 18 | # Install Python dependencies 19 | RUN pip install --no-cache-dir -r requirements.txt 20 | 21 | # Install additional API dependencies 22 | RUN pip install --no-cache-dir fastapi uvicorn[standard] pydantic 23 | 24 | # Copy application code 25 | COPY . . 
26 | 27 | # Create necessary directories 28 | RUN mkdir -p /app/logs /app/reports 29 | 30 | # Set environment variables 31 | ENV PYTHONPATH=/app 32 | ENV PYTHONUNBUFFERED=1 33 | 34 | # Health check 35 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 36 | CMD curl -f http://localhost:8000/health || exit 1 37 | 38 | # Expose API port 39 | EXPOSE 8000 40 | 41 | # Default command - can be overridden in docker-compose 42 | CMD ["python", "api.py"] 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Fabrizio Salmi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Website Monitor - Environment Variables 2 | # Copy this file to .env and modify values as needed 3 | 4 | # =================================== 5 | # API Configuration 6 | # =================================== 7 | API_HOST=0.0.0.0 8 | API_PORT=8000 9 | 10 | # =================================== 11 | # Google PageSpeed Insights 12 | # =================================== 13 | # Get your API key from: https://developers.google.com/speed/docs/insights/v5/get-started 14 | PAGESPEED_API_KEY=your_pagespeed_api_key_here 15 | 16 | # =================================== 17 | # Monitoring Configuration 18 | # =================================== 19 | # Monitoring interval for scheduler (in seconds) 20 | # 3600 = 1 hour, 86400 = 24 hours 21 | MONITOR_INTERVAL=3600 22 | 23 | # =================================== 24 | # Database Configuration (Production) 25 | # =================================== 26 | # Used when running with docker-compose --profile production 27 | DB_PASSWORD=secure_password_change_me 28 | POSTGRES_DB=website_monitor 29 | POSTGRES_USER=monitor_user 30 | 31 | # =================================== 32 | # Redis Configuration (Production) 33 | # =================================== 34 | # Used when running with docker-compose --profile production 35 | REDIS_URL=redis://redis:6379/0 36 | 37 | # =================================== 38 | # SSL Configuration (Production) 39 | # =================================== 40 | # Paths for SSL certificates when using nginx 41 | SSL_CERT_PATH=/etc/nginx/ssl/cert.pem 42 | SSL_KEY_PATH=/etc/nginx/ssl/key.pem 43 | 44 | # =================================== 45 | # Logging Configuration 46 | # =================================== 47 | # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL 48 | LOG_LEVEL=INFO 49 | -------------------------------------------------------------------------------- /checks/check_content_type_headers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | 4 | def check_content_type_headers(website): 5 | """ 6 | Checks if the 'Content-Type' header of the website is set to 'text/html' 7 | and has a character encoding specified. 8 | 9 | Args: 10 | website (str): The website URL to check. 
11 | 12 | Returns: 13 | str: 14 | - "🟢" if the header is properly set 15 | - "🔴" if the header is not properly set 16 | - "⚪" if an error occurs 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'ContentTypeChecker/1.0' 24 | } 25 | 26 | try: 27 | # Method 1: Check Content-Type header directly 28 | response = requests.get(website, headers=headers, timeout=10) 29 | response.raise_for_status() 30 | content_type = response.headers.get('Content-Type', '') 31 | 32 | # Check for both 'text/html' and a character encoding 33 | if 'text/html' in content_type.lower() and 'charset=' in content_type.lower(): 34 | print(f"Content-Type is correctly set for {website}.") 35 | return "🟢" 36 | else: 37 | print(f"Content-Type is not properly set for {website}.") 38 | return "🔴" 39 | 40 | except (Timeout, HTTPError, RequestException) as e: 41 | print(f"Request error occurred while checking Content-Type headers for {website}: {e}") 42 | return "⚪" 43 | except Exception as e: 44 | print(f"An unexpected error occurred while checking Content-Type headers for {website}: {e}") 45 | return "⚪" 46 | -------------------------------------------------------------------------------- /project_description.md: -------------------------------------------------------------------------------- 1 | ## Project Description 2 | 3 | Website Monitor is a comprehensive website monitoring framework designed to continuously monitor various aspects of websites including security, performance, SEO compliance, and accessibility. 4 | 5 | ### Key Features 6 | 7 | - **Automated Monitoring**: Runs scheduled checks via GitHub Actions (daily by default) 8 | - **Web Interface**: Interactive HTML interface for real-time testing and analysis 9 | - **REST API**: Full-featured API for integration with other tools and services 10 | - **Comprehensive Checks**: 53+ different checks across 7 categories 11 | - **Multiple Deployment Options**: Local, Docker, or GitHub Actions 12 | - **Detailed Reporting**: Automatic markdown report generation with results 13 | 14 | ### Use Cases 15 | 16 | - **Website Health Monitoring**: Track the status and health of multiple websites 17 | - **Security Auditing**: Identify security issues like missing SSL, weak headers, or vulnerabilities 18 | - **Performance Tracking**: Monitor load times, PageSpeed scores, and optimization opportunities 19 | - **SEO Compliance**: Ensure proper sitemaps, robots.txt, meta tags, and structured data 20 | - **Accessibility Testing**: Verify WCAG compliance and mobile-friendliness 21 | - **DevOps Integration**: Integrate with CI/CD pipelines for continuous monitoring 22 | 23 | ### Architecture 24 | 25 | The project is built with Python and uses: 26 | - **FastAPI**: For the web interface and REST API 27 | - **Selenium**: For browser-based checks 28 | - **Multiple specialized libraries**: For DNS, SSL, and other specific checks 29 | - **Docker**: For containerized deployment 30 | - **GitHub Actions**: For automated scheduled monitoring 31 | 32 | ### Project Status 33 | 34 | ![Static Badge](https://img.shields.io/badge/project_status-active-green?style=for-the-badge&logo=github) 35 | 36 | The project is actively maintained and welcomes contributions. See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to contribute. 
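For readers who want to drive the REST API directly rather than through the web interface, here is a minimal sketch using the `requests` library (already listed in requirements.txt). The `/health` endpoint, the `POST /monitor` endpoint, and the default port 8000 are taken from the Dockerfile healthcheck, docker-compose configuration, and CHANGELOG in this repository; the request payload shape (`{"url": ...}`) and the response format are assumptions for illustration, not a documented contract, so consult the interactive docs at `/api/docs` for the authoritative schema.

```python
# Minimal sketch: calling the Website Monitor REST API from Python.
# Assumes the API container is running locally on the default port 8000.
# The payload/response shapes below are illustrative assumptions.
import requests

API_BASE = "http://localhost:8000"  # default API_HOST/API_PORT from .env.example

# Liveness probe (same endpoint the Docker HEALTHCHECK uses)
health = requests.get(f"{API_BASE}/health", timeout=10)
print("health:", health.status_code)

# Ask the API to run its checks against one site
response = requests.post(
    f"{API_BASE}/monitor",
    json={"url": "audiolibri.org"},  # payload shape assumed for illustration
    timeout=300,  # a full run of 50+ checks can take a while
)
response.raise_for_status()
print(response.json())
```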
37 | -------------------------------------------------------------------------------- /checks/check_amp_compatibility.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | from bs4 import BeautifulSoup 4 | from bs4 import FeatureNotFound 5 | 6 | def check_amp_compatibility(website): 7 | """ 8 | Check if the website has AMP compatibility. 9 | 10 | Args: 11 | website (str): URL of the website to be checked. 12 | 13 | Returns: 14 | str: "🟢" if AMP compatible, "🔴" if not AMP compatible, "⚪" if error occurs 15 | """ 16 | # Ensure the website starts with 'http://' or 'https://' 17 | if not website.startswith(('http://', 'https://')): 18 | website = f"https://{website}" 19 | 20 | headers = { 21 | 'User-Agent': 'AMPChecker/1.0' 22 | } 23 | 24 | try: 25 | response = requests.get(website, headers=headers, timeout=10) 26 | response.raise_for_status() 27 | html_content = response.text 28 | 29 | # Prefer lxml parser for better performance, fallback to html.parser 30 | try: 31 | soup = BeautifulSoup(html_content, 'lxml') 32 | except FeatureNotFound: 33 | soup = BeautifulSoup(html_content, 'html.parser') 34 | 35 | # Check for AMP markers ('⚡' or 'amp' attribute) on the <html> tag 36 | amp_html = soup.find('html', attrs={'⚡': True}) or soup.find('html', attrs={'amp': True}) 37 | 38 | # Check for required AMP script 39 | amp_script = soup.find('script', src="https://cdn.ampproject.org/v0.js") 40 | 41 | # Check for AMP-specific components like link to AMP version 42 | amp_link = soup.find('link', rel="amphtml") 43 | 44 | # Validate AMP criteria 45 | if html_content.lower().startswith('<!doctype html') and (amp_html or amp_link) and amp_script: 46 | return "🟢" 47 | 48 | return "🔴" 49 | 50 | except (Timeout, HTTPError, RequestException) as e: 51 | print(f"Request error occurred while checking AMP compatibility for {website}: {e}") 52 | return "⚪" 53 | except Exception as e: 54 | print(f"An error occurred while checking AMP compatibility for {website}: {e}") 55 | return "⚪" 56 | -------------------------------------------------------------------------------- /debug_scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Debug version of the scheduler to identify issues 4 | """ 5 | 6 | import time 7 | import subprocess 8 | import os 9 | import sys 10 | from datetime import datetime 11 | 12 | print("=== Debug Scheduler Starting ===") 13 | print(f"Python version: {sys.version}") 14 | print(f"Working directory: {os.getcwd()}") 15 | print(f"Files in directory: {os.listdir('.')}") 16 | print(f"Logs directory exists: {os.path.exists('/app/logs')}") 17 | 18 | # Test creating logs directory 19 | try: 20 | os.makedirs('/app/logs', exist_ok=True) 21 | print("✓ Logs directory created/exists") 22 | except Exception as e: 23 | print(f"✗ Error creating logs directory: {e}") 24 | 25 | # Test if main.py exists and is readable 26 | if os.path.exists('main.py'): 27 | print("✓ main.py exists") 28 | try: 29 | with open('main.py', 'r') as f: 30 | content = f.read() 31 | print(f"✓ main.py is readable ({len(content)} characters)") 32 | except Exception as e: 33 | print(f"✗ Error reading main.py: {e}") 34 | else: 35 | print("✗ main.py does not exist") 36 | 37 | print("=== Testing subprocess call ===") 38 | try: 39 | result = subprocess.run( 40 | ['python', '--version'], 41 | capture_output=True, 42 | text=True, 43 | timeout=10 44 | ) 45 | print(f"✓ Subprocess test successful: 
{result.stdout.strip()}") 46 | except Exception as e: 47 | print(f"✗ Subprocess test failed: {e}") 48 | 49 | print("=== Testing main.py execution ===") 50 | try: 51 | # Try to run main.py with a short timeout 52 | result = subprocess.run( 53 | ['python', 'main.py', '--help'], 54 | capture_output=True, 55 | text=True, 56 | timeout=5 57 | ) 58 | print(f"✓ main.py --help exit code: {result.returncode}") 59 | if result.stdout: 60 | print(f"stdout: {result.stdout[:200]}...") 61 | if result.stderr: 62 | print(f"stderr: {result.stderr[:200]}...") 63 | except subprocess.TimeoutExpired: 64 | print("⚠ main.py --help timed out") 65 | except Exception as e: 66 | print(f"✗ Error running main.py --help: {e}") 67 | 68 | print("=== Debug complete ===") 69 | -------------------------------------------------------------------------------- /checks/check_cdn.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from typing import Optional 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | def check_cdn(website: str) -> str: 8 | """ 9 | Checks if a website is using a CDN by inspecting headers and other indicators. 10 | 11 | Args: 12 | website (str): The URL of the website to check. 13 | 14 | Returns: 15 | str: 16 | - "🟢" if a CDN is detected. 17 | - "🔴" if no CDN is detected. 18 | - "⚪" if an error occurs. 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'CDNChecker/1.0' 26 | } 27 | 28 | try: 29 | response = requests.get(website, headers=headers, stream=True, timeout=10) 30 | response.raise_for_status() 31 | 32 | # Check server header for CDN indicators 33 | server_header = response.headers.get('server', '').lower() 34 | cdn_indicators = ['cloudflare', 'akamai', 'fastly', 'amazon', 'cdn', 'stackpath', 'keycdn', 'maxcdn'] 35 | 36 | if any(indicator in server_header for indicator in cdn_indicators): 37 | logger.info(f"CDN detected in server header for {website}.") 38 | return "🟢" 39 | 40 | # Check other headers that might indicate CDN usage 41 | cdn_headers = [ 42 | 'cf-ray', # Cloudflare 43 | 'x-served-by', # Fastly 44 | 'x-cache', # Various CDNs 45 | 'x-edge-location', # AWS CloudFront 46 | 'x-akamai-transformed' # Akamai 47 | ] 48 | 49 | if any(header in response.headers for header in cdn_headers): 50 | logger.info(f"CDN detected in headers for {website}.") 51 | return "🟢" 52 | 53 | logger.info(f"No CDN detected for {website}.") 54 | return "🔴" 55 | 56 | except requests.exceptions.RequestException as e: 57 | logger.error(f"Request error during CDN check for {website}: {e}") 58 | return "⚪" 59 | except Exception as e: 60 | logger.error(f"An unexpected error occurred during the CDN check for {website}: {e}") 61 | return "⚪" 62 | -------------------------------------------------------------------------------- /checks/check_cookie_duration.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from datetime import datetime 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | def check_cookie_duration(website): 6 | """ 7 | Ensure that session cookies set by the website don't have an overly long duration. 8 | 9 | Args: 10 | website (str): URL of the website to be checked. 
11 | 12 | Returns: 13 | str: 14 | - "🔴" if any cookie has an overly long duration 15 | - "🟢" if all cookies have an acceptable duration 16 | - "⚪" if an error occurs 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'CookieDurationChecker/1.0' 24 | } 25 | 26 | try: 27 | # Perform the request to get cookies 28 | response = requests.get(website, headers=headers, timeout=10) 29 | response.raise_for_status() 30 | 31 | long_duration_cookies = 0 32 | max_duration_seconds = 604800 # 7 days in seconds 33 | 34 | for cookie in response.cookies: 35 | # Check if cookie has expiration (session cookies don't have expiration) 36 | if cookie.expires: 37 | try: 38 | # Convert expires timestamp to datetime and calculate duration 39 | expires_datetime = datetime.fromtimestamp(cookie.expires) 40 | delta = expires_datetime - datetime.now() 41 | if delta.total_seconds() > max_duration_seconds: 42 | long_duration_cookies += 1 43 | print(f"Cookie '{cookie.name}' has long duration: {delta.days} days") 44 | except (ValueError, OSError) as e: 45 | print(f"Error parsing cookie expiration for '{cookie.name}': {e}") 46 | continue 47 | 48 | # Return based on the count of long-duration cookies 49 | if long_duration_cookies > 0: 50 | print(f"Found {long_duration_cookies} cookies with long duration on {website}.") 51 | return "🔴" 52 | print(f"All cookies have an acceptable duration on {website}.") 53 | return "🟢" 54 | 55 | except (Timeout, HTTPError, RequestException) as e: 56 | print(f"Request error occurred while checking cookie duration for {website}: {e}") 57 | return "⚪" 58 | except Exception as e: 59 | print(f"An unexpected error occurred while checking cookie duration for {website}: {e}") 60 | return "⚪" 61 | -------------------------------------------------------------------------------- /.github/workflows/create-report.yml: -------------------------------------------------------------------------------- 1 | name: Create report 2 | 3 | on: 4 | schedule: 5 | - cron: '0 4 * * *' # Runs daily at 4 AM 6 | workflow_dispatch: 7 | 8 | jobs: 9 | create-report: 10 | runs-on: ubuntu-latest 11 | outputs: 12 | status: ${{ steps.set-output.outputs.status }} 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v4 16 | 17 | - name: Setup Python 3.x 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: '3.x' 21 | 22 | - name: Install dependencies 23 | run: | 24 | pip install requests python-whois dnspython beautifulsoup4 selenium pyyaml lxml 25 | 26 | - name: Reinstall dependencies 27 | run: | 28 | pip install --upgrade pip 29 | pip uninstall whois -y 30 | pip install python-whois 31 | 32 | - name: Set Chrome Version 33 | id: chrome_version 34 | run: | 35 | CHROME_VERSION=$(curl -s "https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json" | jq -r '.channels.Stable.version') 36 | echo "CHROME_VERSION=$CHROME_VERSION" >> $GITHUB_OUTPUT 37 | 38 | - name: Setup Chrome and Chromedriver 39 | uses: browser-actions/setup-chrome@v1 40 | with: 41 | chrome-version: ${{ steps.chrome_version.outputs.CHROME_VERSION }} 42 | install-chromedriver: true 43 | 44 | - name: Verify Chrome and Chromedriver Installation 45 | run: | 46 | google-chrome --version 47 | chromedriver --version 48 | CHROME_MAJOR=$(google-chrome --version | cut -d ' ' -f 3 | cut -d '.' 
-f 1) 49 | DRIVER_MAJOR=$(chromedriver --version | cut -d ' ' -f 2 | cut -d '.' -f 1) 50 | if [[ "$CHROME_MAJOR" != "$DRIVER_MAJOR" ]]; then 51 | echo "Mismatch between Chrome and Chromedriver versions!" 52 | fi 53 | 54 | - name: Run Website Tests 55 | id: test 56 | env: 57 | PAGESPEED_API_KEY: ${{ secrets.PAGESPEED_API_KEY }} 58 | run: python main.py 59 | continue-on-error: true 60 | 61 | - name: Commit and push report 62 | run: | 63 | git config --local user.email "action@github.com" 64 | git config --local user.name "GitHub Action" 65 | git add report.md 66 | git commit -m "Add generated report" -a || echo "No changes to commit" 67 | git push || echo "No changes to push" 68 | 69 | - name: Fail if error encountered 70 | if: steps.set-output.outputs.status == 'error' 71 | run: exit 1 72 | -------------------------------------------------------------------------------- /checks/check_browser_compatibility.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.common.exceptions import WebDriverException 3 | 4 | def check_browser_compatibility(website): 5 | """ 6 | Check if the website is compatible with different browsers. 7 | 8 | Args: 9 | website (str): The URL of the website to be checked. 10 | 11 | Returns: 12 | str: 13 | - "🟢" if the website is compatible with all tested browsers 14 | - "🔴" if the website is not compatible with any browser or if an error occurs 15 | """ 16 | # Ensure the website starts with 'http://' or 'https://' 17 | if not website.startswith(('http://', 'https://')): 18 | website = f"https://{website}" 19 | 20 | # List of drivers to test compatibility 21 | driver_configs = [ 22 | ("Chrome", webdriver.Chrome, webdriver.ChromeOptions), 23 | ("Firefox", webdriver.Firefox, webdriver.FirefoxOptions), 24 | ] 25 | 26 | compatible_browsers = 0 27 | total_browsers = len(driver_configs) 28 | 29 | for browser_name, driver_class, options_class in driver_configs: 30 | driver = None 31 | try: 32 | # Set up options for each driver 33 | browser_options = options_class() 34 | browser_options.add_argument('--headless') # Run in headless mode 35 | browser_options.add_argument('--no-sandbox') 36 | browser_options.add_argument('--disable-dev-shm-usage') 37 | 38 | driver = driver_class(options=browser_options) 39 | driver.set_page_load_timeout(10) # Set timeout for page load 40 | 41 | driver.get(website) 42 | 43 | # Basic check: Ensure that the page loads successfully and has content 44 | if driver.title and len(driver.page_source) > 100: 45 | print(f"Website {website} is compatible with {browser_name}.") 46 | compatible_browsers += 1 47 | else: 48 | print(f"Compatibility issue found with {browser_name} for {website}.") 49 | 50 | except WebDriverException as e: 51 | print(f"Error occurred while testing {browser_name} for {website}: {e}") 52 | except Exception as e: 53 | print(f"Unexpected error with {browser_name} for {website}: {e}") 54 | finally: 55 | if driver: 56 | try: 57 | driver.quit() 58 | except: 59 | pass # Ignore cleanup errors 60 | 61 | # Determine result based on browser compatibility 62 | if compatible_browsers == total_browsers: 63 | return "🟢" 64 | elif compatible_browsers > 0: 65 | return "🟠" # Partially compatible 66 | else: 67 | return "🔴" 68 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to the Website Monitor 
project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [Unreleased] 9 | 10 | ### Added 11 | - Comprehensive CONTRIBUTING.md with development guidelines 12 | - MIT LICENSE file 13 | - Detailed troubleshooting section in README 14 | - Project structure documentation 15 | - FAQ section answering common questions 16 | - Quick Links navigation in README 17 | - Badges for license, Python version, and Docker 18 | - Comprehensive API usage examples with multiple scenarios 19 | - Detailed check categories reference table 20 | - Enhanced status indicators documentation 21 | 22 | ### Changed 23 | - Improved GitHub Actions setup instructions with step-by-step guide 24 | - Enhanced .env.example with better organization and comments 25 | - Updated project_description.md with architecture details 26 | - Expanded usage.md with better structure 27 | - Improved DOCKER.md with corrected API paths 28 | 29 | ### Fixed 30 | - Corrected API endpoint from POST /check to POST /monitor 31 | - Fixed API documentation paths from /docs to /api/docs 32 | - Fixed output_file default in documentation (report.md, not README.md) 33 | - Updated config.yaml to include all documented options 34 | - Corrected documentation paths throughout 35 | 36 | ## [1.3.0] - 2024 37 | 38 | ### Added 39 | - FastAPI web interface with interactive UI 40 | - REST API with 53+ security, performance, and compliance checks 41 | - Docker and Docker Compose support 42 | - Comprehensive check categories: 43 | - Security & Protection (10 checks) 44 | - Performance & Speed (8 checks) 45 | - SEO & Content (9 checks) 46 | - Domain & DNS (7 checks) 47 | - Privacy & Tracking (10 checks) 48 | - Accessibility & Mobile (5 checks) 49 | - Technical & Infrastructure (4 checks) 50 | 51 | ### Changed 52 | - Migrated from basic script to full FastAPI application 53 | - Improved error handling and logging 54 | - Enhanced check organization by category 55 | 56 | ## [1.0.0] - Initial Release 57 | 58 | ### Added 59 | - Basic website monitoring functionality 60 | - GitHub Actions integration for automated daily checks 61 | - Markdown report generation 62 | - Core security and performance checks 63 | 64 | --- 65 | 66 | ## Version History 67 | 68 | - **1.3.0**: Current version with full API and web interface 69 | - **1.0.0**: Initial release with basic monitoring 70 | 71 | [Unreleased]: https://github.com/fabriziosalmi/websites-monitor/compare/v1.3.0...HEAD 72 | [1.3.0]: https://github.com/fabriziosalmi/websites-monitor/releases/tag/v1.3.0 73 | [1.0.0]: https://github.com/fabriziosalmi/websites-monitor/releases/tag/v1.0.0 74 | -------------------------------------------------------------------------------- /checks/check_floc.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urlparse 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def check_floc(website: str) -> str: 9 | """ 10 | Check if the website has opted out of FLoC (Federated Learning of Cohorts). 11 | 12 | Args: 13 | website (str): URL of the website to be checked. 
14 | 15 | Returns: 16 | str: 17 | - "🟢" if the site has opted out of FLoC 18 | - "🔴" if it has not opted out 19 | - "⚪" if an error occurred 20 | """ 21 | # Input validation and URL normalization 22 | if not website: 23 | logger.error("Website URL is required") 24 | return "⚪" 25 | 26 | if not website.startswith(('http://', 'https://')): 27 | website = f"https://{website}" 28 | 29 | try: 30 | parsed_url = urlparse(website) 31 | if not parsed_url.netloc: 32 | logger.error(f"Invalid URL format: {website}") 33 | return "⚪" 34 | except Exception as e: 35 | logger.error(f"URL parsing error for {website}: {e}") 36 | return "⚪" 37 | 38 | headers = { 39 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' 40 | } 41 | 42 | try: 43 | # Perform the HTTP request with timeout 44 | response = requests.get(website, headers=headers, timeout=15) 45 | response.raise_for_status() 46 | 47 | # Enhanced detection patterns 48 | permissions_policy = response.headers.get('Permissions-Policy', '').lower() 49 | 50 | # Check for FLoC opt-out in Permissions-Policy header 51 | if 'interest-cohort=()' in permissions_policy: 52 | logger.info(f"FLoC opt-out detected via Permissions-Policy for {website}") 53 | return "🟢" 54 | 55 | # Fallback: Check for older Feature-Policy header 56 | feature_policy = response.headers.get('Feature-Policy', '').lower() 57 | if 'interest-cohort' in feature_policy and "'none'" in feature_policy: 58 | logger.info(f"FLoC opt-out detected via Feature-Policy for {website}") 59 | return "🟢" 60 | 61 | logger.warning(f"No FLoC opt-out detected for {website}") 62 | return "🔴" 63 | 64 | except (Timeout, HTTPError) as e: 65 | logger.error(f"HTTP error while checking FLoC for {website}: {e}") 66 | return "⚪" 67 | except RequestException as e: 68 | logger.error(f"Request error while checking FLoC for {website}: {e}") 69 | return "⚪" 70 | except Exception as e: 71 | logger.error(f"Unexpected error while checking FLoC for {website}: {e}") 72 | return "⚪" 73 | -------------------------------------------------------------------------------- /checks/check_cookie_flags.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | 4 | def check_cookie_flags(website): 5 | """ 6 | Check if all cookies set by the website have the Secure and HttpOnly flags. 7 | 8 | Args: 9 | website (str): URL of the website to be checked. 
10 | 11 | Returns: 12 | str: 13 | - "🟢" if all cookies have Secure and HttpOnly flags 14 | - "🟠" if some cookies have Secure and HttpOnly flags, but not all 15 | - "🔴" if no cookies have both Secure and HttpOnly flags 16 | - "⚪" if an error occurs 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'CookieFlagChecker/1.0' 24 | } 25 | 26 | try: 27 | response = requests.get(website, headers=headers, timeout=10) 28 | response.raise_for_status() 29 | 30 | # Check if any cookies are set using the response.cookies object 31 | if not response.cookies: 32 | print(f"No cookies found for {website}.") 33 | return "🟢" # No cookies means no security issue 34 | 35 | # Flags for checking Secure and HttpOnly 36 | all_secure_http_only = True 37 | any_secure_http_only = False 38 | total_cookies = len(response.cookies) 39 | 40 | # Check each cookie's security attributes 41 | for cookie in response.cookies: 42 | has_secure = cookie.secure 43 | has_httponly = hasattr(cookie, '_rest') and cookie._rest.get('HttpOnly') is not None 44 | 45 | if has_secure and has_httponly: 46 | any_secure_http_only = True 47 | else: 48 | all_secure_http_only = False 49 | print(f"Cookie '{cookie.name}' missing security flags: Secure={has_secure}, HttpOnly={has_httponly}") 50 | 51 | # Determine the result based on the flags 52 | if all_secure_http_only and total_cookies > 0: 53 | print(f"All cookies have Secure and HttpOnly flags for {website}.") 54 | return "🟢" 55 | elif any_secure_http_only: 56 | print(f"Some cookies have Secure and HttpOnly flags, but not all for {website}.") 57 | return "🟠" 58 | else: 59 | print(f"No cookies have both Secure and HttpOnly flags for {website}.") 60 | return "🔴" 61 | 62 | except (Timeout, HTTPError, RequestException) as e: 63 | print(f"Request error occurred while checking cookie flags for {website}: {e}") 64 | return "⚪" 65 | except Exception as e: 66 | print(f"An unexpected error occurred while checking cookie flags for {website}: {e}") 67 | return "⚪" 68 | -------------------------------------------------------------------------------- /checks/check_cms_used.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | from bs4 import BeautifulSoup 4 | 5 | def check_cms_used(website): 6 | """ 7 | Checks which CMS (if any) is used by a website based on certain telltale patterns in its content. 8 | 9 | Args: 10 | website (str): The website URL to check. 
11 | 12 | Returns: 13 | str: 14 | - "🟢 (CMS Name)" if a CMS is detected 15 | - "🔴" if no CMS is detected 16 | - "⚪" if an error occurs 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'CMSChecker/1.0' 24 | } 25 | 26 | cms_patterns = { 27 | "WordPress": ["wp-", "wp-content", "wp-includes", "wp-json", "xmlrpc.php"], 28 | "Drupal": ["Drupal", "sites/default/files", "drupal.js"], 29 | "Joomla": ["Joomla", "/templates/joomla/", "index.php?option=com_"], 30 | "Wix": ["wix.com", "wix-public", "wixstatic"], 31 | "Squarespace": ["squarespace.com", "static.squarespace.com"], 32 | "Shopify": ["shopify", "cdn.shopify.com"], 33 | "Magento": ["Magento", "mage/", "static/version", "skin/frontend"] 34 | } 35 | 36 | try: 37 | # Method 1: Direct HTML content analysis 38 | response = requests.get(website, headers=headers, timeout=10) 39 | response.raise_for_status() 40 | content = response.text 41 | 42 | # Search for CMS-specific patterns in the website content 43 | for cms, patterns in cms_patterns.items(): 44 | if any(pattern in content for pattern in patterns): 45 | print(f"Detected CMS: {cms} for {website}.") 46 | return f"🟢 ({cms})" 47 | 48 | # Method 2: Additional heuristic checks with BeautifulSoup 49 | soup = BeautifulSoup(content, 'html.parser') 50 | 51 | # Check for meta tags or generator information that might indicate a CMS 52 | meta_generator = soup.find('meta', attrs={'name': 'generator'}) 53 | if meta_generator and meta_generator.get('content'): 54 | generator_content = meta_generator['content'].lower() 55 | for cms in cms_patterns: 56 | if cms.lower() in generator_content: 57 | print(f"Detected CMS via meta tag: {cms} for {website}.") 58 | return f"🟢 ({cms})" 59 | 60 | print(f"No CMS detected for {website}.") 61 | return "🔴" 62 | 63 | except (Timeout, HTTPError, RequestException) as e: 64 | print(f"Request error occurred while checking CMS for {website}: {e}") 65 | return "⚪" 66 | except Exception as e: 67 | print(f"An unexpected error occurred while checking CMS for {website}: {e}") 68 | return "⚪" 69 | -------------------------------------------------------------------------------- /checks/check_clientside_rendering.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | def check_clientside_rendering(website, threshold=10): 6 | """ 7 | Checks if a website relies heavily on client-side rendering by counting the number of script tags and other indicators. 8 | 9 | Args: 10 | website (str): The website URL to check. 11 | threshold (int): The threshold above which the number of scripts indicates heavy client-side rendering. 
12 | 13 | Returns: 14 | str: 15 | - "🟢" if the number of scripts is below the threshold 16 | - "🟠" if it's close to the threshold 17 | - "🔴" if above the threshold 18 | - "⚪" if an error occurs 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'ClientSideRenderingChecker/1.0' 26 | } 27 | 28 | try: 29 | # Method 1: Check number of script tags 30 | response = requests.get(website, headers=headers, timeout=10) 31 | response.raise_for_status() 32 | soup = BeautifulSoup(response.content, 'html.parser') 33 | scripts = soup.find_all('script') 34 | 35 | num_scripts = len(scripts) 36 | 37 | # Method 2: Additional check for specific JavaScript libraries and frameworks 38 | # Check for common client-side frameworks that are heavy on client-side rendering 39 | frameworks = ['react', 'angular', 'vue', 'next', 'nuxt', 'svelte', 'ember', 'backbone'] 40 | framework_detected = False 41 | 42 | for script in scripts: 43 | src = script.get('src', '').lower() 44 | content = (script.string or '').lower() 45 | if any(framework in src or framework in content for framework in frameworks): 46 | framework_detected = True 47 | break 48 | 49 | # Determine result based on number of script tags and framework detection 50 | if num_scripts > threshold or framework_detected: 51 | print(f"Heavy client-side rendering detected for {website}.") 52 | return "🔴" 53 | elif threshold - 3 <= num_scripts <= threshold: 54 | print(f"Moderate client-side rendering detected for {website}.") 55 | return "🟠" 56 | else: 57 | print(f"Minimal client-side rendering detected for {website}.") 58 | return "🟢" 59 | 60 | except (Timeout, HTTPError, RequestException) as e: 61 | print(f"Request error occurred while checking client-side rendering for {website}: {e}") 62 | return "⚪" 63 | except Exception as e: 64 | print(f"An unexpected error occurred while checking client-side rendering for {website}: {e}") 65 | return "⚪" 66 | -------------------------------------------------------------------------------- /checks/check_brotli_compression.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | 4 | def check_brotli_compression(website): 5 | """ 6 | Check if the website supports Brotli compression. 7 | 8 | Args: 9 | website (str): The URL of the website to be checked. 
10 | 11 | Returns: 12 | str: 13 | - "🟢" if Brotli compression is enabled 14 | - "🔴" if Brotli compression is not enabled 15 | - "⚪" if an error occurs 16 | """ 17 | # Ensure the website starts with 'http://' or 'https://' 18 | if not website.startswith(('http://', 'https://')): 19 | website = f"https://{website}" 20 | 21 | headers = { 22 | "Accept-Encoding": "gzip, deflate, br", 23 | "User-Agent": "BrotliCompressionChecker/1.0" 24 | } 25 | 26 | try: 27 | # Method 1: Direct HTTP Request with Brotli Accept-Encoding Header 28 | response = requests.get(website, headers=headers, timeout=10) 29 | response.raise_for_status() 30 | 31 | # Check if the response indicates Brotli compression 32 | if 'br' in response.headers.get('Content-Encoding', ''): 33 | print(f"Brotli compression is enabled for {website}.") 34 | return "🟢" 35 | else: 36 | print(f"Brotli compression is not enabled for {website}.") 37 | return "🔴" 38 | 39 | except (Timeout, HTTPError, RequestException) as e: 40 | print(f"Request error occurred while checking Brotli compression for {website}: {e}") 41 | 42 | # Method 2: Alternative Check via Content-Length Comparison (Fallback) 43 | try: 44 | headers_gzip = { 45 | "Accept-Encoding": "gzip, deflate", 46 | "User-Agent": "BrotliCompressionChecker/1.0" 47 | } 48 | headers_brotli = { 49 | "Accept-Encoding": "br", 50 | "User-Agent": "BrotliCompressionChecker/1.0" 51 | } 52 | 53 | # Request with Gzip/Deflate encoding 54 | response_gzip = requests.get(website, headers=headers_gzip, timeout=10) 55 | response_gzip.raise_for_status() 56 | 57 | # Request with Brotli encoding 58 | response_brotli = requests.get(website, headers=headers_brotli, timeout=10) 59 | response_brotli.raise_for_status() 60 | 61 | # Check if Brotli encoding is actually used in response 62 | if 'br' in response_brotli.headers.get('Content-Encoding', ''): 63 | print(f"Brotli compression is enabled for {website} (fallback method).") 64 | return "🟢" 65 | else: 66 | print(f"Brotli compression is not enabled for {website} (fallback method).") 67 | return "🔴" 68 | 69 | except Exception as e: 70 | print(f"Error during fallback Brotli check for {website}: {e}") 71 | return "⚪" 72 | 73 | return "⚪" 74 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Website Monitor - Docker Compose Configuration 2 | 3 | services: 4 | # Main Website Monitor API Service 5 | website-monitor-api: 6 | build: . 7 | container_name: website-monitor-api 8 | ports: 9 | - "8000:8000" 10 | volumes: 11 | - ./config.yaml:/app/config.yaml:ro 12 | - ./reports:/app/reports 13 | - ./logs:/app/logs 14 | environment: 15 | - PYTHONUNBUFFERED=1 16 | - API_HOST=0.0.0.0 17 | - API_PORT=8000 18 | - PAGESPEED_API_KEY=${PAGESPEED_API_KEY:-} 19 | command: ["python", "api.py"] 20 | restart: unless-stopped 21 | healthcheck: 22 | test: ["CMD", "curl", "-f", "http://localhost:8000/health"] 23 | interval: 30s 24 | timeout: 10s 25 | retries: 3 26 | start_period: 40s 27 | networks: 28 | - website-monitor-network 29 | 30 | # Scheduled Monitor Service (runs checks periodically) 31 | website-monitor-scheduler: 32 | build: . 
33 | container_name: website-monitor-scheduler 34 | volumes: 35 | - ./config.yaml:/app/config.yaml:ro 36 | - ./reports:/app/reports 37 | - ./logs:/app/logs 38 | - ./README.md:/app/README.md 39 | - ./report_template.md:/app/report_template.md:ro 40 | environment: 41 | - PYTHONUNBUFFERED=1 42 | - PAGESPEED_API_KEY=${PAGESPEED_API_KEY:-} 43 | - MONITOR_INTERVAL=${MONITOR_INTERVAL:-3600} # Default: 1 hour 44 | command: ["python", "scheduler.py"] 45 | restart: unless-stopped 46 | depends_on: 47 | - website-monitor-api 48 | networks: 49 | - website-monitor-network 50 | 51 | # Optional: Nginx reverse proxy for production 52 | nginx: 53 | image: nginx:alpine 54 | container_name: website-monitor-nginx 55 | ports: 56 | - "80:80" 57 | - "443:443" 58 | volumes: 59 | - ./docker/nginx.conf:/etc/nginx/nginx.conf:ro 60 | - ./docker/ssl:/etc/nginx/ssl:ro 61 | depends_on: 62 | - website-monitor-api 63 | restart: unless-stopped 64 | profiles: 65 | - production 66 | 67 | # Optional: Redis for caching and task queues 68 | redis: 69 | image: redis:7-alpine 70 | container_name: website-monitor-redis 71 | ports: 72 | - "6379:6379" 73 | volumes: 74 | - redis_data:/data 75 | restart: unless-stopped 76 | profiles: 77 | - production 78 | 79 | # Optional: PostgreSQL for storing monitoring results 80 | postgres: 81 | image: postgres:15-alpine 82 | container_name: website-monitor-db 83 | environment: 84 | POSTGRES_DB: website_monitor 85 | POSTGRES_USER: monitor_user 86 | POSTGRES_PASSWORD: ${DB_PASSWORD:-secure_password_change_me} 87 | ports: 88 | - "5432:5432" 89 | volumes: 90 | - postgres_data:/var/lib/postgresql/data 91 | - ./docker/init.sql:/docker-entrypoint-initdb.d/init.sql:ro 92 | restart: unless-stopped 93 | profiles: 94 | - production 95 | 96 | volumes: 97 | redis_data: 98 | postgres_data: 99 | 100 | networks: 101 | website-monitor-network: 102 | driver: bridge 103 | -------------------------------------------------------------------------------- /checks/check_pagespeed_performances.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, HTTPError, Timeout 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_pagespeed_performances(website: str, api_key: str = None) -> str: 10 | """ 11 | Checks the PageSpeed Insights performance score for a website with enhanced error handling. 12 | 13 | Args: 14 | website (str): The URL of the website to be checked. 15 | api_key (str, optional): The Google PageSpeed Insights API key. Defaults to None. 16 | 17 | Returns: 18 | str: 19 | - An integer representing the PageSpeed score if successful. 20 | - "⚪" if any errors occur during the check or if no API key was provided. 
21 | """ 22 | # Input validation and URL normalization 23 | if not website or not isinstance(website, str): 24 | logger.error(f"Invalid website input: {website}") 25 | return "⚪" 26 | 27 | if not api_key: 28 | logger.error("No API key provided for PageSpeed check.") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | if not website.startswith(('http://', 'https://')): 33 | website = f"https://{website}" 34 | 35 | try: 36 | # Enhanced API call with better parameters 37 | pagespeed_url = f"https://www.googleapis.com/pagespeedonline/v5/runPagespeed" 38 | params = { 39 | 'url': website, 40 | 'key': api_key, 41 | 'category': 'performance', 42 | 'strategy': 'mobile' # Default to mobile strategy 43 | } 44 | 45 | response = requests.get(pagespeed_url, params=params, timeout=30) 46 | response.raise_for_status() 47 | data = response.json() 48 | 49 | # Enhanced data extraction 50 | lighthouse_result = data.get("lighthouseResult", {}) 51 | performance_category = lighthouse_result.get("categories", {}).get("performance", {}) 52 | score = performance_category.get("score") 53 | 54 | if score is not None: 55 | score_percentage = int(score * 100) 56 | logger.info(f"PageSpeed score for {website} is {score_percentage}.") 57 | return str(score_percentage) 58 | 59 | logger.warning(f"PageSpeed score not found for {website}.") 60 | return "⚪" 61 | 62 | except Timeout: 63 | logger.error(f"Timeout occurred while fetching PageSpeed data for {website}") 64 | return "⚪" 65 | except HTTPError as http_err: 66 | logger.error(f"HTTP error occurred while fetching PageSpeed data for {website}: {http_err}") 67 | return "⚪" 68 | except RequestException as req_err: 69 | logger.error(f"Request error occurred while fetching PageSpeed data for {website}: {req_err}") 70 | return "⚪" 71 | except ValueError as json_err: 72 | logger.error(f"JSON parsing error occurred while fetching PageSpeed data for {website}: {json_err}") 73 | return "⚪" 74 | except Exception as e: 75 | logger.error(f"An unexpected error occurred while checking PageSpeed data for {website}: {e}") 76 | return "⚪" 77 | -------------------------------------------------------------------------------- /checks/check_cookie_policy.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | def check_cookie_policy(website): 6 | """ 7 | Verify if the website has a cookie policy and it's accessible to users. 8 | 9 | Args: 10 | website (str): URL of the website to be checked. 
11 | 12 | Returns: 13 | str: 14 | - "🟢" if a cookie policy is found and accessible 15 | - "🔴" if no cookie policy is found or if it's inaccessible 16 | - "⚪" for any errors 17 | """ 18 | # Ensure the website starts with 'http://' or 'https://' 19 | if not website.startswith(('http://', 'https://')): 20 | website = f"https://{website}" 21 | 22 | headers = { 23 | 'User-Agent': 'CookiePolicyChecker/1.0' 24 | } 25 | 26 | try: 27 | # Method 1: Direct page analysis for cookie policy 28 | response = requests.get(website, headers=headers, timeout=10) 29 | response.raise_for_status() 30 | 31 | soup = BeautifulSoup(response.text, 'html.parser') 32 | 33 | # Common keywords associated with cookie policies 34 | keywords = ["cookie policy", "cookie statement", "use of cookies", "privacy policy"] 35 | 36 | # Check for the presence of these keywords in anchor tags (links) 37 | anchors = soup.find_all('a', string=lambda text: text and any(keyword in text.lower() for keyword in keywords)) 38 | if anchors: 39 | print(f"Cookie policy found in links for {website}.") 40 | return "🟢" 41 | 42 | # If not found in links, check if any of the keywords are present in the page's text 43 | page_text = soup.get_text().lower() 44 | if any(keyword in page_text for keyword in keywords): 45 | print(f"Cookie policy text found on the page for {website}.") 46 | return "🟢" 47 | 48 | print(f"No cookie policy found for {website}.") 49 | return "🔴" 50 | 51 | except (Timeout, HTTPError, RequestException) as e: 52 | print(f"Request error occurred while checking cookie policy for {website}: {e}") 53 | 54 | # Method 2: Check for common cookie policy URLs (Fallback) 55 | try: 56 | common_paths = ["/cookie-policy", "/cookies", "/privacy-policy", "/legal/cookies", "/legal/privacy-policy"] 57 | for path in common_paths: 58 | try: 59 | policy_response = requests.get(f"{website.rstrip('/')}{path}", headers=headers, timeout=5) 60 | if policy_response.status_code == 200: 61 | print(f"Cookie policy found at {website.rstrip('/')}{path}.") 62 | return "🟢" 63 | except (Timeout, HTTPError, RequestException): 64 | continue 65 | 66 | print(f"No cookie policy found for {website} (fallback method).") 67 | return "🔴" 68 | 69 | except Exception as e: 70 | print(f"Error during fallback cookie policy check for {website}: {e}") 71 | return "⚪" 72 | except Exception as e: 73 | print(f"An unexpected error occurred while checking cookie policy for {website}: {e}") 74 | return "⚪" 75 | -------------------------------------------------------------------------------- /checks/check_open_graph_protocol.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from requests.exceptions import RequestException, HTTPError, Timeout 5 | 6 | # Configure logging 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_open_graph_protocol(website: str) -> str: 11 | """ 12 | Check a given website for the presence of essential Open Graph Protocol meta tags with enhanced validation. 13 | 14 | Args: 15 | website (str): The URL of the website to be checked. 16 | 17 | Returns: 18 | str: 19 | - "🟢" if essential Open Graph Protocol meta tags are found. 20 | - "🔴" if essential Open Graph Protocol meta tags are missing. 21 | - "⚪" for any errors. 
22 | """ 23 | # Input validation and URL normalization 24 | if not website or not isinstance(website, str): 25 | logger.error(f"Invalid website input: {website}") 26 | return "⚪" 27 | 28 | website = website.strip() 29 | if not website.startswith(('http://', 'https://')): 30 | website = f"https://{website}" 31 | 32 | headers = { 33 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 34 | } 35 | 36 | try: 37 | # Make a request to the website 38 | response = requests.get(website, headers=headers, timeout=15) 39 | response.raise_for_status() 40 | 41 | # Parse the HTML content using BeautifulSoup 42 | soup = BeautifulSoup(response.content, 'html.parser') 43 | 44 | # List of essential Open Graph tags 45 | essential_tags = {'og:title', 'og:type', 'og:image', 'og:url'} 46 | recommended_tags = {'og:description', 'og:site_name', 'og:locale'} 47 | 48 | # Extract all Open Graph meta tags 49 | meta_tags = soup.find_all('meta', property=lambda x: x and x.startswith('og:')) 50 | 51 | # Extract the properties of found meta tags 52 | found_tags = {tag['property'] for tag in meta_tags if tag.has_attr('property') and tag.get('content')} 53 | 54 | logger.info(f"Open Graph analysis for {website}: {len(found_tags)} tags found") 55 | logger.debug(f"Found OG tags: {found_tags}") 56 | 57 | # Check if all essential tags are present 58 | missing_essential = essential_tags - found_tags 59 | found_recommended = recommended_tags.intersection(found_tags) 60 | 61 | if not missing_essential: 62 | logger.info(f"All essential Open Graph tags found for {website}.") 63 | if len(found_recommended) >= 2: 64 | return "🟢" # Has essential + recommended tags 65 | return "🟢" # Has essential tags 66 | else: 67 | logger.warning(f"Missing essential Open Graph tags for {website}: {missing_essential}") 68 | return "🔴" 69 | 70 | except (Timeout, HTTPError, RequestException) as e: 71 | logger.warning(f"Request error occurred while checking Open Graph Protocol tags on {website}: {e}") 72 | return "⚪" 73 | except Exception as e: 74 | logger.error(f"An unexpected error occurred while checking Open Graph Protocol tags on {website}: {e}") 75 | return "⚪" 76 | -------------------------------------------------------------------------------- /checks/check_alt_tags.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from bs4 import BeautifulSoup 5 | 6 | def check_alt_tags(website): 7 | """ 8 | Check if all the images on the website have alt tags. 9 | 10 | Args: 11 | website (str): The URL of the website to be checked. 
12 | 13 | Returns: 14 | str: 15 | - "🔴" if no image has an alt tag 16 | - "🟠" if some images have alt tags and one or more doesn't 17 | - "🟢" if all images have alt tags 18 | - "⚪" if an error occurs 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'AltTagChecker/1.0' 26 | } 27 | 28 | try: 29 | # Method 1: Direct HTML content analysis using BeautifulSoup 30 | response = requests.get(website, headers=headers, timeout=10) 31 | response.raise_for_status() # Raise an error for HTTP issues 32 | soup = BeautifulSoup(response.text, 'lxml') 33 | 34 | # Find all images and count those with and without alt tags 35 | images = soup.find_all('img') 36 | total_images = len(images) 37 | images_with_alt = sum(1 for img in images if img.get('alt') and img.get('alt').strip()) 38 | 39 | # Determine the result based on the alt tag analysis 40 | if total_images == 0: 41 | print(f"No images found on {website}.") 42 | return "🟢" # No images, hence all images (none) have alt tags by definition 43 | elif images_with_alt == 0: 44 | print(f"No images with alt tags found on {website}.") 45 | return "🔴" 46 | elif images_with_alt < total_images: 47 | print(f"{total_images - images_with_alt} images without alt tags found on {website}.") 48 | return "🟠" 49 | else: 50 | return "🟢" 51 | 52 | except (Timeout, HTTPError, RequestException) as e: 53 | print(f"Request error occurred while checking alt tags for {website}: {e}") 54 | 55 | # Method 2: Alternative Heuristic Check via Meta Tags (Fallback) 56 | try: 57 | # Try to get the response again for fallback analysis 58 | response = requests.get(website, headers=headers, timeout=10) 59 | response.raise_for_status() 60 | soup = BeautifulSoup(response.text, 'html.parser') 61 | 62 | # Look for meta tags that could indicate a focus on accessibility 63 | accessibility_tags = soup.find_all('meta', {'name': re.compile(r'(description|keywords|viewport)', re.IGNORECASE)}) 64 | 65 | # Heuristic: If the website uses meta tags commonly associated with accessibility 66 | if accessibility_tags: 67 | print(f"Some meta tags found that might indicate a focus on accessibility on {website}.") 68 | return "🟠" 69 | 70 | return "🔴" # Assume no focus on accessibility if no relevant meta tags found 71 | 72 | except Exception as e: 73 | print(f"Error during heuristic check for alt tags for {website}: {e}") 74 | return "⚪" 75 | 76 | return "⚪" 77 | -------------------------------------------------------------------------------- /checks/check_broken_links.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urljoin, urlparse 5 | 6 | def check_broken_links(website): 7 | """ 8 | Check for broken links on the provided website. 9 | 10 | Args: 11 | website (str): The URL of the website to be checked. 
12 | 13 | Returns: 14 | str: 15 | - "🟢" if no broken links are found 16 | - "🟠" if some broken links are found 17 | - "🔴" if all links are broken 18 | - "⚪" if an error occurs 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'BrokenLinkChecker/1.0' 26 | } 27 | 28 | checked_links = set() # To avoid checking the same URL twice 29 | broken_link_count = 0 30 | total_links = 0 31 | max_links_to_check = 20 # Limit to avoid excessive requests 32 | 33 | try: 34 | # Method 1: Direct HTML content analysis using BeautifulSoup 35 | response = requests.get(website, headers=headers, timeout=10) 36 | response.raise_for_status() 37 | soup = BeautifulSoup(response.text, 'lxml') 38 | 39 | # Find all anchor tags with href attributes 40 | links = soup.find_all('a', href=True) 41 | 42 | for link in links[:max_links_to_check]: # Limit number of links to check 43 | href = link.get('href') 44 | 45 | # Skip anchor links, JavaScript calls, and mailto links 46 | if href.startswith(('#', 'javascript:', 'mailto:')): 47 | continue 48 | 49 | # Convert relative URLs to absolute URLs 50 | full_url = urljoin(website, href) 51 | 52 | # Skip already checked links 53 | if full_url in checked_links: 54 | continue 55 | 56 | checked_links.add(full_url) 57 | 58 | try: 59 | # Check the status of the link 60 | link_response = requests.get(full_url, headers=headers, allow_redirects=True, timeout=5) 61 | if 400 <= link_response.status_code < 600: 62 | print(f"Broken link found: {full_url} (Status: {link_response.status_code})") 63 | broken_link_count += 1 64 | 65 | except (Timeout, HTTPError, RequestException) as e: 66 | print(f"Error while checking link: {full_url}: {e}") 67 | broken_link_count += 1 68 | 69 | total_links += 1 70 | 71 | # Determine the result based on the broken link analysis 72 | if total_links == 0: 73 | print("No valid links found on the website.") 74 | return "⚪" 75 | elif broken_link_count == 0: 76 | return "🟢" 77 | elif broken_link_count < total_links: 78 | return "🟠" 79 | else: 80 | return "🔴" 81 | 82 | except (Timeout, HTTPError, RequestException) as e: 83 | print(f"Request error occurred while checking broken links for {website}: {e}") 84 | return "⚪" 85 | except Exception as e: 86 | print(f"An unexpected error occurred while checking broken links for {website}: {e}") 87 | return "⚪" 88 | -------------------------------------------------------------------------------- /checks/check_mobile_friendly.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import logging 4 | from requests.exceptions import RequestException, HTTPError 5 | from urllib.parse import urlparse 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_mobile_friendly(website: str, api_key: str) -> str: 10 | """ 11 | Check if the given website is mobile-friendly using the Google Mobile-Friendly Test API. 12 | 13 | Args: 14 | website (str): The URL of the website to be checked. 15 | api_key (str): The API key for accessing the Google Mobile-Friendly Test API. 16 | 17 | Returns: 18 | str: 19 | - "🟢" if the website is mobile-friendly. 20 | - "🔴" if the website is not mobile-friendly. 21 | - "⚪" for any errors. 
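        Example (illustrative sketch, not from the original docs; "YOUR_API_KEY" is a placeholder):
            result = check_mobile_friendly("example.com", api_key="YOUR_API_KEY")
            # result is one of "🟢", "🔴" or "⚪"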
22 | """ 23 | # Input validation and URL normalization 24 | if not website or not api_key: 25 | logger.error("Website URL and API key are required") 26 | return "⚪" 27 | 28 | # Normalize URL 29 | if not website.startswith(('http://', 'https://')): 30 | website = f"https://{website}" 31 | 32 | try: 33 | parsed_url = urlparse(website) 34 | if not parsed_url.netloc: 35 | logger.error(f"Invalid URL format: {website}") 36 | return "⚪" 37 | except Exception as e: 38 | logger.error(f"URL parsing error for {website}: {e}") 39 | return "⚪" 40 | 41 | api_url = f"https://searchconsole.googleapis.com/v1/urlTestingTools/mobileFriendlyTest:run?key={api_key}" 42 | payload = {"url": website} 43 | headers = {'Content-Type': 'application/json'} 44 | 45 | try: 46 | # Make a POST request to the Google API 47 | response = requests.post(api_url, headers=headers, json=payload, timeout=30) 48 | response.raise_for_status() 49 | 50 | # Parse the response JSON 51 | result = response.json() 52 | 53 | # Enhanced detection patterns 54 | mobile_friendliness = result.get('mobileFriendliness', '').upper() 55 | 56 | if mobile_friendliness == 'MOBILE_FRIENDLY': 57 | logger.info(f"Website {website} is mobile-friendly") 58 | return "🟢" 59 | elif mobile_friendliness == 'NOT_MOBILE_FRIENDLY': 60 | logger.warning(f"Website {website} is not mobile-friendly") 61 | return "🔴" 62 | else: 63 | logger.error(f"Unexpected mobile friendliness status: {mobile_friendliness}") 64 | return "⚪" 65 | 66 | except requests.HTTPError as e: 67 | if e.response.status_code == 429: 68 | logger.error(f"API rate limit exceeded for {website}") 69 | elif e.response.status_code == 403: 70 | logger.error(f"API key invalid or insufficient permissions for {website}") 71 | else: 72 | logger.error(f"HTTP error {e.response.status_code} while checking {website}: {e}") 73 | return "⚪" 74 | except requests.RequestException as e: 75 | logger.error(f"Request error while checking mobile-friendliness for {website}: {e}") 76 | return "⚪" 77 | except (KeyError, json.JSONDecodeError) as e: 78 | logger.error(f"Invalid API response format for {website}: {e}") 79 | return "⚪" 80 | except Exception as e: 81 | logger.error(f"Unexpected error while checking mobile-friendliness for {website}: {e}") 82 | return "⚪" 83 | -------------------------------------------------------------------------------- /checks/check_xss_protection.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from typing import Optional 4 | from requests.exceptions import RequestException, Timeout, HTTPError 5 | 6 | # Configure logging 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_xss_protection(website: str, timeout_seconds: Optional[int] = 10) -> str: 11 | """ 12 | Check if the X-XSS-Protection header is present in the HTTP response headers of a website. 13 | 14 | Args: 15 | website (str): The URL of the website to be checked. 16 | timeout_seconds (int, optional): Timeout for the HTTP request in seconds. Default is 10 seconds. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if X-XSS-Protection header is present and properly configured. 21 | - "🟠" if header is present but with suboptimal configuration. 22 | - "🔴" if X-XSS-Protection header is absent. 23 | - "⚪" for any errors or non-success HTTP responses. 
24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪" 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Make request with proper timeout and error handling 40 | response = requests.get(website, headers=headers, timeout=timeout_seconds) 41 | response.raise_for_status() 42 | 43 | # Check X-XSS-Protection header 44 | xss_protection = response.headers.get('X-XSS-Protection', '').lower() 45 | 46 | if xss_protection: 47 | logger.info(f"X-XSS-Protection header found for {website}: {xss_protection}") 48 | 49 | # Enhanced validation of header value 50 | if '1; mode=block' in xss_protection: 51 | return "🟢" # Optimal configuration 52 | elif xss_protection.startswith('1'): 53 | return "🟠" # Present but not optimal 54 | else: 55 | return "🔴" # Present but disabled (0) 56 | else: 57 | # Check for Content-Security-Policy as alternative protection 58 | csp_header = response.headers.get('Content-Security-Policy', '') 59 | if csp_header and 'unsafe-inline' not in csp_header.lower(): 60 | logger.info(f"No X-XSS-Protection but CSP found for {website}") 61 | return "🟠" # CSP provides some XSS protection 62 | 63 | logger.warning(f"X-XSS-Protection header missing for {website}") 64 | return "🔴" 65 | 66 | except Timeout: 67 | logger.warning(f"Timeout occurred while checking XSS protection for {website}") 68 | return "⚪" 69 | except HTTPError as e: 70 | logger.warning(f"HTTP error for {website}: {e}") 71 | return "⚪" 72 | except RequestException as e: 73 | logger.warning(f"Request error for {website}: {e}") 74 | return "⚪" 75 | except Exception as e: 76 | logger.error(f"Unexpected error for {website}: {e}") 77 | return "⚪" 78 | -------------------------------------------------------------------------------- /checks/check_internationalization.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urlparse 5 | import re 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_internationalization(website: str) -> str: 10 | """ 11 | Checks if a website has implemented internationalization (i18n) using the lang attribute. 12 | 13 | Args: 14 | website (str): The URL of the website to check. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if i18n is detected 19 | - "🟡" if partial i18n is detected 20 | - "⚪" if i18n is not detected or an error occurred. 
21 | """ 22 | # Input validation and URL normalization 23 | if not website: 24 | logger.error("Website URL is required") 25 | return "⚪" 26 | 27 | if not website.startswith(('http://', 'https://')): 28 | website = f"https://{website}" 29 | 30 | try: 31 | parsed_url = urlparse(website) 32 | if not parsed_url.netloc: 33 | logger.error(f"Invalid URL format: {website}") 34 | return "⚪" 35 | except Exception as e: 36 | logger.error(f"URL parsing error for {website}: {e}") 37 | return "⚪" 38 | 39 | try: 40 | response = requests.get(website, timeout=15) 41 | response.raise_for_status() 42 | soup = BeautifulSoup(response.content, "html.parser") 43 | 44 | # Enhanced detection patterns 45 | i18n_indicators = [] 46 | 47 | # Check HTML lang attribute 48 | html_tag = soup.find("html") 49 | if html_tag and html_tag.has_attr("lang"): 50 | lang_value = html_tag.get("lang", "").strip() 51 | if lang_value and len(lang_value) >= 2: 52 | i18n_indicators.append(f"HTML lang attribute: {lang_value}") 53 | 54 | # Check for hreflang attributes in link tags 55 | hreflang_links = soup.find_all("link", attrs={"hreflang": True}) 56 | if hreflang_links: 57 | i18n_indicators.append(f"hreflang links: {len(hreflang_links)} found") 58 | 59 | # Check for language-specific meta tags 60 | lang_meta = soup.find_all("meta", attrs={"http-equiv": "content-language"}) 61 | if lang_meta: 62 | i18n_indicators.append("Content-Language meta tag found") 63 | 64 | # Check for common i18n URL patterns 65 | if re.search(r'/[a-z]{2}(?:-[A-Z]{2})?/', website): 66 | i18n_indicators.append("Language code in URL pattern") 67 | 68 | # Improved scoring and categorization 69 | if len(i18n_indicators) >= 2: 70 | logger.info(f"Strong internationalization detected for {website}: {', '.join(i18n_indicators)}") 71 | return "🟢" 72 | elif len(i18n_indicators) == 1: 73 | logger.info(f"Basic internationalization detected for {website}: {i18n_indicators[0]}") 74 | return "🟡" 75 | else: 76 | logger.info(f"No internationalization detected for {website}") 77 | return "⚪" 78 | 79 | except requests.exceptions.RequestException as e: 80 | logger.error(f"Request error while checking internationalization for {website}: {e}") 81 | return "⚪" 82 | except Exception as e: 83 | logger.error(f"Unexpected error while checking internationalization for {website}: {e}") 84 | return "⚪" 85 | -------------------------------------------------------------------------------- /checks/check_redirects.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_redirects(website: str) -> str: 10 | """ 11 | Verify if a website using HTTP redirects to its HTTPS counterpart with enhanced security analysis. 12 | 13 | Args: 14 | website (str): The URL (without protocol) of the website to check. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if the site redirects from HTTP to HTTPS securely. 19 | - "🟠" if redirect exists but has minor security issues. 20 | - "🔴" if it does not redirect from HTTP to HTTPS or has security issues. 21 | - "⚪" in case of an error. 
22 | """ 23 | # Input validation and URL normalization 24 | if not website or not isinstance(website, str): 25 | logger.error(f"Invalid website input: {website}") 26 | return "⚪" 27 | 28 | website = website.strip() 29 | if website.startswith(('http://', 'https://')): 30 | from urllib.parse import urlparse 31 | parsed = urlparse(website) 32 | website = parsed.netloc 33 | 34 | headers = { 35 | "User-Agent": "HTTPtoHTTPSRedirectChecker/2.0" 36 | } 37 | 38 | try: 39 | # Make an HTTP request to the site and prevent automatic redirects 40 | response = requests.get(f"http://{website}", headers=headers, allow_redirects=False, timeout=15) 41 | redirect_location = response.headers.get('Location', '') 42 | 43 | # Enhanced redirect analysis 44 | if response.status_code in [301, 302, 303, 307, 308] and redirect_location: 45 | logger.debug(f"Redirect detected: {response.status_code} -> {redirect_location}") 46 | 47 | # Check if redirect is to HTTPS 48 | if redirect_location.startswith(f"https://{website}"): 49 | # Check for permanent redirect (301, 308) - more secure 50 | if response.status_code in [301, 308]: 51 | logger.info(f"Website {website} has secure permanent redirect to HTTPS") 52 | return "🟢" 53 | else: 54 | logger.info(f"Website {website} redirects to HTTPS but uses temporary redirect") 55 | return "🟠" 56 | elif redirect_location.startswith('https://'): 57 | # Redirects to HTTPS but different domain 58 | logger.warning(f"Website {website} redirects to different HTTPS domain: {redirect_location}") 59 | return "🟠" 60 | else: 61 | # Redirects but not to HTTPS 62 | logger.warning(f"Website {website} redirects but not to HTTPS: {redirect_location}") 63 | return "🔴" 64 | else: 65 | # No redirect or invalid redirect 66 | logger.warning(f"Website {website} does not redirect from HTTP to HTTPS") 67 | return "🔴" 68 | 69 | except (Timeout, HTTPError) as e: 70 | logger.warning(f"HTTP/Timeout error while checking redirects for {website}: {e}") 71 | return "⚪" 72 | except RequestException as e: 73 | logger.warning(f"Request error while checking redirects for {website}: {e}") 74 | return "⚪" 75 | except Exception as e: 76 | logger.error(f"Unexpected error while checking redirects for {website}: {e}") 77 | return "⚪" 78 | -------------------------------------------------------------------------------- /checks/check_hsts.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urlparse 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def check_hsts(website: str) -> str: 9 | """ 10 | Check if the website implements HTTP Strict Transport Security (HSTS). 11 | 12 | Args: 13 | website (str): URL of the website to be checked. 14 | 15 | Returns: 16 | str: 17 | - "🟢" if the site has HSTS enabled with good configuration. 18 | - "🟡" if HSTS is enabled but with suboptimal configuration. 19 | - "🔴" if the site does not have HSTS enabled. 20 | - "⚪" if an error occurred during the check. 
21 | """ 22 | # Input validation and URL normalization 23 | if not website: 24 | logger.error("Website URL is required") 25 | return "⚪" 26 | 27 | if not website.startswith(('http://', 'https://')): 28 | website = f"https://{website}" 29 | 30 | try: 31 | parsed_url = urlparse(website) 32 | if not parsed_url.netloc: 33 | logger.error(f"Invalid URL format: {website}") 34 | return "⚪" 35 | except Exception as e: 36 | logger.error(f"URL parsing error for {website}: {e}") 37 | return "⚪" 38 | 39 | headers = { 40 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36" 41 | } 42 | 43 | try: 44 | # Make a request to the website 45 | response = requests.get(website, headers=headers, timeout=15) 46 | response.raise_for_status() 47 | 48 | # Enhanced detection patterns 49 | hsts_header = response.headers.get('Strict-Transport-Security', '') 50 | 51 | if not hsts_header: 52 | logger.warning(f"No HSTS header found for {website}") 53 | return "🔴" 54 | 55 | # Improved scoring and categorization 56 | hsts_lower = hsts_header.lower() 57 | max_age_match = None 58 | 59 | # Extract max-age value 60 | import re 61 | max_age_pattern = re.search(r'max-age=(\d+)', hsts_lower) 62 | if max_age_pattern: 63 | max_age = int(max_age_pattern.group(1)) 64 | 65 | # Check for security best practices 66 | has_include_subdomains = 'includesubdomains' in hsts_lower 67 | has_preload = 'preload' in hsts_lower 68 | 69 | # Categorize based on configuration quality 70 | if max_age >= 31536000 and has_include_subdomains: # 1 year or more with subdomains 71 | logger.info(f"Strong HSTS configuration for {website}: max-age={max_age}, includeSubDomains={has_include_subdomains}, preload={has_preload}") 72 | return "🟢" 73 | elif max_age >= 86400: # At least 1 day 74 | logger.info(f"Basic HSTS configuration for {website}: max-age={max_age}, includeSubDomains={has_include_subdomains}") 75 | return "🟡" 76 | else: 77 | logger.warning(f"Weak HSTS configuration for {website}: max-age too low ({max_age})") 78 | return "🟡" 79 | else: 80 | logger.warning(f"Invalid HSTS header format for {website}: {hsts_header}") 81 | return "🟡" 82 | 83 | except requests.RequestException as e: 84 | logger.error(f"Request error while checking HSTS for {website}: {e}") 85 | return "⚪" 86 | except Exception as e: 87 | logger.error(f"Unexpected error while checking HSTS for {website}: {e}") 88 | return "⚪" 89 | -------------------------------------------------------------------------------- /checks/check_server_response_time.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import statistics 4 | import logging 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_server_response_time(website: str, num_attempts: int = 3) -> str: 12 | """ 13 | Measure the server's response time with multiple attempts for accuracy. 14 | 15 | Args: 16 | website (str): URL of the website to be checked. 17 | num_attempts (int): Number of attempts to measure response time. 
18 | 19 | Returns: 20 | str: 21 | - "🟢" if the response time is excellent (under 0.5 seconds) 22 | - "🟠" if the response time is moderate (between 0.5 and 2 seconds) 23 | - "🔴" if the response time is slow (2 seconds or more) 24 | - "⚪" if an error occurs or the server does not respond in time 25 | """ 26 | # Input validation and URL normalization 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | if not website.startswith(('http://', 'https://')): 33 | website = f"https://{website}" 34 | 35 | headers = { 36 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 37 | } 38 | 39 | response_times = [] 40 | 41 | try: 42 | # Perform multiple measurements for accuracy 43 | for attempt in range(num_attempts): 44 | start_time = time.perf_counter() 45 | 46 | # Make the request and measure time to first byte 47 | response = requests.get(website, headers=headers, timeout=15, stream=True) 48 | 49 | # Time to first byte 50 | ttfb = time.perf_counter() - start_time 51 | response.raise_for_status() 52 | 53 | response_times.append(ttfb) 54 | logger.debug(f"Attempt {attempt + 1} for {website}: {ttfb:.3f}s") 55 | 56 | # Small delay between attempts 57 | if attempt < num_attempts - 1: 58 | time.sleep(0.5) 59 | 60 | # Calculate statistics 61 | avg_time = statistics.mean(response_times) 62 | median_time = statistics.median(response_times) 63 | min_time = min(response_times) 64 | max_time = max(response_times) 65 | 66 | logger.info(f"Response time stats for {website} - Avg: {avg_time:.3f}s, Median: {median_time:.3f}s, Range: {min_time:.3f}s-{max_time:.3f}s") 67 | 68 | # Enhanced categorization based on average response time 69 | if avg_time < 0.2: 70 | logger.info(f"Website {website} responded excellently: {avg_time:.3f}s average") 71 | return "🟢" 72 | elif avg_time < 0.5: 73 | logger.info(f"Website {website} responded very well: {avg_time:.3f}s average") 74 | return "🟢" 75 | elif avg_time < 2.0: 76 | logger.info(f"Website {website} responded moderately: {avg_time:.3f}s average") 77 | return "🟠" 78 | else: 79 | logger.warning(f"Website {website} responded slowly: {avg_time:.3f}s average") 80 | return "🔴" 81 | 82 | except Timeout: 83 | logger.warning(f"Timeout occurred while checking response time for {website}") 84 | return "🔴" 85 | except HTTPError as e: 86 | logger.warning(f"HTTP error for {website}: {e}") 87 | return "⚪" 88 | except RequestException as e: 89 | logger.warning(f"Request error for {website}: {e}") 90 | return "⚪" 91 | except Exception as e: 92 | logger.error(f"Unexpected error for {website}: {e}") 93 | return "⚪" 94 | -------------------------------------------------------------------------------- /checks/check_robot_txt.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urljoin 5 | 6 | # Configure logging 7 | logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_robot_txt(website): 11 | """ 12 | Verify the presence and basic validity of a robots.txt file on a website. 13 | 14 | Args: 15 | - website (str): The URL (without protocol) of the website to check. 16 | 17 | Returns: 18 | - str: "🟢" if the site has a valid robots.txt file, "🔴" otherwise, and 19 | "⚪" in case of an error. 
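    Example (illustrative): a minimal but well-formed file such as
        User-agent: *
        Disallow:
        Sitemap: https://example.com/sitemap.xml
    scores 6/6 in the heuristic below and is reported as "🟢".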
20 | """ 21 | # Input validation and URL normalization 22 | if not website or not isinstance(website, str): 23 | logger.error(f"Invalid website input: {website}") 24 | return "⚪" 25 | 26 | website = website.strip() 27 | if not website.startswith(('http://', 'https://')): 28 | website = f"https://{website}" 29 | 30 | headers = { 31 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 32 | } 33 | 34 | try: 35 | # Perform the HTTP request with a timeout 36 | robots_url = urljoin(website, '/robots.txt') 37 | response = requests.get(robots_url, headers=headers, timeout=15) 38 | response.raise_for_status() 39 | 40 | # Enhanced validation of robots.txt content 41 | content = response.text.lower() 42 | lines = [line.strip() for line in content.split('\n') if line.strip()] 43 | 44 | # Check for essential robots.txt directives 45 | has_user_agent = any(line.startswith('user-agent:') for line in lines) 46 | has_disallow = any(line.startswith('disallow:') for line in lines) 47 | has_allow = any(line.startswith('allow:') for line in lines) 48 | has_sitemap = any(line.startswith('sitemap:') for line in lines) 49 | 50 | # Additional validation checks 51 | valid_directives = {'user-agent:', 'disallow:', 'allow:', 'crawl-delay:', 'sitemap:', 'host:'} 52 | unknown_directives = [] 53 | 54 | for line in lines: 55 | if ':' in line and not line.startswith('#'): 56 | directive = line.split(':')[0] + ':' 57 | if directive not in valid_directives: 58 | unknown_directives.append(directive) 59 | 60 | # Scoring system for robots.txt quality 61 | score = 0 62 | if has_user_agent: 63 | score += 2 64 | if has_disallow or has_allow: 65 | score += 2 66 | if has_sitemap: 67 | score += 1 68 | if not unknown_directives: 69 | score += 1 70 | 71 | logger.info(f"Robots.txt analysis for {website}: score {score}/6, sitemaps: {has_sitemap}") 72 | 73 | if unknown_directives: 74 | logger.warning(f"Unknown directives found: {unknown_directives}") 75 | 76 | if score >= 4: 77 | logger.info(f"Valid and comprehensive robots.txt found for {website}") 78 | return "🟢" 79 | elif score >= 2: 80 | logger.info(f"Basic robots.txt found for {website}") 81 | return "🟢" 82 | else: 83 | logger.warning(f"Poor quality robots.txt found for {website}") 84 | return "🔴" 85 | 86 | except (Timeout, HTTPError) as e: 87 | logger.warning(f"HTTP/Timeout error while checking robots.txt for {website}: {e}") 88 | return "⚪" 89 | except RequestException as e: 90 | logger.warning(f"Request error while checking robots.txt for {website}: {e}") 91 | return "⚪" 92 | except Exception as e: 93 | logger.error(f"Unexpected error while checking robots.txt for {website}: {e}") 94 | return "⚪" 95 | -------------------------------------------------------------------------------- /checks/check_privacy_protected_whois.py: -------------------------------------------------------------------------------- 1 | import whois 2 | import logging 3 | from whois.parser import PywhoisError 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_privacy_protected_whois(domain: str) -> str: 10 | """ 11 | Check if a domain's WHOIS information indicates that it is privacy-protected with enhanced detection. 12 | 13 | Args: 14 | domain (str): The domain to check. 15 | 16 | Returns: 17 | str: "🟢" if the domain's WHOIS information is privacy-protected, "🔴" otherwise, 18 | "⚪" if an error occurred. 
19 | """ 20 | # Input validation 21 | if not domain or not isinstance(domain, str): 22 | logger.error(f"Invalid domain input: {domain}") 23 | return "⚪" 24 | 25 | domain = domain.strip() 26 | 27 | # Remove protocol if present 28 | if domain.startswith(('http://', 'https://')): 29 | from urllib.parse import urlparse 30 | parsed = urlparse(domain) 31 | domain = parsed.netloc 32 | 33 | try: 34 | # Fetch WHOIS data for the domain 35 | whois_data = whois.whois(domain) 36 | 37 | # Enhanced privacy indicators 38 | privacy_indicators = [ 39 | 'privacy', 'protected', 'redacted', 'whoisguard', 'domains by proxy', 40 | 'anonymous', 'contact privacy', 'whois privacy', 'perfect privacy', 41 | 'data protected', 'private registration', 'domain privacy', 42 | 'namecheap', 'godaddy privacy', 'cloudflare', 'proxy protection', 43 | 'withheld', 'not disclosed', 'see privacy policy' 44 | ] 45 | 46 | # Enhanced fields to check with more comprehensive coverage 47 | fields_to_check = [ 48 | 'registrar', 'tech_email', 'admin_email', 'registrant_email', 49 | 'org', 'name', 'address', 'registrant_name', 'admin_name', 'tech_name', 50 | 'registrant_org', 'admin_org', 'tech_org', 'emails' 51 | ] 52 | 53 | privacy_score = 0 54 | total_checks = 0 55 | 56 | # Check for privacy indicators in relevant WHOIS fields 57 | for field in fields_to_check: 58 | field_value = whois_data.get(field, '') 59 | 60 | if field_value: 61 | total_checks += 1 62 | field_str = str(field_value).lower() 63 | 64 | if any(indicator in field_str for indicator in privacy_indicators): 65 | privacy_score += 1 66 | logger.debug(f"Privacy indicator found in {field}: {field_value}") 67 | 68 | # Additional checks for redacted information 69 | if whois_data: 70 | # Check if critical information is redacted 71 | critical_fields = ['registrant_name', 'admin_email', 'tech_email'] 72 | redacted_count = 0 73 | 74 | for field in critical_fields: 75 | value = whois_data.get(field, '') 76 | if not value or 'redacted' in str(value).lower() or 'withheld' in str(value).lower(): 77 | redacted_count += 1 78 | 79 | if redacted_count >= 2: 80 | privacy_score += 2 81 | 82 | logger.info(f"Privacy analysis for {domain}: score {privacy_score}/{total_checks + 2}") 83 | 84 | # Determine result based on privacy score 85 | if privacy_score > 0: 86 | logger.info(f"Privacy protection detected for {domain}") 87 | return "🟢" 88 | else: 89 | logger.warning(f"No privacy protection detected for {domain}") 90 | return "🔴" 91 | 92 | except PywhoisError as e: 93 | logger.warning(f"WHOIS command failed for {domain}: {e}") 94 | return "⚪" 95 | except Exception as e: 96 | logger.error(f"Unexpected error while checking privacy-protected WHOIS for {domain}: {e}") 97 | return "⚪" 98 | -------------------------------------------------------------------------------- /checks/check_email_domain.py: -------------------------------------------------------------------------------- 1 | import dns.resolver 2 | import logging 3 | from dns.resolver import NXDOMAIN, NoAnswer, NoNameservers, Timeout 4 | import re 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | def check_email_domain(email_domain: str) -> str: 9 | """ 10 | Check if an email domain has an SPF (Sender Policy Framework) record. 11 | 12 | Args: 13 | email_domain (str): The domain of the email to be checked. 14 | 15 | Returns: 16 | str: 17 | - "🟢" if a strong SPF record is found. 18 | - "🟡" if a basic SPF record is found. 19 | - "🔴" if no SPF record is found. 20 | - "⚪" for any other errors or issues. 
21 | """ 22 | # Input validation 23 | if not email_domain: 24 | logger.error("Email domain is required") 25 | return "⚪" 26 | 27 | # Normalize domain (remove protocol, www, etc.) 28 | email_domain = email_domain.lower().strip() 29 | email_domain = re.sub(r'^https?://', '', email_domain) 30 | email_domain = re.sub(r'^www\.', '', email_domain) 31 | email_domain = email_domain.split('/')[0] # Remove path if present 32 | 33 | # Validate domain format 34 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]*\.[a-zA-Z]{2,}$', email_domain): 35 | logger.error(f"Invalid domain format: {email_domain}") 36 | return "⚪" 37 | 38 | try: 39 | # Query DNS TXT records for the given email domain 40 | answers = dns.resolver.resolve(email_domain, 'TXT', timeout=10) 41 | 42 | # Enhanced detection patterns 43 | spf_records = [] 44 | for rdata in answers: 45 | txt_record = str(rdata).strip('"') 46 | if txt_record.startswith("v=spf1"): 47 | spf_records.append(txt_record) 48 | 49 | if not spf_records: 50 | logger.warning(f"No SPF record found for {email_domain}") 51 | return "🔴" 52 | 53 | if len(spf_records) > 1: 54 | logger.warning(f"Multiple SPF records found for {email_domain} - this may cause issues") 55 | 56 | # Analyze SPF record quality 57 | spf_record = spf_records[0] 58 | logger.info(f"SPF record found for {email_domain}: {spf_record}") 59 | 60 | # Improved scoring and categorization 61 | strong_indicators = [ 62 | '-all', # Hard fail 63 | 'include:', # Include mechanism 64 | 'mx', # MX mechanism 65 | ] 66 | 67 | weak_indicators = [ 68 | '~all', # Soft fail 69 | '?all', # Neutral 70 | '+all', # Pass all (very permissive) 71 | ] 72 | 73 | strong_score = sum(1 for indicator in strong_indicators if indicator in spf_record) 74 | weak_score = sum(1 for indicator in weak_indicators if indicator in spf_record) 75 | 76 | if strong_score >= 2 and '-all' in spf_record: 77 | logger.info(f"Strong SPF configuration for {email_domain}") 78 | return "🟢" 79 | elif strong_score >= 1 or ('~all' in spf_record): 80 | logger.info(f"Basic SPF configuration for {email_domain}") 81 | return "🟡" 82 | else: 83 | logger.warning(f"Weak SPF configuration for {email_domain}") 84 | return "🟡" 85 | 86 | except NXDOMAIN: 87 | logger.error(f"Domain {email_domain} does not exist") 88 | return "⚪" 89 | except NoAnswer: 90 | logger.warning(f"Domain {email_domain} does not have TXT records") 91 | return "🔴" 92 | except NoNameservers: 93 | logger.error(f"No nameservers found for domain {email_domain}") 94 | return "⚪" 95 | except Timeout: 96 | logger.error(f"DNS query for {email_domain} timed out") 97 | return "⚪" 98 | except Exception as e: 99 | logger.error(f"Unexpected error while checking email domain {email_domain}: {e}") 100 | return "⚪" 101 | -------------------------------------------------------------------------------- /checks/check_mixed_content.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import logging 4 | from requests.exceptions import RequestException, HTTPError 5 | from urllib.parse import urlparse, urljoin 6 | import re 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_mixed_content(website: str) -> str: 11 | """ 12 | Check a given website for mixed content issues by searching for resources loaded over HTTP. 13 | 14 | Args: 15 | website (str): The URL of the website to be checked. 16 | 17 | Returns: 18 | str: 19 | - "🟢" if no mixed content is found. 20 | - "🔴" if mixed content is found. 
21 | - "⚪" for any errors. 22 | """ 23 | # Input validation and URL normalization 24 | if not website: 25 | logger.error("Website URL is required") 26 | return "⚪" 27 | 28 | if not website.startswith(('http://', 'https://')): 29 | website = f"https://{website}" 30 | 31 | try: 32 | parsed_url = urlparse(website) 33 | if not parsed_url.netloc: 34 | logger.error(f"Invalid URL format: {website}") 35 | return "⚪" 36 | except Exception as e: 37 | logger.error(f"URL parsing error for {website}: {e}") 38 | return "⚪" 39 | 40 | headers = { 41 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 42 | } 43 | 44 | try: 45 | # Make a request to the website 46 | response = requests.get(website, headers=headers, timeout=15) 47 | response.raise_for_status() 48 | 49 | # Parse the HTML content using BeautifulSoup 50 | soup = BeautifulSoup(response.content, 'html.parser') 51 | 52 | # Enhanced detection patterns - check multiple attributes and elements 53 | mixed_content_found = [] 54 | 55 | # Check src attributes (img, script, iframe, etc.) 56 | elements_with_src = soup.find_all(attrs={'src': True}) 57 | for element in elements_with_src: 58 | src = element.get('src', '') 59 | if src.startswith('http://'): 60 | mixed_content_found.append(f"{element.name}[src]: {src}") 61 | 62 | # Check href attributes (link, a tags) 63 | elements_with_href = soup.find_all(attrs={'href': True}) 64 | for element in elements_with_href: 65 | href = element.get('href', '') 66 | if href.startswith('http://') and element.name in ['link']: # Focus on resource links 67 | mixed_content_found.append(f"{element.name}[href]: {href}") 68 | 69 | # Check CSS url() patterns in style attributes and tags 70 | style_elements = soup.find_all(['style']) + soup.find_all(attrs={'style': True}) 71 | for element in style_elements: 72 | style_content = element.get('style', '') if element.has_attr('style') else element.get_text() 73 | if style_content: 74 | http_urls = re.findall(r'url\(["\']?(http://[^"\')\s]+)["\']?\)', style_content) 75 | for url in http_urls: 76 | mixed_content_found.append(f"CSS url(): {url}") 77 | 78 | # Check if there is any mixed content 79 | if mixed_content_found: 80 | logger.warning(f"Mixed content found on {website}: {len(mixed_content_found)} instances") 81 | for content in mixed_content_found[:5]: # Log first 5 instances 82 | logger.warning(f" - {content}") 83 | return "🔴" 84 | else: 85 | logger.info(f"No mixed content found on {website}") 86 | return "🟢" 87 | 88 | except HTTPError as e: 89 | logger.error(f"HTTP error {e.response.status_code} while checking mixed content on {website}: {e}") 90 | return "⚪" 91 | except RequestException as e: 92 | logger.error(f"Request error while checking mixed content on {website}: {e}") 93 | return "⚪" 94 | except Exception as e: 95 | logger.error(f"Unexpected error while checking mixed content on {website}: {e}") 96 | return "⚪" 97 | -------------------------------------------------------------------------------- /checks/check_website_load_time.py: -------------------------------------------------------------------------------- 1 | import time 2 | import statistics 3 | import requests 4 | import logging 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_website_load_time(website: str, num_attempts: int = 3) -> str: 12 | """ 13 | Check the load time of the 
given website with multiple measurements for accuracy. 14 | 15 | Args: 16 | website (str): The URL of the website to be checked. 17 | num_attempts (int): Number of attempts to measure load time for accuracy. 18 | 19 | Returns: 20 | str: 21 | - "🟢" if average load time is under 2 seconds 22 | - "🟠" if average load time is between 2 and 4 seconds 23 | - "🔴" if average load time is over 4 seconds 24 | - "⚪" in case of any errors or timeouts 25 | """ 26 | # Input validation and URL normalization 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | if not website.startswith(('http://', 'https://')): 33 | website = f"https://{website}" 34 | 35 | headers = { 36 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 37 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 38 | 'Accept-Language': 'en-US,en;q=0.5', 39 | 'Accept-Encoding': 'gzip, deflate, br', 40 | 'Cache-Control': 'no-cache', 41 | 'Pragma': 'no-cache' 42 | } 43 | 44 | load_times = [] 45 | 46 | try: 47 | # Perform multiple measurements for accuracy 48 | for attempt in range(num_attempts): 49 | start_time = time.perf_counter() 50 | 51 | # Perform the request with enhanced monitoring 52 | response = requests.get( 53 | website, 54 | headers=headers, 55 | timeout=15, 56 | allow_redirects=True, 57 | stream=False 58 | ) 59 | response.raise_for_status() 60 | 61 | # Calculate elapsed time 62 | elapsed_time = time.perf_counter() - start_time 63 | load_times.append(elapsed_time) 64 | 65 | logger.debug(f"Attempt {attempt + 1} for {website}: {elapsed_time:.3f}s") 66 | 67 | # Small delay between attempts to avoid overwhelming the server 68 | if attempt < num_attempts - 1: 69 | time.sleep(0.5) 70 | 71 | # Calculate statistics 72 | avg_time = statistics.mean(load_times) 73 | median_time = statistics.median(load_times) 74 | min_time = min(load_times) 75 | max_time = max(load_times) 76 | 77 | logger.info(f"Load time stats for {website} - Avg: {avg_time:.3f}s, Median: {median_time:.3f}s, Range: {min_time:.3f}s-{max_time:.3f}s") 78 | 79 | # Enhanced categorization based on average time 80 | if avg_time < 1.0: 81 | logger.info(f"Website {website} loaded very fast: {avg_time:.2f}s average") 82 | return "🟢" 83 | elif avg_time < 2.0: 84 | logger.info(f"Website {website} loaded fast: {avg_time:.2f}s average") 85 | return "🟢" 86 | elif avg_time < 4.0: 87 | logger.info(f"Website {website} loaded moderately: {avg_time:.2f}s average") 88 | return "🟠" 89 | else: 90 | logger.warning(f"Website {website} loaded slowly: {avg_time:.2f}s average") 91 | return "🔴" 92 | 93 | except Timeout: 94 | logger.warning(f"Timeout occurred while checking load time for {website}") 95 | return "🔴" # Timeout is effectively a slow load time 96 | except HTTPError as e: 97 | logger.warning(f"HTTP error for {website}: {e}") 98 | return "⚪" 99 | except RequestException as e: 100 | logger.warning(f"Request error for {website}: {e}") 101 | return "⚪" 102 | except Exception as e: 103 | logger.error(f"Unexpected error for {website}: {e}") 104 | return "⚪" 105 | -------------------------------------------------------------------------------- /checks/check_asset_minification.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, Timeout, HTTPError 3 | from bs4 import 
BeautifulSoup 4 | import re 5 | 6 | def check_asset_minification(website): 7 | """ 8 | Check if the website's CSS/JS assets are minified. 9 | 10 | Args: 11 | website (str): URL of the website to be checked. 12 | 13 | Returns: 14 | str: 15 | - "🟢" if all assets are minified 16 | - "🟠" if some assets are minified and others are not 17 | - "🔴" if none of the assets are minified 18 | - "⚪" if an error occurs or no assets to check 19 | """ 20 | # Ensure the website starts with 'http://' or 'https://' 21 | if not website.startswith(('http://', 'https://')): 22 | website = f"https://{website}" 23 | 24 | headers = { 25 | 'User-Agent': 'AssetMinificationChecker/1.0' 26 | } 27 | 28 | try: 29 | # First, get the website content to extract asset links 30 | response = requests.get(website, headers=headers, timeout=10) 31 | response.raise_for_status() 32 | 33 | soup = BeautifulSoup(response.text, 'lxml') 34 | 35 | # Extract CSS and JS links 36 | css_links = [link.get('href') for link in soup.find_all('link', rel='stylesheet') if link.get('href')] 37 | js_links = [script.get('src') for script in soup.find_all('script', src=True) if script.get('src')] 38 | 39 | # Convert relative URLs to absolute 40 | from urllib.parse import urljoin 41 | website_links = [] 42 | for link in css_links + js_links: 43 | if link.startswith(('http://', 'https://')): 44 | website_links.append(link) 45 | else: 46 | website_links.append(urljoin(website, link)) 47 | 48 | minified_count = 0 49 | total_assets = 0 50 | 51 | for link in website_links: 52 | try: 53 | # Method 1: Check content and minification status 54 | asset_response = requests.get(link, headers=headers, timeout=10) 55 | asset_response.raise_for_status() 56 | 57 | # Check if the content type is either CSS or JavaScript 58 | content_type = asset_response.headers.get('Content-Type', '').lower() 59 | if 'text/css' in content_type or 'javascript' in content_type: 60 | total_assets += 1 61 | content = asset_response.text 62 | 63 | # Check for minification indicators 64 | # Minified files typically have very long lines and no whitespace 65 | lines = content.splitlines() 66 | avg_line_length = sum(len(line) for line in lines) / max(len(lines), 1) 67 | has_comments = '//' in content or '/*' in content 68 | has_excessive_whitespace = re.search(r'\n\s*\n\s*\n', content) 69 | 70 | # Heuristic: likely minified if average line length is high and no comments/whitespace 71 | if avg_line_length > 200 and not has_comments and not has_excessive_whitespace: 72 | minified_count += 1 73 | else: 74 | print(f"Asset at {link} appears not to be minified.") 75 | 76 | except (Timeout, HTTPError, RequestException) as e: 77 | print(f"Error while fetching content from {link}: {e}") 78 | continue 79 | 80 | # Determine the result based on the minification analysis 81 | if total_assets == 0: 82 | print(f"No CSS/JS assets found on {website}.") 83 | return "⚪" 84 | elif minified_count == 0: 85 | print("None of the assets are minified.") 86 | return "🔴" 87 | elif minified_count < total_assets: 88 | print(f"Some assets are minified, others are not. 
Minified: {minified_count}, Total: {total_assets}") 89 | return "🟠" 90 | else: 91 | print("All assets are minified.") 92 | return "🟢" 93 | 94 | except (Timeout, HTTPError, RequestException) as e: 95 | print(f"Request error occurred while checking asset minification for {website}: {e}") 96 | return "⚪" 97 | except Exception as e: 98 | print(f"An unexpected error occurred while checking asset minification for {website}: {e}") 99 | return "⚪" 100 | -------------------------------------------------------------------------------- /checks/check_subresource_integrity.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from typing import Tuple 5 | from requests.exceptions import RequestException 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_subresource_integrity(website: str) -> Tuple[str, int]: 12 | """ 13 | Check if the given website uses Subresource Integrity (SRI) by analyzing external resources. 14 | 15 | Args: 16 | website (str): The URL of the website to be analyzed. 17 | 18 | Returns: 19 | tuple: A status symbol and a count of external resources with SRI. 20 | - "🟢" if most external resources have SRI protection. 21 | - "🟠" if some external resources have SRI protection. 22 | - "🔴" if no or few external resources have SRI protection. 23 | - "⚪" if an error occurs. 24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪", 0 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Fetch website content 40 | response = requests.get(website, headers=headers, timeout=15) 41 | response.raise_for_status() 42 | 43 | # Parse HTML content 44 | soup = BeautifulSoup(response.content, 'lxml') 45 | 46 | # Find all external resources that should have SRI 47 | external_resources = [] 48 | sri_protected_resources = [] 49 | 50 | # Check script tags with external sources 51 | for script in soup.find_all('script', src=True): 52 | src = script.get('src') 53 | if src and (src.startswith(('http://', 'https://')) or src.startswith('//')): 54 | external_resources.append(('script', src)) 55 | if script.get('integrity'): 56 | sri_protected_resources.append(('script', src, script.get('integrity'))) 57 | 58 | # Check link tags (stylesheets, fonts, etc.) 
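        # Note: BeautifulSoup can return 'rel' as either a list or a plain string,
        # so the loop below normalizes it before testing for stylesheet/preload links
        # that should carry an integrity attribute.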
59 | for link in soup.find_all('link', href=True): 60 | href = link.get('href') 61 | rel = link.get('rel', []) 62 | if isinstance(rel, str): 63 | rel = [rel] 64 | 65 | # Focus on stylesheets and preload resources 66 | if href and (href.startswith(('http://', 'https://')) or href.startswith('//')) and \ 67 | any(r in rel for r in ['stylesheet', 'preload']): 68 | external_resources.append(('link', href)) 69 | if link.get('integrity'): 70 | sri_protected_resources.append(('link', href, link.get('integrity'))) 71 | 72 | total_external = len(external_resources) 73 | total_sri_protected = len(sri_protected_resources) 74 | 75 | logger.info(f"SRI analysis for {website}: {total_sri_protected}/{total_external} external resources have SRI") 76 | 77 | if total_sri_protected > 0: 78 | logger.debug(f"SRI-protected resources: {[r[1] for r in sri_protected_resources]}") 79 | 80 | # Determine result based on SRI coverage 81 | if total_external == 0: 82 | logger.info(f"No external resources found for {website}") 83 | return "🟢", 0 84 | 85 | sri_coverage = total_sri_protected / total_external 86 | 87 | if sri_coverage >= 0.8: # 80% or more have SRI 88 | logger.info(f"Excellent SRI coverage ({sri_coverage:.1%}) for {website}") 89 | return "🟢", total_sri_protected 90 | elif sri_coverage >= 0.4: # 40% or more have SRI 91 | logger.warning(f"Moderate SRI coverage ({sri_coverage:.1%}) for {website}") 92 | return "🟠", total_sri_protected 93 | else: # Less than 40% have SRI 94 | logger.warning(f"Poor SRI coverage ({sri_coverage:.1%}) for {website}") 95 | return "🔴", total_sri_protected 96 | 97 | except RequestException as e: 98 | logger.warning(f"Request error for {website}: {e}") 99 | return "⚪", 0 100 | except Exception as e: 101 | logger.error(f"Unexpected error for {website}: {e}") 102 | return "⚪", 0 103 | -------------------------------------------------------------------------------- /checks/check_domain_expiration.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import whois 3 | import logging 4 | import re 5 | from urllib.parse import urlparse 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_domain_expiration(domain: str) -> str: 10 | """ 11 | Check the expiration date of a domain. 12 | 13 | Args: 14 | domain (str): The domain name to be checked. 15 | 16 | Returns: 17 | str: 18 | - "🟢 (X days left)" if the domain has more than 90 days to expire. 19 | - "🟡 (X days left)" if the domain has between 30 to 90 days to expire. 20 | - "🟠 (X days left)" if the domain has between 15 to 30 days to expire. 21 | - "🔴 (X days left)" if the domain has less than 15 days to expire. 22 | - "⚪" for other errors. 
23 | """ 24 | # Input validation and normalization 25 | if not domain: 26 | logger.error("Domain is required") 27 | return "⚪" 28 | 29 | # Normalize domain 30 | domain = domain.lower().strip() 31 | domain = re.sub(r'^https?://', '', domain) 32 | domain = re.sub(r'^www\.', '', domain) 33 | domain = domain.split('/')[0] # Remove path if present 34 | domain = domain.split(':')[0] # Remove port if present 35 | 36 | # Validate domain format 37 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9.-]*[a-zA-Z0-9]$', domain): 38 | logger.error(f"Invalid domain format: {domain}") 39 | return "⚪" 40 | 41 | def get_days_to_expire(exp_date): 42 | """Calculate the days remaining for expiration.""" 43 | if not exp_date: 44 | return None 45 | 46 | # Handle list of dates (some registrars return multiple dates) 47 | if isinstance(exp_date, list): 48 | # Use the earliest expiration date 49 | exp_date = min(exp_date) 50 | 51 | if isinstance(exp_date, str): 52 | try: 53 | # Try to parse string dates 54 | exp_date = datetime.strptime(exp_date, '%Y-%m-%d %H:%M:%S') 55 | except ValueError: 56 | try: 57 | exp_date = datetime.strptime(exp_date, '%Y-%m-%d') 58 | except ValueError: 59 | return None 60 | 61 | return (exp_date - datetime.now()).days 62 | 63 | try: 64 | # Fetch WHOIS data for the domain with timeout 65 | logger.info(f"Fetching WHOIS data for {domain}") 66 | w = whois.whois(domain) 67 | 68 | if not w: 69 | logger.error(f"No WHOIS data returned for {domain}") 70 | return "⚪" 71 | 72 | # Enhanced detection patterns 73 | expiration_date = w.expiration_date 74 | creation_date = w.creation_date 75 | 76 | days_to_expire = get_days_to_expire(expiration_date) 77 | 78 | if days_to_expire is None: 79 | logger.error(f"Could not retrieve or parse expiration date for {domain}") 80 | return "⚪" 81 | 82 | # Log additional domain information 83 | if creation_date: 84 | creation_days = get_days_to_expire(creation_date) 85 | if creation_days: 86 | domain_age = abs(creation_days) 87 | logger.info(f"Domain {domain} is {domain_age} days old") 88 | 89 | # Improved scoring and categorization 90 | if days_to_expire < 0: 91 | logger.critical(f"Domain {domain} has already expired {abs(days_to_expire)} days ago!") 92 | return f"🔴 (expired {abs(days_to_expire)} days ago)" 93 | elif days_to_expire < 15: 94 | logger.critical(f"Domain {domain} expires in {days_to_expire} days - URGENT!") 95 | return f"🔴 ({days_to_expire} days left)" 96 | elif days_to_expire < 30: 97 | logger.warning(f"Domain {domain} expires in {days_to_expire} days - action needed soon") 98 | return f"🟠 ({days_to_expire} days left)" 99 | elif days_to_expire < 90: 100 | logger.info(f"Domain {domain} expires in {days_to_expire} days - consider renewal") 101 | return f"🟡 ({days_to_expire} days left)" 102 | else: 103 | logger.info(f"Domain {domain} expires in {days_to_expire} days - safe") 104 | return f"🟢 ({days_to_expire} days left)" 105 | 106 | except whois.parser.PywhoisError as e: 107 | logger.error(f"WHOIS parsing error for {domain}: {e}") 108 | return "⚪" 109 | except Exception as e: 110 | logger.error(f"Unexpected error while checking domain expiration for {domain}: {e}") 111 | return "⚪" 112 | -------------------------------------------------------------------------------- /checks/check_redirect_chains.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urljoin 5 | 6 | # Configure logging 7 | 
logging.basicConfig(level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_redirect_chains(website: str) -> str: 11 | """ 12 | Check the number of redirects that a website triggers with enhanced security analysis. 13 | 14 | Args: 15 | website (str): The URL of the website to check. 16 | 17 | Returns: 18 | str: 19 | - "🟢" if no redirects or optimal redirect pattern. 20 | - "🟠" if there's one redirect or acceptable chain. 21 | - "🔴" if multiple redirects or security issues. 22 | - "⚪" in case of an error. 23 | """ 24 | # Input validation and URL normalization 25 | if not website or not isinstance(website, str): 26 | logger.error(f"Invalid website input: {website}") 27 | return "⚪" 28 | 29 | website = website.strip() 30 | if not website.startswith(('http://', 'https://')): 31 | website = f"https://{website}" 32 | 33 | headers = { 34 | "User-Agent": "RedirectChainChecker/2.0" 35 | } 36 | 37 | try: 38 | redirect_count = 0 39 | redirect_chain = [] 40 | current_url = website 41 | visited_urls = set() 42 | max_redirects = 10 # Prevent infinite loops 43 | 44 | while redirect_count < max_redirects: 45 | # Prevent redirect loops 46 | if current_url in visited_urls: 47 | logger.warning(f"Redirect loop detected for {website}") 48 | return "🔴" 49 | 50 | visited_urls.add(current_url) 51 | response = requests.get(current_url, headers=headers, allow_redirects=False, timeout=15) 52 | 53 | # Check if there's a redirect 54 | if response.status_code in [301, 302, 303, 307, 308]: 55 | redirect_location = response.headers.get('location', '') 56 | if not redirect_location: 57 | logger.warning(f"Empty redirect location for {current_url}") 58 | break 59 | 60 | redirect_count += 1 61 | redirect_chain.append({ 62 | 'from': current_url, 63 | 'to': redirect_location, 64 | 'status': response.status_code 65 | }) 66 | 67 | # Handle relative URLs 68 | if not redirect_location.startswith(('http://', 'https://')): 69 | redirect_location = urljoin(current_url, redirect_location) 70 | 71 | current_url = redirect_location 72 | logger.debug(f"Redirect {redirect_count}: {response.status_code} -> {redirect_location}") 73 | else: 74 | # No more redirects 75 | break 76 | 77 | logger.info(f"Redirect analysis for {website}: {redirect_count} redirects found") 78 | 79 | if redirect_chain: 80 | logger.debug(f"Redirect chain: {redirect_chain}") 81 | 82 | # Enhanced evaluation 83 | if redirect_count == 0: 84 | logger.info(f"No redirects found for {website}") 85 | return "🟢" 86 | elif redirect_count == 1: 87 | # Check if it's a good redirect (HTTP to HTTPS) 88 | if (redirect_chain[0]['from'].startswith('http://') and 89 | redirect_chain[0]['to'].startswith('https://') and 90 | redirect_chain[0]['status'] in [301, 308]): 91 | logger.info(f"Single secure redirect found for {website}") 92 | return "🟢" 93 | else: 94 | logger.info(f"Single redirect found for {website}") 95 | return "🟠" 96 | elif redirect_count <= 3: 97 | logger.warning(f"Multiple redirects ({redirect_count}) detected for {website}") 98 | return "🟠" 99 | else: 100 | logger.warning(f"Excessive redirects ({redirect_count}) detected for {website}") 101 | return "🔴" 102 | 103 | except (Timeout, HTTPError) as e: 104 | logger.warning(f"HTTP/Timeout error while checking redirect chains for {website}: {e}") 105 | return "⚪" 106 | except RequestException as e: 107 | logger.warning(f"Request error while checking redirect chains for {website}: {e}") 108 | return "⚪" 109 | except Exception as e: 110 | logger.error(f"Unexpected error while checking redirect chains for 
{website}: {e}") 111 | return "⚪" 112 | -------------------------------------------------------------------------------- /scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Website Monitor Scheduler 4 | Runs the monitoring checks at regular intervals in Docker environment. 5 | """ 6 | 7 | import time 8 | import subprocess 9 | import os 10 | import logging 11 | import signal 12 | import sys 13 | from datetime import datetime 14 | 15 | # Configure logging 16 | logging.basicConfig( 17 | level=logging.INFO, 18 | format='%(asctime)s - %(levelname)s - %(message)s', 19 | handlers=[ 20 | logging.StreamHandler(sys.stdout), 21 | logging.FileHandler('/app/logs/scheduler.log') 22 | ] 23 | ) 24 | logger = logging.getLogger(__name__) 25 | 26 | class MonitorScheduler: 27 | def __init__(self): 28 | self.interval = int(os.getenv('MONITOR_INTERVAL', 3600)) # Default: 1 hour 29 | self.running = True 30 | 31 | # Set up signal handlers for graceful shutdown 32 | signal.signal(signal.SIGTERM, self.handle_signal) 33 | signal.signal(signal.SIGINT, self.handle_signal) 34 | 35 | def handle_signal(self, signum, frame): 36 | """Handle shutdown signals gracefully.""" 37 | logger.info(f"Received signal {signum}, shutting down gracefully...") 38 | self.running = False 39 | 40 | def run_monitoring(self): 41 | """Execute the monitoring script.""" 42 | try: 43 | logger.info('Starting website monitoring check...') 44 | start_time = datetime.now() 45 | 46 | # Run the main monitoring script 47 | result = subprocess.run( 48 | ['python', 'main.py'], 49 | capture_output=True, 50 | text=True, 51 | timeout=1800 # 30 minute timeout 52 | ) 53 | 54 | end_time = datetime.now() 55 | execution_time = (end_time - start_time).total_seconds() 56 | 57 | if result.returncode == 0: 58 | logger.info(f'Monitoring completed successfully in {execution_time:.2f} seconds') 59 | if result.stdout: 60 | logger.debug(f'Output: {result.stdout}') 61 | else: 62 | logger.error(f'Monitoring failed with exit code {result.returncode}') 63 | if result.stderr: 64 | logger.error(f'Error output: {result.stderr}') 65 | if result.stdout: 66 | logger.info(f'Standard output: {result.stdout}') 67 | 68 | except subprocess.TimeoutExpired: 69 | logger.error('Monitoring timed out after 30 minutes') 70 | except Exception as e: 71 | logger.error(f'Error running monitoring: {e}') 72 | 73 | def start(self): 74 | """Start the scheduler main loop.""" 75 | logger.info(f'🚀 Starting Website Monitor Scheduler') 76 | logger.info(f'📅 Monitoring interval: {self.interval} seconds ({self.interval/3600:.1f} hours)') 77 | logger.info(f'📁 Working directory: {os.getcwd()}') 78 | logger.info(f'🐍 Python version: {sys.version}') 79 | 80 | # Run initial monitoring check 81 | logger.info('Running initial monitoring check...') 82 | self.run_monitoring() 83 | 84 | # Main scheduling loop 85 | while self.running: 86 | try: 87 | logger.info(f'⏰ Waiting {self.interval} seconds until next monitoring run...') 88 | 89 | # Sleep in small intervals to allow for graceful shutdown 90 | sleep_remaining = self.interval 91 | while sleep_remaining > 0 and self.running: 92 | sleep_time = min(60, sleep_remaining) # Sleep max 60 seconds at a time 93 | time.sleep(sleep_time) 94 | sleep_remaining -= sleep_time 95 | 96 | if self.running: 97 | self.run_monitoring() 98 | 99 | except KeyboardInterrupt: 100 | logger.info('Scheduler interrupted by user') 101 | break 102 | except Exception as e: 103 | logger.error(f'Unexpected error in 
scheduler: {e}') 104 | time.sleep(60) # Wait a minute before trying again 105 | 106 | logger.info('📊 Website Monitor Scheduler stopped') 107 | 108 | def main(): 109 | """Main entry point for the scheduler.""" 110 | # Ensure logs directory exists 111 | os.makedirs('/app/logs', exist_ok=True) 112 | 113 | try: 114 | scheduler = MonitorScheduler() 115 | scheduler.start() 116 | except Exception as e: 117 | logger.error(f'Failed to start scheduler: {e}') 118 | sys.exit(1) 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /checks/check_ssl_cipher_strength.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import ssl 3 | import logging 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | # Updated cipher classifications based on current security standards 10 | STRONG_CIPHERS = { 11 | 'ECDHE-RSA-AES128-GCM-SHA256', 'ECDHE-RSA-AES256-GCM-SHA384', 12 | 'ECDHE-ECDSA-AES128-GCM-SHA256', 'ECDHE-ECDSA-AES256-GCM-SHA384', 13 | 'TLS_AES_128_GCM_SHA256', 'TLS_AES_256_GCM_SHA384', 14 | 'TLS_CHACHA20_POLY1305_SHA256', 'ECDHE-RSA-CHACHA20-POLY1305', 15 | 'ECDHE-ECDSA-CHACHA20-POLY1305' 16 | } 17 | 18 | MODERATE_CIPHERS = { 19 | 'ECDHE-RSA-AES128-SHA', 'ECDHE-RSA-AES256-SHA', 20 | 'ECDHE-ECDSA-AES128-SHA', 'ECDHE-ECDSA-AES256-SHA', 21 | 'ECDHE-RSA-AES128-SHA256', 'ECDHE-RSA-AES256-SHA384' 22 | } 23 | 24 | WEAK_CIPHERS = { 25 | 'RC4', 'DES', '3DES', 'MD5', 'SHA1', 'NULL' 26 | } 27 | 28 | def check_ssl_cipher_strength(website: str) -> str: 29 | """ 30 | Check the strength of the SSL/TLS cipher suite of the website with enhanced analysis. 31 | 32 | Args: 33 | website (str): URL of the website to be checked. 
34 | 35 | Returns: 36 | str: 37 | - "🟢" if the cipher strength is strong 38 | - "🟠" if the cipher strength is moderate 39 | - "🔴" if the cipher strength is weak 40 | - "⚪" for any errors 41 | """ 42 | # Input validation and hostname extraction 43 | if not website or not isinstance(website, str): 44 | logger.error(f"Invalid website input: {website}") 45 | return "⚪" 46 | 47 | website = website.strip() 48 | 49 | # Extract hostname from URL 50 | if website.startswith(('http://', 'https://')): 51 | hostname = website.split('//')[1].split('/')[0].split(':')[0] 52 | else: 53 | hostname = website.split('/')[0].split(':')[0] 54 | 55 | try: 56 | # Create enhanced SSL context 57 | context = ssl.create_default_context() 58 | context.check_hostname = True 59 | context.verify_mode = ssl.CERT_REQUIRED 60 | 61 | # Create connection with timeout 62 | with socket.create_connection((hostname, 443), timeout=15) as sock: 63 | with context.wrap_socket(sock, server_hostname=hostname) as ssock: 64 | # Get comprehensive SSL information 65 | cipher_info = ssock.cipher() 66 | protocol_version = ssock.version() 67 | cert = ssock.getpeercert() 68 | 69 | if not cipher_info: 70 | logger.warning(f"No cipher information available for {hostname}") 71 | return "⚪" 72 | 73 | cipher_name = cipher_info[0] 74 | cipher_protocol = cipher_info[1] 75 | cipher_bits = cipher_info[2] 76 | 77 | logger.info(f"SSL analysis for {hostname}: {cipher_name}, {protocol_version}, {cipher_bits} bits") 78 | 79 | # Enhanced cipher strength analysis 80 | cipher_upper = cipher_name.upper() 81 | 82 | # Check for weak indicators first 83 | if any(weak in cipher_upper for weak in WEAK_CIPHERS): 84 | logger.warning(f"Weak cipher components detected: {cipher_name}") 85 | return "🔴" 86 | 87 | # Check protocol version 88 | if protocol_version in ['TLSv1.3']: 89 | logger.info(f"Excellent protocol version: {protocol_version}") 90 | return "🟢" 91 | elif protocol_version in ['TLSv1.2']: 92 | # For TLS 1.2, check specific cipher 93 | if cipher_name in STRONG_CIPHERS: 94 | logger.info(f"Strong cipher with TLS 1.2: {cipher_name}") 95 | return "🟢" 96 | elif cipher_name in MODERATE_CIPHERS: 97 | logger.info(f"Moderate cipher with TLS 1.2: {cipher_name}") 98 | return "🟠" 99 | else: 100 | logger.warning(f"Unknown/weak cipher with TLS 1.2: {cipher_name}") 101 | return "🔴" 102 | elif protocol_version in ['TLSv1.1', 'TLSv1']: 103 | logger.warning(f"Outdated protocol version: {protocol_version}") 104 | return "🔴" 105 | else: 106 | logger.warning(f"Unknown protocol version: {protocol_version}") 107 | return "🔴" 108 | 109 | except socket.timeout: 110 | logger.warning(f"Connection timeout for {hostname}") 111 | return "⚪" 112 | except ssl.SSLError as ssl_err: 113 | logger.warning(f"SSL error for {hostname}: {ssl_err}") 114 | return "⚪" 115 | except socket.error as sock_err: 116 | logger.warning(f"Socket error for {hostname}: {sock_err}") 117 | return "⚪" 118 | except Exception as e: 119 | logger.error(f"Unexpected error for {hostname}: {e}") 120 | return "⚪" 121 | -------------------------------------------------------------------------------- /checks/check_url_canonicalization.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urlparse, urljoin, urlunparse 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 
10 | 11 | def check_url_canonicalization(website: str) -> str: 12 | """ 13 | Check if the given website uses a canonical link element to avoid potential duplicate content issues. 14 | 15 | Args: 16 | website (str): The URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if a correct canonical link element is found. 21 | - "🟠" if canonical link exists but has minor issues. 22 | - "🔴" if no canonical link or major issues found. 23 | - "⚪" on errors. 24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪" 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Make request with proper error handling 40 | response = requests.get(website, headers=headers, timeout=15) 41 | response.raise_for_status() 42 | 43 | # Parse HTML content 44 | soup = BeautifulSoup(response.text, 'html.parser') 45 | canonical_tags = soup.find_all('link', {'rel': 'canonical'}) 46 | 47 | if not canonical_tags: 48 | logger.warning(f"No canonical link found for {website}") 49 | return "🔴" 50 | 51 | if len(canonical_tags) > 1: 52 | logger.warning(f"Multiple canonical links found for {website}") 53 | return "🟠" # Multiple canonicals can be problematic 54 | 55 | canonical_tag = canonical_tags[0] 56 | canonical_href = canonical_tag.get('href') 57 | 58 | if not canonical_href: 59 | logger.warning(f"Empty canonical href for {website}") 60 | return "🔴" 61 | 62 | # Normalize URLs for comparison 63 | def normalize_url(url): 64 | parsed = urlparse(url) 65 | # Remove fragment, normalize path 66 | normalized = urlunparse(( 67 | parsed.scheme.lower(), 68 | parsed.netloc.lower(), 69 | parsed.path.rstrip('/') or '/', 70 | parsed.params, 71 | parsed.query, 72 | '' # Remove fragment 73 | )) 74 | return normalized 75 | 76 | # Handle relative canonical URLs 77 | if canonical_href.startswith(('http://', 'https://')): 78 | canonical_url = canonical_href 79 | else: 80 | canonical_url = urljoin(website, canonical_href) 81 | 82 | normalized_website = normalize_url(website) 83 | normalized_canonical = normalize_url(canonical_url) 84 | 85 | logger.info(f"Canonical analysis for {website}: canonical={canonical_url}") 86 | 87 | # Enhanced validation 88 | if normalized_canonical == normalized_website: 89 | logger.info(f"Perfect canonical match for {website}") 90 | return "🟢" 91 | 92 | # Check if canonical points to a valid variation (e.g., with/without www) 93 | website_parsed = urlparse(normalized_website) 94 | canonical_parsed = urlparse(normalized_canonical) 95 | 96 | if (website_parsed.netloc.replace('www.', '') == canonical_parsed.netloc.replace('www.', '') and 97 | website_parsed.path == canonical_parsed.path): 98 | logger.info(f"Canonical points to valid domain variation for {website}") 99 | return "🟢" 100 | 101 | # Check if it's the same domain but different path (might be intentional) 102 | if website_parsed.netloc == canonical_parsed.netloc: 103 | logger.warning(f"Canonical points to different path on same domain for {website}") 104 | return "🟠" 105 | 106 | logger.warning(f"Canonical points to different domain for {website}") 107 | return "🔴" 108 | 109 | except (Timeout, HTTPError) as e: 110 | logger.warning(f"HTTP/Timeout error for {website}: 
{e}") 111 | return "⚪" 112 | except RequestException as e: 113 | logger.warning(f"Request error for {website}: {e}") 114 | return "⚪" 115 | except Exception as e: 116 | logger.error(f"Unexpected error for {website}: {e}") 117 | return "⚪" 118 | -------------------------------------------------------------------------------- /checks/check_dnssec.py: -------------------------------------------------------------------------------- 1 | import dns.resolver 2 | import dns.dnssec 3 | import dns.query 4 | import dns.name 5 | import dns.rdatatype 6 | import logging 7 | import re 8 | from urllib.parse import urlparse 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def check_dnssec(domain: str) -> str: 13 | """ 14 | Check if a domain supports DNSSEC (Domain Name System Security Extensions). 15 | 16 | Args: 17 | domain (str): The domain name to be checked. 18 | 19 | Returns: 20 | str: 21 | - "🟢" if the domain supports DNSSEC properly. 22 | - "🟡" if DNSSEC is partially configured. 23 | - "🔴" if the domain does not support DNSSEC or there's a DNSSEC-related error. 24 | - "⚪" for other errors. 25 | """ 26 | # Input validation and normalization 27 | if not domain: 28 | logger.error("Domain is required") 29 | return "⚪" 30 | 31 | # Normalize domain 32 | domain = domain.lower().strip() 33 | domain = re.sub(r'^https?://', '', domain) 34 | domain = re.sub(r'^www\.', '', domain) 35 | domain = domain.split('/')[0] # Remove path if present 36 | domain = domain.split(':')[0] # Remove port if present 37 | 38 | # Validate domain format 39 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9.-]*[a-zA-Z0-9]$', domain): 40 | logger.error(f"Invalid domain format: {domain}") 41 | return "⚪" 42 | 43 | try: 44 | # Convert domain to DNS name object 45 | domain_name = dns.name.from_text(domain) 46 | 47 | # Enhanced detection patterns 48 | dnssec_indicators = [] 49 | 50 | # Check for DNSKEY records 51 | try: 52 | dnskey_query = dns.resolver.resolve(domain_name, 'DNSKEY', tcp=True) 53 | if dnskey_query: 54 | dnssec_indicators.append("DNSKEY records found") 55 | logger.info(f"Found {len(dnskey_query)} DNSKEY records for {domain}") 56 | except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN): 57 | logger.warning(f"No DNSKEY records found for {domain}") 58 | 59 | # Check for DS records in parent zone 60 | try: 61 | ds_query = dns.resolver.resolve(domain_name, 'DS', tcp=True) 62 | if ds_query: 63 | dnssec_indicators.append("DS records found") 64 | logger.info(f"Found {len(ds_query)} DS records for {domain}") 65 | except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN): 66 | logger.warning(f"No DS records found for {domain}") 67 | 68 | # Check for RRSIG records (signature records) 69 | try: 70 | rrsig_query = dns.resolver.resolve(domain_name, 'RRSIG', tcp=True) 71 | if rrsig_query: 72 | dnssec_indicators.append("RRSIG records found") 73 | logger.info(f"Found RRSIG records for {domain}") 74 | except (dns.resolver.NoAnswer, dns.resolver.NXDOMAIN): 75 | logger.warning(f"No RRSIG records found for {domain}") 76 | 77 | # Fallback mechanism - check for any DNSSEC records 78 | if not dnssec_indicators: 79 | # Try checking A record with DNSSEC validation 80 | try: 81 | resolver = dns.resolver.Resolver() 82 | resolver.use_edns(0, dns.flags.DO, 4096) # Enable DNSSEC 83 | a_query = resolver.resolve(domain_name, 'A') 84 | # If we get here, DNS works but DNSSEC might not be configured 85 | logger.info(f"DNS resolution works for {domain} but no DNSSEC indicators found") 86 | except Exception: 87 | pass 88 | 89 | # Improved scoring and categorization 
90 | if len(dnssec_indicators) >= 2: 91 | logger.info(f"Strong DNSSEC configuration for {domain}: {', '.join(dnssec_indicators)}") 92 | return "🟢" 93 | elif len(dnssec_indicators) == 1: 94 | logger.warning(f"Partial DNSSEC configuration for {domain}: {dnssec_indicators[0]}") 95 | return "🟡" 96 | else: 97 | logger.warning(f"No DNSSEC configuration found for {domain}") 98 | return "🔴" 99 | 100 | except dns.resolver.NoAnswer: 101 | logger.warning(f"No DNS answer received for {domain} - domain might not exist or have DNS issues") 102 | return "⚪" 103 | except dns.resolver.NoNameservers: 104 | logger.error(f"No name servers available for {domain}") 105 | return "⚪" 106 | except dns.resolver.NXDOMAIN: 107 | logger.error(f"Domain {domain} does not exist") 108 | return "⚪" 109 | except dns.resolver.Timeout: 110 | logger.error(f"DNS request timeout while checking DNSSEC for {domain}") 111 | return "⚪" 112 | except dns.dnssec.ValidationFailure as e: 113 | logger.error(f"DNSSEC validation failure for {domain}: {e}") 114 | return "🔴" 115 | except Exception as e: 116 | logger.error(f"Unexpected error while checking DNSSEC for {domain}: {e}") 117 | return "⚪" 118 | -------------------------------------------------------------------------------- /checks/check_favicon.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from requests.exceptions import RequestException, HTTPError 5 | from urllib.parse import urlparse, urljoin 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_favicon(website: str) -> str: 10 | """ 11 | Check if the website has a valid favicon. 12 | 13 | Args: 14 | website (str): URL of the website to be checked. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if a valid favicon is found. 19 | - "🔴" if no valid favicon is found. 20 | - "⚪" if an error occurred during the check. 21 | """ 22 | # Input validation and URL normalization 23 | if not website: 24 | logger.error("Website URL is required") 25 | return "⚪" 26 | 27 | if not website.startswith(('http://', 'https://')): 28 | website = f"https://{website}" 29 | 30 | try: 31 | parsed_url = urlparse(website) 32 | if not parsed_url.netloc: 33 | logger.error(f"Invalid URL format: {website}") 34 | return "⚪" 35 | base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" 36 | except Exception as e: 37 | logger.error(f"URL parsing error for {website}: {e}") 38 | return "⚪" 39 | 40 | headers = { 41 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36' 42 | } 43 | 44 | def check_favicon_url(url): 45 | """Helper function to check if a favicon URL is valid""" 46 | try: 47 | response = requests.head(url, headers=headers, timeout=10, allow_redirects=True) 48 | return response.status_code == 200 49 | except: 50 | try: 51 | response = requests.get(url, headers=headers, timeout=10, stream=True) 52 | return response.status_code == 200 and len(response.content) > 0 53 | except: 54 | return False 55 | 56 | try: 57 | # Enhanced detection patterns - multiple fallback mechanisms 58 | favicon_candidates = [] 59 | 60 | # 1. Check default favicon.ico location 61 | default_favicon = f"{base_url}/favicon.ico" 62 | if check_favicon_url(default_favicon): 63 | logger.info(f"Favicon found at default location: {default_favicon}") 64 | return "🟢" 65 | favicon_candidates.append(default_favicon) 66 | 67 | # 2. 
Parse HTML for favicon references 68 | try: 69 | response = requests.get(website, headers=headers, timeout=15) 70 | response.raise_for_status() 71 | soup = BeautifulSoup(response.text, 'html.parser') 72 | 73 | # Look for various favicon link types 74 | favicon_rels = ['icon', 'shortcut icon', 'apple-touch-icon', 'apple-touch-icon-precomposed'] 75 | 76 | for rel in favicon_rels: 77 | icons = soup.find_all('link', rel=lambda x: x and rel in x.lower() if x else False) 78 | for icon in icons: 79 | href = icon.get('href') 80 | if not href: 81 | continue 82 | 83 | # Normalize URL 84 | if href.startswith('//'): 85 | favicon_url = f"{parsed_url.scheme}:{href}" 86 | elif href.startswith('/'): 87 | favicon_url = f"{base_url}{href}" 88 | elif not href.startswith(('http://', 'https://')): 89 | favicon_url = urljoin(website, href) 90 | else: 91 | favicon_url = href 92 | 93 | favicon_candidates.append(favicon_url) 94 | 95 | if check_favicon_url(favicon_url): 96 | logger.info(f"Favicon found via HTML link tag: {favicon_url}") 97 | return "🟢" 98 | 99 | except Exception as e: 100 | logger.warning(f"Error parsing HTML for favicon on {website}: {e}") 101 | 102 | # 3. Try common alternative locations 103 | common_paths = ['/apple-touch-icon.png', '/icon.png', '/favicon.png'] 104 | for path in common_paths: 105 | favicon_url = f"{base_url}{path}" 106 | favicon_candidates.append(favicon_url) 107 | if check_favicon_url(favicon_url): 108 | logger.info(f"Favicon found at common location: {favicon_url}") 109 | return "🟢" 110 | 111 | logger.warning(f"No valid favicon found for {website}. Checked {len(set(favicon_candidates))} locations") 112 | return "🔴" 113 | 114 | except (HTTPError, RequestException) as e: 115 | logger.error(f"Request error while checking favicon for {website}: {e}") 116 | return "⚪" 117 | except Exception as e: 118 | logger.error(f"Unexpected error while checking favicon for {website}: {e}") 119 | return "⚪" 120 | -------------------------------------------------------------------------------- /checks/check_domainsblacklists_blacklist.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from urllib.parse import urlparse 5 | import re 6 | import hashlib 7 | import time 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | # Simple cache to avoid repeated downloads 12 | _blacklist_cache = { 13 | 'data': None, 14 | 'timestamp': 0, 15 | 'ttl': 3600 # Cache for 1 hour 16 | } 17 | 18 | def check_domainsblacklists_blacklist(domain: str) -> str: 19 | """ 20 | Check if a domain is present in a large blacklist file hosted online. 21 | 22 | Args: 23 | domain (str): The domain to check against the blacklist. 
24 | 25 | Returns: 26 | str: 27 | - "🔴" if the domain is found in the blacklist 28 | - "🟢" if the domain is not found in the blacklist 29 | - "⚪" if an error occurs 30 | """ 31 | # Input validation and normalization 32 | if not domain: 33 | logger.error("Domain is required") 34 | return "⚪" 35 | 36 | # Normalize domain 37 | domain = domain.lower().strip() 38 | domain = re.sub(r'^https?://', '', domain) 39 | domain = re.sub(r'^www\.', '', domain) 40 | domain = domain.split('/')[0] # Remove path if present 41 | domain = domain.split(':')[0] # Remove port if present 42 | 43 | # Validate domain format 44 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9.-]*[a-zA-Z0-9]$', domain): 45 | logger.error(f"Invalid domain format: {domain}") 46 | return "⚪" 47 | 48 | url = "https://github.com/fabriziosalmi/blacklists/releases/download/latest/blacklist.txt" 49 | 50 | headers = { 51 | 'User-Agent': 'DomainBlacklistChecker/2.0', 52 | 'Accept-Encoding': 'gzip, deflate' 53 | } 54 | 55 | try: 56 | # Check cache first for performance optimization 57 | current_time = time.time() 58 | if _blacklist_cache['data'] and (current_time - _blacklist_cache['timestamp'] < _blacklist_cache['ttl']): 59 | logger.debug("Using cached blacklist data") 60 | blacklist_set = _blacklist_cache['data'] 61 | else: 62 | logger.info("Downloading fresh blacklist data") 63 | # Stream the response to handle large files efficiently 64 | response = requests.get(url, headers=headers, stream=True, timeout=60) 65 | response.raise_for_status() 66 | 67 | # Build a set for O(1) lookup performance 68 | blacklist_set = set() 69 | line_count = 0 70 | 71 | for line in response.iter_lines(decode_unicode=True): 72 | # Ensure line is a string and handle properly 73 | if line and isinstance(line, str): 74 | if not line.startswith('#'): # Skip comments 75 | cleaned_line = line.strip().lower() 76 | if cleaned_line: 77 | blacklist_set.add(cleaned_line) 78 | line_count += 1 79 | elif line and isinstance(line, bytes): 80 | # Handle bytes if somehow we get them 81 | line_str = line.decode('utf-8', errors='ignore') 82 | if not line_str.startswith('#'): 83 | cleaned_line = line_str.strip().lower() 84 | if cleaned_line: 85 | blacklist_set.add(cleaned_line) 86 | line_count += 1 87 | 88 | # Update cache 89 | _blacklist_cache['data'] = blacklist_set 90 | _blacklist_cache['timestamp'] = current_time 91 | 92 | logger.info(f"Loaded {line_count} domains into blacklist") 93 | 94 | # Enhanced detection patterns - check domain and subdomains 95 | domains_to_check = [domain] 96 | 97 | # Add parent domains for subdomain checking 98 | parts = domain.split('.') 99 | for i in range(1, len(parts)): 100 | parent_domain = '.'.join(parts[i:]) 101 | if len(parent_domain) > 3: # Avoid checking TLDs 102 | domains_to_check.append(parent_domain) 103 | 104 | # Check all domain variants 105 | for check_domain in domains_to_check: 106 | if check_domain in blacklist_set: 107 | logger.warning(f"Domain {check_domain} found in blacklist (original: {domain})") 108 | return "🔴" 109 | 110 | logger.info(f"Domain {domain} not found in blacklist") 111 | return "🟢" 112 | 113 | except (Timeout, HTTPError) as e: 114 | logger.error(f"HTTP error while checking domain {domain} against blacklist: {e}") 115 | return "⚪" 116 | except RequestException as e: 117 | logger.error(f"Request error while checking domain {domain} against blacklist: {e}") 118 | return "⚪" 119 | except Exception as e: 120 | logger.error(f"Unexpected error while checking domain {domain} against blacklist: {e}") 121 | return "⚪" 122 | 
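A minimal usage sketch for the check functions dumped above (an illustration, not a file in the repository): it assumes the scripts are run from the repository root so that the checks directory is importable as a package, and it simply prints the emoji status each check returns for a single domain taken from config.yaml.

# Hypothetical driver snippet; the package-style imports below assume the
# checks/ directory is on the import path (e.g., running from the repo root).
from checks.check_redirect_chains import check_redirect_chains
from checks.check_dnssec import check_dnssec
from checks.check_domainsblacklists_blacklist import check_domainsblacklists_blacklist

domain = "example.com"  # one of the sites listed in config.yaml

# Each check returns a status emoji: "🟢" (pass), "🟠"/"🟡" (warning),
# "🔴" (fail), or "⚪" (error), as documented in the docstrings above.
print("Redirect chains:", check_redirect_chains(domain))
print("DNSSEC:", check_dnssec(domain))
print("Blacklist:", check_domainsblacklists_blacklist(domain))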
-------------------------------------------------------------------------------- /checks/check_security_headers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_security_headers(website: str) -> str: 10 | """ 11 | Check for the presence and correct implementation of recommended security headers on a website. 12 | 13 | Args: 14 | website (str): URL of the website to be checked. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if all recommended headers are properly implemented. 19 | - "🟠" if headers are present but not all are ideally implemented. 20 | - "🔴" if some recommended headers are missing. 21 | - "⚪" for any errors. 22 | """ 23 | # Input validation and URL normalization 24 | if not website or not isinstance(website, str): 25 | logger.error(f"Invalid website input: {website}") 26 | return "⚪" 27 | 28 | website = website.strip() 29 | if not website.startswith(('http://', 'https://')): 30 | website = f"https://{website}" 31 | 32 | headers = { 33 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 34 | } 35 | 36 | # Enhanced recommended security headers with scoring 37 | security_headers = { 38 | 'X-Content-Type-Options': {'expected': 'nosniff', 'weight': 2}, 39 | 'X-XSS-Protection': {'expected': '1; mode=block', 'weight': 2}, 40 | 'Strict-Transport-Security': {'expected': None, 'weight': 3}, 41 | 'Content-Security-Policy': {'expected': None, 'weight': 3}, 42 | 'Referrer-Policy': {'expected': None, 'weight': 1}, 43 | 'Permissions-Policy': {'expected': None, 'weight': 1}, 44 | 'X-Frame-Options': {'expected': ['DENY', 'SAMEORIGIN'], 'weight': 2} 45 | } 46 | 47 | try: 48 | # Make request with proper error handling 49 | response = requests.get(website, headers=headers, timeout=15) 50 | response.raise_for_status() 51 | 52 | # Analyze security headers 53 | total_score = 0 54 | max_score = sum(header_info['weight'] for header_info in security_headers.values()) 55 | issues = [] 56 | 57 | for header, config in security_headers.items(): 58 | header_value = response.headers.get(header) 59 | expected = config['expected'] 60 | weight = config['weight'] 61 | 62 | if header_value: 63 | if expected is None: 64 | # Header present, that's good enough 65 | total_score += weight 66 | logger.debug(f"Security header {header} present: {header_value}") 67 | elif isinstance(expected, list): 68 | # Check if value is in expected list 69 | if any(exp in header_value for exp in expected): 70 | total_score += weight 71 | else: 72 | issues.append(f"{header} has unexpected value: {header_value}") 73 | total_score += weight * 0.5 # Partial credit 74 | elif expected.lower() in header_value.lower(): 75 | total_score += weight 76 | else: 77 | issues.append(f"{header} has non-ideal value: {header_value} (expected: {expected})") 78 | total_score += weight * 0.5 # Partial credit 79 | else: 80 | issues.append(f"Missing security header: {header}") 81 | 82 | # Check for information disclosure headers 83 | revealing_headers = { 84 | 'Server', 'X-Powered-By', 'X-AspNet-Version', 'X-Generator' 85 | } 86 | found_revealing = revealing_headers.intersection(response.headers.keys()) 87 | 88 | if found_revealing: 89 | issues.append(f"Information disclosure headers found: {', 
'.join(found_revealing)}") 90 | total_score -= 1 # Penalty for revealing headers 91 | 92 | # Calculate security score percentage 93 | security_score = max(0, total_score / max_score) 94 | 95 | logger.info(f"Security headers analysis for {website}: {security_score:.2f} score ({total_score}/{max_score})") 96 | 97 | if issues: 98 | logger.warning(f"Security issues found: {issues}") 99 | 100 | # Determine result based on security score 101 | if security_score >= 0.9: 102 | return "🟢" 103 | elif security_score >= 0.6: 104 | return "🟠" 105 | else: 106 | return "🔴" 107 | 108 | except (Timeout, HTTPError) as e: 109 | logger.warning(f"HTTP/Timeout error for {website}: {e}") 110 | return "⚪" 111 | except RequestException as e: 112 | logger.warning(f"Request error for {website}: {e}") 113 | return "⚪" 114 | except Exception as e: 115 | logger.error(f"Unexpected error for {website}: {e}") 116 | return "⚪" 117 | -------------------------------------------------------------------------------- /checks/check_semantic_markup.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from bs4 import FeatureNotFound 5 | from requests.exceptions import RequestException 6 | import json 7 | import re 8 | 9 | # Configure logging 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger(__name__) 12 | 13 | def check_semantic_markup(website): 14 | """ 15 | Check if the website contains semantic markup in the form of JSON-LD, Microdata, or RDFa. 16 | 17 | Args: 18 | website (str): The URL of the website to be checked. 19 | 20 | Returns: 21 | str: 22 | - "🟢" if comprehensive semantic markup is found 23 | - "🟠" if some semantic markup is found 24 | - "🔴" if no semantic markup is found 25 | - "⚪" if an error occurs 26 | """ 27 | # Input validation and URL normalization 28 | if not website or not isinstance(website, str): 29 | logger.error(f"Invalid website input: {website}") 30 | return "⚪" 31 | 32 | website = website.strip() 33 | if not website.startswith(('http://', 'https://')): 34 | website = f"https://{website}" 35 | 36 | headers = { 37 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 38 | } 39 | 40 | try: 41 | # Fetch website content 42 | response = requests.get(website, headers=headers, timeout=15) 43 | response.raise_for_status() 44 | html_content = response.text 45 | 46 | # Parse HTML content 47 | try: 48 | soup = BeautifulSoup(html_content, 'lxml') 49 | except FeatureNotFound: 50 | soup = BeautifulSoup(html_content, 'html.parser') 51 | 52 | markup_score = 0 53 | markup_types = [] 54 | 55 | # Method 1: Check for JSON-LD semantic markup 56 | json_ld_scripts = soup.find_all('script', type="application/ld+json") 57 | if json_ld_scripts: 58 | valid_json_ld = 0 59 | for script in json_ld_scripts: 60 | try: 61 | json_data = json.loads(script.string or '{}') 62 | if json_data and '@context' in json_data: 63 | valid_json_ld += 1 64 | logger.debug(f"Valid JSON-LD found: {json_data.get('@type', 'Unknown type')}") 65 | except (json.JSONDecodeError, AttributeError): 66 | continue 67 | 68 | if valid_json_ld > 0: 69 | markup_score += 3 # JSON-LD gets highest score 70 | markup_types.append(f"JSON-LD ({valid_json_ld} items)") 71 | 72 | # Method 2: Check for Microdata semantic markup 73 | microdata_elements = soup.find_all(attrs={"itemscope": True}) 74 | if microdata_elements: 75 | microdata_with_type = [elem for elem in 
microdata_elements if elem.get('itemtype')] 76 | if microdata_with_type: 77 | markup_score += 2 78 | markup_types.append(f"Microdata ({len(microdata_with_type)} items)") 79 | 80 | # Method 3: Check for RDFa semantic markup 81 | rdfa_vocab = soup.find_all(attrs={"vocab": True}) 82 | rdfa_typeof = soup.find_all(attrs={"typeof": True}) 83 | rdfa_property = soup.find_all(attrs={"property": True}) 84 | 85 | if rdfa_vocab or rdfa_typeof or rdfa_property: 86 | markup_score += 1 87 | rdfa_count = len(rdfa_vocab) + len(rdfa_typeof) + len(rdfa_property) 88 | markup_types.append(f"RDFa ({rdfa_count} attributes)") 89 | 90 | # Method 4: Check for Open Graph and Twitter Card markup 91 | og_tags = soup.find_all('meta', property=lambda x: x and x.startswith('og:')) 92 | twitter_tags = soup.find_all('meta', attrs={'name': lambda x: x and x.startswith('twitter:')}) 93 | 94 | if og_tags: 95 | markup_score += 1 96 | markup_types.append(f"Open Graph ({len(og_tags)} tags)") 97 | 98 | if twitter_tags: 99 | markup_score += 1 100 | markup_types.append(f"Twitter Cards ({len(twitter_tags)} tags)") 101 | 102 | # Method 5: Check for Schema.org patterns in class names 103 | schema_classes = soup.find_all(class_=re.compile(r'schema|hcard|vcard|geo|adr', re.IGNORECASE)) 104 | if schema_classes: 105 | markup_score += 1 106 | markup_types.append(f"Schema classes ({len(schema_classes)} elements)") 107 | 108 | logger.info(f"Semantic markup analysis for {website}: Score {markup_score}, Types: {', '.join(markup_types)}") 109 | 110 | # Determine result based on markup score and types 111 | if markup_score >= 4: 112 | return "🟢" # Comprehensive semantic markup 113 | elif markup_score >= 2: 114 | return "🟠" # Some semantic markup 115 | else: 116 | return "🔴" # No or minimal semantic markup 117 | 118 | except RequestException as e: 119 | logger.warning(f"Request error for {website}: {e}") 120 | return "⚪" 121 | except Exception as e: 122 | logger.error(f"Unexpected error for {website}: {e}") 123 | return "⚪" 124 | -------------------------------------------------------------------------------- /checks/check_subdomain_enumeration.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from urllib.parse import urlparse 4 | from requests.exceptions import RequestException, Timeout, HTTPError 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | import time 7 | 8 | # Configure logging 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | def check_subdomain_enumeration(website: str) -> tuple: 13 | """ 14 | Check for the existence of common subdomains for a given website with enhanced security analysis. 15 | 16 | Args: 17 | website (str): The main domain of the website to be checked. 18 | 19 | Returns: 20 | tuple: A status symbol and a list of discovered subdomains. 21 | - "🟢" if no potentially risky subdomains were discovered. 22 | - "🟠" if some subdomains were found but appear safe. 23 | - "🔴" if risky subdomains were found. 24 | - "⚪" for unexpected errors. 
25 | """ 26 | # Input validation and URL normalization 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪", [] 30 | 31 | # Extract domain from URL if full URL provided 32 | if website.startswith(('http://', 'https://')): 33 | parsed = urlparse(website) 34 | domain = parsed.netloc 35 | else: 36 | domain = website.strip() 37 | 38 | # Enhanced subdomain list with security-focused subdomains 39 | SUBDOMAINS = [ 40 | # Common subdomains 41 | "www", "api", "dev", "test", "staging", "mail", "blog", "shop", "admin", 42 | # Development/staging subdomains (potentially risky) 43 | "development", "stage", "beta", "alpha", "demo", "sandbox", 44 | # Infrastructure subdomains 45 | "cdn", "static", "assets", "media", "files", 46 | # Potentially sensitive subdomains 47 | "backup", "old", "legacy", "archive", "temp", "tmp", 48 | # Service subdomains 49 | "ftp", "ssh", "vpn", "remote", "portal" 50 | ] 51 | 52 | # Categorize subdomains by risk level 53 | RISKY_SUBDOMAINS = { 54 | "dev", "test", "staging", "development", "stage", "beta", "alpha", 55 | "demo", "sandbox", "backup", "old", "legacy", "archive", "temp", "tmp" 56 | } 57 | 58 | discovered_subdomains = [] 59 | risky_subdomains = [] 60 | 61 | headers = { 62 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 63 | } 64 | 65 | def check_subdomain(subdomain): 66 | """Helper function to check individual subdomain.""" 67 | subdomain_url = f"https://{subdomain}.{domain}" 68 | try: 69 | response = requests.get(subdomain_url, headers=headers, timeout=10, allow_redirects=True) 70 | if response.status_code == 200: 71 | logger.debug(f"Discovered subdomain: {subdomain_url}") 72 | return subdomain_url, subdomain in RISKY_SUBDOMAINS 73 | return None, False 74 | except (Timeout, HTTPError, RequestException): 75 | return None, False 76 | except Exception as e: 77 | logger.debug(f"Error checking {subdomain_url}: {e}") 78 | return None, False 79 | 80 | try: 81 | # Use ThreadPoolExecutor for concurrent subdomain checking 82 | with ThreadPoolExecutor(max_workers=10) as executor: 83 | # Submit all subdomain checks 84 | future_to_subdomain = { 85 | executor.submit(check_subdomain, sub): sub 86 | for sub in SUBDOMAINS 87 | } 88 | 89 | # Process results as they complete 90 | for future in as_completed(future_to_subdomain, timeout=60): 91 | subdomain = future_to_subdomain[future] 92 | try: 93 | result, is_risky = future.result() 94 | if result: 95 | discovered_subdomains.append(result) 96 | if is_risky: 97 | risky_subdomains.append(result) 98 | except Exception as e: 99 | logger.debug(f"Error processing result for {subdomain}: {e}") 100 | continue 101 | 102 | # Enhanced result analysis 103 | total_discovered = len(discovered_subdomains) 104 | total_risky = len(risky_subdomains) 105 | 106 | logger.info(f"Subdomain enumeration for {domain}: {total_discovered} discovered, {total_risky} potentially risky") 107 | 108 | if total_risky > 0: 109 | logger.warning(f"Risky subdomains found: {risky_subdomains}") 110 | return "🔴", discovered_subdomains 111 | elif total_discovered > 5: 112 | logger.warning(f"Multiple subdomains discovered for {domain}, potential attack surface") 113 | return "🟠", discovered_subdomains 114 | elif total_discovered > 0: 115 | logger.info(f"Few subdomains discovered for {domain}") 116 | return "🟠", discovered_subdomains 117 | else: 118 | logger.info(f"No subdomains discovered for {domain}") 119 | return "🟢", 
[] 120 | 121 | except Exception as e: 122 | logger.error(f"Unexpected error during subdomain enumeration for {domain}: {e}") 123 | return "⚪", [] 124 | -------------------------------------------------------------------------------- /checks/check_rate_limiting.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import logging 4 | from urllib.parse import urlparse, urlunparse 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def normalize_url(website): 12 | """ 13 | Normalize the website URL, ensuring it has a scheme. 14 | 15 | Args: 16 | - website (str): The URL of the website to normalize. 17 | 18 | Returns: 19 | - str: The normalized URL. 20 | """ 21 | if not website or not isinstance(website, str): 22 | raise ValueError("Invalid website input") 23 | 24 | website = website.strip() 25 | parsed_url = urlparse(website) 26 | 27 | if not parsed_url.scheme: 28 | normalized_url = urlunparse(('https', website, '', '', '', '')) 29 | else: 30 | normalized_url = website 31 | return normalized_url 32 | 33 | def check_rate_limiting(website: str, num_requests: int = 5, delay: float = 0.3, 34 | user_agent: str = "RateLimitChecker/2.0", threshold: int = 2) -> str: 35 | """ 36 | Checks for rate limiting using enhanced detection with varied delays and request patterns. 37 | 38 | Args: 39 | website (str): The URL of the website to check. 40 | num_requests (int): Number of requests to send for testing. 41 | delay (float): Initial delay in seconds between requests. 42 | user_agent (str): Custom User-Agent string for the requests. 43 | threshold (int): The maximum number of successful requests before assuming no rate limiting. 44 | 45 | Returns: 46 | str: "🟢", "🔴", or "⚪" based on the detection status. 
47 | """ 48 | headers = { 49 | "User-Agent": user_agent 50 | } 51 | 52 | # Normalize the URL 53 | try: 54 | website = normalize_url(website) 55 | except Exception as e: 56 | logger.error(f"Invalid URL format: {e}") 57 | return "⚪" 58 | 59 | status_codes = [] 60 | response_times = [] 61 | success_count = 0 62 | rate_limit_detected = False 63 | 64 | try: 65 | for i in range(num_requests): 66 | start_time = time.perf_counter() 67 | 68 | try: 69 | response = requests.get(website, headers=headers, timeout=15) 70 | end_time = time.perf_counter() 71 | 72 | response_time = end_time - start_time 73 | response_times.append(response_time) 74 | status_codes.append(response.status_code) 75 | 76 | # Check for rate limiting indicators 77 | if response.status_code == 429: 78 | logger.info(f"Rate limiting detected (429) for {website} after {i + 1} requests") 79 | rate_limit_detected = True 80 | break 81 | elif response.status_code in [503, 502, 504]: 82 | logger.warning(f"Server overload detected ({response.status_code}) for {website}") 83 | # Continue to see if it's consistent 84 | 85 | # Check for rate limiting headers 86 | rate_limit_headers = ['X-RateLimit-Limit', 'X-RateLimit-Remaining', 'Retry-After'] 87 | if any(header in response.headers for header in rate_limit_headers): 88 | logger.info(f"Rate limiting headers detected for {website}") 89 | rate_limit_detected = True 90 | break 91 | 92 | if response.status_code in [200, 201, 202, 203, 204, 205, 206]: 93 | success_count += 1 94 | 95 | # Adaptive delay based on response time 96 | elapsed_time = end_time - start_time 97 | adaptive_delay = max(delay, elapsed_time * 0.5) 98 | time_to_sleep = max(0, adaptive_delay - elapsed_time) 99 | 100 | if i < num_requests - 1: 101 | time.sleep(time_to_sleep) 102 | 103 | except (Timeout, HTTPError) as e: 104 | logger.debug(f"Request {i + 1} failed for {website}: {e}") 105 | status_codes.append(0) # Indicate failure 106 | time.sleep(delay * 2) # Longer delay after failure 107 | 108 | # Enhanced analysis 109 | avg_response_time = sum(response_times) / len(response_times) if response_times else 0 110 | logger.info(f"Rate limiting analysis for {website}: {success_count}/{num_requests} successful, " 111 | f"avg response time: {avg_response_time:.3f}s") 112 | logger.debug(f"Status codes: {status_codes}") 113 | 114 | if rate_limit_detected: 115 | logger.info(f"Rate limiting detected for {website}") 116 | return "🟢" 117 | elif success_count < threshold: 118 | logger.info(f"Possible rate limiting detected for {website} (low success rate)") 119 | return "🟢" 120 | else: 121 | logger.info(f"No rate limiting detected for {website}") 122 | return "🔴" 123 | 124 | except RequestException as e: 125 | logger.error(f"Request error while checking rate limiting for {website}: {e}") 126 | return "⚪" 127 | except Exception as e: 128 | logger.error(f"Unexpected error while checking rate limiting for {website}: {e}") 129 | return "⚪" 130 | -------------------------------------------------------------------------------- /checks/check_domain_breach.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, HTTPError 4 | from urllib.parse import urlparse 5 | import json 6 | import re 7 | import time 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | # Simple rate limiting cache 12 | _rate_limit_cache = { 13 | 'last_request': 0, 14 | 'min_interval': 1.5 # Respect HIBP rate limits 15 | } 16 | 17 | def 
check_domain_breach(website: str) -> str: 18 | """ 19 | Check if a domain has been found in any known data breaches using the Have I Been Pwned API. 20 | 21 | Args: 22 | website (str): The domain name to be checked. 23 | 24 | Returns: 25 | str: 26 | - "🟢" if no breaches are found. 27 | - "🟡" if the domain is found in old/resolved breaches. 28 | - "🔴" if the domain is found in recent/active breaches. 29 | - "⚪" if any errors occurred or if the breach check could not be completed. 30 | """ 31 | # Input validation and normalization 32 | if not website: 33 | logger.error("Website URL is required") 34 | return "⚪" 35 | 36 | # Normalize domain 37 | website = website.lower().strip() 38 | website = re.sub(r'^https?://', '', website) 39 | website = re.sub(r'^www\.', '', website) 40 | website = website.split('/')[0] # Remove path if present 41 | website = website.split(':')[0] # Remove port if present 42 | 43 | # Validate domain format 44 | if not re.match(r'^[a-zA-Z0-9][a-zA-Z0-9.-]*[a-zA-Z0-9]$', website): 45 | logger.error(f"Invalid domain format: {website}") 46 | return "⚪" 47 | 48 | # Performance optimization - rate limiting 49 | current_time = time.time() 50 | time_since_last = current_time - _rate_limit_cache['last_request'] 51 | if time_since_last < _rate_limit_cache['min_interval']: 52 | time.sleep(_rate_limit_cache['min_interval'] - time_since_last) 53 | 54 | _rate_limit_cache['last_request'] = time.time() 55 | 56 | try: 57 | # Enhanced API usage - check breaches endpoint 58 | url = f"https://haveibeenpwned.com/api/v3/breaches" 59 | headers = { 60 | "hibp-api-version": "3", 61 | "User-Agent": "WebsiteMonitor/1.0" 62 | } 63 | 64 | response = requests.get(url, headers=headers, timeout=15) 65 | response.raise_for_status() 66 | 67 | if response.status_code == 200: 68 | all_breaches = response.json() 69 | 70 | # Enhanced detection patterns - check for domain-related breaches 71 | domain_breaches = [] 72 | recent_breaches = [] 73 | 74 | for breach in all_breaches: 75 | breach_domain = breach.get('Domain', '') 76 | breach_name = breach.get('Name', '').lower() 77 | breach_date = breach.get('BreachDate', '') 78 | 79 | # Check if breach is related to the domain 80 | if (website in breach_domain.lower() or 81 | website in breach_name or 82 | breach_domain.lower().endswith(website)): 83 | 84 | domain_breaches.append(breach) 85 | 86 | # Check if breach is recent (within last 2 years) 87 | try: 88 | from datetime import datetime 89 | breach_datetime = datetime.strptime(breach_date, '%Y-%m-%d') 90 | days_ago = (datetime.now() - breach_datetime).days 91 | if days_ago <= 730: # 2 years 92 | recent_breaches.append(breach) 93 | except: 94 | pass 95 | 96 | # Improved scoring and categorization 97 | if recent_breaches: 98 | logger.critical(f"Domain {website} found in {len(recent_breaches)} recent breaches") 99 | for breach in recent_breaches[:3]: # Log first 3 recent breaches 100 | logger.critical(f" - {breach.get('Name')} ({breach.get('BreachDate')}): {breach.get('Description', '')[:100]}...") 101 | return "🔴" 102 | elif domain_breaches: 103 | logger.warning(f"Domain {website} found in {len(domain_breaches)} older breaches") 104 | for breach in domain_breaches[:2]: # Log first 2 older breaches 105 | logger.warning(f" - {breach.get('Name')} ({breach.get('BreachDate')})") 106 | return "🟡" 107 | else: 108 | logger.info(f"Domain {website} not found in any known breaches") 109 | return "🟢" 110 | 111 | except requests.exceptions.HTTPError as e: 112 | if e.response.status_code == 401: 113 | logger.error(f"API 
authentication failed for {website} - API key may be required") 114 | elif e.response.status_code == 429: 115 | logger.error(f"Rate limit exceeded while checking breaches for {website}") 116 | elif e.response.status_code == 404: 117 | logger.info(f"No breach data found for {website}") 118 | return "🟢" 119 | else: 120 | logger.error(f"HTTP error {e.response.status_code} while checking breaches for {website}: {e}") 121 | return "⚪" 122 | except RequestException as e: 123 | logger.error(f"Request error while checking breaches for {website}: {e}") 124 | return "⚪" 125 | except json.JSONDecodeError as e: 126 | logger.error(f"Invalid JSON response while checking breaches for {website}: {e}") 127 | return "⚪" 128 | except Exception as e: 129 | logger.error(f"Unexpected error while checking breaches for {website}: {e}") 130 | return "⚪" 131 | -------------------------------------------------------------------------------- /checks/check_ad_and_tracking.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from requests.exceptions import RequestException, Timeout, HTTPError 4 | from bs4 import BeautifulSoup 5 | import logging 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_ad_and_tracking(website): 12 | """ 13 | Check if the website is using Google Analytics, AdsbyGoogle, or other common ad/tracking scripts. 14 | 15 | Args: 16 | website (str): URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🔴" if both Google Analytics and AdsbyGoogle are present 21 | - "🟠" if only Google Analytics is present 22 | - "🟡" if other ad/tracking scripts are detected 23 | - "🟢" if neither are present 24 | - "⚪" if an error occurs 25 | """ 26 | # Input validation and URL normalization 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | if not website.startswith(('http://', 'https://')): 33 | website = f"https://{website}" 34 | 35 | headers = { 36 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 37 | } 38 | 39 | # Enhanced ad/tracking services patterns 40 | tracking_patterns = { 41 | 'google_analytics': [ 42 | r'www\.google-analytics\.com/analytics\.js', 43 | r'www\.googletagmanager\.com/gtag/js', 44 | r'gtag\(', 45 | r'GoogleAnalyticsObject', 46 | r'ga\(', 47 | ], 48 | 'google_ads': [ 49 | r'pagead2\.googlesyndication\.com/pagead/js/adsbygoogle\.js', 50 | r'googlesyndication\.com', 51 | r'adsbygoogle', 52 | ], 53 | 'facebook': [ 54 | r'connect\.facebook\.net', 55 | r'fbevents\.js', 56 | r'facebook\.com/tr', 57 | ], 58 | 'other_tracking': [ 59 | r'cdn\.branch\.io', 60 | r'pixel\.quantserve\.com', 61 | r'bat\.bing\.com', 62 | r'cdn\.taboola\.com', 63 | r'tracker\.cleverbridge\.com', 64 | r'hotjar\.com', 65 | r'fullstory\.com', 66 | r'mixpanel\.com', 67 | r'segment\.io', 68 | r'amplitude\.com', 69 | ] 70 | } 71 | 72 | try: 73 | # Enhanced content analysis with retry mechanism 74 | response = requests.get(website, headers=headers, timeout=15) 75 | response.raise_for_status() 76 | content = response.text.lower() 77 | 78 | # Score-based detection system 79 | detection_score = { 80 | 'google_analytics': 0, 81 | 'google_ads': 0, 82 | 'facebook': 0, 83 | 'other_tracking': 0 84 | } 85 | 86 | # Check patterns in content 87 | for category, patterns in 
tracking_patterns.items(): 88 | for pattern in patterns: 89 | if re.search(pattern, content, re.IGNORECASE): 90 | detection_score[category] += 1 91 | logger.debug(f"Found {category} pattern: {pattern}") 92 | 93 | # Enhanced BeautifulSoup analysis 94 | soup = BeautifulSoup(response.text, 'lxml') 95 | 96 | # Check script tags 97 | scripts = soup.find_all('script', src=True) 98 | for script in scripts: 99 | src = script.get('src', '').lower() 100 | for category, patterns in tracking_patterns.items(): 101 | for pattern in patterns: 102 | if re.search(pattern, src): 103 | detection_score[category] += 1 104 | 105 | # Check inline scripts 106 | inline_scripts = soup.find_all('script') 107 | for script in inline_scripts: 108 | if script.string: 109 | script_content = script.string.lower() 110 | for category, patterns in tracking_patterns.items(): 111 | for pattern in patterns: 112 | if re.search(pattern, script_content): 113 | detection_score[category] += 1 114 | 115 | # Determine result based on weighted scoring 116 | has_google_analytics = detection_score['google_analytics'] > 0 117 | has_google_ads = detection_score['google_ads'] > 0 118 | has_other_tracking = (detection_score['facebook'] + detection_score['other_tracking']) > 0 119 | 120 | logger.info(f"Tracking detection scores for {website}: {detection_score}") 121 | 122 | if has_google_analytics and has_google_ads: 123 | return "🔴" 124 | elif has_google_analytics: 125 | return "🟠" 126 | elif has_other_tracking: 127 | return "🟡" 128 | else: 129 | return "🟢" 130 | 131 | except (Timeout, HTTPError, RequestException) as e: 132 | logger.warning(f"Request error for {website}: {e}") 133 | 134 | # Enhanced fallback with basic pattern matching 135 | try: 136 | response = requests.get(website, headers=headers, timeout=10) 137 | response.raise_for_status() 138 | 139 | # Simple pattern matching as fallback 140 | content = response.text.lower() 141 | if any(pattern in content for patterns in tracking_patterns.values() for pattern in patterns[:2]): 142 | return "🟡" 143 | return "🟢" 144 | 145 | except Exception as e: 146 | logger.error(f"Fallback failed for {website}: {e}") 147 | return "⚪" 148 | except Exception as e: 149 | logger.error(f"Unexpected error for {website}: {e}") 150 | return "⚪" 151 | -------------------------------------------------------------------------------- /checks/check_ssl_cert.py: -------------------------------------------------------------------------------- 1 | import ssl 2 | import socket 3 | import logging 4 | from datetime import datetime, timezone 5 | from urllib.parse import urlparse 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_ssl_cert(website: str, port: int = 443) -> str: 12 | """ 13 | Check the SSL certificate of a given website for comprehensive security analysis. 14 | 15 | Args: 16 | website (str): The hostname or URL to check. 17 | port (int, optional): The port number. Defaults to 443 (standard HTTPS port). 18 | 19 | Returns: 20 | str: 21 | - "🟢 (X days left)" if the certificate is valid and secure with more than 30 days left. 22 | - "🟠 (X days left)" if the certificate is valid but has 30 days or fewer left, or minor issues. 23 | - "🔴" if the certificate is expired, invalid, or has security issues. 24 | - "⚪" if an error occurs during the check. 
25 | """ 26 | # Input validation and hostname extraction 27 | if not website or not isinstance(website, str): 28 | logger.error(f"Invalid website input: {website}") 29 | return "⚪" 30 | 31 | website = website.strip() 32 | 33 | # Extract hostname from URL if provided 34 | if website.startswith(('http://', 'https://')): 35 | parsed = urlparse(website) 36 | host = parsed.netloc.split(':')[0] 37 | else: 38 | host = website.split(':')[0] 39 | 40 | # Create enhanced SSL context 41 | context = ssl.create_default_context() 42 | context.check_hostname = True 43 | context.verify_mode = ssl.CERT_REQUIRED 44 | 45 | try: 46 | # Create connection with timeout 47 | with socket.create_connection((host, port), timeout=15) as conn: 48 | with context.wrap_socket(conn, server_hostname=host) as sock: 49 | cert = sock.getpeercert() 50 | cert_der = sock.getpeercert(binary_form=True) 51 | 52 | # Get protocol and cipher information 53 | protocol_version = sock.version() 54 | cipher_info = sock.cipher() 55 | 56 | # Extract certificate information 57 | subject = dict(x[0] for x in cert['subject']) 58 | issuer = dict(x[0] for x in cert['issuer']) 59 | 60 | # Parse certificate dates (handling timezone-aware parsing) 61 | not_before = datetime.strptime(cert['notBefore'], "%b %d %H:%M:%S %Y %Z") 62 | not_after = datetime.strptime(cert['notAfter'], "%b %d %H:%M:%S %Y %Z") 63 | 64 | # Calculate days to expiration 65 | now = datetime.utcnow() 66 | days_to_expire = (not_after - now).days 67 | days_since_issued = (now - not_before).days 68 | 69 | # Security analysis 70 | security_issues = [] 71 | 72 | # Check certificate validity period 73 | if days_to_expire <= 0: 74 | logger.error(f"SSL certificate for {host} is expired") 75 | return "🔴" 76 | 77 | # Check for short validity periods (potential security issue) 78 | cert_lifetime_days = (not_after - not_before).days 79 | if cert_lifetime_days > 825: # More than ~2 years (old CA practice) 80 | security_issues.append("Long certificate lifetime") 81 | 82 | # Check signature algorithm 83 | if 'sha1' in cert.get('signatureAlgorithm', '').lower(): 84 | security_issues.append("Weak signature algorithm (SHA-1)") 85 | 86 | # Check key size (for RSA certificates) 87 | public_key_info = cert.get('subjectPublicKeyInfo', {}) 88 | 89 | # Check SAN (Subject Alternative Names) 90 | san_list = [] 91 | for san in cert.get('subjectAltName', []): 92 | if san[0] == 'DNS': 93 | san_list.append(san[1]) 94 | 95 | # Verify hostname is in certificate 96 | hostname_verified = ( 97 | subject.get('commonName') == host or 98 | host in san_list or 99 | any(san.replace('*.', '') in host for san in san_list if san.startswith('*.')) 100 | ) 101 | 102 | if not hostname_verified: 103 | security_issues.append("Hostname not in certificate") 104 | 105 | # Check certificate chain and issuer 106 | if 'Let\'s Encrypt' in issuer.get('organizationName', ''): 107 | logger.debug(f"Let's Encrypt certificate for {host}") 108 | 109 | logger.info(f"SSL certificate analysis for {host}: {days_to_expire} days left, {len(security_issues)} issues") 110 | 111 | if security_issues: 112 | logger.warning(f"Security issues found: {security_issues}") 113 | 114 | # Determine result based on analysis 115 | if security_issues: 116 | if days_to_expire <= 7: 117 | return "🔴" 118 | elif days_to_expire <= 30: 119 | return f"🔴 ({days_to_expire} days left)" 120 | else: 121 | return f"🟠 ({days_to_expire} days left)" 122 | elif days_to_expire <= 7: 123 | return "🔴" 124 | elif days_to_expire <= 30: 125 | return f"🟠 ({days_to_expire} days left)" 
126 | else: 127 | return f"🟢 ({days_to_expire} days left)" 128 | 129 | except ssl.SSLError as ssl_err: 130 | logger.error(f"SSL error for {host}:{port}: {ssl_err}") 131 | return "🔴" 132 | except socket.timeout: 133 | logger.warning(f"Connection timeout for {host}:{port}") 134 | return "⚪" 135 | except socket.error as sock_err: 136 | logger.warning(f"Socket error for {host}:{port}: {sock_err}") 137 | return "⚪" 138 | except Exception as e: 139 | logger.error(f"Unexpected error for {host}:{port}: {e}") 140 | return "⚪" 141 | -------------------------------------------------------------------------------- /checks/check_cors_headers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from requests.exceptions import RequestException, HTTPError 4 | from urllib.parse import urlparse 5 | import re 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def check_cors_headers(website: str) -> str: 10 | """ 11 | Checks the CORS policy of the given website. 12 | 13 | Args: 14 | website (str): The website URL to check. 15 | 16 | Returns: 17 | str: 18 | - "🟢" if the CORS policy is secure and properly configured. 19 | - "🟡" if CORS is configured but with potential security concerns. 20 | - "🔴" if the CORS policy is insecure or misconfigured. 21 | - "⚪" if an error occurred during checking. 22 | """ 23 | # Input validation and URL normalization 24 | if not website: 25 | logger.error("Website URL is required") 26 | return "⚪" 27 | 28 | if not website.startswith(('http://', 'https://')): 29 | website = f"https://{website}" 30 | 31 | try: 32 | parsed_url = urlparse(website) 33 | if not parsed_url.netloc: 34 | logger.error(f"Invalid URL format: {website}") 35 | return "⚪" 36 | except Exception as e: 37 | logger.error(f"URL parsing error for {website}: {e}") 38 | return "⚪" 39 | 40 | headers = { 41 | 'User-Agent': 'CORSPolicyChecker/2.0', 42 | 'Origin': 'https://example.com' # Test with a different origin 43 | } 44 | 45 | try: 46 | # Enhanced detection patterns - check multiple endpoints and methods 47 | endpoints_to_check = [ 48 | website, 49 | f"{website}/api", 50 | f"{website}/api/v1", 51 | f"{website}/graphql" 52 | ] 53 | 54 | cors_findings = [] 55 | 56 | for endpoint in endpoints_to_check: 57 | try: 58 | # Check OPTIONS request (preflight) 59 | options_response = requests.options(endpoint, headers=headers, timeout=10) 60 | 61 | # Check GET request for CORS headers 62 | get_response = requests.get(endpoint, headers=headers, timeout=10) 63 | 64 | for response in [options_response, get_response]: 65 | if response.status_code < 400: # Only check successful responses 66 | cors_analysis = analyze_cors_headers(response.headers, endpoint) 67 | if cors_analysis: 68 | cors_findings.append(cors_analysis) 69 | 70 | except RequestException: 71 | continue # Skip endpoints that don't respond 72 | 73 | if not cors_findings: 74 | logger.info(f"No CORS headers found for {website} - may not support CORS") 75 | return "🟢" 76 | 77 | # Improved scoring and categorization 78 | critical_issues = [f for f in cors_findings if f["risk"] == "critical"] 79 | high_issues = [f for f in cors_findings if f["risk"] == "high"] 80 | medium_issues = [f for f in cors_findings if f["risk"] == "medium"] 81 | 82 | # Log findings 83 | for finding in cors_findings[:5]: # Log first 5 findings 84 | level = logger.critical if finding["risk"] == "critical" else logger.warning 85 | level(f"CORS issue on {finding['endpoint']}: {finding['issue']}") 86 | 87 | if critical_issues: 88 
| logger.critical(f"Critical CORS vulnerabilities found on {website}") 89 | return "🔴" 90 | elif high_issues or len(medium_issues) >= 2: 91 | logger.warning(f"CORS security concerns found on {website}") 92 | return "🟡" 93 | elif medium_issues: 94 | logger.info(f"Minor CORS configuration issues found on {website}") 95 | return "🟡" 96 | else: 97 | logger.info(f"CORS policy appears secure for {website}") 98 | return "🟢" 99 | 100 | except (HTTPError, RequestException) as e: 101 | logger.error(f"Request error while checking CORS headers for {website}: {e}") 102 | return "⚪" 103 | except Exception as e: 104 | logger.error(f"Unexpected error while checking CORS headers for {website}: {e}") 105 | return "⚪" 106 | 107 | def analyze_cors_headers(headers, endpoint): 108 | """Helper function to analyze CORS headers for security issues""" 109 | cors_origin = headers.get('Access-Control-Allow-Origin', '') 110 | cors_credentials = headers.get('Access-Control-Allow-Credentials', '').lower() 111 | cors_methods = headers.get('Access-Control-Allow-Methods', '') 112 | cors_headers_allowed = headers.get('Access-Control-Allow-Headers', '') 113 | 114 | # Critical security issues 115 | if cors_origin == '*' and cors_credentials == 'true': 116 | return { 117 | "endpoint": endpoint, 118 | "issue": "Wildcard origin with credentials allowed - critical security risk", 119 | "risk": "critical" 120 | } 121 | 122 | # High risk issues 123 | if cors_origin == '*': 124 | return { 125 | "endpoint": endpoint, 126 | "issue": "Wildcard CORS origin allows all domains", 127 | "risk": "high" 128 | } 129 | 130 | # Medium risk issues 131 | if 'DELETE' in cors_methods.upper() and cors_credentials == 'true': 132 | return { 133 | "endpoint": endpoint, 134 | "issue": "DELETE method allowed with credentials", 135 | "risk": "medium" 136 | } 137 | 138 | if '*' in cors_headers_allowed: 139 | return { 140 | "endpoint": endpoint, 141 | "issue": "Wildcard headers allowed", 142 | "risk": "medium" 143 | } 144 | 145 | return None 146 | -------------------------------------------------------------------------------- /checks/check_third_party_resources.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from urllib.parse import urlparse 4 | from bs4 import BeautifulSoup 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_third_party_resources(website: str) -> str: 12 | """ 13 | Check for third-party resources loaded by the website with enhanced analysis. 14 | 15 | Args: 16 | website (str): URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if the number of third-party resources is minimal (0-3 domains). 21 | - "🟠" if there is a moderate number of third-party resources (4-8 domains). 22 | - "🔴" if there is a high number of third-party resources (9+ domains). 23 | - "⚪" for any errors. 
24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪" 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Fetch content with proper error handling 40 | response = requests.get(website, headers=headers, timeout=15) 41 | response.raise_for_status() 42 | 43 | # Parse main domain 44 | main_domain = urlparse(website).netloc.lower() 45 | main_domain_parts = main_domain.split('.') 46 | if len(main_domain_parts) >= 2: 47 | root_domain = '.'.join(main_domain_parts[-2:]) 48 | else: 49 | root_domain = main_domain 50 | 51 | # Parse HTML content 52 | soup = BeautifulSoup(response.text, 'html.parser') 53 | 54 | # Track third-party domains and resource types 55 | third_party_domains = set() 56 | resource_types = { 57 | 'scripts': 0, 58 | 'stylesheets': 0, 59 | 'images': 0, 60 | 'fonts': 0, 61 | 'other': 0 62 | } 63 | 64 | # Enhanced resource detection 65 | resource_selectors = [ 66 | ('script', 'src', 'scripts'), 67 | ('link', 'href', 'stylesheets'), 68 | ('img', 'src', 'images'), 69 | ('source', 'src', 'images'), 70 | ('iframe', 'src', 'other'), 71 | ('embed', 'src', 'other'), 72 | ('object', 'data', 'other') 73 | ] 74 | 75 | for tag_name, attr_name, resource_type in resource_selectors: 76 | for tag in soup.find_all(tag_name): 77 | resource_url = tag.get(attr_name) 78 | if resource_url: 79 | parsed_url = urlparse(resource_url) 80 | domain = parsed_url.netloc.lower() 81 | 82 | if domain and domain != main_domain: 83 | # Check if it's a subdomain of the main domain 84 | domain_parts = domain.split('.') 85 | if len(domain_parts) >= 2: 86 | domain_root = '.'.join(domain_parts[-2:]) 87 | if domain_root != root_domain: 88 | third_party_domains.add(domain) 89 | resource_types[resource_type] += 1 90 | logger.debug(f"Third-party {resource_type}: {resource_url}") 91 | 92 | # Check for font resources specifically 93 | for link in soup.find_all('link', rel=lambda x: x and 'font' in str(x).lower()): 94 | href = link.get('href') 95 | if href: 96 | parsed_url = urlparse(href) 97 | domain = parsed_url.netloc.lower() 98 | if domain and domain != main_domain: 99 | domain_parts = domain.split('.') 100 | if len(domain_parts) >= 2: 101 | domain_root = '.'.join(domain_parts[-2:]) 102 | if domain_root != root_domain: 103 | third_party_domains.add(domain) 104 | resource_types['fonts'] += 1 105 | 106 | third_party_count = len(third_party_domains) 107 | total_resources = sum(resource_types.values()) 108 | 109 | logger.info(f"Third-party analysis for {website}: {third_party_count} domains, {total_resources} resources") 110 | logger.debug(f"Resource breakdown: {resource_types}") 111 | logger.debug(f"Third-party domains: {list(third_party_domains)}") 112 | 113 | # Enhanced scoring based on domain count and resource distribution 114 | if third_party_count == 0: 115 | logger.info(f"No third-party resources detected for {website}") 116 | return "🟢" 117 | elif third_party_count <= 3: 118 | logger.info(f"Minimal third-party resources ({third_party_count} domains) for {website}") 119 | return "🟢" 120 | elif third_party_count <= 8: 121 | logger.warning(f"Moderate third-party resources ({third_party_count} domains) for {website}") 122 | return "🟠" 123 | 
else: 124 | logger.warning(f"High number of third-party resources ({third_party_count} domains) for {website}") 125 | return "🔴" 126 | 127 | except (Timeout, HTTPError) as e: 128 | logger.warning(f"HTTP/Timeout error for {website}: {e}") 129 | return "⚪" 130 | except RequestException as e: 131 | logger.warning(f"Request error for {website}: {e}") 132 | return "⚪" 133 | except Exception as e: 134 | logger.error(f"Unexpected error for {website}: {e}") 135 | return "⚪" 136 | -------------------------------------------------------------------------------- /checks/check_third_party_requests.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from urllib.parse import urlparse 4 | from bs4 import BeautifulSoup 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_third_party_requests(website: str) -> str: 12 | """ 13 | Monitor the number of third-party requests made by a website with enhanced analysis. 14 | 15 | Args: 16 | website (str): URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if the number of third-party requests is minimal (0-20). 21 | - "🟠" if there is a moderate number of third-party requests (21-50). 22 | - "🔴" if the website makes a high number of third-party requests (51+). 23 | - "⚪" for any errors. 24 | """ 25 | # Input validation and URL normalization 26 | if not website or not isinstance(website, str): 27 | logger.error(f"Invalid website input: {website}") 28 | return "⚪" 29 | 30 | website = website.strip() 31 | if not website.startswith(('http://', 'https://')): 32 | website = f"https://{website}" 33 | 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 36 | } 37 | 38 | try: 39 | # Send HTTP GET request with enhanced error handling 40 | response = requests.get(website, headers=headers, timeout=15) 41 | response.raise_for_status() 42 | 43 | # Parse the main domain from the website URL 44 | parsed_url = urlparse(website) 45 | main_domain = parsed_url.netloc.lower() 46 | main_domain_parts = main_domain.split('.') 47 | if len(main_domain_parts) >= 2: 48 | root_domain = '.'.join(main_domain_parts[-2:]) 49 | else: 50 | root_domain = main_domain 51 | 52 | # Parse HTML content using BeautifulSoup 53 | soup = BeautifulSoup(response.text, 'lxml') 54 | 55 | # Track third-party requests by category 56 | third_party_requests = 0 57 | request_categories = { 58 | 'scripts': 0, 59 | 'stylesheets': 0, 60 | 'images': 0, 61 | 'fonts': 0, 62 | 'iframes': 0, 63 | 'other': 0 64 | } 65 | 66 | third_party_domains = set() 67 | 68 | # Enhanced resource detection with categorization 69 | resource_selectors = [ 70 | ('script', 'src', 'scripts'), 71 | ('link', 'href', 'stylesheets'), 72 | ('img', 'src', 'images'), 73 | ('source', 'src', 'images'), 74 | ('iframe', 'src', 'iframes'), 75 | ('embed', 'src', 'other'), 76 | ('object', 'data', 'other'), 77 | ('video', 'src', 'other'), 78 | ('audio', 'src', 'other') 79 | ] 80 | 81 | for tag_name, attr_name, category in resource_selectors: 82 | for tag in soup.find_all(tag_name): 83 | resource_url = tag.get(attr_name) 84 | if resource_url and resource_url.startswith(('http://', 'https://')): 85 | parsed_resource = urlparse(resource_url) 86 | domain = parsed_resource.netloc.lower() 87 | 88 | if domain and domain != 
main_domain: 89 | # Check if it's a subdomain of the main domain 90 | domain_parts = domain.split('.') 91 | if len(domain_parts) >= 2: 92 | domain_root = '.'.join(domain_parts[-2:]) 93 | if domain_root != root_domain: 94 | third_party_requests += 1 95 | third_party_domains.add(domain) 96 | request_categories[category] += 1 97 | logger.debug(f"Third-party {category}: {resource_url}") 98 | 99 | # Check for font resources specifically 100 | for link in soup.find_all('link', rel=lambda x: x and any(font_rel in str(x).lower() for font_rel in ['font', 'preload'])): 101 | href = link.get('href') 102 | if href and href.startswith(('http://', 'https://')): 103 | parsed_href = urlparse(href) 104 | domain = parsed_href.netloc.lower() 105 | if domain and domain != main_domain: 106 | domain_parts = domain.split('.') 107 | if len(domain_parts) >= 2: 108 | domain_root = '.'.join(domain_parts[-2:]) 109 | if domain_root != root_domain: 110 | third_party_requests += 1 111 | third_party_domains.add(domain) 112 | request_categories['fonts'] += 1 113 | 114 | logger.info(f"Third-party analysis for {website}: {third_party_requests} requests across {len(third_party_domains)} domains") 115 | logger.debug(f"Request breakdown: {request_categories}") 116 | logger.debug(f"Third-party domains: {list(third_party_domains)}") 117 | 118 | # Enhanced threshold-based evaluation 119 | if third_party_requests <= 20: 120 | logger.info(f"Minimal third-party requests ({third_party_requests}) for {website}") 121 | return "🟢" 122 | elif third_party_requests <= 50: 123 | logger.warning(f"Moderate third-party requests ({third_party_requests}) for {website}") 124 | return "🟠" 125 | else: 126 | logger.warning(f"High number of third-party requests ({third_party_requests}) for {website}") 127 | return "🔴" 128 | 129 | except (Timeout, HTTPError) as e: 130 | logger.warning(f"HTTP/Timeout error for {website}: {e}") 131 | return "⚪" 132 | except RequestException as e: 133 | logger.warning(f"Request error for {website}: {e}") 134 | return "⚪" 135 | except Exception as e: 136 | logger.error(f"Unexpected error for {website}: {e}") 137 | return "⚪" 138 | -------------------------------------------------------------------------------- /checks/check_sitemap.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from bs4 import BeautifulSoup 4 | from urllib.parse import urljoin, urlparse 5 | from requests.exceptions import RequestException, Timeout, HTTPError 6 | 7 | # Configure logging 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | def check_sitemap(website): 12 | """ 13 | Check if the provided website has a sitemap.xml with enhanced validation. 14 | 15 | Args: 16 | website (str): The URL of the website to be checked. 17 | 18 | Returns: 19 | str: 20 | - "🟢" if a valid sitemap is found. 21 | - "🔴" if no sitemap is found or if there's a request-related error. 22 | - "⚪" for any other unexpected errors. 
23 | """ 24 | # Input validation and URL normalization 25 | if not website or not isinstance(website, str): 26 | logger.error(f"Invalid website input: {website}") 27 | return "⚪" 28 | 29 | website = website.strip() 30 | if not website.startswith(('http://', 'https://')): 31 | website = f"https://{website}" 32 | 33 | # Enhanced sitemap paths with more comprehensive patterns 34 | sitemap_paths = [ 35 | '/sitemap.xml', # Default location 36 | '/sitemap_index.xml', # Index file for multiple sitemaps 37 | '/sitemap/sitemap.xml', # Common alternative path 38 | '/sitemap1.xml', # Numbered sitemap 39 | '/sitemap-index.xml', # Alternative index naming 40 | '/sitemap/sitemap-index.xml', # Nested alternative 41 | '/sitemap_index.xml.gz', # Compressed sitemap 42 | '/sitemaps.xml', # Plural variant 43 | '/site-map.xml', # Hyphenated variant 44 | '/robots.txt' # Check robots.txt for sitemap reference 45 | ] 46 | 47 | headers = { 48 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 49 | } 50 | 51 | try: 52 | session = requests.Session() 53 | session.headers.update(headers) 54 | 55 | # Method 1: Check common sitemap paths 56 | for path in sitemap_paths[:-1]: # Exclude robots.txt for now 57 | try: 58 | sitemap_url = urljoin(website, path) 59 | response = session.get(sitemap_url, timeout=15) 60 | 61 | if response.status_code == 200: 62 | content = response.text.lower() 63 | 64 | # Enhanced validation of sitemap content 65 | if any(indicator in content for indicator in ['', '']): 66 | logger.info(f"Valid sitemap found at {sitemap_url}") 67 | return "🟢" 68 | 69 | except (Timeout, HTTPError, RequestException): 70 | continue 71 | 72 | # Method 2: Check robots.txt for sitemap references 73 | try: 74 | robots_url = urljoin(website, '/robots.txt') 75 | robots_response = session.get(robots_url, timeout=10) 76 | 77 | if robots_response.status_code == 200: 78 | robots_content = robots_response.text.lower() 79 | if 'sitemap:' in robots_content: 80 | # Extract sitemap URLs from robots.txt 81 | import re 82 | sitemap_matches = re.findall(r'sitemap:\s*(.+)', robots_content, re.IGNORECASE) 83 | 84 | for sitemap_url in sitemap_matches: 85 | sitemap_url = sitemap_url.strip() 86 | try: 87 | sitemap_response = session.get(sitemap_url, timeout=10) 88 | if sitemap_response.status_code == 200: 89 | content = sitemap_response.text.lower() 90 | if any(indicator in content for indicator in ['', '']): 91 | logger.info(f"Valid sitemap found via robots.txt: {sitemap_url}") 92 | return "🟢" 93 | except (Timeout, HTTPError, RequestException): 94 | continue 95 | except (Timeout, HTTPError, RequestException): 96 | pass 97 | 98 | # Method 3: Check HTML for sitemap links 99 | try: 100 | main_response = session.get(website, timeout=15) 101 | if main_response.status_code == 200: 102 | soup = BeautifulSoup(main_response.text, 'html.parser') 103 | 104 | # Look for sitemap links in HTML 105 | sitemap_links = soup.find_all('a', href=lambda x: x and 'sitemap' in x.lower()) 106 | for link in sitemap_links: 107 | href = link.get('href') 108 | if href: 109 | sitemap_url = urljoin(website, href) 110 | try: 111 | sitemap_response = session.get(sitemap_url, timeout=10) 112 | if sitemap_response.status_code == 200: 113 | content = sitemap_response.text.lower() 114 | if any(indicator in content for indicator in [' "Secrets and variables" -> "Actions" in your GitHub repository. 72 | * Add a new repository secret named `PAGESPEED_API_KEY` and paste your API key as the value. 
73 | 5. **Create the `report_template.md` File:** 74 | * Create a new file called `report_template.md` in the root of your repository if it doesn't exist. 75 | * Add a default template to generate the report, for example: 76 | ```markdown 77 | # Websites Monitor 78 | ``` 79 | 6. **Commit All Changes:** 80 | * Commit and push the changes to your forked repository to trigger the initial report generation. 81 | 82 | ### How the Monitoring Works 83 | 84 | - **Daily Execution:** The `create-report.yml` GitHub Action workflow is scheduled to run daily. 85 | - **Website Checks:** The workflow executes the `main.py` script, which performs all the checks on the websites specified in `config.yaml`. 86 | - **Report Generation:** The `main.py` script automatically generates the report in the `README.md` file using the `report_template.md` as a base. 87 | - **Automatic Updates:** The `README.md` file will be automatically updated with the latest check results after each successful run of the workflow. 88 | 89 | ## Configuration Options 90 | 91 | The `config.yaml` file allows for various configurations: 92 | 93 | - `websites`: List of URLs to monitor (required) 94 | - `output_file`: The output filename of the generated report (default: `report.md`) 95 | - `max_workers`: Number of concurrent tasks when performing the checks (default: 4) 96 | - `timeout`: Default timeout in seconds for the checks (default: 30) 97 | - `report_template`: The filename of the report template (default: `report_template.md`) 98 | - `github_workflow_badge`: The GitHub workflow badge URL 99 | - `pagespeed_api_key`: Google PageSpeed Insights API key (can also be set as environment variable `PAGESPEED_API_KEY`) 100 | 101 | ### Example Configuration 102 | 103 | ```yaml 104 | websites: 105 | - audiolibri.org 106 | - example.com 107 | - mywebsite.com 108 | output_file: report.md 109 | max_workers: 2 110 | timeout: 30 111 | report_template: report_template.md 112 | github_workflow_badge: https://github.com/fabriziosalmi/websites-monitor/actions/workflows/create-report.yml/badge.svg 113 | ``` 114 | 115 | **Note**: When using GitHub Actions, you can override the `output_file` to `README.md` to update your repository's README automatically. 116 | 117 | ## Customizing Checks 118 | 119 | You can modify existing checks or add new ones by editing the files in the `checks` directory and then adding the check to the `WebsiteMonitor` class in `main.py`. Ensure your new check functions follow the same format, returning an emoji indicating status (🟢, 🔴, or ⚪). 120 | 121 | ## Understanding the Output 122 | 123 | The generated report in `README.md` includes a table with a row for each website, and the results for each check in each column. 124 | 125 | - 🟢: The check is successful. 126 | - 🔴: The check failed. 127 | - 🟡: The check returned a warning or requires attention. 128 | - ⚪: An error occurred during the check, or the check was not completed. 129 | 130 | ## Support 131 | 132 | For any issues or suggestions regarding this project, feel free to open an issue on GitHub. 
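To make the check format described under "Customizing Checks" concrete, here is a minimal sketch of what a new check module could look like. The file name `checks/check_x_frame_options.py`, the function name, and the header it inspects are illustrative assumptions rather than part of this repository; only the calling convention (a website string in, a status emoji out) mirrors the existing checks.

```python
# checks/check_x_frame_options.py -- hypothetical example, not part of the repository
import logging

import requests
from requests.exceptions import RequestException

logger = logging.getLogger(__name__)

def check_x_frame_options(website: str) -> str:
    """
    Check whether the website sends an X-Frame-Options header.

    Returns:
        str: "🟢" if the header is present, "🔴" if it is missing, "⚪" on errors.
    """
    # Normalize the URL the same way the existing checks do.
    if not website.startswith(('http://', 'https://')):
        website = f"https://{website}"
    try:
        response = requests.get(website, timeout=15)
        response.raise_for_status()
        if response.headers.get('X-Frame-Options'):
            return "🟢"
        return "🔴"
    except RequestException as e:
        logger.warning(f"Request error for {website}: {e}")
        return "⚪"
```

After adding a module like this, the function would still need to be registered in the `WebsiteMonitor` class in `main.py` (as described above) so that it appears as a column in the generated report.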
133 | 134 | --- 135 | -------------------------------------------------------------------------------- /checks/check_privacy_exposure.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import logging 4 | from requests.exceptions import RequestException, Timeout, HTTPError 5 | from bs4 import BeautifulSoup 6 | from urllib.parse import urljoin 7 | 8 | # Configure logging 9 | logging.basicConfig(level=logging.INFO) 10 | logger = logging.getLogger(__name__) 11 | 12 | def check_privacy_exposure(website: str) -> str: 13 | """ 14 | Check a given website for potential exposure of sensitive or private data with enhanced detection. 15 | 16 | Args: 17 | website (str): The URL of the website to be checked. 18 | 19 | Returns: 20 | str: "🟢" if no patterns of sensitive data are found, "🔴" otherwise, "⚪" if an error occurred. 21 | """ 22 | # Input validation and URL normalization 23 | if not website or not isinstance(website, str): 24 | logger.error(f"Invalid website input: {website}") 25 | return "⚪" 26 | 27 | website = website.strip() 28 | if not website.startswith(('http://', 'https://')): 29 | website = f"https://{website}" 30 | 31 | headers = { 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 33 | } 34 | 35 | # Enhanced patterns to detect sensitive data exposure 36 | sensitive_data_patterns = [ 37 | r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', # Email addresses 38 | r'\b\d{10,11}\b', # Phone numbers (10-11 digits) 39 | r'(\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b)', # Credit Card numbers 40 | r'(\b\d{3}[- ]?\d{2}[- ]?\d{4}\b)', # Social Security Numbers 41 | r'("AWS_ACCESS_KEY_ID"|"aws_secret_access_key"|"AKIA[0-9A-Z]{16}")', # AWS Access Keys 42 | r'("-----BEGIN PRIVATE KEY-----")', # Private key exposure 43 | r'("api_key"|"apikey"):\s*["\'][^"\']+["\']', # API keys 44 | r'("password"|"passwd"):\s*["\'][^"\']+["\']', # Passwords 45 | r'("secret"|"token"):\s*["\'][^"\']+["\']', # Secrets/tokens 46 | r'\b[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\b', # IP addresses 47 | ] 48 | 49 | exposure_score = 0 50 | 51 | try: 52 | # Method 1: Direct HTML content analysis 53 | response = requests.get(website, headers=headers, timeout=15) 54 | response.raise_for_status() 55 | 56 | # Check for sensitive data patterns in the HTML content 57 | for pattern in sensitive_data_patterns: 58 | matches = re.findall(pattern, response.text, re.IGNORECASE) 59 | if matches: 60 | exposure_score += len(matches) 61 | logger.warning(f"Sensitive data pattern found: {pattern[:30]}... 
({len(matches)} matches)") 62 | 63 | # Method 2: Meta tags and scripts analysis 64 | soup = BeautifulSoup(response.text, 'html.parser') 65 | 66 | # Check meta tags for privacy leaks 67 | meta_tags = soup.find_all('meta', {'name': re.compile(r'(description|keywords|author)', re.IGNORECASE)}) 68 | for tag in meta_tags: 69 | content = tag.get('content', '') 70 | for pattern in sensitive_data_patterns: 71 | if re.search(pattern, content, re.IGNORECASE): 72 | exposure_score += 1 73 | logger.warning(f"Sensitive data in meta tag: {tag}") 74 | 75 | # Check script tags for exposed data 76 | script_tags = soup.find_all('script') 77 | for script in script_tags: 78 | if script.string: 79 | for pattern in sensitive_data_patterns: 80 | matches = re.findall(pattern, script.string, re.IGNORECASE) 81 | if matches: 82 | exposure_score += len(matches) 83 | logger.warning(f"Sensitive data in script tag: {len(matches)} matches") 84 | 85 | # Method 3: Check for accidental exposure of configuration files 86 | sensitive_paths = [ 87 | '/.env', '/config.json', '/settings.py', '/config.php', 88 | '/wp-config.php', '/.git/config', '/.htaccess', '/.aws/credentials', 89 | '/database.yml', '/.env.local', '/.env.production', '/secrets.json', 90 | '/app.config', '/web.config', '/.npmrc', '/.dockerenv' 91 | ] 92 | 93 | for path in sensitive_paths: 94 | try: 95 | file_url = urljoin(website, path) 96 | file_response = requests.get(file_url, headers=headers, timeout=5) 97 | 98 | if file_response.status_code == 200: 99 | for pattern in sensitive_data_patterns: 100 | matches = re.findall(pattern, file_response.text, re.IGNORECASE) 101 | if matches: 102 | exposure_score += len(matches) * 2 # Higher weight for config files 103 | logger.error(f"Sensitive data in config file {path}: {len(matches)} matches") 104 | 105 | except (Timeout, HTTPError): 106 | continue # Ignore timeouts and HTTP errors for these paths 107 | 108 | # Method 4: Check common backup/temp file patterns 109 | backup_patterns = [ 110 | '/backup.sql', '/dump.sql', '/database.bak', '/site.zip', 111 | '/backup.zip', '/old_site.tar.gz', '/backup.tar' 112 | ] 113 | 114 | for backup_path in backup_patterns: 115 | try: 116 | backup_url = urljoin(website, backup_path) 117 | backup_response = requests.get(backup_url, headers=headers, timeout=5) 118 | 119 | if backup_response.status_code == 200: 120 | exposure_score += 5 # High weight for accessible backup files 121 | logger.error(f"Accessible backup file found: {backup_path}") 122 | 123 | except (Timeout, HTTPError): 124 | continue 125 | 126 | logger.info(f"Privacy exposure analysis for {website}: exposure score {exposure_score}") 127 | 128 | if exposure_score == 0: 129 | return "🟢" 130 | else: 131 | logger.warning(f"Privacy exposure detected for {website}: score {exposure_score}") 132 | return "🔴" 133 | 134 | except (Timeout, HTTPError) as e: 135 | logger.warning(f"HTTP/Timeout error while checking privacy exposure for {website}: {e}") 136 | return "⚪" 137 | except RequestException as e: 138 | logger.warning(f"Request error while checking privacy exposure for {website}: {e}") 139 | return "⚪" 140 | except Exception as e: 141 | logger.error(f"Unexpected error while checking privacy exposure for {website}: {e}") 142 | return "⚪" 143 | --------------------------------------------------------------------------------
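As a closing usage illustration for the check modules shown above, the sketch below runs a handful of them against one site and assembles a markdown-style row of status emojis. It assumes the `checks` directory is importable as a package and that each function follows the shared signature (a `website: str` argument in, one of "🟢", "🟡", "🟠", "🔴", "⚪" out); the real aggregation and report rendering live in `main.py` and `report_template.md`, which are not reproduced here.

```python
# Hypothetical driver sketch -- not part of the repository.
# Assumes the checks/ directory is importable as a package (for example, by running
# from the repository root with it on PYTHONPATH).
from checks.check_cors_headers import check_cors_headers
from checks.check_third_party_requests import check_third_party_requests
from checks.check_sitemap import check_sitemap
from checks.check_privacy_exposure import check_privacy_exposure

# Each entry pairs a column label with a check function of signature (str) -> str.
CHECKS = [
    ("CORS", check_cors_headers),
    ("3rd-Party Requests", check_third_party_requests),
    ("Sitemap", check_sitemap),
    ("Privacy Exposure", check_privacy_exposure),
]

def run_checks(site: str) -> str:
    """Return one markdown table row: the site followed by one emoji per check."""
    results = [check(site) for _, check in CHECKS]
    return f"| {site} | " + " | ".join(results) + " |"

if __name__ == "__main__":
    header = "| Site | " + " | ".join(name for name, _ in CHECKS) + " |"
    print(header)
    print(run_checks("example.com"))
```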