├── requirements.txt ├── .gitignore ├── LICENSE ├── setup.py ├── CONTRIBUTING.md ├── docs └── app_architecture_overview.md ├── app_architecture.md ├── README.md ├── templates ├── components.html └── index.html └── app.py /requirements.txt: -------------------------------------------------------------------------------- 1 | flask==3.0.2 2 | requests==2.31.0 3 | beautifulsoup4==4.12.3 4 | urllib3==2.2.1 5 | cssutils==2.9.0 6 | selenium==4.18.1 7 | webdriver-manager==4.0.1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # Virtual environment 28 | venv/ 29 | env/ 30 | ENV/ 31 | 32 | # Flask related 33 | instance/ 34 | .webassets-cache 35 | 36 | # Selenium & WebDriver 37 | chromedriver 38 | chromedriver.exe 39 | *.log 40 | geckodriver 41 | geckodriver.exe 42 | .wdm/ 43 | 44 | # OS specific files 45 | .DS_Store 46 | .DS_Store? 47 | ._* 48 | .Spotlight-V100 49 | .Trashes 50 | ehthumbs.db 51 | Thumbs.db 52 | 53 | # Editor directories and files 54 | .idea/ 55 | .vscode/ 56 | *.swp 57 | *.swo 58 | 59 | # Temporary files 60 | *.tmp 61 | *~ 62 | tmp/ 63 | temp/ 64 | 65 | # Downloaded website archives 66 | *.zip 67 | 68 | # Local environment variables 69 | .env 70 | .env.local -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Sirio Berati 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | with open("requirements.txt", "r", encoding="utf-8") as fh: 7 | requirements = fh.read().splitlines() 8 | 9 | setup( 10 | name="website-extractor", 11 | version="1.0.0", 12 | author="Sirio Berati", 13 | author_email="your.email@example.com", # Replace with your actual email 14 | description="A tool to extract and archive entire websites with advanced rendering capabilities", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/sirioberati/website-extractor", 18 | packages=find_packages(), 19 | classifiers=[ 20 | "Programming Language :: Python :: 3", 21 | "License :: OSI Approved :: MIT License", 22 | "Operating System :: OS Independent", 23 | "Topic :: Internet :: WWW/HTTP", 24 | "Topic :: Software Development :: Libraries :: Python Modules", 25 | "Topic :: Utilities", 26 | ], 27 | python_requires=">=3.7", 28 | install_requires=requirements, 29 | entry_points={ 30 | "console_scripts": [ 31 | "website-extractor=app:main", 32 | ], 33 | }, 34 | include_package_data=True, 35 | ) -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Website Extractor 2 | 3 | Thank you for considering contributing to Website Extractor! This document provides guidelines and instructions for contributing to this project. 4 | 5 | ## Code of Conduct 6 | 7 | By participating in this project, you agree to maintain a respectful and inclusive environment for everyone. 8 | 9 | ## How Can I Contribute? 10 | 11 | ### Reporting Bugs 12 | 13 | Before creating a bug report: 14 | 15 | 1. Check the existing issues to see if the problem has already been reported 16 | 2. Collect information about the bug (steps to reproduce, error messages, etc.) 17 | 18 | When submitting a bug report, please include: 19 | 20 | - A clear and descriptive title 21 | - Detailed steps to reproduce the issue 22 | - Expected vs. actual behavior 23 | - Screenshots if applicable 24 | - Your environment information (OS, browser, Python version, etc.) 25 | 26 | ### Suggesting Enhancements 27 | 28 | Enhancement suggestions are welcome! Please include: 29 | 30 | - A clear and descriptive title 31 | - A detailed description of the proposed enhancement 32 | - The motivation behind the enhancement 33 | - Any potential implementation details you can think of 34 | 35 | ### Pull Requests 36 | 37 | 1. Fork the repository 38 | 2. Create a new branch (`git checkout -b feature/amazing-feature`) 39 | 3. Make your changes 40 | 4. Run tests if available 41 | 5. Commit your changes (`git commit -m 'Add some amazing feature'`) 42 | 6. Push to your branch (`git push origin feature/amazing-feature`) 43 | 7. Open a Pull Request 44 | 45 | ## Development Setup 46 | 47 | 1. Fork and clone the repository 48 | 2. Create a virtual environment: 49 | ```bash 50 | python -m venv venv 51 | source venv/bin/activate # On Windows: venv\Scripts\activate 52 | ``` 53 | 3. Install dependencies: 54 | ```bash 55 | pip install -r requirements.txt 56 | ``` 57 | 4. 
Run the application locally: 58 | ```bash 59 | python app.py 60 | ``` 61 | 62 | ## Coding Standards 63 | 64 | - Follow PEP 8 style guidelines 65 | - Write descriptive commit messages 66 | - Include comments and docstrings 67 | - Add tests for new features when possible 68 | 69 | ## License 70 | 71 | By contributing to this project, you agree that your contributions will be licensed under the project's [MIT License](LICENSE). 72 | 73 | ## Questions? 74 | 75 | If you have any questions, feel free to reach out to the project maintainer through the GitHub issues page. -------------------------------------------------------------------------------- /docs/app_architecture_overview.md: -------------------------------------------------------------------------------- 1 | # Website Extractor Architecture Overview 2 | 3 | ``` 4 | ┌───────────────────────────────────────────────────────────────────┐ 5 | │ Website Extractor Application │ 6 | └───────────────────────────────────────────────────────────────────┘ 7 | │ 8 | ▼ 9 | ┌───────────────────────────────────────────────────────────────────┐ 10 | │ Flask Web Server │ 11 | └───────────────────────────────────────────────────────────────────┘ 12 | │ 13 | ▼ 14 | ┌───────────────────────────────────────────────────────────────────┐ 15 | │ Extraction Core Processes │ 16 | ├───────────────┬──────────────────┬──────────────────┬─────────────┤ 17 | │ HTTP Client │ Selenium Renderer │ Content Parser │ Asset Saver │ 18 | │ (requests) │ (WebDriver) │ (BeautifulSoup) │ (Zip) │ 19 | └───────────────┴──────────────────┴──────────────────┴─────────────┘ 20 | ``` 21 | 22 | ## Data Flow 23 | 24 | ``` 25 | ┌──────────┐ URL ┌──────────┐ HTML Content ┌──────────────┐ 26 | │ User │───────────▶│ Extractor│───────────────▶│ HTML Parser │ 27 | └──────────┘ └──────────┘ └──────────────┘ 28 | │ │ 29 | Rendering │ │ Asset URLs 30 | option │ │ 31 | ▼ ▼ 32 | ┌──────────┐ ┌──────────────┐ 33 | │ Selenium │ │ Asset │ 34 | │ WebDriver│ │ Downloader │ 35 | └──────────┘ └──────────────┘ 36 | │ │ 37 | Rendered│ Assets │ 38 | HTML │ │ 39 | ▼ ▼ 40 | ┌──────────────────────────────────────────┐ 41 | │ Zip File Creator │ 42 | └──────────────────────────────────────────┘ 43 | │ 44 | ▼ 45 | ┌──────────────────────────────────────────┐ 46 | │ File Download Response to User │ 47 | └──────────────────────────────────────────┘ 48 | ``` 49 | 50 | ### Key Components 51 | 52 | 1. **Flask Web Server**: The user interface and API endpoint 53 | 2. **HTTP Client**: Makes network requests to target websites 54 | 3. **Selenium Renderer**: Renders JavaScript-heavy sites (optional) 55 | 4. **Content Parser**: Analyzes HTML to extract assets 56 | 5. **Asset Downloader**: Retrieves all website assets 57 | 6. **Zip Creator**: Packages everything into a downloadable archive 58 | 59 | For more detailed information, see the full [app_architecture.md](../app_architecture.md) file. -------------------------------------------------------------------------------- /app_architecture.md: -------------------------------------------------------------------------------- 1 | # Website Extractor - Application Architecture 2 | 3 | ## Overview 4 | 5 | This document provides a high-level overview of the Website Extractor application architecture, explaining how the different components interact and the flow of data through the system. 
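Before walking through the diagrams, the following stand-alone sketch shows the same stages end to end (fetch, parse, download assets, package). It is a deliberately simplified illustration, not the code in `app.py`: the real application layers the Selenium rendering path, header rotation, retry logic, and component extraction described in the rest of this document on top of this basic flow.

```python
# Simplified, self-contained sketch of the pipeline described in this document:
# fetch -> parse -> download assets -> package as ZIP. It intentionally omits the
# Selenium rendering path, header rotation, retries, and component extraction
# that the real app.py adds on top of this flow.
import zipfile
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def extract_site(url: str) -> bytes:
    """Return a ZIP archive (as bytes) containing the page and its linked assets."""
    # 1. Content acquisition (plain HTTP; the real app can also render via Selenium).
    html = requests.get(url, timeout=15).text

    # 2. Structure analysis: collect URLs of linked stylesheets, scripts, and images.
    soup = BeautifulSoup(html, "html.parser")
    asset_urls = set()
    for tag, attr in (("link", "href"), ("script", "src"), ("img", "src")):
        for node in soup.find_all(tag):
            if node.get(attr):
                asset_urls.add(urljoin(url, node[attr]))

    # 3 & 4. Asset download and packaging into an in-memory ZIP archive.
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("index.html", html)
        for asset_url in asset_urls:
            try:
                response = requests.get(asset_url, timeout=15)
            except requests.RequestException:
                continue  # the real application logs failures and retries
            if response.status_code == 200:
                # Name each asset after its last path segment (collisions ignored here).
                name = asset_url.split("/")[-1].split("?")[0] or "asset"
                archive.writestr(f"assets/{name}", response.content)

    return buffer.getvalue()
```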
6 | 7 | ``` 8 | ┌───────────────────────────────────────────────────────────────────┐ 9 | │ Website Extractor Application │ 10 | └───────────────────────────────────────────────────────────────────┘ 11 | │ 12 | ▼ 13 | ┌───────────────────────────────────────────────────────────────────┐ 14 | │ Flask Web Server │ 15 | └───────────────────────────────────────────────────────────────────┘ 16 | │ 17 | ▼ 18 | ┌───────────────────────────────────────────────────────────────────┐ 19 | │ Extraction Core Processes │ 20 | ├───────────────┬──────────────────┬──────────────────┬─────────────┤ 21 | │ HTTP Client │ Selenium Renderer │ Content Parser │ Asset Saver │ 22 | │ (requests) │ (WebDriver) │ (BeautifulSoup) │ (Zip) │ 23 | └───────────────┴──────────────────┴──────────────────┴─────────────┘ 24 | ``` 25 | 26 | ## Data Flow Diagram 27 | 28 | ``` 29 | ┌──────────┐ URL ┌──────────┐ HTML Content ┌──────────────┐ 30 | │ User │───────────▶│ Extractor│───────────────▶│ HTML Parser │ 31 | └──────────┘ └──────────┘ └──────────────┘ 32 | │ │ 33 | Rendering │ │ Asset URLs 34 | option │ │ 35 | ▼ ▼ 36 | ┌──────────┐ ┌──────────────┐ 37 | │ Selenium │ │ Asset │ 38 | │ WebDriver│ │ Downloader │ 39 | └──────────┘ └──────────────┘ 40 | │ │ 41 | Rendered│ Assets │ 42 | HTML │ │ 43 | ▼ ▼ 44 | ┌──────────────────────────────────────────┐ 45 | │ Zip File Creator │ 46 | └──────────────────────────────────────────┘ 47 | │ 48 | ▼ 49 | ┌──────────────────────────────────────────┐ 50 | │ File Download Response to User │ 51 | └──────────────────────────────────────────┘ 52 | ``` 53 | 54 | ## Component Descriptions 55 | 56 | ### 1. Flask Web Server 57 | - **Purpose**: Provides the web interface and handles HTTP requests 58 | - **Key Files**: `app.py` (main file), `templates/index.html` (UI) 59 | - **Functions**: Serves the interface, processes form submissions, returns downloaded files 60 | 61 | ### 2. HTTP Client (Requests) 62 | - **Purpose**: Fetches website content using standard HTTP requests 63 | - **Key Functions**: `download_asset()`, HTTP request code in `/extract` route 64 | - **Features**: Cookie handling, header rotation, retry logic, error handling 65 | 66 | ### 3. Selenium Renderer (Optional) 67 | - **Purpose**: Renders JavaScript-heavy websites using a headless Chrome browser 68 | - **Key Functions**: `extract_with_selenium()` 69 | - **Features**: Waits for dynamic content, scrolls the page, handles lazy loading, identifies framework-specific resources 70 | 71 | ### 4. Content Parser 72 | - **Purpose**: Analyzes HTML content to extract assets and structure 73 | - **Key Functions**: `extract_assets()`, `extract_metadata()`, `extract_component_structure()` 74 | - **Features**: Identifies CSS, JS, images, fonts, extracts metadata, identifies UI components 75 | 76 | ### 5. Asset Downloader 77 | - **Purpose**: Downloads all discovered assets 78 | - **Key Functions**: `download_asset()` 79 | - **Features**: Handles different asset types, resolves relative URLs, manages retries 80 | 81 | ### 6. Zip File Creator 82 | - **Purpose**: Packages all assets into a downloadable zip file 83 | - **Key Functions**: `create_zip_file()` 84 | - **Features**: Organizes assets by type, handles file naming, adds metadata and documentation 85 | 86 | ## Process Flow 87 | 88 | 1. **User Submits URL**: 89 | - User enters a URL in the web interface 90 | - Optionally selects "Use Advanced Rendering (Selenium)" 91 | - Submits the form to the `/extract` endpoint 92 | 93 | 2. 
**Content Acquisition**: 94 | - If Selenium is selected: Uses Chrome WebDriver to render the page 95 | - Otherwise: Uses Requests library for HTTP retrieval 96 | - Handles redirects, errors, retries with different headers 97 | 98 | 3. **HTML Processing**: 99 | - Parses HTML using BeautifulSoup 100 | - Fixes relative URLs 101 | - Extracts metadata (title, description, etc.) 102 | - Identifies UI components 103 | 104 | 4. **Asset Discovery**: 105 | - Finds all linked resources (CSS, JS, images, fonts, etc.) 106 | - Resolves URLs 107 | - Categorizes assets by type 108 | - Handles duplicates 109 | 110 | 5. **Asset Download**: 111 | - Downloads all discovered assets 112 | - Handles binary vs. text content 113 | - Manages errors and retries 114 | 115 | 6. **Zip Creation**: 116 | - Creates organized folder structure 117 | - Adds README and metadata 118 | - Creates component index 119 | - Packages everything into a ZIP file 120 | 121 | 7. **User Download**: 122 | - Returns the ZIP file as a downloadable attachment 123 | - Manages temporary file cleanup 124 | 125 | ## Challenges & Error Patterns 126 | 127 | ### Common Failure Points 128 | 129 | 1. **Selenium WebDriver Initialization**: 130 | - Error seen in logs: `Error initializing Chrome WebDriver: [Errno 8] Exec format error` 131 | - Cause: WebDriver executable permission or architecture mismatch 132 | - Fallback: Alternative initialization method is attempted 133 | 134 | 2. **CDN and Image Processing URLs**: 135 | - Error seen: `Failed to download https://www.tesla.com/q_auto/Homepage-New-Legacy-Model-Y-Desktop.png, status: 404` 136 | - Cause: URLs contain transformation parameters (`q_auto`, `f_auto`) that are processed by CDNs and don't represent actual file paths 137 | 138 | 3. **Theme and Framework Resources**: 139 | - Error seen: `Failed to download https://www.tesla.com/themes/contrib/stable/images/core/throbber-active.gif, status: 404` 140 | - Cause: Theme resources may be generated dynamically or have access restrictions 141 | 142 | 4. **Anti-Bot Measures**: 143 | - Some sites implement anti-scraping measures (403 Forbidden responses) 144 | - Application implements header rotation and Selenium fallback to mitigate this 145 | 146 | ## Improvement Opportunities 147 | 148 | 1. **URL Processing**: Enhance the URL normalization to better handle CDN-specific parameters 149 | 2. **Asset Deduplication**: Improve handling of duplicate assets with different query parameters 150 | 3. **Error Handling**: Add more targeted error handling for specific CDN formats 151 | 4. **WebDriver Management**: Improve Selenium WebDriver initialization reliability 152 | 153 | ## Technical Dependencies 154 | 155 | - **Flask**: Web framework 156 | - **Requests**: HTTP client 157 | - **BeautifulSoup**: HTML parsing 158 | - **Selenium**: Browser automation 159 | - **cssutils**: CSS parsing 160 | - **zipfile**: ZIP file creation -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Website Extractor 2 | 3 | ![Website Extractor Banner](https://img.shields.io/badge/Website%20Extractor-Advanced-blue) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 5 | 6 | ## Overview 7 | 8 | Website Extractor is a powerful Python-based tool that allows you to download and archive entire websites with a single click. 
This application extracts HTML, CSS, JavaScript, images, fonts, and other assets from any website, making it ideal for: 9 | 10 | - Creating pixel-perfect copies of any website online 11 | - Training AI agents with real-world web content 12 | - Studying website structure and design 13 | - Extracting UI components for design inspiration 14 | - Archiving web content for research 15 | - Learning web development techniques 16 | 17 | The application features advanced rendering capabilities using Selenium, allowing it to properly extract assets from modern JavaScript-heavy websites and single-page applications. 18 | 19 | ![App Architecture Overview](https://raw.githubusercontent.com/username/website-extractor/main/docs/app_architecture_overview.png) 20 | 21 | ## Features 22 | 23 | - **Advanced Rendering**: Uses Selenium with Chrome WebDriver to render JavaScript-heavy sites 24 | - **Comprehensive Asset Extraction**: Downloads HTML, CSS, JavaScript, images, fonts, and more 25 | - **Metadata Extraction**: Captures site metadata, OpenGraph tags, and structured data 26 | - **UI Component Analysis**: Identifies and extracts UI components like headers, navigation, cards, etc. 27 | - **Organized Output**: Creates a well-structured ZIP file with assets organized by type 28 | - **Responsive Design**: Works with both desktop and mobile websites 29 | - **CDN Support**: Handles assets from various Content Delivery Networks 30 | - **Modern Framework Support**: Special handling for React, Next.js, Angular, and Tailwind CSS 31 | 32 | ## Advanced Use Cases 33 | 34 | ### Pixel-Perfect Website Copies 35 | Create exact replicas of websites for study, testing, or inspiration. The advanced rendering engine ensures even complex layouts and JavaScript-driven designs are faithfully reproduced. 36 | 37 | ### AI Agent Training 38 | Extract websites to create high-quality training data for your AI agents: 39 | - Feed the structured content to AI models to improve their understanding of web layouts 40 | - Train AI assistants on real-world UI components and design patterns 41 | - Create diverse datasets of web content for machine learning projects 42 | 43 | ### Cursor IDE Integration 44 | Website Extractor works seamlessly with Cursor IDE: 45 | - Extract a website and open it directly in Cursor for code analysis 46 | - Edit the extracted code with Cursor's AI-powered assistance 47 | - Use the components as reference for your own projects 48 | - Ask Cursor to analyze the site's structure and styles to apply similar patterns to your work 49 | 50 | ### Design Inspiration & Reference 51 | Upload the extracted folder to your current project and: 52 | - Ask Cursor to reference its style when building new pages 53 | - Study professional UI implementations 54 | - Extract specific components for reuse in your own projects 55 | - Learn modern CSS techniques from production websites 56 | 57 | ## Installation 58 | 59 | ### Prerequisites 60 | 61 | - Python 3.7+ 62 | - Chrome/Chromium browser (for advanced rendering) 63 | - Git 64 | 65 | ### Using Cursor (Recommended) 66 | 67 | 1. Clone the repository: 68 | ```bash 69 | git clone https://github.com/sirioberati/WebTwin.git 70 | cd WebTwin 71 | ``` 72 | 73 | 2. Open the project in Cursor IDE: 74 | ```bash 75 | cursor . 76 | ``` 77 | 78 | 3. Create a virtual environment (within Cursor's terminal): 79 | ```bash 80 | python -m venv venv 81 | ``` 82 | 83 | 4. 
Activate the virtual environment: 84 | - On Windows: `venv\Scripts\activate` 85 | - On macOS/Linux: `source venv/bin/activate` 86 | 87 | 5. Install dependencies: 88 | ```bash 89 | pip install -r requirements.txt 90 | ``` 91 | 92 | ### Manual Installation 93 | 94 | 1. Clone the repository: 95 | ```bash 96 | git clone https://github.com/sirioberati/WebTwin.git 97 | cd WebTwin 98 | ``` 99 | 100 | 2. Create a virtual environment: 101 | ```bash 102 | python -m venv venv 103 | ``` 104 | 105 | 3. Activate the virtual environment: 106 | - On Windows: `venv\Scripts\activate` 107 | - On macOS/Linux: `source venv/bin/activate` 108 | 109 | 4. Install dependencies: 110 | ```bash 111 | pip install -r requirements.txt 112 | ``` 113 | 114 | ## Usage 115 | 116 | 1. Activate your virtual environment (if not already activated) 117 | 118 | 2. Run the application: 119 | ```bash 120 | python app.py 121 | ``` 122 | 123 | 3. Open your browser and navigate to: 124 | ``` 125 | http://127.0.0.1:5001 126 | ``` 127 | 128 | 4. Enter the URL of the website you want to extract 129 | 130 | 5. Check "Use Advanced Rendering (Selenium)" for JavaScript-heavy websites 131 | 132 | 6. Click "Extract Website" and wait for the download to complete 133 | 134 | ### Using Advanced Rendering 135 | 136 | The advanced rendering option uses Selenium with Chrome WebDriver to: 137 | - Execute JavaScript 138 | - Render dynamic content 139 | - Scroll through the page to trigger lazy loading 140 | - Click on UI elements to expose hidden content 141 | - Extract resources loaded by JavaScript frameworks 142 | 143 | This option is recommended for modern websites, especially those built with React, Angular, Vue, or other JavaScript frameworks. 144 | 145 | ### Using with Cursor IDE 146 | 147 | After extracting a website: 148 | 149 | 1. Unzip the downloaded file to a directory 150 | 2. Open with Cursor IDE: 151 | ```bash 152 | cursor /path/to/extracted/website 153 | ``` 154 | 3. Explore the code structure and assets 155 | 4. Ask Cursor AI to analyze the code with prompts like: 156 | - "Explain the CSS structure of this website" 157 | - "How can I implement a similar hero section in my project?" 158 | - "Analyze this navigation component and create a similar one for my React app" 159 | 160 | ## AI Agent Integration 161 | 162 | WebTwin can be a powerful tool when combined with AI agents, enabling sophisticated workflows for code analysis, design extraction, and content repurposing. 163 | 164 | ### Integration with Cursor AI 165 | 166 | Cursor's AI capabilities can be supercharged with WebTwin's extraction abilities: 167 | 168 | 1. **Extract and Modify Workflow**: 169 | ``` 170 | WebTwin → Extract Site → Open in Cursor → Ask AI to Modify 171 | ``` 172 | Example prompts: 173 | - "Convert this landing page to use Tailwind CSS instead of Bootstrap" 174 | - "Refactor this JavaScript code to use React hooks" 175 | - "Simplify this complex CSS layout while maintaining the same visual appearance" 176 | 177 | 2. **Component Library Creation**: 178 | ``` 179 | WebTwin → Extract Multiple Sites → Open in Cursor → AI-Powered Component Extraction 180 | ``` 181 | Example prompts: 182 | - "Extract all button styles from these websites and create a unified component library" 183 | - "Analyze these navigation patterns and create a best-practices implementation" 184 | 185 | 3. 
**Learn from Production Code**: 186 | ``` 187 | WebTwin → Extract Complex Site → Cursor AI Analysis → Generate Tutorial 188 | ``` 189 | Example prompts: 190 | - "Explain how this site implements its responsive design strategy" 191 | - "Show me how this animation effect works and help me implement something similar" 192 | 193 | ### Integration with OpenAI Assistants API & Agent SDK 194 | 195 | WebTwin can be integrated with the OpenAI Assistants API and Agent SDK to create specialized AI agents: 196 | 197 | 1. **Setup a Website Analysis Agent**: 198 | ```python 199 | from openai import OpenAI 200 | 201 | client = OpenAI(api_key="your-api-key") 202 | 203 | # Create an assistant specialized in web design analysis 204 | assistant = client.beta.assistants.create( 205 | name="WebDesignAnalyzer", 206 | instructions="You analyze websites extracted by WebTwin and provide design insights.", 207 | model="gpt-4-turbo", 208 | tools=[{"type": "file_search"}] 209 | ) 210 | 211 | # Upload the extracted website files 212 | file = client.files.create( 213 | file=open("extracted_website.zip", "rb"), 214 | purpose="assistants" 215 | ) 216 | 217 | # Create a thread with the file 218 | thread = client.beta.threads.create( 219 | messages=[ 220 | { 221 | "role": "user", 222 | "content": "Analyze this website's design patterns and component structure", 223 | "file_ids": [file.id] 224 | } 225 | ] 226 | ) 227 | 228 | # Run the assistant on the thread 229 | run = client.beta.threads.runs.create( 230 | thread_id=thread.id, 231 | assistant_id=assistant.id 232 | ) 233 | ``` 234 | 235 | 2. **Create a Website Transformation Pipeline**: 236 | ``` 237 | WebTwin → Extract Site → OpenAI Agent Processes → Generate New Code 238 | ``` 239 | 240 | 3. **Build a Web Design Critique Agent**: 241 | - Feed WebTwin extractions to an AI agent trained to evaluate design principles 242 | - Receive detailed feedback on accessibility, usability, and visual design 243 | 244 | ### Advanced Agent Workflows 245 | 246 | Combine WebTwin with AI agents for advanced workflows: 247 | 248 | 1. **Cross-Site Design Pattern Analysis**: 249 | - Extract multiple sites in the same industry 250 | - Use AI to identify common patterns and best practices 251 | - Generate a report on industry-standard approaches 252 | 253 | 2. **Automated Component Library Generation**: 254 | - Extract multiple sites 255 | - Use AI to identify and categorize UI components 256 | - Generate a unified component library with documentation 257 | 258 | 3. **SEO and Content Strategy Analysis**: 259 | - Extract content-rich websites 260 | - Use AI to analyze content structure, metadata, and keyword usage 261 | - Generate SEO recommendations and content strategy insights 262 | 263 | 4. 
**Competitive Analysis**: 264 | - Extract competitor websites 265 | - Use AI to compare features, UX patterns, and technical implementations 266 | - Generate a competitive analysis report with strengths and weaknesses 267 | 268 | ## Architecture 269 | 270 | The application is built with a modular architecture designed for flexibility and performance: 271 | 272 | ``` 273 | ┌───────────────────────────────────────────────────────────────────┐ 274 | │ Website Extractor Application │ 275 | └───────────────────────────────────────────────────────────────────┘ 276 | │ 277 | ▼ 278 | ┌───────────────────────────────────────────────────────────────────┐ 279 | │ Flask Web Server │ 280 | └───────────────────────────────────────────────────────────────────┘ 281 | │ 282 | ▼ 283 | ┌───────────────────────────────────────────────────────────────────┐ 284 | │ Extraction Core Processes │ 285 | ├───────────────┬──────────────────┬──────────────────┬─────────────┤ 286 | │ HTTP Client │ Selenium Renderer │ Content Parser │ Asset Saver │ 287 | │ (requests) │ (WebDriver) │ (BeautifulSoup) │ (Zip) │ 288 | └───────────────┴──────────────────┴──────────────────┴─────────────┘ 289 | ``` 290 | 291 | ### Data Flow 292 | 293 | ``` 294 | ┌──────────┐ URL ┌──────────┐ HTML Content ┌──────────────┐ 295 | │ User │───────────▶│ Extractor│───────────────▶│ HTML Parser │ 296 | └──────────┘ └──────────┘ └──────────────┘ 297 | │ │ 298 | Rendering │ │ Asset URLs 299 | option │ │ 300 | ▼ ▼ 301 | ┌──────────┐ ┌──────────────┐ 302 | │ Selenium │ │ Asset │ 303 | │ WebDriver│ │ Downloader │ 304 | └──────────┘ └──────────────┘ 305 | │ │ 306 | Rendered│ Assets │ 307 | HTML │ │ 308 | ▼ ▼ 309 | ┌──────────────────────────────────────────┐ 310 | │ Zip File Creator │ 311 | └──────────────────────────────────────────┘ 312 | │ 313 | ▼ 314 | ┌──────────────────────────────────────────┐ 315 | │ File Download Response to User │ 316 | └──────────────────────────────────────────┘ 317 | ``` 318 | 319 | ### Key Components 320 | 321 | 1. **Flask Web Server**: Provides the user interface and handles HTTP requests 322 | 2. **HTTP Client**: Makes requests to fetch website content using the Requests library 323 | 3. **Selenium Renderer**: Optional component for JavaScript rendering and dynamic content 324 | 4. **Content Parser**: Analyzes HTML to extract assets and structure using BeautifulSoup 325 | 5. **Asset Downloader**: Downloads all discovered assets with sophisticated retry logic 326 | 6. **ZIP Creator**: Packages everything into an organized downloadable archive 327 | 328 | ### Processing Stages 329 | 330 | 1. **URL Submission**: User provides a URL and rendering options 331 | 2. **Content Acquisition**: HTML content is fetched (with or without JavaScript rendering) 332 | 3. **Structure Analysis**: HTML is parsed and analyzed for assets and components 333 | 4. **Asset Discovery**: All linked resources are identified and categorized 334 | 5. **Parallel Downloading**: Assets are downloaded with optimized concurrent requests 335 | 6. **Organization & Packaging**: Files are organized and compressed into a ZIP archive 336 | 337 | For more detailed technical information, see [app_architecture.md](app_architecture.md). 
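As a hands-on illustration of stage 5 (parallel downloading), the snippet below fetches a batch of asset URLs with a small thread pool. It is a minimal sketch under assumed defaults (eight workers, a single `Referer` header), not the exact scheduling or retry logic used in `app.py`.

```python
# Minimal sketch of concurrent asset downloading; illustrative, not the code in app.py.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Optional

import requests


def fetch_asset(url: str, referer: str) -> Optional[bytes]:
    """Download a single asset, returning None on any failure."""
    try:
        response = requests.get(url, timeout=15, headers={"Referer": referer})
        return response.content if response.status_code == 200 else None
    except requests.RequestException:
        return None


def fetch_assets(asset_urls: List[str], referer: str, workers: int = 8) -> Dict[str, bytes]:
    """Fetch many assets in parallel; failed downloads are simply skipped."""
    results: Dict[str, bytes] = {}
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(fetch_asset, url, referer): url for url in asset_urls}
        for future in as_completed(futures):
            content = future.result()
            if content is not None:
                results[futures[future]] = content
    return results
```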
338 | 339 | ## Limitations 340 | 341 | - Some websites implement anti-scraping measures that may block extraction 342 | - Content requiring authentication may not be accessible 343 | - Very large websites may time out or require multiple extraction attempts 344 | - Some CDN-specific URL formats may fail to download (especially those with transformation parameters) 345 | 346 | ## License 347 | 348 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 349 | 350 | ## Author 351 | 352 | Created by Sirio Berati 353 | 354 | - Instagram: [@heysirio](https://instagram.com/heysirio) 355 | - Instagram: [@siriosagents](https://instagram.com/siriosagents) 356 | 357 | ## Contributing 358 | 359 | Contributions are welcome! Please feel free to submit a Pull Request. 360 | 361 | 1. Fork the repository 362 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`) 363 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`) 364 | 4. Push to the branch (`git push origin feature/amazing-feature`) 365 | 5. Open a Pull Request 366 | 367 | ## Acknowledgments 368 | 369 | - This project uses [Flask](https://flask.palletsprojects.com/) for the web framework 370 | - [Selenium](https://www.selenium.dev/) for advanced rendering 371 | - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for HTML parsing 372 | - All the open source libraries that made this project possible 373 | -------------------------------------------------------------------------------- /templates/components.html: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Extracted Components Viewer 13 | 14 | 42 | 43 | 44 |
45 |
46 |
47 |
48 |

Extracted Components

49 |

50 | {% if extracted_url is defined and extracted_url %} 51 | Components extracted from: {{ extracted_url }} 52 | {% else %} 53 | Browse and inspect all extracted UI components from the website 54 | {% endif %} 55 |

56 |
57 | 58 | Back to Main Page 59 | 60 |
61 | 62 |
63 |
64 | 65 | 66 | 67 |

How to Use This Viewer

68 |
69 |
    70 |
  1. Browse through the component categories below
  2. Click on any component to view its preview
  3. Use the "View Code" button to see the HTML structure
  4. Copy the code to use in your own projects
  5. Click "View in Context" to see how the component appears on the original page
76 |
77 | 78 |
79 |
80 |
81 | 82 | 83 | 84 |

Navigation

85 |
86 |

Headers, menus, and navigation bars

87 |
88 | {{ navigation_count }} components found 89 |
90 |
91 | 92 |
93 |
94 | 95 | 96 | 97 |

Hero Sections

98 |
99 |

Main banners and hero areas

100 |
101 | {{ hero_count }} components found 102 |
103 |
104 | 105 |
106 |
107 | 108 | 109 | 110 |

Cards

111 |
112 |

Product cards, info cards, and pricing cards

113 |
114 | {{ card_count }} components found 115 |
116 |
117 | 118 |
119 |
120 | 121 | 122 | 123 |

Sections

124 |
125 |

Content sections and feature blocks

126 |
127 | {{ section_count }} components found 128 |
129 |
130 | 131 |
132 |
133 | 134 | 135 | 136 |

Forms

137 |
138 |

Contact forms, sign-up forms, and inputs

139 |
140 | {{ form_count }} components found 141 |
142 |
143 | 144 |
145 |
146 | 147 | 148 | 149 |

Footers

150 |
151 |

Page footers and bottom sections

152 |
153 | {{ footer_count }} components found 154 |
155 |
156 | 157 |
158 |
159 | 160 | 161 | 162 |

Store Components

163 | NEW 164 |
165 |

Product listings, filters, and store layouts

166 |
167 | {{ store_count }} components found 168 |
169 |
170 | 171 |
172 |
173 | 174 | 175 | 176 |

Mobile Components

177 | NEW 178 |
179 |

Mobile-specific UI elements and responsive components

180 |
181 | {{ mobile_count }} components found 182 |
183 |
184 | 185 |
186 |
187 | 188 | 189 | 190 |

Cart Components

191 | NEW 192 |
193 |

Shopping cart elements and checkout flows

194 |
195 | {{ cart_count }} components found 196 |
197 |
198 |
199 | 200 |
201 |

Metadata

202 |
203 |
204 |

Page Title

205 |

{{ page_title }}

206 |
207 | 208 |
209 |

Description

210 |

{{ meta_description }}

211 |
212 | 213 |
214 |

Keywords

215 |
216 | {% for keyword in meta_keywords %} 217 | {{ keyword }} 218 | {% endfor %} 219 |
220 |
221 | 222 |
223 |

Open Graph

224 |
225 |

Title: {{ og_title }}

226 |

Description: {{ og_description }}

227 |

Image: {{ og_image }}

228 |
229 |
230 |
231 |
232 | 233 |
234 |

Framework Configuration

235 |
236 |
237 |
238 | 239 | 240 | 241 |

Next.js Configuration

242 | NEW 243 |
244 |
245 |
{{ next_config }}
246 |
247 |
248 | 249 |
250 |
251 | 252 | 253 | 254 |

Tailwind Configuration

255 | NEW 256 |
257 |
258 |
{{ tailwind_config }}
259 |
260 |
261 |
262 |
263 | 264 | 265 |
266 |

Component Previews

267 | 268 | {% if components|length > 0 %} 269 | {% for component in components %} 270 |
271 |
272 |
273 |

{{ component.name }}

274 |

{{ component.type }}

275 |
276 |
277 | 281 | 282 | View in Context 283 | 284 |
285 |
286 | 287 |
288 |
289 | {{ component.html|safe }} 290 |
291 |
292 | 293 |
294 |
{{ component.code }}
295 |
296 |
297 | {% endfor %} 298 | {% else %} 299 |
300 |
301 |
302 | 303 | 304 | 305 |
306 |
307 |

308 | No components were found in the extracted website. This could be because: 309 |

310 |
    311 |
  • The website uses a complex structure that's difficult to extract
  • The website uses custom components that don't match our extraction patterns
  • You're viewing the demo components instead of an actual extraction
315 |

316 | Try extracting a different website or check the ZIP file for the complete website clone. 317 |

318 |
319 |
320 |
321 | {% endif %} 322 |
323 |
324 |
325 | 326 | 340 | 341 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Website Extractor - Pixel Perfect Clone 7 | 8 | 28 | 29 | 30 |
31 |
32 |
33 |

Website Extractor

34 |
35 |

Create pixel-perfect clones of any website

36 | 37 |
38 |
39 |
40 | 41 | 44 |

Enter the complete URL of the website you want to clone

45 |
46 | 47 |
48 | 50 | 54 |
55 |

Renders JavaScript and scrolls through the page to capture all content. Recommended for modern websites with dynamic content.

56 | 57 | 61 |
62 | 63 |
64 |
65 |
66 |
67 |
68 |
69 |

Extracting website assets...

70 |
71 |
72 |
73 |

Initializing...

74 |
75 |
76 |
77 | 78 | 79 | 80 | 101 |
102 | 103 |
104 |

Enhanced Extraction Features

105 |
    106 |
  • Complete HTML DOM structure
  • All CSS stylesheets (external and inline)
  • globals.css and styling files NEW
  • JavaScript files and functionality
  • Next.js configuration and files NEW
  • Image configurations and assets NEW
  • Metadata extraction NEW
  • Mobile-specific components NEW
  • SVG graphics (both linked and inline) NEW
  • Video files and player components NEW
  • Audio files and player components NEW
  • Font files and font family detection NEW
  • GIF animations and dynamic content NEW
  • Screenshots of website and components PREMIUM
  • JavaScript-rendered content capture PREMIUM
209 |
210 | 211 |
212 |

Extracted UI Components

213 |

The tool automatically identifies and extracts key UI components for easy reuse:

214 | 215 |
216 |
217 |
218 | 219 | 220 | 221 |

Navigation

222 |
223 |

Headers, menus, and navigation bars with responsive design

224 |
225 | 226 |
227 |
228 | 229 | 230 | 231 |

Hero Sections

232 |
233 |

Eye-catching hero banners with images, text overlays, and call-to-action buttons

234 |
235 | 236 |
237 |
238 | 239 | 240 | 241 |

Store Pages

242 | NEW 243 |
244 |

Complete store layouts with product listings, filters, and shopping functionality

245 |
246 | 247 |
248 |
249 | 250 | 251 | 252 |

Mobile Menus

253 | NEW 254 |
255 |

Mobile-specific navigation components and responsive design elements

256 |
257 | 258 |
259 |
260 | 261 | 262 | 263 |

Product Grids

264 |
265 |

Product listings and card grids with images, pricing, and descriptions

266 |
267 | 268 |
269 |
270 | 271 | 272 | 273 |

Shopping Cart

274 | NEW 275 |
276 |

Cart components with item listings, quantity controls, and checkout buttons

277 |
278 | 279 |
280 |
281 | 282 | 283 | 284 |

Carousels & Sliders

285 | NEW 286 |
287 |

Image sliders, product carousels, and testimonial rotators with controls

288 |
289 | 290 |
291 |
292 | 293 | 294 | 295 |

Video Players

296 | NEW 297 |
298 |

Custom video player components with controls and responsive design

299 |
300 | 301 |
302 |
303 | 304 | 305 | 306 |

Audio Players

307 | NEW 308 |
309 |

Audio playback components with controls and playlist functionality

310 |
311 | 312 |
313 |
314 | 315 | 316 | 317 |

Tab Components

318 | NEW 319 |
320 |

Tabbed interfaces with content panels and interactive navigation

321 |
322 | 323 |
324 |
325 | 326 | 327 | 328 |

Social Media

329 | NEW 330 |
331 |

Social media links, sharing buttons, and embedded social feeds

332 |
333 | 334 |
335 |
336 | 337 | 338 | 339 |

Modals & Popups

340 | NEW 341 |
342 |

Modals, popup dialogs, and overlay components with animations

343 |
344 |
345 | 346 |
347 |

Components Included in ZIP File

348 |

All extracted UI components are included in the downloaded ZIP file for easy access and reuse.

349 |
350 |
351 | 352 |
353 |

Framework Support

354 |
355 |
356 |
357 | 358 | 359 | 360 |

Next.js

361 | NEW 362 |
363 |
    364 |
  • next.config.js extraction
  • _app.js and _document.js
  • Static and dynamic routes
368 |
369 | 370 |
371 |
372 | 373 | 374 | 375 |

Tailwind CSS

376 | NEW 377 |
378 |
    379 |
  • tailwind.config.js extraction
  • Custom theme settings
  • Plugin configurations
383 |
384 | 385 |
386 |
387 | 388 | 389 | 390 |

React

391 | NEW 392 |
393 |
    394 |
  • Component structure detection
  • React-specific attributes
  • State management patterns
398 |
399 | 400 |
401 |
402 | 403 | 404 | 405 |

Vue.js

406 | NEW 407 |
408 |
    409 |
  • Vue component detection
  • vue.config.js extraction
  • Vue directives parsing
413 |
414 | 415 |
416 |
417 | 418 | 419 | 420 |

Angular

421 | NEW 422 |
423 |
    424 |
  • Angular component structure
  • angular.json configuration
  • Module detection
428 |
429 | 430 |
431 |
432 | 433 | 434 | 435 |

Bootstrap

436 | NEW 437 |
438 |
    439 |
  • Bootstrap component classes
  • Grid system extraction
  • Custom Bootstrap themes
443 |
444 | 445 |
446 |
447 | 448 | 449 | 450 |

SCSS/SASS

451 | NEW 452 |
453 |
    454 |
  • Variable definitions
  • Mixin extraction
  • Nested styles
458 |
459 | 460 |
461 |
462 | 463 | 464 | 465 |

Svelte

466 | NEW 467 |
468 |
    469 |
  • Svelte component format
  • Reactive declarations
  • Template structure
473 |
474 | 475 |
476 |
477 | 478 | 479 | 480 |

Material UI

481 | NEW 482 |
483 |
    484 |
  • Material component classes
  • Theme configuration
  • Material icons
488 |
489 |
490 |
491 | 492 |
493 |

How to Use the Clone

494 |
    495 |
  1. Extract the downloaded ZIP file
  2. Open the index.html file in your browser to view the static clone
  3. Find extracted components in the components folder
  4. Review metadata in metadata.json
  5. Check css/globals.css for global styling
  6. Edit the HTML, CSS, and JavaScript files in Cursor to customize the design
  7. Use the included manifest.json file to locate specific assets
503 |
504 |

Pro Tip:

505 |

For the most accurate results, try cloning the desktop version of websites. Some sites may have anti-scraping measures that could affect the results.

506 |
507 |
508 |
509 |
510 | 511 | 804 | 805 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, send_file, jsonify, session, after_this_request 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import os 5 | import re 6 | import json 7 | from urllib.parse import urljoin, urlparse, urlunparse, unquote, quote, parse_qs 8 | import zipfile 9 | from io import BytesIO 10 | import mimetypes 11 | import base64 12 | import cssutils 13 | import logging 14 | import uuid 15 | import random 16 | import time 17 | import urllib3 18 | import tempfile 19 | from datetime import datetime 20 | import traceback 21 | import html 22 | import shutil 23 | import threading 24 | 25 | # Try to import Selenium 26 | SELENIUM_AVAILABLE = False 27 | try: 28 | from selenium import webdriver 29 | from selenium.webdriver.chrome.options import Options 30 | from selenium.webdriver.common.by import By 31 | from selenium.webdriver.support.ui import WebDriverWait 32 | from selenium.webdriver.support import expected_conditions as EC 33 | from selenium.common.exceptions import TimeoutException, WebDriverException 34 | from selenium.webdriver.chrome.service import Service 35 | from webdriver_manager.chrome import ChromeDriverManager 36 | SELENIUM_AVAILABLE = True 37 | print("Selenium is available. Advanced rendering is enabled.") 38 | except ImportError: 39 | SELENIUM_AVAILABLE = False 40 | print("Selenium not available. Advanced rendering will be disabled.") 41 | 42 | # Suppress cssutils warnings 43 | cssutils.log.setLevel(logging.CRITICAL) 44 | 45 | app = Flask(__name__) 46 | app.secret_key = os.environ.get('SECRET_KEY', 'dev_key_for_website_extractor') 47 | 48 | def is_binary_content(content, asset_type): 49 | """Determine if content should be treated as binary or text based on asset type and content inspection""" 50 | # First check by asset type 51 | if asset_type in ['images', 'fonts', 'videos', 'audio']: 52 | return True 53 | 54 | # For potentially text-based assets, try to detect if it's binary 55 | if asset_type in ['css', 'js', 'html', 'svg', 'json', 'globals_css']: 56 | # Check if the content is bytes 57 | if not isinstance(content, bytes): 58 | return False 59 | 60 | # Try to detect if binary by checking for null bytes and high concentration of non-ASCII chars 61 | try: 62 | # Check for null bytes which indicate binary content 63 | if b'\x00' in content: 64 | return True 65 | 66 | # Sample the first 1024 bytes to determine if it's binary 67 | sample = content[:1024] 68 | text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F}) 69 | return bool(sample.translate(None, text_chars)) 70 | except: 71 | # If there's any error in detection, treat as binary to be safe 72 | return True 73 | 74 | # For anything else, just check if it's bytes 75 | return isinstance(content, bytes) 76 | 77 | def download_asset(url, base_url, headers=None, session_obj=None): 78 | """ 79 | Download an asset from a URL 80 | 81 | Args: 82 | url: URL to download from 83 | base_url: Base URL of the website (for referrer) 84 | headers: Optional custom headers 85 | session_obj: Optional requests.Session object for maintaining cookies 86 | 87 | Returns: 88 | Content of the asset or None if download failed 89 | """ 90 | # List of user agents to rotate through to avoid detection 91 | user_agents = [ 92 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 93 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 94 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15', 95 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0', 96 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 97 | 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1' 98 | ] 99 | 100 | # Use a random user agent 101 | random_user_agent = random.choice(user_agents) 102 | 103 | if not headers: 104 | headers = { 105 | 'User-Agent': random_user_agent, 106 | 'Accept': '*/*', 107 | 'Accept-Language': 'en-US,en;q=0.9', 108 | 'Accept-Encoding': 'gzip, deflate, br', 109 | 'Connection': 'keep-alive', 110 | 'Referer': base_url, 111 | 'Sec-Fetch-Dest': 'empty', 112 | 'Sec-Fetch-Mode': 'cors', 113 | 'Sec-Fetch-Site': 'same-origin', 114 | 'Pragma': 'no-cache', 115 | 'Cache-Control': 'no-cache', 116 | } 117 | else: 118 | # Update the user agent in the provided headers 119 | headers['User-Agent'] = random_user_agent 120 | 121 | # Parse the URL to check if it's valid 122 | try: 123 | parsed_url = urlparse(url) 124 | if not parsed_url.scheme or not parsed_url.netloc: 125 | print(f"Invalid URL: {url}") 126 | return None 127 | except Exception as e: 128 | print(f"Error parsing URL {url}: {str(e)}") 129 | return None 130 | 131 | # Add a delay to avoid rate limiting 132 | time.sleep(0.1) # 100ms delay between requests 133 | 134 | # Maximum number of retries 135 | max_retries = 3 136 | retry_count = 0 137 | 138 | while retry_count < max_retries: 139 | try: 140 | # Use session if provided, otherwise make a direct request 141 | if session_obj: 142 | response = session_obj.get( 143 | url, 144 | timeout=15, 145 | headers=headers, 146 | stream=True, 147 | allow_redirects=True, 148 | verify=False # Ignore SSL certificate errors 149 | ) 150 | else: 151 | response = requests.get( 152 | url, 153 | timeout=15, 154 | headers=headers, 155 | stream=True, 156 | allow_redirects=True, 157 | verify=False # Ignore SSL certificate errors 158 | ) 159 | 160 | # Handle redirects 161 | if response.history: 162 | print(f"Request for {url} was redirected {len(response.history)} times to {response.url}") 163 | url = response.url # Update URL to the final destination 164 | 165 | if response.status_code == 200: 166 | # Check the Content-Type header 167 | content_type = response.headers.get('Content-Type', '') 168 | print(f"Downloaded {url} ({len(response.content)} bytes, type: {content_type})") 169 | 170 | # Check for binary content types 171 | is_binary = any(binary_type in content_type.lower() for binary_type in [ 172 | 'image/', 'video/', 'audio/', 'font/', 'application/octet-stream', 173 | 'application/zip', 'application/x-rar', 'application/pdf', 'application/vnd.' 
174 | ]) 175 | 176 | # If binary or content-type suggests binary, return raw content 177 | if is_binary: 178 | return response.content 179 | 180 | # For text content types 181 | is_text = any(text_type in content_type.lower() for text_type in [ 182 | 'text/', 'application/json', 'application/javascript', 'application/xml', 'application/xhtml' 183 | ]) 184 | 185 | if is_text: 186 | # Try to determine encoding 187 | encoding = None 188 | 189 | # From Content-Type header 190 | if 'charset=' in content_type: 191 | encoding = content_type.split('charset=')[1].split(';')[0].strip() 192 | 193 | # From response encoding or apparent encoding 194 | if not encoding: 195 | encoding = response.encoding or response.apparent_encoding or 'utf-8' 196 | 197 | # Decode with specified encoding 198 | try: 199 | return response.content.decode(encoding, errors='replace').encode('utf-8') 200 | except (UnicodeDecodeError, LookupError): 201 | # If decoding fails, try utf-8 202 | try: 203 | return response.content.decode('utf-8', errors='replace').encode('utf-8') 204 | except: 205 | # If all else fails, return raw content 206 | return response.content 207 | 208 | # For unknown content types, return raw content 209 | return response.content 210 | elif response.status_code == 404: 211 | print(f"Resource not found (404): {url}") 212 | return None 213 | elif response.status_code == 403: 214 | print(f"Access forbidden (403): {url}") 215 | # Try with a different user agent on the next retry 216 | headers['User-Agent'] = random.choice(user_agents) 217 | retry_count += 1 218 | time.sleep(1) # Wait longer before retrying 219 | continue 220 | elif response.status_code >= 500: 221 | print(f"Server error ({response.status_code}): {url}") 222 | retry_count += 1 223 | time.sleep(1) # Wait longer before retrying 224 | continue 225 | else: 226 | print(f"HTTP error ({response.status_code}): {url}") 227 | return None 228 | 229 | except requests.exceptions.Timeout: 230 | print(f"Timeout error downloading {url}") 231 | retry_count += 1 232 | time.sleep(1) 233 | continue 234 | except requests.exceptions.ConnectionError: 235 | print(f"Connection error downloading {url}") 236 | retry_count += 1 237 | time.sleep(1) 238 | continue 239 | except requests.exceptions.TooManyRedirects: 240 | print(f"Too many redirects for {url}") 241 | return None 242 | except Exception as e: 243 | print(f"Error downloading {url}: {str(e)}") 244 | return None 245 | 246 | if retry_count == max_retries: 247 | print(f"Max retries reached for {url}") 248 | 249 | return None 250 | 251 | def get_asset_type(url): 252 | """Determine the type of asset from the URL""" 253 | # Handle empty or None URLs 254 | if not url: 255 | return 'other' 256 | 257 | url_lower = url.lower() 258 | 259 | # Framework-specific patterns 260 | if '_next/static' in url_lower: 261 | if '.css' in url_lower or 'styles' in url_lower: 262 | return 'css' 263 | return 'js' # Default to JS for Next.js assets 264 | 265 | if 'chunk.' 
in url_lower or 'webpack' in url_lower: 266 | return 'js' # Webpack chunks 267 | 268 | if 'angular' in url_lower and '.js' in url_lower: 269 | return 'js' # Angular bundles 270 | 271 | # Handle CSS files 272 | if url_lower.endswith(('.css', '.scss', '.less', '.sass')): 273 | return 'css' 274 | if 'global.css' in url_lower or 'globals.css' in url_lower or 'tailwind' in url_lower: 275 | return 'css' 276 | if 'fonts.googleapis.com' in url_lower: 277 | return 'css' 278 | if 'styles' in url_lower and '.css' in url_lower: 279 | return 'css' 280 | 281 | # Handle JS files 282 | if url_lower.endswith(('.js', '.jsx', '.mjs', '.ts', '.tsx', '.cjs')): 283 | return 'js' 284 | if 'bundle.js' in url_lower or 'main.js' in url_lower or 'app.js' in url_lower: 285 | return 'js' 286 | if 'polyfill' in url_lower or 'runtime' in url_lower or 'vendor' in url_lower: 287 | return 'js' 288 | if 'image-config' in url_lower or 'image.config' in url_lower: 289 | return 'js' 290 | 291 | # Handle image files 292 | if url_lower.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.avif', '.bmp', '.ico')): 293 | return 'img' 294 | if '/images/' in url_lower or '/img/' in url_lower or '/assets/images/' in url_lower: 295 | return 'img' 296 | 297 | # Handle font files 298 | if url_lower.endswith(('.woff', '.woff2', '.ttf', '.otf', '.eot')): 299 | return 'fonts' 300 | if '/fonts/' in url_lower or 'font-awesome' in url_lower: 301 | return 'fonts' 302 | 303 | # Handle media files 304 | if url_lower.endswith(('.mp4', '.webm', '.ogg', '.avi', '.mov', '.flv')): 305 | return 'videos' 306 | if url_lower.endswith(('.mp3', '.wav', '.ogg', '.aac')): 307 | return 'audio' 308 | 309 | # Handle favicon 310 | if url_lower.endswith(('.ico', '.icon')): 311 | return 'favicons' 312 | if 'favicon' in url_lower: 313 | return 'favicons' 314 | 315 | # Handle special API endpoints 316 | if 'graphql' in url_lower or 'api.' 
in url_lower: 317 | return 'js' 318 | 319 | # Try to guess based on URL structure 320 | if '/css/' in url_lower: 321 | return 'css' 322 | if '/js/' in url_lower or '/scripts/' in url_lower: 323 | return 'js' 324 | if '/static/' in url_lower and not any(ext in url_lower for ext in ['.css', '.js', '.png', '.jpg']): 325 | # For static assets with unclear type, check the URL itself 326 | if 'style' in url_lower: 327 | return 'css' 328 | return 'js' # Default for static assets 329 | 330 | # For CDN resources, try to determine type from the host 331 | cdn_hosts = ['cdn.jsdelivr.net', 'unpkg.com', 'cdnjs.cloudflare.com'] 332 | for host in cdn_hosts: 333 | if host in url_lower: 334 | if any(lib in url_lower for lib in ['react', 'angular', 'vue', 'jquery']): 335 | return 'js' 336 | if any(lib in url_lower for lib in ['bootstrap', 'tailwind', 'material', 'font']): 337 | return 'css' 338 | 339 | # Default to JS for unknown extensions 340 | return 'js' 341 | 342 | def extract_metadata(soup, base_url): 343 | """Extract metadata from the HTML""" 344 | metadata = { 345 | 'title': '', 346 | 'description': '', 347 | 'keywords': '', 348 | 'og_tags': {}, 349 | 'twitter_cards': {}, 350 | 'canonical': '', 351 | 'language': '', 352 | 'favicon': '', 353 | 'structured_data': [] 354 | } 355 | 356 | # Extract title 357 | title_tag = soup.find('title') 358 | if title_tag and title_tag.string: 359 | metadata['title'] = title_tag.string.strip() 360 | 361 | # Extract meta tags 362 | meta_tags = soup.find_all('meta') 363 | for tag in meta_tags: 364 | # Description 365 | if tag.get('name') == 'description' and tag.get('content'): 366 | metadata['description'] = tag.get('content').strip() 367 | 368 | # Keywords 369 | elif tag.get('name') == 'keywords' and tag.get('content'): 370 | metadata['keywords'] = tag.get('content').strip() 371 | 372 | # OpenGraph tags 373 | elif tag.get('property') and tag.get('property').startswith('og:') and tag.get('content'): 374 | prop = tag.get('property')[3:] # Remove 'og:' prefix 375 | metadata['og_tags'][prop] = tag.get('content').strip() 376 | 377 | # Twitter card tags 378 | elif tag.get('name') and tag.get('name').startswith('twitter:') and tag.get('content'): 379 | prop = tag.get('name')[8:] # Remove 'twitter:' prefix 380 | metadata['twitter_cards'][prop] = tag.get('content').strip() 381 | 382 | # Extract canonical URL 383 | canonical_tag = soup.find('link', {'rel': 'canonical'}) 384 | if canonical_tag and canonical_tag.get('href'): 385 | canonical_url = canonical_tag.get('href') 386 | if not canonical_url.startswith(('http://', 'https://')): 387 | canonical_url = urljoin(base_url, canonical_url) 388 | metadata['canonical'] = canonical_url 389 | 390 | # Extract language 391 | html_tag = soup.find('html') 392 | if html_tag and html_tag.get('lang'): 393 | metadata['language'] = html_tag.get('lang') 394 | 395 | # Extract favicon 396 | favicon_tag = soup.find('link', {'rel': 'icon'}) or soup.find('link', {'rel': 'shortcut icon'}) 397 | if favicon_tag and favicon_tag.get('href'): 398 | favicon_url = favicon_tag.get('href') 399 | if not favicon_url.startswith(('http://', 'https://')): 400 | favicon_url = urljoin(base_url, favicon_url) 401 | metadata['favicon'] = favicon_url 402 | 403 | # Extract structured data (JSON-LD) 404 | script_tags = soup.find_all('script', {'type': 'application/ld+json'}) 405 | for tag in script_tags: 406 | if tag.string: 407 | try: 408 | json_data = json.loads(tag.string) 409 | metadata['structured_data'].append(json_data) 410 | except json.JSONDecodeError: 411 | 
pass 412 | 413 | return metadata 414 | 415 | def get_component_type(element): 416 | """Determine the type of UI component based on element attributes and classes""" 417 | if not element: 418 | return None 419 | 420 | # Get tag name, classes, and ID 421 | tag_name = element.name 422 | class_list = element.get('class', []) 423 | if class_list and not isinstance(class_list, list): 424 | class_list = [class_list] 425 | class_str = ' '.join(class_list).lower() if class_list else '' 426 | element_id = element.get('id', '').lower() 427 | 428 | # Get element role 429 | role = element.get('role', '').lower() 430 | 431 | # Navigation components 432 | if tag_name == 'nav' or role == 'navigation' or 'nav' in class_str or 'navigation' in class_str or 'menu' in class_str or element_id in ['nav', 'navigation', 'menu']: 433 | return 'navigation' 434 | 435 | # Header components 436 | if tag_name == 'header' or role == 'banner' or 'header' in class_str or 'banner' in class_str or element_id in ['header', 'banner']: 437 | return 'header' 438 | 439 | # Footer components 440 | if tag_name == 'footer' or role == 'contentinfo' or 'footer' in class_str or element_id == 'footer': 441 | return 'footer' 442 | 443 | # Hero/banner components 444 | if 'hero' in class_str or 'banner' in class_str or 'jumbotron' in class_str or 'showcase' in class_str or element_id in ['hero', 'banner', 'jumbotron', 'showcase']: 445 | return 'hero' 446 | 447 | # Card components 448 | if 'card' in class_str or 'tile' in class_str or 'item' in class_str or element_id in ['card', 'tile']: 449 | return 'card' 450 | 451 | # Form components 452 | if tag_name == 'form' or role == 'form' or 'form' in class_str or element_id == 'form': 453 | return 'form' 454 | 455 | # CTA (Call to Action) components 456 | if 'cta' in class_str or 'call-to-action' in class_str or 'action' in class_str or element_id in ['cta', 'call-to-action']: 457 | return 'cta' 458 | 459 | # Sidebar components 460 | if 'sidebar' in class_str or 'side-bar' in class_str or element_id in ['sidebar', 'side-bar']: 461 | return 'sidebar' 462 | 463 | # Modal/Dialog components 464 | if role == 'dialog' or 'modal' in class_str or 'dialog' in class_str or 'popup' in class_str or element_id in ['modal', 'dialog', 'popup']: 465 | return 'modal' 466 | 467 | # Section components 468 | if tag_name == 'section' or role == 'region' or 'section' in class_str: 469 | return 'section' 470 | 471 | # Mobile components 472 | if 'mobile' in class_str or 'smartphone' in class_str or 'mobile-only' in class_str: 473 | return 'mobile' 474 | 475 | # Store/Product components 476 | if 'product' in class_str or 'store' in class_str or 'shop' in class_str or 'pricing' in class_str: 477 | return 'store' 478 | 479 | # Cart components 480 | if 'cart' in class_str or 'basket' in class_str or 'shopping-cart' in class_str or element_id in ['cart', 'basket', 'shopping-cart']: 481 | return 'cart' 482 | 483 | # If no specific type is identified, check if the element is a major container 484 | if tag_name in ['div', 'section', 'article'] and ('container' in class_str or 'wrapper' in class_str or 'content' in class_str): 485 | return 'container' 486 | 487 | # Default to unknown if no specific type is identified 488 | return 'other' 489 | 490 | def extract_component_structure(soup): 491 | """Extract UI components from the HTML structure""" 492 | if not soup: 493 | return {} 494 | 495 | components = { 496 | 'navigation': [], 497 | 'header': [], 498 | 'footer': [], 499 | 'hero': [], 500 | 'card': [], 501 | 'form': [], 502 | 
'cta': [], 503 | 'sidebar': [], 504 | 'modal': [], 505 | 'section': [], 506 | 'store': [], 507 | 'mobile': [], 508 | 'cart': [] 509 | } 510 | 511 | # Helper function to convert element to HTML string 512 | def element_to_html(element): 513 | return str(element) 514 | 515 | # Extract navigation components 516 | nav_elements = soup.find_all(['nav']) + soup.find_all(role='navigation') + soup.find_all(class_=lambda c: c and ('nav' in c.lower() or 'menu' in c.lower())) 517 | for element in nav_elements[:5]: # Limit to 5 to avoid excessive extraction 518 | components['navigation'].append({ 519 | 'html': element_to_html(element) 520 | }) 521 | 522 | # Extract header components 523 | header_elements = soup.find_all(['header']) + soup.find_all(role='banner') + soup.find_all(class_=lambda c: c and 'header' in c.lower()) 524 | for element in header_elements[:2]: # Usually only 1-2 headers per page 525 | components['header'].append({ 526 | 'html': element_to_html(element) 527 | }) 528 | 529 | # Extract footer components 530 | footer_elements = soup.find_all(['footer']) + soup.find_all(role='contentinfo') + soup.find_all(class_=lambda c: c and 'footer' in c.lower()) 531 | for element in footer_elements[:2]: # Usually only 1-2 footers per page 532 | components['footer'].append({ 533 | 'html': element_to_html(element) 534 | }) 535 | 536 | # Extract hero/banner components 537 | hero_elements = soup.find_all(class_=lambda c: c and ('hero' in c.lower() or 'banner' in c.lower() or 'jumbotron' in c.lower())) 538 | for element in hero_elements[:3]: # Limit to 3 539 | components['hero'].append({ 540 | 'html': element_to_html(element) 541 | }) 542 | 543 | # Extract card components - often these are repeated elements 544 | card_elements = soup.find_all(class_=lambda c: c and ('card' in c.lower() or 'tile' in c.lower())) 545 | 546 | # If we find many cards, just keep one of each unique structure 547 | unique_cards = {} 548 | for element in card_elements[:15]: # Examine up to 15 cards 549 | # Use a simplified structure hash to identify similar cards 550 | structure_hash = str(len(element.find_all())) # Number of child elements 551 | if structure_hash not in unique_cards: 552 | unique_cards[structure_hash] = element 553 | 554 | # Add unique cards to components 555 | for idx, element in enumerate(unique_cards.values()): 556 | if idx >= 5: # Limit to 5 unique cards 557 | break 558 | components['card'].append({ 559 | 'html': element_to_html(element) 560 | }) 561 | 562 | # Extract form components 563 | form_elements = soup.find_all(['form']) + soup.find_all(class_=lambda c: c and 'form' in c.lower()) 564 | for element in form_elements[:3]: # Limit to 3 565 | components['form'].append({ 566 | 'html': element_to_html(element) 567 | }) 568 | 569 | # Extract CTA components 570 | cta_elements = soup.find_all(class_=lambda c: c and ('cta' in c.lower() or 'call-to-action' in c.lower())) 571 | for element in cta_elements[:3]: # Limit to 3 572 | components['cta'].append({ 573 | 'html': element_to_html(element) 574 | }) 575 | 576 | # Extract sidebar components 577 | sidebar_elements = soup.find_all(class_=lambda c: c and ('sidebar' in c.lower() or 'side-bar' in c.lower())) 578 | for element in sidebar_elements[:2]: # Limit to 2 579 | components['sidebar'].append({ 580 | 'html': element_to_html(element) 581 | }) 582 | 583 | # Extract modal/dialog components 584 | modal_elements = soup.find_all(role='dialog') + soup.find_all(class_=lambda c: c and ('modal' in c.lower() or 'dialog' in c.lower() or 'popup' in c.lower())) 585 | for 
element in modal_elements[:3]: # Limit to 3 586 | components['modal'].append({ 587 | 'html': element_to_html(element) 588 | }) 589 | 590 | # Extract section components 591 | section_elements = soup.find_all(['section']) + soup.find_all(role='region') 592 | # Filter to get only substantial sections 593 | substantial_sections = [element for element in section_elements if len(element.find_all()) > 3] # Must have at least 3 child elements 594 | for element in substantial_sections[:5]: # Limit to 5 595 | components['section'].append({ 596 | 'html': element_to_html(element) 597 | }) 598 | 599 | # Extract mobile-specific components 600 | mobile_elements = soup.find_all(class_=lambda c: c and ('mobile' in c.lower() or 'smartphone' in c.lower() or 'mobile-only' in c.lower())) 601 | for element in mobile_elements[:3]: # Limit to 3 602 | components['mobile'].append({ 603 | 'html': element_to_html(element) 604 | }) 605 | 606 | # Extract store/product components 607 | store_elements = soup.find_all(class_=lambda c: c and ('product' in c.lower() or 'store' in c.lower() or 'shop' in c.lower() or 'pricing' in c.lower())) 608 | for element in store_elements[:5]: # Limit to 5 609 | components['store'].append({ 610 | 'html': element_to_html(element) 611 | }) 612 | 613 | # Extract cart components 614 | cart_elements = soup.find_all(class_=lambda c: c and ('cart' in c.lower() or 'basket' in c.lower() or 'shopping-cart' in c.lower())) 615 | for element in cart_elements[:2]: # Limit to 2 616 | components['cart'].append({ 617 | 'html': element_to_html(element) 618 | }) 619 | 620 | # Remove empty component types 621 | return {k: v for k, v in components.items() if v} 622 | 623 | def extract_inline_styles(soup): 624 | """Extract all inline styles from the HTML""" 625 | inline_styles = {} 626 | elements_with_style = soup.select('[style]') 627 | 628 | for i, element in enumerate(elements_with_style): 629 | style_content = element.get('style') 630 | if style_content: 631 | class_name = f'extracted-inline-style-{i}' 632 | inline_styles[class_name] = style_content 633 | # Add the class to the element 634 | element['class'] = element.get('class', []) + [class_name] 635 | # Remove the inline style 636 | del element['style'] 637 | 638 | return inline_styles 639 | 640 | def extract_inline_javascript(soup): 641 | """Extract inline JavaScript from HTML content""" 642 | inline_js = [] 643 | # Find all script tags without src attribute (inline scripts) 644 | for script in soup.find_all('script'): 645 | if not script.get('src') and script.string: 646 | inline_js.append(script.string.strip()) 647 | 648 | if inline_js: 649 | return '\n\n/* --- INLINE SCRIPTS --- */\n\n'.join(inline_js) 650 | return "" 651 | 652 | def extract_assets(html_content, base_url, session_obj=None, headers=None): 653 | """Extract all assets from HTML content""" 654 | assets = { 655 | 'css': [], 656 | 'js': [], 657 | 'img': [], 658 | 'fonts': [], 659 | 'videos': [], 660 | 'audio': [], 661 | 'favicons': [], 662 | 'font_families': set(), 663 | 'metadata': {}, 664 | 'components': {} 665 | } 666 | 667 | if not html_content: 668 | print("Warning: Empty HTML content provided to extract_assets") 669 | return assets 670 | 671 | try: 672 | # Create BeautifulSoup object 673 | soup = BeautifulSoup(html_content, 'html.parser') 674 | 675 | if not soup or not soup.html: 676 | print("Warning: Could not parse HTML content properly") 677 | # Try with a more lenient parser 678 | soup = BeautifulSoup(html_content, 'html5lib') 679 | if not soup or not soup.html: 680 | 
print("Error: Failed to parse HTML with both parsers") 681 | return assets 682 | 683 | # Extract metadata 684 | try: 685 | assets['metadata'] = extract_metadata(soup, base_url) 686 | except Exception as e: 687 | print(f"Error extracting metadata: {str(e)}") 688 | traceback.print_exc() 689 | 690 | # Extract all CSS files 691 | try: 692 | css_links = soup.find_all('link', {'rel': 'stylesheet'}) or [] 693 | # Also look for preload links with as="style" 694 | preload_css = soup.find_all('link', {'rel': 'preload', 'as': 'style'}) or [] 695 | 696 | for link in css_links + preload_css: 697 | href = link.get('href') 698 | if href: 699 | if not href.startswith(('http://', 'https://', 'data:')): 700 | href = urljoin(base_url, href) 701 | if href.startswith(('http://', 'https://')): 702 | assets['css'].append(href) 703 | except Exception as e: 704 | print(f"Error extracting CSS links: {str(e)}") 705 | 706 | # Look for Next.js specific CSS files 707 | try: 708 | next_css = soup.find_all('link', {'data-n-g': True}) or [] 709 | next_css += soup.find_all('link', {'data-n-p': True}) or [] 710 | for link in next_css: 711 | href = link.get('href') 712 | if href: 713 | if not href.startswith(('http://', 'https://', 'data:')): 714 | href = urljoin(base_url, href) 715 | if href.startswith(('http://', 'https://')): 716 | assets['css'].append(href) 717 | except Exception as e: 718 | print(f"Error extracting Next.js CSS: {str(e)}") 719 | 720 | # Extract all inline styles and check for CSS imports or fonts 721 | try: 722 | style_tags = soup.find_all('style') or [] 723 | for style in style_tags: 724 | style_content = style.string 725 | if style_content: 726 | # Extract @import statements 727 | import_urls = re.findall(r'@import\s+[\'"]([^\'"]+)[\'"]', style_content) or [] 728 | import_urls += re.findall(r'@import\s+url\([\'"]?([^\'"|\)]+)[\'"]?\)', style_content) or [] 729 | 730 | for import_url in import_urls: 731 | if not import_url.startswith(('http://', 'https://', 'data:')): 732 | import_url = urljoin(base_url, import_url) 733 | if import_url.startswith(('http://', 'https://')): 734 | assets['css'].append(import_url) 735 | 736 | # Extract font families 737 | font_families = re.findall(r'font-family:\s*[\'"]?([^\'";]+)[\'"]?', style_content) or [] 738 | for family in font_families: 739 | family = family.strip().split(',')[0].strip('\'"`') 740 | if family and family.lower() not in ['serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'system-ui']: 741 | assets['font_families'].add(family) 742 | except Exception as e: 743 | print(f"Error extracting inline styles: {str(e)}") 744 | 745 | # Extract all JavaScript files 746 | try: 747 | script_tags = soup.find_all('script', {'src': True}) or [] 748 | for script in script_tags: 749 | src = script.get('src') 750 | if src: 751 | if not src.startswith(('http://', 'https://', 'data:')): 752 | src = urljoin(base_url, src) 753 | if src.startswith(('http://', 'https://')): 754 | assets['js'].append(src) 755 | 756 | # Look for module scripts (common in modern frameworks) 757 | module_scripts = soup.find_all('script', {'type': 'module'}) or [] 758 | for script in module_scripts: 759 | src = script.get('src') 760 | if src: 761 | if not src.startswith(('http://', 'https://', 'data:')): 762 | src = urljoin(base_url, src) 763 | if src.startswith(('http://', 'https://')): 764 | assets['js'].append(src) 765 | except Exception as e: 766 | print(f"Error extracting JavaScript: {str(e)}") 767 | 768 | # Extract all images 769 | try: 770 | # Regular img tags 771 | img_tags = 
soup.find_all('img') or [] 772 | for img in img_tags: 773 | # Check src attribute 774 | src = img.get('src') 775 | if src: 776 | if not src.startswith(('http://', 'https://', 'data:')): 777 | src = urljoin(base_url, src) 778 | if src.startswith(('http://', 'https://')): 779 | assets['img'].append(src) 780 | 781 | # Check srcset attribute 782 | srcset = img.get('srcset') 783 | if srcset: 784 | for src_str in srcset.split(','): 785 | src_parts = src_str.strip().split(' ') 786 | if src_parts: 787 | src = src_parts[0] 788 | if not src.startswith(('http://', 'https://', 'data:')): 789 | src = urljoin(base_url, src) 790 | if src.startswith(('http://', 'https://')): 791 | assets['img'].append(src) 792 | 793 | # Check data-src (lazy loading) 794 | data_src = img.get('data-src') 795 | if data_src: 796 | if not data_src.startswith(('http://', 'https://', 'data:')): 797 | data_src = urljoin(base_url, data_src) 798 | if data_src.startswith(('http://', 'https://')): 799 | assets['img'].append(data_src) 800 | 801 | # Background images in style attributes 802 | elements_with_style = soup.select('[style]') or [] 803 | for element in elements_with_style: 804 | style = element.get('style', '') 805 | if 'background' in style or 'background-image' in style: 806 | # Try to extract URLs 807 | bg_urls = re.findall(r'url\([\'"]?([^\'"|\)]+)[\'"]?\)', style) 808 | for bg_url in bg_urls: 809 | if not bg_url.startswith(('http://', 'https://', 'data:')): 810 | bg_url = urljoin(base_url, bg_url) 811 | if bg_url.startswith(('http://', 'https://')): 812 | assets['img'].append(bg_url) 813 | except Exception as e: 814 | print(f"Error extracting images: {str(e)}") 815 | 816 | # Extract favicon 817 | try: 818 | favicon_links = soup.find_all('link', {'rel': lambda r: r and (r.lower() == 'icon' or 'icon' in r.lower().split())}) or [] 819 | for link in favicon_links: 820 | href = link.get('href') 821 | if href: 822 | if not href.startswith(('http://', 'https://', 'data:')): 823 | href = urljoin(base_url, href) 824 | if href.startswith(('http://', 'https://')): 825 | assets['favicons'].append(href) 826 | except Exception as e: 827 | print(f"Error extracting favicons: {str(e)}") 828 | 829 | # Extract all video sources 830 | try: 831 | video_tags = soup.find_all('video') or [] 832 | for video in video_tags: 833 | # Check src attribute 834 | src = video.get('src') 835 | if src: 836 | if not src.startswith(('http://', 'https://', 'data:')): 837 | src = urljoin(base_url, src) 838 | if src.startswith(('http://', 'https://')): 839 | assets['videos'].append(src) 840 | 841 | # Check source tags inside video 842 | source_tags = video.find_all('source') or [] 843 | for source in source_tags: 844 | src = source.get('src') 845 | if src: 846 | if not src.startswith(('http://', 'https://', 'data:')): 847 | src = urljoin(base_url, src) 848 | if src.startswith(('http://', 'https://')): 849 | assets['videos'].append(src) 850 | except Exception as e: 851 | print(f"Error extracting videos: {str(e)}") 852 | 853 | # Extract all audio sources 854 | try: 855 | audio_tags = soup.find_all('audio') or [] 856 | for audio in audio_tags: 857 | # Check src attribute 858 | src = audio.get('src') 859 | if src: 860 | if not src.startswith(('http://', 'https://', 'data:')): 861 | src = urljoin(base_url, src) 862 | if src.startswith(('http://', 'https://')): 863 | assets['audio'].append(src) 864 | 865 | # Check source tags inside audio 866 | source_tags = audio.find_all('source') or [] 867 | for source in source_tags: 868 | src = source.get('src') 869 | if src: 
870 | if not src.startswith(('http://', 'https://', 'data:')): 871 | src = urljoin(base_url, src) 872 | if src.startswith(('http://', 'https://')): 873 | assets['audio'].append(src) 874 | except Exception as e: 875 | print(f"Error extracting audio: {str(e)}") 876 | 877 | # Extract all iframes 878 | try: 879 | iframe_tags = soup.find_all('iframe') or [] 880 | for iframe in iframe_tags: 881 | src = iframe.get('src') 882 | if src and not src.startswith('data:'): 883 | if not src.startswith(('http://', 'https://')): 884 | src = urljoin(base_url, src) 885 | if src.startswith(('http://', 'https://')): 886 | if 'youtube' in src or 'vimeo' in src: 887 | assets['videos'].append(src) 888 | else: 889 | assets['js'].append(src) # Treat as JS resource 890 | except Exception as e: 891 | print(f"Error extracting iframes: {str(e)}") 892 | 893 | # Extract Next.js specific resources 894 | try: 895 | # Look for Next.js data scripts 896 | next_data = soup.find('script', {'id': '__NEXT_DATA__'}) 897 | if next_data and next_data.string: 898 | try: 899 | next_json = json.loads(next_data.string) 900 | # Extract buildId 901 | if 'buildId' in next_json: 902 | build_id = next_json['buildId'] 903 | # Add common Next.js resources with this buildId 904 | for path in ['main', 'webpack', 'framework', 'pages/_app', 'pages/_error', 'pages/index']: 905 | chunk_url = f"{base_url}/_next/static/{build_id}/pages/{path}.js" 906 | assets['js'].append(chunk_url) 907 | 908 | # Extract page data 909 | if 'page' in next_json and 'props' in next_json.get('props', {}): 910 | # This often has valuable data we might want to preserve 911 | assets['metadata']['next_data'] = next_json 912 | except Exception as next_error: 913 | print(f"Error parsing Next.js data: {str(next_error)}") 914 | 915 | # Look for Webpack chunks in comments 916 | chunks_regex = r'/\*\s*webpackJsonp\s*\*/(.*?)/\*\s*end\s*webpackJsonp\s*\*/' 917 | chunks_matches = re.findall(chunks_regex, html_content, re.DOTALL) 918 | if chunks_matches: 919 | print("Found webpack chunks in comments") 920 | # These are JavaScript assets that might be dynamically loaded 921 | except Exception as e: 922 | print(f"Error extracting Next.js resources: {str(e)}") 923 | 924 | # Try to download CSS files and extract additional assets 925 | if session_obj and headers: 926 | try: 927 | css_urls = assets['css'].copy() # Copy to avoid modifying during iteration 928 | for css_url in css_urls: 929 | try: 930 | # Skip data URLs 931 | if css_url.startswith('data:'): 932 | continue 933 | 934 | # Download CSS file 935 | response = session_obj.get( 936 | css_url, 937 | timeout=10, 938 | headers=headers, 939 | verify=False # Ignore SSL certificate errors 940 | ) 941 | 942 | if response.status_code == 200: 943 | css_content = response.text 944 | 945 | # Extract URLs from url() function 946 | url_matches = re.findall(r'url\([\'"]?([^\'"|\)]+)[\'"]?\)', css_content) or [] 947 | for url in url_matches: 948 | if not url or url.startswith('data:'): 949 | continue 950 | 951 | if not url.startswith(('http://', 'https://')): 952 | # Resolve relative to the CSS file 953 | url = urljoin(css_url, url) 954 | 955 | # Determine asset type 956 | asset_type = get_asset_type(url) 957 | if asset_type in assets: 958 | assets[asset_type].append(url) 959 | 960 | # Extract font families 961 | font_families = re.findall(r'font-family:\s*[\'"]?([^\'";]+)[\'"]?', css_content) or [] 962 | for family in font_families: 963 | family = family.strip().split(',')[0].strip('\'"`') 964 | if family and family.lower() not in ['serif', 
'sans-serif', 'monospace', 'cursive', 'fantasy', 'system-ui']: 965 | assets['font_families'].add(family) 966 | 967 | # Extract Google Fonts specifically 968 | google_fonts_imports = re.findall(r'@import\s+url\([\'"]?(https?://fonts\.googleapis\.com/[^\'"|\)]+)[\'"]?\)', css_content) or [] 969 | for font_url in google_fonts_imports: 970 | if font_url not in assets['css']: 971 | assets['css'].append(font_url) 972 | 973 | # Check for Tailwind 974 | if 'tailwind' in css_content.lower() or '.tw-' in css_content: 975 | print("Detected Tailwind CSS in stylesheets") 976 | except Exception as css_error: 977 | print(f"Error processing CSS {css_url}: {str(css_error)}") 978 | except Exception as e: 979 | print(f"Error processing CSS files: {str(e)}") 980 | 981 | # Extract UI components 982 | try: 983 | components = extract_component_structure(soup) 984 | if components: 985 | assets['components'] = components 986 | except Exception as e: 987 | print(f"Error extracting components: {str(e)}") 988 | traceback.print_exc() 989 | 990 | # Remove duplicates while preserving order 991 | for asset_type in assets: 992 | if isinstance(assets[asset_type], list): 993 | # Use dict.fromkeys to remove duplicates while preserving order 994 | assets[asset_type] = list(dict.fromkeys(assets[asset_type])) 995 | 996 | return assets 997 | 998 | except Exception as e: 999 | print(f"Error in extract_assets: {str(e)}") 1000 | traceback.print_exc() 1001 | return assets 1002 | 1003 | def create_zip_file(html_content, assets, url, session_obj, headers, screenshots=None): 1004 | """Create a zip file containing the extracted website data""" 1005 | # Create a temp file for the zip 1006 | temp_zip = tempfile.NamedTemporaryFile(delete=False, suffix='.zip') 1007 | temp_zip.close() 1008 | 1009 | # Extract domain for the folder name 1010 | parsed_url = urlparse(url) 1011 | domain = parsed_url.netloc 1012 | 1013 | # Current timestamp 1014 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 1015 | 1016 | # Create the zip file 1017 | with zipfile.ZipFile(temp_zip.name, 'w', zipfile.ZIP_DEFLATED) as zipf: 1018 | # Write the main HTML 1019 | zipf.writestr('index.html', html_content) 1020 | 1021 | # Create directories for each asset type 1022 | for asset_type in assets.keys(): 1023 | if asset_type in ['font_families', 'metadata', 'components']: 1024 | continue # Skip non-URL assets 1025 | 1026 | # Make sure the assets[asset_type] exists and is a list before iterating 1027 | if not assets[asset_type] or not isinstance(assets[asset_type], list): 1028 | print(f" Skipping {asset_type} - no assets found or invalid format") 1029 | continue 1030 | 1031 | # Create the directory 1032 | zipf.writestr(f'{asset_type}/.gitkeep', '') 1033 | 1034 | # Download each asset 1035 | processed_urls = set() # Track processed URLs to avoid duplicates 1036 | 1037 | for url in assets[asset_type]: 1038 | # Skip if the URL is None, empty, or a data URL 1039 | if not url or url.startswith('data:'): 1040 | continue 1041 | 1042 | # Skip if we've already processed this URL 1043 | if url in processed_urls: 1044 | continue 1045 | 1046 | processed_urls.add(url) 1047 | 1048 | try: 1049 | # Fix URL if it's relative 1050 | if url.startswith('//'): 1051 | url = 'https:' + url 1052 | elif url.startswith('/'): 1053 | parsed_base = urlparse(parsed_url.scheme + '://' + parsed_url.netloc) 1054 | url = urljoin(parsed_base.geturl(), url) 1055 | 1056 | # Extract filename from URL 1057 | path = urlparse(url).path 1058 | # Handle query parameters in the URL 1059 | query = 
urlparse(url).query 1060 | filename = os.path.basename(unquote(path)) 1061 | 1062 | # Clean filename 1063 | if not filename: 1064 | filename = f"{timestamp}_{uuid.uuid4().hex[:8]}.{asset_type}" 1065 | elif '.' not in filename: 1066 | filename = f"{filename}.{asset_type}" 1067 | 1068 | # Add query parameters to filename to make it unique 1069 | if query: 1070 | clean_query = re.sub(r'[^a-zA-Z0-9]', '_', query)[:30] # Limit length 1071 | name, ext = os.path.splitext(filename) 1072 | filename = f"{name}_{clean_query}{ext}" 1073 | 1074 | # Avoid duplicate filenames with UUID 1075 | file_path = f"{asset_type}/{filename}" 1076 | 1077 | try: 1078 | # Download the file 1079 | response = session_obj.get( 1080 | url, 1081 | timeout=10, 1082 | headers=headers, 1083 | verify=False # Ignore SSL certificate errors 1084 | ) 1085 | 1086 | if response.status_code == 200: 1087 | zipf.writestr(file_path, response.content) 1088 | print(f" Added {file_path}") 1089 | else: 1090 | print(f" Failed to download {url}, status: {response.status_code}") 1091 | except Exception as e: 1092 | print(f" Error downloading {url}: {str(e)}") 1093 | except Exception as e: 1094 | print(f" Error processing URL {url}: {str(e)}") 1095 | 1096 | # Handle font families 1097 | if 'font_families' in assets and assets['font_families']: 1098 | zipf.writestr('css/fonts.css', '\n'.join([ 1099 | f"/* Font Family: {family} */\n" 1100 | f"@import url('https://fonts.googleapis.com/css2?family={family.replace(' ', '+')}&display=swap');\n" 1101 | for family in assets['font_families'] 1102 | ])) 1103 | 1104 | # Handle metadata if present 1105 | if 'metadata' in assets and assets['metadata']: 1106 | metadata_content = json.dumps(assets['metadata'], indent=2) 1107 | zipf.writestr('metadata.json', metadata_content) 1108 | 1109 | # Handle UI components if present 1110 | if 'components' in assets and assets['components'] and isinstance(assets['components'], dict): 1111 | # Create components directory 1112 | zipf.writestr('components/.gitkeep', '') 1113 | 1114 | # Create index for components 1115 | component_html = """ 1116 | 1117 | 1118 | 1119 | 1120 | 1121 | Extracted UI Components 1122 | 1131 | 1132 | 1133 |
<h1>Extracted UI Components</h1> 1134 | <p>The following components were extracted from the website.</p> 1135 | """ 1136 | 1137 | # Add each component 1138 | for component_type, components in assets['components'].items(): 1139 | if components: 1140 | component_html += f'<h2>{component_type.replace("_", " ").title()} Components</h2>' 1141 | 1142 | for i, component in enumerate(components): 1143 | html_code = component.get('html', '') 1144 | if html_code: 1145 | component_html += f""" 1146 | <div> 1147 | <h3> 1148 | {component_type.replace("_", " ").title()} {i+1} 1149 | </h3> 1150 | <div> 1151 | {html_code} 1152 | </div> 1153 | <div> 1154 | <pre><code>{html.escape(html_code)}</code></pre> 1155 | </div> 1156 | </div>
1157 | """ 1158 | 1159 | component_html += """ 1160 | 1161 | 1162 | """ 1163 | 1164 | zipf.writestr('components/index.html', component_html) 1165 | 1166 | # Save individual components 1167 | for component_type, components in assets['components'].items(): 1168 | if components: 1169 | zipf.writestr(f'components/{component_type}/.gitkeep', '') 1170 | 1171 | for i, component in enumerate(components): 1172 | html_code = component.get('html', '') 1173 | if html_code: 1174 | zipf.writestr(f'components/{component_type}/component_{i+1}.html', html_code) 1175 | 1176 | # Create a README file 1177 | readme_content = f"""# Website Clone: {domain} 1178 | 1179 | Extracted on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} 1180 | Source URL: {url} 1181 | 1182 | ## Contents 1183 | 1184 | - `index.html`: Main HTML file 1185 | - `css/`: Stylesheets 1186 | - `js/`: JavaScript files 1187 | - `img/`: Images 1188 | - `fonts/`: Font files 1189 | - `components/`: Extracted UI components 1190 | - `metadata.json`: Website metadata (title, description, etc.) 1191 | 1192 | ## How to Use 1193 | 1194 | 1. Unzip this file 1195 | 2. Open `index.html` in your browser 1196 | 3. For best results, serve the files with a local server: 1197 | ``` 1198 | python -m http.server 1199 | ``` 1200 | Then open http://localhost:8000 in your browser 1201 | 1202 | ## Component Viewer 1203 | 1204 | If components were extracted, you can view them by opening `components/index.html` 1205 | 1206 | ## Notes 1207 | 1208 | - Some assets might not load correctly due to cross-origin restrictions 1209 | - External resources and APIs may not work without proper configuration 1210 | - JavaScript functionality might be limited without a proper backend 1211 | 1212 | ## Handling Modern Frameworks 1213 | 1214 | This extraction has been optimized to handle the following frameworks: 1215 | - React and Next.js: Script chunks and module loading 1216 | - Angular: Component structure and scripts 1217 | - Tailwind CSS: Utility classes and structure 1218 | 1219 | Generated by Website Extractor 1220 | """ 1221 | zipf.writestr('README.md', readme_content) 1222 | 1223 | return temp_zip.name 1224 | 1225 | def extract_with_selenium(url, timeout=30): 1226 | """ 1227 | Extract rendered HTML content using Selenium with Chrome/Chromium. 1228 | This method will execute JavaScript and capture the fully rendered page structure. 1229 | 1230 | Args: 1231 | url: URL to fetch 1232 | timeout: Maximum time to wait for page to load (seconds) 1233 | 1234 | Returns: 1235 | tuple: (html_content, discovered_urls, None) 1236 | """ 1237 | if not SELENIUM_AVAILABLE: 1238 | return None, None, {"error": "Selenium is not installed. 
Run: pip install selenium webdriver-manager"} 1239 | 1240 | try: 1241 | print("Setting up advanced Chrome options...") 1242 | # Set up Chrome options with anti-detection measures 1243 | chrome_options = Options() 1244 | chrome_options.add_argument("--headless") # Run headless 1245 | chrome_options.add_argument("--disable-gpu") # Disable GPU hardware acceleration 1246 | chrome_options.add_argument("--no-sandbox") # Required for running as root 1247 | chrome_options.add_argument("--disable-dev-shm-usage") # Overcome limited resource problems 1248 | chrome_options.add_argument("--window-size=1920,1080") # Set window size 1249 | chrome_options.add_argument("--disable-notifications") # Disable notifications 1250 | chrome_options.add_argument("--disable-extensions") # Disable extensions 1251 | chrome_options.add_argument("--disable-infobars") # Disable infobars 1252 | 1253 | # Avoid detection as a bot 1254 | chrome_options.add_argument("--disable-blink-features=AutomationControlled") 1255 | chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) 1256 | chrome_options.add_experimental_option("useAutomationExtension", False) 1257 | 1258 | # Add modern user agent to avoid detection 1259 | chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36") 1260 | 1261 | # Initialize the Chrome driver 1262 | print(f"Initializing Chrome WebDriver...") 1263 | try: 1264 | service = Service(ChromeDriverManager().install()) 1265 | driver = webdriver.Chrome(service=service, options=chrome_options) 1266 | except Exception as driver_error: 1267 | print(f"Error initializing Chrome WebDriver: {str(driver_error)}") 1268 | print("Trying alternative initialization method...") 1269 | try: 1270 | # Try alternative initialization without Service object 1271 | driver = webdriver.Chrome(options=chrome_options) 1272 | except Exception as alt_error: 1273 | print(f"Alternative initialization also failed: {str(alt_error)}") 1274 | return None, None, {"error": f"Failed to initialize Chrome WebDriver: {str(alt_error)}"} 1275 | 1276 | # Set page load timeout 1277 | driver.set_page_load_timeout(timeout) 1278 | 1279 | # Used to store discovered URLs 1280 | discovered_urls = [] 1281 | 1282 | try: 1283 | print(f"Navigating to {url}...") 1284 | driver.get(url) 1285 | 1286 | # Wait for page to be fully loaded 1287 | try: 1288 | WebDriverWait(driver, timeout).until( 1289 | EC.presence_of_element_located((By.TAG_NAME, "body")) 1290 | ) 1291 | except Exception as e: 1292 | print(f"Warning: Timeout waiting for body element: {str(e)}") 1293 | 1294 | # Execute JavaScript to disable animation 1295 | try: 1296 | driver.execute_script(""" 1297 | var style = document.createElement('style'); 1298 | style.type = 'text/css'; 1299 | style.innerHTML = '* { animation-duration: 0.001s !important; transition-duration: 0.001s !important; }'; 1300 | document.getElementsByTagName('head')[0].appendChild(style); 1301 | """) 1302 | print("Animations disabled to improve extraction") 1303 | except Exception as e: 1304 | print(f"Warning: Could not disable animations: {str(e)}") 1305 | 1306 | # Wait for page to be fully rendered 1307 | print("Waiting for dynamic content to load...") 1308 | try: 1309 | # Wait a bit for any dynamic content to load 1310 | time.sleep(5) 1311 | 1312 | # Wait for network to be idle 1313 | driver.execute_script("return window.performance.getEntriesByType('resource').length") 1314 | time.sleep(2) # Wait a bit more after 
resources are loaded 1315 | except Exception as e: 1316 | print(f"Warning while waiting for dynamic content: {str(e)}") 1317 | 1318 | # Implement advanced scrolling to trigger lazy loading 1319 | print("Performing advanced scrolling to trigger lazy loading...") 1320 | try: 1321 | # Get the total height of the page 1322 | total_height = driver.execute_script("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight);") 1323 | 1324 | # Scroll down the page in steps 1325 | viewport_height = driver.execute_script("return window.innerHeight") 1326 | scroll_steps = max(1, min(20, total_height // viewport_height)) # Cap at 20 steps 1327 | 1328 | for i in range(scroll_steps + 1): 1329 | scroll_position = (i * total_height) // scroll_steps 1330 | driver.execute_script(f"window.scrollTo(0, {scroll_position});") 1331 | 1332 | # Small pause to allow content to load 1333 | time.sleep(0.3) 1334 | 1335 | # Extract resources after each scroll 1336 | try: 1337 | urls = driver.execute_script(""" 1338 | var resources = []; 1339 | // Get all link hrefs 1340 | document.querySelectorAll('link[rel="stylesheet"], link[as="style"]').forEach(function(el) { 1341 | if (el.href) resources.push(el.href); 1342 | }); 1343 | // Get all script srcs 1344 | document.querySelectorAll('script[src]').forEach(function(el) { 1345 | if (el.src) resources.push(el.src); 1346 | }); 1347 | // Get all image srcs 1348 | document.querySelectorAll('img[src]').forEach(function(el) { 1349 | if (el.src && !el.src.startsWith('data:')) resources.push(el.src); 1350 | }); 1351 | return resources; 1352 | """) 1353 | discovered_urls.extend(urls) 1354 | except Exception as res_error: 1355 | print(f"Error extracting resources during scroll: {str(res_error)}") 1356 | 1357 | # Scroll back to top 1358 | driver.execute_script("window.scrollTo(0, 0);") 1359 | 1360 | # Wait for everything to settle after scrolling 1361 | time.sleep(1) 1362 | except Exception as scroll_error: 1363 | print(f"Error during page scrolling: {str(scroll_error)}") 1364 | 1365 | # Try to click on common elements that might reveal more content 1366 | try: 1367 | # Common UI elements that might reveal more content when clicked 1368 | for selector in [ 1369 | 'button.load-more', '.show-more', '.expand', '.accordion-toggle', 1370 | '[aria-expanded="false"]', '.menu-toggle', '.navbar-toggler', 1371 | '.mobile-menu-button', '.hamburger', '[data-toggle="collapse"]' 1372 | ]: 1373 | try: 1374 | elements = driver.find_elements(By.CSS_SELECTOR, selector) 1375 | for element in elements[:3]: # Limit to first 3 matches of each type 1376 | if element.is_displayed(): 1377 | driver.execute_script("arguments[0].click();", element) 1378 | time.sleep(0.5) # Wait for content to appear 1379 | except Exception as click_error: 1380 | # Skip any errors and continue with next selector 1381 | continue 1382 | print("Attempted to expand hidden content") 1383 | except Exception as interact_error: 1384 | print(f"Error expanding content: {str(interact_error)}") 1385 | 1386 | # Get the final HTML content after all JavaScript executed 1387 | html_content = driver.page_source 1388 | print(f"HTML content captured ({len(html_content)} bytes)") 1389 | 1390 | # Extract URLs for modern frameworks 1391 | try: 1392 | # React/Next.js specific resources 1393 | next_js_urls = driver.execute_script(""" 1394 | var resources = []; 1395 | // Find Next.js specific scripts 
1396 | document.querySelectorAll('script[src*="_next"]').forEach(function(el) { 1397 | resources.push(el.src); 1398 | }); 1399 | // Find chunk files 1400 | document.querySelectorAll('script[src*="chunk"]').forEach(function(el) { 1401 | resources.push(el.src); 1402 | }); 1403 | // Find webpack files 1404 | document.querySelectorAll('script[src*="webpack"]').forEach(function(el) { 1405 | resources.push(el.src); 1406 | }); 1407 | // Find hydration scripts 1408 | document.querySelectorAll('script[src*="hydration"]').forEach(function(el) { 1409 | resources.push(el.src); 1410 | }); 1411 | return resources; 1412 | """) 1413 | discovered_urls.extend(next_js_urls) 1414 | 1415 | # Angular specific resources 1416 | angular_urls = driver.execute_script(""" 1417 | var resources = []; 1418 | // Find Angular specific scripts 1419 | document.querySelectorAll('script[src*="runtime"]').forEach(function(el) { 1420 | resources.push(el.src); 1421 | }); 1422 | document.querySelectorAll('script[src*="polyfills"]').forEach(function(el) { 1423 | resources.push(el.src); 1424 | }); 1425 | document.querySelectorAll('script[src*="main"]').forEach(function(el) { 1426 | resources.push(el.src); 1427 | }); 1428 | return resources; 1429 | """) 1430 | discovered_urls.extend(angular_urls) 1431 | 1432 | # Get CSS variables for Tailwind detection 1433 | tailwind_check = driver.execute_script(""" 1434 | var style = window.getComputedStyle(document.body); 1435 | var hasTailwind = false; 1436 | // Check for common Tailwind classes 1437 | if (document.querySelector('.flex') && 1438 | document.querySelector('.grid') && 1439 | document.querySelector('.text-')) { 1440 | hasTailwind = true; 1441 | } 1442 | return hasTailwind; 1443 | """) 1444 | 1445 | if tailwind_check: 1446 | print("Tailwind CSS detected, including appropriate CSS files") 1447 | except Exception as framework_error: 1448 | print(f"Error detecting framework resources: {str(framework_error)}") 1449 | 1450 | # Remove duplicates from discovered URLs 1451 | discovered_urls = list(set(discovered_urls)) 1452 | print(f"Discovered {len(discovered_urls)} resource URLs") 1453 | 1454 | return html_content, discovered_urls, None 1455 | 1456 | except TimeoutException: 1457 | print(f"Timeout while loading {url}") 1458 | return None, None, {"error": "Timeout while loading page"} 1459 | except WebDriverException as e: 1460 | print(f"Selenium error: {str(e)}") 1461 | return None, None, {"error": f"Selenium error: {str(e)}"} 1462 | finally: 1463 | # Close the browser 1464 | print("Closing WebDriver...") 1465 | driver.quit() 1466 | 1467 | except Exception as e: 1468 | print(f"Error setting up Selenium: {str(e)}") 1469 | return None, None, {"error": f"Error setting up Selenium: {str(e)}"} 1470 | 1471 | def fix_relative_urls(html_content, base_url): 1472 | """Fix relative URLs in the HTML content""" 1473 | soup = BeautifulSoup(html_content, 'html.parser') 1474 | 1475 | # Fix relative URLs for links 1476 | for link in soup.find_all('a', href=True): 1477 | href = link['href'] 1478 | if href.startswith('/'): 1479 | link['href'] = urljoin(base_url, href) 1480 | 1481 | # Fix relative URLs for images 1482 | for img in soup.find_all('img', src=True): 1483 | src = img['src'] 1484 | if not src.startswith(('http://', 'https://', 'data:')): 1485 | img['src'] = urljoin(base_url, src) 1486 | 1487 | # Fix relative URLs for scripts 1488 | for script in soup.find_all('script', src=True): 1489 | src = script['src'] 1490 | if not src.startswith(('http://', 'https://', 'data:')): 1491 | script['src'] = 
urljoin(base_url, src) 1492 | 1493 | # Fix relative URLs for stylesheets 1494 | for link in soup.find_all('link', href=True): 1495 | href = link['href'] 1496 | if not href.startswith(('http://', 'https://', 'data:')): 1497 | link['href'] = urljoin(base_url, href) 1498 | 1499 | return str(soup) 1500 | 1501 | @app.route('/') 1502 | def index(): 1503 | """Render the home page""" 1504 | return render_template('index.html') 1505 | 1506 | @app.route('/clear') 1507 | def clear_session(): 1508 | """Clear the session data""" 1509 | session.clear() 1510 | return jsonify({'message': 'Session cleared'}) 1511 | 1512 | @app.route('/extract', methods=['POST']) 1513 | def extract(): 1514 | url = request.form.get('url') 1515 | use_selenium = request.form.get('use_selenium') == 'true' 1516 | 1517 | if not url: 1518 | return jsonify({'error': 'URL is required'}), 400 1519 | 1520 | try: 1521 | # Add http:// if not present 1522 | if not url.startswith(('http://', 'https://')): 1523 | url = 'https://' + url 1524 | 1525 | print(f"\n{'='*80}\nStarting extraction for: {url}\n{'='*80}") 1526 | 1527 | # Create a session to maintain cookies 1528 | session_obj = requests.Session() 1529 | 1530 | # Disable SSL verification warnings 1531 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 1532 | 1533 | # List of user agents to try if we get blocked 1534 | user_agents = [ 1535 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36', 1536 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15', 1537 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0', 1538 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36', 1539 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36', 1540 | 'Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1', 1541 | 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1' 1542 | ] 1543 | 1544 | # List of referers to try 1545 | referers = [ 1546 | 'https://www.google.com/', 1547 | 'https://www.bing.com/', 1548 | 'https://www.instagram.com/', 1549 | 'https://www.facebook.com/', 1550 | 'https://www.twitter.com/', 1551 | 'https://www.linkedin.com/' 1552 | ] 1553 | 1554 | # Initial headers (will be rotated if needed) 1555 | headers = { 1556 | 'User-Agent': random.choice(user_agents), 1557 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 1558 | 'Accept-Language': 'en-US,en;q=0.9', 1559 | 'Accept-Encoding': 'gzip, deflate, br', 1560 | 'Connection': 'keep-alive', 1561 | 'Upgrade-Insecure-Requests': '1', 1562 | 'Sec-Fetch-Dest': 'document', 1563 | 'Sec-Fetch-Mode': 'navigate', 1564 | 'Sec-Fetch-Site': 'none', 1565 | 'Sec-Fetch-User': '?1', 1566 | 'Cache-Control': 'max-age=0', 1567 | 'Referer': random.choice(referers), 1568 | 'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"', 1569 | 'sec-ch-ua-mobile': '?0', 1570 | 'sec-ch-ua-platform': '"Windows"', 1571 | } 1572 | 1573 | html_content = None 1574 | additional_urls = [] 1575 | 1576 | # Use Selenium for rendering if requested and available 1577 | if use_selenium and SELENIUM_AVAILABLE: 1578 
| print("Using Selenium for advanced rendering...") 1579 | html_content, additional_urls, error_info = extract_with_selenium(url) 1580 | 1581 | if not html_content: 1582 | print("Selenium extraction failed, falling back to regular request") 1583 | use_selenium = False 1584 | # Check if we have an error message 1585 | if error_info and isinstance(error_info, dict) and 'error' in error_info: 1586 | print(f"Selenium error: {error_info['error']}") 1587 | 1588 | # If Selenium wasn't used or failed, use regular requests with retries 1589 | if not use_selenium or not html_content: 1590 | # Maximum number of retries with different configurations 1591 | max_retries = 5 1592 | retry_count = 0 1593 | last_error = None 1594 | 1595 | while retry_count < max_retries and not html_content: 1596 | try: 1597 | print(f"HTTP Request attempt {retry_count+1}/{max_retries} for: {url}") 1598 | print(f"Using User-Agent: {headers['User-Agent'][:30]}...") 1599 | 1600 | # First request to get cookies and possible redirects 1601 | response = session_obj.get( 1602 | url, 1603 | timeout=20, # Increased timeout 1604 | headers=headers, 1605 | allow_redirects=True, 1606 | verify=False # Ignore SSL certificate errors 1607 | ) 1608 | 1609 | # Follow redirects manually if needed 1610 | if response.history: 1611 | print(f"Request was redirected {len(response.history)} times") 1612 | for i, resp in enumerate(response.history): 1613 | print(f" Redirect {i+1}: {resp.url} -> {resp.headers.get('Location')}") 1614 | print(f" Final URL: {response.url}") 1615 | url = response.url # Update URL to the final destination 1616 | 1617 | # Handle different status codes 1618 | if response.status_code == 200: 1619 | print(f"Success! Received 200 OK response ({len(response.content)} bytes)") 1620 | 1621 | # Determine encoding from Content-Type header or content 1622 | content_type = response.headers.get('Content-Type', '') 1623 | print(f"Content-Type: {content_type}") 1624 | 1625 | # Get encoding from headers or meta tag 1626 | encoding = None 1627 | 1628 | # Try to get encoding from Content-Type header 1629 | if 'charset=' in content_type: 1630 | encoding = content_type.split('charset=')[1].split(';')[0].strip() 1631 | print(f"Encoding from headers: {encoding}") 1632 | 1633 | # If no encoding specified, try to detect from content 1634 | if not encoding: 1635 | # Look for tag 1636 | charset_match = re.search(r']+)', response.text, re.IGNORECASE) 1637 | if charset_match: 1638 | encoding = charset_match.group(1) 1639 | print(f"Encoding from meta charset tag: {encoding}") 1640 | else: 1641 | # Look for 1642 | http_equiv_match = re.search(r']+)', response.text, re.IGNORECASE) 1643 | if http_equiv_match: 1644 | encoding = http_equiv_match.group(1) 1645 | print(f"Encoding from meta http-equiv tag: {encoding}") 1646 | 1647 | # If still no encoding, use apparent encoding from requests 1648 | if not encoding and response.apparent_encoding: 1649 | encoding = response.apparent_encoding 1650 | print(f"Detected encoding: {encoding}") 1651 | 1652 | # Default to utf-8 if still no encoding 1653 | if not encoding: 1654 | encoding = 'utf-8' 1655 | print("Using default encoding: utf-8") 1656 | 1657 | # Decode content with detected encoding 1658 | try: 1659 | html_content = response.content.decode(encoding, errors='replace') 1660 | print(f"Successfully decoded HTML content with {encoding} encoding ({len(html_content)} bytes)") 1661 | break # Exit the retry loop on success 1662 | except (UnicodeDecodeError, LookupError) as e: 1663 | print(f"Error decoding with 
{encoding}: {str(e)}, falling back to utf-8") 1664 | html_content = response.content.decode('utf-8', errors='replace') 1665 | break # Exit the retry loop on success with fallback 1666 | 1667 | elif response.status_code == 403: # Forbidden - likely bot protection 1668 | print(f"Received 403 Forbidden response - website is likely blocking scrapers") 1669 | 1670 | # If we have Selenium available as a fallback, try that instead 1671 | if SELENIUM_AVAILABLE and not use_selenium: 1672 | print("Trying Selenium as a fallback for 403 error...") 1673 | html_content, additional_urls, error_info = extract_with_selenium(url) 1674 | if html_content: 1675 | print("Successfully bypassed 403 with Selenium!") 1676 | break 1677 | 1678 | # Otherwise, rotate our headers and try again 1679 | headers['User-Agent'] = random.choice(user_agents) 1680 | headers['Referer'] = random.choice(referers) 1681 | 1682 | # Add some randomization to headers 1683 | if random.random() > 0.5: 1684 | headers['Accept-Language'] = random.choice(['en-US,en;q=0.9', 'en-GB,en;q=0.8,en-US;q=0.7', 'en-CA,en;q=0.9,fr-CA;q=0.8']) 1685 | 1686 | # Try adding cookies if we have any from previous responses 1687 | if session_obj.cookies: 1688 | print(f"Using {len(session_obj.cookies)} cookies from previous responses") 1689 | 1690 | # Add delay to avoid rate limiting 1691 | delay = random.uniform(1.0, 3.0) 1692 | print(f"Waiting {delay:.2f} seconds before retrying...") 1693 | time.sleep(delay) 1694 | 1695 | elif response.status_code == 429: # Too Many Requests 1696 | print(f"Received 429 Too Many Requests - rate limited") 1697 | 1698 | # Check if we have a Retry-After header 1699 | retry_after = response.headers.get('Retry-After') 1700 | if retry_after and retry_after.isdigit(): 1701 | delay = int(retry_after) + random.uniform(0.1, 1.0) 1702 | else: 1703 | delay = 5 + random.uniform(1.0, 5.0) # 5-10 second delay 1704 | 1705 | print(f"Waiting {delay:.2f} seconds before retrying...") 1706 | time.sleep(delay) 1707 | 1708 | # Rotate headers 1709 | headers['User-Agent'] = random.choice(user_agents) 1710 | 1711 | elif response.status_code == 503: # Service Unavailable - often used for anti-bot 1712 | print(f"Received 503 Service Unavailable - possible anti-bot measure") 1713 | 1714 | # Try with a longer delay and new headers 1715 | delay = 10 + random.uniform(1.0, 5.0) # 10-15 second delay 1716 | print(f"Waiting {delay:.2f} seconds before retrying...") 1717 | time.sleep(delay) 1718 | 1719 | # Complete header rotation 1720 | headers['User-Agent'] = random.choice(user_agents) 1721 | headers['Referer'] = random.choice(referers) 1722 | 1723 | else: 1724 | print(f"Received unexpected status code: {response.status_code}") 1725 | last_error = f"HTTP error ({response.status_code})" 1726 | 1727 | # Try with new headers on next attempt 1728 | headers['User-Agent'] = random.choice(user_agents) 1729 | 1730 | except requests.exceptions.Timeout: 1731 | print(f"Timeout error fetching {url}") 1732 | last_error = "Request timeout" 1733 | # Try with increased timeout on next attempt 1734 | 1735 | except requests.exceptions.ConnectionError: 1736 | print(f"Connection error fetching {url}") 1737 | last_error = "Connection error" 1738 | # Wait before retrying 1739 | time.sleep(2) 1740 | 1741 | except requests.exceptions.TooManyRedirects: 1742 | print(f"Too many redirects for {url}") 1743 | last_error = "Too many redirects" 1744 | # This is likely a permanent issue, break the loop 1745 | break 1746 | 1747 | except Exception as e: 1748 | print(f"Error fetching {url}: 
{str(e)}") 1749 | last_error = str(e) 1750 | 1751 | retry_count += 1 1752 | 1753 | # If we've exhausted all retries and still don't have content 1754 | if not html_content and retry_count >= max_retries: 1755 | error_msg = f"Failed to fetch website after {max_retries} attempts. Last error: {last_error}" 1756 | print(error_msg) 1757 | return jsonify({'error': error_msg}), 400 1758 | 1759 | # Safety check - make sure we have HTML content 1760 | if not html_content or len(html_content) < 100: # Arbitrary minimum size for valid HTML 1761 | return jsonify({'error': 'Failed to extract valid HTML content from the website'}), 400 1762 | 1763 | # Continue with asset extraction and zip file creation 1764 | try: 1765 | print("\nExtracting assets...") 1766 | # Extract assets from the HTML content 1767 | assets = extract_assets(html_content, url, session_obj, headers) 1768 | 1769 | if not assets: 1770 | return jsonify({'error': 'Failed to extract assets from the website'}), 500 1771 | 1772 | print(f"Assets extracted: {', '.join(assets.keys())}") 1773 | 1774 | # If we have additional URLs from Selenium, add them to the assets 1775 | if additional_urls: 1776 | print(f"Adding {len(additional_urls)} URLs discovered by Selenium") 1777 | for asset_url in additional_urls: 1778 | # Skip data URLs 1779 | if not asset_url or asset_url.startswith('data:'): 1780 | continue 1781 | 1782 | # Normalize URL 1783 | if asset_url.startswith('//'): 1784 | asset_url = f"https:{asset_url}" 1785 | 1786 | try: 1787 | asset_type = get_asset_type(asset_url) 1788 | if asset_type in assets and asset_url not in assets[asset_type]: 1789 | # Validate URL 1790 | parsed = urlparse(asset_url) 1791 | if parsed.scheme and parsed.netloc: 1792 | assets[asset_type].append(asset_url) 1793 | except Exception as url_error: 1794 | print(f"Error processing URL {asset_url}: {str(url_error)}") 1795 | 1796 | # Count assets by type 1797 | asset_counts = {asset_type: len(urls) for asset_type, urls in assets.items() 1798 | if isinstance(urls, list) and asset_type not in ['metadata', 'font_families']} 1799 | print(f"\nAsset counts:") 1800 | for asset_type, count in asset_counts.items(): 1801 | print(f" {asset_type}: {count}") 1802 | 1803 | # Check if we have enough assets 1804 | total_assets = sum(count for count in asset_counts.values()) 1805 | if total_assets < 5: 1806 | print("\nWARNING: Very few assets extracted. 
Trying alternative extraction methods...") 1807 | 1808 | # Try to extract assets from the page using JavaScript execution (simulated) 1809 | try: 1810 | # Look for JavaScript variables that might contain asset URLs 1811 | js_asset_patterns = [ 1812 | r'["\'](https?://[^"\']+\.(css|js|png|jpg|jpeg|gif|svg|woff2?))["\']', 1813 | r'["\'](/[^"\']+\.(css|js|png|jpg|jpeg|gif|svg|woff2?))["\']', 1814 | r'["\'](//[^"\']+\.(css|js|png|jpg|jpeg|gif|svg|woff2?))["\']', 1815 | r'loadCSS\(["\']([^"\']+)["\']', 1816 | r'loadJS\(["\']([^"\']+)["\']', 1817 | r'src=["\'](/[^"\']+)["\']', 1818 | r'href=["\'](/[^"\']+\.css)["\']', 1819 | # React/Next.js specific patterns 1820 | r'__NEXT_DATA__\s*=\s*({.*})', 1821 | r'window\.__PRELOADED_STATE__\s*=\s*({.*})', 1822 | r'window\.__INITIAL_STATE__\s*=\s*({.*})', 1823 | r'_ASSET_PREFIX_\s*=\s*["\']([^"\']+)["\']' 1824 | ] 1825 | 1826 | for pattern in js_asset_patterns: 1827 | matches = re.findall(pattern, html_content) 1828 | for match in matches: 1829 | if isinstance(match, tuple): 1830 | match_url = match[0] 1831 | else: 1832 | match_url = match 1833 | 1834 | if match_url.startswith('//'): 1835 | match_url = 'https:' + match_url 1836 | elif match_url.startswith('/'): 1837 | match_url = urljoin(url, match_url) 1838 | 1839 | # Skip if it's clearly not a URL (likely JSON data) 1840 | if '{' in match_url or '}' in match_url: 1841 | continue 1842 | 1843 | asset_type = get_asset_type(match_url) 1844 | if asset_type in assets: 1845 | assets[asset_type].append(match_url) 1846 | 1847 | print("Extracted additional assets from JavaScript patterns") 1848 | except Exception as e: 1849 | print(f"Error extracting additional assets: {str(e)}") 1850 | 1851 | # Try to fix relative URLs in the HTML 1852 | try: 1853 | print("\nFixing relative URLs...") 1854 | fixed_html = fix_relative_urls(html_content, url) 1855 | print("Relative URLs fixed") 1856 | except Exception as e: 1857 | print(f"Error fixing URLs: {str(e)}") 1858 | fixed_html = html_content # Use original HTML if fixing fails 1859 | 1860 | try: 1861 | # Create and send zip file, passing the session and headers 1862 | print("\nCreating zip file...") 1863 | 1864 | # Extract domain from URL for the filename 1865 | domain = urlparse(url).netloc 1866 | safe_domain = re.sub(r'[^\w\-_]', '_', domain) 1867 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 1868 | filename = f"{safe_domain}_{timestamp}.zip" 1869 | 1870 | # Create a zip file with the extracted content 1871 | zip_file_path = create_zip_file(fixed_html, assets, url, session_obj, headers) 1872 | 1873 | # Check if the file was created successfully 1874 | if not os.path.exists(zip_file_path) or os.path.getsize(zip_file_path) < 100: 1875 | return jsonify({'error': 'Failed to create valid zip file'}), 500 1876 | 1877 | print(f"Zip file created successfully at {zip_file_path} ({os.path.getsize(zip_file_path)} bytes)") 1878 | print(f"\nExtraction completed for: {url}\n{'='*80}") 1879 | 1880 | # Copy the temporary file to a more persistent location 1881 | persistent_dir = os.path.join(tempfile.gettempdir(), 'website_extractor_downloads') 1882 | os.makedirs(persistent_dir, exist_ok=True) 1883 | persistent_path = os.path.join(persistent_dir, filename) 1884 | 1885 | # Copy the file instead of moving to ensure the original isn't deleted prematurely 1886 | shutil.copy2(zip_file_path, persistent_path) 1887 | 1888 | # Schedule the temp file for deletion after a reasonable period (30 minutes) 1889 | def delete_temp_file(): 1890 | try: 1891 | time.sleep(1800) # 30 minutes 1892 | 
if os.path.exists(zip_file_path): 1893 | os.remove(zip_file_path) 1894 | print(f"Temporary file {zip_file_path} removed after 30 minutes") 1895 | if os.path.exists(persistent_path): 1896 | os.remove(persistent_path) 1897 | print(f"Persistent file {persistent_path} removed after 30 minutes") 1898 | except Exception as e: 1899 | print(f"Error removing temporary file: {str(e)}") 1900 | 1901 | # Start a thread to handle file deletion 1902 | cleanup_thread = threading.Thread(target=delete_temp_file) 1903 | cleanup_thread.daemon = True 1904 | cleanup_thread.start() 1905 | 1906 | # Send the persistent file with improved headers and explicit attachment 1907 | response = send_file( 1908 | persistent_path, 1909 | mimetype='application/zip', 1910 | as_attachment=True, 1911 | download_name=filename 1912 | ) 1913 | 1914 | # Add headers to prevent caching issues 1915 | response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate' 1916 | response.headers['Pragma'] = 'no-cache' 1917 | response.headers['Expires'] = '0' 1918 | response.headers['Content-Disposition'] = f'attachment; filename="{filename}"' 1919 | 1920 | # Note: We're no longer using after_this_request to remove the file immediately 1921 | # Instead, we're using a background thread to clean up after 30 minutes 1922 | 1923 | return response 1924 | 1925 | except Exception as e: 1926 | print(f"Error creating or sending zip file: {str(e)}") 1927 | traceback.print_exc() 1928 | return jsonify({'error': f'Failed to create or send zip file: {str(e)}'}), 500 1929 | except Exception as e: 1930 | print(f"Error in asset extraction: {str(e)}") 1931 | traceback.print_exc() 1932 | return jsonify({'error': f'Error extracting assets: {str(e)}'}), 500 1933 | 1934 | except Exception as e: 1935 | print(f"Unexpected error: {str(e)}") 1936 | traceback.print_exc() 1937 | return jsonify({'error': str(e)}), 500 1938 | 1939 | if __name__ == '__main__': 1940 | print("\n" + "="*80) 1941 | print("Website Extractor is running!") 1942 | print("Access it in your browser at: http://127.0.0.1:5001") 1943 | print("="*80 + "\n") 1944 | app.run(debug=True, threaded=True, port=5001) 1945 | 1946 | def main(): 1947 | """Entry point for the package, to allow running as an installed package from command line""" 1948 | print("\n" + "="*80) 1949 | print("Website Extractor is running!") 1950 | print("Access it in your browser at: http://127.0.0.1:5001") 1951 | print("="*80 + "\n") 1952 | app.run(debug=True, threaded=True, port=5001) --------------------------------------------------------------------------------
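For quick end-to-end testing, the `/extract` route defined above can be driven from a small script. A minimal client sketch, assuming the app is already running locally on port 5001 as configured in `app.py` (the target URL and output filename are illustrative):

```python
# Minimal client sketch for the /extract endpoint in app.py.
# Assumes the Flask app is running at http://127.0.0.1:5001.
import requests

resp = requests.post(
    "http://127.0.0.1:5001/extract",
    data={"url": "example.com", "use_selenium": "false"},  # form fields read by extract()
    timeout=300,  # extraction can take a while, especially with Selenium enabled
)

if resp.ok and "zip" in resp.headers.get("Content-Type", ""):
    with open("extracted_site.zip", "wb") as f:  # illustrative output name
        f.write(resp.content)
    print("Archive saved to extracted_site.zip")
else:
    # On failure the route returns JSON such as {"error": "..."}
    print("Extraction failed:", resp.status_code, resp.text[:200])
```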
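The extraction helpers can also be called directly, without going through the Flask route. A rough sketch under the assumption that `app.py` is importable from the working directory; the target URL and headers below are placeholders, not values from the project:

```python
# Sketch: use the helper functions from app.py directly to build an archive.
import requests
import urllib3
from app import extract_assets, fix_relative_urls, create_zip_file

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = "https://example.com"  # placeholder target
session_obj = requests.Session()
headers = {"User-Agent": "Mozilla/5.0"}  # minimal headers; the route rotates full browser UAs

html_content = session_obj.get(url, headers=headers, timeout=20, verify=False).text
assets = extract_assets(html_content, url, session_obj, headers)  # buckets CSS/JS/img/fonts/...
fixed_html = fix_relative_urls(html_content, url)                 # rewrite relative URLs
zip_path = create_zip_file(fixed_html, assets, url, session_obj, headers)
print(f"Archive written to {zip_path}")
```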
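Similarly, the metadata and component helpers can be exercised on a small in-memory page, which is handy when tweaking the heuristics in `extract_metadata` and `extract_component_structure`. A toy example; the sample markup is made up:

```python
# Toy example: run the metadata/component helpers on a hand-written snippet.
from bs4 import BeautifulSoup
from app import extract_metadata, extract_component_structure

sample = """
<html lang="en">
  <head><title>Demo</title><meta name="description" content="A demo page"></head>
  <body>
    <nav class="main-nav"><a href="/">Home</a></nav>
    <footer class="footer">Demo footer</footer>
  </body>
</html>
"""
soup = BeautifulSoup(sample, "html.parser")

meta = extract_metadata(soup, "https://example.com")
print(meta["title"], "|", meta["description"], "|", meta["language"])  # Demo | A demo page | en

components = extract_component_structure(soup)
print(sorted(components.keys()))  # expected: ['footer', 'navigation']
```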