├── requirements.txt
├── .gitignore
├── LICENSE
├── setup.py
├── CONTRIBUTING.md
├── docs
│   └── app_architecture_overview.md
├── app_architecture.md
├── README.md
├── templates
│   ├── components.html
│   └── index.html
└── app.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | flask==3.0.2
2 | requests==2.31.0
3 | beautifulsoup4==4.12.3
4 | urllib3==2.2.1
5 | cssutils==2.9.0
6 | selenium==4.18.1
7 | webdriver-manager==4.0.1
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # Virtual environment
28 | venv/
29 | env/
30 | ENV/
31 |
32 | # Flask related
33 | instance/
34 | .webassets-cache
35 |
36 | # Selenium & WebDriver
37 | chromedriver
38 | chromedriver.exe
39 | *.log
40 | geckodriver
41 | geckodriver.exe
42 | .wdm/
43 |
44 | # OS specific files
45 | .DS_Store
46 | .DS_Store?
47 | ._*
48 | .Spotlight-V100
49 | .Trashes
50 | ehthumbs.db
51 | Thumbs.db
52 |
53 | # Editor directories and files
54 | .idea/
55 | .vscode/
56 | *.swp
57 | *.swo
58 |
59 | # Temporary files
60 | *.tmp
61 | *~
62 | tmp/
63 | temp/
64 |
65 | # Downloaded website archives
66 | *.zip
67 |
68 | # Local environment variables
69 | .env
70 | .env.local
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Sirio Berati
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r", encoding="utf-8") as fh:
4 | long_description = fh.read()
5 |
6 | with open("requirements.txt", "r", encoding="utf-8") as fh:
7 | requirements = fh.read().splitlines()
8 |
9 | setup(
10 | name="website-extractor",
11 | version="1.0.0",
12 | author="Sirio Berati",
13 | author_email="your.email@example.com", # Replace with your actual email
14 | description="A tool to extract and archive entire websites with advanced rendering capabilities",
15 | long_description=long_description,
16 | long_description_content_type="text/markdown",
17 | url="https://github.com/sirioberati/website-extractor",
18 | packages=find_packages(),
19 | classifiers=[
20 | "Programming Language :: Python :: 3",
21 | "License :: OSI Approved :: MIT License",
22 | "Operating System :: OS Independent",
23 | "Topic :: Internet :: WWW/HTTP",
24 | "Topic :: Software Development :: Libraries :: Python Modules",
25 | "Topic :: Utilities",
26 | ],
27 | python_requires=">=3.7",
28 | install_requires=requirements,
29 | entry_points={
30 | "console_scripts": [
31 | "website-extractor=app:main",
32 | ],
33 | },
34 | include_package_data=True,
35 | )
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Website Extractor
2 |
3 | Thank you for considering contributing to Website Extractor! This document provides guidelines and instructions for contributing to this project.
4 |
5 | ## Code of Conduct
6 |
7 | By participating in this project, you agree to maintain a respectful and inclusive environment for everyone.
8 |
9 | ## How Can I Contribute?
10 |
11 | ### Reporting Bugs
12 |
13 | Before creating a bug report:
14 |
15 | 1. Check the existing issues to see if the problem has already been reported
16 | 2. Collect information about the bug (steps to reproduce, error messages, etc.)
17 |
18 | When submitting a bug report, please include:
19 |
20 | - A clear and descriptive title
21 | - Detailed steps to reproduce the issue
22 | - Expected vs. actual behavior
23 | - Screenshots if applicable
24 | - Your environment information (OS, browser, Python version, etc.)
25 |
26 | ### Suggesting Enhancements
27 |
28 | Enhancement suggestions are welcome! Please include:
29 |
30 | - A clear and descriptive title
31 | - A detailed description of the proposed enhancement
32 | - The motivation behind the enhancement
33 | - Any potential implementation details you can think of
34 |
35 | ### Pull Requests
36 |
37 | 1. Fork the repository
38 | 2. Create a new branch (`git checkout -b feature/amazing-feature`)
39 | 3. Make your changes
40 | 4. Run tests if available
41 | 5. Commit your changes (`git commit -m 'Add some amazing feature'`)
42 | 6. Push to your branch (`git push origin feature/amazing-feature`)
43 | 7. Open a Pull Request
44 |
45 | ## Development Setup
46 |
47 | 1. Fork and clone the repository
48 | 2. Create a virtual environment:
49 | ```bash
50 | python -m venv venv
51 | source venv/bin/activate # On Windows: venv\Scripts\activate
52 | ```
53 | 3. Install dependencies:
54 | ```bash
55 | pip install -r requirements.txt
56 | ```
57 | 4. Run the application locally:
58 | ```bash
59 | python app.py
60 | ```
61 |
62 | ## Coding Standards
63 |
64 | - Follow PEP 8 style guidelines
65 | - Write descriptive commit messages
66 | - Include comments and docstrings
67 | - Add tests for new features when possible
68 |
69 | ## License
70 |
71 | By contributing to this project, you agree that your contributions will be licensed under the project's [MIT License](LICENSE).
72 |
73 | ## Questions?
74 |
75 | If you have any questions, feel free to reach out to the project maintainer through the GitHub issues page.
--------------------------------------------------------------------------------
/docs/app_architecture_overview.md:
--------------------------------------------------------------------------------
1 | # Website Extractor Architecture Overview
2 |
3 | ```
4 | ┌───────────────────────────────────────────────────────────────────┐
5 | │ Website Extractor Application │
6 | └───────────────────────────────────────────────────────────────────┘
7 | │
8 | ▼
9 | ┌───────────────────────────────────────────────────────────────────┐
10 | │ Flask Web Server │
11 | └───────────────────────────────────────────────────────────────────┘
12 | │
13 | ▼
14 | ┌───────────────────────────────────────────────────────────────────┐
15 | │ Extraction Core Processes │
16 | ├───────────────┬──────────────────┬──────────────────┬─────────────┤
17 | │ HTTP Client │ Selenium Renderer │ Content Parser │ Asset Saver │
18 | │ (requests) │ (WebDriver) │ (BeautifulSoup) │ (Zip) │
19 | └───────────────┴──────────────────┴──────────────────┴─────────────┘
20 | ```
21 |
22 | ## Data Flow
23 |
24 | ```
25 | ┌──────────┐ URL ┌──────────┐ HTML Content ┌──────────────┐
26 | │ User │───────────▶│ Extractor│───────────────▶│ HTML Parser │
27 | └──────────┘ └──────────┘ └──────────────┘
28 | │ │
29 | Rendering │ │ Asset URLs
30 | option │ │
31 | ▼ ▼
32 | ┌──────────┐ ┌──────────────┐
33 | │ Selenium │ │ Asset │
34 | │ WebDriver│ │ Downloader │
35 | └──────────┘ └──────────────┘
36 | │ │
37 | Rendered│ Assets │
38 | HTML │ │
39 | ▼ ▼
40 | ┌──────────────────────────────────────────┐
41 | │ Zip File Creator │
42 | └──────────────────────────────────────────┘
43 | │
44 | ▼
45 | ┌──────────────────────────────────────────┐
46 | │ File Download Response to User │
47 | └──────────────────────────────────────────┘
48 | ```
49 |
50 | ### Key Components
51 |
52 | 1. **Flask Web Server**: The user interface and API endpoint
53 | 2. **HTTP Client**: Makes network requests to target websites
54 | 3. **Selenium Renderer**: Renders JavaScript-heavy sites (optional)
55 | 4. **Content Parser**: Analyzes HTML to extract assets
56 | 5. **Asset Downloader**: Retrieves all website assets
57 | 6. **Zip Creator**: Packages everything into a downloadable archive
58 |
59 | For more detailed information, see the full [app_architecture.md](../app_architecture.md) file.
--------------------------------------------------------------------------------
/app_architecture.md:
--------------------------------------------------------------------------------
1 | # Website Extractor - Application Architecture
2 |
3 | ## Overview
4 |
5 | This document provides a high-level overview of the Website Extractor application architecture, explaining how the different components interact and the flow of data through the system.
6 |
7 | ```
8 | ┌───────────────────────────────────────────────────────────────────┐
9 | │ Website Extractor Application │
10 | └───────────────────────────────────────────────────────────────────┘
11 | │
12 | ▼
13 | ┌───────────────────────────────────────────────────────────────────┐
14 | │ Flask Web Server │
15 | └───────────────────────────────────────────────────────────────────┘
16 | │
17 | ▼
18 | ┌───────────────────────────────────────────────────────────────────┐
19 | │ Extraction Core Processes │
20 | ├───────────────┬──────────────────┬──────────────────┬─────────────┤
21 | │ HTTP Client │ Selenium Renderer │ Content Parser │ Asset Saver │
22 | │ (requests) │ (WebDriver) │ (BeautifulSoup) │ (Zip) │
23 | └───────────────┴──────────────────┴──────────────────┴─────────────┘
24 | ```
25 |
26 | ## Data Flow Diagram
27 |
28 | ```
29 | ┌──────────┐ URL ┌──────────┐ HTML Content ┌──────────────┐
30 | │ User │───────────▶│ Extractor│───────────────▶│ HTML Parser │
31 | └──────────┘ └──────────┘ └──────────────┘
32 | │ │
33 | Rendering │ │ Asset URLs
34 | option │ │
35 | ▼ ▼
36 | ┌──────────┐ ┌──────────────┐
37 | │ Selenium │ │ Asset │
38 | │ WebDriver│ │ Downloader │
39 | └──────────┘ └──────────────┘
40 | │ │
41 | Rendered│ Assets │
42 | HTML │ │
43 | ▼ ▼
44 | ┌──────────────────────────────────────────┐
45 | │ Zip File Creator │
46 | └──────────────────────────────────────────┘
47 | │
48 | ▼
49 | ┌──────────────────────────────────────────┐
50 | │ File Download Response to User │
51 | └──────────────────────────────────────────┘
52 | ```
53 |
54 | ## Component Descriptions
55 |
56 | ### 1. Flask Web Server
57 | - **Purpose**: Provides the web interface and handles HTTP requests
58 | - **Key Files**: `app.py` (main file), `templates/index.html` (UI)
59 | - **Functions**: Serves the interface, processes form submissions, returns downloaded files
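
A minimal sketch of what this layer looks like, assuming an `/extract` POST route that returns an in-memory ZIP (the route names come from this document; the form field and download filename below are illustrative, not the exact `app.py` implementation):

```python
from io import BytesIO
from flask import Flask, render_template, request, send_file

app = Flask(__name__)

@app.route('/')
def index():
    # Serve the extraction form
    return render_template('index.html')

@app.route('/extract', methods=['POST'])
def extract():
    url = request.form.get('url')
    use_selenium = request.form.get('use_selenium') == 'on'  # assumed field name
    # ... fetch the page, parse it, download assets, build the archive ...
    zip_buffer = BytesIO()
    # create_zip_file(...) would populate zip_buffer here
    zip_buffer.seek(0)
    return send_file(zip_buffer, as_attachment=True,
                     download_name='website_extract.zip',  # illustrative name
                     mimetype='application/zip')
```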
60 |
61 | ### 2. HTTP Client (Requests)
62 | - **Purpose**: Fetches website content using standard HTTP requests
63 | - **Key Functions**: `download_asset()`, HTTP request code in `/extract` route
64 | - **Features**: Cookie handling, header rotation, retry logic, error handling
65 |
66 | ### 3. Selenium Renderer (Optional)
67 | - **Purpose**: Renders JavaScript-heavy websites using a headless Chrome browser
68 | - **Key Functions**: `extract_with_selenium()`
69 | - **Features**: Waits for dynamic content, scrolls the page, handles lazy loading, identifies framework-specific resources
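
A rough sketch of how `extract_with_selenium()` can be structured with headless Chrome and webdriver-manager (the real function likely adds explicit waits, clicks, and framework detection beyond this):

```python
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def extract_with_selenium_sketch(url, settle_seconds=5):
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                              options=options)
    try:
        driver.get(url)
        time.sleep(settle_seconds)  # let the initial JavaScript run
        # Scroll to the bottom to trigger lazy-loaded images and sections
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        return driver.page_source  # rendered HTML after JavaScript execution
    finally:
        driver.quit()
```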
70 |
71 | ### 4. Content Parser
72 | - **Purpose**: Analyzes HTML content to extract assets and structure
73 | - **Key Functions**: `extract_assets()`, `extract_metadata()`, `extract_component_structure()`
74 | - **Features**: Identifies CSS, JS, images, fonts, extracts metadata, identifies UI components
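
At its core, asset discovery with BeautifulSoup means collecting `href`/`src` attributes and resolving them against the page URL. A simplified sketch of that idea (not the full `extract_assets()`):

```python
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def discover_assets_sketch(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    assets = set()
    for tag, attr in (('link', 'href'), ('script', 'src'),
                      ('img', 'src'), ('source', 'src')):
        for element in soup.find_all(tag):
            value = element.get(attr)
            if value and not value.startswith(('data:', 'javascript:', '#')):
                assets.add(urljoin(base_url, value))  # resolve relative URLs
    return assets
```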
75 |
76 | ### 5. Asset Downloader
77 | - **Purpose**: Downloads all discovered assets
78 | - **Key Functions**: `download_asset()`
79 | - **Features**: Handles different asset types, resolves relative URLs, manages retries
80 |
81 | ### 6. Zip File Creator
82 | - **Purpose**: Packages all assets into a downloadable zip file
83 | - **Key Functions**: `create_zip_file()`
84 | - **Features**: Organizes assets by type, handles file naming, adds metadata and documentation
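
The packaging step is standard-library `zipfile` work. A condensed sketch of the idea behind `create_zip_file()` (the folder layout shown is illustrative, not necessarily the exact structure the app produces):

```python
import json
import zipfile
from io import BytesIO

def create_zip_sketch(html, assets_by_type, metadata):
    """assets_by_type maps a type ('css', 'js', ...) to {filename: content_bytes}."""
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
        archive.writestr('index.html', html)
        archive.writestr('metadata.json', json.dumps(metadata, indent=2))
        for asset_type, files in assets_by_type.items():
            for name, content in files.items():
                archive.writestr(f'{asset_type}/{name}', content)
    buffer.seek(0)
    return buffer
```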
85 |
86 | ## Process Flow
87 |
88 | 1. **User Submits URL**:
89 | - User enters a URL in the web interface
90 | - Optionally selects "Use Advanced Rendering (Selenium)"
91 | - Submits the form to the `/extract` endpoint
92 |
93 | 2. **Content Acquisition**:
94 | - If Selenium is selected: Uses Chrome WebDriver to render the page
95 | - Otherwise: Uses Requests library for HTTP retrieval
96 | - Handles redirects, errors, retries with different headers
97 |
98 | 3. **HTML Processing**:
99 | - Parses HTML using BeautifulSoup
100 | - Fixes relative URLs
101 | - Extracts metadata (title, description, etc.)
102 | - Identifies UI components
103 |
104 | 4. **Asset Discovery**:
105 | - Finds all linked resources (CSS, JS, images, fonts, etc.)
106 | - Resolves URLs
107 | - Categorizes assets by type
108 | - Handles duplicates
109 |
110 | 5. **Asset Download**:
111 | - Downloads all discovered assets
112 | - Handles binary vs. text content
113 | - Manages errors and retries
114 |
115 | 6. **Zip Creation**:
116 | - Creates organized folder structure
117 | - Adds README and metadata
118 | - Creates component index
119 | - Packages everything into a ZIP file
120 |
121 | 7. **User Download**:
122 | - Returns the ZIP file as a downloadable attachment
123 | - Manages temporary file cleanup
124 |
125 | ## Challenges & Error Patterns
126 |
127 | ### Common Failure Points
128 |
129 | 1. **Selenium WebDriver Initialization**:
130 | - Error seen in logs: `Error initializing Chrome WebDriver: [Errno 8] Exec format error`
131 | - Cause: WebDriver executable permission or architecture mismatch
132 |    - Fallback: An alternative initialization method is attempted (see the sketch after this list)
133 |
134 | 2. **CDN and Image Processing URLs**:
135 | - Error seen: `Failed to download https://www.tesla.com/q_auto/Homepage-New-Legacy-Model-Y-Desktop.png, status: 404`
136 | - Cause: URLs contain transformation parameters (`q_auto`, `f_auto`) that are processed by CDNs and don't represent actual file paths
137 |
138 | 3. **Theme and Framework Resources**:
139 | - Error seen: `Failed to download https://www.tesla.com/themes/contrib/stable/images/core/throbber-active.gif, status: 404`
140 | - Cause: Theme resources may be generated dynamically or have access restrictions
141 |
142 | 4. **Anti-Bot Measures**:
143 | - Some sites implement anti-scraping measures (403 Forbidden responses)
144 | - Application implements header rotation and Selenium fallback to mitigate this
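
The fallback referenced in point 1 amounts to a try/except chain: prefer the webdriver-manager binary, then fall back to a driver Selenium can resolve on its own. A hedged sketch of the pattern (not the exact code in `app.py`):

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def build_driver_sketch(options: Options):
    try:
        # Preferred: let webdriver-manager fetch a matching ChromeDriver build
        service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=options)
    except Exception as exc:  # e.g. "[Errno 8] Exec format error" on an architecture mismatch
        print(f"webdriver-manager initialization failed: {exc}")
        # Fallback: let Selenium Manager (bundled with Selenium 4.6+) locate a driver
        return webdriver.Chrome(options=options)
```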
145 |
146 | ## Improvement Opportunities
147 |
148 | 1. **URL Processing**: Enhance URL normalization to better handle CDN-specific parameters (see the sketch below)
149 | 2. **Asset Deduplication**: Improve handling of duplicate assets with different query parameters
150 | 3. **Error Handling**: Add more targeted error handling for specific CDN formats
151 | 4. **WebDriver Management**: Improve Selenium WebDriver initialization reliability
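
For item 1, one possible direction is stripping known image-transformation path segments (such as Cloudinary-style `q_auto`/`f_auto`) before attempting a download. This is a speculative sketch of the idea, not existing behavior:

```python
from urllib.parse import urlparse, urlunparse

# Path segments some image CDNs use as transformation flags rather than real directories
TRANSFORM_SEGMENTS = {'q_auto', 'f_auto', 'c_fill', 'w_auto'}

def normalize_cdn_url_sketch(url):
    parsed = urlparse(url)
    segments = [s for s in parsed.path.split('/')
                if s and s.split(',')[0] not in TRANSFORM_SEGMENTS]
    return urlunparse(parsed._replace(path='/' + '/'.join(segments)))

# normalize_cdn_url_sketch("https://www.tesla.com/q_auto/Homepage-New-Legacy-Model-Y-Desktop.png")
# -> "https://www.tesla.com/Homepage-New-Legacy-Model-Y-Desktop.png"
```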
152 |
153 | ## Technical Dependencies
154 |
155 | - **Flask**: Web framework
156 | - **Requests**: HTTP client
157 | - **BeautifulSoup**: HTML parsing
158 | - **Selenium**: Browser automation
159 | - **cssutils**: CSS parsing
160 | - **zipfile**: ZIP file creation
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Website Extractor
2 |
3 | 
4 | [MIT License](https://opensource.org/licenses/MIT)
5 |
6 | ## Overview
7 |
8 | Website Extractor is a powerful Python-based tool that allows you to download and archive entire websites with a single click. This application extracts HTML, CSS, JavaScript, images, fonts, and other assets from any website, making it ideal for:
9 |
10 | - Creating pixel-perfect copies of any website online
11 | - Training AI agents with real-world web content
12 | - Studying website structure and design
13 | - Extracting UI components for design inspiration
14 | - Archiving web content for research
15 | - Learning web development techniques
16 |
17 | The application features advanced rendering capabilities using Selenium, allowing it to properly extract assets from modern JavaScript-heavy websites and single-page applications.
18 |
19 | 
20 |
21 | ## Features
22 |
23 | - **Advanced Rendering**: Uses Selenium with Chrome WebDriver to render JavaScript-heavy sites
24 | - **Comprehensive Asset Extraction**: Downloads HTML, CSS, JavaScript, images, fonts, and more
25 | - **Metadata Extraction**: Captures site metadata, OpenGraph tags, and structured data
26 | - **UI Component Analysis**: Identifies and extracts UI components like headers, navigation, cards, etc.
27 | - **Organized Output**: Creates a well-structured ZIP file with assets organized by type
28 | - **Responsive Design**: Works with both desktop and mobile websites
29 | - **CDN Support**: Handles assets from various Content Delivery Networks
30 | - **Modern Framework Support**: Special handling for React, Next.js, Angular, and Tailwind CSS
31 |
32 | ## Advanced Use Cases
33 |
34 | ### Pixel-Perfect Website Copies
35 | Create exact replicas of websites for study, testing, or inspiration. The advanced rendering engine ensures even complex layouts and JavaScript-driven designs are faithfully reproduced.
36 |
37 | ### AI Agent Training
38 | Extract websites to create high-quality training data for your AI agents:
39 | - Feed the structured content to AI models to improve their understanding of web layouts
40 | - Train AI assistants on real-world UI components and design patterns
41 | - Create diverse datasets of web content for machine learning projects
42 |
43 | ### Cursor IDE Integration
44 | Website Extractor works seamlessly with Cursor IDE:
45 | - Extract a website and open it directly in Cursor for code analysis
46 | - Edit the extracted code with Cursor's AI-powered assistance
47 | - Use the components as reference for your own projects
48 | - Ask Cursor to analyze the site's structure and styles to apply similar patterns to your work
49 |
50 | ### Design Inspiration & Reference
51 | Upload the extracted folder to your current project and:
52 | - Ask Cursor to reference its style when building new pages
53 | - Study professional UI implementations
54 | - Extract specific components for reuse in your own projects
55 | - Learn modern CSS techniques from production websites
56 |
57 | ## Installation
58 |
59 | ### Prerequisites
60 |
61 | - Python 3.7+
62 | - Chrome/Chromium browser (for advanced rendering)
63 | - Git
64 |
65 | ### Using Cursor (Recommended)
66 |
67 | 1. Clone the repository:
68 | ```bash
69 | git clone https://github.com/sirioberati/WebTwin.git
70 | cd WebTwin
71 | ```
72 |
73 | 2. Open the project in Cursor IDE:
74 | ```bash
75 | cursor .
76 | ```
77 |
78 | 3. Create a virtual environment (within Cursor's terminal):
79 | ```bash
80 | python -m venv venv
81 | ```
82 |
83 | 4. Activate the virtual environment:
84 | - On Windows: `venv\Scripts\activate`
85 | - On macOS/Linux: `source venv/bin/activate`
86 |
87 | 5. Install dependencies:
88 | ```bash
89 | pip install -r requirements.txt
90 | ```
91 |
92 | ### Manual Installation
93 |
94 | 1. Clone the repository:
95 | ```bash
96 | git clone https://github.com/sirioberati/WebTwin.git
97 | cd WebTwin
98 | ```
99 |
100 | 2. Create a virtual environment:
101 | ```bash
102 | python -m venv venv
103 | ```
104 |
105 | 3. Activate the virtual environment:
106 | - On Windows: `venv\Scripts\activate`
107 | - On macOS/Linux: `source venv/bin/activate`
108 |
109 | 4. Install dependencies:
110 | ```bash
111 | pip install -r requirements.txt
112 | ```
113 |
114 | ## Usage
115 |
116 | 1. Activate your virtual environment (if not already activated)
117 |
118 | 2. Run the application:
119 | ```bash
120 | python app.py
121 | ```
122 |
123 | 3. Open your browser and navigate to:
124 | ```
125 | http://127.0.0.1:5001
126 | ```
127 |
128 | 4. Enter the URL of the website you want to extract
129 |
130 | 5. Check "Use Advanced Rendering (Selenium)" for JavaScript-heavy websites
131 |
132 | 6. Click "Extract Website" and wait for the download to complete
133 |
134 | ### Using Advanced Rendering
135 |
136 | The advanced rendering option uses Selenium with Chrome WebDriver to:
137 | - Execute JavaScript
138 | - Render dynamic content
139 | - Scroll through the page to trigger lazy loading
140 | - Click on UI elements to expose hidden content
141 | - Extract resources loaded by JavaScript frameworks
142 |
143 | This option is recommended for modern websites, especially those built with React, Angular, Vue, or other JavaScript frameworks.
144 |
145 | ### Using with Cursor IDE
146 |
147 | After extracting a website:
148 |
149 | 1. Unzip the downloaded file to a directory
150 | 2. Open with Cursor IDE:
151 | ```bash
152 | cursor /path/to/extracted/website
153 | ```
154 | 3. Explore the code structure and assets
155 | 4. Ask Cursor AI to analyze the code with prompts like:
156 | - "Explain the CSS structure of this website"
157 | - "How can I implement a similar hero section in my project?"
158 | - "Analyze this navigation component and create a similar one for my React app"
159 |
160 | ## AI Agent Integration
161 |
162 | WebTwin can be a powerful tool when combined with AI agents, enabling sophisticated workflows for code analysis, design extraction, and content repurposing.
163 |
164 | ### Integration with Cursor AI
165 |
166 | Cursor's AI capabilities can be supercharged with WebTwin's extraction abilities:
167 |
168 | 1. **Extract and Modify Workflow**:
169 | ```
170 | WebTwin → Extract Site → Open in Cursor → Ask AI to Modify
171 | ```
172 | Example prompts:
173 | - "Convert this landing page to use Tailwind CSS instead of Bootstrap"
174 | - "Refactor this JavaScript code to use React hooks"
175 | - "Simplify this complex CSS layout while maintaining the same visual appearance"
176 |
177 | 2. **Component Library Creation**:
178 | ```
179 | WebTwin → Extract Multiple Sites → Open in Cursor → AI-Powered Component Extraction
180 | ```
181 | Example prompts:
182 | - "Extract all button styles from these websites and create a unified component library"
183 | - "Analyze these navigation patterns and create a best-practices implementation"
184 |
185 | 3. **Learn from Production Code**:
186 | ```
187 | WebTwin → Extract Complex Site → Cursor AI Analysis → Generate Tutorial
188 | ```
189 | Example prompts:
190 | - "Explain how this site implements its responsive design strategy"
191 | - "Show me how this animation effect works and help me implement something similar"
192 |
193 | ### Integration with OpenAI Assistants API & Agent SDK
194 |
195 | WebTwin can be integrated with the OpenAI Assistants API and Agent SDK to create specialized AI agents:
196 |
197 | 1. **Setup a Website Analysis Agent**:
198 | ```python
199 | from openai import OpenAI
200 |
201 | client = OpenAI(api_key="your-api-key")
202 |
203 | # Create an assistant specialized in web design analysis
204 | assistant = client.beta.assistants.create(
205 | name="WebDesignAnalyzer",
206 | instructions="You analyze websites extracted by WebTwin and provide design insights.",
207 | model="gpt-4-turbo",
208 | tools=[{"type": "file_search"}]
209 | )
210 |
211 | # Upload the extracted website files
212 | file = client.files.create(
213 | file=open("extracted_website.zip", "rb"),
214 | purpose="assistants"
215 | )
216 |
217 | # Create a thread with the file
218 | thread = client.beta.threads.create(
219 | messages=[
220 | {
221 | "role": "user",
222 | "content": "Analyze this website's design patterns and component structure",
223 | "file_ids": [file.id]
224 | }
225 | ]
226 | )
227 |
228 | # Run the assistant on the thread
229 | run = client.beta.threads.runs.create(
230 | thread_id=thread.id,
231 | assistant_id=assistant.id
232 | )
233 | ```
234 |
235 | 2. **Create a Website Transformation Pipeline**:
236 | ```
237 | WebTwin → Extract Site → OpenAI Agent Processes → Generate New Code
238 | ```
239 |
240 | 3. **Build a Web Design Critique Agent**:
241 | - Feed WebTwin extractions to an AI agent trained to evaluate design principles
242 | - Receive detailed feedback on accessibility, usability, and visual design
243 |
244 | ### Advanced Agent Workflows
245 |
246 | Combine WebTwin with AI agents for advanced workflows:
247 |
248 | 1. **Cross-Site Design Pattern Analysis**:
249 | - Extract multiple sites in the same industry
250 | - Use AI to identify common patterns and best practices
251 | - Generate a report on industry-standard approaches
252 |
253 | 2. **Automated Component Library Generation**:
254 | - Extract multiple sites
255 | - Use AI to identify and categorize UI components
256 | - Generate a unified component library with documentation
257 |
258 | 3. **SEO and Content Strategy Analysis**:
259 | - Extract content-rich websites
260 | - Use AI to analyze content structure, metadata, and keyword usage
261 | - Generate SEO recommendations and content strategy insights
262 |
263 | 4. **Competitive Analysis**:
264 | - Extract competitor websites
265 | - Use AI to compare features, UX patterns, and technical implementations
266 | - Generate a competitive analysis report with strengths and weaknesses
267 |
268 | ## Architecture
269 |
270 | The application is built with a modular architecture designed for flexibility and performance:
271 |
272 | ```
273 | ┌───────────────────────────────────────────────────────────────────┐
274 | │ Website Extractor Application │
275 | └───────────────────────────────────────────────────────────────────┘
276 | │
277 | ▼
278 | ┌───────────────────────────────────────────────────────────────────┐
279 | │ Flask Web Server │
280 | └───────────────────────────────────────────────────────────────────┘
281 | │
282 | ▼
283 | ┌───────────────────────────────────────────────────────────────────┐
284 | │ Extraction Core Processes │
285 | ├───────────────┬──────────────────┬──────────────────┬─────────────┤
286 | │ HTTP Client │ Selenium Renderer │ Content Parser │ Asset Saver │
287 | │ (requests) │ (WebDriver) │ (BeautifulSoup) │ (Zip) │
288 | └───────────────┴──────────────────┴──────────────────┴─────────────┘
289 | ```
290 |
291 | ### Data Flow
292 |
293 | ```
294 | ┌──────────┐ URL ┌──────────┐ HTML Content ┌──────────────┐
295 | │ User │───────────▶│ Extractor│───────────────▶│ HTML Parser │
296 | └──────────┘ └──────────┘ └──────────────┘
297 | │ │
298 | Rendering │ │ Asset URLs
299 | option │ │
300 | ▼ ▼
301 | ┌──────────┐ ┌──────────────┐
302 | │ Selenium │ │ Asset │
303 | │ WebDriver│ │ Downloader │
304 | └──────────┘ └──────────────┘
305 | │ │
306 | Rendered│ Assets │
307 | HTML │ │
308 | ▼ ▼
309 | ┌──────────────────────────────────────────┐
310 | │ Zip File Creator │
311 | └──────────────────────────────────────────┘
312 | │
313 | ▼
314 | ┌──────────────────────────────────────────┐
315 | │ File Download Response to User │
316 | └──────────────────────────────────────────┘
317 | ```
318 |
319 | ### Key Components
320 |
321 | 1. **Flask Web Server**: Provides the user interface and handles HTTP requests
322 | 2. **HTTP Client**: Makes requests to fetch website content using the Requests library
323 | 3. **Selenium Renderer**: Optional component for JavaScript rendering and dynamic content
324 | 4. **Content Parser**: Analyzes HTML to extract assets and structure using BeautifulSoup
325 | 5. **Asset Downloader**: Downloads all discovered assets with sophisticated retry logic
326 | 6. **ZIP Creator**: Packages everything into an organized downloadable archive
327 |
328 | ### Processing Stages
329 |
330 | 1. **URL Submission**: User provides a URL and rendering options
331 | 2. **Content Acquisition**: HTML content is fetched (with or without JavaScript rendering)
332 | 3. **Structure Analysis**: HTML is parsed and analyzed for assets and components
333 | 4. **Asset Discovery**: All linked resources are identified and categorized
334 | 5. **Parallel Downloading**: Assets are downloaded with optimized concurrent requests
335 | 6. **Organization & Packaging**: Files are organized and compressed into a ZIP archive
336 |
337 | For more detailed technical information, see [app_architecture.md](app_architecture.md).
338 |
339 | ## Limitations
340 |
341 | - Some websites implement anti-scraping measures that may block extraction
342 | - Content requiring authentication may not be accessible
343 | - Very large websites may time out or require multiple extraction attempts
344 | - Some CDN-specific URL formats may fail to download (especially those with transformation parameters)
345 |
346 | ## License
347 |
348 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
349 |
350 | ## Author
351 |
352 | Created by Sirio Berati
353 |
354 | - Instagram: [@heysirio](https://instagram.com/heysirio)
355 | - Instagram: [@siriosagents](https://instagram.com/siriosagents)
356 |
357 | ## Contributing
358 |
359 | Contributions are welcome! Please feel free to submit a Pull Request.
360 |
361 | 1. Fork the repository
362 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
363 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
364 | 4. Push to the branch (`git push origin feature/amazing-feature`)
365 | 5. Open a Pull Request
366 |
367 | ## Acknowledgments
368 |
369 | - This project uses [Flask](https://flask.palletsprojects.com/) for the web framework
370 | - [Selenium](https://www.selenium.dev/) for advanced rendering
371 | - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for HTML parsing
372 | - All the open source libraries that made this project possible
373 |
--------------------------------------------------------------------------------
/templates/components.html:
--------------------------------------------------------------------------------
1 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | Extracted Components Viewer
13 |
14 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
Extracted Components
49 |
50 | {% if extracted_url is defined and extracted_url %}
51 | Components extracted from: {{ extracted_url }}
52 | {% else %}
53 | Browse and inspect all extracted UI components from the website
54 | {% endif %}
55 |
56 |
57 |
58 | Back to Main Page
59 |
60 |
61 |
62 |
63 |
64 |
67 |
How to Use This Viewer
68 |
69 |
70 | - Browse through the component categories below
71 | - Click on any component to view its preview
72 | - Use the "View Code" button to see the HTML structure
73 | - Copy the code to use in your own projects
74 | - Click "View in Context" to see how the component appears on the original page
75 |
76 |
77 |
78 |
79 |
80 |
81 |
84 |
Navigation
85 |
86 |
Headers, menus, and navigation bars
87 |
88 | {{ navigation_count }} components found
89 |
90 |
91 |
92 |
93 |
94 |
97 |
Hero Sections
98 |
99 |
Main banners and hero areas
100 |
101 | {{ hero_count }} components found
102 |
103 |
104 |
105 |
106 |
107 |
110 |
Cards
111 |
112 |
Product cards, info cards, and pricing cards
113 |
114 | {{ card_count }} components found
115 |
116 |
117 |
118 |
119 |
120 |
123 |
Sections
124 |
125 |
Content sections and feature blocks
126 |
127 | {{ section_count }} components found
128 |
129 |
130 |
131 |
132 |
133 |
136 |
Forms
137 |
138 |
Contact forms, sign-up forms, and inputs
139 |
140 | {{ form_count }} components found
141 |
142 |
143 |
144 |
145 |
146 |
149 |
Footers
150 |
151 |
Page footers and bottom sections
152 |
153 | {{ footer_count }} components found
154 |
155 |
156 |
157 |
158 |
159 |
162 |
Store Components
163 |
NEW
164 |
165 |
Product listings, filters, and store layouts
166 |
167 | {{ store_count }} components found
168 |
169 |
170 |
171 |
172 |
173 |
176 |
Mobile Components
177 |
NEW
178 |
179 |
Mobile-specific UI elements and responsive components
180 |
181 | {{ mobile_count }} components found
182 |
183 |
184 |
185 |
186 |
187 |
190 |
Cart Components
191 |
NEW
192 |
193 |
Shopping cart elements and checkout flows
194 |
195 | {{ cart_count }} components found
196 |
197 |
198 |
199 |
200 |
201 |
Metadata
202 |
203 |
204 |
Page Title
205 |
{{ page_title }}
206 |
207 |
208 |
209 |
Description
210 |
{{ meta_description }}
211 |
212 |
213 |
214 |
Keywords
215 |
216 | {% for keyword in meta_keywords %}
217 | {{ keyword }}
218 | {% endfor %}
219 |
220 |
221 |
222 |
223 |
Open Graph
224 |
225 |
Title: {{ og_title }}
226 |
Description: {{ og_description }}
227 |
Image: {{ og_image }}
228 |
229 |
230 |
231 |
232 |
233 |
234 |
Framework Configuration
235 |
236 |
237 |
238 |
241 |
Next.js Configuration
242 |
NEW
243 |
244 |
245 |
{{ next_config }}
246 |
247 |
248 |
249 |
250 |
251 |
254 |
Tailwind Configuration
255 |
NEW
256 |
257 |
258 |
{{ tailwind_config }}
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
Component Previews
267 |
268 | {% if components|length > 0 %}
269 | {% for component in components %}
270 |
271 |
272 |
273 |
{{ component.name }}
274 |
{{ component.type }}
275 |
276 |
285 |
286 |
287 |
288 |
289 | {{ component.html|safe }}
290 |
291 |
292 |
293 |
294 |
{{ component.code }}
295 |
296 |
297 | {% endfor %}
298 | {% else %}
299 |
300 |
301 |
306 |
307 |
308 | No components were found in the extracted website. This could be because:
309 |
310 |
311 | - The website uses a complex structure that's difficult to extract
312 | - The website uses custom components that don't match our extraction patterns
313 | - You're viewing the demo components instead of an actual extraction
314 |
315 |
316 | Try extracting a different website or check the ZIP file for the complete website clone.
317 |
318 |
319 |
320 |
321 | {% endif %}
322 |
323 |
324 |
325 |
326 |
340 |
341 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Website Extractor - Pixel Perfect Clone
7 |
8 |
28 |
29 |
30 |
31 |
32 |
33 |
Website Extractor
34 |
35 |
Create pixel-perfect clones of any website
36 |
37 |
38 |
62 |
63 |
64 |
65 |
68 |
69 |
Extracting website assets...
70 |
73 |
Initializing...
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
86 |
Website extracted successfully!
87 |
88 |
89 |
Your download should start automatically. If not, click the button below:
90 |
91 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
Enhanced Extraction Features
105 |
106 | -
107 |
110 | Complete HTML DOM structure
111 |
112 | -
113 |
116 | All CSS stylesheets (external and inline)
117 |
118 | -
119 |
122 | globals.css and styling files
123 | NEW
124 |
125 | -
126 |
129 | JavaScript files and functionality
130 |
131 | -
132 |
135 | Next.js configuration and files
136 | NEW
137 |
138 | -
139 |
142 | Image configurations and assets
143 | NEW
144 |
145 | -
146 |
149 | Metadata extraction
150 | NEW
151 |
152 | -
153 |
156 | Mobile-specific components
157 | NEW
158 |
159 | -
160 |
163 | SVG graphics (both linked and inline)
164 | NEW
165 |
166 | -
167 |
170 | Video files and player components
171 | NEW
172 |
173 | -
174 |
177 | Audio files and player components
178 | NEW
179 |
180 | -
181 |
184 | Font files and font family detection
185 | NEW
186 |
187 | -
188 |
191 | GIF animations and dynamic content
192 | NEW
193 |
194 | -
195 |
198 | Screenshots of website and components
199 | PREMIUM
200 |
201 | -
202 |
205 | JavaScript-rendered content capture
206 | PREMIUM
207 |
208 |
209 |
210 |
211 |
212 |
Extracted UI Components
213 |
The tool automatically identifies and extracts key UI components for easy reuse:
214 |
215 |
216 |
217 |
218 |
221 |
Navigation
222 |
223 |
Headers, menus, and navigation bars with responsive design
224 |
225 |
226 |
227 |
228 |
231 |
Hero Sections
232 |
233 |
Eye-catching hero banners with images, text overlays, and call-to-action buttons
234 |
235 |
236 |
237 |
238 |
241 |
Store Pages
242 |
NEW
243 |
244 |
Complete store layouts with product listings, filters, and shopping functionality
245 |
246 |
247 |
248 |
249 |
252 |
Mobile Menus
253 |
NEW
254 |
255 |
Mobile-specific navigation components and responsive design elements
256 |
257 |
258 |
259 |
260 |
263 |
Product Grids
264 |
265 |
Product listings and card grids with images, pricing, and descriptions
266 |
267 |
268 |
269 |
270 |
273 |
Shopping Cart
274 |
NEW
275 |
276 |
Cart components with item listings, quantity controls, and checkout buttons
277 |
278 |
279 |
280 |
281 |
284 |
Carousels & Sliders
285 |
NEW
286 |
287 |
Image sliders, product carousels, and testimonial rotators with controls
288 |
289 |
290 |
291 |
292 |
295 |
Video Players
296 |
NEW
297 |
298 |
Custom video player components with controls and responsive design
299 |
300 |
301 |
302 |
303 |
306 |
Audio Players
307 |
NEW
308 |
309 |
Audio playback components with controls and playlist functionality
310 |
311 |
312 |
313 |
314 |
317 |
Tab Components
318 |
NEW
319 |
320 |
Tabbed interfaces with content panels and interactive navigation
321 |
322 |
323 |
324 |
325 |
328 |
Social Media
329 |
NEW
330 |
331 |
Social media links, sharing buttons, and embedded social feeds
332 |
333 |
334 |
335 |
336 |
339 |
Modals & Popups
340 |
NEW
341 |
342 |
Modals, popup dialogs, and overlay components with animations
343 |
344 |
345 |
346 |
347 |
Components Included in ZIP File
348 |
All extracted UI components are included in the downloaded ZIP file for easy access and reuse.
349 |
350 |
351 |
352 |
353 |
Framework Support
354 |
355 |
356 |
357 |
360 |
Next.js
361 |
NEW
362 |
363 |
364 | - next.config.js extraction
365 | - _app.js and _document.js
366 | - Static and dynamic routes
367 |
368 |
369 |
370 |
371 |
372 |
375 |
Tailwind CSS
376 |
NEW
377 |
378 |
379 | - tailwind.config.js extraction
380 | - Custom theme settings
381 | - Plugin configurations
382 |
383 |
384 |
385 |
386 |
387 |
390 |
React
391 |
NEW
392 |
393 |
394 | - Component structure detection
395 | - React-specific attributes
396 | - State management patterns
397 |
398 |
399 |
400 |
401 |
402 |
405 |
Vue.js
406 |
NEW
407 |
408 |
409 | - Vue component detection
410 | - vue.config.js extraction
411 | - Vue directives parsing
412 |
413 |
414 |
415 |
416 |
417 |
420 |
Angular
421 |
NEW
422 |
423 |
424 | - Angular component structure
425 | - angular.json configuration
426 | - Module detection
427 |
428 |
429 |
430 |
431 |
432 |
435 |
Bootstrap
436 |
NEW
437 |
438 |
439 | - Bootstrap component classes
440 | - Grid system extraction
441 | - Custom Bootstrap themes
442 |
443 |
444 |
445 |
446 |
447 |
450 |
SCSS/SASS
451 |
NEW
452 |
453 |
454 | - Variable definitions
455 | - Mixin extraction
456 | - Nested styles
457 |
458 |
459 |
460 |
461 |
462 |
465 |
Svelte
466 |
NEW
467 |
468 |
469 | - Svelte component format
470 | - Reactive declarations
471 | - Template structure
472 |
473 |
474 |
475 |
476 |
477 |
480 |
Material UI
481 |
NEW
482 |
483 |
484 | - Material component classes
485 | - Theme configuration
486 | - Material icons
487 |
488 |
489 |
490 |
491 |
492 |
493 |
How to Use the Clone
494 |
495 | - Extract the downloaded ZIP file
496 | - Open the
index.html file in your browser to view the static clone
497 | - Find extracted components in the
components folder
498 | - Review metadata in
metadata.json
499 | - Check
css/globals.css for global styling
500 | - Edit the HTML, CSS, and JavaScript files in Cursor to customize the design
501 | - Use the included
manifest.json file to locate specific assets
502 |
503 |
504 |
Pro Tip:
505 |
For the most accurate results, try cloning the desktop version of websites. Some sites may have anti-scraping measures that could affect the results.
506 |
507 |
508 |
509 |
510 |
511 |
804 |
805 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, request, send_file, jsonify, session, after_this_request
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import os
5 | import re
6 | import json
7 | from urllib.parse import urljoin, urlparse, urlunparse, unquote, quote, parse_qs
8 | import zipfile
9 | from io import BytesIO
10 | import mimetypes
11 | import base64
12 | import cssutils
13 | import logging
14 | import uuid
15 | import random
16 | import time
17 | import urllib3
18 | import tempfile
19 | from datetime import datetime
20 | import traceback
21 | import html
22 | import shutil
23 | import threading
24 |
25 | # Try to import Selenium
26 | SELENIUM_AVAILABLE = False
27 | try:
28 | from selenium import webdriver
29 | from selenium.webdriver.chrome.options import Options
30 | from selenium.webdriver.common.by import By
31 | from selenium.webdriver.support.ui import WebDriverWait
32 | from selenium.webdriver.support import expected_conditions as EC
33 | from selenium.common.exceptions import TimeoutException, WebDriverException
34 | from selenium.webdriver.chrome.service import Service
35 | from webdriver_manager.chrome import ChromeDriverManager
36 | SELENIUM_AVAILABLE = True
37 | print("Selenium is available. Advanced rendering is enabled.")
38 | except ImportError:
39 | SELENIUM_AVAILABLE = False
40 | print("Selenium not available. Advanced rendering will be disabled.")
41 |
42 | # Suppress cssutils warnings
43 | cssutils.log.setLevel(logging.CRITICAL)
44 |
45 | app = Flask(__name__)
46 | app.secret_key = os.environ.get('SECRET_KEY', 'dev_key_for_website_extractor')
47 |
48 | def is_binary_content(content, asset_type):
49 | """Determine if content should be treated as binary or text based on asset type and content inspection"""
50 | # First check by asset type
51 | if asset_type in ['images', 'fonts', 'videos', 'audio']:
52 | return True
53 |
54 | # For potentially text-based assets, try to detect if it's binary
55 | if asset_type in ['css', 'js', 'html', 'svg', 'json', 'globals_css']:
56 | # Check if the content is bytes
57 | if not isinstance(content, bytes):
58 | return False
59 |
60 | # Try to detect if binary by checking for null bytes and high concentration of non-ASCII chars
61 | try:
62 | # Check for null bytes which indicate binary content
63 | if b'\x00' in content:
64 | return True
65 |
66 | # Sample the first 1024 bytes to determine if it's binary
67 | sample = content[:1024]
68 | text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
69 | return bool(sample.translate(None, text_chars))
70 | except:
71 | # If there's any error in detection, treat as binary to be safe
72 | return True
73 |
74 | # For anything else, just check if it's bytes
75 | return isinstance(content, bytes)
76 |
77 | def download_asset(url, base_url, headers=None, session_obj=None):
78 | """
79 | Download an asset from a URL
80 |
81 | Args:
82 | url: URL to download from
83 | base_url: Base URL of the website (for referrer)
84 | headers: Optional custom headers
85 | session_obj: Optional requests.Session object for maintaining cookies
86 |
87 | Returns:
88 | Content of the asset or None if download failed
89 | """
90 | # List of user agents to rotate through to avoid detection
91 | user_agents = [
92 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
93 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
94 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
95 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
96 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
97 | 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1'
98 | ]
99 |
100 | # Use a random user agent
101 | random_user_agent = random.choice(user_agents)
102 |
103 | if not headers:
104 | headers = {
105 | 'User-Agent': random_user_agent,
106 | 'Accept': '*/*',
107 | 'Accept-Language': 'en-US,en;q=0.9',
108 | 'Accept-Encoding': 'gzip, deflate, br',
109 | 'Connection': 'keep-alive',
110 | 'Referer': base_url,
111 | 'Sec-Fetch-Dest': 'empty',
112 | 'Sec-Fetch-Mode': 'cors',
113 | 'Sec-Fetch-Site': 'same-origin',
114 | 'Pragma': 'no-cache',
115 | 'Cache-Control': 'no-cache',
116 | }
117 | else:
118 | # Update the user agent in the provided headers
119 | headers['User-Agent'] = random_user_agent
120 |
121 | # Parse the URL to check if it's valid
122 | try:
123 | parsed_url = urlparse(url)
124 | if not parsed_url.scheme or not parsed_url.netloc:
125 | print(f"Invalid URL: {url}")
126 | return None
127 | except Exception as e:
128 | print(f"Error parsing URL {url}: {str(e)}")
129 | return None
130 |
131 | # Add a delay to avoid rate limiting
132 | time.sleep(0.1) # 100ms delay between requests
133 |
134 | # Maximum number of retries
135 | max_retries = 3
136 | retry_count = 0
137 |
138 | while retry_count < max_retries:
139 | try:
140 | # Use session if provided, otherwise make a direct request
141 | if session_obj:
142 | response = session_obj.get(
143 | url,
144 | timeout=15,
145 | headers=headers,
146 | stream=True,
147 | allow_redirects=True,
148 | verify=False # Ignore SSL certificate errors
149 | )
150 | else:
151 | response = requests.get(
152 | url,
153 | timeout=15,
154 | headers=headers,
155 | stream=True,
156 | allow_redirects=True,
157 | verify=False # Ignore SSL certificate errors
158 | )
159 |
160 | # Handle redirects
161 | if response.history:
162 | print(f"Request for {url} was redirected {len(response.history)} times to {response.url}")
163 | url = response.url # Update URL to the final destination
164 |
165 | if response.status_code == 200:
166 | # Check the Content-Type header
167 | content_type = response.headers.get('Content-Type', '')
168 | print(f"Downloaded {url} ({len(response.content)} bytes, type: {content_type})")
169 |
170 | # Check for binary content types
171 | is_binary = any(binary_type in content_type.lower() for binary_type in [
172 | 'image/', 'video/', 'audio/', 'font/', 'application/octet-stream',
173 | 'application/zip', 'application/x-rar', 'application/pdf', 'application/vnd.'
174 | ])
175 |
176 | # If binary or content-type suggests binary, return raw content
177 | if is_binary:
178 | return response.content
179 |
180 | # For text content types
181 | is_text = any(text_type in content_type.lower() for text_type in [
182 | 'text/', 'application/json', 'application/javascript', 'application/xml', 'application/xhtml'
183 | ])
184 |
185 | if is_text:
186 | # Try to determine encoding
187 | encoding = None
188 |
189 | # From Content-Type header
190 | if 'charset=' in content_type:
191 | encoding = content_type.split('charset=')[1].split(';')[0].strip()
192 |
193 | # From response encoding or apparent encoding
194 | if not encoding:
195 | encoding = response.encoding or response.apparent_encoding or 'utf-8'
196 |
197 | # Decode with specified encoding
198 | try:
199 | return response.content.decode(encoding, errors='replace').encode('utf-8')
200 | except (UnicodeDecodeError, LookupError):
201 | # If decoding fails, try utf-8
202 | try:
203 | return response.content.decode('utf-8', errors='replace').encode('utf-8')
204 | except:
205 | # If all else fails, return raw content
206 | return response.content
207 |
208 | # For unknown content types, return raw content
209 | return response.content
210 | elif response.status_code == 404:
211 | print(f"Resource not found (404): {url}")
212 | return None
213 | elif response.status_code == 403:
214 | print(f"Access forbidden (403): {url}")
215 | # Try with a different user agent on the next retry
216 | headers['User-Agent'] = random.choice(user_agents)
217 | retry_count += 1
218 | time.sleep(1) # Wait longer before retrying
219 | continue
220 | elif response.status_code >= 500:
221 | print(f"Server error ({response.status_code}): {url}")
222 | retry_count += 1
223 | time.sleep(1) # Wait longer before retrying
224 | continue
225 | else:
226 | print(f"HTTP error ({response.status_code}): {url}")
227 | return None
228 |
229 | except requests.exceptions.Timeout:
230 | print(f"Timeout error downloading {url}")
231 | retry_count += 1
232 | time.sleep(1)
233 | continue
234 | except requests.exceptions.ConnectionError:
235 | print(f"Connection error downloading {url}")
236 | retry_count += 1
237 | time.sleep(1)
238 | continue
239 | except requests.exceptions.TooManyRedirects:
240 | print(f"Too many redirects for {url}")
241 | return None
242 | except Exception as e:
243 | print(f"Error downloading {url}: {str(e)}")
244 | return None
245 |
246 | if retry_count == max_retries:
247 | print(f"Max retries reached for {url}")
248 |
249 | return None
250 |
251 | def get_asset_type(url):
252 | """Determine the type of asset from the URL"""
253 | # Handle empty or None URLs
254 | if not url:
255 | return 'other'
256 |
257 | url_lower = url.lower()
258 |
259 | # Framework-specific patterns
260 | if '_next/static' in url_lower:
261 | if '.css' in url_lower or 'styles' in url_lower:
262 | return 'css'
263 | return 'js' # Default to JS for Next.js assets
264 |
265 | if 'chunk.' in url_lower or 'webpack' in url_lower:
266 | return 'js' # Webpack chunks
267 |
268 | if 'angular' in url_lower and '.js' in url_lower:
269 | return 'js' # Angular bundles
270 |
271 | # Handle CSS files
272 | if url_lower.endswith(('.css', '.scss', '.less', '.sass')):
273 | return 'css'
274 | if 'global.css' in url_lower or 'globals.css' in url_lower or 'tailwind' in url_lower:
275 | return 'css'
276 | if 'fonts.googleapis.com' in url_lower:
277 | return 'css'
278 | if 'styles' in url_lower and '.css' in url_lower:
279 | return 'css'
280 |
281 | # Handle JS files
282 | if url_lower.endswith(('.js', '.jsx', '.mjs', '.ts', '.tsx', '.cjs')):
283 | return 'js'
284 | if 'bundle.js' in url_lower or 'main.js' in url_lower or 'app.js' in url_lower:
285 | return 'js'
286 | if 'polyfill' in url_lower or 'runtime' in url_lower or 'vendor' in url_lower:
287 | return 'js'
288 | if 'image-config' in url_lower or 'image.config' in url_lower:
289 | return 'js'
290 |
291 | # Handle image files
292 | if url_lower.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.avif', '.bmp', '.ico')):
293 | return 'img'
294 | if '/images/' in url_lower or '/img/' in url_lower or '/assets/images/' in url_lower:
295 | return 'img'
296 |
297 | # Handle font files
298 | if url_lower.endswith(('.woff', '.woff2', '.ttf', '.otf', '.eot')):
299 | return 'fonts'
300 | if '/fonts/' in url_lower or 'font-awesome' in url_lower:
301 | return 'fonts'
302 |
303 | # Handle media files
304 | if url_lower.endswith(('.mp4', '.webm', '.ogg', '.avi', '.mov', '.flv')):
305 | return 'videos'
306 | if url_lower.endswith(('.mp3', '.wav', '.ogg', '.aac')):
307 | return 'audio'
308 |
309 | # Handle favicon
310 | if url_lower.endswith(('.ico', '.icon')):
311 | return 'favicons'
312 | if 'favicon' in url_lower:
313 | return 'favicons'
314 |
315 | # Handle special API endpoints
316 | if 'graphql' in url_lower or 'api.' in url_lower:
317 | return 'js'
318 |
319 | # Try to guess based on URL structure
320 | if '/css/' in url_lower:
321 | return 'css'
322 | if '/js/' in url_lower or '/scripts/' in url_lower:
323 | return 'js'
324 | if '/static/' in url_lower and not any(ext in url_lower for ext in ['.css', '.js', '.png', '.jpg']):
325 | # For static assets with unclear type, check the URL itself
326 | if 'style' in url_lower:
327 | return 'css'
328 | return 'js' # Default for static assets
329 |
330 | # For CDN resources, try to determine type from the host
331 | cdn_hosts = ['cdn.jsdelivr.net', 'unpkg.com', 'cdnjs.cloudflare.com']
332 | for host in cdn_hosts:
333 | if host in url_lower:
334 | if any(lib in url_lower for lib in ['react', 'angular', 'vue', 'jquery']):
335 | return 'js'
336 | if any(lib in url_lower for lib in ['bootstrap', 'tailwind', 'material', 'font']):
337 | return 'css'
338 |
339 | # Default to JS for unknown extensions
340 | return 'js'
341 |
342 | def extract_metadata(soup, base_url):
343 | """Extract metadata from the HTML"""
344 | metadata = {
345 | 'title': '',
346 | 'description': '',
347 | 'keywords': '',
348 | 'og_tags': {},
349 | 'twitter_cards': {},
350 | 'canonical': '',
351 | 'language': '',
352 | 'favicon': '',
353 | 'structured_data': []
354 | }
355 |
356 | # Extract title
357 | title_tag = soup.find('title')
358 | if title_tag and title_tag.string:
359 | metadata['title'] = title_tag.string.strip()
360 |
361 | # Extract meta tags
362 | meta_tags = soup.find_all('meta')
363 | for tag in meta_tags:
364 | # Description
365 | if tag.get('name') == 'description' and tag.get('content'):
366 | metadata['description'] = tag.get('content').strip()
367 |
368 | # Keywords
369 | elif tag.get('name') == 'keywords' and tag.get('content'):
370 | metadata['keywords'] = tag.get('content').strip()
371 |
372 | # OpenGraph tags
373 | elif tag.get('property') and tag.get('property').startswith('og:') and tag.get('content'):
374 | prop = tag.get('property')[3:] # Remove 'og:' prefix
375 | metadata['og_tags'][prop] = tag.get('content').strip()
376 |
377 | # Twitter card tags
378 | elif tag.get('name') and tag.get('name').startswith('twitter:') and tag.get('content'):
379 | prop = tag.get('name')[8:] # Remove 'twitter:' prefix
380 | metadata['twitter_cards'][prop] = tag.get('content').strip()
381 |
382 | # Extract canonical URL
383 | canonical_tag = soup.find('link', {'rel': 'canonical'})
384 | if canonical_tag and canonical_tag.get('href'):
385 | canonical_url = canonical_tag.get('href')
386 | if not canonical_url.startswith(('http://', 'https://')):
387 | canonical_url = urljoin(base_url, canonical_url)
388 | metadata['canonical'] = canonical_url
389 |
390 | # Extract language
391 | html_tag = soup.find('html')
392 | if html_tag and html_tag.get('lang'):
393 | metadata['language'] = html_tag.get('lang')
394 |
395 | # Extract favicon
396 | favicon_tag = soup.find('link', {'rel': 'icon'}) or soup.find('link', {'rel': 'shortcut icon'})
397 | if favicon_tag and favicon_tag.get('href'):
398 | favicon_url = favicon_tag.get('href')
399 | if not favicon_url.startswith(('http://', 'https://')):
400 | favicon_url = urljoin(base_url, favicon_url)
401 | metadata['favicon'] = favicon_url
402 |
403 | # Extract structured data (JSON-LD)
404 | script_tags = soup.find_all('script', {'type': 'application/ld+json'})
405 | for tag in script_tags:
406 | if tag.string:
407 | try:
408 | json_data = json.loads(tag.string)
409 | metadata['structured_data'].append(json_data)
410 | except json.JSONDecodeError:
411 | pass
412 |
413 | return metadata
414 |
415 | def get_component_type(element):
416 | """Determine the type of UI component based on element attributes and classes"""
417 | if not element:
418 | return None
419 |
420 | # Get tag name, classes, and ID
421 | tag_name = element.name
422 | class_list = element.get('class', [])
423 | if class_list and not isinstance(class_list, list):
424 | class_list = [class_list]
425 | class_str = ' '.join(class_list).lower() if class_list else ''
426 | element_id = element.get('id', '').lower()
427 |
428 | # Get element role
429 | role = element.get('role', '').lower()
430 |
431 | # Navigation components
432 | if tag_name == 'nav' or role == 'navigation' or 'nav' in class_str or 'navigation' in class_str or 'menu' in class_str or element_id in ['nav', 'navigation', 'menu']:
433 | return 'navigation'
434 |
435 | # Header components
436 | if tag_name == 'header' or role == 'banner' or 'header' in class_str or 'banner' in class_str or element_id in ['header', 'banner']:
437 | return 'header'
438 |
439 | # Footer components
440 | if tag_name == 'footer' or role == 'contentinfo' or 'footer' in class_str or element_id == 'footer':
441 | return 'footer'
442 |
443 | # Hero/banner components
444 | if 'hero' in class_str or 'banner' in class_str or 'jumbotron' in class_str or 'showcase' in class_str or element_id in ['hero', 'banner', 'jumbotron', 'showcase']:
445 | return 'hero'
446 |
447 | # Card components
448 | if 'card' in class_str or 'tile' in class_str or 'item' in class_str or element_id in ['card', 'tile']:
449 | return 'card'
450 |
451 | # Form components
452 | if tag_name == 'form' or role == 'form' or 'form' in class_str or element_id == 'form':
453 | return 'form'
454 |
455 | # CTA (Call to Action) components
456 | if 'cta' in class_str or 'call-to-action' in class_str or 'action' in class_str or element_id in ['cta', 'call-to-action']:
457 | return 'cta'
458 |
459 | # Sidebar components
460 | if 'sidebar' in class_str or 'side-bar' in class_str or element_id in ['sidebar', 'side-bar']:
461 | return 'sidebar'
462 |
463 | # Modal/Dialog components
464 | if role == 'dialog' or 'modal' in class_str or 'dialog' in class_str or 'popup' in class_str or element_id in ['modal', 'dialog', 'popup']:
465 | return 'modal'
466 |
467 | # Section components
468 | if tag_name == 'section' or role == 'region' or 'section' in class_str:
469 | return 'section'
470 |
471 | # Mobile components
472 | if 'mobile' in class_str or 'smartphone' in class_str or 'mobile-only' in class_str:
473 | return 'mobile'
474 |
475 | # Store/Product components
476 | if 'product' in class_str or 'store' in class_str or 'shop' in class_str or 'pricing' in class_str:
477 | return 'store'
478 |
479 | # Cart components
480 | if 'cart' in class_str or 'basket' in class_str or 'shopping-cart' in class_str or element_id in ['cart', 'basket', 'shopping-cart']:
481 | return 'cart'
482 |
483 | # If no specific type is identified, check if the element is a major container
484 | if tag_name in ['div', 'section', 'article'] and ('container' in class_str or 'wrapper' in class_str or 'content' in class_str):
485 | return 'container'
486 |
487 | # Default to unknown if no specific type is identified
488 | return 'other'
489 |
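# Illustrative examples for the classifier above (checks run top-down, so an element that
# matches several rules gets the first label; the snippets are made up for demonstration):
#   get_component_type(BeautifulSoup('<nav class="main-nav"></nav>', 'html.parser').nav)  -> 'navigation'
#   get_component_type(BeautifulSoup('<div class="hero"></div>', 'html.parser').div)      -> 'hero'
#   get_component_type(BeautifulSoup('<div class="wrapper"></div>', 'html.parser').div)   -> 'container'
# Anything that matches no rule falls through to 'other'.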
490 | def extract_component_structure(soup):
491 | """Extract UI components from the HTML structure"""
492 | if not soup:
493 | return {}
494 |
495 | components = {
496 | 'navigation': [],
497 | 'header': [],
498 | 'footer': [],
499 | 'hero': [],
500 | 'card': [],
501 | 'form': [],
502 | 'cta': [],
503 | 'sidebar': [],
504 | 'modal': [],
505 | 'section': [],
506 | 'store': [],
507 | 'mobile': [],
508 | 'cart': []
509 | }
510 |
511 | # Helper function to convert element to HTML string
512 | def element_to_html(element):
513 | return str(element)
514 |
515 | # Extract navigation components
516 | nav_elements = soup.find_all(['nav']) + soup.find_all(role='navigation') + soup.find_all(class_=lambda c: c and ('nav' in c.lower() or 'menu' in c.lower()))
517 | for element in nav_elements[:5]: # Limit to 5 to avoid excessive extraction
518 | components['navigation'].append({
519 | 'html': element_to_html(element)
520 | })
521 |
522 | # Extract header components
523 | header_elements = soup.find_all(['header']) + soup.find_all(role='banner') + soup.find_all(class_=lambda c: c and 'header' in c.lower())
524 | for element in header_elements[:2]: # Usually only 1-2 headers per page
525 | components['header'].append({
526 | 'html': element_to_html(element)
527 | })
528 |
529 | # Extract footer components
530 | footer_elements = soup.find_all(['footer']) + soup.find_all(role='contentinfo') + soup.find_all(class_=lambda c: c and 'footer' in c.lower())
531 | for element in footer_elements[:2]: # Usually only 1-2 footers per page
532 | components['footer'].append({
533 | 'html': element_to_html(element)
534 | })
535 |
536 | # Extract hero/banner components
537 | hero_elements = soup.find_all(class_=lambda c: c and ('hero' in c.lower() or 'banner' in c.lower() or 'jumbotron' in c.lower()))
538 | for element in hero_elements[:3]: # Limit to 3
539 | components['hero'].append({
540 | 'html': element_to_html(element)
541 | })
542 |
543 | # Extract card components - often these are repeated elements
544 | card_elements = soup.find_all(class_=lambda c: c and ('card' in c.lower() or 'tile' in c.lower()))
545 |
546 | # If we find many cards, just keep one of each unique structure
547 | unique_cards = {}
548 | for element in card_elements[:15]: # Examine up to 15 cards
549 | # Use a simplified structure hash to identify similar cards
550 | structure_hash = str(len(element.find_all())) # Number of child elements
551 | if structure_hash not in unique_cards:
552 | unique_cards[structure_hash] = element
553 |
554 | # Add unique cards to components
555 | for idx, element in enumerate(unique_cards.values()):
556 | if idx >= 5: # Limit to 5 unique cards
557 | break
558 | components['card'].append({
559 | 'html': element_to_html(element)
560 | })
561 |
562 | # Extract form components
563 | form_elements = soup.find_all(['form']) + soup.find_all(class_=lambda c: c and 'form' in c.lower())
564 | for element in form_elements[:3]: # Limit to 3
565 | components['form'].append({
566 | 'html': element_to_html(element)
567 | })
568 |
569 | # Extract CTA components
570 | cta_elements = soup.find_all(class_=lambda c: c and ('cta' in c.lower() or 'call-to-action' in c.lower()))
571 | for element in cta_elements[:3]: # Limit to 3
572 | components['cta'].append({
573 | 'html': element_to_html(element)
574 | })
575 |
576 | # Extract sidebar components
577 | sidebar_elements = soup.find_all(class_=lambda c: c and ('sidebar' in c.lower() or 'side-bar' in c.lower()))
578 | for element in sidebar_elements[:2]: # Limit to 2
579 | components['sidebar'].append({
580 | 'html': element_to_html(element)
581 | })
582 |
583 | # Extract modal/dialog components
584 | modal_elements = soup.find_all(role='dialog') + soup.find_all(class_=lambda c: c and ('modal' in c.lower() or 'dialog' in c.lower() or 'popup' in c.lower()))
585 | for element in modal_elements[:3]: # Limit to 3
586 | components['modal'].append({
587 | 'html': element_to_html(element)
588 | })
589 |
590 | # Extract section components
591 | section_elements = soup.find_all(['section']) + soup.find_all(role='region')
592 | # Filter to get only substantial sections
593 | substantial_sections = [element for element in section_elements if len(element.find_all()) > 3]  # Must have more than 3 child elements
594 | for element in substantial_sections[:5]: # Limit to 5
595 | components['section'].append({
596 | 'html': element_to_html(element)
597 | })
598 |
599 | # Extract mobile-specific components
600 | mobile_elements = soup.find_all(class_=lambda c: c and ('mobile' in c.lower() or 'smartphone' in c.lower() or 'mobile-only' in c.lower()))
601 | for element in mobile_elements[:3]: # Limit to 3
602 | components['mobile'].append({
603 | 'html': element_to_html(element)
604 | })
605 |
606 | # Extract store/product components
607 | store_elements = soup.find_all(class_=lambda c: c and ('product' in c.lower() or 'store' in c.lower() or 'shop' in c.lower() or 'pricing' in c.lower()))
608 | for element in store_elements[:5]: # Limit to 5
609 | components['store'].append({
610 | 'html': element_to_html(element)
611 | })
612 |
613 | # Extract cart components
614 | cart_elements = soup.find_all(class_=lambda c: c and ('cart' in c.lower() or 'basket' in c.lower() or 'shopping-cart' in c.lower()))
615 | for element in cart_elements[:2]: # Limit to 2
616 | components['cart'].append({
617 | 'html': element_to_html(element)
618 | })
619 |
620 | # Remove empty component types
621 | return {k: v for k, v in components.items() if v}
622 |
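# Shape of the value returned above (illustrative): only non-empty buckets survive the
# final filter, and every bucket is capped (5 navs, 2 headers, 2 footers, 3 heroes, ...)
# to keep the output small, e.g.
#   {'navigation': [{'html': '<nav class="main-nav">...</nav>'}],
#    'card': [{'html': '<div class="card">...</div>'}, ...]}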
623 | def extract_inline_styles(soup):
624 | """Extract all inline styles from the HTML"""
625 | inline_styles = {}
626 | elements_with_style = soup.select('[style]')
627 |
628 | for i, element in enumerate(elements_with_style):
629 | style_content = element.get('style')
630 | if style_content:
631 | class_name = f'extracted-inline-style-{i}'
632 | inline_styles[class_name] = style_content
633 | # Add the class to the element
634 | element['class'] = element.get('class', []) + [class_name]
635 | # Remove the inline style
636 | del element['style']
637 |
638 | return inline_styles
639 |
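# Illustrative before/after for extract_inline_styles (note that it mutates the soup in place):
#   <div style="color: red">Hi</div>   becomes   <div class="extracted-inline-style-0">Hi</div>
# and the returned dict maps 'extracted-inline-style-0' to 'color: red', which a caller
# could serialize into a stylesheet.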
640 | def extract_inline_javascript(soup):
641 | """Extract inline JavaScript from HTML content"""
642 | inline_js = []
643 | # Find all script tags without src attribute (inline scripts)
644 | for script in soup.find_all('script'):
645 | if not script.get('src') and script.string:
646 | inline_js.append(script.string.strip())
647 |
648 | if inline_js:
649 | return '\n\n/* --- INLINE SCRIPTS --- */\n\n'.join(inline_js)
650 | return ""
651 |
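# Illustrative result for extract_inline_javascript: a page with two inline <script>
# blocks yields one string with both bodies joined by the '/* --- INLINE SCRIPTS --- */'
# separator; a page with no inline scripts yields ''.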
652 | def extract_assets(html_content, base_url, session_obj=None, headers=None):
653 | """Extract all assets from HTML content"""
654 | assets = {
655 | 'css': [],
656 | 'js': [],
657 | 'img': [],
658 | 'fonts': [],
659 | 'videos': [],
660 | 'audio': [],
661 | 'favicons': [],
662 | 'font_families': set(),
663 | 'metadata': {},
664 | 'components': {}
665 | }
666 |
667 | if not html_content:
668 | print("Warning: Empty HTML content provided to extract_assets")
669 | return assets
670 |
671 | try:
672 | # Create BeautifulSoup object
673 | soup = BeautifulSoup(html_content, 'html.parser')
674 |
675 | if not soup or not soup.html:
676 | print("Warning: Could not parse HTML content properly")
677 | # Try with a more lenient parser
678 | soup = BeautifulSoup(html_content, 'html5lib')
679 | if not soup or not soup.html:
680 | print("Error: Failed to parse HTML with both parsers")
681 | return assets
682 |
683 | # Extract metadata
684 | try:
685 | assets['metadata'] = extract_metadata(soup, base_url)
686 | except Exception as e:
687 | print(f"Error extracting metadata: {str(e)}")
688 | traceback.print_exc()
689 |
690 | # Extract all CSS files
691 | try:
692 | css_links = soup.find_all('link', {'rel': 'stylesheet'}) or []
693 | # Also look for preload links with as="style"
694 | preload_css = soup.find_all('link', {'rel': 'preload', 'as': 'style'}) or []
695 |
696 | for link in css_links + preload_css:
697 | href = link.get('href')
698 | if href:
699 | if not href.startswith(('http://', 'https://', 'data:')):
700 | href = urljoin(base_url, href)
701 | if href.startswith(('http://', 'https://')):
702 | assets['css'].append(href)
703 | except Exception as e:
704 | print(f"Error extracting CSS links: {str(e)}")
705 |
706 | # Look for Next.js specific CSS files
707 | try:
708 | next_css = soup.find_all('link', {'data-n-g': True}) or []
709 | next_css += soup.find_all('link', {'data-n-p': True}) or []
710 | for link in next_css:
711 | href = link.get('href')
712 | if href:
713 | if not href.startswith(('http://', 'https://', 'data:')):
714 | href = urljoin(base_url, href)
715 | if href.startswith(('http://', 'https://')):
716 | assets['css'].append(href)
717 | except Exception as e:
718 | print(f"Error extracting Next.js CSS: {str(e)}")
719 |
720 | # Extract all inline styles and check for CSS imports or fonts
721 | try:
722 | style_tags = soup.find_all('style') or []
723 | for style in style_tags:
724 | style_content = style.string
725 | if style_content:
726 | # Extract @import statements
727 | import_urls = re.findall(r'@import\s+[\'"]([^\'"]+)[\'"]', style_content) or []
728 | import_urls += re.findall(r'@import\s+url\([\'"]?([^\'"|\)]+)[\'"]?\)', style_content) or []
729 |
730 | for import_url in import_urls:
731 | if not import_url.startswith(('http://', 'https://', 'data:')):
732 | import_url = urljoin(base_url, import_url)
733 | if import_url.startswith(('http://', 'https://')):
734 | assets['css'].append(import_url)
735 |
736 | # Extract font families
737 | font_families = re.findall(r'font-family:\s*[\'"]?([^\'";]+)[\'"]?', style_content) or []
738 | for family in font_families:
739 | family = family.strip().split(',')[0].strip('\'"`')
740 | if family and family.lower() not in ['serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'system-ui']:
741 | assets['font_families'].add(family)
742 | except Exception as e:
743 | print(f"Error extracting inline styles: {str(e)}")
744 |
745 | # Extract all JavaScript files
746 | try:
747 | script_tags = soup.find_all('script', {'src': True}) or []
748 | for script in script_tags:
749 | src = script.get('src')
750 | if src:
751 | if not src.startswith(('http://', 'https://', 'data:')):
752 | src = urljoin(base_url, src)
753 | if src.startswith(('http://', 'https://')):
754 | assets['js'].append(src)
755 |
756 | # Look for module scripts (common in modern frameworks)
757 | module_scripts = soup.find_all('script', {'type': 'module'}) or []
758 | for script in module_scripts:
759 | src = script.get('src')
760 | if src:
761 | if not src.startswith(('http://', 'https://', 'data:')):
762 | src = urljoin(base_url, src)
763 | if src.startswith(('http://', 'https://')):
764 | assets['js'].append(src)
765 | except Exception as e:
766 | print(f"Error extracting JavaScript: {str(e)}")
767 |
768 | # Extract all images
769 | try:
770 | # Regular img tags
771 | img_tags = soup.find_all('img') or []
772 | for img in img_tags:
773 | # Check src attribute
774 | src = img.get('src')
775 | if src:
776 | if not src.startswith(('http://', 'https://', 'data:')):
777 | src = urljoin(base_url, src)
778 | if src.startswith(('http://', 'https://')):
779 | assets['img'].append(src)
780 |
781 | # Check srcset attribute
782 | srcset = img.get('srcset')
783 | if srcset:
784 | for src_str in srcset.split(','):
785 | src_parts = src_str.strip().split(' ')
786 | if src_parts:
787 | src = src_parts[0]
788 | if not src.startswith(('http://', 'https://', 'data:')):
789 | src = urljoin(base_url, src)
790 | if src.startswith(('http://', 'https://')):
791 | assets['img'].append(src)
792 |
793 | # Check data-src (lazy loading)
794 | data_src = img.get('data-src')
795 | if data_src:
796 | if not data_src.startswith(('http://', 'https://', 'data:')):
797 | data_src = urljoin(base_url, data_src)
798 | if data_src.startswith(('http://', 'https://')):
799 | assets['img'].append(data_src)
800 |
801 | # Background images in style attributes
802 | elements_with_style = soup.select('[style]') or []
803 | for element in elements_with_style:
804 | style = element.get('style', '')
805 | if 'background' in style or 'background-image' in style:
806 | # Try to extract URLs
807 | bg_urls = re.findall(r'url\([\'"]?([^\'"|\)]+)[\'"]?\)', style)
808 | for bg_url in bg_urls:
809 | if not bg_url.startswith(('http://', 'https://', 'data:')):
810 | bg_url = urljoin(base_url, bg_url)
811 | if bg_url.startswith(('http://', 'https://')):
812 | assets['img'].append(bg_url)
813 | except Exception as e:
814 | print(f"Error extracting images: {str(e)}")
815 |
816 | # Extract favicon
817 | try:
818 | favicon_links = soup.find_all('link', {'rel': lambda r: r and (r.lower() == 'icon' or 'icon' in r.lower().split())}) or []
819 | for link in favicon_links:
820 | href = link.get('href')
821 | if href:
822 | if not href.startswith(('http://', 'https://', 'data:')):
823 | href = urljoin(base_url, href)
824 | if href.startswith(('http://', 'https://')):
825 | assets['favicons'].append(href)
826 | except Exception as e:
827 | print(f"Error extracting favicons: {str(e)}")
828 |
829 | # Extract all video sources
830 | try:
831 | video_tags = soup.find_all('video') or []
832 | for video in video_tags:
833 | # Check src attribute
834 | src = video.get('src')
835 | if src:
836 | if not src.startswith(('http://', 'https://', 'data:')):
837 | src = urljoin(base_url, src)
838 | if src.startswith(('http://', 'https://')):
839 | assets['videos'].append(src)
840 |
841 | # Check source tags inside video
842 | source_tags = video.find_all('source') or []
843 | for source in source_tags:
844 | src = source.get('src')
845 | if src:
846 | if not src.startswith(('http://', 'https://', 'data:')):
847 | src = urljoin(base_url, src)
848 | if src.startswith(('http://', 'https://')):
849 | assets['videos'].append(src)
850 | except Exception as e:
851 | print(f"Error extracting videos: {str(e)}")
852 |
853 | # Extract all audio sources
854 | try:
855 | audio_tags = soup.find_all('audio') or []
856 | for audio in audio_tags:
857 | # Check src attribute
858 | src = audio.get('src')
859 | if src:
860 | if not src.startswith(('http://', 'https://', 'data:')):
861 | src = urljoin(base_url, src)
862 | if src.startswith(('http://', 'https://')):
863 | assets['audio'].append(src)
864 |
865 | # Check source tags inside audio
866 | source_tags = audio.find_all('source') or []
867 | for source in source_tags:
868 | src = source.get('src')
869 | if src:
870 | if not src.startswith(('http://', 'https://', 'data:')):
871 | src = urljoin(base_url, src)
872 | if src.startswith(('http://', 'https://')):
873 | assets['audio'].append(src)
874 | except Exception as e:
875 | print(f"Error extracting audio: {str(e)}")
876 |
877 | # Extract all iframes
878 | try:
879 | iframe_tags = soup.find_all('iframe') or []
880 | for iframe in iframe_tags:
881 | src = iframe.get('src')
882 | if src and not src.startswith('data:'):
883 | if not src.startswith(('http://', 'https://')):
884 | src = urljoin(base_url, src)
885 | if src.startswith(('http://', 'https://')):
886 | if 'youtube' in src or 'vimeo' in src:
887 | assets['videos'].append(src)
888 | else:
889 | assets['js'].append(src) # Treat as JS resource
890 | except Exception as e:
891 | print(f"Error extracting iframes: {str(e)}")
892 |
893 | # Extract Next.js specific resources
894 | try:
895 | # Look for Next.js data scripts
896 | next_data = soup.find('script', {'id': '__NEXT_DATA__'})
897 | if next_data and next_data.string:
898 | try:
899 | next_json = json.loads(next_data.string)
900 | # Extract buildId
901 | if 'buildId' in next_json:
902 | build_id = next_json['buildId']
903 | # Add common Next.js resources with this buildId
904 | for path in ['main', 'webpack', 'framework', 'pages/_app', 'pages/_error', 'pages/index']:
905 | chunk_url = f"{base_url}/_next/static/{build_id}/pages/{path}.js"
906 | assets['js'].append(chunk_url)
907 |
908 | # Extract page data
909 | if 'page' in next_json and 'props' in next_json:
910 | # This often has valuable data we might want to preserve
911 | assets['metadata']['next_data'] = next_json
912 | except Exception as next_error:
913 | print(f"Error parsing Next.js data: {str(next_error)}")
914 |
915 | # Look for Webpack chunks in comments
916 | chunks_regex = r'/\*\s*webpackJsonp\s*\*/(.*?)/\*\s*end\s*webpackJsonp\s*\*/'
917 | chunks_matches = re.findall(chunks_regex, html_content, re.DOTALL)
918 | if chunks_matches:
919 | print("Found webpack chunks in comments")
920 | # These are JavaScript assets that might be dynamically loaded
921 | except Exception as e:
922 | print(f"Error extracting Next.js resources: {str(e)}")
923 |
924 | # Try to download CSS files and extract additional assets
925 | if session_obj and headers:
926 | try:
927 | css_urls = assets['css'].copy() # Copy to avoid modifying during iteration
928 | for css_url in css_urls:
929 | try:
930 | # Skip data URLs
931 | if css_url.startswith('data:'):
932 | continue
933 |
934 | # Download CSS file
935 | response = session_obj.get(
936 | css_url,
937 | timeout=10,
938 | headers=headers,
939 | verify=False # Ignore SSL certificate errors
940 | )
941 |
942 | if response.status_code == 200:
943 | css_content = response.text
944 |
945 | # Extract URLs from url() function
946 | url_matches = re.findall(r'url\([\'"]?([^\'"|\)]+)[\'"]?\)', css_content) or []
947 | for url in url_matches:
948 | if not url or url.startswith('data:'):
949 | continue
950 |
951 | if not url.startswith(('http://', 'https://')):
952 | # Resolve relative to the CSS file
953 | url = urljoin(css_url, url)
954 |
955 | # Determine asset type
956 | asset_type = get_asset_type(url)
957 | if asset_type in assets:
958 | assets[asset_type].append(url)
959 |
960 | # Extract font families
961 | font_families = re.findall(r'font-family:\s*[\'"]?([^\'";]+)[\'"]?', css_content) or []
962 | for family in font_families:
963 | family = family.strip().split(',')[0].strip('\'"`')
964 | if family and family.lower() not in ['serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'system-ui']:
965 | assets['font_families'].add(family)
966 |
967 | # Extract Google Fonts specifically
968 | google_fonts_imports = re.findall(r'@import\s+url\([\'"]?(https?://fonts\.googleapis\.com/[^\'"|\)]+)[\'"]?\)', css_content) or []
969 | for font_url in google_fonts_imports:
970 | if font_url not in assets['css']:
971 | assets['css'].append(font_url)
972 |
973 | # Check for Tailwind
974 | if 'tailwind' in css_content.lower() or '.tw-' in css_content:
975 | print("Detected Tailwind CSS in stylesheets")
976 | except Exception as css_error:
977 | print(f"Error processing CSS {css_url}: {str(css_error)}")
978 | except Exception as e:
979 | print(f"Error processing CSS files: {str(e)}")
980 |
981 | # Extract UI components
982 | try:
983 | components = extract_component_structure(soup)
984 | if components:
985 | assets['components'] = components
986 | except Exception as e:
987 | print(f"Error extracting components: {str(e)}")
988 | traceback.print_exc()
989 |
990 | # Remove duplicates while preserving order
991 | for asset_type in assets:
992 | if isinstance(assets[asset_type], list):
993 | # Use dict.fromkeys to remove duplicates while preserving order
994 | assets[asset_type] = list(dict.fromkeys(assets[asset_type]))
995 |
996 | return assets
997 |
998 | except Exception as e:
999 | print(f"Error in extract_assets: {str(e)}")
1000 | traceback.print_exc()
1001 | return assets
1002 |
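# Usage sketch for extract_assets (illustrative; the URL and headers are placeholders).
# Without a session it only parses the HTML; with session_obj and headers it also fetches
# each linked stylesheet and mines it for url(...) references and font families:
#   assets = extract_assets(html_text, 'https://example.com',
#                           session_obj=requests.Session(),
#                           headers={'User-Agent': 'Mozilla/5.0'})
#   assets['css'], assets['js'], assets['img']  -> de-duplicated absolute URLs
#   assets['font_families']                     -> set of detected font names
#   assets['metadata'], assets['components']    -> dicts built by the helpers above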
1003 | def create_zip_file(html_content, assets, url, session_obj, headers, screenshots=None):
1004 | """Create a zip file containing the extracted website data"""
1005 | # Create a temp file for the zip
1006 | temp_zip = tempfile.NamedTemporaryFile(delete=False, suffix='.zip')
1007 | temp_zip.close()
1008 |
1009 | # Extract domain for the folder name
1010 | parsed_url = urlparse(url)
1011 | domain = parsed_url.netloc
1012 |
1013 | # Current timestamp
1014 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1015 |
1016 | # Create the zip file
1017 | with zipfile.ZipFile(temp_zip.name, 'w', zipfile.ZIP_DEFLATED) as zipf:
1018 | # Write the main HTML
1019 | zipf.writestr('index.html', html_content)
1020 |
1021 | # Create directories for each asset type
1022 | for asset_type in assets.keys():
1023 | if asset_type in ['font_families', 'metadata', 'components']:
1024 | continue # Skip non-URL assets
1025 |
1026 | # Make sure the assets[asset_type] exists and is a list before iterating
1027 | if not assets[asset_type] or not isinstance(assets[asset_type], list):
1028 | print(f" Skipping {asset_type} - no assets found or invalid format")
1029 | continue
1030 |
1031 | # Create the directory
1032 | zipf.writestr(f'{asset_type}/.gitkeep', '')
1033 |
1034 | # Download each asset
1035 | processed_urls = set() # Track processed URLs to avoid duplicates
1036 |
1037 | for url in assets[asset_type]:
1038 | # Skip if the URL is None, empty, or a data URL
1039 | if not url or url.startswith('data:'):
1040 | continue
1041 |
1042 | # Skip if we've already processed this URL
1043 | if url in processed_urls:
1044 | continue
1045 |
1046 | processed_urls.add(url)
1047 |
1048 | try:
1049 | # Fix URL if it's relative
1050 | if url.startswith('//'):
1051 | url = 'https:' + url
1052 | elif url.startswith('/'):
1053 | parsed_base = urlparse(parsed_url.scheme + '://' + parsed_url.netloc)
1054 | url = urljoin(parsed_base.geturl(), url)
1055 |
1056 | # Extract filename from URL
1057 | path = urlparse(url).path
1058 | # Handle query parameters in the URL
1059 | query = urlparse(url).query
1060 | filename = os.path.basename(unquote(path))
1061 |
1062 | # Clean filename
1063 | if not filename:
1064 | filename = f"{timestamp}_{uuid.uuid4().hex[:8]}.{asset_type}"
1065 | elif '.' not in filename:
1066 | filename = f"{filename}.{asset_type}"
1067 |
1068 | # Add query parameters to filename to make it unique
1069 | if query:
1070 | clean_query = re.sub(r'[^a-zA-Z0-9]', '_', query)[:30] # Limit length
1071 | name, ext = os.path.splitext(filename)
1072 | filename = f"{name}_{clean_query}{ext}"
1073 |
1074 | # Avoid duplicate filenames with UUID
1075 | file_path = f"{asset_type}/{filename}"
1076 |
1077 | try:
1078 | # Download the file
1079 | response = session_obj.get(
1080 | url,
1081 | timeout=10,
1082 | headers=headers,
1083 | verify=False # Ignore SSL certificate errors
1084 | )
1085 |
1086 | if response.status_code == 200:
1087 | zipf.writestr(file_path, response.content)
1088 | print(f" Added {file_path}")
1089 | else:
1090 | print(f" Failed to download {url}, status: {response.status_code}")
1091 | except Exception as e:
1092 | print(f" Error downloading {url}: {str(e)}")
1093 | except Exception as e:
1094 | print(f" Error processing URL {url}: {str(e)}")
1095 |
1096 | # Handle font families
1097 | if 'font_families' in assets and assets['font_families']:
1098 | zipf.writestr('css/fonts.css', '\n'.join([
1099 | f"/* Font Family: {family} */\n"
1100 | f"@import url('https://fonts.googleapis.com/css2?family={family.replace(' ', '+')}&display=swap');\n"
1101 | for family in assets['font_families']
1102 | ]))
1103 |
1104 | # Handle metadata if present
1105 | if 'metadata' in assets and assets['metadata']:
1106 | metadata_content = json.dumps(assets['metadata'], indent=2)
1107 | zipf.writestr('metadata.json', metadata_content)
1108 |
1109 | # Handle UI components if present
1110 | if 'components' in assets and assets['components'] and isinstance(assets['components'], dict):
1111 | # Create components directory
1112 | zipf.writestr('components/.gitkeep', '')
1113 |
1114 | # Create index for components
1115 | component_html = """
1116 |
1117 |
1118 |
1119 |
1120 |
1121 | Extracted UI Components
1122 |
1131 |
1132 |
1133 | Extracted UI Components
1134 | The following components were extracted from the website.
1135 | """
1136 |
1137 | # Add each component
1138 | for component_type, components in assets['components'].items():
1139 | if components:
1140 | component_html += f'<h2>{component_type.replace("_", " ").title()} Components</h2>\n'
1141 |
1142 | for i, component in enumerate(components):
1143 | html_code = component.get('html', '')
1144 | if html_code:
1145 | component_html += f"""
1146 |
1147 |
1150 |
1151 | {html_code}
1152 |
1153 |
1154 |
{html.escape(html_code)}
1155 |
1156 |
1157 | """
1158 |
1159 | component_html += """
1160 |
1161 |
1162 | """
1163 |
1164 | zipf.writestr('components/index.html', component_html)
1165 |
1166 | # Save individual components
1167 | for component_type, components in assets['components'].items():
1168 | if components:
1169 | zipf.writestr(f'components/{component_type}/.gitkeep', '')
1170 |
1171 | for i, component in enumerate(components):
1172 | html_code = component.get('html', '')
1173 | if html_code:
1174 | zipf.writestr(f'components/{component_type}/component_{i+1}.html', html_code)
1175 |
1176 | # Create a README file
1177 | readme_content = f"""# Website Clone: {domain}
1178 |
1179 | Extracted on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
1180 | Source URL: {parsed_url.geturl()}
1181 |
1182 | ## Contents
1183 |
1184 | - `index.html`: Main HTML file
1185 | - `css/`: Stylesheets
1186 | - `js/`: JavaScript files
1187 | - `img/`: Images
1188 | - `fonts/`: Font files
1189 | - `components/`: Extracted UI components
1190 | - `metadata.json`: Website metadata (title, description, etc.)
1191 |
1192 | ## How to Use
1193 |
1194 | 1. Unzip this file
1195 | 2. Open `index.html` in your browser
1196 | 3. For best results, serve the files with a local server:
1197 | ```
1198 | python -m http.server
1199 | ```
1200 | Then open http://localhost:8000 in your browser
1201 |
1202 | ## Component Viewer
1203 |
1204 | If components were extracted, you can view them by opening `components/index.html`
1205 |
1206 | ## Notes
1207 |
1208 | - Some assets might not load correctly due to cross-origin restrictions
1209 | - External resources and APIs may not work without proper configuration
1210 | - JavaScript functionality might be limited without a proper backend
1211 |
1212 | ## Handling Modern Frameworks
1213 |
1214 | This extraction has been optimized to handle the following frameworks:
1215 | - React and Next.js: Script chunks and module loading
1216 | - Angular: Component structure and scripts
1217 | - Tailwind CSS: Utility classes and structure
1218 |
1219 | Generated by Website Extractor
1220 | """
1221 | zipf.writestr('README.md', readme_content)
1222 |
1223 | return temp_zip.name
1224 |
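# Layout of the archive written above (illustrative summary):
#   index.html             the (rewritten) page markup
#   css/ js/ img/ ...      downloaded assets, one folder per asset type
#   css/fonts.css          Google Fonts @import lines for the detected font families
#   metadata.json          the extracted metadata dict
#   components/index.html  a simple viewer for the extracted UI components
#   README.md              notes on serving the clone locally
# The function returns the path of a NamedTemporaryFile (delete=False), so callers are
# responsible for removing it when they are done.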
1225 | def extract_with_selenium(url, timeout=30):
1226 | """
1227 | Extract rendered HTML content using Selenium with Chrome/Chromium.
1228 | This method will execute JavaScript and capture the fully rendered page structure.
1229 |
1230 | Args:
1231 | url: URL to fetch
1232 | timeout: Maximum time to wait for page to load (seconds)
1233 |
1234 | Returns:
1235 | tuple: (html_content, discovered_urls, error_info) where error_info is None on success and a dict with an 'error' key on failure
1236 | """
1237 | if not SELENIUM_AVAILABLE:
1238 | return None, None, {"error": "Selenium is not installed. Run: pip install selenium webdriver-manager"}
1239 |
1240 | try:
1241 | print("Setting up advanced Chrome options...")
1242 | # Set up Chrome options with anti-detection measures
1243 | chrome_options = Options()
1244 | chrome_options.add_argument("--headless") # Run headless
1245 | chrome_options.add_argument("--disable-gpu") # Disable GPU hardware acceleration
1246 | chrome_options.add_argument("--no-sandbox") # Required for running as root
1247 | chrome_options.add_argument("--disable-dev-shm-usage") # Overcome limited resource problems
1248 | chrome_options.add_argument("--window-size=1920,1080") # Set window size
1249 | chrome_options.add_argument("--disable-notifications") # Disable notifications
1250 | chrome_options.add_argument("--disable-extensions") # Disable extensions
1251 | chrome_options.add_argument("--disable-infobars") # Disable infobars
1252 |
1253 | # Avoid detection as a bot
1254 | chrome_options.add_argument("--disable-blink-features=AutomationControlled")
1255 | chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
1256 | chrome_options.add_experimental_option("useAutomationExtension", False)
1257 |
1258 | # Add modern user agent to avoid detection
1259 | chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36")
1260 |
1261 | # Initialize the Chrome driver
1262 | print(f"Initializing Chrome WebDriver...")
1263 | try:
1264 | service = Service(ChromeDriverManager().install())
1265 | driver = webdriver.Chrome(service=service, options=chrome_options)
1266 | except Exception as driver_error:
1267 | print(f"Error initializing Chrome WebDriver: {str(driver_error)}")
1268 | print("Trying alternative initialization method...")
1269 | try:
1270 | # Try alternative initialization without Service object
1271 | driver = webdriver.Chrome(options=chrome_options)
1272 | except Exception as alt_error:
1273 | print(f"Alternative initialization also failed: {str(alt_error)}")
1274 | return None, None, {"error": f"Failed to initialize Chrome WebDriver: {str(alt_error)}"}
1275 |
1276 | # Set page load timeout
1277 | driver.set_page_load_timeout(timeout)
1278 |
1279 | # Used to store discovered URLs
1280 | discovered_urls = []
1281 |
1282 | try:
1283 | print(f"Navigating to {url}...")
1284 | driver.get(url)
1285 |
1286 | # Wait for page to be fully loaded
1287 | try:
1288 | WebDriverWait(driver, timeout).until(
1289 | EC.presence_of_element_located((By.TAG_NAME, "body"))
1290 | )
1291 | except Exception as e:
1292 | print(f"Warning: Timeout waiting for body element: {str(e)}")
1293 |
1294 | # Execute JavaScript to disable animation
1295 | try:
1296 | driver.execute_script("""
1297 | var style = document.createElement('style');
1298 | style.type = 'text/css';
1299 | style.innerHTML = '* { animation-duration: 0.001s !important; transition-duration: 0.001s !important; }';
1300 | document.getElementsByTagName('head')[0].appendChild(style);
1301 | """)
1302 | print("Animations disabled to improve extraction")
1303 | except Exception as e:
1304 | print(f"Warning: Could not disable animations: {str(e)}")
1305 |
1306 | # Wait for page to be fully rendered
1307 | print("Waiting for dynamic content to load...")
1308 | try:
1309 | # Wait a bit for any dynamic content to load
1310 | time.sleep(5)
1311 |
1312 | # Wait for network to be idle
1313 | driver.execute_script("return window.performance.getEntriesByType('resource').length")
1314 | time.sleep(2) # Wait a bit more after resources are loaded
1315 | except Exception as e:
1316 | print(f"Warning while waiting for dynamic content: {str(e)}")
1317 |
1318 | # Implement advanced scrolling to trigger lazy loading
1319 | print("Performing advanced scrolling to trigger lazy loading...")
1320 | try:
1321 | # Get the total height of the page
1322 | total_height = driver.execute_script("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight);")
1323 |
1324 | # Scroll down the page in steps
1325 | viewport_height = driver.execute_script("return window.innerHeight")
1326 | scroll_steps = max(1, min(20, total_height // viewport_height)) # Cap at 20 steps
1327 |
1328 | for i in range(scroll_steps + 1):
1329 | scroll_position = (i * total_height) // scroll_steps
1330 | driver.execute_script(f"window.scrollTo(0, {scroll_position});")
1331 |
1332 | # Small pause to allow content to load
1333 | time.sleep(0.3)
1334 |
1335 | # Extract resources after each scroll
1336 | try:
1337 | urls = driver.execute_script("""
1338 | var resources = [];
1339 | // Get all link hrefs
1340 | document.querySelectorAll('link[rel="stylesheet"], link[as="style"]').forEach(function(el) {
1341 | if (el.href) resources.push(el.href);
1342 | });
1343 | // Get all script srcs
1344 | document.querySelectorAll('script[src]').forEach(function(el) {
1345 | if (el.src) resources.push(el.src);
1346 | });
1347 | // Get all image srcs
1348 | document.querySelectorAll('img[src]').forEach(function(el) {
1349 | if (el.src && !el.src.startsWith('data:')) resources.push(el.src);
1350 | });
1351 | return resources;
1352 | """)
1353 | discovered_urls.extend(urls)
1354 | except Exception as res_error:
1355 | print(f"Error extracting resources during scroll: {str(res_error)}")
1356 |
1357 | # Scroll back to top
1358 | driver.execute_script("window.scrollTo(0, 0);")
1359 |
1360 | # Wait for everything to settle after scrolling
1361 | time.sleep(1)
1362 | except Exception as scroll_error:
1363 | print(f"Error during page scrolling: {str(scroll_error)}")
1364 |
1365 | # Try to click on common elements that might reveal more content
1366 | try:
1367 | # Common UI elements that might reveal more content when clicked
1368 | for selector in [
1369 | 'button.load-more', '.show-more', '.expand', '.accordion-toggle',
1370 | '[aria-expanded="false"]', '.menu-toggle', '.navbar-toggler',
1371 | '.mobile-menu-button', '.hamburger', '[data-toggle="collapse"]'
1372 | ]:
1373 | try:
1374 | elements = driver.find_elements(By.CSS_SELECTOR, selector)
1375 | for element in elements[:3]: # Limit to first 3 matches of each type
1376 | if element.is_displayed():
1377 | driver.execute_script("arguments[0].click();", element)
1378 | time.sleep(0.5) # Wait for content to appear
1379 | except Exception as click_error:
1380 | # Skip any errors and continue with next selector
1381 | continue
1382 | print("Attempted to expand hidden content")
1383 | except Exception as interact_error:
1384 | print(f"Error expanding content: {str(interact_error)}")
1385 |
1386 | # Get the final HTML content after all JavaScript executed
1387 | html_content = driver.page_source
1388 | print(f"HTML content captured ({len(html_content)} bytes)")
1389 |
1390 | # Extract URLs for modern frameworks
1391 | try:
1392 | # React/Next.js specific resources
1393 | next_js_urls = driver.execute_script("""
1394 | var resources = [];
1395 | // Find Next.js specific scripts
1396 | document.querySelectorAll('script[src*="_next"]').forEach(function(el) {
1397 | resources.push(el.src);
1398 | });
1399 | // Find chunk files
1400 | document.querySelectorAll('script[src*="chunk"]').forEach(function(el) {
1401 | resources.push(el.src);
1402 | });
1403 | // Find webpack files
1404 | document.querySelectorAll('script[src*="webpack"]').forEach(function(el) {
1405 | resources.push(el.src);
1406 | });
1407 | // Find hydration scripts
1408 | document.querySelectorAll('script[src*="hydration"]').forEach(function(el) {
1409 | resources.push(el.src);
1410 | });
1411 | return resources;
1412 | """)
1413 | discovered_urls.extend(next_js_urls)
1414 |
1415 | # Angular specific resources
1416 | angular_urls = driver.execute_script("""
1417 | var resources = [];
1418 | // Find Angular specific scripts
1419 | document.querySelectorAll('script[src*="runtime"]').forEach(function(el) {
1420 | resources.push(el.src);
1421 | });
1422 | document.querySelectorAll('script[src*="polyfills"]').forEach(function(el) {
1423 | resources.push(el.src);
1424 | });
1425 | document.querySelectorAll('script[src*="main"]').forEach(function(el) {
1426 | resources.push(el.src);
1427 | });
1428 | return resources;
1429 | """)
1430 | discovered_urls.extend(angular_urls)
1431 |
1432 | # Get CSS variables for Tailwind detection
1433 | tailwind_check = driver.execute_script("""
1434 | var style = window.getComputedStyle(document.body);
1435 | var hasTailwind = false;
1436 | // Check for common Tailwind classes
1437 | if (document.querySelector('.flex') &&
1438 | document.querySelector('.grid') &&
1439 | document.querySelector('[class*="text-"]')) {
1440 | hasTailwind = true;
1441 | }
1442 | return hasTailwind;
1443 | """)
1444 |
1445 | if tailwind_check:
1446 | print("Tailwind CSS detected, including appropriate CSS files")
1447 | except Exception as framework_error:
1448 | print(f"Error detecting framework resources: {str(framework_error)}")
1449 |
1450 | # Remove duplicates from discovered URLs
1451 | discovered_urls = list(set(discovered_urls))
1452 | print(f"Discovered {len(discovered_urls)} resource URLs")
1453 |
1454 | return html_content, discovered_urls, None
1455 |
1456 | except TimeoutException:
1457 | print(f"Timeout while loading {url}")
1458 | return None, None, {"error": "Timeout while loading page"}
1459 | except WebDriverException as e:
1460 | print(f"Selenium error: {str(e)}")
1461 | return None, None, {"error": f"Selenium error: {str(e)}"}
1462 | finally:
1463 | # Close the browser
1464 | print("Closing WebDriver...")
1465 | driver.quit()
1466 |
1467 | except Exception as e:
1468 | print(f"Error setting up Selenium: {str(e)}")
1469 | return None, None, {"error": f"Error setting up Selenium: {str(e)}"}
1470 |
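# Usage sketch for extract_with_selenium (illustrative; example.com is a placeholder):
#   html, urls, err = extract_with_selenium('https://example.com', timeout=30)
#   if err:    # err is a dict like {'error': '...'} when rendering failed
#       print(err['error'])
#   else:      # html is the rendered DOM, urls are resources observed while scrolling
#       assets = extract_assets(html, 'https://example.com')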
1471 | def fix_relative_urls(html_content, base_url):
1472 | """Fix relative URLs in the HTML content"""
1473 | soup = BeautifulSoup(html_content, 'html.parser')
1474 |
1475 | # Fix relative URLs for links
1476 | for link in soup.find_all('a', href=True):
1477 | href = link['href']
1478 | if href.startswith('/'):
1479 | link['href'] = urljoin(base_url, href)
1480 |
1481 | # Fix relative URLs for images
1482 | for img in soup.find_all('img', src=True):
1483 | src = img['src']
1484 | if not src.startswith(('http://', 'https://', 'data:')):
1485 | img['src'] = urljoin(base_url, src)
1486 |
1487 | # Fix relative URLs for scripts
1488 | for script in soup.find_all('script', src=True):
1489 | src = script['src']
1490 | if not src.startswith(('http://', 'https://', 'data:')):
1491 | script['src'] = urljoin(base_url, src)
1492 |
1493 | # Fix relative URLs for stylesheets
1494 | for link in soup.find_all('link', href=True):
1495 | href = link['href']
1496 | if not href.startswith(('http://', 'https://', 'data:')):
1497 | link['href'] = urljoin(base_url, href)
1498 |
1499 | return str(soup)
1500 |
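# Illustrative effect of fix_relative_urls with base_url 'https://example.com':
#   <a href="/about">         -> <a href="https://example.com/about">
#   <img src="img/logo.png">  -> <img src="https://example.com/img/logo.png">
# data: URIs and absolute http(s) URLs are left untouched; anchor tags are only rewritten
# when their href starts with '/'.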
1501 | @app.route('/')
1502 | def index():
1503 | """Render the home page"""
1504 | return render_template('index.html')
1505 |
1506 | @app.route('/clear')
1507 | def clear_session():
1508 | """Clear the session data"""
1509 | session.clear()
1510 | return jsonify({'message': 'Session cleared'})
1511 |
1512 | @app.route('/extract', methods=['POST'])
1513 | def extract():
1514 | url = request.form.get('url')
1515 | use_selenium = request.form.get('use_selenium') == 'true'
1516 |
1517 | if not url:
1518 | return jsonify({'error': 'URL is required'}), 400
1519 |
1520 | try:
1521 | # Add http:// if not present
1522 | if not url.startswith(('http://', 'https://')):
1523 | url = 'https://' + url
1524 |
1525 | print(f"\n{'='*80}\nStarting extraction for: {url}\n{'='*80}")
1526 |
1527 | # Create a session to maintain cookies
1528 | session_obj = requests.Session()
1529 |
1530 | # Disable SSL verification warnings
1531 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
1532 |
1533 | # List of user agents to try if we get blocked
1534 | user_agents = [
1535 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
1536 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
1537 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
1538 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
1539 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
1540 | 'Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
1541 | 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1'
1542 | ]
1543 |
1544 | # List of referers to try
1545 | referers = [
1546 | 'https://www.google.com/',
1547 | 'https://www.bing.com/',
1548 | 'https://www.instagram.com/',
1549 | 'https://www.facebook.com/',
1550 | 'https://www.twitter.com/',
1551 | 'https://www.linkedin.com/'
1552 | ]
1553 |
1554 | # Initial headers (will be rotated if needed)
1555 | headers = {
1556 | 'User-Agent': random.choice(user_agents),
1557 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
1558 | 'Accept-Language': 'en-US,en;q=0.9',
1559 | 'Accept-Encoding': 'gzip, deflate, br',
1560 | 'Connection': 'keep-alive',
1561 | 'Upgrade-Insecure-Requests': '1',
1562 | 'Sec-Fetch-Dest': 'document',
1563 | 'Sec-Fetch-Mode': 'navigate',
1564 | 'Sec-Fetch-Site': 'none',
1565 | 'Sec-Fetch-User': '?1',
1566 | 'Cache-Control': 'max-age=0',
1567 | 'Referer': random.choice(referers),
1568 | 'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
1569 | 'sec-ch-ua-mobile': '?0',
1570 | 'sec-ch-ua-platform': '"Windows"',
1571 | }
1572 |
1573 | html_content = None
1574 | additional_urls = []
1575 |
1576 | # Use Selenium for rendering if requested and available
1577 | if use_selenium and SELENIUM_AVAILABLE:
1578 | print("Using Selenium for advanced rendering...")
1579 | html_content, additional_urls, error_info = extract_with_selenium(url)
1580 |
1581 | if not html_content:
1582 | print("Selenium extraction failed, falling back to regular request")
1583 | use_selenium = False
1584 | # Check if we have an error message
1585 | if error_info and isinstance(error_info, dict) and 'error' in error_info:
1586 | print(f"Selenium error: {error_info['error']}")
1587 |
1588 | # If Selenium wasn't used or failed, use regular requests with retries
1589 | if not use_selenium or not html_content:
1590 | # Maximum number of retries with different configurations
1591 | max_retries = 5
1592 | retry_count = 0
1593 | last_error = None
1594 |
1595 | while retry_count < max_retries and not html_content:
1596 | try:
1597 | print(f"HTTP Request attempt {retry_count+1}/{max_retries} for: {url}")
1598 | print(f"Using User-Agent: {headers['User-Agent'][:30]}...")
1599 |
1600 | # First request to get cookies and possible redirects
1601 | response = session_obj.get(
1602 | url,
1603 | timeout=20, # Increased timeout
1604 | headers=headers,
1605 | allow_redirects=True,
1606 | verify=False # Ignore SSL certificate errors
1607 | )
1608 |
1609 | # Follow redirects manually if needed
1610 | if response.history:
1611 | print(f"Request was redirected {len(response.history)} times")
1612 | for i, resp in enumerate(response.history):
1613 | print(f" Redirect {i+1}: {resp.url} -> {resp.headers.get('Location')}")
1614 | print(f" Final URL: {response.url}")
1615 | url = response.url # Update URL to the final destination
1616 |
1617 | # Handle different status codes
1618 | if response.status_code == 200:
1619 | print(f"Success! Received 200 OK response ({len(response.content)} bytes)")
1620 |
1621 | # Determine encoding from Content-Type header or content
1622 | content_type = response.headers.get('Content-Type', '')
1623 | print(f"Content-Type: {content_type}")
1624 |
1625 | # Get encoding from headers or meta tag
1626 | encoding = None
1627 |
1628 | # Try to get encoding from Content-Type header
1629 | if 'charset=' in content_type:
1630 | encoding = content_type.split('charset=')[1].split(';')[0].strip()
1631 | print(f"Encoding from headers: {encoding}")
1632 |
1633 | # If no encoding specified, try to detect from content
1634 | if not encoding:
1635 | # Look for <meta charset="..."> tag
1636 | charset_match = re.search(r'<meta[^>]+charset=["\']?([^\'">]+)', response.text, re.IGNORECASE)
1637 | if charset_match:
1638 | encoding = charset_match.group(1)
1639 | print(f"Encoding from meta charset tag: {encoding}")
1640 | else:
1641 | # Look for <meta http-equiv="Content-Type" content="...charset=...">
1642 | http_equiv_match = re.search(r'<meta[^>]+http-equiv=["\']?content-type["\']?[^>]+charset=([^\'">]+)', response.text, re.IGNORECASE)
1643 | if http_equiv_match:
1644 | encoding = http_equiv_match.group(1)
1645 | print(f"Encoding from meta http-equiv tag: {encoding}")
1646 |
1647 | # If still no encoding, use apparent encoding from requests
1648 | if not encoding and response.apparent_encoding:
1649 | encoding = response.apparent_encoding
1650 | print(f"Detected encoding: {encoding}")
1651 |
1652 | # Default to utf-8 if still no encoding
1653 | if not encoding:
1654 | encoding = 'utf-8'
1655 | print("Using default encoding: utf-8")
1656 |
1657 | # Decode content with detected encoding
1658 | try:
1659 | html_content = response.content.decode(encoding, errors='replace')
1660 | print(f"Successfully decoded HTML content with {encoding} encoding ({len(html_content)} bytes)")
1661 | break # Exit the retry loop on success
1662 | except (UnicodeDecodeError, LookupError) as e:
1663 | print(f"Error decoding with {encoding}: {str(e)}, falling back to utf-8")
1664 | html_content = response.content.decode('utf-8', errors='replace')
1665 | break # Exit the retry loop on success with fallback
1666 |
1667 | elif response.status_code == 403: # Forbidden - likely bot protection
1668 | print(f"Received 403 Forbidden response - website is likely blocking scrapers")
1669 |
1670 | # If we have Selenium available as a fallback, try that instead
1671 | if SELENIUM_AVAILABLE and not use_selenium:
1672 | print("Trying Selenium as a fallback for 403 error...")
1673 | html_content, additional_urls, error_info = extract_with_selenium(url)
1674 | if html_content:
1675 | print("Successfully bypassed 403 with Selenium!")
1676 | break
1677 |
1678 | # Otherwise, rotate our headers and try again
1679 | headers['User-Agent'] = random.choice(user_agents)
1680 | headers['Referer'] = random.choice(referers)
1681 |
1682 | # Add some randomization to headers
1683 | if random.random() > 0.5:
1684 | headers['Accept-Language'] = random.choice(['en-US,en;q=0.9', 'en-GB,en;q=0.8,en-US;q=0.7', 'en-CA,en;q=0.9,fr-CA;q=0.8'])
1685 |
1686 | # Try adding cookies if we have any from previous responses
1687 | if session_obj.cookies:
1688 | print(f"Using {len(session_obj.cookies)} cookies from previous responses")
1689 |
1690 | # Add delay to avoid rate limiting
1691 | delay = random.uniform(1.0, 3.0)
1692 | print(f"Waiting {delay:.2f} seconds before retrying...")
1693 | time.sleep(delay)
1694 |
1695 | elif response.status_code == 429: # Too Many Requests
1696 | print(f"Received 429 Too Many Requests - rate limited")
1697 |
1698 | # Check if we have a Retry-After header
1699 | retry_after = response.headers.get('Retry-After')
1700 | if retry_after and retry_after.isdigit():
1701 | delay = int(retry_after) + random.uniform(0.1, 1.0)
1702 | else:
1703 | delay = 5 + random.uniform(1.0, 5.0) # 5-10 second delay
1704 |
1705 | print(f"Waiting {delay:.2f} seconds before retrying...")
1706 | time.sleep(delay)
1707 |
1708 | # Rotate headers
1709 | headers['User-Agent'] = random.choice(user_agents)
1710 |
1711 | elif response.status_code == 503: # Service Unavailable - often used for anti-bot
1712 | print(f"Received 503 Service Unavailable - possible anti-bot measure")
1713 |
1714 | # Try with a longer delay and new headers
1715 | delay = 10 + random.uniform(1.0, 5.0) # 10-15 second delay
1716 | print(f"Waiting {delay:.2f} seconds before retrying...")
1717 | time.sleep(delay)
1718 |
1719 | # Complete header rotation
1720 | headers['User-Agent'] = random.choice(user_agents)
1721 | headers['Referer'] = random.choice(referers)
1722 |
1723 | else:
1724 | print(f"Received unexpected status code: {response.status_code}")
1725 | last_error = f"HTTP error ({response.status_code})"
1726 |
1727 | # Try with new headers on next attempt
1728 | headers['User-Agent'] = random.choice(user_agents)
1729 |
1730 | except requests.exceptions.Timeout:
1731 | print(f"Timeout error fetching {url}")
1732 | last_error = "Request timeout"
1733 | # Try with increased timeout on next attempt
1734 |
1735 | except requests.exceptions.ConnectionError:
1736 | print(f"Connection error fetching {url}")
1737 | last_error = "Connection error"
1738 | # Wait before retrying
1739 | time.sleep(2)
1740 |
1741 | except requests.exceptions.TooManyRedirects:
1742 | print(f"Too many redirects for {url}")
1743 | last_error = "Too many redirects"
1744 | # This is likely a permanent issue, break the loop
1745 | break
1746 |
1747 | except Exception as e:
1748 | print(f"Error fetching {url}: {str(e)}")
1749 | last_error = str(e)
1750 |
1751 | retry_count += 1
1752 |
1753 | # If we've exhausted all retries and still don't have content
1754 | if not html_content and retry_count >= max_retries:
1755 | error_msg = f"Failed to fetch website after {max_retries} attempts. Last error: {last_error}"
1756 | print(error_msg)
1757 | return jsonify({'error': error_msg}), 400
1758 |
1759 | # Safety check - make sure we have HTML content
1760 | if not html_content or len(html_content) < 100: # Arbitrary minimum size for valid HTML
1761 | return jsonify({'error': 'Failed to extract valid HTML content from the website'}), 400
1762 |
1763 | # Continue with asset extraction and zip file creation
1764 | try:
1765 | print("\nExtracting assets...")
1766 | # Extract assets from the HTML content
1767 | assets = extract_assets(html_content, url, session_obj, headers)
1768 |
1769 | if not assets:
1770 | return jsonify({'error': 'Failed to extract assets from the website'}), 500
1771 |
1772 | print(f"Assets extracted: {', '.join(assets.keys())}")
1773 |
1774 | # If we have additional URLs from Selenium, add them to the assets
1775 | if additional_urls:
1776 | print(f"Adding {len(additional_urls)} URLs discovered by Selenium")
1777 | for asset_url in additional_urls:
1778 | # Skip data URLs
1779 | if not asset_url or asset_url.startswith('data:'):
1780 | continue
1781 |
1782 | # Normalize URL
1783 | if asset_url.startswith('//'):
1784 | asset_url = f"https:{asset_url}"
1785 |
1786 | try:
1787 | asset_type = get_asset_type(asset_url)
1788 | if asset_type in assets and asset_url not in assets[asset_type]:
1789 | # Validate URL
1790 | parsed = urlparse(asset_url)
1791 | if parsed.scheme and parsed.netloc:
1792 | assets[asset_type].append(asset_url)
1793 | except Exception as url_error:
1794 | print(f"Error processing URL {asset_url}: {str(url_error)}")
1795 |
1796 | # Count assets by type
1797 | asset_counts = {asset_type: len(urls) for asset_type, urls in assets.items()
1798 | if isinstance(urls, list) and asset_type not in ['metadata', 'font_families']}
1799 | print(f"\nAsset counts:")
1800 | for asset_type, count in asset_counts.items():
1801 | print(f" {asset_type}: {count}")
1802 |
1803 | # Check if we have enough assets
1804 | total_assets = sum(count for count in asset_counts.values())
1805 | if total_assets < 5:
1806 | print("\nWARNING: Very few assets extracted. Trying alternative extraction methods...")
1807 |
1808 | # Try to extract assets from the page using JavaScript execution (simulated)
1809 | try:
1810 | # Look for JavaScript variables that might contain asset URLs
1811 | js_asset_patterns = [
1812 | r'["\'](https?://[^"\']+\.(css|js|png|jpg|jpeg|gif|svg|woff2?))["\']',
1813 | r'["\'](/[^"\']+\.(css|js|png|jpg|jpeg|gif|svg|woff2?))["\']',
1814 | r'["\'](//[^"\']+\.(css|js|png|jpg|jpeg|gif|svg|woff2?))["\']',
1815 | r'loadCSS\(["\']([^"\']+)["\']',
1816 | r'loadJS\(["\']([^"\']+)["\']',
1817 | r'src=["\'](/[^"\']+)["\']',
1818 | r'href=["\'](/[^"\']+\.css)["\']',
1819 | # React/Next.js specific patterns
1820 | r'__NEXT_DATA__\s*=\s*({.*})',
1821 | r'window\.__PRELOADED_STATE__\s*=\s*({.*})',
1822 | r'window\.__INITIAL_STATE__\s*=\s*({.*})',
1823 | r'_ASSET_PREFIX_\s*=\s*["\']([^"\']+)["\']'
1824 | ]
1825 |
1826 | for pattern in js_asset_patterns:
1827 | matches = re.findall(pattern, html_content)
1828 | for match in matches:
1829 | if isinstance(match, tuple):
1830 | match_url = match[0]
1831 | else:
1832 | match_url = match
1833 |
1834 | if match_url.startswith('//'):
1835 | match_url = 'https:' + match_url
1836 | elif match_url.startswith('/'):
1837 | match_url = urljoin(url, match_url)
1838 |
1839 | # Skip if it's clearly not a URL (likely JSON data)
1840 | if '{' in match_url or '}' in match_url:
1841 | continue
1842 |
1843 | asset_type = get_asset_type(match_url)
1844 | if asset_type in assets:
1845 | assets[asset_type].append(match_url)
1846 |
1847 | print("Extracted additional assets from JavaScript patterns")
1848 | except Exception as e:
1849 | print(f"Error extracting additional assets: {str(e)}")
1850 |
1851 | # Try to fix relative URLs in the HTML
1852 | try:
1853 | print("\nFixing relative URLs...")
1854 | fixed_html = fix_relative_urls(html_content, url)
1855 | print("Relative URLs fixed")
1856 | except Exception as e:
1857 | print(f"Error fixing URLs: {str(e)}")
1858 | fixed_html = html_content # Use original HTML if fixing fails
1859 |
1860 | try:
1861 | # Create and send zip file, passing the session and headers
1862 | print("\nCreating zip file...")
1863 |
1864 | # Extract domain from URL for the filename
1865 | domain = urlparse(url).netloc
1866 | safe_domain = re.sub(r'[^\w\-_]', '_', domain)
1867 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1868 | filename = f"{safe_domain}_{timestamp}.zip"
1869 |
1870 | # Create a zip file with the extracted content
1871 | zip_file_path = create_zip_file(fixed_html, assets, url, session_obj, headers)
1872 |
1873 | # Check if the file was created successfully
1874 | if not os.path.exists(zip_file_path) or os.path.getsize(zip_file_path) < 100:
1875 | return jsonify({'error': 'Failed to create valid zip file'}), 500
1876 |
1877 | print(f"Zip file created successfully at {zip_file_path} ({os.path.getsize(zip_file_path)} bytes)")
1878 | print(f"\nExtraction completed for: {url}\n{'='*80}")
1879 |
1880 | # Copy the temporary file to a more persistent location
1881 | persistent_dir = os.path.join(tempfile.gettempdir(), 'website_extractor_downloads')
1882 | os.makedirs(persistent_dir, exist_ok=True)
1883 | persistent_path = os.path.join(persistent_dir, filename)
1884 |
1885 | # Copy the file instead of moving to ensure the original isn't deleted prematurely
1886 | shutil.copy2(zip_file_path, persistent_path)
1887 |
1888 | # Schedule the temp file for deletion after a reasonable period (30 minutes)
1889 | def delete_temp_file():
1890 | try:
1891 | time.sleep(1800) # 30 minutes
1892 | if os.path.exists(zip_file_path):
1893 | os.remove(zip_file_path)
1894 | print(f"Temporary file {zip_file_path} removed after 30 minutes")
1895 | if os.path.exists(persistent_path):
1896 | os.remove(persistent_path)
1897 | print(f"Persistent file {persistent_path} removed after 30 minutes")
1898 | except Exception as e:
1899 | print(f"Error removing temporary file: {str(e)}")
1900 |
1901 | # Start a thread to handle file deletion
1902 | cleanup_thread = threading.Thread(target=delete_temp_file)
1903 | cleanup_thread.daemon = True
1904 | cleanup_thread.start()
1905 |
1906 | # Send the persistent file with improved headers and explicit attachment
1907 | response = send_file(
1908 | persistent_path,
1909 | mimetype='application/zip',
1910 | as_attachment=True,
1911 | download_name=filename
1912 | )
1913 |
1914 | # Add headers to prevent caching issues
1915 | response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
1916 | response.headers['Pragma'] = 'no-cache'
1917 | response.headers['Expires'] = '0'
1918 | response.headers['Content-Disposition'] = f'attachment; filename="{filename}"'
1919 |
1920 | # Note: We're no longer using after_this_request to remove the file immediately
1921 | # Instead, we're using a background thread to clean up after 30 minutes
1922 |
1923 | return response
1924 |
1925 | except Exception as e:
1926 | print(f"Error creating or sending zip file: {str(e)}")
1927 | traceback.print_exc()
1928 | return jsonify({'error': f'Failed to create or send zip file: {str(e)}'}), 500
1929 | except Exception as e:
1930 | print(f"Error in asset extraction: {str(e)}")
1931 | traceback.print_exc()
1932 | return jsonify({'error': f'Error extracting assets: {str(e)}'}), 500
1933 |
1934 | except Exception as e:
1935 | print(f"Unexpected error: {str(e)}")
1936 | traceback.print_exc()
1937 | return jsonify({'error': str(e)}), 500
1938 |
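# Rough request sketch for the /extract endpoint above (illustrative, form-encoded POST):
#   curl -X POST http://127.0.0.1:5001/extract \
#        -d 'url=example.com' -d 'use_selenium=false' -o example_com.zip
# A successful call streams back the generated zip; failures return JSON with an 'error' key.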
1939 | def main():
1940 | """Entry point for the package, to allow running as an installed package from command line"""
1941 | print("\n" + "="*80)
1942 | print("Website Extractor is running!")
1943 | print("Access it in your browser at: http://127.0.0.1:5001")
1944 | print("="*80 + "\n")
1945 | app.run(debug=True, threaded=True, port=5001)
1946 |
1947 | if __name__ == '__main__':
1948 | main()
--------------------------------------------------------------------------------