├── requirements.txt
├── .gitignore
├── LICENSE
├── setup.py
├── CONTRIBUTING.md
├── docs
│   └── app_architecture_overview.md
├── app_architecture.md
├── README.md
├── templates
│   ├── components.html
│   └── index.html
└── app.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | flask==3.0.2
2 | requests==2.31.0
3 | beautifulsoup4==4.12.3
4 | urllib3==2.2.1
5 | cssutils==2.9.0
6 | selenium==4.18.1
7 | webdriver-manager==4.0.1
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # Virtual environment
28 | venv/
29 | env/
30 | ENV/
31 |
32 | # Flask related
33 | instance/
34 | .webassets-cache
35 |
36 | # Selenium & WebDriver
37 | chromedriver
38 | chromedriver.exe
39 | *.log
40 | geckodriver
41 | geckodriver.exe
42 | .wdm/
43 |
44 | # OS specific files
45 | .DS_Store
46 | .DS_Store?
47 | ._*
48 | .Spotlight-V100
49 | .Trashes
50 | ehthumbs.db
51 | Thumbs.db
52 |
53 | # Editor directories and files
54 | .idea/
55 | .vscode/
56 | *.swp
57 | *.swo
58 |
59 | # Temporary files
60 | *.tmp
61 | *~
62 | tmp/
63 | temp/
64 |
65 | # Downloaded website archives
66 | *.zip
67 |
68 | # Local environment variables
69 | .env
70 | .env.local
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Sirio Berati
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r", encoding="utf-8") as fh:
4 | long_description = fh.read()
5 |
6 | with open("requirements.txt", "r", encoding="utf-8") as fh:
7 | requirements = fh.read().splitlines()
8 |
9 | setup(
10 | name="website-extractor",
11 | version="1.0.0",
12 | author="Sirio Berati",
13 | author_email="your.email@example.com", # Replace with your actual email
14 | description="A tool to extract and archive entire websites with advanced rendering capabilities",
15 | long_description=long_description,
16 | long_description_content_type="text/markdown",
17 | url="https://github.com/sirioberati/website-extractor",
18 | packages=find_packages(),
19 | classifiers=[
20 | "Programming Language :: Python :: 3",
21 | "License :: OSI Approved :: MIT License",
22 | "Operating System :: OS Independent",
23 | "Topic :: Internet :: WWW/HTTP",
24 | "Topic :: Software Development :: Libraries :: Python Modules",
25 | "Topic :: Utilities",
26 | ],
27 | python_requires=">=3.7",
28 | install_requires=requirements,
29 | entry_points={
30 | "console_scripts": [
31 | "website-extractor=app:main",
32 | ],
33 | },
34 | include_package_data=True,
35 | )
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Website Extractor
2 |
3 | Thank you for considering contributing to Website Extractor! This document provides guidelines and instructions for contributing to this project.
4 |
5 | ## Code of Conduct
6 |
7 | By participating in this project, you agree to maintain a respectful and inclusive environment for everyone.
8 |
9 | ## How Can I Contribute?
10 |
11 | ### Reporting Bugs
12 |
13 | Before creating a bug report:
14 |
15 | 1. Check the existing issues to see if the problem has already been reported
16 | 2. Collect information about the bug (steps to reproduce, error messages, etc.)
17 |
18 | When submitting a bug report, please include:
19 |
20 | - A clear and descriptive title
21 | - Detailed steps to reproduce the issue
22 | - Expected vs. actual behavior
23 | - Screenshots if applicable
24 | - Your environment information (OS, browser, Python version, etc.)
25 |
26 | ### Suggesting Enhancements
27 |
28 | Enhancement suggestions are welcome! Please include:
29 |
30 | - A clear and descriptive title
31 | - A detailed description of the proposed enhancement
32 | - The motivation behind the enhancement
33 | - Any potential implementation details you can think of
34 |
35 | ### Pull Requests
36 |
37 | 1. Fork the repository
38 | 2. Create a new branch (`git checkout -b feature/amazing-feature`)
39 | 3. Make your changes
40 | 4. Run tests if available
41 | 5. Commit your changes (`git commit -m 'Add some amazing feature'`)
42 | 6. Push to your branch (`git push origin feature/amazing-feature`)
43 | 7. Open a Pull Request
44 |
45 | ## Development Setup
46 |
47 | 1. Fork and clone the repository
48 | 2. Create a virtual environment:
49 | ```bash
50 | python -m venv venv
51 | source venv/bin/activate # On Windows: venv\Scripts\activate
52 | ```
53 | 3. Install dependencies:
54 | ```bash
55 | pip install -r requirements.txt
56 | ```
57 | 4. Run the application locally:
58 | ```bash
59 | python app.py
60 | ```
61 |
62 | ## Coding Standards
63 |
64 | - Follow PEP 8 style guidelines
65 | - Write descriptive commit messages
66 | - Include comments and docstrings
67 | - Add tests for new features when possible
68 |
69 | ## License
70 |
71 | By contributing to this project, you agree that your contributions will be licensed under the project's [MIT License](LICENSE).
72 |
73 | ## Questions?
74 |
75 | If you have any questions, feel free to reach out to the project maintainer through the GitHub issues page.
--------------------------------------------------------------------------------
/docs/app_architecture_overview.md:
--------------------------------------------------------------------------------
1 | # Website Extractor Architecture Overview
2 |
3 | ```
4 | ┌───────────────────────────────────────────────────────────────────┐
5 | │ Website Extractor Application │
6 | └───────────────────────────────────────────────────────────────────┘
7 | │
8 | ▼
9 | ┌───────────────────────────────────────────────────────────────────┐
10 | │ Flask Web Server │
11 | └───────────────────────────────────────────────────────────────────┘
12 | │
13 | ▼
14 | ┌───────────────────────────────────────────────────────────────────┐
15 | │ Extraction Core Processes │
16 | ├───────────────┬──────────────────┬──────────────────┬─────────────┤
17 | │ HTTP Client │ Selenium Renderer │ Content Parser │ Asset Saver │
18 | │ (requests) │ (WebDriver) │ (BeautifulSoup) │ (Zip) │
19 | └───────────────┴──────────────────┴──────────────────┴─────────────┘
20 | ```
21 |
22 | ## Data Flow
23 |
24 | ```
25 | ┌──────────┐ URL ┌──────────┐ HTML Content ┌──────────────┐
26 | │ User │───────────▶│ Extractor│───────────────▶│ HTML Parser │
27 | └──────────┘ └──────────┘ └──────────────┘
28 | │ │
29 | Rendering │ │ Asset URLs
30 | option │ │
31 | ▼ ▼
32 | ┌──────────┐ ┌──────────────┐
33 | │ Selenium │ │ Asset │
34 | │ WebDriver│ │ Downloader │
35 | └──────────┘ └──────────────┘
36 | │ │
37 | Rendered│ Assets │
38 | HTML │ │
39 | ▼ ▼
40 | ┌──────────────────────────────────────────┐
41 | │ Zip File Creator │
42 | └──────────────────────────────────────────┘
43 | │
44 | ▼
45 | ┌──────────────────────────────────────────┐
46 | │ File Download Response to User │
47 | └──────────────────────────────────────────┘
48 | ```
49 |
50 | ### Key Components
51 |
52 | 1. **Flask Web Server**: The user interface and API endpoint
53 | 2. **HTTP Client**: Makes network requests to target websites
54 | 3. **Selenium Renderer**: Renders JavaScript-heavy sites (optional)
55 | 4. **Content Parser**: Analyzes HTML to extract assets
56 | 5. **Asset Downloader**: Retrieves all website assets
57 | 6. **Zip Creator**: Packages everything into a downloadable archive
58 |
59 | For more detailed information, see the full [app_architecture.md](../app_architecture.md) file.
--------------------------------------------------------------------------------
/app_architecture.md:
--------------------------------------------------------------------------------
1 | # Website Extractor - Application Architecture
2 |
3 | ## Overview
4 |
5 | This document provides a high-level overview of the Website Extractor application architecture, explaining how the different components interact and the flow of data through the system.
6 |
7 | ```
8 | ┌───────────────────────────────────────────────────────────────────┐
9 | │ Website Extractor Application │
10 | └───────────────────────────────────────────────────────────────────┘
11 | │
12 | ▼
13 | ┌───────────────────────────────────────────────────────────────────┐
14 | │ Flask Web Server │
15 | └───────────────────────────────────────────────────────────────────┘
16 | │
17 | ▼
18 | ┌───────────────────────────────────────────────────────────────────┐
19 | │ Extraction Core Processes │
20 | ├───────────────┬──────────────────┬──────────────────┬─────────────┤
21 | │ HTTP Client │ Selenium Renderer │ Content Parser │ Asset Saver │
22 | │ (requests) │ (WebDriver) │ (BeautifulSoup) │ (Zip) │
23 | └───────────────┴──────────────────┴──────────────────┴─────────────┘
24 | ```
25 |
26 | ## Data Flow Diagram
27 |
28 | ```
29 | ┌──────────┐ URL ┌──────────┐ HTML Content ┌──────────────┐
30 | │ User │───────────▶│ Extractor│───────────────▶│ HTML Parser │
31 | └──────────┘ └──────────┘ └──────────────┘
32 | │ │
33 | Rendering │ │ Asset URLs
34 | option │ │
35 | ▼ ▼
36 | ┌──────────┐ ┌──────────────┐
37 | │ Selenium │ │ Asset │
38 | │ WebDriver│ │ Downloader │
39 | └──────────┘ └──────────────┘
40 | │ │
41 | Rendered│ Assets │
42 | HTML │ │
43 | ▼ ▼
44 | ┌──────────────────────────────────────────┐
45 | │ Zip File Creator │
46 | └──────────────────────────────────────────┘
47 | │
48 | ▼
49 | ┌──────────────────────────────────────────┐
50 | │ File Download Response to User │
51 | └──────────────────────────────────────────┘
52 | ```
53 |
54 | ## Component Descriptions
55 |
56 | ### 1. Flask Web Server
57 | - **Purpose**: Provides the web interface and handles HTTP requests
58 | - **Key Files**: `app.py` (main file), `templates/index.html` (UI)
59 | - **Functions**: Serves the interface, processes form submissions, returns downloaded files
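
A minimal sketch of what this layer looks like, assuming an `/extract` POST route that returns an in-memory ZIP (the route names come from this document; the form field and download filename below are illustrative, not the exact `app.py` implementation):

```python
from io import BytesIO
from flask import Flask, render_template, request, send_file

app = Flask(__name__)

@app.route('/')
def index():
    # Serve the extraction form
    return render_template('index.html')

@app.route('/extract', methods=['POST'])
def extract():
    url = request.form.get('url')
    use_selenium = request.form.get('use_selenium') == 'on'  # assumed field name
    # ... fetch the page, parse it, download assets, build the archive ...
    zip_buffer = BytesIO()
    # create_zip_file(...) would populate zip_buffer here
    zip_buffer.seek(0)
    return send_file(zip_buffer, as_attachment=True,
                     download_name='website_extract.zip',  # illustrative name
                     mimetype='application/zip')
```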
60 |
61 | ### 2. HTTP Client (Requests)
62 | - **Purpose**: Fetches website content using standard HTTP requests
63 | - **Key Functions**: `download_asset()`, HTTP request code in `/extract` route
64 | - **Features**: Cookie handling, header rotation, retry logic, error handling
65 |
66 | ### 3. Selenium Renderer (Optional)
67 | - **Purpose**: Renders JavaScript-heavy websites using a headless Chrome browser
68 | - **Key Functions**: `extract_with_selenium()`
69 | - **Features**: Waits for dynamic content, scrolls the page, handles lazy loading, identifies framework-specific resources
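
A rough sketch of how `extract_with_selenium()` can be structured with headless Chrome and webdriver-manager (the real function likely adds explicit waits, clicks, and framework detection beyond this):

```python
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def extract_with_selenium_sketch(url, settle_seconds=5):
    options = Options()
    options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                              options=options)
    try:
        driver.get(url)
        time.sleep(settle_seconds)  # let the initial JavaScript run
        # Scroll to the bottom to trigger lazy-loaded images and sections
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        return driver.page_source  # rendered HTML after JavaScript execution
    finally:
        driver.quit()
```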
70 |
71 | ### 4. Content Parser
72 | - **Purpose**: Analyzes HTML content to extract assets and structure
73 | - **Key Functions**: `extract_assets()`, `extract_metadata()`, `extract_component_structure()`
74 | - **Features**: Identifies CSS, JS, images, fonts, extracts metadata, identifies UI components
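
At its core, asset discovery with BeautifulSoup means collecting `href`/`src` attributes and resolving them against the page URL. A simplified sketch of that idea (not the full `extract_assets()`):

```python
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def discover_assets_sketch(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    assets = set()
    for tag, attr in (('link', 'href'), ('script', 'src'),
                      ('img', 'src'), ('source', 'src')):
        for element in soup.find_all(tag):
            value = element.get(attr)
            if value and not value.startswith(('data:', 'javascript:', '#')):
                assets.add(urljoin(base_url, value))  # resolve relative URLs
    return assets
```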
75 |
76 | ### 5. Asset Downloader
77 | - **Purpose**: Downloads all discovered assets
78 | - **Key Functions**: `download_asset()`
79 | - **Features**: Handles different asset types, resolves relative URLs, manages retries
80 |
81 | ### 6. Zip File Creator
82 | - **Purpose**: Packages all assets into a downloadable zip file
83 | - **Key Functions**: `create_zip_file()`
84 | - **Features**: Organizes assets by type, handles file naming, adds metadata and documentation
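
The packaging step is standard-library `zipfile` work. A condensed sketch of the idea behind `create_zip_file()` (the folder layout shown is illustrative, not necessarily the exact structure the app produces):

```python
import json
import zipfile
from io import BytesIO

def create_zip_sketch(html, assets_by_type, metadata):
    """assets_by_type maps a type ('css', 'js', ...) to {filename: content_bytes}."""
    buffer = BytesIO()
    with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
        archive.writestr('index.html', html)
        archive.writestr('metadata.json', json.dumps(metadata, indent=2))
        for asset_type, files in assets_by_type.items():
            for name, content in files.items():
                archive.writestr(f'{asset_type}/{name}', content)
    buffer.seek(0)
    return buffer
```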
85 |
86 | ## Process Flow
87 |
88 | 1. **User Submits URL**:
89 | - User enters a URL in the web interface
90 | - Optionally selects "Use Advanced Rendering (Selenium)"
91 | - Submits the form to the `/extract` endpoint
92 |
93 | 2. **Content Acquisition**:
94 | - If Selenium is selected: Uses Chrome WebDriver to render the page
95 | - Otherwise: Uses Requests library for HTTP retrieval
96 | - Handles redirects, errors, retries with different headers
97 |
98 | 3. **HTML Processing**:
99 | - Parses HTML using BeautifulSoup
100 | - Fixes relative URLs
101 | - Extracts metadata (title, description, etc.)
102 | - Identifies UI components
103 |
104 | 4. **Asset Discovery**:
105 | - Finds all linked resources (CSS, JS, images, fonts, etc.)
106 | - Resolves URLs
107 | - Categorizes assets by type
108 | - Handles duplicates
109 |
110 | 5. **Asset Download**:
111 | - Downloads all discovered assets
112 | - Handles binary vs. text content
113 | - Manages errors and retries
114 |
115 | 6. **Zip Creation**:
116 | - Creates organized folder structure
117 | - Adds README and metadata
118 | - Creates component index
119 | - Packages everything into a ZIP file
120 |
121 | 7. **User Download**:
122 | - Returns the ZIP file as a downloadable attachment
123 | - Manages temporary file cleanup
124 |
125 | ## Challenges & Error Patterns
126 |
127 | ### Common Failure Points
128 |
129 | 1. **Selenium WebDriver Initialization**:
130 | - Error seen in logs: `Error initializing Chrome WebDriver: [Errno 8] Exec format error`
131 | - Cause: WebDriver executable permission or architecture mismatch
132 |    - Fallback: An alternative initialization method is attempted (see the sketch after this list)
133 |
134 | 2. **CDN and Image Processing URLs**:
135 | - Error seen: `Failed to download https://www.tesla.com/q_auto/Homepage-New-Legacy-Model-Y-Desktop.png, status: 404`
136 | - Cause: URLs contain transformation parameters (`q_auto`, `f_auto`) that are processed by CDNs and don't represent actual file paths
137 |
138 | 3. **Theme and Framework Resources**:
139 | - Error seen: `Failed to download https://www.tesla.com/themes/contrib/stable/images/core/throbber-active.gif, status: 404`
140 | - Cause: Theme resources may be generated dynamically or have access restrictions
141 |
142 | 4. **Anti-Bot Measures**:
143 | - Some sites implement anti-scraping measures (403 Forbidden responses)
144 | - Application implements header rotation and Selenium fallback to mitigate this
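
The fallback referenced in point 1 amounts to a try/except chain: prefer the webdriver-manager binary, then fall back to a driver Selenium can resolve on its own. A hedged sketch of the pattern (not the exact code in `app.py`):

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def build_driver_sketch(options: Options):
    try:
        # Preferred: let webdriver-manager fetch a matching ChromeDriver build
        service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=options)
    except Exception as exc:  # e.g. "[Errno 8] Exec format error" on an architecture mismatch
        print(f"webdriver-manager initialization failed: {exc}")
        # Fallback: let Selenium Manager (bundled with Selenium 4.6+) locate a driver
        return webdriver.Chrome(options=options)
```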
145 |
146 | ## Improvement Opportunities
147 |
148 | 1. **URL Processing**: Enhance URL normalization to better handle CDN-specific parameters (see the sketch below)
149 | 2. **Asset Deduplication**: Improve handling of duplicate assets with different query parameters
150 | 3. **Error Handling**: Add more targeted error handling for specific CDN formats
151 | 4. **WebDriver Management**: Improve Selenium WebDriver initialization reliability
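
For item 1, one possible direction is stripping known image-transformation path segments (such as Cloudinary-style `q_auto`/`f_auto`) before attempting a download. This is a speculative sketch of the idea, not existing behavior:

```python
from urllib.parse import urlparse, urlunparse

# Path segments some image CDNs use as transformation flags rather than real directories
TRANSFORM_SEGMENTS = {'q_auto', 'f_auto', 'c_fill', 'w_auto'}

def normalize_cdn_url_sketch(url):
    parsed = urlparse(url)
    segments = [s for s in parsed.path.split('/')
                if s and s.split(',')[0] not in TRANSFORM_SEGMENTS]
    return urlunparse(parsed._replace(path='/' + '/'.join(segments)))

# normalize_cdn_url_sketch("https://www.tesla.com/q_auto/Homepage-New-Legacy-Model-Y-Desktop.png")
# -> "https://www.tesla.com/Homepage-New-Legacy-Model-Y-Desktop.png"
```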
152 |
153 | ## Technical Dependencies
154 |
155 | - **Flask**: Web framework
156 | - **Requests**: HTTP client
157 | - **BeautifulSoup**: HTML parsing
158 | - **Selenium**: Browser automation
159 | - **cssutils**: CSS parsing
160 | - **zipfile**: ZIP file creation
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Website Extractor
2 |
3 | 
4 | [MIT License](https://opensource.org/licenses/MIT)
5 |
6 | ## Overview
7 |
8 | Website Extractor is a powerful Python-based tool that allows you to download and archive entire websites with a single click. This application extracts HTML, CSS, JavaScript, images, fonts, and other assets from any website, making it ideal for:
9 |
10 | - Creating pixel-perfect copies of any website online
11 | - Training AI agents with real-world web content
12 | - Studying website structure and design
13 | - Extracting UI components for design inspiration
14 | - Archiving web content for research
15 | - Learning web development techniques
16 |
17 | The application features advanced rendering capabilities using Selenium, allowing it to properly extract assets from modern JavaScript-heavy websites and single-page applications.
18 |
19 | 
20 |
21 | ## Features
22 |
23 | - **Advanced Rendering**: Uses Selenium with Chrome WebDriver to render JavaScript-heavy sites
24 | - **Comprehensive Asset Extraction**: Downloads HTML, CSS, JavaScript, images, fonts, and more
25 | - **Metadata Extraction**: Captures site metadata, OpenGraph tags, and structured data
26 | - **UI Component Analysis**: Identifies and extracts UI components like headers, navigation, cards, etc.
27 | - **Organized Output**: Creates a well-structured ZIP file with assets organized by type
28 | - **Responsive Design**: Works with both desktop and mobile websites
29 | - **CDN Support**: Handles assets from various Content Delivery Networks
30 | - **Modern Framework Support**: Special handling for React, Next.js, Angular, and Tailwind CSS
31 |
32 | ## Advanced Use Cases
33 |
34 | ### Pixel-Perfect Website Copies
35 | Create exact replicas of websites for study, testing, or inspiration. The advanced rendering engine ensures even complex layouts and JavaScript-driven designs are faithfully reproduced.
36 |
37 | ### AI Agent Training
38 | Extract websites to create high-quality training data for your AI agents:
39 | - Feed the structured content to AI models to improve their understanding of web layouts
40 | - Train AI assistants on real-world UI components and design patterns
41 | - Create diverse datasets of web content for machine learning projects
42 |
43 | ### Cursor IDE Integration
44 | Website Extractor works seamlessly with Cursor IDE:
45 | - Extract a website and open it directly in Cursor for code analysis
46 | - Edit the extracted code with Cursor's AI-powered assistance
47 | - Use the components as reference for your own projects
48 | - Ask Cursor to analyze the site's structure and styles to apply similar patterns to your work
49 |
50 | ### Design Inspiration & Reference
51 | Upload the extracted folder to your current project and:
52 | - Ask Cursor to reference its style when building new pages
53 | - Study professional UI implementations
54 | - Extract specific components for reuse in your own projects
55 | - Learn modern CSS techniques from production websites
56 |
57 | ## Installation
58 |
59 | ### Prerequisites
60 |
61 | - Python 3.7+
62 | - Chrome/Chromium browser (for advanced rendering)
63 | - Git
64 |
65 | ### Using Cursor (Recommended)
66 |
67 | 1. Clone the repository:
68 | ```bash
69 | git clone https://github.com/sirioberati/WebTwin.git
70 | cd WebTwin
71 | ```
72 |
73 | 2. Open the project in Cursor IDE:
74 | ```bash
75 | cursor .
76 | ```
77 |
78 | 3. Create a virtual environment (within Cursor's terminal):
79 | ```bash
80 | python -m venv venv
81 | ```
82 |
83 | 4. Activate the virtual environment:
84 | - On Windows: `venv\Scripts\activate`
85 | - On macOS/Linux: `source venv/bin/activate`
86 |
87 | 5. Install dependencies:
88 | ```bash
89 | pip install -r requirements.txt
90 | ```
91 |
92 | ### Manual Installation
93 |
94 | 1. Clone the repository:
95 | ```bash
96 | git clone https://github.com/sirioberati/WebTwin.git
97 | cd WebTwin
98 | ```
99 |
100 | 2. Create a virtual environment:
101 | ```bash
102 | python -m venv venv
103 | ```
104 |
105 | 3. Activate the virtual environment:
106 | - On Windows: `venv\Scripts\activate`
107 | - On macOS/Linux: `source venv/bin/activate`
108 |
109 | 4. Install dependencies:
110 | ```bash
111 | pip install -r requirements.txt
112 | ```
113 |
114 | ## Usage
115 |
116 | 1. Activate your virtual environment (if not already activated)
117 |
118 | 2. Run the application:
119 | ```bash
120 | python app.py
121 | ```
122 |
123 | 3. Open your browser and navigate to:
124 | ```
125 | http://127.0.0.1:5001
126 | ```
127 |
128 | 4. Enter the URL of the website you want to extract
129 |
130 | 5. Check "Use Advanced Rendering (Selenium)" for JavaScript-heavy websites
131 |
132 | 6. Click "Extract Website" and wait for the download to complete
133 |
134 | ### Using Advanced Rendering
135 |
136 | The advanced rendering option uses Selenium with Chrome WebDriver to:
137 | - Execute JavaScript
138 | - Render dynamic content
139 | - Scroll through the page to trigger lazy loading
140 | - Click on UI elements to expose hidden content
141 | - Extract resources loaded by JavaScript frameworks
142 |
143 | This option is recommended for modern websites, especially those built with React, Angular, Vue, or other JavaScript frameworks.
144 |
145 | ### Using with Cursor IDE
146 |
147 | After extracting a website:
148 |
149 | 1. Unzip the downloaded file to a directory
150 | 2. Open with Cursor IDE:
151 | ```bash
152 | cursor /path/to/extracted/website
153 | ```
154 | 3. Explore the code structure and assets
155 | 4. Ask Cursor AI to analyze the code with prompts like:
156 | - "Explain the CSS structure of this website"
157 | - "How can I implement a similar hero section in my project?"
158 | - "Analyze this navigation component and create a similar one for my React app"
159 |
160 | ## AI Agent Integration
161 |
162 | WebTwin can be a powerful tool when combined with AI agents, enabling sophisticated workflows for code analysis, design extraction, and content repurposing.
163 |
164 | ### Integration with Cursor AI
165 |
166 | Cursor's AI capabilities can be supercharged with WebTwin's extraction abilities:
167 |
168 | 1. **Extract and Modify Workflow**:
169 | ```
170 | WebTwin → Extract Site → Open in Cursor → Ask AI to Modify
171 | ```
172 | Example prompts:
173 | - "Convert this landing page to use Tailwind CSS instead of Bootstrap"
174 | - "Refactor this JavaScript code to use React hooks"
175 | - "Simplify this complex CSS layout while maintaining the same visual appearance"
176 |
177 | 2. **Component Library Creation**:
178 | ```
179 | WebTwin → Extract Multiple Sites → Open in Cursor → AI-Powered Component Extraction
180 | ```
181 | Example prompts:
182 | - "Extract all button styles from these websites and create a unified component library"
183 | - "Analyze these navigation patterns and create a best-practices implementation"
184 |
185 | 3. **Learn from Production Code**:
186 | ```
187 | WebTwin → Extract Complex Site → Cursor AI Analysis → Generate Tutorial
188 | ```
189 | Example prompts:
190 | - "Explain how this site implements its responsive design strategy"
191 | - "Show me how this animation effect works and help me implement something similar"
192 |
193 | ### Integration with OpenAI Assistants API & Agent SDK
194 |
195 | WebTwin can be integrated with the OpenAI Assistants API and Agent SDK to create specialized AI agents:
196 |
197 | 1. **Setup a Website Analysis Agent**:
198 | ```python
199 | from openai import OpenAI
200 |
201 | client = OpenAI(api_key="your-api-key")
202 |
203 | # Create an assistant specialized in web design analysis
204 | assistant = client.beta.assistants.create(
205 | name="WebDesignAnalyzer",
206 | instructions="You analyze websites extracted by WebTwin and provide design insights.",
207 | model="gpt-4-turbo",
208 | tools=[{"type": "file_search"}]
209 | )
210 |
211 | # Upload the extracted website files
212 | file = client.files.create(
213 | file=open("extracted_website.zip", "rb"),
214 | purpose="assistants"
215 | )
216 |
217 | # Create a thread with the file
218 | thread = client.beta.threads.create(
219 | messages=[
220 | {
221 | "role": "user",
222 | "content": "Analyze this website's design patterns and component structure",
223 | "file_ids": [file.id]
224 | }
225 | ]
226 | )
227 |
228 | # Run the assistant on the thread
229 | run = client.beta.threads.runs.create(
230 | thread_id=thread.id,
231 | assistant_id=assistant.id
232 | )
233 | ```
234 |
235 | 2. **Create a Website Transformation Pipeline**:
236 | ```
237 | WebTwin → Extract Site → OpenAI Agent Processes → Generate New Code
238 | ```
239 |
240 | 3. **Build a Web Design Critique Agent**:
241 | - Feed WebTwin extractions to an AI agent trained to evaluate design principles
242 | - Receive detailed feedback on accessibility, usability, and visual design
243 |
244 | ### Advanced Agent Workflows
245 |
246 | Combine WebTwin with AI agents for advanced workflows:
247 |
248 | 1. **Cross-Site Design Pattern Analysis**:
249 | - Extract multiple sites in the same industry
250 | - Use AI to identify common patterns and best practices
251 | - Generate a report on industry-standard approaches
252 |
253 | 2. **Automated Component Library Generation**:
254 | - Extract multiple sites
255 | - Use AI to identify and categorize UI components
256 | - Generate a unified component library with documentation
257 |
258 | 3. **SEO and Content Strategy Analysis**:
259 | - Extract content-rich websites
260 | - Use AI to analyze content structure, metadata, and keyword usage
261 | - Generate SEO recommendations and content strategy insights
262 |
263 | 4. **Competitive Analysis**:
264 | - Extract competitor websites
265 | - Use AI to compare features, UX patterns, and technical implementations
266 | - Generate a competitive analysis report with strengths and weaknesses
267 |
268 | ## Architecture
269 |
270 | The application is built with a modular architecture designed for flexibility and performance:
271 |
272 | ```
273 | ┌───────────────────────────────────────────────────────────────────┐
274 | │ Website Extractor Application │
275 | └───────────────────────────────────────────────────────────────────┘
276 | │
277 | ▼
278 | ┌───────────────────────────────────────────────────────────────────┐
279 | │ Flask Web Server │
280 | └───────────────────────────────────────────────────────────────────┘
281 | │
282 | ▼
283 | ┌───────────────────────────────────────────────────────────────────┐
284 | │ Extraction Core Processes │
285 | ├───────────────┬──────────────────┬──────────────────┬─────────────┤
286 | │ HTTP Client │ Selenium Renderer │ Content Parser │ Asset Saver │
287 | │ (requests) │ (WebDriver) │ (BeautifulSoup) │ (Zip) │
288 | └───────────────┴──────────────────┴──────────────────┴─────────────┘
289 | ```
290 |
291 | ### Data Flow
292 |
293 | ```
294 | ┌──────────┐ URL ┌──────────┐ HTML Content ┌──────────────┐
295 | │ User │───────────▶│ Extractor│───────────────▶│ HTML Parser │
296 | └──────────┘ └──────────┘ └──────────────┘
297 | │ │
298 | Rendering │ │ Asset URLs
299 | option │ │
300 | ▼ ▼
301 | ┌──────────┐ ┌──────────────┐
302 | │ Selenium │ │ Asset │
303 | │ WebDriver│ │ Downloader │
304 | └──────────┘ └──────────────┘
305 | │ │
306 | Rendered│ Assets │
307 | HTML │ │
308 | ▼ ▼
309 | ┌──────────────────────────────────────────┐
310 | │ Zip File Creator │
311 | └──────────────────────────────────────────┘
312 | │
313 | ▼
314 | ┌──────────────────────────────────────────┐
315 | │ File Download Response to User │
316 | └──────────────────────────────────────────┘
317 | ```
318 |
319 | ### Key Components
320 |
321 | 1. **Flask Web Server**: Provides the user interface and handles HTTP requests
322 | 2. **HTTP Client**: Makes requests to fetch website content using the Requests library
323 | 3. **Selenium Renderer**: Optional component for JavaScript rendering and dynamic content
324 | 4. **Content Parser**: Analyzes HTML to extract assets and structure using BeautifulSoup
325 | 5. **Asset Downloader**: Downloads all discovered assets with sophisticated retry logic
326 | 6. **ZIP Creator**: Packages everything into an organized downloadable archive
327 |
328 | ### Processing Stages
329 |
330 | 1. **URL Submission**: User provides a URL and rendering options
331 | 2. **Content Acquisition**: HTML content is fetched (with or without JavaScript rendering)
332 | 3. **Structure Analysis**: HTML is parsed and analyzed for assets and components
333 | 4. **Asset Discovery**: All linked resources are identified and categorized
334 | 5. **Parallel Downloading**: Assets are downloaded with optimized concurrent requests
335 | 6. **Organization & Packaging**: Files are organized and compressed into a ZIP archive
336 |
337 | For more detailed technical information, see [app_architecture.md](app_architecture.md).
338 |
339 | ## Limitations
340 |
341 | - Some websites implement anti-scraping measures that may block extraction
342 | - Content requiring authentication may not be accessible
343 | - Very large websites may time out or require multiple extraction attempts
344 | - Some CDN-specific URL formats may fail to download (especially those with transformation parameters)
345 |
346 | ## License
347 |
348 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
349 |
350 | ## Author
351 |
352 | Created by Sirio Berati
353 |
354 | - Instagram: [@heysirio](https://instagram.com/heysirio)
355 | - Instagram: [@siriosagents](https://instagram.com/siriosagents)
356 |
357 | ## Contributing
358 |
359 | Contributions are welcome! Please feel free to submit a Pull Request.
360 |
361 | 1. Fork the repository
362 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
363 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
364 | 4. Push to the branch (`git push origin feature/amazing-feature`)
365 | 5. Open a Pull Request
366 |
367 | ## Acknowledgments
368 |
369 | - This project uses [Flask](https://flask.palletsprojects.com/) for the web framework
370 | - [Selenium](https://www.selenium.dev/) for advanced rendering
371 | - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for HTML parsing
372 | - All the open source libraries that made this project possible
373 |
--------------------------------------------------------------------------------
/templates/components.html:
--------------------------------------------------------------------------------
1 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | Extracted Components Viewer
13 |
14 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
Extracted Components
49 |
50 | {% if extracted_url is defined and extracted_url %}
51 | Components extracted from: {{ extracted_url }}
52 | {% else %}
53 | Browse and inspect all extracted UI components from the website
54 | {% endif %}
55 |
56 |
57 |
58 | Back to Main Page
59 |
60 |
61 |
62 |
63 |
64 |
67 |
How to Use This Viewer
68 |
69 |
70 | - Browse through the component categories below
71 | - Click on any component to view its preview
72 | - Use the "View Code" button to see the HTML structure
73 | - Copy the code to use in your own projects
74 | - Click "View in Context" to see how the component appears on the original page
75 |
76 |
77 |
78 |
79 |
80 |
81 |
84 |
Navigation
85 |
86 |
Headers, menus, and navigation bars
87 |
88 | {{ navigation_count }} components found
89 |
90 |
91 |
92 |
93 |
94 |
97 |
Hero Sections
98 |
99 |
Main banners and hero areas
100 |
101 | {{ hero_count }} components found
102 |
103 |
104 |
105 |
106 |
107 |
110 |
Cards
111 |
112 |
Product cards, info cards, and pricing cards
113 |
114 | {{ card_count }} components found
115 |
116 |
117 |
118 |
119 |
120 |
123 |
Sections
124 |
125 |
Content sections and feature blocks
126 |
127 | {{ section_count }} components found
128 |
129 |
130 |
131 |
132 |
133 |
136 |
Forms
137 |
138 |
Contact forms, sign-up forms, and inputs
139 |
140 | {{ form_count }} components found
141 |
142 |
143 |
144 |
145 |
146 |
149 |
Footers
150 |
151 |
Page footers and bottom sections
152 |
153 | {{ footer_count }} components found
154 |
155 |
156 |
157 |
158 |
159 |
162 |
Store Components
163 |
NEW
164 |
165 |
Product listings, filters, and store layouts
166 |
167 | {{ store_count }} components found
168 |
169 |
170 |
171 |
172 |
173 |
176 |
Mobile Components
177 |
NEW
178 |
179 |
Mobile-specific UI elements and responsive components
180 |
181 | {{ mobile_count }} components found
182 |
183 |
184 |
185 |
186 |
187 |
190 |
Cart Components
191 |
NEW
192 |
193 |
Shopping cart elements and checkout flows
194 |
195 | {{ cart_count }} components found
196 |
197 |
198 |
199 |
200 |
201 |
Metadata
202 |
203 |
204 |
Page Title
205 |
{{ page_title }}
206 |
207 |
208 |
209 |
Description
210 |
{{ meta_description }}
211 |
212 |
213 |
214 |
Keywords
215 |
216 | {% for keyword in meta_keywords %}
217 | {{ keyword }}
218 | {% endfor %}
219 |
220 |
221 |
222 |
223 |
Open Graph
224 |
225 |
Title: {{ og_title }}
226 |
Description: {{ og_description }}
227 |
Image: {{ og_image }}
228 |
229 |
230 |
231 |
232 |
233 |
234 |
Framework Configuration
235 |
236 |
237 |
238 |
241 |
Next.js Configuration
242 |
NEW
243 |
244 |
245 |
{{ next_config }}
246 |
247 |
248 |
249 |
250 |
251 |
254 |
Tailwind Configuration
255 |
NEW
256 |
257 |
258 |
{{ tailwind_config }}
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
Component Previews
267 |
268 | {% if components|length > 0 %}
269 | {% for component in components %}
270 |
271 |
272 |
273 |
{{ component.name }}
274 |
{{ component.type }}
275 |
276 |
285 |
286 |
287 |
288 |
289 | {{ component.html|safe }}
290 |
291 |
292 |
293 |
294 |
{{ component.code }}
295 |
296 |
297 | {% endfor %}
298 | {% else %}
299 |
300 |
301 |
306 |
307 |
308 | No components were found in the extracted website. This could be because:
309 |
310 |
311 | - The website uses a complex structure that's difficult to extract
312 | - The website uses custom components that don't match our extraction patterns
313 | - You're viewing the demo components instead of an actual extraction
314 |
315 |
316 | Try extracting a different website or check the ZIP file for the complete website clone.
317 |
318 |
319 |
320 |
321 | {% endif %}
322 |
323 |
324 |
325 |
326 |
340 |
341 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Website Extractor - Pixel Perfect Clone
7 |
8 |
28 |
29 |
30 |
31 |
32 |
33 |
Website Extractor
34 |
35 |
Create pixel-perfect clones of any website
36 |
37 |
38 |
62 |
63 |
64 |
65 |
68 |
69 |
Extracting website assets...
70 |
73 |
Initializing...
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
86 |
Website extracted successfully!
87 |
88 |
89 |
Your download should start automatically. If not, click the button below:
90 |
91 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
Enhanced Extraction Features
105 |
106 | -
107 |
110 | Complete HTML DOM structure
111 |
112 | -
113 |
116 | All CSS stylesheets (external and inline)
117 |
118 | -
119 |
122 | globals.css and styling files
123 | NEW
124 |
125 | -
126 |
129 | JavaScript files and functionality
130 |
131 | -
132 |
135 | Next.js configuration and files
136 | NEW
137 |
138 | -
139 |
142 | Image configurations and assets
143 | NEW
144 |
145 | -
146 |
149 | Metadata extraction
150 | NEW
151 |
152 | -
153 |
156 | Mobile-specific components
157 | NEW
158 |
159 | -
160 |
163 | SVG graphics (both linked and inline)
164 | NEW
165 |
166 | -
167 |
170 | Video files and player components
171 | NEW
172 |
173 | -
174 |
177 | Audio files and player components
178 | NEW
179 |
180 | -
181 |
184 | Font files and font family detection
185 | NEW
186 |
187 | -
188 |
191 | GIF animations and dynamic content
192 | NEW
193 |
194 | -
195 |
198 | Screenshots of website and components
199 | PREMIUM
200 |
201 | -
202 |
205 | JavaScript-rendered content capture
206 | PREMIUM
207 |
208 |
209 |
210 |
211 |
212 |
Extracted UI Components
213 |
The tool automatically identifies and extracts key UI components for easy reuse:
214 |
215 |
216 |
217 |
218 |
221 |
Navigation
222 |
223 |
Headers, menus, and navigation bars with responsive design
224 |
225 |
226 |
227 |
228 |
231 |
Hero Sections
232 |
233 |
Eye-catching hero banners with images, text overlays, and call-to-action buttons
234 |
235 |
236 |
237 |
238 |
241 |
Store Pages
242 |
NEW
243 |
244 |
Complete store layouts with product listings, filters, and shopping functionality
245 |
246 |
247 |
248 |
249 |
252 |
Mobile Menus
253 |
NEW
254 |
255 |
Mobile-specific navigation components and responsive design elements
256 |
257 |
258 |
259 |
260 |
263 |
Product Grids
264 |
265 |
Product listings and card grids with images, pricing, and descriptions
266 |
267 |
268 |
269 |
270 |
273 |
Shopping Cart
274 |
NEW
275 |
276 |
Cart components with item listings, quantity controls, and checkout buttons
277 |
278 |
279 |
280 |
281 |
284 |
Carousels & Sliders
285 |
NEW
286 |
287 |
Image sliders, product carousels, and testimonial rotators with controls
288 |
289 |
290 |
291 |
292 |
295 |
Video Players
296 |
NEW
297 |
298 |
Custom video player components with controls and responsive design
299 |
300 |
301 |
302 |
303 |
306 |
Audio Players
307 |
NEW
308 |
309 |
Audio playback components with controls and playlist functionality
310 |
311 |
312 |
313 |
314 |
317 |
Tab Components
318 |
NEW
319 |
320 |
Tabbed interfaces with content panels and interactive navigation
321 |
322 |
323 |
324 |
325 |
328 |
Social Media
329 |
NEW
330 |
331 |
Social media links, sharing buttons, and embedded social feeds
332 |
333 |
334 |
335 |
336 |
339 |
Modals & Popups
340 |
NEW
341 |
342 |
Modals, popup dialogs, and overlay components with animations
343 |
344 |
345 |
346 |
347 |
Components Included in ZIP File
348 |
All extracted UI components are included in the downloaded ZIP file for easy access and reuse.
349 |
350 |
351 |
352 |
353 |
Framework Support
354 |
355 |
356 |
357 |
360 |
Next.js
361 |
NEW
362 |
363 |
364 | - next.config.js extraction
365 | - _app.js and _document.js
366 | - Static and dynamic routes
367 |
368 |
369 |
370 |
371 |
372 |
375 |
Tailwind CSS
376 |
NEW
377 |
378 |
379 | - tailwind.config.js extraction
380 | - Custom theme settings
381 | - Plugin configurations
382 |
383 |
384 |
385 |
386 |
387 |
390 |
React
391 |
NEW
392 |
393 |
394 | - Component structure detection
395 | - React-specific attributes
396 | - State management patterns
397 |
398 |
399 |
400 |
401 |
402 |
405 |
Vue.js
406 |
NEW
407 |
408 |
409 | - Vue component detection
410 | - vue.config.js extraction
411 | - Vue directives parsing
412 |
413 |
414 |
415 |
416 |
417 |
420 |
Angular
421 |
NEW
422 |
423 |
424 | - Angular component structure
425 | - angular.json configuration
426 | - Module detection
427 |
428 |
429 |
430 |
431 |
432 |
435 |
Bootstrap
436 |
NEW
437 |
438 |
439 | - Bootstrap component classes
440 | - Grid system extraction
441 | - Custom Bootstrap themes
442 |
443 |
444 |
445 |
446 |
447 |
450 |
SCSS/SASS
451 |
NEW
452 |
453 |
454 | - Variable definitions
455 | - Mixin extraction
456 | - Nested styles
457 |
458 |
459 |
460 |
461 |
462 |
465 |
Svelte
466 |
NEW
467 |
468 |
469 | - Svelte component format
470 | - Reactive declarations
471 | - Template structure
472 |
473 |
474 |
475 |
476 |
477 |
480 |
Material UI
481 |
NEW
482 |
483 |
484 | - Material component classes
485 | - Theme configuration
486 | - Material icons
487 |
488 |
489 |
490 |
491 |
492 |
493 |
How to Use the Clone
494 |
495 | - Extract the downloaded ZIP file
496 | - Open the
index.html file in your browser to view the static clone
497 | - Find extracted components in the
components folder
498 | - Review metadata in
metadata.json
499 | - Check
css/globals.css for global styling
500 | - Edit the HTML, CSS, and JavaScript files in Cursor to customize the design
501 | - Use the included
manifest.json file to locate specific assets
502 |
503 |
504 |
Pro Tip:
505 |
For the most accurate results, try cloning the desktop version of websites. Some sites may have anti-scraping measures that could affect the results.
506 |
507 |
508 |
509 |
510 |
511 |
804 |
805 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, request, send_file, jsonify, session, after_this_request
2 | import requests
3 | from bs4 import BeautifulSoup
4 | import os
5 | import re
6 | import json
7 | from urllib.parse import urljoin, urlparse, urlunparse, unquote, quote, parse_qs
8 | import zipfile
9 | from io import BytesIO
10 | import mimetypes
11 | import base64
12 | import cssutils
13 | import logging
14 | import uuid
15 | import random
16 | import time
17 | import urllib3
18 | import tempfile
19 | from datetime import datetime
20 | import traceback
21 | import html
22 | import shutil
23 | import threading
24 |
25 | # Try to import Selenium
26 | SELENIUM_AVAILABLE = False
27 | try:
28 | from selenium import webdriver
29 | from selenium.webdriver.chrome.options import Options
30 | from selenium.webdriver.common.by import By
31 | from selenium.webdriver.support.ui import WebDriverWait
32 | from selenium.webdriver.support import expected_conditions as EC
33 | from selenium.common.exceptions import TimeoutException, WebDriverException
34 | from selenium.webdriver.chrome.service import Service
35 | from webdriver_manager.chrome import ChromeDriverManager
36 | SELENIUM_AVAILABLE = True
37 | print("Selenium is available. Advanced rendering is enabled.")
38 | except ImportError:
39 | SELENIUM_AVAILABLE = False
40 | print("Selenium not available. Advanced rendering will be disabled.")
41 |
42 | # Suppress cssutils warnings
43 | cssutils.log.setLevel(logging.CRITICAL)
44 |
45 | app = Flask(__name__)
46 | app.secret_key = os.environ.get('SECRET_KEY', 'dev_key_for_website_extractor')
47 |
48 | def is_binary_content(content, asset_type):
49 | """Determine if content should be treated as binary or text based on asset type and content inspection"""
50 | # First check by asset type
51 | if asset_type in ['images', 'fonts', 'videos', 'audio']:
52 | return True
53 |
54 | # For potentially text-based assets, try to detect if it's binary
55 | if asset_type in ['css', 'js', 'html', 'svg', 'json', 'globals_css']:
56 | # Check if the content is bytes
57 | if not isinstance(content, bytes):
58 | return False
59 |
60 | # Try to detect if binary by checking for null bytes and high concentration of non-ASCII chars
61 | try:
62 | # Check for null bytes which indicate binary content
63 | if b'\x00' in content:
64 | return True
65 |
66 | # Sample the first 1024 bytes to determine if it's binary
67 | sample = content[:1024]
68 | text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
69 | return bool(sample.translate(None, text_chars))
70 | except:
71 | # If there's any error in detection, treat as binary to be safe
72 | return True
73 |
74 | # For anything else, just check if it's bytes
75 | return isinstance(content, bytes)
76 |
77 | def download_asset(url, base_url, headers=None, session_obj=None):
78 | """
79 | Download an asset from a URL
80 |
81 | Args:
82 | url: URL to download from
83 | base_url: Base URL of the website (for referrer)
84 | headers: Optional custom headers
85 | session_obj: Optional requests.Session object for maintaining cookies
86 |
87 | Returns:
88 | Content of the asset or None if download failed
89 | """
90 | # List of user agents to rotate through to avoid detection
91 | user_agents = [
92 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
93 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
94 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
95 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
96 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
97 | 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1'
98 | ]
99 |
100 | # Use a random user agent
101 | random_user_agent = random.choice(user_agents)
102 |
103 | if not headers:
104 | headers = {
105 | 'User-Agent': random_user_agent,
106 | 'Accept': '*/*',
107 | 'Accept-Language': 'en-US,en;q=0.9',
108 | 'Accept-Encoding': 'gzip, deflate, br',
109 | 'Connection': 'keep-alive',
110 | 'Referer': base_url,
111 | 'Sec-Fetch-Dest': 'empty',
112 | 'Sec-Fetch-Mode': 'cors',
113 | 'Sec-Fetch-Site': 'same-origin',
114 | 'Pragma': 'no-cache',
115 | 'Cache-Control': 'no-cache',
116 | }
117 | else:
118 | # Update the user agent in the provided headers
119 | headers['User-Agent'] = random_user_agent
120 |
121 | # Parse the URL to check if it's valid
122 | try:
123 | parsed_url = urlparse(url)
124 | if not parsed_url.scheme or not parsed_url.netloc:
125 | print(f"Invalid URL: {url}")
126 | return None
127 | except Exception as e:
128 | print(f"Error parsing URL {url}: {str(e)}")
129 | return None
130 |
131 | # Add a delay to avoid rate limiting
132 | time.sleep(0.1) # 100ms delay between requests
133 |
134 | # Maximum number of retries
135 | max_retries = 3
136 | retry_count = 0
137 |
138 | while retry_count < max_retries:
139 | try:
140 | # Use session if provided, otherwise make a direct request
141 | if session_obj:
142 | response = session_obj.get(
143 | url,
144 | timeout=15,
145 | headers=headers,
146 | stream=True,
147 | allow_redirects=True,
148 | verify=False # Ignore SSL certificate errors
149 | )
150 | else:
151 | response = requests.get(
152 | url,
153 | timeout=15,
154 | headers=headers,
155 | stream=True,
156 | allow_redirects=True,
157 | verify=False # Ignore SSL certificate errors
158 | )
159 |
160 | # Handle redirects
161 | if response.history:
162 | print(f"Request for {url} was redirected {len(response.history)} times to {response.url}")
163 | url = response.url # Update URL to the final destination
164 |
165 | if response.status_code == 200:
166 | # Check the Content-Type header
167 | content_type = response.headers.get('Content-Type', '')
168 | print(f"Downloaded {url} ({len(response.content)} bytes, type: {content_type})")
169 |
170 | # Check for binary content types
171 | is_binary = any(binary_type in content_type.lower() for binary_type in [
172 | 'image/', 'video/', 'audio/', 'font/', 'application/octet-stream',
173 | 'application/zip', 'application/x-rar', 'application/pdf', 'application/vnd.'
174 | ])
175 |
176 | # If binary or content-type suggests binary, return raw content
177 | if is_binary:
178 | return response.content
179 |
180 | # For text content types
181 | is_text = any(text_type in content_type.lower() for text_type in [
182 | 'text/', 'application/json', 'application/javascript', 'application/xml', 'application/xhtml'
183 | ])
184 |
185 | if is_text:
186 | # Try to determine encoding
187 | encoding = None
188 |
189 | # From Content-Type header
190 | if 'charset=' in content_type:
191 | encoding = content_type.split('charset=')[1].split(';')[0].strip()
192 |
193 | # From response encoding or apparent encoding
194 | if not encoding:
195 | encoding = response.encoding or response.apparent_encoding or 'utf-8'
196 |
197 | # Decode with specified encoding
198 | try:
199 | return response.content.decode(encoding, errors='replace').encode('utf-8')
200 | except (UnicodeDecodeError, LookupError):
201 | # If decoding fails, try utf-8
202 | try:
203 | return response.content.decode('utf-8', errors='replace').encode('utf-8')
204 | except:
205 | # If all else fails, return raw content
206 | return response.content
207 |
208 | # For unknown content types, return raw content
209 | return response.content
210 | elif response.status_code == 404:
211 | print(f"Resource not found (404): {url}")
212 | return None
213 | elif response.status_code == 403:
214 | print(f"Access forbidden (403): {url}")
215 | # Try with a different user agent on the next retry
216 | headers['User-Agent'] = random.choice(user_agents)
217 | retry_count += 1
218 | time.sleep(1) # Wait longer before retrying
219 | continue
220 | elif response.status_code >= 500:
221 | print(f"Server error ({response.status_code}): {url}")
222 | retry_count += 1
223 | time.sleep(1) # Wait longer before retrying
224 | continue
225 | else:
226 | print(f"HTTP error ({response.status_code}): {url}")
227 | return None
228 |
229 | except requests.exceptions.Timeout:
230 | print(f"Timeout error downloading {url}")
231 | retry_count += 1
232 | time.sleep(1)
233 | continue
234 | except requests.exceptions.ConnectionError:
235 | print(f"Connection error downloading {url}")
236 | retry_count += 1
237 | time.sleep(1)
238 | continue
239 | except requests.exceptions.TooManyRedirects:
240 | print(f"Too many redirects for {url}")
241 | return None
242 | except Exception as e:
243 | print(f"Error downloading {url}: {str(e)}")
244 | return None
245 |
246 | if retry_count == max_retries:
247 | print(f"Max retries reached for {url}")
248 |
249 | return None
250 |
251 | def get_asset_type(url):
252 | """Determine the type of asset from the URL"""
253 | # Handle empty or None URLs
254 | if not url:
255 | return 'other'
256 |
257 | url_lower = url.lower()
258 |
259 | # Framework-specific patterns
260 | if '_next/static' in url_lower:
261 | if '.css' in url_lower or 'styles' in url_lower:
262 | return 'css'
263 | return 'js' # Default to JS for Next.js assets
264 |
265 | if 'chunk.' in url_lower or 'webpack' in url_lower:
266 | return 'js' # Webpack chunks
267 |
268 | if 'angular' in url_lower and '.js' in url_lower:
269 | return 'js' # Angular bundles
270 |
271 | # Handle CSS files
272 | if url_lower.endswith(('.css', '.scss', '.less', '.sass')):
273 | return 'css'
274 | if 'global.css' in url_lower or 'globals.css' in url_lower or 'tailwind' in url_lower:
275 | return 'css'
276 | if 'fonts.googleapis.com' in url_lower:
277 | return 'css'
278 | if 'styles' in url_lower and '.css' in url_lower:
279 | return 'css'
280 |
281 | # Handle JS files
282 | if url_lower.endswith(('.js', '.jsx', '.mjs', '.ts', '.tsx', '.cjs')):
283 | return 'js'
284 | if 'bundle.js' in url_lower or 'main.js' in url_lower or 'app.js' in url_lower:
285 | return 'js'
286 | if 'polyfill' in url_lower or 'runtime' in url_lower or 'vendor' in url_lower:
287 | return 'js'
288 | if 'image-config' in url_lower or 'image.config' in url_lower:
289 | return 'js'
290 |
291 | # Handle image files
292 | if url_lower.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.avif', '.bmp', '.ico')):
293 | return 'img'
294 | if '/images/' in url_lower or '/img/' in url_lower or '/assets/images/' in url_lower:
295 | return 'img'
296 |
297 | # Handle font files
298 | if url_lower.endswith(('.woff', '.woff2', '.ttf', '.otf', '.eot')):
299 | return 'fonts'
300 | if '/fonts/' in url_lower or 'font-awesome' in url_lower:
301 | return 'fonts'
302 |
303 | # Handle media files
304 | if url_lower.endswith(('.mp4', '.webm', '.ogg', '.avi', '.mov', '.flv')):
305 | return 'videos'
306 | if url_lower.endswith(('.mp3', '.wav', '.ogg', '.aac')):
307 | return 'audio'
308 |
309 | # Handle favicon
310 | if url_lower.endswith(('.ico', '.icon')):
311 | return 'favicons'
312 | if 'favicon' in url_lower:
313 | return 'favicons'
314 |
315 | # Handle special API endpoints
316 | if 'graphql' in url_lower or 'api.' in url_lower:
317 | return 'js'
318 |
319 | # Try to guess based on URL structure
320 | if '/css/' in url_lower:
321 | return 'css'
322 | if '/js/' in url_lower or '/scripts/' in url_lower:
323 | return 'js'
324 | if '/static/' in url_lower and not any(ext in url_lower for ext in ['.css', '.js', '.png', '.jpg']):
325 | # For static assets with unclear type, check the URL itself
326 | if 'style' in url_lower:
327 | return 'css'
328 | return 'js' # Default for static assets
329 |
330 | # For CDN resources, try to determine type from the host
331 | cdn_hosts = ['cdn.jsdelivr.net', 'unpkg.com', 'cdnjs.cloudflare.com']
332 | for host in cdn_hosts:
333 | if host in url_lower:
334 | if any(lib in url_lower for lib in ['react', 'angular', 'vue', 'jquery']):
335 | return 'js'
336 | if any(lib in url_lower for lib in ['bootstrap', 'tailwind', 'material', 'font']):
337 | return 'css'
338 |
339 | # Default to JS for unknown extensions
340 | return 'js'
341 |
342 | def extract_metadata(soup, base_url):
343 | """Extract metadata from the HTML"""
344 | metadata = {
345 | 'title': '',
346 | 'description': '',
347 | 'keywords': '',
348 | 'og_tags': {},
349 | 'twitter_cards': {},
350 | 'canonical': '',
351 | 'language': '',
352 | 'favicon': '',
353 | 'structured_data': []
354 | }
355 |
356 | # Extract title
357 | title_tag = soup.find('title')
358 | if title_tag and title_tag.string:
359 | metadata['title'] = title_tag.string.strip()
360 |
361 | # Extract meta tags
362 | meta_tags = soup.find_all('meta')
363 | for tag in meta_tags:
364 | # Description
365 | if tag.get('name') == 'description' and tag.get('content'):
366 | metadata['description'] = tag.get('content').strip()
367 |
368 | # Keywords
369 | elif tag.get('name') == 'keywords' and tag.get('content'):
370 | metadata['keywords'] = tag.get('content').strip()
371 |
372 | # OpenGraph tags
373 | elif tag.get('property') and tag.get('property').startswith('og:') and tag.get('content'):
374 | prop = tag.get('property')[3:] # Remove 'og:' prefix
375 | metadata['og_tags'][prop] = tag.get('content').strip()
376 |
377 | # Twitter card tags
378 | elif tag.get('name') and tag.get('name').startswith('twitter:') and tag.get('content'):
379 | prop = tag.get('name')[8:] # Remove 'twitter:' prefix
380 | metadata['twitter_cards'][prop] = tag.get('content').strip()
381 |
382 | # Extract canonical URL
383 | canonical_tag = soup.find('link', {'rel': 'canonical'})
384 | if canonical_tag and canonical_tag.get('href'):
385 | canonical_url = canonical_tag.get('href')
386 | if not canonical_url.startswith(('http://', 'https://')):
387 | canonical_url = urljoin(base_url, canonical_url)
388 | metadata['canonical'] = canonical_url
389 |
390 | # Extract language
391 | html_tag = soup.find('html')
392 | if html_tag and html_tag.get('lang'):
393 | metadata['language'] = html_tag.get('lang')
394 |
395 | # Extract favicon
396 | favicon_tag = soup.find('link', {'rel': 'icon'}) or soup.find('link', {'rel': 'shortcut icon'})
397 | if favicon_tag and favicon_tag.get('href'):
398 | favicon_url = favicon_tag.get('href')
399 | if not favicon_url.startswith(('http://', 'https://')):
400 | favicon_url = urljoin(base_url, favicon_url)
401 | metadata['favicon'] = favicon_url
402 |
403 | # Extract structured data (JSON-LD)
404 | script_tags = soup.find_all('script', {'type': 'application/ld+json'})
405 | for tag in script_tags:
406 | if tag.string:
407 | try:
408 | json_data = json.loads(tag.string)
409 | metadata['structured_data'].append(json_data)
410 | except json.JSONDecodeError:
411 | pass
412 |
413 | return metadata
414 |
415 | def get_component_type(element):
416 | """Determine the type of UI component based on element attributes and classes"""
417 | if not element:
418 | return None
419 |
420 | # Get tag name, classes, and ID
421 | tag_name = element.name
422 | class_list = element.get('class', [])
423 | if class_list and not isinstance(class_list, list):
424 | class_list = [class_list]
425 | class_str = ' '.join(class_list).lower() if class_list else ''
426 | element_id = element.get('id', '').lower()
427 |
428 | # Get element role
429 | role = element.get('role', '').lower()
430 |
431 | # Navigation components
432 | if tag_name == 'nav' or role == 'navigation' or 'nav' in class_str or 'navigation' in class_str or 'menu' in class_str or element_id in ['nav', 'navigation', 'menu']:
433 | return 'navigation'
434 |
435 | # Header components
436 | if tag_name == 'header' or role == 'banner' or 'header' in class_str or 'banner' in class_str or element_id in ['header', 'banner']:
437 | return 'header'
438 |
439 | # Footer components
440 | if tag_name == 'footer' or role == 'contentinfo' or 'footer' in class_str or element_id == 'footer':
441 | return 'footer'
442 |
443 | # Hero/banner components
444 | if 'hero' in class_str or 'banner' in class_str or 'jumbotron' in class_str or 'showcase' in class_str or element_id in ['hero', 'banner', 'jumbotron', 'showcase']:
445 | return 'hero'
446 |
447 | # Card components
448 | if 'card' in class_str or 'tile' in class_str or 'item' in class_str or element_id in ['card', 'tile']:
449 | return 'card'
450 |
451 | # Form components
452 | if tag_name == 'form' or role == 'form' or 'form' in class_str or element_id == 'form':
453 | return 'form'
454 |
455 | # CTA (Call to Action) components
456 | if 'cta' in class_str or 'call-to-action' in class_str or 'action' in class_str or element_id in ['cta', 'call-to-action']:
457 | return 'cta'
458 |
459 | # Sidebar components
460 | if 'sidebar' in class_str or 'side-bar' in class_str or element_id in ['sidebar', 'side-bar']:
461 | return 'sidebar'
462 |
463 | # Modal/Dialog components
464 | if role == 'dialog' or 'modal' in class_str or 'dialog' in class_str or 'popup' in class_str or element_id in ['modal', 'dialog', 'popup']:
465 | return 'modal'
466 |
467 | # Section components
468 | if tag_name == 'section' or role == 'region' or 'section' in class_str:
469 | return 'section'
470 |
471 | # Mobile components
472 | if 'mobile' in class_str or 'smartphone' in class_str or 'mobile-only' in class_str:
473 | return 'mobile'
474 |
475 | # Store/Product components
476 | if 'product' in class_str or 'store' in class_str or 'shop' in class_str or 'pricing' in class_str:
477 | return 'store'
478 |
479 | # Cart components
480 | if 'cart' in class_str or 'basket' in class_str or 'shopping-cart' in class_str or element_id in ['cart', 'basket', 'shopping-cart']:
481 | return 'cart'
482 |
483 | # If no specific type is identified, check if the element is a major container
484 | if tag_name in ['div', 'section', 'article'] and ('container' in class_str or 'wrapper' in class_str or 'content' in class_str):
485 | return 'container'
486 |
487 | # Default to unknown if no specific type is identified
488 | return 'other'
489 |
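# Illustrative examples for the classifier above (checks run top-down, so an element that
# matches several rules gets the first label; the snippets are made up for demonstration):
#   get_component_type(BeautifulSoup('<nav class="main-nav"></nav>', 'html.parser').nav)  -> 'navigation'
#   get_component_type(BeautifulSoup('<div class="hero"></div>', 'html.parser').div)      -> 'hero'
#   get_component_type(BeautifulSoup('<div class="wrapper"></div>', 'html.parser').div)   -> 'container'
# Anything that matches no rule falls through to 'other'.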
490 | def extract_component_structure(soup):
491 | """Extract UI components from the HTML structure"""
492 | if not soup:
493 | return {}
494 |
495 | components = {
496 | 'navigation': [],
497 | 'header': [],
498 | 'footer': [],
499 | 'hero': [],
500 | 'card': [],
501 | 'form': [],
502 | 'cta': [],
503 | 'sidebar': [],
504 | 'modal': [],
505 | 'section': [],
506 | 'store': [],
507 | 'mobile': [],
508 | 'cart': []
509 | }
510 |
511 | # Helper function to convert element to HTML string
512 | def element_to_html(element):
513 | return str(element)
514 |
515 | # Extract navigation components
516 | nav_elements = soup.find_all(['nav']) + soup.find_all(role='navigation') + soup.find_all(class_=lambda c: c and ('nav' in c.lower() or 'menu' in c.lower()))
517 | for element in nav_elements[:5]: # Limit to 5 to avoid excessive extraction
518 | components['navigation'].append({
519 | 'html': element_to_html(element)
520 | })
521 |
522 | # Extract header components
523 | header_elements = soup.find_all(['header']) + soup.find_all(role='banner') + soup.find_all(class_=lambda c: c and 'header' in c.lower())
524 | for element in header_elements[:2]: # Usually only 1-2 headers per page
525 | components['header'].append({
526 | 'html': element_to_html(element)
527 | })
528 |
529 | # Extract footer components
530 | footer_elements = soup.find_all(['footer']) + soup.find_all(role='contentinfo') + soup.find_all(class_=lambda c: c and 'footer' in c.lower())
531 | for element in footer_elements[:2]: # Usually only 1-2 footers per page
532 | components['footer'].append({
533 | 'html': element_to_html(element)
534 | })
535 |
536 | # Extract hero/banner components
537 | hero_elements = soup.find_all(class_=lambda c: c and ('hero' in c.lower() or 'banner' in c.lower() or 'jumbotron' in c.lower()))
538 | for element in hero_elements[:3]: # Limit to 3
539 | components['hero'].append({
540 | 'html': element_to_html(element)
541 | })
542 |
543 | # Extract card components - often these are repeated elements
544 | card_elements = soup.find_all(class_=lambda c: c and ('card' in c.lower() or 'tile' in c.lower()))
545 |
546 | # If we find many cards, just keep one of each unique structure
547 | unique_cards = {}
548 | for element in card_elements[:15]: # Examine up to 15 cards
549 | # Use a simplified structure hash to identify similar cards
550 | structure_hash = str(len(element.find_all())) # Number of child elements
551 | if structure_hash not in unique_cards:
552 | unique_cards[structure_hash] = element
553 |
554 | # Add unique cards to components
555 | for idx, element in enumerate(unique_cards.values()):
556 | if idx >= 5: # Limit to 5 unique cards
557 | break
558 | components['card'].append({
559 | 'html': element_to_html(element)
560 | })
561 |
562 | # Extract form components
563 | form_elements = soup.find_all(['form']) + soup.find_all(class_=lambda c: c and 'form' in c.lower())
564 | for element in form_elements[:3]: # Limit to 3
565 | components['form'].append({
566 | 'html': element_to_html(element)
567 | })
568 |
569 | # Extract CTA components
570 | cta_elements = soup.find_all(class_=lambda c: c and ('cta' in c.lower() or 'call-to-action' in c.lower()))
571 | for element in cta_elements[:3]: # Limit to 3
572 | components['cta'].append({
573 | 'html': element_to_html(element)
574 | })
575 |
576 | # Extract sidebar components
577 | sidebar_elements = soup.find_all(class_=lambda c: c and ('sidebar' in c.lower() or 'side-bar' in c.lower()))
578 | for element in sidebar_elements[:2]: # Limit to 2
579 | components['sidebar'].append({
580 | 'html': element_to_html(element)
581 | })
582 |
583 | # Extract modal/dialog components
584 | modal_elements = soup.find_all(role='dialog') + soup.find_all(class_=lambda c: c and ('modal' in c.lower() or 'dialog' in c.lower() or 'popup' in c.lower()))
585 | for element in modal_elements[:3]: # Limit to 3
586 | components['modal'].append({
587 | 'html': element_to_html(element)
588 | })
589 |
590 | # Extract section components
591 | section_elements = soup.find_all(['section']) + soup.find_all(role='region')
592 | # Filter to get only substantial sections
593 | substantial_sections = [element for element in section_elements if len(element.find_all()) > 3]  # Must have more than 3 child elements
594 | for element in substantial_sections[:5]: # Limit to 5
595 | components['section'].append({
596 | 'html': element_to_html(element)
597 | })
598 |
599 | # Extract mobile-specific components
600 | mobile_elements = soup.find_all(class_=lambda c: c and ('mobile' in c.lower() or 'smartphone' in c.lower() or 'mobile-only' in c.lower()))
601 | for element in mobile_elements[:3]: # Limit to 3
602 | components['mobile'].append({
603 | 'html': element_to_html(element)
604 | })
605 |
606 | # Extract store/product components
607 | store_elements = soup.find_all(class_=lambda c: c and ('product' in c.lower() or 'store' in c.lower() or 'shop' in c.lower() or 'pricing' in c.lower()))
608 | for element in store_elements[:5]: # Limit to 5
609 | components['store'].append({
610 | 'html': element_to_html(element)
611 | })
612 |
613 | # Extract cart components
614 | cart_elements = soup.find_all(class_=lambda c: c and ('cart' in c.lower() or 'basket' in c.lower() or 'shopping-cart' in c.lower()))
615 | for element in cart_elements[:2]: # Limit to 2
616 | components['cart'].append({
617 | 'html': element_to_html(element)
618 | })
619 |
620 | # Remove empty component types
621 | return {k: v for k, v in components.items() if v}
622 |
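# Shape of the value returned above (illustrative): only non-empty buckets survive the
# final filter, and every bucket is capped (5 navs, 2 headers, 2 footers, 3 heroes, ...)
# to keep the output small, e.g.
#   {'navigation': [{'html': '<nav class="main-nav">...</nav>'}],
#    'card': [{'html': '<div class="card">...</div>'}, ...]}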
623 | def extract_inline_styles(soup):
624 | """Extract all inline styles from the HTML"""
625 | inline_styles = {}
626 | elements_with_style = soup.select('[style]')
627 |
628 | for i, element in enumerate(elements_with_style):
629 | style_content = element.get('style')
630 | if style_content:
631 | class_name = f'extracted-inline-style-{i}'
632 | inline_styles[class_name] = style_content
633 | # Add the class to the element
634 | element['class'] = element.get('class', []) + [class_name]
635 | # Remove the inline style
636 | del element['style']
637 |
638 | return inline_styles
639 |
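# Illustrative before/after for extract_inline_styles (note that it mutates the soup in place):
#   <div style="color: red">Hi</div>   becomes   <div class="extracted-inline-style-0">Hi</div>
# and the returned dict maps 'extracted-inline-style-0' to 'color: red', which a caller
# could serialize into a stylesheet.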
640 | def extract_inline_javascript(soup):
641 | """Extract inline JavaScript from HTML content"""
642 | inline_js = []
643 | # Find all script tags without src attribute (inline scripts)
644 | for script in soup.find_all('script'):
645 | if not script.get('src') and script.string:
646 | inline_js.append(script.string.strip())
647 |
648 | if inline_js:
649 | return '\n\n/* --- INLINE SCRIPTS --- */\n\n'.join(inline_js)
650 | return ""
651 |
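# Illustrative result for extract_inline_javascript: a page with two inline <script>
# blocks yields one string with both bodies joined by the '/* --- INLINE SCRIPTS --- */'
# separator; a page with no inline scripts yields ''.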
652 | def extract_assets(html_content, base_url, session_obj=None, headers=None):
653 | """Extract all assets from HTML content"""
654 | assets = {
655 | 'css': [],
656 | 'js': [],
657 | 'img': [],
658 | 'fonts': [],
659 | 'videos': [],
660 | 'audio': [],
661 | 'favicons': [],
662 | 'font_families': set(),
663 | 'metadata': {},
664 | 'components': {}
665 | }
666 |
667 | if not html_content:
668 | print("Warning: Empty HTML content provided to extract_assets")
669 | return assets
670 |
671 | try:
672 | # Create BeautifulSoup object
673 | soup = BeautifulSoup(html_content, 'html.parser')
674 |
675 | if not soup or not soup.html:
676 | print("Warning: Could not parse HTML content properly")
677 | # Try with a more lenient parser
678 | soup = BeautifulSoup(html_content, 'html5lib')
679 | if not soup or not soup.html:
680 | print("Error: Failed to parse HTML with both parsers")
681 | return assets
682 |
683 | # Extract metadata
684 | try:
685 | assets['metadata'] = extract_metadata(soup, base_url)
686 | except Exception as e:
687 | print(f"Error extracting metadata: {str(e)}")
688 | traceback.print_exc()
689 |
690 | # Extract all CSS files
691 | try:
692 | css_links = soup.find_all('link', {'rel': 'stylesheet'}) or []
693 | # Also look for preload links with as="style"
694 | preload_css = soup.find_all('link', {'rel': 'preload', 'as': 'style'}) or []
695 |
696 | for link in css_links + preload_css:
697 | href = link.get('href')
698 | if href:
699 | if not href.startswith(('http://', 'https://', 'data:')):
700 | href = urljoin(base_url, href)
701 | if href.startswith(('http://', 'https://')):
702 | assets['css'].append(href)
703 | except Exception as e:
704 | print(f"Error extracting CSS links: {str(e)}")
705 |
706 | # Look for Next.js specific CSS files
707 | try:
708 | next_css = soup.find_all('link', {'data-n-g': True}) or []
709 | next_css += soup.find_all('link', {'data-n-p': True}) or []
710 | for link in next_css:
711 | href = link.get('href')
712 | if href:
713 | if not href.startswith(('http://', 'https://', 'data:')):
714 | href = urljoin(base_url, href)
715 | if href.startswith(('http://', 'https://')):
716 | assets['css'].append(href)
717 | except Exception as e:
718 | print(f"Error extracting Next.js CSS: {str(e)}")
719 |
720 | # Extract all inline styles and check for CSS imports or fonts
721 | try:
722 | style_tags = soup.find_all('style') or []
723 | for style in style_tags:
724 | style_content = style.string
725 | if style_content:
726 | # Extract @import statements
727 | import_urls = re.findall(r'@import\s+[\'"]([^\'"]+)[\'"]', style_content) or []
728 | import_urls += re.findall(r'@import\s+url\([\'"]?([^\'"|\)]+)[\'"]?\)', style_content) or []
729 |
730 | for import_url in import_urls:
731 | if not import_url.startswith(('http://', 'https://', 'data:')):
732 | import_url = urljoin(base_url, import_url)
733 | if import_url.startswith(('http://', 'https://')):
734 | assets['css'].append(import_url)
735 |
736 | # Extract font families
737 | font_families = re.findall(r'font-family:\s*[\'"]?([^\'";]+)[\'"]?', style_content) or []
738 | for family in font_families:
739 | family = family.strip().split(',')[0].strip('\'"`')
740 | if family and family.lower() not in ['serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'system-ui']:
741 | assets['font_families'].add(family)
742 | except Exception as e:
743 | print(f"Error extracting inline styles: {str(e)}")
744 |
745 | # Extract all JavaScript files
746 | try:
747 | script_tags = soup.find_all('script', {'src': True}) or []
748 | for script in script_tags:
749 | src = script.get('src')
750 | if src:
751 | if not src.startswith(('http://', 'https://', 'data:')):
752 | src = urljoin(base_url, src)
753 | if src.startswith(('http://', 'https://')):
754 | assets['js'].append(src)
755 |
756 | # Look for module scripts (common in modern frameworks)
757 | module_scripts = soup.find_all('script', {'type': 'module'}) or []
758 | for script in module_scripts:
759 | src = script.get('src')
760 | if src:
761 | if not src.startswith(('http://', 'https://', 'data:')):
762 | src = urljoin(base_url, src)
763 | if src.startswith(('http://', 'https://')):
764 | assets['js'].append(src)
765 | except Exception as e:
766 | print(f"Error extracting JavaScript: {str(e)}")
767 |
768 | # Extract all images
769 | try:
770 | # Regular img tags
771 | img_tags = soup.find_all('img') or []
772 | for img in img_tags:
773 | # Check src attribute
774 | src = img.get('src')
775 | if src:
776 | if not src.startswith(('http://', 'https://', 'data:')):
777 | src = urljoin(base_url, src)
778 | if src.startswith(('http://', 'https://')):
779 | assets['img'].append(src)
780 |
781 | # Check srcset attribute
782 | srcset = img.get('srcset')
783 | if srcset:
784 | for src_str in srcset.split(','):
785 | src_parts = src_str.strip().split(' ')
786 | if src_parts:
787 | src = src_parts[0]
788 | if not src.startswith(('http://', 'https://', 'data:')):
789 | src = urljoin(base_url, src)
790 | if src.startswith(('http://', 'https://')):
791 | assets['img'].append(src)
792 |
793 | # Check data-src (lazy loading)
794 | data_src = img.get('data-src')
795 | if data_src:
796 | if not data_src.startswith(('http://', 'https://', 'data:')):
797 | data_src = urljoin(base_url, data_src)
798 | if data_src.startswith(('http://', 'https://')):
799 | assets['img'].append(data_src)
800 |
801 | # Background images in style attributes
802 | elements_with_style = soup.select('[style]') or []
803 | for element in elements_with_style:
804 | style = element.get('style', '')
805 | if 'background' in style or 'background-image' in style:
806 | # Try to extract URLs
807 | bg_urls = re.findall(r'url\([\'"]?([^\'"|\)]+)[\'"]?\)', style)
808 | for bg_url in bg_urls:
809 | if not bg_url.startswith(('http://', 'https://', 'data:')):
810 | bg_url = urljoin(base_url, bg_url)
811 | if bg_url.startswith(('http://', 'https://')):
812 | assets['img'].append(bg_url)
813 | except Exception as e:
814 | print(f"Error extracting images: {str(e)}")
815 |
816 | # Extract favicon
817 | try:
818 | favicon_links = soup.find_all('link', {'rel': lambda r: r and (r.lower() == 'icon' or 'icon' in r.lower().split())}) or []
819 | for link in favicon_links:
820 | href = link.get('href')
821 | if href:
822 | if not href.startswith(('http://', 'https://', 'data:')):
823 | href = urljoin(base_url, href)
824 | if href.startswith(('http://', 'https://')):
825 | assets['favicons'].append(href)
826 | except Exception as e:
827 | print(f"Error extracting favicons: {str(e)}")
828 |
829 | # Extract all video sources
830 | try:
831 | video_tags = soup.find_all('video') or []
832 | for video in video_tags:
833 | # Check src attribute
834 | src = video.get('src')
835 | if src:
836 | if not src.startswith(('http://', 'https://', 'data:')):
837 | src = urljoin(base_url, src)
838 | if src.startswith(('http://', 'https://')):
839 | assets['videos'].append(src)
840 |
841 | # Check source tags inside video
842 | source_tags = video.find_all('source') or []
843 | for source in source_tags:
844 | src = source.get('src')
845 | if src:
846 | if not src.startswith(('http://', 'https://', 'data:')):
847 | src = urljoin(base_url, src)
848 | if src.startswith(('http://', 'https://')):
849 | assets['videos'].append(src)
850 | except Exception as e:
851 | print(f"Error extracting videos: {str(e)}")
852 |
853 | # Extract all audio sources
854 | try:
855 | audio_tags = soup.find_all('audio') or []
856 | for audio in audio_tags:
857 | # Check src attribute
858 | src = audio.get('src')
859 | if src:
860 | if not src.startswith(('http://', 'https://', 'data:')):
861 | src = urljoin(base_url, src)
862 | if src.startswith(('http://', 'https://')):
863 | assets['audio'].append(src)
864 |
865 | # Check source tags inside audio
866 | source_tags = audio.find_all('source') or []
867 | for source in source_tags:
868 | src = source.get('src')
869 | if src:
870 | if not src.startswith(('http://', 'https://', 'data:')):
871 | src = urljoin(base_url, src)
872 | if src.startswith(('http://', 'https://')):
873 | assets['audio'].append(src)
874 | except Exception as e:
875 | print(f"Error extracting audio: {str(e)}")
876 |
877 | # Extract all iframes
878 | try:
879 | iframe_tags = soup.find_all('iframe') or []
880 | for iframe in iframe_tags:
881 | src = iframe.get('src')
882 | if src and not src.startswith('data:'):
883 | if not src.startswith(('http://', 'https://')):
884 | src = urljoin(base_url, src)
885 | if src.startswith(('http://', 'https://')):
886 | if 'youtube' in src or 'vimeo' in src:
887 | assets['videos'].append(src)
888 | else:
889 | assets['js'].append(src) # Treat as JS resource
890 | except Exception as e:
891 | print(f"Error extracting iframes: {str(e)}")
892 |
893 | # Extract Next.js specific resources
894 | try:
895 | # Look for Next.js data scripts
896 | next_data = soup.find('script', {'id': '__NEXT_DATA__'})
897 | if next_data and next_data.string:
898 | try:
899 | next_json = json.loads(next_data.string)
900 | # Extract buildId
901 | if 'buildId' in next_json:
902 | build_id = next_json['buildId']
903 | # Add common Next.js resources with this buildId
904 | for path in ['main', 'webpack', 'framework', 'pages/_app', 'pages/_error', 'pages/index']:
905 | chunk_url = f"{base_url}/_next/static/{build_id}/pages/{path}.js"
906 | assets['js'].append(chunk_url)
907 |
908 | # Extract page data
909 | if 'page' in next_json and 'props' in next_json:
910 | # This often has valuable data we might want to preserve
911 | assets['metadata']['next_data'] = next_json
912 | except Exception as next_error:
913 | print(f"Error parsing Next.js data: {str(next_error)}")
914 |
915 | # Look for Webpack chunks in comments
916 | chunks_regex = r'/\*\s*webpackJsonp\s*\*/(.*?)/\*\s*end\s*webpackJsonp\s*\*/'
917 | chunks_matches = re.findall(chunks_regex, html_content, re.DOTALL)
918 | if chunks_matches:
919 | print("Found webpack chunks in comments")
920 | # These are JavaScript assets that might be dynamically loaded
921 | except Exception as e:
922 | print(f"Error extracting Next.js resources: {str(e)}")
923 |
924 | # Try to download CSS files and extract additional assets
925 | if session_obj and headers:
926 | try:
927 | css_urls = assets['css'].copy() # Copy to avoid modifying during iteration
928 | for css_url in css_urls:
929 | try:
930 | # Skip data URLs
931 | if css_url.startswith('data:'):
932 | continue
933 |
934 | # Download CSS file
935 | response = session_obj.get(
936 | css_url,
937 | timeout=10,
938 | headers=headers,
939 | verify=False # Ignore SSL certificate errors
940 | )
941 |
942 | if response.status_code == 200:
943 | css_content = response.text
944 |
945 | # Extract URLs from url() function
946 | url_matches = re.findall(r'url\([\'"]?([^\'"|\)]+)[\'"]?\)', css_content) or []
947 | for url in url_matches:
948 | if not url or url.startswith('data:'):
949 | continue
950 |
951 | if not url.startswith(('http://', 'https://')):
952 | # Resolve relative to the CSS file
953 | url = urljoin(css_url, url)
954 |
955 | # Determine asset type
956 | asset_type = get_asset_type(url)
957 | if asset_type in assets:
958 | assets[asset_type].append(url)
959 |
960 | # Extract font families
961 | font_families = re.findall(r'font-family:\s*[\'"]?([^\'";]+)[\'"]?', css_content) or []
962 | for family in font_families:
963 | family = family.strip().split(',')[0].strip('\'"`')
964 | if family and family.lower() not in ['serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'system-ui']:
965 | assets['font_families'].add(family)
966 |
967 | # Extract Google Fonts specifically
968 | google_fonts_imports = re.findall(r'@import\s+url\([\'"]?(https?://fonts\.googleapis\.com/[^\'"|\)]+)[\'"]?\)', css_content) or []
969 | for font_url in google_fonts_imports:
970 | if font_url not in assets['css']:
971 | assets['css'].append(font_url)
972 |
973 | # Check for Tailwind
974 | if 'tailwind' in css_content.lower() or '.tw-' in css_content:
975 | print("Detected Tailwind CSS in stylesheets")
976 | except Exception as css_error:
977 | print(f"Error processing CSS {css_url}: {str(css_error)}")
978 | except Exception as e:
979 | print(f"Error processing CSS files: {str(e)}")
980 |
981 | # Extract UI components
982 | try:
983 | components = extract_component_structure(soup)
984 | if components:
985 | assets['components'] = components
986 | except Exception as e:
987 | print(f"Error extracting components: {str(e)}")
988 | traceback.print_exc()
989 |
990 | # Remove duplicates while preserving order
991 | for asset_type in assets:
992 | if isinstance(assets[asset_type], list):
993 | # Use dict.fromkeys to remove duplicates while preserving order
994 | assets[asset_type] = list(dict.fromkeys(assets[asset_type]))
995 |
996 | return assets
997 |
998 | except Exception as e:
999 | print(f"Error in extract_assets: {str(e)}")
1000 | traceback.print_exc()
1001 | return assets
1002 |
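# Usage sketch for extract_assets (illustrative; the URL and headers are placeholders).
# Without a session it only parses the HTML; with session_obj and headers it also fetches
# each linked stylesheet and mines it for url(...) references and font families:
#   assets = extract_assets(html_text, 'https://example.com',
#                           session_obj=requests.Session(),
#                           headers={'User-Agent': 'Mozilla/5.0'})
#   assets['css'], assets['js'], assets['img']  -> de-duplicated absolute URLs
#   assets['font_families']                     -> set of detected font names
#   assets['metadata'], assets['components']    -> dicts built by the helpers above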
1003 | def create_zip_file(html_content, assets, url, session_obj, headers, screenshots=None):
1004 | """Create a zip file containing the extracted website data"""
1005 | # Create a temp file for the zip
1006 | temp_zip = tempfile.NamedTemporaryFile(delete=False, suffix='.zip')
1007 | temp_zip.close()
1008 |
1009 | # Extract domain for the folder name
1010 | parsed_url = urlparse(url)
1011 | domain = parsed_url.netloc
1012 |
1013 | # Current timestamp
1014 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1015 |
1016 | # Create the zip file
1017 | with zipfile.ZipFile(temp_zip.name, 'w', zipfile.ZIP_DEFLATED) as zipf:
1018 | # Write the main HTML
1019 | zipf.writestr('index.html', html_content)
1020 |
1021 | # Create directories for each asset type
1022 | for asset_type in assets.keys():
1023 | if asset_type in ['font_families', 'metadata', 'components']:
1024 | continue # Skip non-URL assets
1025 |
1026 | # Make sure the assets[asset_type] exists and is a list before iterating
1027 | if not assets[asset_type] or not isinstance(assets[asset_type], list):
1028 | print(f" Skipping {asset_type} - no assets found or invalid format")
1029 | continue
1030 |
1031 | # Create the directory
1032 | zipf.writestr(f'{asset_type}/.gitkeep', '')
1033 |
1034 | # Download each asset
1035 | processed_urls = set() # Track processed URLs to avoid duplicates
1036 |
1037 | for url in assets[asset_type]:
1038 | # Skip if the URL is None, empty, or a data URL
1039 | if not url or url.startswith('data:'):
1040 | continue
1041 |
1042 | # Skip if we've already processed this URL
1043 | if url in processed_urls:
1044 | continue
1045 |
1046 | processed_urls.add(url)
1047 |
1048 | try:
1049 | # Fix URL if it's relative
1050 | if url.startswith('//'):
1051 | url = 'https:' + url
1052 | elif url.startswith('/'):
1053 | parsed_base = urlparse(parsed_url.scheme + '://' + parsed_url.netloc)
1054 | url = urljoin(parsed_base.geturl(), url)
1055 |
1056 | # Extract filename from URL
1057 | path = urlparse(url).path
1058 | # Handle query parameters in the URL
1059 | query = urlparse(url).query
1060 | filename = os.path.basename(unquote(path))
1061 |
1062 | # Clean filename
1063 | if not filename:
1064 | filename = f"{timestamp}_{uuid.uuid4().hex[:8]}.{asset_type}"
1065 | elif '.' not in filename:
1066 | filename = f"{filename}.{asset_type}"
1067 |
1068 | # Add query parameters to filename to make it unique
1069 | if query:
1070 | clean_query = re.sub(r'[^a-zA-Z0-9]', '_', query)[:30] # Limit length
1071 | name, ext = os.path.splitext(filename)
1072 | filename = f"{name}_{clean_query}{ext}"
1073 |
1074 | # Avoid duplicate filenames with UUID
1075 | file_path = f"{asset_type}/{filename}"
1076 |
1077 | try:
1078 | # Download the file
1079 | response = session_obj.get(
1080 | url,
1081 | timeout=10,
1082 | headers=headers,
1083 | verify=False # Ignore SSL certificate errors
1084 | )
1085 |
1086 | if response.status_code == 200:
1087 | zipf.writestr(file_path, response.content)
1088 | print(f" Added {file_path}")
1089 | else:
1090 | print(f" Failed to download {url}, status: {response.status_code}")
1091 | except Exception as e:
1092 | print(f" Error downloading {url}: {str(e)}")
1093 | except Exception as e:
1094 | print(f" Error processing URL {url}: {str(e)}")
1095 |
1096 | # Handle font families
1097 | if 'font_families' in assets and assets['font_families']:
1098 | zipf.writestr('css/fonts.css', '\n'.join([
1099 | f"/* Font Family: {family} */\n"
1100 | f"@import url('https://fonts.googleapis.com/css2?family={family.replace(' ', '+')}&display=swap');\n"
1101 | for family in assets['font_families']
1102 | ]))
1103 |
1104 | # Handle metadata if present
1105 | if 'metadata' in assets and assets['metadata']:
1106 | metadata_content = json.dumps(assets['metadata'], indent=2)
1107 | zipf.writestr('metadata.json', metadata_content)
1108 |
1109 | # Handle UI components if present
1110 | if 'components' in assets and assets['components'] and isinstance(assets['components'], dict):
1111 | # Create components directory
1112 | zipf.writestr('components/.gitkeep', '')
1113 |
1114 | # Create index for components
1115 | component_html = """
1116 |
1117 |
1118 |
1119 |
1120 |
1121 | Extracted UI Components
1122 |
1131 |
1132 |
1133 | Extracted UI Components
1134 | The following components were extracted from the website.
1135 | """
1136 |
1137 | # Add each component
1138 | for component_type, components in assets['components'].items():
1139 | if components:
1140 | component_html += f'<h2>{component_type.replace("_", " ").title()} Components</h2>\n'
1141 |
1142 | for i, component in enumerate(components):
1143 | html_code = component.get('html', '')
1144 | if html_code:
1145 | component_html += f"""
1146 |
1147 |
1150 |
1151 | {html_code}
1152 |
1153 |
1154 |
{html.escape(html_code)}
1155 |
1156 |
1157 | """
1158 |
1159 | component_html += """
1160 |
1161 |
1162 | """
1163 |
1164 | zipf.writestr('components/index.html', component_html)
1165 |
1166 | # Save individual components
1167 | for component_type, components in assets['components'].items():
1168 | if components:
1169 | zipf.writestr(f'components/{component_type}/.gitkeep', '')
1170 |
1171 | for i, component in enumerate(components):
1172 | html_code = component.get('html', '')
1173 | if html_code:
1174 | zipf.writestr(f'components/{component_type}/component_{i+1}.html', html_code)
1175 |
1176 | # Create a README file
1177 | readme_content = f"""# Website Clone: {domain}
1178 |
1179 | Extracted on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
1180 | Source URL: {parsed_url.geturl()}
1181 |
1182 | ## Contents
1183 |
1184 | - `index.html`: Main HTML file
1185 | - `css/`: Stylesheets
1186 | - `js/`: JavaScript files
1187 | - `img/`: Images
1188 | - `fonts/`: Font files
1189 | - `components/`: Extracted UI components
1190 | - `metadata.json`: Website metadata (title, description, etc.)
1191 |
1192 | ## How to Use
1193 |
1194 | 1. Unzip this file
1195 | 2. Open `index.html` in your browser
1196 | 3. For best results, serve the files with a local server:
1197 | ```
1198 | python -m http.server
1199 | ```
1200 | Then open http://localhost:8000 in your browser
1201 |
1202 | ## Component Viewer
1203 |
1204 | If components were extracted, you can view them by opening `components/index.html`
1205 |
1206 | ## Notes
1207 |
1208 | - Some assets might not load correctly due to cross-origin restrictions
1209 | - External resources and APIs may not work without proper configuration
1210 | - JavaScript functionality might be limited without a proper backend
1211 |
1212 | ## Handling Modern Frameworks
1213 |
1214 | This extraction has been optimized to handle the following frameworks:
1215 | - React and Next.js: Script chunks and module loading
1216 | - Angular: Component structure and scripts
1217 | - Tailwind CSS: Utility classes and structure
1218 |
1219 | Generated by Website Extractor
1220 | """
1221 | zipf.writestr('README.md', readme_content)
1222 |
1223 | return temp_zip.name
1224 |
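# Layout of the archive written above (illustrative summary):
#   index.html             the (rewritten) page markup
#   css/ js/ img/ ...      downloaded assets, one folder per asset type
#   css/fonts.css          Google Fonts @import lines for the detected font families
#   metadata.json          the extracted metadata dict
#   components/index.html  a simple viewer for the extracted UI components
#   README.md              notes on serving the clone locally
# The function returns the path of a NamedTemporaryFile (delete=False), so callers are
# responsible for removing it when they are done.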
1225 | def extract_with_selenium(url, timeout=30):
1226 | """
1227 | Extract rendered HTML content using Selenium with Chrome/Chromium.
1228 | This method will execute JavaScript and capture the fully rendered page structure.
1229 |
1230 | Args:
1231 | url: URL to fetch
1232 | timeout: Maximum time to wait for page to load (seconds)
1233 |
1234 | Returns:
1235 | tuple: (html_content, discovered_urls, error_info) where error_info is None on success and a dict with an 'error' key on failure
1236 | """
1237 | if not SELENIUM_AVAILABLE:
1238 | return None, None, {"error": "Selenium is not installed. Run: pip install selenium webdriver-manager"}
1239 |
1240 | try:
1241 | print("Setting up advanced Chrome options...")
1242 | # Set up Chrome options with anti-detection measures
1243 | chrome_options = Options()
1244 | chrome_options.add_argument("--headless") # Run headless
1245 | chrome_options.add_argument("--disable-gpu") # Disable GPU hardware acceleration
1246 | chrome_options.add_argument("--no-sandbox") # Required for running as root
1247 | chrome_options.add_argument("--disable-dev-shm-usage") # Overcome limited resource problems
1248 | chrome_options.add_argument("--window-size=1920,1080") # Set window size
1249 | chrome_options.add_argument("--disable-notifications") # Disable notifications
1250 | chrome_options.add_argument("--disable-extensions") # Disable extensions
1251 | chrome_options.add_argument("--disable-infobars") # Disable infobars
1252 |
1253 | # Avoid detection as a bot
1254 | chrome_options.add_argument("--disable-blink-features=AutomationControlled")
1255 | chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
1256 | chrome_options.add_experimental_option("useAutomationExtension", False)
1257 |
1258 | # Add modern user agent to avoid detection
1259 | chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36")
1260 |
1261 | # Initialize the Chrome driver
1262 | print(f"Initializing Chrome WebDriver...")
1263 | try:
1264 | service = Service(ChromeDriverManager().install())
1265 | driver = webdriver.Chrome(service=service, options=chrome_options)
1266 | except Exception as driver_error:
1267 | print(f"Error initializing Chrome WebDriver: {str(driver_error)}")
1268 | print("Trying alternative initialization method...")
1269 | try:
1270 | # Try alternative initialization without Service object
1271 | driver = webdriver.Chrome(options=chrome_options)
1272 | except Exception as alt_error:
1273 | print(f"Alternative initialization also failed: {str(alt_error)}")
1274 | return None, None, {"error": f"Failed to initialize Chrome WebDriver: {str(alt_error)}"}
1275 |
1276 | # Set page load timeout
1277 | driver.set_page_load_timeout(timeout)
1278 |
1279 | # Used to store discovered URLs
1280 | discovered_urls = []
1281 |
1282 | try:
1283 | print(f"Navigating to {url}...")
1284 | driver.get(url)
1285 |
1286 | # Wait for page to be fully loaded
1287 | try:
1288 | WebDriverWait(driver, timeout).until(
1289 | EC.presence_of_element_located((By.TAG_NAME, "body"))
1290 | )
1291 | except Exception as e:
1292 | print(f"Warning: Timeout waiting for body element: {str(e)}")
1293 |
1294 | # Execute JavaScript to disable animation
1295 | try:
1296 | driver.execute_script("""
1297 | var style = document.createElement('style');
1298 | style.type = 'text/css';
1299 | style.innerHTML = '* { animation-duration: 0.001s !important; transition-duration: 0.001s !important; }';
1300 | document.getElementsByTagName('head')[0].appendChild(style);
1301 | """)
1302 | print("Animations disabled to improve extraction")
1303 | except Exception as e:
1304 | print(f"Warning: Could not disable animations: {str(e)}")
1305 |
1306 | # Wait for page to be fully rendered
1307 | print("Waiting for dynamic content to load...")
1308 | try:
1309 | # Wait a bit for any dynamic content to load
1310 | time.sleep(5)
1311 |
1312 | # Wait for network to be idle
1313 | driver.execute_script("return window.performance.getEntriesByType('resource').length")
1314 | time.sleep(2) # Wait a bit more after resources are loaded
1315 | except Exception as e:
1316 | print(f"Warning while waiting for dynamic content: {str(e)}")
1317 |
1318 | # Implement advanced scrolling to trigger lazy loading
1319 | print("Performing advanced scrolling to trigger lazy loading...")
1320 | try:
1321 | # Get the total height of the page
1322 | total_height = driver.execute_script("return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight, document.body.offsetHeight, document.documentElement.offsetHeight, document.body.clientHeight, document.documentElement.clientHeight);")
1323 |
1324 | # Scroll down the page in steps
1325 | viewport_height = driver.execute_script("return window.innerHeight")
1326 | scroll_steps = max(1, min(20, total_height // viewport_height)) # Cap at 20 steps
1327 |
1328 | for i in range(scroll_steps + 1):
1329 | scroll_position = (i * total_height) // scroll_steps
1330 | driver.execute_script(f"window.scrollTo(0, {scroll_position});")
1331 |
1332 | # Small pause to allow content to load
1333 | time.sleep(0.3)
1334 |
1335 | # Extract resources after each scroll
1336 | try:
1337 | urls = driver.execute_script("""
1338 | var resources = [];
1339 | // Get all link hrefs
1340 | document.querySelectorAll('link[rel="stylesheet"], link[as="style"]').forEach(function(el) {
1341 | if (el.href) resources.push(el.href);
1342 | });
1343 | // Get all script srcs
1344 | document.querySelectorAll('script[src]').forEach(function(el) {
1345 | if (el.src) resources.push(el.src);
1346 | });
1347 | // Get all image srcs
1348 | document.querySelectorAll('img[src]').forEach(function(el) {
1349 | if (el.src && !el.src.startsWith('data:')) resources.push(el.src);
1350 | });
1351 | return resources;
1352 | """)
1353 | discovered_urls.extend(urls)
1354 | except Exception as res_error:
1355 | print(f"Error extracting resources during scroll: {str(res_error)}")
1356 |
1357 | # Scroll back to top
1358 | driver.execute_script("window.scrollTo(0, 0);")
1359 |
1360 | # Wait for everything to settle after scrolling
1361 | time.sleep(1)
1362 | except Exception as scroll_error:
1363 | print(f"Error during page scrolling: {str(scroll_error)}")
1364 |
1365 | # Try to click on common elements that might reveal more content
1366 | try:
1367 | # Common UI elements that might reveal more content when clicked
1368 | for selector in [
1369 | 'button.load-more', '.show-more', '.expand', '.accordion-toggle',
1370 | '[aria-expanded="false"]', '.menu-toggle', '.navbar-toggler',
1371 | '.mobile-menu-button', '.hamburger', '[data-toggle="collapse"]'
1372 | ]:
1373 | try:
1374 | elements = driver.find_elements(By.CSS_SELECTOR, selector)
1375 | for element in elements[:3]: # Limit to first 3 matches of each type
1376 | if element.is_displayed():
1377 | driver.execute_script("arguments[0].click();", element)
1378 | time.sleep(0.5) # Wait for content to appear
1379 | except Exception as click_error:
1380 | # Skip any errors and continue with next selector
1381 | continue
1382 | print("Attempted to expand hidden content")
1383 | except Exception as interact_error:
1384 | print(f"Error expanding content: {str(interact_error)}")
1385 |
1386 | # Get the final HTML content after all JavaScript executed
1387 | html_content = driver.page_source
1388 | print(f"HTML content captured ({len(html_content)} bytes)")
1389 |
1390 | # Extract URLs for modern frameworks
1391 | try:
1392 | # React/Next.js specific resources
1393 | next_js_urls = driver.execute_script("""
1394 | var resources = [];
1395 | // Find Next.js specific scripts
1396 | document.querySelectorAll('script[src*="_next"]').forEach(function(el) {
1397 | resources.push(el.src);
1398 | });
1399 | // Find chunk files
1400 | document.querySelectorAll('script[src*="chunk"]').forEach(function(el) {
1401 | resources.push(el.src);
1402 | });
1403 | // Find webpack files
1404 | document.querySelectorAll('script[src*="webpack"]').forEach(function(el) {
1405 | resources.push(el.src);
1406 | });
1407 | // Find hydration scripts
1408 | document.querySelectorAll('script[src*="hydration"]').forEach(function(el) {
1409 | resources.push(el.src);
1410 | });
1411 | return resources;
1412 | """)
1413 | discovered_urls.extend(next_js_urls)
1414 |
1415 | # Angular specific resources
1416 | angular_urls = driver.execute_script("""
1417 | var resources = [];
1418 | // Find Angular specific scripts
1419 | document.querySelectorAll('script[src*="runtime"]').forEach(function(el) {
1420 | resources.push(el.src);
1421 | });
1422 | document.querySelectorAll('script[src*="polyfills"]').forEach(function(el) {
1423 | resources.push(el.src);
1424 | });
1425 | document.querySelectorAll('script[src*="main"]').forEach(function(el) {
1426 | resources.push(el.src);
1427 | });
1428 | return resources;
1429 | """)
1430 | discovered_urls.extend(angular_urls)
1431 |
1432 | # Get CSS variables for Tailwind detection
1433 | tailwind_check = driver.execute_script("""
1434 | var style = window.getComputedStyle(document.body);
1435 | var hasTailwind = false;
1436 | // Check for common Tailwind classes
1437 | if (document.querySelector('.flex') &&
1438 | document.querySelector('.grid') &&
1439 | document.querySelector('[class*="text-"]')) {
1440 | hasTailwind = true;
1441 | }
1442 | return hasTailwind;
1443 | """)
1444 |
1445 | if tailwind_check:
1446 | print("Tailwind CSS detected, including appropriate CSS files")
1447 | except Exception as framework_error:
1448 | print(f"Error detecting framework resources: {str(framework_error)}")
1449 |
1450 | # Remove duplicates from discovered URLs
1451 | discovered_urls = list(set(discovered_urls))
1452 | print(f"Discovered {len(discovered_urls)} resource URLs")
1453 |
1454 | return html_content, discovered_urls, None
1455 |
1456 | except TimeoutException:
1457 | print(f"Timeout while loading {url}")
1458 | return None, None, {"error": "Timeout while loading page"}
1459 | except WebDriverException as e:
1460 | print(f"Selenium error: {str(e)}")
1461 | return None, None, {"error": f"Selenium error: {str(e)}"}
1462 | finally:
1463 | # Close the browser
1464 | print("Closing WebDriver...")
1465 | driver.quit()
1466 |
1467 | except Exception as e:
1468 | print(f"Error setting up Selenium: {str(e)}")
1469 | return None, None, {"error": f"Error setting up Selenium: {str(e)}"}
1470 |
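# Usage sketch for extract_with_selenium (illustrative; example.com is a placeholder):
#   html, urls, err = extract_with_selenium('https://example.com', timeout=30)
#   if err:    # err is a dict like {'error': '...'} when rendering failed
#       print(err['error'])
#   else:      # html is the rendered DOM, urls are resources observed while scrolling
#       assets = extract_assets(html, 'https://example.com')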
1471 | def fix_relative_urls(html_content, base_url):
1472 | """Fix relative URLs in the HTML content"""
1473 | soup = BeautifulSoup(html_content, 'html.parser')
1474 |
1475 | # Fix relative URLs for links
1476 | for link in soup.find_all('a', href=True):
1477 | href = link['href']
1478 | if href.startswith('/'):
1479 | link['href'] = urljoin(base_url, href)
1480 |
1481 | # Fix relative URLs for images
1482 | for img in soup.find_all('img', src=True):
1483 | src = img['src']
1484 | if not src.startswith(('http://', 'https://', 'data:')):
1485 | img['src'] = urljoin(base_url, src)
1486 |
1487 | # Fix relative URLs for scripts
1488 | for script in soup.find_all('script', src=True):
1489 | src = script['src']
1490 | if not src.startswith(('http://', 'https://', 'data:')):
1491 | script['src'] = urljoin(base_url, src)
1492 |
1493 | # Fix relative URLs for stylesheets
1494 | for link in soup.find_all('link', href=True):
1495 | href = link['href']
1496 | if not href.startswith(('http://', 'https://', 'data:')):
1497 | link['href'] = urljoin(base_url, href)
1498 |
1499 | return str(soup)
1500 |
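# Illustrative effect of fix_relative_urls with base_url 'https://example.com':
#   <a href="/about">         -> <a href="https://example.com/about">
#   <img src="img/logo.png">  -> <img src="https://example.com/img/logo.png">
# data: URIs and absolute http(s) URLs are left untouched; anchor tags are only rewritten
# when their href starts with '/'.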
1501 | @app.route('/')
1502 | def index():
1503 | """Render the home page"""
1504 | return render_template('index.html')
1505 |
1506 | @app.route('/clear')
1507 | def clear_session():
1508 | """Clear the session data"""
1509 | session.clear()
1510 | return jsonify({'message': 'Session cleared'})
1511 |
1512 | @app.route('/extract', methods=['POST'])
1513 | def extract():
1514 | url = request.form.get('url')
1515 | use_selenium = request.form.get('use_selenium') == 'true'
1516 |
1517 | if not url:
1518 | return jsonify({'error': 'URL is required'}), 400
1519 |
1520 | try:
1521 | # Add http:// if not present
1522 | if not url.startswith(('http://', 'https://')):
1523 | url = 'https://' + url
1524 |
1525 | print(f"\n{'='*80}\nStarting extraction for: {url}\n{'='*80}")
1526 |
1527 | # Create a session to maintain cookies
1528 | session_obj = requests.Session()
1529 |
1530 | # Disable SSL verification warnings
1531 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
1532 |
1533 | # List of user agents to try if we get blocked
1534 | user_agents = [
1535 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
1536 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
1537 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
1538 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
1539 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
1540 | 'Mozilla/5.0 (iPad; CPU OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
1541 | 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1'
1542 | ]
1543 |
1544 | # List of referers to try
1545 | referers = [
1546 | 'https://www.google.com/',
1547 | 'https://www.bing.com/',
1548 | 'https://www.instagram.com/',
1549 | 'https://www.facebook.com/',
1550 | 'https://www.twitter.com/',
1551 | 'https://www.linkedin.com/'
1552 | ]
1553 |
1554 | # Initial headers (will be rotated if needed)
1555 | headers = {
1556 | 'User-Agent': random.choice(user_agents),
1557 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
1558 | 'Accept-Language': 'en-US,en;q=0.9',
1559 | 'Accept-Encoding': 'gzip, deflate, br',
1560 | 'Connection': 'keep-alive',
1561 | 'Upgrade-Insecure-Requests': '1',
1562 | 'Sec-Fetch-Dest': 'document',
1563 | 'Sec-Fetch-Mode': 'navigate',
1564 | 'Sec-Fetch-Site': 'none',
1565 | 'Sec-Fetch-User': '?1',
1566 | 'Cache-Control': 'max-age=0',
1567 | 'Referer': random.choice(referers),
1568 | 'sec-ch-ua': '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
1569 | 'sec-ch-ua-mobile': '?0',
1570 | 'sec-ch-ua-platform': '"Windows"',
1571 | }
1572 |
1573 | html_content = None
1574 | additional_urls = []
1575 |
1576 | # Use Selenium for rendering if requested and available
1577 | if use_selenium and SELENIUM_AVAILABLE:
1578 | print("Using Selenium for advanced rendering...")
1579 | html_content, additional_urls, error_info = extract_with_selenium(url)
1580 |
1581 | if not html_content:
1582 | print("Selenium extraction failed, falling back to regular request")
1583 | use_selenium = False
1584 | # Check if we have an error message
1585 | if error_info and isinstance(error_info, dict) and 'error' in error_info:
1586 | print(f"Selenium error: {error_info['error']}")
1587 |
1588 | # If Selenium wasn't used or failed, use regular requests with retries
1589 | if not use_selenium or not html_content:
1590 | # Maximum number of retries with different configurations
1591 | max_retries = 5
1592 | retry_count = 0
1593 | last_error = None
1594 |
1595 | while retry_count < max_retries and not html_content:
1596 | try:
1597 | print(f"HTTP Request attempt {retry_count+1}/{max_retries} for: {url}")
1598 | print(f"Using User-Agent: {headers['User-Agent'][:30]}...")
1599 |
1600 | # First request to get cookies and possible redirects
1601 | response = session_obj.get(
1602 | url,
1603 | timeout=20, # Increased timeout
1604 | headers=headers,
1605 | allow_redirects=True,
1606 | verify=False # Ignore SSL certificate errors
1607 | )
1608 |
1609 | # Follow redirects manually if needed
1610 | if response.history:
1611 | print(f"Request was redirected {len(response.history)} times")
1612 | for i, resp in enumerate(response.history):
1613 | print(f" Redirect {i+1}: {resp.url} -> {resp.headers.get('Location')}")
1614 | print(f" Final URL: {response.url}")
1615 | url = response.url # Update URL to the final destination
1616 |
1617 | # Handle different status codes
1618 | if response.status_code == 200:
1619 | print(f"Success! Received 200 OK response ({len(response.content)} bytes)")
1620 |
1621 | # Determine encoding from Content-Type header or content
1622 | content_type = response.headers.get('Content-Type', '')
1623 | print(f"Content-Type: {content_type}")
1624 |
1625 | # Get encoding from headers or meta tag
1626 | encoding = None
1627 |
1628 | # Try to get encoding from Content-Type header
1629 | if 'charset=' in content_type:
1630 | encoding = content_type.split('charset=')[1].split(';')[0].strip()
1631 | print(f"Encoding from headers: {encoding}")
1632 |
1633 | # If no encoding specified, try to detect from content
1634 | if not encoding:
1635 | # Look for <meta charset="..."> tag
1636 | charset_match = re.search(r'<meta[^>]+charset=["\']?([^\'">]+)', response.text, re.IGNORECASE)
1637 | if charset_match:
1638 | encoding = charset_match.group(1)
1639 | print(f"Encoding from meta charset tag: {encoding}")
1640 | else:
1641 | # Look for <meta http-equiv="Content-Type" content="...charset=...">
1642 | http_equiv_match = re.search(r'<meta[^>]+http-equiv=["\']?content-type["\']?[^>]+charset=([^\'">]+)', response.text, re.IGNORECASE)
1643 | if http_equiv_match:
1644 | encoding = http_equiv_match.group(1)
1645 | print(f"Encoding from meta http-equiv tag: {encoding}")
1646 |
1647 | # If still no encoding, use apparent encoding from requests
1648 | if not encoding and response.apparent_encoding:
1649 | encoding = response.apparent_encoding
1650 | print(f"Detected encoding: {encoding}")
1651 |
1652 | # Default to utf-8 if still no encoding
1653 | if not encoding:
1654 | encoding = 'utf-8'
1655 | print("Using default encoding: utf-8")
1656 |
1657 | # Decode content with detected encoding
1658 | try:
1659 | html_content = response.content.decode(encoding, errors='replace')
1660 | print(f"Successfully decoded HTML content with {encoding} encoding ({len(html_content)} bytes)")
1661 | break # Exit the retry loop on success
1662 | except (UnicodeDecodeError, LookupError) as e:
1663 | print(f"Error decoding with {encoding}: {str(e)}, falling back to utf-8")
1664 | html_content = response.content.decode('utf-8', errors='replace')
1665 | break # Exit the retry loop on success with fallback
1666 |
1667 | elif response.status_code == 403: # Forbidden - likely bot protection
1668 | print(f"Received 403 Forbidden response - website is likely blocking scrapers")
1669 |
1670 | # If we have Selenium available as a fallback, try that instead
1671 | if SELENIUM_AVAILABLE and not use_selenium:
1672 | print("Trying Selenium as a fallback for 403 error...")
1673 | html_content, additional_urls, error_info = extract_with_selenium(url)
1674 | if html_content:
1675 | print("Successfully bypassed 403 with Selenium!")
1676 | break
1677 |
1678 | # Otherwise, rotate our headers and try again
1679 | headers['User-Agent'] = random.choice(user_agents)
1680 | headers['Referer'] = random.choice(referers)
1681 |
1682 | # Add some randomization to headers
1683 | if random.random() > 0.5:
1684 | headers['Accept-Language'] = random.choice(['en-US,en;q=0.9', 'en-GB,en;q=0.8,en-US;q=0.7', 'en-CA,en;q=0.9,fr-CA;q=0.8'])
1685 |
1686 | # Try adding cookies if we have any from previous responses
1687 | if session_obj.cookies:
1688 | print(f"Using {len(session_obj.cookies)} cookies from previous responses")
1689 |
1690 | # Add delay to avoid rate limiting
1691 | delay = random.uniform(1.0, 3.0)
1692 | print(f"Waiting {delay:.2f} seconds before retrying...")
1693 | time.sleep(delay)
1694 |
1695 | elif response.status_code == 429: # Too Many Requests
1696 | print(f"Received 429 Too Many Requests - rate limited")
1697 |
1698 | # Check if we have a Retry-After header
1699 | retry_after = response.headers.get('Retry-After')
1700 | if retry_after and retry_after.isdigit():
1701 | delay = int(retry_after) + random.uniform(0.1, 1.0)
1702 | else:
1703 | delay = 5 + random.uniform(1.0, 5.0) # 5-10 second delay
1704 |
1705 | print(f"Waiting {delay:.2f} seconds before retrying...")
1706 | time.sleep(delay)
1707 |
1708 | # Rotate headers
1709 | headers['User-Agent'] = random.choice(user_agents)
1710 |
1711 | elif response.status_code == 503: # Service Unavailable - often used for anti-bot
1712 | print(f"Received 503 Service Unavailable - possible anti-bot measure")
1713 |
1714 | # Try with a longer delay and new headers
1715 | delay = 10 + random.uniform(1.0, 5.0) # 10-15 second delay
1716 | print(f"Waiting {delay:.2f} seconds before retrying...")
1717 | time.sleep(delay)
1718 |
1719 | # Complete header rotation
1720 | headers['User-Agent'] = random.choice(user_agents)
1721 | headers['Referer'] = random.choice(referers)
1722 |
1723 | else:
1724 | print(f"Received unexpected status code: {response.status_code}")
1725 | last_error = f"HTTP error ({response.status_code})"
1726 |
1727 | # Try with new headers on next attempt
1728 | headers['User-Agent'] = random.choice(user_agents)
1729 |
1730 | except requests.exceptions.Timeout:
1731 | print(f"Timeout error fetching {url}")
1732 | last_error = "Request timeout"
1733 | # Try with increased timeout on next attempt
1734 |
1735 | except requests.exceptions.ConnectionError:
1736 | print(f"Connection error fetching {url}")
1737 | last_error = "Connection error"
1738 | # Wait before retrying
1739 | time.sleep(2)
1740 |
1741 | except requests.exceptions.TooManyRedirects:
1742 | print(f"Too many redirects for {url}")
1743 | last_error = "Too many redirects"
1744 | # This is likely a permanent issue, break the loop
1745 | break
1746 |
1747 | except Exception as e:
1748 | print(f"Error fetching {url}: {str(e)}")
1749 | last_error = str(e)
1750 |
1751 | retry_count += 1
1752 |
1753 | # If we've exhausted all retries and still don't have content
1754 | if not html_content and retry_count >= max_retries:
1755 | error_msg = f"Failed to fetch website after {max_retries} attempts. Last error: {last_error}"
1756 | print(error_msg)
1757 | return jsonify({'error': error_msg}), 400
1758 |
1759 | # Safety check - make sure we have HTML content
1760 | if not html_content or len(html_content) < 100: # Arbitrary minimum size for valid HTML
1761 | return jsonify({'error': 'Failed to extract valid HTML content from the website'}), 400
1762 |
1763 | # Continue with asset extraction and zip file creation
1764 | try:
1765 | print("\nExtracting assets...")
1766 | # Extract assets from the HTML content
1767 | assets = extract_assets(html_content, url, session_obj, headers)
1768 |
1769 | if not assets:
1770 | return jsonify({'error': 'Failed to extract assets from the website'}), 500
1771 |
1772 | print(f"Assets extracted: {', '.join(assets.keys())}")
1773 |
1774 | # If we have additional URLs from Selenium, add them to the assets
1775 | if additional_urls:
1776 | print(f"Adding {len(additional_urls)} URLs discovered by Selenium")
1777 | for asset_url in additional_urls:
1778 | # Skip data URLs
1779 | if not asset_url or asset_url.startswith('data:'):
1780 | continue
1781 |
1782 | # Normalize URL
1783 | if asset_url.startswith('//'):
1784 | asset_url = f"https:{asset_url}"
1785 |
1786 | try:
1787 | asset_type = get_asset_type(asset_url)
1788 | if asset_type in assets and asset_url not in assets[asset_type]:
1789 | # Validate URL
1790 | parsed = urlparse(asset_url)
1791 | if parsed.scheme and parsed.netloc:
1792 | assets[asset_type].append(asset_url)
1793 | except Exception as url_error:
1794 | print(f"Error processing URL {asset_url}: {str(url_error)}")
1795 |
1796 | # Count assets by type
1797 | asset_counts = {asset_type: len(urls) for asset_type, urls in assets.items()
1798 | if isinstance(urls, list) and asset_type not in ['metadata', 'font_families']}
1799 | print(f"\nAsset counts:")
1800 | for asset_type, count in asset_counts.items():
1801 | print(f" {asset_type}: {count}")
1802 |
1803 | # Check if we have enough assets
1804 | total_assets = sum(count for count in asset_counts.values())
1805 | if total_assets < 5:
1806 | print("\nWARNING: Very few assets extracted. Trying alternative extraction methods...")
1807 |
1808 | # Try to extract assets from the page using JavaScript execution (simulated)
1809 | try:
1810 | # Look for JavaScript variables that might contain asset URLs
1811 | js_asset_patterns = [
1812 | r'["\'](https?://[^"\']+\.(css|js|png|jpg|jpeg|gif|svg|woff2?))["\']',
1813 | r'["\'](/[^"\']+\.(css|js|png|jpg|jpeg|gif|svg|woff2?))["\']',
1814 | r'["\'](//[^"\']+\.(css|js|png|jpg|jpeg|gif|svg|woff2?))["\']',
1815 | r'loadCSS\(["\']([^"\']+)["\']',
1816 | r'loadJS\(["\']([^"\']+)["\']',
1817 | r'src=["\'](/[^"\']+)["\']',
1818 | r'href=["\'](/[^"\']+\.css)["\']',
1819 | # React/Next.js specific patterns
1820 | r'__NEXT_DATA__\s*=\s*({.*})',
1821 | r'window\.__PRELOADED_STATE__\s*=\s*({.*})',
1822 | r'window\.__INITIAL_STATE__\s*=\s*({.*})',
1823 | r'_ASSET_PREFIX_\s*=\s*["\']([^"\']+)["\']'
1824 | ]
1825 |
1826 | for pattern in js_asset_patterns:
1827 | matches = re.findall(pattern, html_content)
1828 | for match in matches:
1829 | if isinstance(match, tuple):
1830 | match_url = match[0]
1831 | else:
1832 | match_url = match
1833 |
1834 | if match_url.startswith('//'):
1835 | match_url = 'https:' + match_url
1836 | elif match_url.startswith('/'):
1837 | match_url = urljoin(url, match_url)
1838 |
1839 | # Skip if it's clearly not a URL (likely JSON data)
1840 | if '{' in match_url or '}' in match_url:
1841 | continue
1842 |
1843 | asset_type = get_asset_type(match_url)
1844 | if asset_type in assets:
1845 | assets[asset_type].append(match_url)
1846 |
1847 | print("Extracted additional assets from JavaScript patterns")
1848 | except Exception as e:
1849 | print(f"Error extracting additional assets: {str(e)}")
1850 |
1851 | # Try to fix relative URLs in the HTML
1852 | try:
1853 | print("\nFixing relative URLs...")
1854 | fixed_html = fix_relative_urls(html_content, url)
1855 | print("Relative URLs fixed")
1856 | except Exception as e:
1857 | print(f"Error fixing URLs: {str(e)}")
1858 | fixed_html = html_content # Use original HTML if fixing fails
1859 |
1860 | try:
1861 | # Create and send zip file, passing the session and headers
1862 | print("\nCreating zip file...")
1863 |
1864 | # Extract domain from URL for the filename
1865 | domain = urlparse(url).netloc
1866 | safe_domain = re.sub(r'[^\w\-_]', '_', domain)
1867 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
1868 | filename = f"{safe_domain}_{timestamp}.zip"
1869 |
1870 | # Create a zip file with the extracted content
1871 | zip_file_path = create_zip_file(fixed_html, assets, url, session_obj, headers)
1872 |
1873 | # Check if the file was created successfully
1874 | if not os.path.exists(zip_file_path) or os.path.getsize(zip_file_path) < 100:
1875 | return jsonify({'error': 'Failed to create valid zip file'}), 500
1876 |
1877 | print(f"Zip file created successfully at {zip_file_path} ({os.path.getsize(zip_file_path)} bytes)")
1878 | print(f"\nExtraction completed for: {url}\n{'='*80}")
1879 |
1880 | # Copy the temporary file to a more persistent location
1881 | persistent_dir = os.path.join(tempfile.gettempdir(), 'website_extractor_downloads')
1882 | os.makedirs(persistent_dir, exist_ok=True)
1883 | persistent_path = os.path.join(persistent_dir, filename)
1884 |
1885 | # Copy the file instead of moving to ensure the original isn't deleted prematurely
1886 | shutil.copy2(zip_file_path, persistent_path)
1887 |
1888 | # Schedule the temp file for deletion after a reasonable period (30 minutes)
1889 | def delete_temp_file():
1890 | try:
1891 | time.sleep(1800) # 30 minutes
1892 | if os.path.exists(zip_file_path):
1893 | os.remove(zip_file_path)
1894 | print(f"Temporary file {zip_file_path} removed after 30 minutes")
1895 | if os.path.exists(persistent_path):
1896 | os.remove(persistent_path)
1897 | print(f"Persistent file {persistent_path} removed after 30 minutes")
1898 | except Exception as e:
1899 | print(f"Error removing temporary file: {str(e)}")
1900 |
1901 | # Start a thread to handle file deletion
1902 | cleanup_thread = threading.Thread(target=delete_temp_file)
1903 | cleanup_thread.daemon = True
1904 | cleanup_thread.start()
1905 |
1906 | # Send the persistent file with improved headers and explicit attachment
1907 | response = send_file(
1908 | persistent_path,
1909 | mimetype='application/zip',
1910 | as_attachment=True,
1911 | download_name=filename
1912 | )
1913 |
1914 | # Add headers to prevent caching issues
1915 | response.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
1916 | response.headers['Pragma'] = 'no-cache'
1917 | response.headers['Expires'] = '0'
1918 | response.headers['Content-Disposition'] = f'attachment; filename="{filename}"'
1919 |
1920 | # Note: We're no longer using after_this_request to remove the file immediately
1921 | # Instead, we're using a background thread to clean up after 30 minutes
1922 |
1923 | return response
1924 |
1925 | except Exception as e:
1926 | print(f"Error creating or sending zip file: {str(e)}")
1927 | traceback.print_exc()
1928 | return jsonify({'error': f'Failed to create or send zip file: {str(e)}'}), 500
1929 | except Exception as e:
1930 | print(f"Error in asset extraction: {str(e)}")
1931 | traceback.print_exc()
1932 | return jsonify({'error': f'Error extracting assets: {str(e)}'}), 500
1933 |
1934 | except Exception as e:
1935 | print(f"Unexpected error: {str(e)}")
1936 | traceback.print_exc()
1937 | return jsonify({'error': str(e)}), 500
1938 |
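# Rough request sketch for the /extract endpoint above (illustrative, form-encoded POST):
#   curl -X POST http://127.0.0.1:5001/extract \
#        -d 'url=example.com' -d 'use_selenium=false' -o example_com.zip
# A successful call streams back the generated zip; failures return JSON with an 'error' key.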
1939 | def main():
1940 | """Entry point for the package, to allow running as an installed package from command line"""
1941 | print("\n" + "="*80)
1942 | print("Website Extractor is running!")
1943 | print("Access it in your browser at: http://127.0.0.1:5001")
1944 | print("="*80 + "\n")
1945 | app.run(debug=True, threaded=True, port=5001)
1946 |
1947 | if __name__ == '__main__':
1948 | main()
--------------------------------------------------------------------------------