├── Dockerfile ├── README.md ├── api_management.py ├── assets.py ├── chromedriver-win64 ├── LICENSE.chromedriver ├── THIRD_PARTY_NOTICES.chromedriver └── chromedriver.exe ├── pagination_detector.py ├── requirements.txt ├── scraper.py ├── streamlit_app.py └── test.py /Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image with Python 3.11.4 2 | FROM python:3.11.4-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /app 6 | 7 | # Copy the requirements file and install dependencies 8 | COPY requirements.txt ./ 9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | # Install Chrome Browser and additional dependencies 12 | RUN apt-get update && apt-get install -y \ 13 | wget \ 14 | gnupg \ 15 | unzip \ 16 | libappindicator3-1 \ 17 | libasound2 \ 18 | libatk-bridge2.0-0 \ 19 | libatk1.0-0 \ 20 | libcups2 \ 21 | libdrm2 \ 22 | libgbm1 \ 23 | libnspr4 \ 24 | libnss3 \ 25 | libxcomposite1 \ 26 | libxdamage1 \ 27 | libxrandr2 \ 28 | fonts-liberation \ 29 | xdg-utils \ 30 | --no-install-recommends && \ 31 | wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \ 32 | sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \ 33 | apt-get update && apt-get install -y google-chrome-stable && \ 34 | rm -rf /var/lib/apt/lists/* 35 | 36 | # Install WebDriver Manager to manage ChromeDriver 37 | RUN pip install webdriver-manager 38 | 39 | # Set environment variables for Chrome in headless mode 40 | ENV CHROME_BIN=/usr/bin/google-chrome 41 | ENV CHROME_DRIVER=/usr/local/bin/chromedriver 42 | 43 | # Copy the entire project into the container 44 | COPY . . 45 | 46 | # Expose the port that Streamlit will use 47 | EXPOSE 8501 48 | 49 | # Clean up any unnecessary packages after installation 50 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* 51 | 52 | # Ensure all packages are updated to their latest versions 53 | RUN apt-get update && apt-get dist-upgrade -y 54 | 55 | 56 | # Command to run the Streamlit app 57 | CMD ["streamlit", "run", "streamlit_app.py"] 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ScrapeMaster 2 | 3 | ScrapeMaster is a Streamlit-based web scraping application designed to simplify the process of extracting data from web pages. It allows users to specify URLs and data fields interactively, facilitating the extraction and manipulation of web data. 4 | 5 | ## Features 6 | 7 | - Easy-to-use web interface. 8 | - Custom field specification for data extraction. 9 | - Pagination 10 | - Dynamic data processing with Python and Streamlit. 11 | - Direct download capabilities for extracted data in various formats. 
12 | - Attended mode: open a real browser window, log in or navigate manually, then resume scraping on the current page. 13 | 14 | ## Prerequisites 15 | 16 | Before you begin, ensure you have the following installed: 17 | - Python 3.9 or higher (the project's Dockerfile uses Python 3.11) 18 | - Pip for managing Python packages 19 | 20 | ## Installation 21 | 22 | Follow these steps to get your development environment running: 23 | 24 | ```bash 25 | # Clone the repository 26 | git clone https://github.com/reda-marzouk608/scrape-master 27 | cd scrape-master 28 | 29 | # It's recommended to create a virtual environment 30 | python -m venv venv 31 | # Activate the virtual environment 32 | # On Windows 33 | venv\Scripts\activate 34 | # On macOS/Linux 35 | source venv/bin/activate 36 | 37 | # Install the required packages 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | ## Launching the Application 42 | 43 | To run ScrapeMaster, navigate to the project directory and run the following command: 44 | 45 | ```bash 46 | streamlit run streamlit_app.py 47 | ``` 48 | 49 | 50 | ## Usage 51 | After launching the application, open your web browser to the indicated address (typically http://localhost:8501). Use the sidebar to enter the URL(s) and the fields you wish to extract, then click the "LAUNCH SCRAPER" button to see the results. -------------------------------------------------------------------------------- /api_management.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import os 3 | 4 | def get_api_key(api_key_name): 5 | # Use the API key from the sidebar if present, else fall back to the .env file; .get() avoids a KeyError when the key is not in the session state 6 | if api_key_name == 'OPENAI_API_KEY': 7 | return st.session_state.get('openai_api_key') or os.getenv(api_key_name) 8 | elif api_key_name == 'GOOGLE_API_KEY': 9 | return st.session_state.get('gemini_api_key') or os.getenv(api_key_name) 10 | elif api_key_name == 'GROQ_API_KEY': 11 | return st.session_state.get('groq_api_key') or os.getenv(api_key_name) 12 | else: 13 | return os.getenv(api_key_name) 14 | -------------------------------------------------------------------------------- /assets.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains configuration variables and constants 3 | that are used across different parts of the application.
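It includes the rotating user-agent list, per-model token pricing, Selenium/Chrome option sets, timeout settings, and the LLM prompt templates (system, user, and pagination prompts).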
4 | """ 5 | 6 | # List of user agents to mimic different users 7 | USER_AGENTS = [ 8 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", 9 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0", 11 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36", 12 | "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", 13 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36", 14 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36", 15 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15", 16 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36", 17 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", 18 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0", 19 | "Mozilla/5.0 (X11; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0", 20 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36", 21 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36", 22 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36", 23 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36", 24 | "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0", 25 | "Mozilla/5.0 (iPhone; CPU iPhone OS 13_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Mobile/15E148 Safari/604.1", 26 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15", 27 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", 28 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", 29 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0", 30 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36", 31 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15", 32 | "Mozilla/5.0 (iPhone; CPU iPhone OS 13_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Mobile/15E148 Safari/604.1", 33 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0", 34 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36", 35 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", 36 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15", 37 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) 
Gecko/20100101 Firefox/87.0", 38 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36", 39 | "Mozilla/5.0 (iPhone; CPU iPhone OS 14_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", 40 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", 41 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0", 42 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", 43 | "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", 44 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36", 45 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36", 46 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0", 47 | "Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", 48 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36", 49 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36", 50 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0", 51 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15", 52 | "Mozilla/5.0 (X11; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0", 53 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36", 54 | "Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Mobile/15E148 Safari/604.1", 55 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36", 56 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15", 57 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36", 58 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", 59 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36", 60 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", 61 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36" 62 | ] 63 | 64 | 65 | # Define the pricing for models without Batch API 66 | PRICING = { 67 | "gpt-4o-mini": { 68 | "input": 0.150 / 1_000_000, # $0.150 per 1M input tokens 69 | "output": 0.600 / 1_000_000, # $0.600 per 1M output tokens 70 | }, 71 | "gpt-4o-2024-08-06": { 72 | "input": 2.5 / 1_000_000, # $2.5 per 1M input tokens 73 | "output": 10 / 1_000_000, # $10 per 1M output tokens 74 | }, 75 | "gemini-1.5-flash": { 76 | "input": 0.075 / 1_000_000, # $0.075 per 1M input tokens 77 | "output": 0.30 / 1_000_000, # $0.30 per 1M output tokens 78 | }, 79 | "Llama3.1 8B": { 80 | "input": 0 , # Free 81 | "output": 
0 , # Free 82 | }, 83 | "Groq Llama3.1 70b": { 84 | "input": 0 , # Free 85 | "output": 0 , # Free 86 | }, 87 | # Add other models and their prices here if needed 88 | } 89 | 90 | # Timeout settings for web scraping 91 | TIMEOUT_SETTINGS = { 92 | "page_load": 30, 93 | "script": 10 94 | } 95 | 96 | # Other reusable constants or configuration settings 97 | HEADLESS_OPTIONS = ["--disable-gpu", "--disable-dev-shm-usage","--window-size=1920,1080","--disable-search-engine-choice-screen","--disable-blink-features=AutomationControlled"] 98 | 99 | 100 | HEADLESS_OPTIONS_DOCKER = ["--headless=new","--no-sandbox","--disable-gpu", "--disable-dev-shm-usage","--disable-software-rasterizer","--disable-setuid-sandbox","--remote-debugging-port=9222","--disable-search-engine-choice-screen"] 101 | # In case you don't need to open the website window, enable headless mode: 102 | ##HEADLESS_OPTIONS=HEADLESS_OPTIONS+[ "--headless=new"] 103 | 104 | #number of scrolls 105 | NUMBER_SCROLL=2 106 | 107 | 108 | LLAMA_MODEL_FULLNAME="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF" 109 | GROQ_LLAMA_MODEL_FULLNAME="llama-3.1-70b-versatile" 110 | 111 | SYSTEM_MESSAGE = """You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 112 | from the given text and convert it into a pure JSON format. The JSON should contain only the structured data extracted from the text, 113 | with no additional commentary, explanations, or extraneous information. 114 | You may encounter cases where you can't find the data for the fields you have to extract, or the data may be in a foreign language. 115 | Please process the following text and provide the output in pure JSON format with no words before or after the JSON:""" 116 | 117 | USER_MESSAGE = "Extract the following information from the provided text:\nPage content:\n\n" 118 | 119 | 120 | 121 | 122 | 123 | PROMPT_PAGINATION = """ 124 | You are an assistant that extracts pagination elements from the markdown content of websites. Your goal is to act as a universal pagination scraper, able to extract pagination URLs from any website no matter how different they are. 125 | 126 | Please extract the following: 127 | 128 | - The URL of the 'Next', 'More', 'See more', 'Load more' or any other button indicating how to access the next page, if any. It should be a single URL and no more; if there are multiple URLs with the same structure, leave this empty. 129 | 130 | - A list of page URLs for pagination. It should be a pattern of similar URLs with pages that are numbered; if you detect this pattern and the numbers start from a certain low number up to a large number, generate the rest of the URLs even if they're not included, 131 | your goal here is to give as many URLs as possible for the user to choose from for further scraping. You will have to deal with very different websites that can potentially contain many URLs of images and other elements; 132 | detect only the URLs that clearly define a pattern for showing data on multiple pages. Sometimes only a part of these URLs is present and you have to combine it with the initial URL, which will be provided at the end of this prompt. 133 | 134 | - The user can give you indications on how the pagination works for the specific website at the end of this prompt. If those indications are not empty, pay special attention to them, as they will directly help you understand the structure and the number of pages to generate.
135 | 136 | Provide the output as a JSON object with the following structure: 137 | 138 | { 139 | "page_urls": ["url1", "url2", "url3",...,"urlN"] 140 | } 141 | 142 | Do not include any additional text or explanations. 143 | """ 144 | -------------------------------------------------------------------------------- /chromedriver-win64/LICENSE.chromedriver: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions are 5 | // met: 6 | // 7 | // * Redistributions of source code must retain the above copyright 8 | // notice, this list of conditions and the following disclaimer. 9 | // * Redistributions in binary form must reproduce the above 10 | // copyright notice, this list of conditions and the following disclaimer 11 | // in the documentation and/or other materials provided with the 12 | // distribution. 13 | // * Neither the name of Google LLC nor the names of its 14 | // contributors may be used to endorse or promote products derived from 15 | // this software without specific prior written permission. 16 | // 17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /chromedriver-win64/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reda-marzouk608/scrape-master/177fe5e19fe7b615c6e9303c341653b8f866b11c/chromedriver-win64/chromedriver.exe -------------------------------------------------------------------------------- /pagination_detector.py: -------------------------------------------------------------------------------- 1 | # pagination_detector.py 2 | 3 | import os 4 | import json 5 | from typing import List, Dict, Tuple, Union 6 | from pydantic import BaseModel, Field, ValidationError 7 | 8 | import tiktoken 9 | from dotenv import load_dotenv 10 | 11 | from openai import OpenAI 12 | import google.generativeai as genai 13 | from groq import Groq 14 | 15 | 16 | from api_management import get_api_key 17 | from assets import PROMPT_PAGINATION, PRICING, LLAMA_MODEL_FULLNAME, GROQ_LLAMA_MODEL_FULLNAME 18 | 19 | load_dotenv() 20 | import logging 21 | 22 | class PaginationData(BaseModel): 23 | page_urls: List[str] = Field(default_factory=list, description="List of pagination URLs, including 'Next' button URL if present") 24 | 25 | def calculate_pagination_price(token_counts: Dict[str, int], model: str) -> float: 26 | """ 27 | Calculate the price for pagination based on token counts and the selected model. 
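The total is computed as input_tokens * input_rate + output_tokens * output_rate, using the per-token rates defined in the PRICING table in assets.py.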
28 | 29 | Args: 30 | token_counts (Dict[str, int]): A dictionary containing 'input_tokens' and 'output_tokens'. 31 | model (str): The name of the selected model. 32 | 33 | Returns: 34 | float: The total price for the pagination operation. 35 | """ 36 | input_tokens = token_counts['input_tokens'] 37 | output_tokens = token_counts['output_tokens'] 38 | 39 | input_price = input_tokens * PRICING[model]['input'] 40 | output_price = output_tokens * PRICING[model]['output'] 41 | 42 | return input_price + output_price 43 | 44 | def detect_pagination_elements(url: str, indications: str, selected_model: str, markdown_content: str) -> Tuple[Union[PaginationData, Dict, str], Dict, float]: 45 | """ 46 | Uses AI models to analyze markdown content and extract pagination elements. 47 | 48 | Args: 49 | selected_model (str): The name of the model to use. 50 | markdown_content (str): The markdown content to analyze. 51 | 52 | Returns: 53 | Tuple[PaginationData, Dict, float]: Parsed pagination data, token counts, and pagination price. 54 | """ 55 | try: 56 | prompt_pagination = PROMPT_PAGINATION + "\nThe URL of the page to extract pagination from is: " + url + ". If the URLs you find are not complete, combine them intelligently with this base URL so that they fit the pattern. **ALWAYS GIVE A FULL URL**" 57 | if indications != "": 58 | prompt_pagination += "\n\nThese are the user's indications; pay special attention to them: " + indications + "\n\nBelow is the markdown of the website: \n\n" 59 | else: 60 | prompt_pagination += "\nThere are no user indications in this case; just apply the logic described. \n\nBelow is the markdown of the website: \n\n" 61 | 62 | if selected_model in ["gpt-4o-mini", "gpt-4o-2024-08-06"]: 63 | # Use OpenAI API 64 | client = OpenAI(api_key=get_api_key('OPENAI_API_KEY')) 65 | completion = client.beta.chat.completions.parse( 66 | model=selected_model, 67 | messages=[ 68 | {"role": "system", "content": prompt_pagination}, 69 | {"role": "user", "content": markdown_content}, 70 | ], 71 | response_format=PaginationData 72 | ) 73 | 74 | # Extract the parsed response 75 | parsed_response = completion.choices[0].message.parsed 76 | 77 | # Calculate tokens using tiktoken 78 | encoder = tiktoken.encoding_for_model(selected_model) 79 | input_token_count = len(encoder.encode(markdown_content)) 80 | output_token_count = len(encoder.encode(json.dumps(parsed_response.dict()))) 81 | token_counts = { 82 | "input_tokens": input_token_count, 83 | "output_tokens": output_token_count 84 | } 85 | 86 | # Calculate the price 87 | pagination_price = calculate_pagination_price(token_counts, selected_model) 88 | 89 | return parsed_response, token_counts, pagination_price 90 | 91 | elif selected_model == "gemini-1.5-flash": 92 | # Use Google Gemini API 93 | genai.configure(api_key=get_api_key("GOOGLE_API_KEY")) 94 | model = genai.GenerativeModel( 95 | 'gemini-1.5-flash', 96 | generation_config={ 97 | "response_mime_type": "application/json", 98 | "response_schema": PaginationData 99 | } 100 | ) 101 | prompt = f"{prompt_pagination}\n{markdown_content}" 102 | # Count input tokens using Gemini's method 103 | input_tokens = model.count_tokens(prompt) 104 | completion = model.generate_content(prompt) 105 | # Extract token counts from usage_metadata 106 | usage_metadata = completion.usage_metadata 107 | token_counts = { 108 | "input_tokens": usage_metadata.prompt_token_count, 109 | "output_tokens": usage_metadata.candidates_token_count 110 | } 111 | # Get the result 112 | response_content = completion.text 113 | 114 | # Log the response content and its type 115 | logging.info(f"Gemini Flash response type: {type(response_content)}") 116 | logging.info(f"Gemini Flash response content: {response_content}") 117 | 118 | # Try to parse the response as JSON 119 | try: 120 | parsed_data = json.loads(response_content) 121 | if isinstance(parsed_data, dict) and 'page_urls' in parsed_data: 122 | pagination_data = PaginationData(**parsed_data) 123 | else: 124 | pagination_data = PaginationData(page_urls=[]) 125 | except json.JSONDecodeError: 126 | logging.error("Failed to parse Gemini Flash response as JSON") 127 | pagination_data = PaginationData(page_urls=[]) 128 | 129 | # Calculate the price 130 | pagination_price = calculate_pagination_price(token_counts, selected_model) 131 | 132 | return pagination_data, token_counts, pagination_price 133 | 134 | elif selected_model == "Llama3.1 8B": 135 | # Use Llama model via the OpenAI-compatible API of a local server 136 | # (the modern OpenAI client is used here, mirroring the approach in scraper.py) 137 | client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio") 138 | response = client.chat.completions.create( 139 | model=LLAMA_MODEL_FULLNAME, 140 | messages=[ 141 | {"role": "system", "content": prompt_pagination}, 142 | {"role": "user", "content": markdown_content}, 143 | ], 144 | temperature=0.7, 145 | ) 146 | response_content = response.choices[0].message.content.strip() 147 | # Try to parse the JSON 148 | try: 149 | pagination_data = json.loads(response_content) 150 | except json.JSONDecodeError: 151 | pagination_data = {"page_urls": []} 152 | # Token counts 153 | token_counts = { 154 | "input_tokens": response.usage.prompt_tokens, 155 | "output_tokens": response.usage.completion_tokens 156 | } 157 | # Calculate the price 158 | pagination_price = calculate_pagination_price(token_counts, selected_model) 159 | 160 | return pagination_data, token_counts, pagination_price 161 | 162 | elif selected_model == "Groq Llama3.1 70b": 163 | # Use Groq client 164 | client = Groq(api_key=get_api_key("GROQ_API_KEY")) 165 | response = client.chat.completions.create( 166 | model=GROQ_LLAMA_MODEL_FULLNAME, 167 | messages=[ 168 | {"role": "system", "content": prompt_pagination}, 169 | {"role": "user", "content": markdown_content}, 170 | ], 171 | ) 172 | response_content = response.choices[0].message.content.strip() 173 | # Try to parse the JSON 174 | try: 175 | pagination_data = json.loads(response_content) 176 | except json.JSONDecodeError: 177 | pagination_data = {"page_urls": []} 178 | # Token counts 179 | token_counts = { 180 | "input_tokens": response.usage.prompt_tokens, 181 | "output_tokens": response.usage.completion_tokens 182 | } 183 | # Calculate the price 184 | pagination_price = calculate_pagination_price(token_counts, selected_model) 185 | 186 | '''# Ensure the pagination_data is a dictionary 187 | if isinstance(pagination_data, PaginationData): 188 | pagination_data = pagination_data.model_dump() 189 | elif not isinstance(pagination_data, dict): 190 | pagination_data = {"page_urls": []}''' 191 | 192 | return pagination_data, token_counts, pagination_price 193 | 194 | else: 195 | raise ValueError(f"Unsupported model: {selected_model}") 196 | 197 | except Exception as e: 198 | logging.error(f"An error occurred in detect_pagination_elements: {e}") 199 | # Return default values if an error occurs 200 | return PaginationData(page_urls=[]), {"input_tokens": 0, "output_tokens": 0}, 0.0 201 | 202 | 203 | if __name__ == "__main__": 204 | 205 | url="""https://scrapeme.live/shop/"""
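# NOTE: the markdown file path below is machine-specific; point it at any rawData_*.md file produced by a previous run before executing this standalone test.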
206 | # Define the path to your markdown file 207 | markdown_file_path = r"C:\Users\redam\Documents\VSCode\ScrapeMaster2.0\output\scrapeme_live_2024_09_24__00_33_20\rawData_1.md" 208 | 209 | # Read the markdown content from the file 210 | with open(markdown_file_path, 'r', encoding='utf-8') as f: 211 | markdown_content = f.read() 212 | 213 | # Specify the model you want to use 214 | selected_model = 'gemini-1.5-flash' # Replace with your desired model 215 | 216 | # Call the detect_pagination_elements function 217 | pagination_data, token_counts, pagination_price = detect_pagination_elements(url,"",selected_model, markdown_content) 218 | 219 | print("Page URLs:", pagination_data.page_urls) 220 | print("Pagination Price:", pagination_price) 221 | 222 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | python-dotenv 3 | pandas 4 | pydantic 5 | requests 6 | beautifulsoup4 7 | html2text 8 | tiktoken 9 | selenium 10 | readability-lxml 11 | streamlit 12 | streamlit-tags 13 | openpyxl 14 | groq 15 | google-generativeai 16 | webdriver-manager 17 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import time 4 | import re 5 | import json 6 | from datetime import datetime 7 | from typing import List, Dict, Type 8 | 9 | import pandas as pd 10 | from bs4 import BeautifulSoup 11 | from pydantic import BaseModel, Field, create_model 12 | import html2text 13 | import tiktoken 14 | import streamlit as st 15 | 16 | from dotenv import load_dotenv 17 | from selenium import webdriver 18 | from selenium.webdriver.chrome.service import Service 19 | from selenium.webdriver.chrome.options import Options 20 | from selenium.webdriver.common.by import By 21 | from selenium.webdriver.common.action_chains import ActionChains 22 | from selenium.webdriver.support.ui import WebDriverWait 23 | from selenium.webdriver.support import expected_conditions as EC 24 | from webdriver_manager.chrome import ChromeDriverManager 25 | 26 | 27 | from openai import OpenAI 28 | import google.generativeai as genai 29 | from groq import Groq 30 | 31 | from api_management import get_api_key 32 | from assets import USER_AGENTS,PRICING,HEADLESS_OPTIONS,SYSTEM_MESSAGE,USER_MESSAGE,LLAMA_MODEL_FULLNAME,GROQ_LLAMA_MODEL_FULLNAME,HEADLESS_OPTIONS_DOCKER 33 | load_dotenv() 34 | 35 | 36 | # Set up the Chrome WebDriver options 37 | 38 | 39 | def is_running_in_docker(): 40 | """ 41 | Detect if the app is running inside a Docker container. 42 | This checks if the '/proc/1/cgroup' file contains 'docker'. 
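Note: on hosts that use cgroup v2, /proc/1/cgroup may not mention 'docker' even inside a container, in which case the non-Docker options are applied.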
43 | """ 44 | try: 45 | with open("/proc/1/cgroup", "rt") as file: 46 | return "docker" in file.read() 47 | except Exception: 48 | return False 49 | 50 | def setup_selenium(attended_mode=False): 51 | options = Options() 52 | service = Service(ChromeDriverManager().install()) 53 | 54 | # Apply headless options based on whether the code is running in Docker 55 | if is_running_in_docker(): 56 | # Running inside Docker, use Docker-specific headless options 57 | for option in HEADLESS_OPTIONS_DOCKER: 58 | options.add_argument(option) 59 | else: 60 | # Not running inside Docker, use the normal headless options 61 | for option in HEADLESS_OPTIONS: 62 | options.add_argument(option) 63 | 64 | # Initialize the WebDriver 65 | driver = webdriver.Chrome(service=service, options=options) 66 | return driver 67 | 68 | 69 | 70 | 71 | def fetch_html_selenium(url, attended_mode=False, driver=None): 72 | if driver is None: 73 | driver = setup_selenium(attended_mode) 74 | should_quit = True 75 | if not attended_mode: 76 | driver.get(url) 77 | else: 78 | should_quit = False 79 | # Do not navigate to the URL if in attended mode and driver is already initialized 80 | if not attended_mode: 81 | driver.get(url) 82 | 83 | try: 84 | if not attended_mode: 85 | # Add more realistic actions like scrolling 86 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);") 87 | time.sleep(random.uniform(1.1, 1.8)) 88 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight/1.2);") 89 | time.sleep(random.uniform(1.1, 1.8)) 90 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight/1);") 91 | time.sleep(random.uniform(1.1, 1.8)) 92 | # Get the page source from the current page 93 | html = driver.page_source 94 | return html 95 | finally: 96 | if should_quit: 97 | driver.quit() 98 | 99 | 100 | 101 | 102 | def clean_html(html_content): 103 | soup = BeautifulSoup(html_content, 'html.parser') 104 | 105 | # Remove headers and footers based on common HTML tags or classes 106 | for element in soup.find_all(['header', 'footer']): 107 | element.decompose() # Remove these tags and their content 108 | 109 | return str(soup) 110 | 111 | 112 | def html_to_markdown_with_readability(html_content): 113 | 114 | 115 | cleaned_html = clean_html(html_content) 116 | 117 | # Convert to markdown 118 | markdown_converter = html2text.HTML2Text() 119 | markdown_converter.ignore_links = False 120 | markdown_content = markdown_converter.handle(cleaned_html) 121 | 122 | return markdown_content 123 | 124 | 125 | 126 | def save_raw_data(raw_data: str, output_folder: str, file_name: str): 127 | """Save raw markdown data to the specified output folder.""" 128 | os.makedirs(output_folder, exist_ok=True) 129 | raw_output_path = os.path.join(output_folder, file_name) 130 | with open(raw_output_path, 'w', encoding='utf-8') as f: 131 | f.write(raw_data) 132 | print(f"Raw data saved to {raw_output_path}") 133 | return raw_output_path 134 | 135 | 136 | def create_dynamic_listing_model(field_names: List[str]) -> Type[BaseModel]: 137 | """ 138 | Dynamically creates a Pydantic model based on provided fields. 139 | field_name is a list of names of the fields to extract from the markdown. 140 | """ 141 | # Create field definitions using aliases for Field parameters 142 | field_definitions = {field: (str, ...) 
for field in field_names} 143 | # Dynamically create the model with all field 144 | return create_model('DynamicListingModel', **field_definitions) 145 | 146 | 147 | def create_listings_container_model(listing_model: Type[BaseModel]) -> Type[BaseModel]: 148 | """ 149 | Create a container model that holds a list of the given listing model. 150 | """ 151 | return create_model('DynamicListingsContainer', listings=(List[listing_model], ...)) 152 | 153 | 154 | 155 | 156 | def trim_to_token_limit(text, model, max_tokens=120000): 157 | encoder = tiktoken.encoding_for_model(model) 158 | tokens = encoder.encode(text) 159 | if len(tokens) > max_tokens: 160 | trimmed_text = encoder.decode(tokens[:max_tokens]) 161 | return trimmed_text 162 | return text 163 | 164 | def generate_system_message(listing_model: BaseModel) -> str: 165 | """ 166 | Dynamically generate a system message based on the fields in the provided listing model. 167 | """ 168 | # Use the model_json_schema() method to introspect the Pydantic model 169 | schema_info = listing_model.model_json_schema() 170 | 171 | # Extract field descriptions from the schema 172 | field_descriptions = [] 173 | for field_name, field_info in schema_info["properties"].items(): 174 | # Get the field type from the schema info 175 | field_type = field_info["type"] 176 | field_descriptions.append(f'"{field_name}": "{field_type}"') 177 | 178 | # Create the JSON schema structure for the listings 179 | schema_structure = ",\n".join(field_descriptions) 180 | 181 | # Generate the system message dynamically 182 | system_message = f""" 183 | You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 184 | from the given text and convert it into a pure JSON format. The JSON should contain only the structured data extracted from the text, 185 | with no additional commentary, explanations, or extraneous information. 186 | You could encounter cases where you can't find the data of the fields you have to extract or the data will be in a foreign language. 
187 | Please process the following text and provide the output in pure JSON format with no words before or after the JSON: 188 | Please ensure the output strictly follows this schema: 189 | 190 | {{ 191 | "listings": [ 192 | {{ 193 | {schema_structure} 194 | }} 195 | ] 196 | }} """ 197 | 198 | return system_message 199 | 200 | 201 | 202 | def format_data(data, DynamicListingsContainer, DynamicListingModel, selected_model): 203 | token_counts = {} 204 | 205 | if selected_model in ["gpt-4o-mini", "gpt-4o-2024-08-06"]: 206 | # Use OpenAI API 207 | client = OpenAI(api_key=get_api_key('OPENAI_API_KEY')) 208 | completion = client.beta.chat.completions.parse( 209 | model=selected_model, 210 | messages=[ 211 | {"role": "system", "content": SYSTEM_MESSAGE}, 212 | {"role": "user", "content": USER_MESSAGE + data}, 213 | ], 214 | response_format=DynamicListingsContainer 215 | ) 216 | # Calculate tokens using tiktoken 217 | encoder = tiktoken.encoding_for_model(selected_model) 218 | input_token_count = len(encoder.encode(USER_MESSAGE + data)) 219 | output_token_count = len(encoder.encode(json.dumps(completion.choices[0].message.parsed.dict()))) 220 | token_counts = { 221 | "input_tokens": input_token_count, 222 | "output_tokens": output_token_count 223 | } 224 | return completion.choices[0].message.parsed, token_counts 225 | 226 | elif selected_model == "gemini-1.5-flash": 227 | # Use Google Gemini API 228 | genai.configure(api_key=get_api_key("GOOGLE_API_KEY")) 229 | model = genai.GenerativeModel('gemini-1.5-flash', 230 | generation_config={ 231 | "response_mime_type": "application/json", 232 | "response_schema": DynamicListingsContainer 233 | }) 234 | prompt = SYSTEM_MESSAGE + "\n" + USER_MESSAGE + data 235 | # Count input tokens using Gemini's method 236 | input_tokens = model.count_tokens(prompt) 237 | completion = model.generate_content(prompt) 238 | # Extract token counts from usage_metadata 239 | usage_metadata = completion.usage_metadata 240 | token_counts = { 241 | "input_tokens": usage_metadata.prompt_token_count, 242 | "output_tokens": usage_metadata.candidates_token_count 243 | } 244 | return completion.text, token_counts 245 | 246 | elif selected_model == "Llama3.1 8B": 247 | 248 | # Dynamically generate the system message based on the schema 249 | sys_message = generate_system_message(DynamicListingModel) 250 | # print(SYSTEM_MESSAGE) 251 | # Point to the local server 252 | client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio") 253 | 254 | completion = client.chat.completions.create( 255 | model=LLAMA_MODEL_FULLNAME, #change this if needed (use a better model) 256 | messages=[ 257 | {"role": "system", "content": sys_message}, 258 | {"role": "user", "content": USER_MESSAGE + data} 259 | ], 260 | temperature=0.7, 261 | 262 | ) 263 | 264 | # Extract the content from the response 265 | response_content = completion.choices[0].message.content 266 | print(response_content) 267 | # Convert the content from JSON string to a Python dictionary 268 | parsed_response = json.loads(response_content) 269 | 270 | # Extract token usage 271 | token_counts = { 272 | "input_tokens": completion.usage.prompt_tokens, 273 | "output_tokens": completion.usage.completion_tokens 274 | } 275 | 276 | return parsed_response, token_counts 277 | elif selected_model== "Groq Llama3.1 70b": 278 | 279 | # Dynamically generate the system message based on the schema 280 | sys_message = generate_system_message(DynamicListingModel) 281 | # print(SYSTEM_MESSAGE) 282 | # Point to the local server 283 | client = 
Groq(api_key=get_api_key("GROQ_API_KEY"),) 284 | 285 | completion = client.chat.completions.create( 286 | messages=[ 287 | {"role": "system","content": sys_message}, 288 | {"role": "user","content": USER_MESSAGE + data} 289 | ], 290 | model=GROQ_LLAMA_MODEL_FULLNAME, 291 | ) 292 | 293 | # Extract the content from the response 294 | response_content = completion.choices[0].message.content 295 | 296 | # Convert the content from JSON string to a Python dictionary 297 | parsed_response = json.loads(response_content) 298 | 299 | # completion.usage 300 | token_counts = { 301 | "input_tokens": completion.usage.prompt_tokens, 302 | "output_tokens": completion.usage.completion_tokens 303 | } 304 | 305 | return parsed_response, token_counts 306 | else: 307 | raise ValueError(f"Unsupported model: {selected_model}") 308 | 309 | 310 | 311 | def save_formatted_data(formatted_data, output_folder: str, json_file_name: str, excel_file_name: str): 312 | """Save formatted data as JSON and Excel in the specified output folder.""" 313 | os.makedirs(output_folder, exist_ok=True) 314 | 315 | # Parse the formatted data if it's a JSON string (from Gemini API) 316 | if isinstance(formatted_data, str): 317 | try: 318 | formatted_data_dict = json.loads(formatted_data) 319 | except json.JSONDecodeError: 320 | raise ValueError("The provided formatted data is a string but not valid JSON.") 321 | else: 322 | # Handle data from OpenAI or other sources 323 | formatted_data_dict = formatted_data.dict() if hasattr(formatted_data, 'dict') else formatted_data 324 | 325 | # Save the formatted data as JSON 326 | json_output_path = os.path.join(output_folder, json_file_name) 327 | with open(json_output_path, 'w', encoding='utf-8') as f: 328 | json.dump(formatted_data_dict, f, indent=4) 329 | print(f"Formatted data saved to JSON at {json_output_path}") 330 | 331 | # Prepare data for DataFrame 332 | if isinstance(formatted_data_dict, dict): 333 | # If the data is a dictionary containing lists, assume these lists are records 334 | data_for_df = next(iter(formatted_data_dict.values())) if len(formatted_data_dict) == 1 else formatted_data_dict 335 | elif isinstance(formatted_data_dict, list): 336 | data_for_df = formatted_data_dict 337 | else: 338 | raise ValueError("Formatted data is neither a dictionary nor a list, cannot convert to DataFrame") 339 | 340 | # Create DataFrame 341 | try: 342 | df = pd.DataFrame(data_for_df) 343 | print("DataFrame created successfully.") 344 | 345 | # Save the DataFrame to an Excel file 346 | excel_output_path = os.path.join(output_folder, excel_file_name) 347 | df.to_excel(excel_output_path, index=False) 348 | print(f"Formatted data saved to Excel at {excel_output_path}") 349 | 350 | return df 351 | except Exception as e: 352 | print(f"Error creating DataFrame or saving Excel: {str(e)}") 353 | return None 354 | 355 | def calculate_price(token_counts, model): 356 | input_token_count = token_counts.get("input_tokens", 0) 357 | output_token_count = token_counts.get("output_tokens", 0) 358 | 359 | # Calculate the costs 360 | input_cost = input_token_count * PRICING[model]["input"] 361 | output_cost = output_token_count * PRICING[model]["output"] 362 | total_cost = input_cost + output_cost 363 | 364 | return input_token_count, output_token_count, total_cost 365 | 366 | 367 | def generate_unique_folder_name(url): 368 | timestamp = datetime.now().strftime('%Y_%m_%d__%H_%M_%S') 369 | url_name = re.sub(r'\W+', '_', url.split('//')[1].split('/')[0]) # Extract domain name and replace non-alphanumeric characters 
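# NOTE: this assumes the URL contains '//' (e.g. 'https://...'); a bare domain without a scheme would raise an IndexError here.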
370 | return f"{url_name}_{timestamp}" 371 | 372 | 373 | def scrape_url(url: str, fields: List[str], selected_model: str, output_folder: str, file_number: int, markdown: str): 374 | """Scrape a single URL and save the results.""" 375 | try: 376 | # Save raw data 377 | save_raw_data(markdown, output_folder, f'rawData_{file_number}.md') 378 | 379 | # Create the dynamic listing model 380 | DynamicListingModel = create_dynamic_listing_model(fields) 381 | 382 | # Create the container model that holds a list of the dynamic listing models 383 | DynamicListingsContainer = create_listings_container_model(DynamicListingModel) 384 | 385 | # Format data 386 | formatted_data, token_counts = format_data(markdown, DynamicListingsContainer, DynamicListingModel, selected_model) 387 | 388 | # Save formatted data 389 | save_formatted_data(formatted_data, output_folder, f'sorted_data_{file_number}.json', f'sorted_data_{file_number}.xlsx') 390 | 391 | # Calculate and return token usage and cost 392 | input_tokens, output_tokens, total_cost = calculate_price(token_counts, selected_model) 393 | return input_tokens, output_tokens, total_cost, formatted_data 394 | 395 | except Exception as e: 396 | print(f"An error occurred while processing {url}: {e}") 397 | return 0, 0, 0, None 398 | 399 | # Remove the main execution block if it's not needed for testing purposes 400 | -------------------------------------------------------------------------------- /streamlit_app.py: -------------------------------------------------------------------------------- 1 | # streamlit_app.py 2 | 3 | import streamlit as st 4 | from streamlit_tags import st_tags_sidebar 5 | import pandas as pd 6 | import json 7 | from datetime import datetime 8 | from scraper import ( 9 | fetch_html_selenium, 10 | save_raw_data, 11 | format_data, 12 | save_formatted_data, 13 | calculate_price, 14 | html_to_markdown_with_readability, 15 | create_dynamic_listing_model, 16 | create_listings_container_model, 17 | scrape_url, 18 | setup_selenium, 19 | generate_unique_folder_name 20 | ) 21 | from pagination_detector import detect_pagination_elements 22 | import re 23 | from urllib.parse import urlparse 24 | from assets import PRICING 25 | import os 26 | 27 | # Initialize Streamlit app 28 | st.set_page_config(page_title="Universal Web Scraper", page_icon="🦑") 29 | st.title("Universal Web Scraper 🦑") 30 | 31 | # Initialize session state variables 32 | if 'scraping_state' not in st.session_state: 33 | st.session_state['scraping_state'] = 'idle' # Possible states: 'idle', 'waiting', 'scraping', 'completed' 34 | if 'results' not in st.session_state: 35 | st.session_state['results'] = None 36 | if 'driver' not in st.session_state: 37 | st.session_state['driver'] = None 38 | 39 | # Sidebar components 40 | st.sidebar.title("Web Scraper Settings") 41 | 42 | # API Keys 43 | with st.sidebar.expander("API Keys", expanded=False): 44 | st.session_state['openai_api_key'] = st.text_input("OpenAI API Key", type="password") 45 | st.session_state['gemini_api_key'] = st.text_input("Gemini API Key", type="password") 46 | st.session_state['groq_api_key'] = st.text_input("Groq API Key", type="password") 47 | 48 | # Model selection 49 | model_selection = st.sidebar.selectbox("Select Model", options=list(PRICING.keys()), index=0) 50 | 51 | # URL input 52 | url_input = st.sidebar.text_input("Enter URL(s) separated by whitespace") 53 | # Process URLs 54 | urls = url_input.strip().split() 55 | num_urls = len(urls) 56 | # Fields to extract 57 | show_tags = st.sidebar.toggle("Enable 
Scraping") 58 | fields = [] 59 | if show_tags: 60 | fields = st_tags_sidebar( 61 | label='Enter Fields to Extract:', 62 | text='Press enter to add a field', 63 | value=[], 64 | suggestions=[], 65 | maxtags=-1, 66 | key='fields_input' 67 | ) 68 | 69 | st.sidebar.markdown("---") 70 | 71 | # Conditionally display Pagination and Attended Mode options 72 | if num_urls <= 1: 73 | # Pagination settings 74 | use_pagination = st.sidebar.toggle("Enable Pagination") 75 | pagination_details = "" 76 | if use_pagination: 77 | pagination_details = st.sidebar.text_input( 78 | "Enter Pagination Details (optional)", 79 | help="Describe how to navigate through pages (e.g., 'Next' button class, URL pattern)" 80 | ) 81 | 82 | st.sidebar.markdown("---") 83 | 84 | # Attended mode toggle 85 | attended_mode = st.sidebar.toggle("Enable Attended Mode") 86 | else: 87 | # Multiple URLs entered; disable Pagination and Attended Mode 88 | use_pagination = False 89 | attended_mode = False 90 | pagination_details = "" 91 | st.sidebar.info("Pagination and Attended Mode are disabled when multiple URLs are entered.") 92 | 93 | st.sidebar.markdown("---") 94 | 95 | 96 | 97 | # Main action button 98 | if st.sidebar.button("LAUNCH SCRAPER", type="primary"): 99 | if url_input.strip() == "": 100 | st.error("Please enter at least one URL.") 101 | elif show_tags and len(fields) == 0: 102 | st.error("Please enter at least one field to extract.") 103 | else: 104 | # Set up scraping parameters in session state 105 | st.session_state['urls'] = url_input.strip().split() 106 | st.session_state['fields'] = fields 107 | st.session_state['model_selection'] = model_selection 108 | st.session_state['attended_mode'] = attended_mode 109 | st.session_state['use_pagination'] = use_pagination 110 | st.session_state['pagination_details'] = pagination_details 111 | st.session_state['scraping_state'] = 'waiting' if attended_mode else 'scraping' 112 | 113 | # Scraping logic 114 | if st.session_state['scraping_state'] == 'waiting': 115 | # Attended mode: set up driver and wait for user interaction 116 | if st.session_state['driver'] is None: 117 | st.session_state['driver'] = setup_selenium(attended_mode=True) 118 | st.session_state['driver'].get(st.session_state['urls'][0]) 119 | st.write("Perform any required actions in the browser window that opened.") 120 | st.write("Navigate to the page you want to scrape.") 121 | st.write("When ready, click the 'Resume Scraping' button.") 122 | else: 123 | st.write("Browser window is already open. 
Perform your actions and click 'Resume Scraping'.") 124 | 125 | if st.button("Resume Scraping"): 126 | st.session_state['scraping_state'] = 'scraping' 127 | st.rerun() 128 | 129 | elif st.session_state['scraping_state'] == 'scraping': 130 | with st.spinner('Scraping in progress...'): 131 | # Perform scraping 132 | output_folder = os.path.join('output', generate_unique_folder_name(st.session_state['urls'][0])) 133 | os.makedirs(output_folder, exist_ok=True) 134 | 135 | total_input_tokens = 0 136 | total_output_tokens = 0 137 | total_cost = 0 138 | all_data = [] 139 | pagination_info = None 140 | 141 | driver = st.session_state.get('driver', None) 142 | if st.session_state['attended_mode'] and driver is not None: 143 | # Attended mode: scrape the current page without navigating 144 | # Fetch HTML from the current page 145 | raw_html = fetch_html_selenium(st.session_state['urls'][0], attended_mode=True, driver=driver) 146 | markdown = html_to_markdown_with_readability(raw_html) 147 | save_raw_data(markdown, output_folder, f'rawData_1.md') 148 | 149 | current_url = driver.current_url # Use the current URL for logging and saving purposes 150 | 151 | # Detect pagination if enabled 152 | if st.session_state['use_pagination']: 153 | pagination_data, token_counts, pagination_price = detect_pagination_elements( 154 | current_url, st.session_state['pagination_details'], st.session_state['model_selection'], markdown 155 | ) 156 | # Check if pagination_data is a dict or a model with 'page_urls' attribute 157 | if isinstance(pagination_data, dict): 158 | page_urls = pagination_data.get("page_urls", []) 159 | else: 160 | page_urls = pagination_data.page_urls 161 | 162 | pagination_info = { 163 | "page_urls": page_urls, 164 | "token_counts": token_counts, 165 | "price": pagination_price 166 | } 167 | # Scrape data if fields are specified 168 | if show_tags: 169 | # Create dynamic models 170 | DynamicListingModel = create_dynamic_listing_model(st.session_state['fields']) 171 | DynamicListingsContainer = create_listings_container_model(DynamicListingModel) 172 | # Format data 173 | formatted_data, token_counts = format_data( 174 | markdown, DynamicListingsContainer, DynamicListingModel, st.session_state['model_selection'] 175 | ) 176 | input_tokens, output_tokens, cost = calculate_price(token_counts, st.session_state['model_selection']) 177 | total_input_tokens += input_tokens 178 | total_output_tokens += output_tokens 179 | total_cost += cost 180 | # Save formatted data 181 | df = save_formatted_data(formatted_data, output_folder, f'sorted_data_1.json', f'sorted_data_1.xlsx') 182 | all_data.append(formatted_data) 183 | else: 184 | # Non-attended mode or driver not available 185 | for i, url in enumerate(st.session_state['urls'], start=1): 186 | # Fetch HTML 187 | raw_html = fetch_html_selenium(url, attended_mode=False) 188 | markdown = html_to_markdown_with_readability(raw_html) 189 | save_raw_data(markdown, output_folder, f'rawData_{i}.md') 190 | 191 | # Detect pagination if enabled and only for the first URL 192 | if st.session_state['use_pagination'] and i == 1: 193 | pagination_data, token_counts, pagination_price = detect_pagination_elements( 194 | url, st.session_state['pagination_details'], st.session_state['model_selection'], markdown 195 | ) 196 | # Check if pagination_data is a dict or a model with 'page_urls' attribute 197 | if isinstance(pagination_data, dict): 198 | page_urls = pagination_data.get("page_urls", []) 199 | else: 200 | page_urls = pagination_data.page_urls 201 | 202 | 
pagination_info = { 203 | "page_urls": page_urls, 204 | "token_counts": token_counts, 205 | "price": pagination_price 206 | } 207 | # Scrape data if fields are specified 208 | if show_tags: 209 | # Create dynamic models 210 | DynamicListingModel = create_dynamic_listing_model(st.session_state['fields']) 211 | DynamicListingsContainer = create_listings_container_model(DynamicListingModel) 212 | # Format data 213 | formatted_data, token_counts = format_data( 214 | markdown, DynamicListingsContainer, DynamicListingModel, st.session_state['model_selection'] 215 | ) 216 | input_tokens, output_tokens, cost = calculate_price(token_counts, st.session_state['model_selection']) 217 | total_input_tokens += input_tokens 218 | total_output_tokens += output_tokens 219 | total_cost += cost 220 | # Save formatted data 221 | df = save_formatted_data(formatted_data, output_folder, f'sorted_data_{i}.json', f'sorted_data_{i}.xlsx') 222 | all_data.append(formatted_data) 223 | 224 | # Clean up driver if used 225 | if driver: 226 | driver.quit() 227 | st.session_state['driver'] = None 228 | 229 | # Save results 230 | st.session_state['results'] = { 231 | 'data': all_data, 232 | 'input_tokens': total_input_tokens, 233 | 'output_tokens': total_output_tokens, 234 | 'total_cost': total_cost, 235 | 'output_folder': output_folder, 236 | 'pagination_info': pagination_info 237 | } 238 | st.session_state['scraping_state'] = 'completed' 239 | # Display results 240 | if st.session_state['scraping_state'] == 'completed' and st.session_state['results']: 241 | results = st.session_state['results'] 242 | all_data = results['data'] 243 | total_input_tokens = results['input_tokens'] 244 | total_output_tokens = results['output_tokens'] 245 | total_cost = results['total_cost'] 246 | output_folder = results['output_folder'] 247 | pagination_info = results['pagination_info'] 248 | 249 | # Display scraping details 250 | if show_tags: 251 | st.subheader("Scraping Results") 252 | for i, data in enumerate(all_data, start=1): 253 | st.write(f"Data from URL {i}:") 254 | 255 | # Handle string data (convert to dict if it's JSON) 256 | if isinstance(data, str): 257 | try: 258 | data = json.loads(data) 259 | except json.JSONDecodeError: 260 | st.error(f"Failed to parse data as JSON for URL {i}") 261 | continue 262 | 263 | if isinstance(data, dict): 264 | if 'listings' in data and isinstance(data['listings'], list): 265 | df = pd.DataFrame(data['listings']) 266 | else: 267 | # If 'listings' is not in the dict or not a list, use the entire dict 268 | df = pd.DataFrame([data]) 269 | elif hasattr(data, 'listings') and isinstance(data.listings, list): 270 | # Handle the case where data is a Pydantic model 271 | listings = [item.dict() for item in data.listings] 272 | df = pd.DataFrame(listings) 273 | else: 274 | st.error(f"Unexpected data format for URL {i}") 275 | continue 276 | # Display the dataframe 277 | st.dataframe(df, use_container_width=True) 278 | 279 | # Display token usage and cost 280 | st.sidebar.markdown("---") 281 | st.sidebar.markdown("### Scraping Details") 282 | st.sidebar.markdown("#### Token Usage") 283 | st.sidebar.markdown(f"*Input Tokens:* {total_input_tokens}") 284 | st.sidebar.markdown(f"*Output Tokens:* {total_output_tokens}") 285 | st.sidebar.markdown(f"**Total Cost:** :green-background[**${total_cost:.4f}**]") 286 | 287 | # Download options 288 | st.subheader("Download Extracted Data") 289 | col1, col2 = st.columns(2) 290 | with col1: 291 | json_data = json.dumps(all_data, default=lambda o: o.dict() if hasattr(o, 
'dict') else str(o), indent=4) 292 | st.download_button( 293 | "Download JSON", 294 | data=json_data, 295 | file_name="scraped_data.json" 296 | ) 297 | with col2: 298 | # Convert all data to a single DataFrame 299 | all_listings = [] 300 | for data in all_data: 301 | if isinstance(data, str): 302 | try: 303 | data = json.loads(data) 304 | except json.JSONDecodeError: 305 | continue 306 | if isinstance(data, dict) and 'listings' in data: 307 | all_listings.extend(data['listings']) 308 | elif hasattr(data, 'listings'): 309 | all_listings.extend([item.dict() for item in data.listings]) 310 | else: 311 | all_listings.append(data) 312 | 313 | combined_df = pd.DataFrame(all_listings) 314 | st.download_button( 315 | "Download CSV", 316 | data=combined_df.to_csv(index=False), 317 | file_name="scraped_data.csv" 318 | ) 319 | 320 | st.success(f"Scraping completed. Results saved in {output_folder}") 321 | 322 | # Display pagination info 323 | if pagination_info: 324 | st.markdown("---") 325 | st.subheader("Pagination Information") 326 | 327 | # Display token usage and cost using metrics 328 | st.sidebar.markdown("---") 329 | st.sidebar.markdown("### Pagination Details") 330 | st.sidebar.markdown(f"**Number of Page URLs:** {len(pagination_info['page_urls'])}") 331 | st.sidebar.markdown("#### Pagination Token Usage") 332 | st.sidebar.markdown(f"*Input Tokens:* {pagination_info['token_counts']['input_tokens']}") 333 | st.sidebar.markdown(f"*Output Tokens:* {pagination_info['token_counts']['output_tokens']}") 334 | st.sidebar.markdown(f"**Pagination Cost:** :blue-background[**${pagination_info['price']:.4f}**]") 335 | 336 | 337 | # Display page URLs in a table 338 | st.write("**Page URLs:**") 339 | # Make URLs clickable 340 | pagination_df = pd.DataFrame(pagination_info["page_urls"], columns=["Page URLs"]) 341 | 342 | st.dataframe( 343 | pagination_df, 344 | column_config={ 345 | "Page URLs": st.column_config.LinkColumn("Page URLs") 346 | },use_container_width=True 347 | ) 348 | 349 | # Download pagination URLs 350 | st.subheader("Download Pagination URLs") 351 | col1, col2 = st.columns(2) 352 | with col1: 353 | st.download_button("Download Pagination CSV",data=pagination_df.to_csv(index=False),file_name="pagination_urls.csv") 354 | with col2: 355 | st.download_button("Download Pagination JSON",data=json.dumps(pagination_info['page_urls'], indent=4),file_name="pagination_urls.json") 356 | # Reset scraping state 357 | if st.sidebar.button("Clear Results"): 358 | st.session_state['scraping_state'] = 'idle' 359 | st.session_state['results'] = None 360 | 361 | # If both scraping and pagination were performed, show totals under the pagination table 362 | if show_tags and pagination_info: 363 | st.markdown("---") 364 | total_input_tokens_combined = total_input_tokens + pagination_info['token_counts']['input_tokens'] 365 | total_output_tokens_combined = total_output_tokens + pagination_info['token_counts']['output_tokens'] 366 | total_combined_cost = total_cost + pagination_info['price'] 367 | st.markdown("### Total Counts and Cost (Including Pagination)") 368 | st.markdown(f"**Total Input Tokens:** {total_input_tokens_combined}") 369 | st.markdown(f"**Total Output Tokens:** {total_output_tokens_combined}") 370 | st.markdown(f"**Total Combined Cost:** :rainbow-background[**${total_combined_cost:.4f}**]") 371 | # Helper function to generate unique folder names 372 | def generate_unique_folder_name(url): 373 | timestamp = datetime.now().strftime('%Y_%m_%d__%H_%M_%S') 374 | 375 | # Parse the URL 376 | parsed_url = 
urlparse(url) 377 | 378 | # Extract the domain name 379 | domain = parsed_url.netloc or parsed_url.path.split('/')[0] 380 | 381 | # Remove 'www.' if present 382 | domain = re.sub(r'^www\.', '', domain) 383 | 384 | # Remove any non-alphanumeric characters and replace with underscores 385 | clean_domain = re.sub(r'\W+', '_', domain) 386 | 387 | return f"{clean_domain}_{timestamp}" 388 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reda-marzouk608/scrape-master/177fe5e19fe7b615c6e9303c341653b8f866b11c/test.py --------------------------------------------------------------------------------