├── Dockerfile ├── README.md ├── api_management.py ├── assets.py ├── chromedriver-win64 ├── LICENSE.chromedriver ├── THIRD_PARTY_NOTICES.chromedriver └── chromedriver.exe ├── pagination_detector.py ├── requirements.txt ├── scraper.py ├── streamlit_app.py └── test.py /Dockerfile: -------------------------------------------------------------------------------- 1 | # Base image with Python 3.11.4 2 | FROM python:3.11.4-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /app 6 | 7 | # Copy the requirements file and install dependencies 8 | COPY requirements.txt ./ 9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | # Install Chrome Browser and additional dependencies 12 | RUN apt-get update && apt-get install -y \ 13 | wget \ 14 | gnupg \ 15 | unzip \ 16 | libappindicator3-1 \ 17 | libasound2 \ 18 | libatk-bridge2.0-0 \ 19 | libatk1.0-0 \ 20 | libcups2 \ 21 | libdrm2 \ 22 | libgbm1 \ 23 | libnspr4 \ 24 | libnss3 \ 25 | libxcomposite1 \ 26 | libxdamage1 \ 27 | libxrandr2 \ 28 | fonts-liberation \ 29 | xdg-utils \ 30 | --no-install-recommends && \ 31 | wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \ 32 | sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \ 33 | apt-get update && apt-get install -y google-chrome-stable && \ 34 | rm -rf /var/lib/apt/lists/* 35 | 36 | # Install WebDriver Manager to manage ChromeDriver 37 | RUN pip install webdriver-manager 38 | 39 | # Set environment variables for Chrome in headless mode 40 | ENV CHROME_BIN=/usr/bin/google-chrome 41 | ENV CHROME_DRIVER=/usr/local/bin/chromedriver 42 | 43 | # Copy the entire project into the container 44 | COPY . . 45 | 46 | # Expose the port that Streamlit will use 47 | EXPOSE 8501 48 | 49 | # Clean up any unnecessary packages after installation 50 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* 51 | 52 | # Ensure all packages are updated to their latest versions 53 | RUN apt-get update && apt-get dist-upgrade -y 54 | 55 | 56 | # Command to run the Streamlit app 57 | CMD ["streamlit", "run", "streamlit_app.py"] 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ScrapeMaster 2 | 3 | ScrapeMaster is a Streamlit-based web scraping application designed to simplify the process of extracting data from web pages. It allows users to specify URLs and data fields interactively, facilitating the extraction and manipulation of web data. 4 | 5 | ## Features 6 | 7 | - Easy-to-use web interface. 8 | - Custom field specification for data extraction. 9 | - Pagination 10 | - Dynamic data processing with Python and Streamlit. 11 | - Direct download capabilities for extracted data in various formats. 
12 | - Attended mode: open a real browser window, log in or navigate manually, then resume scraping on the current page. 13 | 14 | ## Prerequisites 15 | 16 | Before you begin, ensure you have the following installed: 17 | - Python 3.9 or higher (the project's Dockerfile uses Python 3.11) 18 | - Pip for managing Python packages 19 | 20 | ## Installation 21 | 22 | Follow these steps to get your development environment running: 23 | 24 | ```bash 25 | # Clone the repository 26 | git clone https://github.com/reda-marzouk608/scrape-master 27 | cd scrape-master 28 | 29 | # It's recommended to create a virtual environment 30 | python -m venv venv 31 | # Activate the virtual environment 32 | # On Windows 33 | venv\Scripts\activate 34 | # On macOS/Linux 35 | source venv/bin/activate 36 | 37 | # Install the required packages 38 | pip install -r requirements.txt 39 | ``` 40 | 41 | ## Launching the Application 42 | 43 | To run ScrapeMaster, navigate to the project directory and run the following command: 44 | 45 | ```bash 46 | streamlit run streamlit_app.py 47 | ``` 48 | 49 | 50 | ## Usage 51 | After launching the application, open your web browser to the indicated address (typically http://localhost:8501). Use the sidebar to enter the URL(s) and the fields you wish to extract, then click the "LAUNCH SCRAPER" button to see the results. -------------------------------------------------------------------------------- /api_management.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import os 3 | 4 | def get_api_key(api_key_name): 5 | # Use the API key from the sidebar if present, else fall back to the .env file; .get() avoids a KeyError when the key is not in the session state 6 | if api_key_name == 'OPENAI_API_KEY': 7 | return st.session_state.get('openai_api_key') or os.getenv(api_key_name) 8 | elif api_key_name == 'GOOGLE_API_KEY': 9 | return st.session_state.get('gemini_api_key') or os.getenv(api_key_name) 10 | elif api_key_name == 'GROQ_API_KEY': 11 | return st.session_state.get('groq_api_key') or os.getenv(api_key_name) 12 | else: 13 | return os.getenv(api_key_name) 14 | -------------------------------------------------------------------------------- /assets.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains configuration variables and constants 3 | that are used across different parts of the application.
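It includes the rotating user-agent list, per-model token pricing, Selenium/Chrome option sets, timeout settings, and the LLM prompt templates (system, user, and pagination prompts).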
4 | """ 5 | 6 | # List of user agents to mimic different users 7 | USER_AGENTS = [ 8 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", 9 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0", 11 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36", 12 | "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", 13 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36", 14 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36", 15 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15", 16 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36", 17 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", 18 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0", 19 | "Mozilla/5.0 (X11; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0", 20 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36", 21 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36", 22 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36", 23 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36", 24 | "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0", 25 | "Mozilla/5.0 (iPhone; CPU iPhone OS 13_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Mobile/15E148 Safari/604.1", 26 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15", 27 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", 28 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", 29 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0", 30 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36", 31 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15", 32 | "Mozilla/5.0 (iPhone; CPU iPhone OS 13_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Mobile/15E148 Safari/604.1", 33 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0", 34 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36", 35 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", 36 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15", 37 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) 
Gecko/20100101 Firefox/87.0", 38 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36", 39 | "Mozilla/5.0 (iPhone; CPU iPhone OS 14_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", 40 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", 41 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0", 42 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", 43 | "Mozilla/5.0 (iPhone; CPU iPhone OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", 44 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36", 45 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36", 46 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0", 47 | "Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", 48 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36", 49 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36", 50 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0", 51 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15", 52 | "Mozilla/5.0 (X11; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0", 53 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36", 54 | "Mozilla/5.0 (iPhone; CPU iPhone OS 13_7 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Mobile/15E148 Safari/604.1", 55 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36", 56 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15", 57 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36", 58 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", 59 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36", 60 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36", 61 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36" 62 | ] 63 | 64 | 65 | # Define the pricing for models without Batch API 66 | PRICING = { 67 | "gpt-4o-mini": { 68 | "input": 0.150 / 1_000_000, # $0.150 per 1M input tokens 69 | "output": 0.600 / 1_000_000, # $0.600 per 1M output tokens 70 | }, 71 | "gpt-4o-2024-08-06": { 72 | "input": 2.5 / 1_000_000, # $2.5 per 1M input tokens 73 | "output": 10 / 1_000_000, # $10 per 1M output tokens 74 | }, 75 | "gemini-1.5-flash": { 76 | "input": 0.075 / 1_000_000, # $0.075 per 1M input tokens 77 | "output": 0.30 / 1_000_000, # $0.30 per 1M output tokens 78 | }, 79 | "Llama3.1 8B": { 80 | "input": 0 , # Free 81 | "output": 
0 , # Free 82 | }, 83 | "Groq Llama3.1 70b": { 84 | "input": 0 , # Free 85 | "output": 0 , # Free 86 | }, 87 | # Add other models and their prices here if needed 88 | } 89 | 90 | # Timeout settings for web scraping 91 | TIMEOUT_SETTINGS = { 92 | "page_load": 30, 93 | "script": 10 94 | } 95 | 96 | # Other reusable constants or configuration settings 97 | HEADLESS_OPTIONS = ["--disable-gpu", "--disable-dev-shm-usage","--window-size=1920,1080","--disable-search-engine-choice-screen","--disable-blink-features=AutomationControlled"] 98 | 99 | 100 | HEADLESS_OPTIONS_DOCKER = ["--headless=new","--no-sandbox","--disable-gpu", "--disable-dev-shm-usage","--disable-software-rasterizer","--disable-setuid-sandbox","--remote-debugging-port=9222","--disable-search-engine-choice-screen"] 101 | # In case you don't need to open the website window, enable headless mode: 102 | ##HEADLESS_OPTIONS=HEADLESS_OPTIONS+[ "--headless=new"] 103 | 104 | #number of scrolls 105 | NUMBER_SCROLL=2 106 | 107 | 108 | LLAMA_MODEL_FULLNAME="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF" 109 | GROQ_LLAMA_MODEL_FULLNAME="llama-3.1-70b-versatile" 110 | 111 | SYSTEM_MESSAGE = """You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 112 | from the given text and convert it into a pure JSON format. The JSON should contain only the structured data extracted from the text, 113 | with no additional commentary, explanations, or extraneous information. 114 | You may encounter cases where you can't find the data for the fields you have to extract, or the data may be in a foreign language. 115 | Please process the following text and provide the output in pure JSON format with no words before or after the JSON:""" 116 | 117 | USER_MESSAGE = "Extract the following information from the provided text:\nPage content:\n\n" 118 | 119 | 120 | 121 | 122 | 123 | PROMPT_PAGINATION = """ 124 | You are an assistant that extracts pagination elements from the markdown content of websites. Your goal is to act as a universal pagination scraper, able to extract pagination URLs from any website no matter how different they are. 125 | 126 | Please extract the following: 127 | 128 | - The URL of the 'Next', 'More', 'See more', 'Load more' or any other button indicating how to access the next page, if any. It should be a single URL and no more; if there are multiple URLs with the same structure, leave this empty. 129 | 130 | - A list of page URLs for pagination. It should be a pattern of similar URLs with pages that are numbered; if you detect this pattern and the numbers start from a certain low number up to a large number, generate the rest of the URLs even if they're not included, 131 | your goal here is to give as many URLs as possible for the user to choose from for further scraping. You will have to deal with very different websites that can potentially contain many URLs of images and other elements; 132 | detect only the URLs that clearly define a pattern for showing data on multiple pages. Sometimes only a part of these URLs is present and you have to combine it with the initial URL, which will be provided at the end of this prompt. 133 | 134 | - The user can give you indications on how the pagination works for the specific website at the end of this prompt. If those indications are not empty, pay special attention to them, as they will directly help you understand the structure and the number of pages to generate.
135 | 136 | Provide the output as a JSON object with the following structure: 137 | 138 | { 139 | "page_urls": ["url1", "url2", "url3",...,"urlN"] 140 | } 141 | 142 | Do not include any additional text or explanations. 143 | """ 144 | -------------------------------------------------------------------------------- /chromedriver-win64/LICENSE.chromedriver: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Chromium Authors 2 | // 3 | // Redistribution and use in source and binary forms, with or without 4 | // modification, are permitted provided that the following conditions are 5 | // met: 6 | // 7 | // * Redistributions of source code must retain the above copyright 8 | // notice, this list of conditions and the following disclaimer. 9 | // * Redistributions in binary form must reproduce the above 10 | // copyright notice, this list of conditions and the following disclaimer 11 | // in the documentation and/or other materials provided with the 12 | // distribution. 13 | // * Neither the name of Google LLC nor the names of its 14 | // contributors may be used to endorse or promote products derived from 15 | // this software without specific prior written permission. 16 | // 17 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /chromedriver-win64/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reda-marzouk608/scrape-master/177fe5e19fe7b615c6e9303c341653b8f866b11c/chromedriver-win64/chromedriver.exe -------------------------------------------------------------------------------- /pagination_detector.py: -------------------------------------------------------------------------------- 1 | # pagination_detector.py 2 | 3 | import os 4 | import json 5 | from typing import List, Dict, Tuple, Union 6 | from pydantic import BaseModel, Field, ValidationError 7 | 8 | import tiktoken 9 | from dotenv import load_dotenv 10 | 11 | from openai import OpenAI 12 | import google.generativeai as genai 13 | from groq import Groq 14 | 15 | 16 | from api_management import get_api_key 17 | from assets import PROMPT_PAGINATION, PRICING, LLAMA_MODEL_FULLNAME, GROQ_LLAMA_MODEL_FULLNAME 18 | 19 | load_dotenv() 20 | import logging 21 | 22 | class PaginationData(BaseModel): 23 | page_urls: List[str] = Field(default_factory=list, description="List of pagination URLs, including 'Next' button URL if present") 24 | 25 | def calculate_pagination_price(token_counts: Dict[str, int], model: str) -> float: 26 | """ 27 | Calculate the price for pagination based on token counts and the selected model. 
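The total is computed as input_tokens * input_rate + output_tokens * output_rate, using the per-token rates defined in the PRICING table in assets.py.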
28 | 29 | Args: 30 | token_counts (Dict[str, int]): A dictionary containing 'input_tokens' and 'output_tokens'. 31 | model (str): The name of the selected model. 32 | 33 | Returns: 34 | float: The total price for the pagination operation. 35 | """ 36 | input_tokens = token_counts['input_tokens'] 37 | output_tokens = token_counts['output_tokens'] 38 | 39 | input_price = input_tokens * PRICING[model]['input'] 40 | output_price = output_tokens * PRICING[model]['output'] 41 | 42 | return input_price + output_price 43 | 44 | def detect_pagination_elements(url: str, indications: str, selected_model: str, markdown_content: str) -> Tuple[Union[PaginationData, Dict, str], Dict, float]: 45 | """ 46 | Uses AI models to analyze markdown content and extract pagination elements. 47 | 48 | Args: 49 | selected_model (str): The name of the model to use. 50 | markdown_content (str): The markdown content to analyze. 51 | 52 | Returns: 53 | Tuple[PaginationData, Dict, float]: Parsed pagination data, token counts, and pagination price. 54 | """ 55 | try: 56 | prompt_pagination = PROMPT_PAGINATION + "\nThe URL of the page to extract pagination from is: " + url + ". If the URLs you find are not complete, combine them intelligently with this base URL so that they fit the pattern. **ALWAYS GIVE A FULL URL**" 57 | if indications != "": 58 | prompt_pagination += "\n\nThese are the user's indications; pay special attention to them: " + indications + "\n\nBelow is the markdown of the website: \n\n" 59 | else: 60 | prompt_pagination += "\nThere are no user indications in this case; just apply the logic described. \n\nBelow is the markdown of the website: \n\n" 61 | 62 | if selected_model in ["gpt-4o-mini", "gpt-4o-2024-08-06"]: 63 | # Use OpenAI API 64 | client = OpenAI(api_key=get_api_key('OPENAI_API_KEY')) 65 | completion = client.beta.chat.completions.parse( 66 | model=selected_model, 67 | messages=[ 68 | {"role": "system", "content": prompt_pagination}, 69 | {"role": "user", "content": markdown_content}, 70 | ], 71 | response_format=PaginationData 72 | ) 73 | 74 | # Extract the parsed response 75 | parsed_response = completion.choices[0].message.parsed 76 | 77 | # Calculate tokens using tiktoken 78 | encoder = tiktoken.encoding_for_model(selected_model) 79 | input_token_count = len(encoder.encode(markdown_content)) 80 | output_token_count = len(encoder.encode(json.dumps(parsed_response.dict()))) 81 | token_counts = { 82 | "input_tokens": input_token_count, 83 | "output_tokens": output_token_count 84 | } 85 | 86 | # Calculate the price 87 | pagination_price = calculate_pagination_price(token_counts, selected_model) 88 | 89 | return parsed_response, token_counts, pagination_price 90 | 91 | elif selected_model == "gemini-1.5-flash": 92 | # Use Google Gemini API 93 | genai.configure(api_key=get_api_key("GOOGLE_API_KEY")) 94 | model = genai.GenerativeModel( 95 | 'gemini-1.5-flash', 96 | generation_config={ 97 | "response_mime_type": "application/json", 98 | "response_schema": PaginationData 99 | } 100 | ) 101 | prompt = f"{prompt_pagination}\n{markdown_content}" 102 | # Count input tokens using Gemini's method 103 | input_tokens = model.count_tokens(prompt) 104 | completion = model.generate_content(prompt) 105 | # Extract token counts from usage_metadata 106 | usage_metadata = completion.usage_metadata 107 | token_counts = { 108 | "input_tokens": usage_metadata.prompt_token_count, 109 | "output_tokens": usage_metadata.candidates_token_count 110 | } 111 | # Get the result 112 | response_content = completion.text 113 | 114 | # Log the response content and its type 115 | logging.info(f"Gemini Flash response type: {type(response_content)}") 116 | logging.info(f"Gemini Flash response content: {response_content}") 117 | 118 | # Try to parse the response as JSON 119 | try: 120 | parsed_data = json.loads(response_content) 121 | if isinstance(parsed_data, dict) and 'page_urls' in parsed_data: 122 | pagination_data = PaginationData(**parsed_data) 123 | else: 124 | pagination_data = PaginationData(page_urls=[]) 125 | except json.JSONDecodeError: 126 | logging.error("Failed to parse Gemini Flash response as JSON") 127 | pagination_data = PaginationData(page_urls=[]) 128 | 129 | # Calculate the price 130 | pagination_price = calculate_pagination_price(token_counts, selected_model) 131 | 132 | return pagination_data, token_counts, pagination_price 133 | 134 | elif selected_model == "Llama3.1 8B": 135 | # Use Llama model via the OpenAI-compatible API of a local server 136 | # (the modern OpenAI client is used here, mirroring the approach in scraper.py) 137 | client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio") 138 | response = client.chat.completions.create( 139 | model=LLAMA_MODEL_FULLNAME, 140 | messages=[ 141 | {"role": "system", "content": prompt_pagination}, 142 | {"role": "user", "content": markdown_content}, 143 | ], 144 | temperature=0.7, 145 | ) 146 | response_content = response.choices[0].message.content.strip() 147 | # Try to parse the JSON 148 | try: 149 | pagination_data = json.loads(response_content) 150 | except json.JSONDecodeError: 151 | pagination_data = {"page_urls": []} 152 | # Token counts 153 | token_counts = { 154 | "input_tokens": response.usage.prompt_tokens, 155 | "output_tokens": response.usage.completion_tokens 156 | } 157 | # Calculate the price 158 | pagination_price = calculate_pagination_price(token_counts, selected_model) 159 | 160 | return pagination_data, token_counts, pagination_price 161 | 162 | elif selected_model == "Groq Llama3.1 70b": 163 | # Use Groq client 164 | client = Groq(api_key=get_api_key("GROQ_API_KEY")) 165 | response = client.chat.completions.create( 166 | model=GROQ_LLAMA_MODEL_FULLNAME, 167 | messages=[ 168 | {"role": "system", "content": prompt_pagination}, 169 | {"role": "user", "content": markdown_content}, 170 | ], 171 | ) 172 | response_content = response.choices[0].message.content.strip() 173 | # Try to parse the JSON 174 | try: 175 | pagination_data = json.loads(response_content) 176 | except json.JSONDecodeError: 177 | pagination_data = {"page_urls": []} 178 | # Token counts 179 | token_counts = { 180 | "input_tokens": response.usage.prompt_tokens, 181 | "output_tokens": response.usage.completion_tokens 182 | } 183 | # Calculate the price 184 | pagination_price = calculate_pagination_price(token_counts, selected_model) 185 | 186 | '''# Ensure the pagination_data is a dictionary 187 | if isinstance(pagination_data, PaginationData): 188 | pagination_data = pagination_data.model_dump() 189 | elif not isinstance(pagination_data, dict): 190 | pagination_data = {"page_urls": []}''' 191 | 192 | return pagination_data, token_counts, pagination_price 193 | 194 | else: 195 | raise ValueError(f"Unsupported model: {selected_model}") 196 | 197 | except Exception as e: 198 | logging.error(f"An error occurred in detect_pagination_elements: {e}") 199 | # Return default values if an error occurs 200 | return PaginationData(page_urls=[]), {"input_tokens": 0, "output_tokens": 0}, 0.0 201 | 202 | 203 | if __name__ == "__main__": 204 | 205 | url="""https://scrapeme.live/shop/"""
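# NOTE: the markdown file path below is machine-specific; point it at any rawData_*.md file produced by a previous run before executing this standalone test.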
206 | # Define the path to your markdown file 207 | markdown_file_path = r"C:\Users\redam\Documents\VSCode\ScrapeMaster2.0\output\scrapeme_live_2024_09_24__00_33_20\rawData_1.md" 208 | 209 | # Read the markdown content from the file 210 | with open(markdown_file_path, 'r', encoding='utf-8') as f: 211 | markdown_content = f.read() 212 | 213 | # Specify the model you want to use 214 | selected_model = 'gemini-1.5-flash' # Replace with your desired model 215 | 216 | # Call the detect_pagination_elements function 217 | pagination_data, token_counts, pagination_price = detect_pagination_elements(url,"",selected_model, markdown_content) 218 | 219 | print("Page URLs:", pagination_data.page_urls) 220 | print("Pagination Price:", pagination_price) 221 | 222 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | python-dotenv 3 | pandas 4 | pydantic 5 | requests 6 | beautifulsoup4 7 | html2text 8 | tiktoken 9 | selenium 10 | readability-lxml 11 | streamlit 12 | streamlit-tags 13 | openpyxl 14 | groq 15 | google-generativeai 16 | webdriver-manager 17 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import time 4 | import re 5 | import json 6 | from datetime import datetime 7 | from typing import List, Dict, Type 8 | 9 | import pandas as pd 10 | from bs4 import BeautifulSoup 11 | from pydantic import BaseModel, Field, create_model 12 | import html2text 13 | import tiktoken 14 | import streamlit as st 15 | 16 | from dotenv import load_dotenv 17 | from selenium import webdriver 18 | from selenium.webdriver.chrome.service import Service 19 | from selenium.webdriver.chrome.options import Options 20 | from selenium.webdriver.common.by import By 21 | from selenium.webdriver.common.action_chains import ActionChains 22 | from selenium.webdriver.support.ui import WebDriverWait 23 | from selenium.webdriver.support import expected_conditions as EC 24 | from webdriver_manager.chrome import ChromeDriverManager 25 | 26 | 27 | from openai import OpenAI 28 | import google.generativeai as genai 29 | from groq import Groq 30 | 31 | from api_management import get_api_key 32 | from assets import USER_AGENTS,PRICING,HEADLESS_OPTIONS,SYSTEM_MESSAGE,USER_MESSAGE,LLAMA_MODEL_FULLNAME,GROQ_LLAMA_MODEL_FULLNAME,HEADLESS_OPTIONS_DOCKER 33 | load_dotenv() 34 | 35 | 36 | # Set up the Chrome WebDriver options 37 | 38 | 39 | def is_running_in_docker(): 40 | """ 41 | Detect if the app is running inside a Docker container. 42 | This checks if the '/proc/1/cgroup' file contains 'docker'. 
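Note: on hosts that use cgroup v2, /proc/1/cgroup may not mention 'docker' even inside a container, in which case the non-Docker options are applied.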
43 | """ 44 | try: 45 | with open("/proc/1/cgroup", "rt") as file: 46 | return "docker" in file.read() 47 | except Exception: 48 | return False 49 | 50 | def setup_selenium(attended_mode=False): 51 | options = Options() 52 | service = Service(ChromeDriverManager().install()) 53 | 54 | # Apply headless options based on whether the code is running in Docker 55 | if is_running_in_docker(): 56 | # Running inside Docker, use Docker-specific headless options 57 | for option in HEADLESS_OPTIONS_DOCKER: 58 | options.add_argument(option) 59 | else: 60 | # Not running inside Docker, use the normal headless options 61 | for option in HEADLESS_OPTIONS: 62 | options.add_argument(option) 63 | 64 | # Initialize the WebDriver 65 | driver = webdriver.Chrome(service=service, options=options) 66 | return driver 67 | 68 | 69 | 70 | 71 | def fetch_html_selenium(url, attended_mode=False, driver=None): 72 | if driver is None: 73 | driver = setup_selenium(attended_mode) 74 | should_quit = True 75 | if not attended_mode: 76 | driver.get(url) 77 | else: 78 | should_quit = False 79 | # Do not navigate to the URL if in attended mode and driver is already initialized 80 | if not attended_mode: 81 | driver.get(url) 82 | 83 | try: 84 | if not attended_mode: 85 | # Add more realistic actions like scrolling 86 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);") 87 | time.sleep(random.uniform(1.1, 1.8)) 88 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight/1.2);") 89 | time.sleep(random.uniform(1.1, 1.8)) 90 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight/1);") 91 | time.sleep(random.uniform(1.1, 1.8)) 92 | # Get the page source from the current page 93 | html = driver.page_source 94 | return html 95 | finally: 96 | if should_quit: 97 | driver.quit() 98 | 99 | 100 | 101 | 102 | def clean_html(html_content): 103 | soup = BeautifulSoup(html_content, 'html.parser') 104 | 105 | # Remove headers and footers based on common HTML tags or classes 106 | for element in soup.find_all(['header', 'footer']): 107 | element.decompose() # Remove these tags and their content 108 | 109 | return str(soup) 110 | 111 | 112 | def html_to_markdown_with_readability(html_content): 113 | 114 | 115 | cleaned_html = clean_html(html_content) 116 | 117 | # Convert to markdown 118 | markdown_converter = html2text.HTML2Text() 119 | markdown_converter.ignore_links = False 120 | markdown_content = markdown_converter.handle(cleaned_html) 121 | 122 | return markdown_content 123 | 124 | 125 | 126 | def save_raw_data(raw_data: str, output_folder: str, file_name: str): 127 | """Save raw markdown data to the specified output folder.""" 128 | os.makedirs(output_folder, exist_ok=True) 129 | raw_output_path = os.path.join(output_folder, file_name) 130 | with open(raw_output_path, 'w', encoding='utf-8') as f: 131 | f.write(raw_data) 132 | print(f"Raw data saved to {raw_output_path}") 133 | return raw_output_path 134 | 135 | 136 | def create_dynamic_listing_model(field_names: List[str]) -> Type[BaseModel]: 137 | """ 138 | Dynamically creates a Pydantic model based on provided fields. 139 | field_name is a list of names of the fields to extract from the markdown. 140 | """ 141 | # Create field definitions using aliases for Field parameters 142 | field_definitions = {field: (str, ...) 
for field in field_names} 143 | # Dynamically create the model with all field 144 | return create_model('DynamicListingModel', **field_definitions) 145 | 146 | 147 | def create_listings_container_model(listing_model: Type[BaseModel]) -> Type[BaseModel]: 148 | """ 149 | Create a container model that holds a list of the given listing model. 150 | """ 151 | return create_model('DynamicListingsContainer', listings=(List[listing_model], ...)) 152 | 153 | 154 | 155 | 156 | def trim_to_token_limit(text, model, max_tokens=120000): 157 | encoder = tiktoken.encoding_for_model(model) 158 | tokens = encoder.encode(text) 159 | if len(tokens) > max_tokens: 160 | trimmed_text = encoder.decode(tokens[:max_tokens]) 161 | return trimmed_text 162 | return text 163 | 164 | def generate_system_message(listing_model: BaseModel) -> str: 165 | """ 166 | Dynamically generate a system message based on the fields in the provided listing model. 167 | """ 168 | # Use the model_json_schema() method to introspect the Pydantic model 169 | schema_info = listing_model.model_json_schema() 170 | 171 | # Extract field descriptions from the schema 172 | field_descriptions = [] 173 | for field_name, field_info in schema_info["properties"].items(): 174 | # Get the field type from the schema info 175 | field_type = field_info["type"] 176 | field_descriptions.append(f'"{field_name}": "{field_type}"') 177 | 178 | # Create the JSON schema structure for the listings 179 | schema_structure = ",\n".join(field_descriptions) 180 | 181 | # Generate the system message dynamically 182 | system_message = f""" 183 | You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 184 | from the given text and convert it into a pure JSON format. The JSON should contain only the structured data extracted from the text, 185 | with no additional commentary, explanations, or extraneous information. 186 | You could encounter cases where you can't find the data of the fields you have to extract or the data will be in a foreign language. 
187 | Please process the following text and provide the output in pure JSON format with no words before or after the JSON: 188 | Please ensure the output strictly follows this schema: 189 | 190 | {{ 191 | "listings": [ 192 | {{ 193 | {schema_structure} 194 | }} 195 | ] 196 | }} """ 197 | 198 | return system_message 199 | 200 | 201 | 202 | def format_data(data, DynamicListingsContainer, DynamicListingModel, selected_model): 203 | token_counts = {} 204 | 205 | if selected_model in ["gpt-4o-mini", "gpt-4o-2024-08-06"]: 206 | # Use OpenAI API 207 | client = OpenAI(api_key=get_api_key('OPENAI_API_KEY')) 208 | completion = client.beta.chat.completions.parse( 209 | model=selected_model, 210 | messages=[ 211 | {"role": "system", "content": SYSTEM_MESSAGE}, 212 | {"role": "user", "content": USER_MESSAGE + data}, 213 | ], 214 | response_format=DynamicListingsContainer 215 | ) 216 | # Calculate tokens using tiktoken 217 | encoder = tiktoken.encoding_for_model(selected_model) 218 | input_token_count = len(encoder.encode(USER_MESSAGE + data)) 219 | output_token_count = len(encoder.encode(json.dumps(completion.choices[0].message.parsed.dict()))) 220 | token_counts = { 221 | "input_tokens": input_token_count, 222 | "output_tokens": output_token_count 223 | } 224 | return completion.choices[0].message.parsed, token_counts 225 | 226 | elif selected_model == "gemini-1.5-flash": 227 | # Use Google Gemini API 228 | genai.configure(api_key=get_api_key("GOOGLE_API_KEY")) 229 | model = genai.GenerativeModel('gemini-1.5-flash', 230 | generation_config={ 231 | "response_mime_type": "application/json", 232 | "response_schema": DynamicListingsContainer 233 | }) 234 | prompt = SYSTEM_MESSAGE + "\n" + USER_MESSAGE + data 235 | # Count input tokens using Gemini's method 236 | input_tokens = model.count_tokens(prompt) 237 | completion = model.generate_content(prompt) 238 | # Extract token counts from usage_metadata 239 | usage_metadata = completion.usage_metadata 240 | token_counts = { 241 | "input_tokens": usage_metadata.prompt_token_count, 242 | "output_tokens": usage_metadata.candidates_token_count 243 | } 244 | return completion.text, token_counts 245 | 246 | elif selected_model == "Llama3.1 8B": 247 | 248 | # Dynamically generate the system message based on the schema 249 | sys_message = generate_system_message(DynamicListingModel) 250 | # print(SYSTEM_MESSAGE) 251 | # Point to the local server 252 | client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio") 253 | 254 | completion = client.chat.completions.create( 255 | model=LLAMA_MODEL_FULLNAME, #change this if needed (use a better model) 256 | messages=[ 257 | {"role": "system", "content": sys_message}, 258 | {"role": "user", "content": USER_MESSAGE + data} 259 | ], 260 | temperature=0.7, 261 | 262 | ) 263 | 264 | # Extract the content from the response 265 | response_content = completion.choices[0].message.content 266 | print(response_content) 267 | # Convert the content from JSON string to a Python dictionary 268 | parsed_response = json.loads(response_content) 269 | 270 | # Extract token usage 271 | token_counts = { 272 | "input_tokens": completion.usage.prompt_tokens, 273 | "output_tokens": completion.usage.completion_tokens 274 | } 275 | 276 | return parsed_response, token_counts 277 | elif selected_model== "Groq Llama3.1 70b": 278 | 279 | # Dynamically generate the system message based on the schema 280 | sys_message = generate_system_message(DynamicListingModel) 281 | # print(SYSTEM_MESSAGE) 282 | # Point to the local server 283 | client = 
Groq(api_key=get_api_key("GROQ_API_KEY"),) 284 | 285 | completion = client.chat.completions.create( 286 | messages=[ 287 | {"role": "system","content": sys_message}, 288 | {"role": "user","content": USER_MESSAGE + data} 289 | ], 290 | model=GROQ_LLAMA_MODEL_FULLNAME, 291 | ) 292 | 293 | # Extract the content from the response 294 | response_content = completion.choices[0].message.content 295 | 296 | # Convert the content from JSON string to a Python dictionary 297 | parsed_response = json.loads(response_content) 298 | 299 | # completion.usage 300 | token_counts = { 301 | "input_tokens": completion.usage.prompt_tokens, 302 | "output_tokens": completion.usage.completion_tokens 303 | } 304 | 305 | return parsed_response, token_counts 306 | else: 307 | raise ValueError(f"Unsupported model: {selected_model}") 308 | 309 | 310 | 311 | def save_formatted_data(formatted_data, output_folder: str, json_file_name: str, excel_file_name: str): 312 | """Save formatted data as JSON and Excel in the specified output folder.""" 313 | os.makedirs(output_folder, exist_ok=True) 314 | 315 | # Parse the formatted data if it's a JSON string (from Gemini API) 316 | if isinstance(formatted_data, str): 317 | try: 318 | formatted_data_dict = json.loads(formatted_data) 319 | except json.JSONDecodeError: 320 | raise ValueError("The provided formatted data is a string but not valid JSON.") 321 | else: 322 | # Handle data from OpenAI or other sources 323 | formatted_data_dict = formatted_data.dict() if hasattr(formatted_data, 'dict') else formatted_data 324 | 325 | # Save the formatted data as JSON 326 | json_output_path = os.path.join(output_folder, json_file_name) 327 | with open(json_output_path, 'w', encoding='utf-8') as f: 328 | json.dump(formatted_data_dict, f, indent=4) 329 | print(f"Formatted data saved to JSON at {json_output_path}") 330 | 331 | # Prepare data for DataFrame 332 | if isinstance(formatted_data_dict, dict): 333 | # If the data is a dictionary containing lists, assume these lists are records 334 | data_for_df = next(iter(formatted_data_dict.values())) if len(formatted_data_dict) == 1 else formatted_data_dict 335 | elif isinstance(formatted_data_dict, list): 336 | data_for_df = formatted_data_dict 337 | else: 338 | raise ValueError("Formatted data is neither a dictionary nor a list, cannot convert to DataFrame") 339 | 340 | # Create DataFrame 341 | try: 342 | df = pd.DataFrame(data_for_df) 343 | print("DataFrame created successfully.") 344 | 345 | # Save the DataFrame to an Excel file 346 | excel_output_path = os.path.join(output_folder, excel_file_name) 347 | df.to_excel(excel_output_path, index=False) 348 | print(f"Formatted data saved to Excel at {excel_output_path}") 349 | 350 | return df 351 | except Exception as e: 352 | print(f"Error creating DataFrame or saving Excel: {str(e)}") 353 | return None 354 | 355 | def calculate_price(token_counts, model): 356 | input_token_count = token_counts.get("input_tokens", 0) 357 | output_token_count = token_counts.get("output_tokens", 0) 358 | 359 | # Calculate the costs 360 | input_cost = input_token_count * PRICING[model]["input"] 361 | output_cost = output_token_count * PRICING[model]["output"] 362 | total_cost = input_cost + output_cost 363 | 364 | return input_token_count, output_token_count, total_cost 365 | 366 | 367 | def generate_unique_folder_name(url): 368 | timestamp = datetime.now().strftime('%Y_%m_%d__%H_%M_%S') 369 | url_name = re.sub(r'\W+', '_', url.split('//')[1].split('/')[0]) # Extract domain name and replace non-alphanumeric characters 
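# NOTE: this assumes the URL contains '//' (e.g. 'https://...'); a bare domain without a scheme would raise an IndexError here.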
370 | return f"{url_name}_{timestamp}" 371 | 372 | 373 | def scrape_url(url: str, fields: List[str], selected_model: str, output_folder: str, file_number: int, markdown: str): 374 | """Scrape a single URL and save the results.""" 375 | try: 376 | # Save raw data 377 | save_raw_data(markdown, output_folder, f'rawData_{file_number}.md') 378 | 379 | # Create the dynamic listing model 380 | DynamicListingModel = create_dynamic_listing_model(fields) 381 | 382 | # Create the container model that holds a list of the dynamic listing models 383 | DynamicListingsContainer = create_listings_container_model(DynamicListingModel) 384 | 385 | # Format data 386 | formatted_data, token_counts = format_data(markdown, DynamicListingsContainer, DynamicListingModel, selected_model) 387 | 388 | # Save formatted data 389 | save_formatted_data(formatted_data, output_folder, f'sorted_data_{file_number}.json', f'sorted_data_{file_number}.xlsx') 390 | 391 | # Calculate and return token usage and cost 392 | input_tokens, output_tokens, total_cost = calculate_price(token_counts, selected_model) 393 | return input_tokens, output_tokens, total_cost, formatted_data 394 | 395 | except Exception as e: 396 | print(f"An error occurred while processing {url}: {e}") 397 | return 0, 0, 0, None 398 | 399 | # Remove the main execution block if it's not needed for testing purposes 400 | -------------------------------------------------------------------------------- /streamlit_app.py: -------------------------------------------------------------------------------- 1 | # streamlit_app.py 2 | 3 | import streamlit as st 4 | from streamlit_tags import st_tags_sidebar 5 | import pandas as pd 6 | import json 7 | from datetime import datetime 8 | from scraper import ( 9 | fetch_html_selenium, 10 | save_raw_data, 11 | format_data, 12 | save_formatted_data, 13 | calculate_price, 14 | html_to_markdown_with_readability, 15 | create_dynamic_listing_model, 16 | create_listings_container_model, 17 | scrape_url, 18 | setup_selenium, 19 | generate_unique_folder_name 20 | ) 21 | from pagination_detector import detect_pagination_elements 22 | import re 23 | from urllib.parse import urlparse 24 | from assets import PRICING 25 | import os 26 | 27 | # Initialize Streamlit app 28 | st.set_page_config(page_title="Universal Web Scraper", page_icon="🦑") 29 | st.title("Universal Web Scraper 🦑") 30 | 31 | # Initialize session state variables 32 | if 'scraping_state' not in st.session_state: 33 | st.session_state['scraping_state'] = 'idle' # Possible states: 'idle', 'waiting', 'scraping', 'completed' 34 | if 'results' not in st.session_state: 35 | st.session_state['results'] = None 36 | if 'driver' not in st.session_state: 37 | st.session_state['driver'] = None 38 | 39 | # Sidebar components 40 | st.sidebar.title("Web Scraper Settings") 41 | 42 | # API Keys 43 | with st.sidebar.expander("API Keys", expanded=False): 44 | st.session_state['openai_api_key'] = st.text_input("OpenAI API Key", type="password") 45 | st.session_state['gemini_api_key'] = st.text_input("Gemini API Key", type="password") 46 | st.session_state['groq_api_key'] = st.text_input("Groq API Key", type="password") 47 | 48 | # Model selection 49 | model_selection = st.sidebar.selectbox("Select Model", options=list(PRICING.keys()), index=0) 50 | 51 | # URL input 52 | url_input = st.sidebar.text_input("Enter URL(s) separated by whitespace") 53 | # Process URLs 54 | urls = url_input.strip().split() 55 | num_urls = len(urls) 56 | # Fields to extract 57 | show_tags = st.sidebar.toggle("Enable 
Scraping") 58 | fields = [] 59 | if show_tags: 60 | fields = st_tags_sidebar( 61 | label='Enter Fields to Extract:', 62 | text='Press enter to add a field', 63 | value=[], 64 | suggestions=[], 65 | maxtags=-1, 66 | key='fields_input' 67 | ) 68 | 69 | st.sidebar.markdown("---") 70 | 71 | # Conditionally display Pagination and Attended Mode options 72 | if num_urls <= 1: 73 | # Pagination settings 74 | use_pagination = st.sidebar.toggle("Enable Pagination") 75 | pagination_details = "" 76 | if use_pagination: 77 | pagination_details = st.sidebar.text_input( 78 | "Enter Pagination Details (optional)", 79 | help="Describe how to navigate through pages (e.g., 'Next' button class, URL pattern)" 80 | ) 81 | 82 | st.sidebar.markdown("---") 83 | 84 | # Attended mode toggle 85 | attended_mode = st.sidebar.toggle("Enable Attended Mode") 86 | else: 87 | # Multiple URLs entered; disable Pagination and Attended Mode 88 | use_pagination = False 89 | attended_mode = False 90 | pagination_details = "" 91 | st.sidebar.info("Pagination and Attended Mode are disabled when multiple URLs are entered.") 92 | 93 | st.sidebar.markdown("---") 94 | 95 | 96 | 97 | # Main action button 98 | if st.sidebar.button("LAUNCH SCRAPER", type="primary"): 99 | if url_input.strip() == "": 100 | st.error("Please enter at least one URL.") 101 | elif show_tags and len(fields) == 0: 102 | st.error("Please enter at least one field to extract.") 103 | else: 104 | # Set up scraping parameters in session state 105 | st.session_state['urls'] = url_input.strip().split() 106 | st.session_state['fields'] = fields 107 | st.session_state['model_selection'] = model_selection 108 | st.session_state['attended_mode'] = attended_mode 109 | st.session_state['use_pagination'] = use_pagination 110 | st.session_state['pagination_details'] = pagination_details 111 | st.session_state['scraping_state'] = 'waiting' if attended_mode else 'scraping' 112 | 113 | # Scraping logic 114 | if st.session_state['scraping_state'] == 'waiting': 115 | # Attended mode: set up driver and wait for user interaction 116 | if st.session_state['driver'] is None: 117 | st.session_state['driver'] = setup_selenium(attended_mode=True) 118 | st.session_state['driver'].get(st.session_state['urls'][0]) 119 | st.write("Perform any required actions in the browser window that opened.") 120 | st.write("Navigate to the page you want to scrape.") 121 | st.write("When ready, click the 'Resume Scraping' button.") 122 | else: 123 | st.write("Browser window is already open. 
Perform your actions and click 'Resume Scraping'.") 124 | 125 | if st.button("Resume Scraping"): 126 | st.session_state['scraping_state'] = 'scraping' 127 | st.rerun() 128 | 129 | elif st.session_state['scraping_state'] == 'scraping': 130 | with st.spinner('Scraping in progress...'): 131 | # Perform scraping 132 | output_folder = os.path.join('output', generate_unique_folder_name(st.session_state['urls'][0])) 133 | os.makedirs(output_folder, exist_ok=True) 134 | 135 | total_input_tokens = 0 136 | total_output_tokens = 0 137 | total_cost = 0 138 | all_data = [] 139 | pagination_info = None 140 | 141 | driver = st.session_state.get('driver', None) 142 | if st.session_state['attended_mode'] and driver is not None: 143 | # Attended mode: scrape the current page without navigating 144 | # Fetch HTML from the current page 145 | raw_html = fetch_html_selenium(st.session_state['urls'][0], attended_mode=True, driver=driver) 146 | markdown = html_to_markdown_with_readability(raw_html) 147 | save_raw_data(markdown, output_folder, f'rawData_1.md') 148 | 149 | current_url = driver.current_url # Use the current URL for logging and saving purposes 150 | 151 | # Detect pagination if enabled 152 | if st.session_state['use_pagination']: 153 | pagination_data, token_counts, pagination_price = detect_pagination_elements( 154 | current_url, st.session_state['pagination_details'], st.session_state['model_selection'], markdown 155 | ) 156 | # Check if pagination_data is a dict or a model with 'page_urls' attribute 157 | if isinstance(pagination_data, dict): 158 | page_urls = pagination_data.get("page_urls", []) 159 | else: 160 | page_urls = pagination_data.page_urls 161 | 162 | pagination_info = { 163 | "page_urls": page_urls, 164 | "token_counts": token_counts, 165 | "price": pagination_price 166 | } 167 | # Scrape data if fields are specified 168 | if show_tags: 169 | # Create dynamic models 170 | DynamicListingModel = create_dynamic_listing_model(st.session_state['fields']) 171 | DynamicListingsContainer = create_listings_container_model(DynamicListingModel) 172 | # Format data 173 | formatted_data, token_counts = format_data( 174 | markdown, DynamicListingsContainer, DynamicListingModel, st.session_state['model_selection'] 175 | ) 176 | input_tokens, output_tokens, cost = calculate_price(token_counts, st.session_state['model_selection']) 177 | total_input_tokens += input_tokens 178 | total_output_tokens += output_tokens 179 | total_cost += cost 180 | # Save formatted data 181 | df = save_formatted_data(formatted_data, output_folder, f'sorted_data_1.json', f'sorted_data_1.xlsx') 182 | all_data.append(formatted_data) 183 | else: 184 | # Non-attended mode or driver not available 185 | for i, url in enumerate(st.session_state['urls'], start=1): 186 | # Fetch HTML 187 | raw_html = fetch_html_selenium(url, attended_mode=False) 188 | markdown = html_to_markdown_with_readability(raw_html) 189 | save_raw_data(markdown, output_folder, f'rawData_{i}.md') 190 | 191 | # Detect pagination if enabled and only for the first URL 192 | if st.session_state['use_pagination'] and i == 1: 193 | pagination_data, token_counts, pagination_price = detect_pagination_elements( 194 | url, st.session_state['pagination_details'], st.session_state['model_selection'], markdown 195 | ) 196 | # Check if pagination_data is a dict or a model with 'page_urls' attribute 197 | if isinstance(pagination_data, dict): 198 | page_urls = pagination_data.get("page_urls", []) 199 | else: 200 | page_urls = pagination_data.page_urls 201 | 202 | 
pagination_info = { 203 | "page_urls": page_urls, 204 | "token_counts": token_counts, 205 | "price": pagination_price 206 | } 207 | # Scrape data if fields are specified 208 | if show_tags: 209 | # Create dynamic models 210 | DynamicListingModel = create_dynamic_listing_model(st.session_state['fields']) 211 | DynamicListingsContainer = create_listings_container_model(DynamicListingModel) 212 | # Format data 213 | formatted_data, token_counts = format_data( 214 | markdown, DynamicListingsContainer, DynamicListingModel, st.session_state['model_selection'] 215 | ) 216 | input_tokens, output_tokens, cost = calculate_price(token_counts, st.session_state['model_selection']) 217 | total_input_tokens += input_tokens 218 | total_output_tokens += output_tokens 219 | total_cost += cost 220 | # Save formatted data 221 | df = save_formatted_data(formatted_data, output_folder, f'sorted_data_{i}.json', f'sorted_data_{i}.xlsx') 222 | all_data.append(formatted_data) 223 | 224 | # Clean up driver if used 225 | if driver: 226 | driver.quit() 227 | st.session_state['driver'] = None 228 | 229 | # Save results 230 | st.session_state['results'] = { 231 | 'data': all_data, 232 | 'input_tokens': total_input_tokens, 233 | 'output_tokens': total_output_tokens, 234 | 'total_cost': total_cost, 235 | 'output_folder': output_folder, 236 | 'pagination_info': pagination_info 237 | } 238 | st.session_state['scraping_state'] = 'completed' 239 | # Display results 240 | if st.session_state['scraping_state'] == 'completed' and st.session_state['results']: 241 | results = st.session_state['results'] 242 | all_data = results['data'] 243 | total_input_tokens = results['input_tokens'] 244 | total_output_tokens = results['output_tokens'] 245 | total_cost = results['total_cost'] 246 | output_folder = results['output_folder'] 247 | pagination_info = results['pagination_info'] 248 | 249 | # Display scraping details 250 | if show_tags: 251 | st.subheader("Scraping Results") 252 | for i, data in enumerate(all_data, start=1): 253 | st.write(f"Data from URL {i}:") 254 | 255 | # Handle string data (convert to dict if it's JSON) 256 | if isinstance(data, str): 257 | try: 258 | data = json.loads(data) 259 | except json.JSONDecodeError: 260 | st.error(f"Failed to parse data as JSON for URL {i}") 261 | continue 262 | 263 | if isinstance(data, dict): 264 | if 'listings' in data and isinstance(data['listings'], list): 265 | df = pd.DataFrame(data['listings']) 266 | else: 267 | # If 'listings' is not in the dict or not a list, use the entire dict 268 | df = pd.DataFrame([data]) 269 | elif hasattr(data, 'listings') and isinstance(data.listings, list): 270 | # Handle the case where data is a Pydantic model 271 | listings = [item.dict() for item in data.listings] 272 | df = pd.DataFrame(listings) 273 | else: 274 | st.error(f"Unexpected data format for URL {i}") 275 | continue 276 | # Display the dataframe 277 | st.dataframe(df, use_container_width=True) 278 | 279 | # Display token usage and cost 280 | st.sidebar.markdown("---") 281 | st.sidebar.markdown("### Scraping Details") 282 | st.sidebar.markdown("#### Token Usage") 283 | st.sidebar.markdown(f"*Input Tokens:* {total_input_tokens}") 284 | st.sidebar.markdown(f"*Output Tokens:* {total_output_tokens}") 285 | st.sidebar.markdown(f"**Total Cost:** :green-background[**${total_cost:.4f}**]") 286 | 287 | # Download options 288 | st.subheader("Download Extracted Data") 289 | col1, col2 = st.columns(2) 290 | with col1: 291 | json_data = json.dumps(all_data, default=lambda o: o.dict() if hasattr(o, 
'dict') else str(o), indent=4) 292 | st.download_button( 293 | "Download JSON", 294 | data=json_data, 295 | file_name="scraped_data.json" 296 | ) 297 | with col2: 298 | # Convert all data to a single DataFrame 299 | all_listings = [] 300 | for data in all_data: 301 | if isinstance(data, str): 302 | try: 303 | data = json.loads(data) 304 | except json.JSONDecodeError: 305 | continue 306 | if isinstance(data, dict) and 'listings' in data: 307 | all_listings.extend(data['listings']) 308 | elif hasattr(data, 'listings'): 309 | all_listings.extend([item.dict() for item in data.listings]) 310 | else: 311 | all_listings.append(data) 312 | 313 | combined_df = pd.DataFrame(all_listings) 314 | st.download_button( 315 | "Download CSV", 316 | data=combined_df.to_csv(index=False), 317 | file_name="scraped_data.csv" 318 | ) 319 | 320 | st.success(f"Scraping completed. Results saved in {output_folder}") 321 | 322 | # Display pagination info 323 | if pagination_info: 324 | st.markdown("---") 325 | st.subheader("Pagination Information") 326 | 327 | # Display token usage and cost using metrics 328 | st.sidebar.markdown("---") 329 | st.sidebar.markdown("### Pagination Details") 330 | st.sidebar.markdown(f"**Number of Page URLs:** {len(pagination_info['page_urls'])}") 331 | st.sidebar.markdown("#### Pagination Token Usage") 332 | st.sidebar.markdown(f"*Input Tokens:* {pagination_info['token_counts']['input_tokens']}") 333 | st.sidebar.markdown(f"*Output Tokens:* {pagination_info['token_counts']['output_tokens']}") 334 | st.sidebar.markdown(f"**Pagination Cost:** :blue-background[**${pagination_info['price']:.4f}**]") 335 | 336 | 337 | # Display page URLs in a table 338 | st.write("**Page URLs:**") 339 | # Make URLs clickable 340 | pagination_df = pd.DataFrame(pagination_info["page_urls"], columns=["Page URLs"]) 341 | 342 | st.dataframe( 343 | pagination_df, 344 | column_config={ 345 | "Page URLs": st.column_config.LinkColumn("Page URLs") 346 | },use_container_width=True 347 | ) 348 | 349 | # Download pagination URLs 350 | st.subheader("Download Pagination URLs") 351 | col1, col2 = st.columns(2) 352 | with col1: 353 | st.download_button("Download Pagination CSV",data=pagination_df.to_csv(index=False),file_name="pagination_urls.csv") 354 | with col2: 355 | st.download_button("Download Pagination JSON",data=json.dumps(pagination_info['page_urls'], indent=4),file_name="pagination_urls.json") 356 | # Reset scraping state 357 | if st.sidebar.button("Clear Results"): 358 | st.session_state['scraping_state'] = 'idle' 359 | st.session_state['results'] = None 360 | 361 | # If both scraping and pagination were performed, show totals under the pagination table 362 | if show_tags and pagination_info: 363 | st.markdown("---") 364 | total_input_tokens_combined = total_input_tokens + pagination_info['token_counts']['input_tokens'] 365 | total_output_tokens_combined = total_output_tokens + pagination_info['token_counts']['output_tokens'] 366 | total_combined_cost = total_cost + pagination_info['price'] 367 | st.markdown("### Total Counts and Cost (Including Pagination)") 368 | st.markdown(f"**Total Input Tokens:** {total_input_tokens_combined}") 369 | st.markdown(f"**Total Output Tokens:** {total_output_tokens_combined}") 370 | st.markdown(f"**Total Combined Cost:** :rainbow-background[**${total_combined_cost:.4f}**]") 371 | # Helper function to generate unique folder names 372 | def generate_unique_folder_name(url): 373 | timestamp = datetime.now().strftime('%Y_%m_%d__%H_%M_%S') 374 | 375 | # Parse the URL 376 | parsed_url = 
urlparse(url) 377 | 378 | # Extract the domain name 379 | domain = parsed_url.netloc or parsed_url.path.split('/')[0] 380 | 381 | # Remove 'www.' if present 382 | domain = re.sub(r'^www\.', '', domain) 383 | 384 | # Remove any non-alphanumeric characters and replace with underscores 385 | clean_domain = re.sub(r'\W+', '_', domain) 386 | 387 | return f"{clean_domain}_{timestamp}" 388 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reda-marzouk608/scrape-master/177fe5e19fe7b615c6e9303c341653b8f866b11c/test.py --------------------------------------------------------------------------------