├── requirements.txt
├── .devcontainer
    └── devcontainer.json
├── README.md
└── groqcrawl.py


/requirements.txt:
--------------------------------------------------------------------------------
 1 | streamlit==1.32.0
 2 | pocketgroq==0.4.8 
 3 | bs4>=0.0.2
 4 | groq>=0.8.0
 5 | python-dotenv>=0.19.1
 6 | requests>=2.32.3
 7 | langchain>=0.3.1
 8 | langchain-groq>=0.2.0
 9 | langchain-community>=0.3.1
10 | markdown2>=2.5.0
11 | faiss-cpu>=1.8.0.post1
12 | ollama>=0.3.3
13 | pytest>=7.3.1
14 | pytest-asyncio>=0.21.0
15 | html2text>=2024.2.26


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Python 3",
 3 |   // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
 4 |   "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
 5 |   "customizations": {
 6 |     "codespaces": {
 7 |       "openFiles": [
 8 |         "README.md",
 9 |         "groqcrawl.py"
10 |       ]
11 |     },
12 |     "vscode": {
13 |       "settings": {},
14 |       "extensions": [
15 |         "ms-python.python",
16 |         "ms-python.vscode-pylance"
17 |       ]
18 |     }
19 |   },
20 |   "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
21 |   "postAttachCommand": {
22 |     "server": "streamlit run groqcrawl.py --server.enableCORS false --server.enableXsrfProtection false"
23 |   },
24 |   "portsAttributes": {
25 |     "8501": {
26 |       "label": "Application",
27 |       "onAutoForward": "openPreview"
28 |     }
29 |   },
30 |   "forwardPorts": [
31 |     8501
32 |   ]
33 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # GroqCrawl: Advanced Web Crawling and Scraping with Streamlit and PocketGroq
  2 | 
  3 | ![image](https://github.com/user-attachments/assets/2bf60247-6b93-47c9-aaf4-98b57a241082)
  4 | 
  5 | 
  6 | ## Table of Contents
  7 | 1. [Introduction](#introduction)
  8 | 2. [Features](#features)
  9 | 3. [Installation](#installation)
 10 | 4. [Usage](#usage)
 11 | 5. [Advanced Options](#advanced-options)
 12 | 6. [Output Formats](#output-formats)
 13 | 7. [Examples](#examples)
 14 | 8. [Troubleshooting](#troubleshooting)
 15 | 9. [Contributing](#contributing)
 16 | 10. [License](#license)
 17 | 
 18 | ## Introduction
 19 | 
 20 | GroqCrawl is a powerful and user-friendly web crawling and scraping application built with Streamlit and powered by PocketGroq. It provides an intuitive interface for extracting LLM friendly AI consumable content from websites, with support for single-page scraping, multi-page crawling, and site mapping.
 21 | 
 22 | Whether you're a data scientist, researcher, or web developer, GroqCrawl offers a seamless experience for gathering web data in various formats, including Markdown, HTML, and structured data.
 23 | 
 24 | ## Features
 25 | 
 26 | - **Single URL Scraping**: Extract content from individual web pages.
 27 | - **Website Crawling**: Traverse multiple pages of a website, respecting depth and page limits.
 28 | - **Site Mapping**: Generate a list of all accessible URLs within a website.
 29 | - **Multiple Output Formats**: Choose from Markdown, HTML, and structured data representations.
 30 | - **Advanced Crawling Options**: Customize your crawl with exclude/include paths, depth limits, and more.
 31 | - **Interactive Results Display**: View scraped content directly in the Streamlit interface.
 32 | - **Download Options**: Save your results as JSON files for further processing.
 33 | 
 34 | ## Installation
 35 | 
 36 | 1. Ensure you have Python 3.7 or later installed on your system.
 37 | 
 38 | 2. Clone the GroqCrawl repository:
 39 |    ```
 40 |    git clone https://github.com/yourusername/groqcrawl.git
 41 |    cd groqcrawl
 42 |    ```
 43 | 
 44 | 3. Install the required dependencies:
 45 |    ```
 46 |    pip install -r requirements.txt
 47 |    ```
 48 | 
 49 | 4. Set up your PocketGroq API key:
 50 |    - Create a `.env` file in the project root directory.
 51 |    - Add your API key to the file: `GROQ_API_KEY=your_api_key_here`
 52 | 
 53 | ## Usage
 54 | 
 55 | To run GroqCrawl:
 56 | 
 57 | 1. Navigate to the project directory:
 58 |    ```
 59 |    cd path/to/groqcrawl
 60 |    ```
 61 | 
 62 | 2. Launch the Streamlit app:
 63 |    ```
 64 |    streamlit run groqcrawl.py
 65 |    ```
 66 | 
 67 | 3. Open your web browser and go to the URL displayed in the terminal (usually `http://localhost:8501`).
 68 | 
 69 | 4. Use the interface to select your scraping type, enter a URL, and configure options.
 70 | 
 71 | 5. Click "Run" to start the scraping/crawling process.
 72 | 
 73 | ## Advanced Options
 74 | 
 75 | - **Max Depth**: Set the maximum depth for crawling (Crawl mode only).
 76 | - **Max Pages**: Limit the total number of pages to crawl (Crawl mode only).
 77 | - **Exclude Paths**: Specify URL patterns to exclude from crawling.
 78 | - **Include Only Paths**: Limit crawling to specific URL patterns.
 79 | - **Ignore Sitemap**: Skip using the sitemap.xml for crawling.
 80 | - **Allow Backwards Links**: Enable crawling of links that point to previously visited pages.
 81 | 
 82 | ## Output Formats
 83 | 
 84 | 1. **Markdown**: 
 85 |    - Human-readable text format.
 86 |    - Ideal for content analysis and easy viewing.
 87 | 
 88 | 2. **HTML**: 
 89 |    - Raw HTML content of the page.
 90 |    - Useful for detailed structure analysis or further processing.
 91 | 
 92 | 3. **Structured Data**: 
 93 |    - JSON format containing:
 94 |      - Full text content
 95 |      - Headings (h1 to h6)
 96 |      - Links (text and href)
 97 |      - Images (src and alt attributes)
 98 |      - JSON-LD data (if available)
 99 | 
100 | ## Examples
101 | 
102 | ### Single URL Scraping
103 | 
104 | 1. Select "Single URL (/scrape)" from the radio buttons.
105 | 2. Enter a URL, e.g., `https://example.com`.
106 | 3. Choose desired output formats.
107 | 4. Click "Run".
108 | 
109 | ### Website Crawling
110 | 
111 | 1. Select "Crawl (/crawl)" from the radio buttons.
112 | 2. Enter the starting URL, e.g., `https://example.com`.
113 | 3. Set Max Depth and Max Pages in the Options section.
114 | 4. Choose desired output formats.
115 | 5. Click "Run".
116 | 
117 | ### Site Mapping
118 | 
119 | 1. Select "Map (/map)" from the radio buttons.
120 | 2. Enter the website URL, e.g., `https://example.com`.
121 | 3. Click "Run".
122 | 
123 | ## Troubleshooting
124 | 
125 | - **API Key Issues**: Ensure your PocketGroq API key is correctly set in the `.env` file.
126 | - **Connection Errors**: Check your internet connection and verify the URL is accessible.
127 | - **Slow Performance**: For large websites, try reducing Max Depth or Max Pages.
128 | - **Missing Content**: Some websites may block scraping. Check the site's robots.txt file and consider respecting their scraping policies.
129 | 
130 | ## Contributing
131 | 
132 | We welcome contributions to GroqCrawl! Please follow these steps:
133 | 
134 | 1. Fork the repository.
135 | 2. Create a new branch for your feature or bug fix.
136 | 3. Make your changes and commit them with clear, descriptive messages.
137 | 4. Push your changes to your fork.
138 | 5. Submit a pull request with a detailed description of your changes.
139 | 
140 | ## License
141 | 
142 | GroqCrawl is released under the MIT License. See the [LICENSE](LICENSE) file for details.
143 | 
144 | ---
145 | 
146 | For more information or support, please open an issue on the [GitHub repository](https://github.com/yourusername/groqcrawl/issues).
147 | 
148 | Happy crawling!
149 | 


--------------------------------------------------------------------------------
/groqcrawl.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import importlib    
  3 | import json
  4 | import sys
  5 | from pocketgroq import GroqProvider
  6 | from typing import List, Dict, Any
  7 | 
  8 | # Initialize GroqProvider
  9 | groq = GroqProvider()
 10 | 
 11 | def check_import(module_name):
 12 |     try:
 13 |         importlib.import_module(module_name)
 14 |         return None
 15 |     except ImportError as e:
 16 |         return str(e)
 17 | 
 18 | # Check for required modules
 19 | required_modules = ['pocketgroq', 'groq', 'langchain_groq', 'langchain', 'langchain_community']
 20 | import_errors = {module: check_import(module) for module in required_modules}
 21 | 
 22 | if any(import_errors.values()):
 23 |     st.error("Some required modules could not be imported:")
 24 |     for module, error in import_errors.items():
 25 |         if error:
 26 |             st.error(f"{module}: {error}")
 27 |     st.error("Please check your installation and requirements.txt file.")
 28 |     st.write("Python version:", sys.version)
 29 |     st.write("Python path:", sys.executable)
 30 |     st.write("sys.path:", sys.path)
 31 | else:
 32 |     # If all modules are present, import GroqProvider
 33 |     from pocketgroq import GroqProvider
 34 | 
 35 |     # Initialize GroqProvider
 36 |     try:
 37 |         groq = GroqProvider()
 38 |         st.success("GroqProvider initialized successfully.")
 39 |     except Exception as e:
 40 |         st.error(f"Error initializing GroqProvider: {str(e)}")
 41 | 
 42 | def scrape_url(url: str, formats: List[str] = ["markdown", "html"]) -> Dict[str, Any]:
 43 |     """
 44 |     Scrape a single URL using PocketGroq's enhanced_web_tool.
 45 |     """
 46 |     try:
 47 |         result = groq.enhanced_web_tool.scrape_page(url, formats)
 48 |         return result
 49 |     except Exception as e:
 50 |         return {"error": str(e)}
 51 | 
 52 | def crawl_website(url: str, max_depth: int, max_pages: int, formats: List[str] = ["markdown", "html"]) -> List[Dict[str, Any]]:
 53 |     """
 54 |     Crawl a website using PocketGroq's enhanced_web_tool.
 55 |     """
 56 |     try:
 57 |         groq.enhanced_web_tool.max_depth = max_depth
 58 |         groq.enhanced_web_tool.max_pages = max_pages
 59 |         results = groq.enhanced_web_tool.crawl(url, formats)
 60 |         return results
 61 |     except Exception as e:
 62 |         return [{"error": str(e)}]
 63 | 
 64 | def map_website(url: str) -> List[str]:
 65 |     """
 66 |     Map a website using PocketGroq's web_search method.
 67 |     """
 68 |     try:
 69 |         results = groq.web_search(f"site:{url}")
 70 |         return [result['url'] for result in results]
 71 |     except Exception as e:
 72 |         return [f"Error: {str(e)}"]
 73 | 
 74 | # Streamlit UI
 75 | st.title("GroqCrawl Interface")
 76 | 
 77 | # Select scraping type
 78 | scraping_type = st.radio("Select Scraping Type:", ["Single URL (/scrape)", "Crawl (/crawl)", "Map (/map)"])
 79 | 
 80 | # Input URL
 81 | url = st.text_input("URL:")
 82 | 
 83 | # Advanced Options
 84 | with st.expander("Options"):
 85 |     if scraping_type == "Crawl (/crawl)":
 86 |         max_depth = st.number_input("Max depth", min_value=1, value=3)
 87 |         max_pages = st.number_input("Max pages", min_value=1, value=10)
 88 |     
 89 |     formats = st.multiselect("Output formats", ["markdown", "html", "structured_data"], default=["markdown", "html"])
 90 |     
 91 |     exclude_paths = st.text_input("Exclude Paths (comma separated):", "blog/, /about/")
 92 |     include_paths = st.text_input("Include Only Paths (comma separated):", "articles/")
 93 |     ignore_sitemap = st.checkbox("Ignore sitemap")
 94 |     allow_backwards_links = st.checkbox("Allow backwards links")
 95 | 
 96 | # Run button
 97 | if st.button("Run"):
 98 |     if url:
 99 |         if scraping_type == "Single URL (/scrape)":
100 |             result = scrape_url(url, formats)
101 |             st.subheader("Scraped Result:")
102 |             if "error" in result:
103 |                 st.error(result["error"])
104 |             else:
105 |                 if "markdown" in formats:
106 |                     st.markdown("### Markdown Content (Raw)")
107 |                     st.text_area("Raw Markdown", result.get("markdown", ""), height=300)
108 |                 if "html" in formats:
109 |                     st.markdown("### HTML Content")
110 |                     st.code(result.get("html", ""), language="html")
111 |                 if "structured_data" in formats:
112 |                     st.markdown("### Structured Data")
113 |                     st.json(result.get("structured_data", {}))
114 |                 
115 |                 # Option to download result as JSON
116 |                 json_result = json.dumps(result, indent=4)
117 |                 st.download_button("Download JSON", json_result, "scraped_result.json", "application/json")
118 |         
119 |         elif scraping_type == "Crawl (/crawl)":
120 |             results = crawl_website(url, max_depth, max_pages, formats)
121 |             st.subheader("Crawl Results:")
122 |             for i, result in enumerate(results, 1):
123 |                 st.write(f"Page {i}:")
124 |                 if "error" in result:
125 |                     st.error(result["error"])
126 |                 else:
127 |                     st.write(f"URL: {result['url']}")
128 |                     if "markdown" in formats:
129 |                         st.markdown("#### Markdown Content (Raw)")
130 |                         st.text_area(f"Raw Markdown (Page {i})", result.get("markdown", ""), height=200)
131 |                     if "html" in formats:
132 |                         st.markdown("#### HTML Content")
133 |                         st.code(result.get("html", ""), language="html")
134 |                     if "structured_data" in formats:
135 |                         st.markdown("#### Structured Data")
136 |                         st.json(result.get("structured_data", {}))
137 |                 st.markdown("---")
138 |             
139 |             # Option to download results as JSON
140 |             json_result = json.dumps(results, indent=4)
141 |             st.download_button("Download JSON", json_result, "crawl_results.json", "application/json")
142 |         
143 |         elif scraping_type == "Crawl (/crawl)":
144 |             results = crawl_website(url, max_depth, max_pages, formats)
145 |             st.subheader("Crawl Results:")
146 |             for i, result in enumerate(results, 1):
147 |                 st.write(f"Page {i}:")
148 |                 if "error" in result:
149 |                     st.error(result["error"])
150 |                 else:
151 |                     st.write(f"URL: {result['url']}")
152 |                     if "markdown" in formats:
153 |                         st.markdown("#### Markdown Content")
154 |                         st.markdown(result.get("markdown", ""))
155 |                     if "html" in formats:
156 |                         st.markdown("#### HTML Content")
157 |                         st.code(result.get("html", ""), language="html")
158 |                     if "structured_data" in formats:
159 |                         st.markdown("#### Structured Data")
160 |                         st.json(result.get("structured_data", {}))
161 |                 st.markdown("---")
162 |             
163 |             # Option to download results as JSON
164 |             json_result = json.dumps(results, indent=4)
165 |             st.download_button("Download JSON", json_result, "crawl_results.json", "application/json")
166 |         
167 |         elif scraping_type == "Map (/map)":
168 |             results = map_website(url)
169 |             st.subheader("Site Map:")
170 |             for link in results:
171 |                 st.write(link)
172 |             
173 |             # Option to download results as JSON
174 |             json_result = json.dumps(results, indent=4)
175 |             st.download_button("Download JSON", json_result, "site_map.json", "application/json")
176 |     
177 |     else:
178 |         st.error("Please enter a URL to process.")
179 | 
180 | # Display appropriate warnings
181 | if scraping_type == "Crawl (/crawl)":
182 |     st.warning("Crawling may take some time depending on the site size and depth.")
183 | elif scraping_type == "Map (/map)":
184 |     st.warning("Mapping uses a basic implementation. Results may not be comprehensive.")


--------------------------------------------------------------------------------