├── requirements.txt ├── .devcontainer └── devcontainer.json ├── README.md └── groqcrawl.py /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.32.0 2 | pocketgroq==0.4.8 3 | bs4>=0.0.2 4 | groq>=0.8.0 5 | python-dotenv>=0.19.1 6 | requests>=2.32.3 7 | langchain>=0.3.1 8 | langchain-groq>=0.2.0 9 | langchain-community>=0.3.1 10 | markdown2>=2.5.0 11 | faiss-cpu>=1.8.0.post1 12 | ollama>=0.3.3 13 | pytest>=7.3.1 14 | pytest-asyncio>=0.21.0 15 | html2text>=2024.2.26 -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.md", 9 | "groqcrawl.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y Dict[str, Any]: 43 | """ 44 | Scrape a single URL using PocketGroq's enhanced_web_tool. 45 | """ 46 | try: 47 | result = groq.enhanced_web_tool.scrape_page(url, formats) 48 | return result 49 | except Exception as e: 50 | return {"error": str(e)} 51 | 52 | def crawl_website(url: str, max_depth: int, max_pages: int, formats: List[str] = ["markdown", "html"]) -> List[Dict[str, Any]]: 53 | """ 54 | Crawl a website using PocketGroq's enhanced_web_tool. 55 | """ 56 | try: 57 | groq.enhanced_web_tool.max_depth = max_depth 58 | groq.enhanced_web_tool.max_pages = max_pages 59 | results = groq.enhanced_web_tool.crawl(url, formats) 60 | return results 61 | except Exception as e: 62 | return [{"error": str(e)}] 63 | 64 | def map_website(url: str) -> List[str]: 65 | """ 66 | Map a website using PocketGroq's web_search method. 67 | """ 68 | try: 69 | results = groq.web_search(f"site:{url}") 70 | return [result['url'] for result in results] 71 | except Exception as e: 72 | return [f"Error: {str(e)}"] 73 | 74 | # Streamlit UI 75 | st.title("GroqCrawl Interface") 76 | 77 | # Select scraping type 78 | scraping_type = st.radio("Select Scraping Type:", ["Single URL (/scrape)", "Crawl (/crawl)", "Map (/map)"]) 79 | 80 | # Input URL 81 | url = st.text_input("URL:") 82 | 83 | # Advanced Options 84 | with st.expander("Options"): 85 | if scraping_type == "Crawl (/crawl)": 86 | max_depth = st.number_input("Max depth", min_value=1, value=3) 87 | max_pages = st.number_input("Max pages", min_value=1, value=10) 88 | 89 | formats = st.multiselect("Output formats", ["markdown", "html", "structured_data"], default=["markdown", "html"]) 90 | 91 | exclude_paths = st.text_input("Exclude Paths (comma separated):", "blog/, /about/") 92 | include_paths = st.text_input("Include Only Paths (comma separated):", "articles/") 93 | ignore_sitemap = st.checkbox("Ignore sitemap") 94 | allow_backwards_links = st.checkbox("Allow backwards links") 95 | 96 | # Run button 97 | if st.button("Run"): 98 | if url: 99 | if scraping_type == "Single URL (/scrape)": 100 | result = scrape_url(url, formats) 101 | st.subheader("Scraped Result:") 102 | if "error" in result: 103 | st.error(result["error"]) 104 | else: 105 | if "markdown" in formats: 106 | st.markdown("### Markdown Content (Raw)") 107 | st.text_area("Raw Markdown", result.get("markdown", ""), height=300) 108 | if "html" in formats: 109 | st.markdown("### HTML Content") 110 | st.code(result.get("html", ""), language="html") 111 | if "structured_data" in formats: 112 | st.markdown("### Structured Data") 113 | st.json(result.get("structured_data", {})) 114 | 115 | # Option to download result as JSON 116 | json_result = json.dumps(result, indent=4) 117 | st.download_button("Download JSON", json_result, "scraped_result.json", "application/json") 118 | 119 | elif scraping_type == "Crawl (/crawl)": 120 | results = crawl_website(url, max_depth, max_pages, formats) 121 | st.subheader("Crawl Results:") 122 | for i, result in enumerate(results, 1): 123 | st.write(f"Page {i}:") 124 | if "error" in result: 125 | st.error(result["error"]) 126 | else: 127 | st.write(f"URL: {result['url']}") 128 | if "markdown" in formats: 129 | st.markdown("#### Markdown Content (Raw)") 130 | st.text_area(f"Raw Markdown (Page {i})", result.get("markdown", ""), height=200) 131 | if "html" in formats: 132 | st.markdown("#### HTML Content") 133 | st.code(result.get("html", ""), language="html") 134 | if "structured_data" in formats: 135 | st.markdown("#### Structured Data") 136 | st.json(result.get("structured_data", {})) 137 | st.markdown("---") 138 | 139 | # Option to download results as JSON 140 | json_result = json.dumps(results, indent=4) 141 | st.download_button("Download JSON", json_result, "crawl_results.json", "application/json") 142 | 143 | elif scraping_type == "Crawl (/crawl)": 144 | results = crawl_website(url, max_depth, max_pages, formats) 145 | st.subheader("Crawl Results:") 146 | for i, result in enumerate(results, 1): 147 | st.write(f"Page {i}:") 148 | if "error" in result: 149 | st.error(result["error"]) 150 | else: 151 | st.write(f"URL: {result['url']}") 152 | if "markdown" in formats: 153 | st.markdown("#### Markdown Content") 154 | st.markdown(result.get("markdown", "")) 155 | if "html" in formats: 156 | st.markdown("#### HTML Content") 157 | st.code(result.get("html", ""), language="html") 158 | if "structured_data" in formats: 159 | st.markdown("#### Structured Data") 160 | st.json(result.get("structured_data", {})) 161 | st.markdown("---") 162 | 163 | # Option to download results as JSON 164 | json_result = json.dumps(results, indent=4) 165 | st.download_button("Download JSON", json_result, "crawl_results.json", "application/json") 166 | 167 | elif scraping_type == "Map (/map)": 168 | results = map_website(url) 169 | st.subheader("Site Map:") 170 | for link in results: 171 | st.write(link) 172 | 173 | # Option to download results as JSON 174 | json_result = json.dumps(results, indent=4) 175 | st.download_button("Download JSON", json_result, "site_map.json", "application/json") 176 | 177 | else: 178 | st.error("Please enter a URL to process.") 179 | 180 | # Display appropriate warnings 181 | if scraping_type == "Crawl (/crawl)": 182 | st.warning("Crawling may take some time depending on the site size and depth.") 183 | elif scraping_type == "Map (/map)": 184 | st.warning("Mapping uses a basic implementation. Results may not be comprehensive.") --------------------------------------------------------------------------------