├── README.md
├── requirements.txt
├── LICENSE
├── .gitignore
└── app.py

/README.md:
--------------------------------------------------------------------------------
# crawler-demo

A Firecrawl-based web crawler: a small Streamlit app that crawls a URL and extracts page content as Markdown.
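## Quick start

A minimal setup sketch, assuming you already have a Firecrawl API key (`your-key-here` below is a placeholder):

```bash
pip install -r requirements.txt

# app.py loads FIRECRAWL_API_KEY from a .env file via python-dotenv
echo "FIRECRAWL_API_KEY=your-key-here" > .env

streamlit run app.py
```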
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
streamlit==1.28.1
requests==2.31.0
python-dotenv==1.0.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2025 AI Anytime

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/

# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore

# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import streamlit as st
import requests
import os
from dotenv import load_dotenv
import time

# Load environment variables
load_dotenv()

st.title("🕷️ Web Crawler with Firecrawl")
st.write("Enter a URL to crawl and extract content using the Firecrawl API")

# Input for URL
url_to_crawl = st.text_input("Enter URL to crawl:", placeholder="https://example.com")

# Crawl options
col1, col2 = st.columns(2)
with col1:
    limit = st.number_input("Limit pages", min_value=1, max_value=50, value=10)
    crawl_entire_domain = st.checkbox("Crawl entire domain", value=False)

with col2:
    include_sitemap = st.checkbox("Include sitemap", value=True)
    only_main_content = st.checkbox("Only main content", value=False)

# Crawl button
if st.button("🚀 Start Crawling", type="primary"):
    if not url_to_crawl:
        st.error("Please enter a URL to crawl")
    else:
        # Get API key from environment
        api_key = os.getenv("FIRECRAWL_API_KEY")
        if not api_key:
            st.error("FIRECRAWL_API_KEY not found in environment variables")
            st.stop()

        # Prepare the payload
        payload = {
            "url": url_to_crawl,
            "sitemap": "include" if include_sitemap else "exclude",
            "crawlEntireDomain": crawl_entire_domain,
            "limit": limit,
            "scrapeOptions": {
                "onlyMainContent": only_main_content,
                "maxAge": 172800000,
                "parsers": ["pdf"],
                "formats": ["markdown"]
            }
        }
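        # A hedged note on the scrape options above (based on how this app uses
        # them, not on the full Firecrawl spec):
        # - "maxAge" is in milliseconds: 172800000 ms = 48 hours, so Firecrawl
        #   may serve a cached snapshot up to two days old instead of re-scraping.
        # - "parsers": ["pdf"] asks Firecrawl to also parse PDF documents.
        # - "formats": ["markdown"] returns each page as Markdown, which the
        #   results section below renders with st.markdown().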

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        # Show loading spinner
        with st.spinner("🔄 Starting crawl job... This may take a few minutes."):
            try:
                # Make the initial API request to start crawling
                response = requests.post(
                    "https://api.firecrawl.dev/v2/crawl",
                    json=payload,
                    headers=headers,
                    timeout=60
                )

                if response.status_code == 200:
                    result = response.json()

                    if "id" in result:
                        crawl_id = result["id"]
                        st.info(f"🆔 Crawl job started with ID: {crawl_id}")

                        # Poll for results
                        with st.spinner("🔄 Waiting for crawl to complete... This may take several minutes."):
                            max_attempts = 60  # Maximum 10 minutes (60 * 10 seconds)
                            attempt = 0

                            while attempt < max_attempts:
                                # Check crawl status
                                status_response = requests.get(
                                    f"https://api.firecrawl.dev/v2/crawl/{crawl_id}",
                                    headers=headers,
                                    timeout=30
                                )

                                if status_response.status_code == 200:
                                    status_result = status_response.json()

                                    # Check if crawl is completed
                                    if status_result.get("status") == "completed":
                                        st.success("✅ Crawling completed successfully!")

                                        # Display the results
                                        st.subheader("📄 Crawl Results")

                                        # Show raw JSON response in expandable section
                                        with st.expander("View Raw Response"):
                                            st.json(status_result)

                                        # Display formatted results if data is available
                                        if "data" in status_result and status_result["data"]:
                                            st.subheader("📋 Extracted Content")
                                            for i, page in enumerate(status_result["data"], 1):
                                                with st.expander(f"Page {i}: {page.get('url', 'Unknown URL')}"):
                                                    if "markdown" in page:
                                                        st.markdown(page["markdown"])
                                                    elif "content" in page:
                                                        st.text(page["content"])
                                                    else:
                                                        st.json(page)
                                        else:
                                            st.warning("No content data found in the response.")
                                        break

                                    elif status_result.get("status") == "failed":
                                        st.error("❌ Crawl job failed!")
                                        st.error(f"Error: {status_result.get('error', 'Unknown error')}")
                                        break

                                    else:
                                        # Still in progress, show status
                                        current_status = status_result.get("status", "unknown")
                                        st.info(f"⏳ Status: {current_status}")
                                        time.sleep(10)  # Wait 10 seconds before next check
                                        attempt += 1

                                else:
                                    st.error(f"❌ Error checking status: {status_response.status_code}")
                                    break

                            if attempt >= max_attempts:
                                st.warning("⏰ Crawl is taking longer than expected. You can check the status manually using the crawl ID above.")

                    else:
                        st.error("❌ No crawl ID returned from the API")
                        st.json(result)

                else:
                    st.error(f"❌ Error: {response.status_code}")
                    st.error(f"Response: {response.text}")

            except requests.exceptions.Timeout:
                st.error("⏰ Request timed out. The crawling process is taking longer than expected.")
            except requests.exceptions.RequestException as e:
                st.error(f"❌ Request failed: {str(e)}")
            except Exception as e:
                st.error(f"❌ An unexpected error occurred: {str(e)}")

# Sidebar with information
with st.sidebar:
    st.header("ℹ️ About")
    st.write("This app uses the Firecrawl API to crawl websites and extract content.")

    st.header("⚙️ Settings")
    st.write("Configure your crawling options in the main panel.")

    st.header("📚 Features")
    st.write("• Extract content in Markdown format")
    st.write("• Crawl single pages or entire domains")
    st.write("• Include/exclude sitemaps")
    st.write("• Filter main content only")
    st.write("• PDF parsing support")
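
# If a crawl outlives the 10-minute polling window above, the job can still be
# checked manually with the crawl ID the app reports. A minimal sketch hitting
# the same status endpoint this app polls ("your-crawl-id" is a placeholder):
#
#   import os
#   import requests
#   from dotenv import load_dotenv
#
#   load_dotenv()
#   crawl_id = "your-crawl-id"
#   resp = requests.get(
#       f"https://api.firecrawl.dev/v2/crawl/{crawl_id}",
#       headers={"Authorization": f"Bearer {os.getenv('FIRECRAWL_API_KEY')}"},
#       timeout=30,
#   )
#   print(resp.json().get("status"))
--------------------------------------------------------------------------------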