├── images
│   └── firecrawl-devdocs-to-llm-cover.png
├── LICENSE
├── .gitignore
├── README.md
└── devdocs_to_llm_firecrawl.ipynb
/images/firecrawl-devdocs-to-llm-cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/devdocs-to-llm/main/images/firecrawl-devdocs-to-llm-cover.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Alex Fazio
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
2 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
3 |
4 | # User-specific stuff
5 | .idea/**/workspace.xml
6 | .idea/**/tasks.xml
7 | .idea/**/usage.statistics.xml
8 | .idea/**/dictionaries
9 | .idea/**/shelf
10 |
11 | # AWS User-specific
12 | .idea/**/aws.xml
13 |
14 | # Generated files
15 | .idea/**/contentModel.xml
16 |
17 | # Sensitive or high-churn files
18 | .idea/**/dataSources/
19 | .idea/**/dataSources.ids
20 | .idea/**/dataSources.local.xml
21 | .idea/**/sqlDataSources.xml
22 | .idea/**/dynamic.xml
23 | .idea/**/uiDesigner.xml
24 | .idea/**/dbnavigator.xml
25 |
26 | # Gradle
27 | .idea/**/gradle.xml
28 | .idea/**/libraries
29 |
30 | # Gradle and Maven with auto-import
31 | # When using Gradle or Maven with auto-import, you should exclude module files,
32 | # since they will be recreated, and may cause churn. Uncomment if using
33 | # auto-import.
34 | # .idea/artifacts
35 | # .idea/compiler.xml
36 | # .idea/jarRepositories.xml
37 | # .idea/modules.xml
38 | # .idea/*.iml
39 | # .idea/modules
40 | # *.iml
41 | # *.ipr
42 |
43 | # CMake
44 | cmake-build-*/
45 |
46 | # Mongo Explorer plugin
47 | .idea/**/mongoSettings.xml
48 |
49 | # File-based project format
50 | *.iws
51 |
52 | # IntelliJ
53 | out/
54 |
55 | # mpeltonen/sbt-idea plugin
56 | .idea_modules/
57 |
58 | # JIRA plugin
59 | atlassian-ide-plugin.xml
60 |
61 | # Cursive Clojure plugin
62 | .idea/replstate.xml
63 |
64 | # SonarLint plugin
65 | .idea/sonarlint/
66 |
67 | # Crashlytics plugin (for Android Studio and IntelliJ)
68 | com_crashlytics_export_strings.xml
69 | crashlytics.properties
70 | crashlytics-build.properties
71 | fabric.properties
72 |
73 | # Editor-based Rest Client
74 | .idea/httpRequests
75 |
76 | # Android studio 3.1+ serialized cache file
77 | .idea/caches/build_file_checksums.ser
78 |
79 | # Env Files, excluding examples
80 | *.env
81 | !.env.example
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![DevDocs to LLM cover](images/firecrawl-devdocs-to-llm-cover.png)
2 |
3 |
4 |
5 |
6 |
7 |
8 | Turn any developer documentation into a specialized GPT.
9 |
10 | ## Overview
11 |
12 | DevDocs to LLM is a tool that allows you to crawl developer documentation, extract content, and process it into a format suitable for use with large language models (LLMs) like ChatGPT. This enables you to create specialized assistants tailored to specific documentation sets.
13 |
14 | ## Features
15 |
16 | - Web crawling with customizable options
17 | - Content extraction in Markdown format
18 | - Rate limiting to respect server constraints
19 | - Retry mechanism for failed scrapes
20 | - Export options:
21 | - Rentry.co for quick sharing
22 | - Google Docs for larger documents
23 |
24 | ## Usage
25 |
26 | 1. Set up the Firecrawl environment
27 | 2. Crawl a website and generate a sitemap
28 | 3. Extract content from crawled pages
29 | 4. Export the processed content (see the sketch below)
30 |
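At a glance, the notebook drives these steps with the `firecrawl-py` client. The sketch below mirrors the calls used in the notebook (the notebook's crawl step also includes a plain `requests`/BeautifulSoup fallback); the API key, URL list, and variable names are placeholders, so treat it as an outline rather than a drop-in script:

```python
from firecrawl import FirecrawlApp

# 1. Set up the Firecrawl environment.
app = FirecrawlApp(api_key="fc-...")  # your Firecrawl API key

# 2. Crawl: the notebook builds a sitemap (a list of page URLs) for the documentation root.
sitemap = ["https://docs.firecrawl.dev/"]  # placeholder; the notebook discovers these URLs

# 3. Extract the main content of each page as Markdown.
pages = []
for url in sitemap:
    result = app.scrape_url(url=url, params={'onlyMainContent': True, 'waitFor': 5000})
    pages.append(f"# {url}\n\n{result.get('markdown', '')}")

# 4. Export: the notebook posts the combined document to Rentry.co or Google Docs.
combined_markdown = "\n\n***\n\n".join(pages)
```
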
31 | ## Requirements
32 |
33 | - Firecrawl API key
34 | - Google Docs API credentials (optional, for Google Docs export)
35 |
36 | ## Installation
37 |
38 | This project is designed to run in a Jupyter notebook environment, particularly Google Colab. No local installation is required.
39 |
40 | ## Configuration
41 |
42 | Before running the notebook, you'll need to set a few parameters (example values are sketched after this list):
43 |
44 | - `sub_url`: The URL of the documentation you want to crawl
45 | - `limit`: Maximum number of pages to crawl
46 | - `scrape_option`: Choose to scrape all pages or a specific number
47 | - `num_pages`: Number of pages to scrape if not scraping all
48 | - `pages_per_minute`: Rate limiting parameter
49 | - `wait_time_between_chunks`: Delay between scraping chunks
50 | - `retry_attempts`: Number of retries for failed scrapes
51 |
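For orientation, here is a sketch of example values, using the defaults that appear in the notebook's Colab form cells (adjust them for your own documentation set and Firecrawl plan):

```python
# Example parameter values, mirroring the defaults in the notebook's form fields.
sub_url = "https://docs.cursor.com/"        # URL of the documentation to crawl
limit = 55                                  # maximum number of pages to crawl
scrape_option = "Specific number of pages"  # or "All pages"
num_pages = 55                              # pages to scrape when not scraping all
pages_per_minute = 9                        # keep within your Firecrawl plan's rate limit
wait_time_between_chunks = 33               # seconds to wait between scraping chunks
retry_attempts = 3                          # retries for failed scrapes
```
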
52 | ## Contributing
53 |
54 | Contributions are welcome! Please feel free to submit a Pull Request.
55 |
56 | ## License
57 |
58 | [MIT](https://opensource.org/licenses/MIT)
59 |
60 | Copyright (c) 2024-present, Alex Fazio
61 |
62 | ---
63 |
64 | [Post on X](https://x.com/alxfazio/status/1826731977283641615)
65 |
--------------------------------------------------------------------------------
/devdocs_to_llm_firecrawl.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "private_outputs": true,
7 | "provenance": [],
8 | "authorship_tag": "ABX9TyMK76muNMFRt0JTXy2fMPCy",
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "language_info": {
16 | "name": "python"
17 | }
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "view-in-github",
24 | "colab_type": "text"
25 | },
26 | "source": [
27 |         "<a href=\"https://colab.research.google.com/github/alexfazio/devdocs-to-llm/blob/main/devdocs_to_llm_firecrawl.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "source": [
33 | "# DevDocs to LLM: turn any documentation into a GPT\n",
34 | "\n",
35 | "By Alex Fazio (https://twitter.com/alxfazio)\n",
36 | "\n",
37 | "Github repo: https://github.com/alexfazio/devdocs-to-llm\n",
38 | "\n",
39 |         "This Jupyter notebook demonstrates how to use the Firecrawl API to crawl developer documentation, extract content, and process the information so that it is ready to use in an assistant like ChatGPT.\n",
40 | "\n",
41 | "By the end of this notebook, you'll be able to:\n",
42 | "\n",
43 |         "1. Set up the Firecrawl environment\n",
44 |         "2. Crawl a website and generate a sitemap\n",
45 |         "3. Extract content from crawled pages in Markdown\n",
46 |         "4. Export the processed content to various platforms, including Rentry.co and Google Docs\n",
47 | "\n",
48 | "This cookbook is designed for developers and data scientists who want to efficiently gather and analyze developer documentation at scale."
49 | ],
50 | "metadata": {
51 | "id": "yEkq-MhCH5bw"
52 | }
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "source": [
57 | "## Requirements\n",
58 | "Before proceeding, ensure you have the following:\n",
59 | "\n",
60 | "- **Firecrawl API key**: Essential for accessing the Firecrawl service\n",
61 |         "- **Google Docs API credentials** (optional): A JSON file named `client_secret_<...>.apps.googleusercontent.com.json` for Google Docs integration.\n",
62 | "\n",
63 | "Note: The Google Docs API credential is only required if you plan to export content to Google Docs. All other functionalities can be used without this optional component."
64 | ],
65 | "metadata": {
66 | "id": "puKp3vCs8Oex"
67 | }
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "source": [
72 | "## Tested Documentation Sources\n",
73 | "\n",
74 | "| Status | Documentation Source | URL |\n",
75 | "|--------|----------------------|-----|\n",
76 | "| ✅ | CrewAI | https://docs.crewai.com/ |\n",
77 | "| ✅ | Brave Search API | https://api.search.brave.com/app/documentation/ |\n",
78 | "| ✅ | OpenAI | https://platform.openai.com/docs |\n",
79 | "| ✅ | FireCrawl | https://docs.firecrawl.dev/ |\n",
80 | "| ✅ | Anthropic | https://docs.anthropic.com/en/docs/ |\n",
81 | "| ✅ | LangChain | https://python.langchain.com/v0.2/docs |\n",
82 | "\n",
83 | "Note: A checkmark (✅) indicates successful testing with the corresponding documentation source."
84 | ],
85 | "metadata": {
86 | "id": "8othA_x4AvGK"
87 | }
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "source": [
92 | "## Setup\n",
93 | "\n",
94 | "First, let's set up our environment with the necessary imports and initializations:\n",
95 | "\n",
96 | "This notebook requires the following libraries:\n",
97 | "\n",
98 | "- `firecrawl-py`: For web crawling and content extraction\n",
99 | "- `requests`: For making HTTP requests\n",
100 | "- `beautifulsoup4`: For parsing HTML content"
101 | ],
102 | "metadata": {
103 | "id": "Uag3A-M2Hs1L"
104 | }
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {
110 | "id": "vAt71wSHVqgz"
111 | },
112 | "outputs": [],
113 | "source": [
114 | "%pip install firecrawl-py requests beautifulsoup4 --quiet\n",
115 | "print(\"---\")\n",
116 |         "%pip show firecrawl-py requests beautifulsoup4"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "source": [
122 | "from firecrawl import FirecrawlApp"
123 | ],
124 | "metadata": {
125 | "id": "_lKdJlD2iFAI"
126 | },
127 | "execution_count": null,
128 | "outputs": []
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "source": [
133 |         "Next, insert your Firecrawl API key (`fc-...`)."
134 | ],
135 | "metadata": {
136 | "id": "etLyo0ot3_ct"
137 | }
138 | },
139 | {
140 | "cell_type": "code",
141 | "source": [
142 | "from getpass import getpass\n",
143 | "fc_api_key = getpass(\"Enter your Firecrawl API key: \")\n",
144 | "assert fc_api_key != \"\", \"Error: fc_api_key should not be an empty string\""
145 | ],
146 | "metadata": {
147 | "id": "xnKnooAbTbHt"
148 | },
149 | "execution_count": null,
150 | "outputs": []
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "source": [
155 | "## Crawling\n",
156 | "\n",
157 | "Now let's crawl some DevDocs pages to use in our examples.\n",
158 | "\n",
159 |         "Enter the URL of the main documentation overview page, **rather than a specific section or page within the documentation**.\n",
160 |         "\n",
161 |         "This lets the crawler start from the main documentation page and follow links to specific topics as needed."
162 | ],
163 | "metadata": {
164 | "id": "dd0NwYwIAKbm"
165 | }
166 | },
167 | {
168 | "cell_type": "code",
169 | "source": [
170 | "sub_url = \"https://docs.cursor.com/\" # @param {type:\"string\"}\n",
171 | "assert sub_url != \"\", \"Error: sub_url should not be an empty string\""
172 | ],
173 | "metadata": {
174 | "id": "kn8ar3iZgd-Y",
175 | "cellView": "form"
176 | },
177 | "execution_count": null,
178 | "outputs": []
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "source": [
183 |         "### Preview the Site Map and Page Count (Optional)\n",
184 | "\n",
185 | "The following cell will display the site map and preview the page count, providing only a general idea of the structure and number of pages. This process **will not** use the Firecrawl API or consume any usage tokens. Please be aware that this preview is not as accurate as crawling the entire website with Firecrawl."
186 | ],
187 | "metadata": {
188 | "id": "JTMP1UbTSQXs"
189 | }
190 | },
191 | {
192 | "cell_type": "code",
193 | "source": [
194 | "import requests\n",
195 | "from bs4 import BeautifulSoup\n",
196 | "from urllib.parse import urljoin\n",
197 | "\n",
198 | "def get_site_map(url, base_url=None, depth=0, max_depth=3, visited=None, sitemap=None):\n",
199 | " if visited is None:\n",
200 | " visited = set()\n",
201 | " if sitemap is None:\n",
202 | " sitemap = []\n",
203 | " if base_url is None:\n",
204 | " base_url = url\n",
205 | " if depth > max_depth or url in visited:\n",
206 | " return\n",
207 | "\n",
208 | " visited.add(url)\n",
209 | " sitemap.append(url)\n",
210 | "\n",
211 | " try:\n",
212 | " response = requests.get(url)\n",
213 | " soup = BeautifulSoup(response.text, 'html.parser')\n",
214 | " print('| ' * depth + '+-- ' + url)\n",
215 | "\n",
216 | " for link in soup.find_all('a', href=True):\n",
217 | " href = link['href']\n",
218 | " full_url = urljoin(url, href)\n",
219 | " # Check if the full_url starts with the base_url\n",
220 | " if full_url.startswith(base_url):\n",
221 | " get_site_map(full_url, base_url, depth + 1, max_depth, visited, sitemap)\n",
222 | " except Exception as e:\n",
223 | " print(f\"Error processing {url}: {str(e)}\")\n",
224 | "\n",
225 | " return sitemap\n",
226 | "\n",
227 | "def crawl_sub_url(sub_url, max_depth=3):\n",
228 | " # Ensure the sub_url ends with a '/'\n",
229 | " if not sub_url.endswith('/'):\n",
230 | " sub_url += '/'\n",
231 | "\n",
232 | " base_url = sub_url\n",
233 | " print(f\"[REQUESTS CRAWL] Sitemap for {base_url}:\")\n",
234 | " sitemap = get_site_map(base_url, base_url=base_url, max_depth=max_depth)\n",
235 | " print(f\"\\n[REQUESTS CRAWL] Total pages crawled: {len(sitemap)}\")\n",
236 | " return sitemap\n",
237 | "\n",
238 | "# Example usage\n",
239 | "preview_sitemap = crawl_sub_url(sub_url)\n",
240 | "print(\"\\nSite map:\")\n",
241 | "print(preview_sitemap)\n",
242 | "\n",
243 | "# Store the preview_sitemap in a global variable\n",
244 | "sitemap = preview_sitemap"
245 | ],
246 | "metadata": {
247 | "id": "qyNFRv_WSAa9"
248 | },
249 | "execution_count": null,
250 | "outputs": []
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "source": [
255 |         "Next, let's initialize `FirecrawlApp()`, the Python object that lets you interact with the Firecrawl API. It sets up a client so you can call its methods (like `crawl_url` or `scrape_url`) to send requests to Firecrawl and get website data back."
256 | ],
257 | "metadata": {
258 | "id": "U-BkvAvMLUzL"
259 | }
260 | },
261 | {
262 | "cell_type": "code",
263 | "source": [
264 |         "app = FirecrawlApp(api_key=fc_api_key)"
265 | ],
266 | "metadata": {
267 | "id": "0FyyXaUPT6HB"
268 | },
269 | "execution_count": null,
270 | "outputs": []
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "source": [
275 |         "To manage costs and control crawl scope, specify a maximum number of pages to crawl using the `limit` parameter below."
276 | ],
277 | "metadata": {
278 | "id": "S-NuxKlLL0qJ"
279 | }
280 | },
281 | {
282 | "cell_type": "code",
283 | "source": [
284 | "#@markdown Limit the crawl to a maximum of `limit` pages\n",
285 | "limit = 55 #@param {type:\"number\"}"
286 | ],
287 | "metadata": {
288 | "id": "koUvHtRhVdwi"
289 | },
290 | "execution_count": null,
291 | "outputs": []
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "source": [
296 | "# Crawl Launch"
297 | ],
298 | "metadata": {
299 | "id": "lTFC4NkOqxUd"
300 | }
301 | },
302 | {
303 | "cell_type": "code",
304 | "source": [
305 | "import requests\n",
306 | "from bs4 import BeautifulSoup\n",
307 | "from urllib.parse import urljoin, urlparse\n",
308 | "import random\n",
309 | "from firecrawl import FirecrawlApp\n",
310 | "import json\n",
311 | "\n",
312 | "def merged_crawl(start_url, limit, fc_api_key=fc_api_key):\n",
313 | " def standard_crawl(start_url):\n",
314 | " visited = set()\n",
315 | " collected_urls = []\n",
316 | "\n",
317 | " def normalize_url(url):\n",
318 | " parsed_url = urlparse(url)\n",
319 | " normalized_url = parsed_url._replace(fragment=\"\", query=\"\").geturl()\n",
320 | " return normalized_url\n",
321 | "\n",
322 | " def is_valid_subpage(url):\n",
323 | " return url.startswith(start_url) and not url.startswith(start_url + '#')\n",
324 | "\n",
325 | " def crawl(url):\n",
326 | " if len(collected_urls) >= limit:\n",
327 | " return\n",
328 | "\n",
329 | " normalized_url = normalize_url(url)\n",
330 | " if (normalized_url in visited) or (not is_valid_subpage(normalized_url)):\n",
331 | " return\n",
332 | " visited.add(normalized_url)\n",
333 | " try:\n",
334 | " response = requests.get(normalized_url)\n",
335 | " if response.status_code != 200:\n",
336 | " return\n",
337 | " except requests.exceptions.RequestException:\n",
338 | " return\n",
339 | "\n",
340 | " collected_urls.append(normalized_url)\n",
341 | " print(f\"[REQUESTS CRAWLER] {len(collected_urls)}/{limit}: {normalized_url}\")\n",
342 | "\n",
343 | " if len(collected_urls) >= limit:\n",
344 | " return\n",
345 | "\n",
346 | " soup = BeautifulSoup(response.content, \"html.parser\")\n",
347 | " for link in soup.find_all(\"a\", href=True):\n",
348 | " if len(collected_urls) >= limit:\n",
349 | " return\n",
350 | " href = link['href']\n",
351 | " full_url = urljoin(url, href)\n",
352 | " normalized_full_url = normalize_url(full_url)\n",
353 | " if is_valid_subpage(normalized_full_url):\n",
354 | " crawl(normalized_full_url)\n",
355 | "\n",
356 | " crawl(start_url)\n",
357 | " return collected_urls\n",
358 | "\n",
359 | " def firecrawl_method(start_url, limit):\n",
360 | " app = FirecrawlApp(api_key=fc_api_key)\n",
361 | " crawl_result = app.crawl_url(\n",
362 | " start_url,\n",
363 | " {\n",
364 | " 'crawlerOptions': {\n",
365 | " 'includePaths': ['/docs/', '/documentation/'],\n",
366 | " 'limit': limit,\n",
367 | " }\n",
368 | " }\n",
369 | " )\n",
370 | "\n",
371 | " urls_to_scrape = []\n",
372 | " for page_data in crawl_result:\n",
373 | " metadata = page_data.get('metadata', {})\n",
374 | " source_url = metadata.get('sourceURL')\n",
375 | " if source_url:\n",
376 | " urls_to_scrape.append(source_url)\n",
377 | " links_on_page = metadata.get('linksOnPage')\n",
378 | " if links_on_page:\n",
379 | " urls_to_scrape.extend(links_on_page)\n",
380 | "\n",
381 | " return urls_to_scrape\n",
382 | "\n",
383 | " try:\n",
384 | " print(\"[CRAWLER] Attempting standard crawl...\")\n",
385 | " result = standard_crawl(start_url)\n",
386 | " if not result:\n",
387 | " raise Exception(\"[CRAWLER] Standard crawl returned no results\")\n",
388 | " return result\n",
389 | " except Exception as e:\n",
390 | " print(f\"[CRAWLER] Standard crawl failed: {str(e)}\")\n",
391 | " print(\"[CRAWLER] Falling back to firecrawl method...\")\n",
392 | " return firecrawl_method(start_url, limit)\n",
393 | "\n",
394 | "# Usage\n",
395 | "sitemap = merged_crawl(sub_url, limit)\n",
396 | "print(\"---\")\n",
397 | "print(f\"[CRAWLER] Crawled URLs (sitemap): {sitemap}\")\n",
398 | "print(f\"[CRAWLER] Number of pages crawled: {len(sitemap)}\")"
399 | ],
400 | "metadata": {
401 | "id": "QtM_CYOM1q2V"
402 | },
403 | "execution_count": null,
404 | "outputs": []
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "source": [
409 | "## Scraping\n",
410 | "\n",
411 | "With our sitemap in hand, we can now proceed to extract content from each page. Firecrawl's content extraction capabilities allow us to efficiently parse web pages and retrieve the main content as markdown, filtering out navigation menus, advertisements, and other non-essential elements."
412 | ],
413 | "metadata": {
414 | "id": "tygeo3dr0xi0"
415 | }
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "source": [
420 | "Before we begin the extraction process, let's set some parameters:\n",
421 | "\n",
422 | "- `scrape_option`: Choose whether to scrape all pages or a specific number of pages.\n",
423 | "- `num_pages`: If scraping a specific number of pages, set the desired number here.\n",
424 | "\n",
425 | "Please set these parameters in the cell below."
426 | ],
427 | "metadata": {
428 | "id": "ce2N8HasNylT"
429 | }
430 | },
431 | {
432 | "cell_type": "markdown",
433 | "source": [
434 | "# Scraping Options"
435 | ],
436 | "metadata": {
437 | "id": "HZau_9Hsq1Jg"
438 | }
439 | },
440 | {
441 | "cell_type": "code",
442 | "source": [
443 | "# Create a dropdown for scrape options\n",
444 | "scrape_option = \"Specific number of pages\" # @param [\"All pages\", \"Specific number of pages\"]\n",
445 | "\n",
446 | "# Create a numerical input for the specific number of pages\n",
447 | "num_pages = 55 # @param {type:\"number\"}\n",
448 | "\n",
449 | "# Initialize the num_pages variable depending on the scrape_option\n",
450 | "if scrape_option == \"Specific number of pages\":\n",
451 | " # Check if the number of pages exceeds the length of the sitemap\n",
452 | " num_pages = min(num_pages, len(sitemap)) # Set num_pages to the smaller of the two values\n",
453 | "else:\n",
454 | " # If \"All pages\" is selected, set num_pages to the total length of the sitemap\n",
455 | " num_pages = len(sitemap)\n",
456 | "\n",
457 | "# Now you can proceed with your scraping logic using num_pages\n",
458 | "print(f\"[SCRAPER] Number of pages to scrape: {num_pages}\")"
459 | ],
460 | "metadata": {
461 | "id": "CdIFd1fuA00_"
462 | },
463 | "execution_count": null,
464 | "outputs": []
465 | },
466 | {
467 | "cell_type": "markdown",
468 | "source": [
469 | "Now, let's execute the content extraction process. Our script will:\n",
470 | "\n",
471 | "1. Iterate through the URLs in our sitemap\n",
472 | "2. Use Firecrawl's API to extract the main content from each page\n",
473 | "3. Store the extracted content in both XML and Markdown formats\n",
474 | "\n",
475 |         "Wrapping each page in XML tags helps structure large documents before feeding them to an LLM for RAG or direct querying."
476 | ],
477 | "metadata": {
478 | "id": "M7M9lAYzN9-k"
479 | }
480 | },
481 | {
482 | "cell_type": "markdown",
483 | "source": [
484 |         "Firecrawl's rate limits for scraping, by plan:\n",
485 | "\n",
486 | "| **Plan** | **Max Pages Scraped/Minute** |\n",
487 | "|--------------|------------------------------|\n",
488 | "| Free | 5 |\n",
489 | "| Hobby | 10 |\n",
490 | "| Standard | 50 |\n",
491 | "| Growth | 500 |\n",
492 | "\n",
493 |         "Set the timing parameters below so that you stay within the rate limits for your Firecrawl plan and avoid triggering any scraping restrictions.\n",
494 |         "\n",
495 |         "Check the [Firecrawl docs](https://docs.firecrawl.dev/rate-limits) for further guidance on rate limits."
496 | ],
497 | "metadata": {
498 | "id": "4bvLnXVgfSQN"
499 | }
500 | },
501 | {
502 | "cell_type": "markdown",
503 | "source": [
504 | "# Rate Limiting and Retry Parameters\n"
505 | ],
506 | "metadata": {
507 | "id": "2fqRWptDfhPc"
508 | }
509 | },
510 | {
511 | "cell_type": "code",
512 | "source": [
513 | "# @markdown Set the rate limiting and retry parameters for the web scraper:\n",
514 | "\n",
515 | "pages_per_minute = 9 # @param {type:\"integer\"}\n",
516 | "# @markdown Number of pages that can be scraped per minute\n",
517 | "\n",
518 | "wait_time_between_chunks = 33 # @param {type:\"integer\"}\n",
519 | "# @markdown Waiting time (in seconds) between max chunks\n",
520 | "\n",
521 | "retry_attempts = 3 # @param {type:\"integer\"}\n",
522 | "# @markdown Number of times to retry failed scrapes\n",
523 | "\n",
524 | "print(f\"Pages per minute: {pages_per_minute}\")\n",
525 | "print(f\"Wait time between chunks: {wait_time_between_chunks} seconds\")\n",
526 | "print(f\"Number of retry attempts: {retry_attempts}\")\n",
527 | "\n",
528 | "# You can use these variables in your main scraping code"
529 | ],
530 | "metadata": {
531 | "id": "5pxAjeDma8c-"
532 | },
533 | "execution_count": null,
534 | "outputs": []
535 | },
536 | {
537 | "cell_type": "markdown",
538 | "source": [
539 | "## Check the status of the `sitemap` variable for debugging\n",
540 | "\n"
541 | ],
542 | "metadata": {
543 | "id": "hKKezwOKpQLq"
544 | }
545 | },
546 | {
547 | "cell_type": "code",
548 | "source": [
549 | "print(sitemap)"
550 | ],
551 | "metadata": {
552 | "id": "-0f_2R1DpPsV"
553 | },
554 | "execution_count": null,
555 | "outputs": []
556 | },
557 | {
558 | "cell_type": "markdown",
559 | "source": [
560 | "# Firecrawl Scrape Launch"
561 | ],
562 | "metadata": {
563 | "id": "2z23mapxqitK"
564 | }
565 | },
566 | {
567 | "cell_type": "code",
568 | "source": [
569 | "import time\n",
570 | "import math\n",
571 | "import requests\n",
572 | "\n",
573 | "# Initialize a file to store the XML content\n",
574 | "output_file = 'scraped_content.xml'\n",
575 | "\n",
576 |         "# Initialize strings to store all the XML and markdown content (the XML tag names used below are illustrative)\n",
577 |         "all_xml = \"<documents>\\n\"\n",
578 | "all_markdown = \"\"\n",
579 | "\n",
580 | "# Initialize a list to store failed scrape URLs\n",
581 | "failed_scrapes = []\n",
582 | "\n",
583 | "# Determine the number of pages to scrape\n",
584 | "pages_to_scrape = len(sitemap) if scrape_option == \"All pages\" else min(num_pages, len(sitemap))\n",
585 | "\n",
586 | "# Calculate the chunk size and total number of chunks\n",
587 | "chunk_size = pages_per_minute\n",
588 | "total_chunks = math.ceil(pages_to_scrape / chunk_size)\n",
589 | "\n",
590 | "def scrape_url(url, attempt=1):\n",
591 | " print(f\"[FIRECRAWL SCRAPER] Attempting to scrape URL: {url} (Attempt {attempt})\")\n",
592 | " start_time = time.time()\n",
593 | " try:\n",
594 | " response = app.scrape_url(url=url, params={\n",
595 | " 'onlyMainContent': True,\n",
596 | " 'waitFor': 5000,\n",
597 | " })\n",
598 | " markdown_content = response.get('markdown', '')\n",
599 | " end_time = time.time()\n",
600 | " scrape_time = end_time - start_time\n",
601 | " print(f\"[FIRECRAWL SCRAPER] Successfully scraped URL: {url}\")\n",
602 | " print(f\"[FIRECRAWL SCRAPER] Scrape time: {scrape_time:.2f} seconds\")\n",
603 | " print(f\"[FIRECRAWL SCRAPER] Content length: {len(markdown_content)} characters\")\n",
604 | " return markdown_content\n",
605 |         "    except Exception as e:  # catch any scrape failure so it can be recorded and retried\n",
606 | " end_time = time.time()\n",
607 | " scrape_time = end_time - start_time\n",
608 | " print(f\"[FIRECRAWL SCRAPER] Error scraping {url}: {str(e)}\")\n",
609 | " print(f\"[FIRECRAWL SCRAPER] Scrape time (failed): {scrape_time:.2f} seconds\")\n",
610 | " return None\n",
611 | "\n",
612 | "def process_scraped_content(url, markdown_content):\n",
613 | " global all_xml, all_markdown\n",
614 |         "    # Create XML structure for this page without indentation (illustrative tag names)\n",
615 |         "    page_xml = f\"<document url='{url}'>\\n<content>\\n{markdown_content}\\n</content>\\n</document>\\n\"\n",
616 | "\n",
617 | " # Add the markdown content with a proper separator\n",
618 | " if all_markdown: # If it's not the first entry, add a separator\n",
619 | " all_markdown += \"\\n\\n***\\n\\n\"\n",
620 | " all_markdown += f\"# {url}\\n\\n{markdown_content}\"\n",
621 | "\n",
622 | " # Append the page XML to the XML string\n",
623 | " all_xml += page_xml\n",
624 | " print(f\"[FIRECRAWL SCRAPER] Processed content for URL: {url}\")\n",
625 | " print(f\"[FIRECRAWL SCRAPER] XML content length: {len(page_xml)} characters\")\n",
626 | " return page_xml\n",
627 | "\n",
628 | "# Open the file in write mode\n",
629 | "with open(output_file, 'w') as file:\n",
630 | " # Write the opening XML tag\n",
631 |         "    file.write(\"<documents>\\n\")\n",
632 | "\n",
633 | " for chunk in range(total_chunks):\n",
634 | " chunk_start = chunk * chunk_size\n",
635 | " chunk_end = min((chunk + 1) * chunk_size, pages_to_scrape)\n",
636 | " print(f\"[FIRECRAWL SCRAPER] Processing chunk {chunk+1}/{total_chunks} (URLs {chunk_start+1}-{chunk_end})\")\n",
637 | "\n",
638 | " for i in range(chunk_start, chunk_end):\n",
639 | " url = sitemap[i]\n",
640 | " print(f\"[FIRECRAWL SCRAPER] Processing URL {i+1}/{pages_to_scrape}: {url}\")\n",
641 | " markdown_content = scrape_url(url)\n",
642 | "\n",
643 | " if markdown_content is not None:\n",
644 | " page_xml = process_scraped_content(url, markdown_content)\n",
645 | " # Write the page XML to the file\n",
646 | " file.write(page_xml)\n",
647 | " print(f\"[FIRECRAWL SCRAPER] Successfully wrote content for URL: {url}\")\n",
648 | " else:\n",
649 | " failed_scrapes.append(url)\n",
650 | " print(f\"[FIRECRAWL SCRAPER] Failed to scrape URL: {url}\")\n",
651 | "\n",
652 | " # Wait after each chunk, except for the last one\n",
653 | " if chunk < total_chunks - 1:\n",
654 | " print(f\"[FIRECRAWL SCRAPER] Chunk {chunk+1} completed. Waiting for {wait_time_between_chunks} seconds before the next chunk...\")\n",
655 | " time.sleep(wait_time_between_chunks)\n",
656 | "\n",
657 | " # Retry failed scrapes\n",
658 | " for attempt in range(retry_attempts):\n",
659 | " if not failed_scrapes:\n",
660 | " break\n",
661 | " print(f\"[FIRECRAWL SCRAPER] Retry attempt {attempt + 1} of {retry_attempts} for {len(failed_scrapes)} failed scrapes...\")\n",
662 | " retry_urls = failed_scrapes.copy()\n",
663 | " failed_scrapes = []\n",
664 | " for url in retry_urls:\n",
665 | " print(f\"[FIRECRAWL SCRAPER] Retrying URL: {url}\")\n",
666 | " markdown_content = scrape_url(url, attempt=attempt+2)\n",
667 | " if markdown_content is not None:\n",
668 | " page_xml = process_scraped_content(url, markdown_content)\n",
669 | " # Write the page XML to the file\n",
670 | " file.write(page_xml)\n",
671 | " print(f\"[FIRECRAWL SCRAPER] Successfully scraped and wrote content for retried URL: {url}\")\n",
672 | " else:\n",
673 | " failed_scrapes.append(url)\n",
674 | " print(f\"[FIRECRAWL SCRAPER] Failed to scrape URL on retry: {url}\")\n",
675 | "\n",
676 | " if failed_scrapes:\n",
677 | " print(f\"[FIRECRAWL SCRAPER] Retry attempt {attempt + 1} completed. Waiting for {wait_time_between_chunks} seconds before the next retry attempt...\")\n",
678 | " time.sleep(wait_time_between_chunks)\n",
679 | "\n",
680 | " # Write the closing XML tag\n",
681 |         "    file.write(\"</documents>\")\n",
682 | " print(\"[FIRECRAWL SCRAPER] Finished writing to XML file\")\n",
683 | "\n",
684 | "# Add the closing tag to the XML string variable\n",
685 |         "all_xml += \"</documents>\"\n",
686 | "\n",
687 | "# Now you can use the 'all_xml' and 'all_markdown' string variables as needed\n",
688 | "print(f\"[FIRECRAWL SCRAPER] Total characters in all_xml: {len(all_xml)}\")\n",
689 | "print(f\"[FIRECRAWL SCRAPER] Total characters in all_markdown: {len(all_markdown)}\")\n",
690 | "print(f\"[FIRECRAWL SCRAPER] Number of pages scraped: {pages_to_scrape}\")\n",
691 | "print(f\"[FIRECRAWL SCRAPER] Number of pages that failed to scrape after all retries: {len(failed_scrapes)}\")\n",
692 | "if failed_scrapes:\n",
693 | " print(\"[FIRECRAWL SCRAPER] Failed URLs:\")\n",
694 | " for url in failed_scrapes:\n",
695 | " print(url)"
696 | ],
697 | "metadata": {
698 | "id": "5cGuX2YpMgMu"
699 | },
700 | "execution_count": null,
701 | "outputs": []
702 | },
703 | {
704 | "cell_type": "markdown",
705 | "source": [
706 | "## Exporting Extracted Content\n",
707 | "\n",
708 | "After extracting the content, we have several options for exporting and storing the data. In this notebook, we'll demonstrate two export methods:\n",
709 | "\n",
710 | "1. Exporting to Rentry.co, a simple pastebin-like service\n",
711 | "2. Exporting to Google Docs"
712 | ],
713 | "metadata": {
714 | "id": "3vaXC_QG-689"
715 | }
716 | },
717 | {
718 | "cell_type": "code",
719 | "source": [
720 |         "# @title Export to Rentry.co\n",
721 | "\n",
722 | "# Import necessary libraries\n",
723 | "import os\n",
724 | "import requests\n",
725 | "import re\n",
726 | "\n",
727 | "# Function to strip HTML tags\n",
728 | "def strip_html_tags(text):\n",
729 | " return re.sub('<[^<]+?>', '', text)\n",
730 | "\n",
731 | "# Function to create a new Rentry post\n",
732 | "def new_rentry(url, edit_code, text):\n",
733 | " base_url = os.getenv('BASE_URL', 'https://rentry.co')\n",
734 | " api_url = f\"{base_url}/api/new\"\n",
735 | "\n",
736 | " # Get CSRF token\n",
737 | " session = requests.Session()\n",
738 | " response = session.get(base_url)\n",
739 | " csrf_token = session.cookies.get('csrftoken')\n",
740 | "\n",
741 | " # Prepare payload\n",
742 | " payload = {\n",
743 | " 'csrfmiddlewaretoken': csrf_token,\n",
744 | " 'url': url,\n",
745 | " 'edit_code': edit_code,\n",
746 | " 'text': text\n",
747 | " }\n",
748 | "\n",
749 | " headers = {\n",
750 | " \"Referer\": base_url,\n",
751 | " \"X-CSRFToken\": csrf_token\n",
752 | " }\n",
753 | "\n",
754 | " # Make POST request\n",
755 | " response = session.post(api_url, data=payload, headers=headers)\n",
756 | " return response.json()\n",
757 | "\n",
758 | "# Function to export content to Rentry\n",
759 | "def export_to_rentry(content):\n",
760 | " cleaned_content = strip_html_tags(content)\n",
761 | "\n",
762 | " # Check if the content exceeds 200,000 characters\n",
763 | " if len(cleaned_content) > 200000:\n",
764 | " print(\"The content exceeds 200,000 characters. Please try using Google Docs instead due to the character limit.\")\n",
765 | " return None, None\n",
766 | "\n",
767 | " url = '' # Leave empty for random URL\n",
768 | " edit_code = '' # Leave empty for random edit code\n",
769 | " response = new_rentry(url, edit_code, cleaned_content)\n",
770 | " if response['status'] == '200':\n",
771 | " return response['url'], response['edit_code']\n",
772 | " else:\n",
773 | " return None, None\n",
774 | "\n",
775 | "# Main execution\n",
776 | "rentry_url, rentry_edit_code = export_to_rentry(all_xml)\n",
777 | "\n",
778 | "if rentry_url and rentry_edit_code:\n",
779 | " print(f\"Rentry document created successfully!\")\n",
780 | " print(f\"URL: {rentry_url}\")\n",
781 | " print(f\"Edit code: {rentry_edit_code}\")\n",
782 | "elif len(strip_html_tags(all_xml)) > 200000:\n",
783 | " # This message is already printed in the export_to_rentry function, but we'll keep it here for clarity\n",
784 | " print(\"The content exceeds 200,000 characters. Please try using Google Docs instead due to the character limit.\")\n",
785 | "else:\n",
786 | " print(\"Failed to create Rentry document.\")"
787 | ],
788 | "metadata": {
789 | "id": "QM2L5kAe2Q1r",
790 | "cellView": "form"
791 | },
792 | "execution_count": null,
793 | "outputs": []
794 | },
795 | {
796 | "cell_type": "code",
797 | "source": [
798 | "# @title Export to Google Docs\n",
799 | "\n",
800 | "from google.colab import files\n",
801 | "from google.oauth2.credentials import Credentials\n",
802 | "from google_auth_oauthlib.flow import Flow\n",
803 | "from googleapiclient.discovery import build\n",
804 | "import json\n",
805 | "import io\n",
806 | "import getpass\n",
807 | "\n",
808 | "# Function to securely get input\n",
809 | "def secure_input(prompt):\n",
810 | " return getpass.getpass(prompt)\n",
811 | "\n",
812 | "# Upload button for JSON credentials file\n",
813 | "print(\"Please upload your client secret JSON file.\")\n",
814 | "uploaded = files.upload()\n",
815 | "\n",
816 | "# Get the filename of the uploaded file\n",
817 | "filename = list(uploaded.keys())[0]\n",
818 | "\n",
819 | "# Read the contents of the uploaded file\n",
820 | "client_secret_json = io.StringIO(uploaded[filename].decode('utf-8')).read()\n",
821 | "\n",
822 | "# Parse the JSON content\n",
823 | "client_secret_data = json.loads(client_secret_json)\n",
824 | "\n",
825 | "# Create a Flow instance\n",
826 | "flow = Flow.from_client_config(\n",
827 | " client_secret_data,\n",
828 | " scopes=['https://www.googleapis.com/auth/documents'],\n",
829 | " redirect_uri='urn:ietf:wg:oauth:2.0:oob')\n",
830 | "\n",
831 | "# Tell the user to go to the authorization URL.\n",
832 | "auth_url, _ = flow.authorization_url(prompt='consent')\n",
833 | "\n",
834 | "print(\"Please go to this URL to authorize the application:\")\n",
835 | "print(auth_url)\n",
836 | "\n",
837 | "# The user will get an authorization code. This line will wait for the user to input it securely.\n",
838 | "code = secure_input(\"Enter the authorization code: \")\n",
839 | "\n",
840 | "# Exchange the authorization code for credentials.\n",
841 | "flow.fetch_token(code=code)\n",
842 | "\n",
843 | "# Get the credentials\n",
844 | "creds = flow.credentials\n",
845 | "\n",
846 | "# Create a Docs API service object\n",
847 | "service = build('docs', 'v1', credentials=creds)\n",
848 | "\n",
849 | "# Create a new document\n",
850 | "document = service.documents().create(body={'title': 'My New Document'}).execute()\n",
851 | "print(f\"Created document with title: {document.get('title')}\")\n",
852 | "\n",
853 | "# Get the document ID\n",
854 | "document_id = document.get('documentId')\n",
855 | "\n",
856 | "# Prepare the content to be inserted\n",
857 |         "doc_requests = [  # named to avoid shadowing the 'requests' library\n",
858 | " {\n",
859 | " 'insertText': {\n",
860 | " 'location': {\n",
861 | " 'index': 1,\n",
862 | " },\n",
863 | " 'text': all_markdown\n",
864 | " }\n",
865 | " }\n",
866 | "]\n",
867 | "\n",
868 | "# Execute the request to insert the content\n",
869 |         "result = service.documents().batchUpdate(documentId=document_id, body={'requests': doc_requests}).execute()\n",
870 | "\n",
871 | "print(f\"Document content updated. You can find it at: https://docs.google.com/document/d/{document_id}/\")\n",
872 | "\n",
873 | "# Clear sensitive variables\n",
874 | "del client_secret_json, client_secret_data, code, creds"
875 | ],
876 | "metadata": {
877 | "id": "iEOCy5XymyhK",
878 | "cellView": "form"
879 | },
880 | "execution_count": null,
881 | "outputs": []
882 | }
883 | ]
884 | }
--------------------------------------------------------------------------------