├── images └── firecrawl-devdocs-to-llm-cover.png ├── LICENSE ├── .gitignore ├── README.md └── devdocs_to_llm_firecrawl.ipynb /images/firecrawl-devdocs-to-llm-cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexfazio/devdocs-to-llm/main/images/firecrawl-devdocs-to-llm-cover.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Alex Fazio 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 2 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 3 | 4 | # User-specific stuff 5 | .idea/**/workspace.xml 6 | .idea/**/tasks.xml 7 | .idea/**/usage.statistics.xml 8 | .idea/**/dictionaries 9 | .idea/**/shelf 10 | 11 | # AWS User-specific 12 | .idea/**/aws.xml 13 | 14 | # Generated files 15 | .idea/**/contentModel.xml 16 | 17 | # Sensitive or high-churn files 18 | .idea/**/dataSources/ 19 | .idea/**/dataSources.ids 20 | .idea/**/dataSources.local.xml 21 | .idea/**/sqlDataSources.xml 22 | .idea/**/dynamic.xml 23 | .idea/**/uiDesigner.xml 24 | .idea/**/dbnavigator.xml 25 | 26 | # Gradle 27 | .idea/**/gradle.xml 28 | .idea/**/libraries 29 | 30 | # Gradle and Maven with auto-import 31 | # When using Gradle or Maven with auto-import, you should exclude module files, 32 | # since they will be recreated, and may cause churn. Uncomment if using 33 | # auto-import. 
34 | # .idea/artifacts 35 | # .idea/compiler.xml 36 | # .idea/jarRepositories.xml 37 | # .idea/modules.xml 38 | # .idea/*.iml 39 | # .idea/modules 40 | # *.iml 41 | # *.ipr 42 | 43 | # CMake 44 | cmake-build-*/ 45 | 46 | # Mongo Explorer plugin 47 | .idea/**/mongoSettings.xml 48 | 49 | # File-based project format 50 | *.iws 51 | 52 | # IntelliJ 53 | out/ 54 | 55 | # mpeltonen/sbt-idea plugin 56 | .idea_modules/ 57 | 58 | # JIRA plugin 59 | atlassian-ide-plugin.xml 60 | 61 | # Cursive Clojure plugin 62 | .idea/replstate.xml 63 | 64 | # SonarLint plugin 65 | .idea/sonarlint/ 66 | 67 | # Crashlytics plugin (for Android Studio and IntelliJ) 68 | com_crashlytics_export_strings.xml 69 | crashlytics.properties 70 | crashlytics-build.properties 71 | fabric.properties 72 | 73 | # Editor-based Rest Client 74 | .idea/httpRequests 75 | 76 | # Android studio 3.1+ serialized cache file 77 | .idea/caches/build_file_checksums.ser 78 | 79 | # Env Files, excluding examples 80 | *.env 81 | !.env.example -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | OpenAI Cookbook Logo 5 | 6 | 7 | 8 | Turn any developer documentation into a specialized GPT. 9 | 10 | ## Overview 11 | 12 | DevDocs to LLM is a tool that allows you to crawl developer documentation, extract content, and process it into a format suitable for use with large language models (LLMs) like ChatGPT. This enables you to create specialized assistants tailored to specific documentation sets. 13 | 14 | ## Features 15 | 16 | - Web crawling with customizable options 17 | - Content extraction in Markdown format 18 | - Rate limiting to respect server constraints 19 | - Retry mechanism for failed scrapes 20 | - Export options: 21 | - Rentry.co for quick sharing 22 | - Google Docs for larger documents 23 | 24 | ## Usage 25 | 26 | 1. Set up the Firecrawl environment 27 | 2. Crawl a website and generate a sitemap 28 | 3. Extract content from crawled pages 29 | 4. Export the processed content 30 | 31 | ## Requirements 32 | 33 | - Firecrawl API key 34 | - Google Docs API credentials (optional, for Google Docs export) 35 | 36 | ## Installation 37 | 38 | This project is designed to run in a Jupyter notebook environment, particularly Google Colab. No local installation is required. 39 | 40 | ## Configuration 41 | 42 | Before running the notebook, you'll need to set a few parameters: 43 | 44 | - `sub_url`: The URL of the documentation you want to crawl 45 | - `limit`: Maximum number of pages to crawl 46 | - `scrape_option`: Choose to scrape all pages or a specific number 47 | - `num_pages`: Number of pages to scrape if not scraping all 48 | - `pages_per_minute`: Rate limiting parameter 49 | - `wait_time_between_chunks`: Delay between scraping chunks 50 | - `retry_attempts`: Number of retries for failed scrapes 51 | 52 | ## Contributing 53 | 54 | Contributions are welcome! Please feel free to submit a Pull Request. 
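
## Example Configuration

These parameters are set through Colab form fields inside the notebook. As a rough sketch, the equivalent plain-Python assignments look like this (the values mirror the example settings used in the notebook and are illustrative only; adjust them for your target documentation and Firecrawl plan):

```python
# Illustrative values only, mirroring the example settings used in the notebook.
sub_url = "https://docs.cursor.com/"        # root URL of the documentation to crawl
limit = 55                                  # maximum number of pages to crawl
scrape_option = "Specific number of pages"  # or "All pages"
num_pages = 55                              # pages to scrape when not scraping all
pages_per_minute = 9                        # stay within your Firecrawl plan's rate limit
wait_time_between_chunks = 33               # seconds to wait between scraping chunks
retry_attempts = 3                          # retries for failed scrapes

assert sub_url, "sub_url should not be an empty string"
```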
55 | 
56 | ## License
57 | 
58 | [MIT](https://opensource.org/licenses/MIT)
59 | 
60 | Copyright (c) 2024-present, Alex Fazio
61 | 
62 | ---
63 | 
64 | [![Watch the video](https://i.imgur.com/VKRoApP.png)](https://x.com/alxfazio/status/1826731977283641615)
65 | 
--------------------------------------------------------------------------------
/devdocs_to_llm_firecrawl.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "private_outputs": true,
7 | "provenance": [],
8 | "authorship_tag": "ABX9TyMK76muNMFRt0JTXy2fMPCy",
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "language_info": {
16 | "name": "python"
17 | }
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "view-in-github",
24 | "colab_type": "text"
25 | },
26 | "source": [
27 | "Open In Colab"
28 | ]
29 | },
30 | {
31 | "cell_type": "markdown",
32 | "source": [
33 | "# DevDocs to LLM: turn any documentation into a GPT\n",
34 | "\n",
35 | "By Alex Fazio (https://twitter.com/alxfazio)\n",
36 | "\n",
37 | "GitHub repo: https://github.com/alexfazio/devdocs-to-llm\n",
38 | "\n",
39 | "This Jupyter notebook demonstrates how to use the Firecrawl API to crawl developer documentation, extract content, and process the information so it is ready to use in an assistant like ChatGPT.\n",
40 | "\n",
41 | "By the end of this notebook, you'll be able to:\n",
42 | "\n",
43 | "1. Set up the Firecrawl environment\n",
44 | "2. Crawl a website and generate a sitemap\n",
45 | "3. Extract content from crawled pages in Markdown\n",
46 | "4. Export the processed content to various platforms, including Rentry.co and Google Docs!\n",
47 | "\n",
48 | "This cookbook is designed for developers and data scientists who want to efficiently gather and analyze developer documentation at scale."
49 | ],
50 | "metadata": {
51 | "id": "yEkq-MhCH5bw"
52 | }
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "source": [
57 | "## Requirements\n",
58 | "Before proceeding, ensure you have the following:\n",
59 | "\n",
60 | "- **Firecrawl API key**: Essential for accessing the Firecrawl service\n",
61 | "- **Google Docs API credentials** (Optional): A JSON file named `client_secret_<...>.apps.googleusercontent.com.json` for Google Docs integration.\n",
62 | "\n",
63 | "Note: The Google Docs API credential is only required if you plan to export content to Google Docs. All other functionality can be used without this optional component."
64 | ],
65 | "metadata": {
66 | "id": "puKp3vCs8Oex"
67 | }
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "source": [
72 | "## Tested Documentation Sources\n",
73 | "\n",
74 | "| Status | Documentation Source | URL |\n",
75 | "|--------|----------------------|-----|\n",
76 | "| ✅ | CrewAI | https://docs.crewai.com/ |\n",
77 | "| ✅ | Brave Search API | https://api.search.brave.com/app/documentation/ |\n",
78 | "| ✅ | OpenAI | https://platform.openai.com/docs |\n",
79 | "| ✅ | FireCrawl | https://docs.firecrawl.dev/ |\n",
80 | "| ✅ | Anthropic | https://docs.anthropic.com/en/docs/ |\n",
81 | "| ✅ | LangChain | https://python.langchain.com/v0.2/docs |\n",
82 | "\n",
83 | "Note: A checkmark (✅) indicates successful testing with the corresponding documentation source."
84 | ],
85 | "metadata": {
86 | "id": "8othA_x4AvGK"
87 | }
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "source": [
92 | "## Setup\n",
93 | "\n",
94 | "First, let's set up our environment with the necessary imports and initializations.\n",
95 | "\n",
96 | "This notebook requires the following libraries:\n",
97 | "\n",
98 | "- `firecrawl-py`: For web crawling and content extraction\n",
99 | "- `requests`: For making HTTP requests\n",
100 | "- `beautifulsoup4`: For parsing HTML content"
101 | ],
102 | "metadata": {
103 | "id": "Uag3A-M2Hs1L"
104 | }
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {
110 | "id": "vAt71wSHVqgz"
111 | },
112 | "outputs": [],
113 | "source": [
114 | "%pip install firecrawl-py requests beautifulsoup4 --quiet\n",
115 | "print(\"---\")\n",
116 | "%pip show requests beautifulsoup4"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "source": [
122 | "from firecrawl import FirecrawlApp"
123 | ],
124 | "metadata": {
125 | "id": "_lKdJlD2iFAI"
126 | },
127 | "execution_count": null,
128 | "outputs": []
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "source": [
133 | "Next, insert your Firecrawl API key (`fc-...`)."
134 | ],
135 | "metadata": {
136 | "id": "etLyo0ot3_ct"
137 | }
138 | },
139 | {
140 | "cell_type": "code",
141 | "source": [
142 | "from getpass import getpass\n",
143 | "fc_api_key = getpass(\"Enter your Firecrawl API key: \")\n",
144 | "assert fc_api_key != \"\", \"Error: fc_api_key should not be an empty string\""
145 | ],
146 | "metadata": {
147 | "id": "xnKnooAbTbHt"
148 | },
149 | "execution_count": null,
150 | "outputs": []
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "source": [
155 | "## Crawling\n",
156 | "\n",
157 | "Now let's crawl some DevDocs pages to use in our examples.\n",
158 | "\n",
159 | "Use the URL of the main documentation overview page as the starting point, **rather than a specific section or page within the documentation**.\n",
160 | "\n",
161 | "This allows the crawl to start from the main documentation overview page and reach specific topics as needed."
162 | ],
163 | "metadata": {
164 | "id": "dd0NwYwIAKbm"
165 | }
166 | },
167 | {
168 | "cell_type": "code",
169 | "source": [
170 | "sub_url = \"https://docs.cursor.com/\" # @param {type:\"string\"}\n",
171 | "assert sub_url != \"\", \"Error: sub_url should not be an empty string\""
172 | ],
173 | "metadata": {
174 | "id": "kn8ar3iZgd-Y",
175 | "cellView": "form"
176 | },
177 | "execution_count": null,
178 | "outputs": []
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "source": [
183 | "### Attempt Site Map Display and Page Count Preview (Optional)\n",
184 | "\n",
185 | "The following cell will display the site map and preview the page count, providing only a general idea of the structure and number of pages. This process **will not** use the Firecrawl API or consume any usage tokens. Please be aware that this preview is not as accurate as crawling the entire website with Firecrawl."
186 | ], 187 | "metadata": { 188 | "id": "JTMP1UbTSQXs" 189 | } 190 | }, 191 | { 192 | "cell_type": "code", 193 | "source": [ 194 | "import requests\n", 195 | "from bs4 import BeautifulSoup\n", 196 | "from urllib.parse import urljoin\n", 197 | "\n", 198 | "def get_site_map(url, base_url=None, depth=0, max_depth=3, visited=None, sitemap=None):\n", 199 | " if visited is None:\n", 200 | " visited = set()\n", 201 | " if sitemap is None:\n", 202 | " sitemap = []\n", 203 | " if base_url is None:\n", 204 | " base_url = url\n", 205 | " if depth > max_depth or url in visited:\n", 206 | " return\n", 207 | "\n", 208 | " visited.add(url)\n", 209 | " sitemap.append(url)\n", 210 | "\n", 211 | " try:\n", 212 | " response = requests.get(url)\n", 213 | " soup = BeautifulSoup(response.text, 'html.parser')\n", 214 | " print('| ' * depth + '+-- ' + url)\n", 215 | "\n", 216 | " for link in soup.find_all('a', href=True):\n", 217 | " href = link['href']\n", 218 | " full_url = urljoin(url, href)\n", 219 | " # Check if the full_url starts with the base_url\n", 220 | " if full_url.startswith(base_url):\n", 221 | " get_site_map(full_url, base_url, depth + 1, max_depth, visited, sitemap)\n", 222 | " except Exception as e:\n", 223 | " print(f\"Error processing {url}: {str(e)}\")\n", 224 | "\n", 225 | " return sitemap\n", 226 | "\n", 227 | "def crawl_sub_url(sub_url, max_depth=3):\n", 228 | " # Ensure the sub_url ends with a '/'\n", 229 | " if not sub_url.endswith('/'):\n", 230 | " sub_url += '/'\n", 231 | "\n", 232 | " base_url = sub_url\n", 233 | " print(f\"[REQUESTS CRAWL] Sitemap for {base_url}:\")\n", 234 | " sitemap = get_site_map(base_url, base_url=base_url, max_depth=max_depth)\n", 235 | " print(f\"\\n[REQUESTS CRAWL] Total pages crawled: {len(sitemap)}\")\n", 236 | " return sitemap\n", 237 | "\n", 238 | "# Example usage\n", 239 | "preview_sitemap = crawl_sub_url(sub_url)\n", 240 | "print(\"\\nSite map:\")\n", 241 | "print(preview_sitemap)\n", 242 | "\n", 243 | "# Store the preview_sitemap in a global variable\n", 244 | "sitemap = preview_sitemap" 245 | ], 246 | "metadata": { 247 | "id": "qyNFRv_WSAa9" 248 | }, 249 | "execution_count": null, 250 | "outputs": [] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "source": [ 255 | "Next, let's initialize `FirecrawlApp()` a Python object that allows you to interact with the Firecrawl API. It essentially sets up a connection so you can use its methods (like crawl_url or scrape_url) to send requests to Firecrawl and get website data back." 256 | ], 257 | "metadata": { 258 | "id": "U-BkvAvMLUzL" 259 | } 260 | }, 261 | { 262 | "cell_type": "code", 263 | "source": [ 264 | " app = FirecrawlApp(api_key=fc_api_key)" 265 | ], 266 | "metadata": { 267 | "id": "0FyyXaUPT6HB" 268 | }, 269 | "execution_count": null, 270 | "outputs": [] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "source": [ 275 | "To manage costs and control crawl scope, specify a maximum number of pages to crawl using the limit parameter below." 
276 | ], 277 | "metadata": { 278 | "id": "S-NuxKlLL0qJ" 279 | } 280 | }, 281 | { 282 | "cell_type": "code", 283 | "source": [ 284 | "#@markdown Limit the crawl to a maximum of `limit` pages\n", 285 | "limit = 55 #@param {type:\"number\"}" 286 | ], 287 | "metadata": { 288 | "id": "koUvHtRhVdwi" 289 | }, 290 | "execution_count": null, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "source": [ 296 | "# Crawl Launch" 297 | ], 298 | "metadata": { 299 | "id": "lTFC4NkOqxUd" 300 | } 301 | }, 302 | { 303 | "cell_type": "code", 304 | "source": [ 305 | "import requests\n", 306 | "from bs4 import BeautifulSoup\n", 307 | "from urllib.parse import urljoin, urlparse\n", 308 | "import random\n", 309 | "from firecrawl import FirecrawlApp\n", 310 | "import json\n", 311 | "\n", 312 | "def merged_crawl(start_url, limit, fc_api_key=fc_api_key):\n", 313 | " def standard_crawl(start_url):\n", 314 | " visited = set()\n", 315 | " collected_urls = []\n", 316 | "\n", 317 | " def normalize_url(url):\n", 318 | " parsed_url = urlparse(url)\n", 319 | " normalized_url = parsed_url._replace(fragment=\"\", query=\"\").geturl()\n", 320 | " return normalized_url\n", 321 | "\n", 322 | " def is_valid_subpage(url):\n", 323 | " return url.startswith(start_url) and not url.startswith(start_url + '#')\n", 324 | "\n", 325 | " def crawl(url):\n", 326 | " if len(collected_urls) >= limit:\n", 327 | " return\n", 328 | "\n", 329 | " normalized_url = normalize_url(url)\n", 330 | " if (normalized_url in visited) or (not is_valid_subpage(normalized_url)):\n", 331 | " return\n", 332 | " visited.add(normalized_url)\n", 333 | " try:\n", 334 | " response = requests.get(normalized_url)\n", 335 | " if response.status_code != 200:\n", 336 | " return\n", 337 | " except requests.exceptions.RequestException:\n", 338 | " return\n", 339 | "\n", 340 | " collected_urls.append(normalized_url)\n", 341 | " print(f\"[REQUESTS CRAWLER] {len(collected_urls)}/{limit}: {normalized_url}\")\n", 342 | "\n", 343 | " if len(collected_urls) >= limit:\n", 344 | " return\n", 345 | "\n", 346 | " soup = BeautifulSoup(response.content, \"html.parser\")\n", 347 | " for link in soup.find_all(\"a\", href=True):\n", 348 | " if len(collected_urls) >= limit:\n", 349 | " return\n", 350 | " href = link['href']\n", 351 | " full_url = urljoin(url, href)\n", 352 | " normalized_full_url = normalize_url(full_url)\n", 353 | " if is_valid_subpage(normalized_full_url):\n", 354 | " crawl(normalized_full_url)\n", 355 | "\n", 356 | " crawl(start_url)\n", 357 | " return collected_urls\n", 358 | "\n", 359 | " def firecrawl_method(start_url, limit):\n", 360 | " app = FirecrawlApp(api_key=fc_api_key)\n", 361 | " crawl_result = app.crawl_url(\n", 362 | " start_url,\n", 363 | " {\n", 364 | " 'crawlerOptions': {\n", 365 | " 'includePaths': ['/docs/', '/documentation/'],\n", 366 | " 'limit': limit,\n", 367 | " }\n", 368 | " }\n", 369 | " )\n", 370 | "\n", 371 | " urls_to_scrape = []\n", 372 | " for page_data in crawl_result:\n", 373 | " metadata = page_data.get('metadata', {})\n", 374 | " source_url = metadata.get('sourceURL')\n", 375 | " if source_url:\n", 376 | " urls_to_scrape.append(source_url)\n", 377 | " links_on_page = metadata.get('linksOnPage')\n", 378 | " if links_on_page:\n", 379 | " urls_to_scrape.extend(links_on_page)\n", 380 | "\n", 381 | " return urls_to_scrape\n", 382 | "\n", 383 | " try:\n", 384 | " print(\"[CRAWLER] Attempting standard crawl...\")\n", 385 | " result = standard_crawl(start_url)\n", 386 | " if not result:\n", 387 | " raise 
Exception(\"[CRAWLER] Standard crawl returned no results\")\n", 388 | " return result\n", 389 | " except Exception as e:\n", 390 | " print(f\"[CRAWLER] Standard crawl failed: {str(e)}\")\n", 391 | " print(\"[CRAWLER] Falling back to firecrawl method...\")\n", 392 | " return firecrawl_method(start_url, limit)\n", 393 | "\n", 394 | "# Usage\n", 395 | "sitemap = merged_crawl(sub_url, limit)\n", 396 | "print(\"---\")\n", 397 | "print(f\"[CRAWLER] Crawled URLs (sitemap): {sitemap}\")\n", 398 | "print(f\"[CRAWLER] Number of pages crawled: {len(sitemap)}\")" 399 | ], 400 | "metadata": { 401 | "id": "QtM_CYOM1q2V" 402 | }, 403 | "execution_count": null, 404 | "outputs": [] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "source": [ 409 | "## Scraping\n", 410 | "\n", 411 | "With our sitemap in hand, we can now proceed to extract content from each page. Firecrawl's content extraction capabilities allow us to efficiently parse web pages and retrieve the main content as markdown, filtering out navigation menus, advertisements, and other non-essential elements." 412 | ], 413 | "metadata": { 414 | "id": "tygeo3dr0xi0" 415 | } 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "source": [ 420 | "Before we begin the extraction process, let's set some parameters:\n", 421 | "\n", 422 | "- `scrape_option`: Choose whether to scrape all pages or a specific number of pages.\n", 423 | "- `num_pages`: If scraping a specific number of pages, set the desired number here.\n", 424 | "\n", 425 | "Please set these parameters in the cell below." 426 | ], 427 | "metadata": { 428 | "id": "ce2N8HasNylT" 429 | } 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "source": [ 434 | "# Scraping Options" 435 | ], 436 | "metadata": { 437 | "id": "HZau_9Hsq1Jg" 438 | } 439 | }, 440 | { 441 | "cell_type": "code", 442 | "source": [ 443 | "# Create a dropdown for scrape options\n", 444 | "scrape_option = \"Specific number of pages\" # @param [\"All pages\", \"Specific number of pages\"]\n", 445 | "\n", 446 | "# Create a numerical input for the specific number of pages\n", 447 | "num_pages = 55 # @param {type:\"number\"}\n", 448 | "\n", 449 | "# Initialize the num_pages variable depending on the scrape_option\n", 450 | "if scrape_option == \"Specific number of pages\":\n", 451 | " # Check if the number of pages exceeds the length of the sitemap\n", 452 | " num_pages = min(num_pages, len(sitemap)) # Set num_pages to the smaller of the two values\n", 453 | "else:\n", 454 | " # If \"All pages\" is selected, set num_pages to the total length of the sitemap\n", 455 | " num_pages = len(sitemap)\n", 456 | "\n", 457 | "# Now you can proceed with your scraping logic using num_pages\n", 458 | "print(f\"[SCRAPER] Number of pages to scrape: {num_pages}\")" 459 | ], 460 | "metadata": { 461 | "id": "CdIFd1fuA00_" 462 | }, 463 | "execution_count": null, 464 | "outputs": [] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "source": [ 469 | "Now, let's execute the content extraction process. Our script will:\n", 470 | "\n", 471 | "1. Iterate through the URLs in our sitemap\n", 472 | "2. Use Firecrawl's API to extract the main content from each page\n", 473 | "3. Store the extracted content in both XML and Markdown formats\n", 474 | "\n", 475 | "XML helps in structuring large documents before feeding them to an LLM for RAG or direct query." 
476 | ], 477 | "metadata": { 478 | "id": "M7M9lAYzN9-k" 479 | } 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "source": [ 484 | "Here is the edited table with only the scraping feature:\n", 485 | "\n", 486 | "| **Plan** | **Max Pages Scraped/Minute** |\n", 487 | "|--------------|------------------------------|\n", 488 | "| Free | 5 |\n", 489 | "| Hobby | 10 |\n", 490 | "| Standard | 50 |\n", 491 | "| Growth | 500 |\n", 492 | "\n", 493 | "Set the times below, ensuring that you stay within the appropriate rate limits for your FireCrawl usage tier and avoid triggering any scraping restrictions.\n", 494 | "\n", 495 | "Check [FireCrawl docs](https://docs.firecrawl.dev/rate-limits) for further guidance on rate limits." 496 | ], 497 | "metadata": { 498 | "id": "4bvLnXVgfSQN" 499 | } 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "source": [ 504 | "# Rate Limiting and Retry Parameters\n" 505 | ], 506 | "metadata": { 507 | "id": "2fqRWptDfhPc" 508 | } 509 | }, 510 | { 511 | "cell_type": "code", 512 | "source": [ 513 | "# @markdown Set the rate limiting and retry parameters for the web scraper:\n", 514 | "\n", 515 | "pages_per_minute = 9 # @param {type:\"integer\"}\n", 516 | "# @markdown Number of pages that can be scraped per minute\n", 517 | "\n", 518 | "wait_time_between_chunks = 33 # @param {type:\"integer\"}\n", 519 | "# @markdown Waiting time (in seconds) between max chunks\n", 520 | "\n", 521 | "retry_attempts = 3 # @param {type:\"integer\"}\n", 522 | "# @markdown Number of times to retry failed scrapes\n", 523 | "\n", 524 | "print(f\"Pages per minute: {pages_per_minute}\")\n", 525 | "print(f\"Wait time between chunks: {wait_time_between_chunks} seconds\")\n", 526 | "print(f\"Number of retry attempts: {retry_attempts}\")\n", 527 | "\n", 528 | "# You can use these variables in your main scraping code" 529 | ], 530 | "metadata": { 531 | "id": "5pxAjeDma8c-" 532 | }, 533 | "execution_count": null, 534 | "outputs": [] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "source": [ 539 | "## Check the status of the `sitemap` variable for debugging\n", 540 | "\n" 541 | ], 542 | "metadata": { 543 | "id": "hKKezwOKpQLq" 544 | } 545 | }, 546 | { 547 | "cell_type": "code", 548 | "source": [ 549 | "print(sitemap)" 550 | ], 551 | "metadata": { 552 | "id": "-0f_2R1DpPsV" 553 | }, 554 | "execution_count": null, 555 | "outputs": [] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "source": [ 560 | "# Firecrawl Scrape Launch" 561 | ], 562 | "metadata": { 563 | "id": "2z23mapxqitK" 564 | } 565 | }, 566 | { 567 | "cell_type": "code", 568 | "source": [ 569 | "import time\n", 570 | "import math\n", 571 | "import requests\n", 572 | "\n", 573 | "# Initialize a file to store the XML content\n", 574 | "output_file = 'scraped_content.xml'\n", 575 | "\n", 576 | "# Initialize strings to store all the XML and markdown content\n", 577 | "all_xml = \"\\n\"\n", 578 | "all_markdown = \"\"\n", 579 | "\n", 580 | "# Initialize a list to store failed scrape URLs\n", 581 | "failed_scrapes = []\n", 582 | "\n", 583 | "# Determine the number of pages to scrape\n", 584 | "pages_to_scrape = len(sitemap) if scrape_option == \"All pages\" else min(num_pages, len(sitemap))\n", 585 | "\n", 586 | "# Calculate the chunk size and total number of chunks\n", 587 | "chunk_size = pages_per_minute\n", 588 | "total_chunks = math.ceil(pages_to_scrape / chunk_size)\n", 589 | "\n", 590 | "def scrape_url(url, attempt=1):\n", 591 | " print(f\"[FIRECRAWL SCRAPER] Attempting to scrape URL: {url} (Attempt {attempt})\")\n", 592 | " 
start_time = time.time()\n", 593 | " try:\n", 594 | " response = app.scrape_url(url=url, params={\n", 595 | " 'onlyMainContent': True,\n", 596 | " 'waitFor': 5000,\n", 597 | " })\n", 598 | " markdown_content = response.get('markdown', '')\n", 599 | " end_time = time.time()\n", 600 | " scrape_time = end_time - start_time\n", 601 | " print(f\"[FIRECRAWL SCRAPER] Successfully scraped URL: {url}\")\n", 602 | " print(f\"[FIRECRAWL SCRAPER] Scrape time: {scrape_time:.2f} seconds\")\n", 603 | " print(f\"[FIRECRAWL SCRAPER] Content length: {len(markdown_content)} characters\")\n", 604 | " return markdown_content\n", 605 | " except requests.exceptions.HTTPError as e:\n", 606 | " end_time = time.time()\n", 607 | " scrape_time = end_time - start_time\n", 608 | " print(f\"[FIRECRAWL SCRAPER] Error scraping {url}: {str(e)}\")\n", 609 | " print(f\"[FIRECRAWL SCRAPER] Scrape time (failed): {scrape_time:.2f} seconds\")\n", 610 | " return None\n", 611 | "\n", 612 | "def process_scraped_content(url, markdown_content):\n", 613 | " global all_xml, all_markdown\n", 614 | " # Create XML structure for this page without indentation\n", 615 | " page_xml = f\"\\n\\n{markdown_content}\\n\\n\\n\"\n", 616 | "\n", 617 | " # Add the markdown content with a proper separator\n", 618 | " if all_markdown: # If it's not the first entry, add a separator\n", 619 | " all_markdown += \"\\n\\n***\\n\\n\"\n", 620 | " all_markdown += f\"# {url}\\n\\n{markdown_content}\"\n", 621 | "\n", 622 | " # Append the page XML to the XML string\n", 623 | " all_xml += page_xml\n", 624 | " print(f\"[FIRECRAWL SCRAPER] Processed content for URL: {url}\")\n", 625 | " print(f\"[FIRECRAWL SCRAPER] XML content length: {len(page_xml)} characters\")\n", 626 | " return page_xml\n", 627 | "\n", 628 | "# Open the file in write mode\n", 629 | "with open(output_file, 'w') as file:\n", 630 | " # Write the opening XML tag\n", 631 | " file.write(\"\\n\")\n", 632 | "\n", 633 | " for chunk in range(total_chunks):\n", 634 | " chunk_start = chunk * chunk_size\n", 635 | " chunk_end = min((chunk + 1) * chunk_size, pages_to_scrape)\n", 636 | " print(f\"[FIRECRAWL SCRAPER] Processing chunk {chunk+1}/{total_chunks} (URLs {chunk_start+1}-{chunk_end})\")\n", 637 | "\n", 638 | " for i in range(chunk_start, chunk_end):\n", 639 | " url = sitemap[i]\n", 640 | " print(f\"[FIRECRAWL SCRAPER] Processing URL {i+1}/{pages_to_scrape}: {url}\")\n", 641 | " markdown_content = scrape_url(url)\n", 642 | "\n", 643 | " if markdown_content is not None:\n", 644 | " page_xml = process_scraped_content(url, markdown_content)\n", 645 | " # Write the page XML to the file\n", 646 | " file.write(page_xml)\n", 647 | " print(f\"[FIRECRAWL SCRAPER] Successfully wrote content for URL: {url}\")\n", 648 | " else:\n", 649 | " failed_scrapes.append(url)\n", 650 | " print(f\"[FIRECRAWL SCRAPER] Failed to scrape URL: {url}\")\n", 651 | "\n", 652 | " # Wait after each chunk, except for the last one\n", 653 | " if chunk < total_chunks - 1:\n", 654 | " print(f\"[FIRECRAWL SCRAPER] Chunk {chunk+1} completed. 
Waiting for {wait_time_between_chunks} seconds before the next chunk...\")\n", 655 | " time.sleep(wait_time_between_chunks)\n", 656 | "\n", 657 | " # Retry failed scrapes\n", 658 | " for attempt in range(retry_attempts):\n", 659 | " if not failed_scrapes:\n", 660 | " break\n", 661 | " print(f\"[FIRECRAWL SCRAPER] Retry attempt {attempt + 1} of {retry_attempts} for {len(failed_scrapes)} failed scrapes...\")\n", 662 | " retry_urls = failed_scrapes.copy()\n", 663 | " failed_scrapes = []\n", 664 | " for url in retry_urls:\n", 665 | " print(f\"[FIRECRAWL SCRAPER] Retrying URL: {url}\")\n", 666 | " markdown_content = scrape_url(url, attempt=attempt+2)\n", 667 | " if markdown_content is not None:\n", 668 | " page_xml = process_scraped_content(url, markdown_content)\n", 669 | " # Write the page XML to the file\n", 670 | " file.write(page_xml)\n", 671 | " print(f\"[FIRECRAWL SCRAPER] Successfully scraped and wrote content for retried URL: {url}\")\n", 672 | " else:\n", 673 | " failed_scrapes.append(url)\n", 674 | " print(f\"[FIRECRAWL SCRAPER] Failed to scrape URL on retry: {url}\")\n", 675 | "\n", 676 | " if failed_scrapes:\n", 677 | " print(f\"[FIRECRAWL SCRAPER] Retry attempt {attempt + 1} completed. Waiting for {wait_time_between_chunks} seconds before the next retry attempt...\")\n", 678 | " time.sleep(wait_time_between_chunks)\n", 679 | "\n", 680 | " # Write the closing XML tag\n", 681 | " file.write(\"\")\n", 682 | " print(\"[FIRECRAWL SCRAPER] Finished writing to XML file\")\n", 683 | "\n", 684 | "# Add the closing tag to the XML string variable\n", 685 | "all_xml += \"\"\n", 686 | "\n", 687 | "# Now you can use the 'all_xml' and 'all_markdown' string variables as needed\n", 688 | "print(f\"[FIRECRAWL SCRAPER] Total characters in all_xml: {len(all_xml)}\")\n", 689 | "print(f\"[FIRECRAWL SCRAPER] Total characters in all_markdown: {len(all_markdown)}\")\n", 690 | "print(f\"[FIRECRAWL SCRAPER] Number of pages scraped: {pages_to_scrape}\")\n", 691 | "print(f\"[FIRECRAWL SCRAPER] Number of pages that failed to scrape after all retries: {len(failed_scrapes)}\")\n", 692 | "if failed_scrapes:\n", 693 | " print(\"[FIRECRAWL SCRAPER] Failed URLs:\")\n", 694 | " for url in failed_scrapes:\n", 695 | " print(url)" 696 | ], 697 | "metadata": { 698 | "id": "5cGuX2YpMgMu" 699 | }, 700 | "execution_count": null, 701 | "outputs": [] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "source": [ 706 | "## Exporting Extracted Content\n", 707 | "\n", 708 | "After extracting the content, we have several options for exporting and storing the data. In this notebook, we'll demonstrate two export methods:\n", 709 | "\n", 710 | "1. Exporting to Rentry.co, a simple pastebin-like service\n", 711 | "2. 
Exporting to Google Docs" 712 | ], 713 | "metadata": { 714 | "id": "3vaXC_QG-689" 715 | } 716 | }, 717 | { 718 | "cell_type": "code", 719 | "source": [ 720 | "# @title Export to Rentry.com\n", 721 | "\n", 722 | "# Import necessary libraries\n", 723 | "import os\n", 724 | "import requests\n", 725 | "import re\n", 726 | "\n", 727 | "# Function to strip HTML tags\n", 728 | "def strip_html_tags(text):\n", 729 | " return re.sub('<[^<]+?>', '', text)\n", 730 | "\n", 731 | "# Function to create a new Rentry post\n", 732 | "def new_rentry(url, edit_code, text):\n", 733 | " base_url = os.getenv('BASE_URL', 'https://rentry.co')\n", 734 | " api_url = f\"{base_url}/api/new\"\n", 735 | "\n", 736 | " # Get CSRF token\n", 737 | " session = requests.Session()\n", 738 | " response = session.get(base_url)\n", 739 | " csrf_token = session.cookies.get('csrftoken')\n", 740 | "\n", 741 | " # Prepare payload\n", 742 | " payload = {\n", 743 | " 'csrfmiddlewaretoken': csrf_token,\n", 744 | " 'url': url,\n", 745 | " 'edit_code': edit_code,\n", 746 | " 'text': text\n", 747 | " }\n", 748 | "\n", 749 | " headers = {\n", 750 | " \"Referer\": base_url,\n", 751 | " \"X-CSRFToken\": csrf_token\n", 752 | " }\n", 753 | "\n", 754 | " # Make POST request\n", 755 | " response = session.post(api_url, data=payload, headers=headers)\n", 756 | " return response.json()\n", 757 | "\n", 758 | "# Function to export content to Rentry\n", 759 | "def export_to_rentry(content):\n", 760 | " cleaned_content = strip_html_tags(content)\n", 761 | "\n", 762 | " # Check if the content exceeds 200,000 characters\n", 763 | " if len(cleaned_content) > 200000:\n", 764 | " print(\"The content exceeds 200,000 characters. Please try using Google Docs instead due to the character limit.\")\n", 765 | " return None, None\n", 766 | "\n", 767 | " url = '' # Leave empty for random URL\n", 768 | " edit_code = '' # Leave empty for random edit code\n", 769 | " response = new_rentry(url, edit_code, cleaned_content)\n", 770 | " if response['status'] == '200':\n", 771 | " return response['url'], response['edit_code']\n", 772 | " else:\n", 773 | " return None, None\n", 774 | "\n", 775 | "# Main execution\n", 776 | "rentry_url, rentry_edit_code = export_to_rentry(all_xml)\n", 777 | "\n", 778 | "if rentry_url and rentry_edit_code:\n", 779 | " print(f\"Rentry document created successfully!\")\n", 780 | " print(f\"URL: {rentry_url}\")\n", 781 | " print(f\"Edit code: {rentry_edit_code}\")\n", 782 | "elif len(strip_html_tags(all_xml)) > 200000:\n", 783 | " # This message is already printed in the export_to_rentry function, but we'll keep it here for clarity\n", 784 | " print(\"The content exceeds 200,000 characters. 
Please try using Google Docs instead due to the character limit.\")\n", 785 | "else:\n", 786 | " print(\"Failed to create Rentry document.\")" 787 | ], 788 | "metadata": { 789 | "id": "QM2L5kAe2Q1r", 790 | "cellView": "form" 791 | }, 792 | "execution_count": null, 793 | "outputs": [] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "source": [ 798 | "# @title Export to Google Docs\n", 799 | "\n", 800 | "from google.colab import files\n", 801 | "from google.oauth2.credentials import Credentials\n", 802 | "from google_auth_oauthlib.flow import Flow\n", 803 | "from googleapiclient.discovery import build\n", 804 | "import json\n", 805 | "import io\n", 806 | "import getpass\n", 807 | "\n", 808 | "# Function to securely get input\n", 809 | "def secure_input(prompt):\n", 810 | " return getpass.getpass(prompt)\n", 811 | "\n", 812 | "# Upload button for JSON credentials file\n", 813 | "print(\"Please upload your client secret JSON file.\")\n", 814 | "uploaded = files.upload()\n", 815 | "\n", 816 | "# Get the filename of the uploaded file\n", 817 | "filename = list(uploaded.keys())[0]\n", 818 | "\n", 819 | "# Read the contents of the uploaded file\n", 820 | "client_secret_json = io.StringIO(uploaded[filename].decode('utf-8')).read()\n", 821 | "\n", 822 | "# Parse the JSON content\n", 823 | "client_secret_data = json.loads(client_secret_json)\n", 824 | "\n", 825 | "# Create a Flow instance\n", 826 | "flow = Flow.from_client_config(\n", 827 | " client_secret_data,\n", 828 | " scopes=['https://www.googleapis.com/auth/documents'],\n", 829 | " redirect_uri='urn:ietf:wg:oauth:2.0:oob')\n", 830 | "\n", 831 | "# Tell the user to go to the authorization URL.\n", 832 | "auth_url, _ = flow.authorization_url(prompt='consent')\n", 833 | "\n", 834 | "print(\"Please go to this URL to authorize the application:\")\n", 835 | "print(auth_url)\n", 836 | "\n", 837 | "# The user will get an authorization code. This line will wait for the user to input it securely.\n", 838 | "code = secure_input(\"Enter the authorization code: \")\n", 839 | "\n", 840 | "# Exchange the authorization code for credentials.\n", 841 | "flow.fetch_token(code=code)\n", 842 | "\n", 843 | "# Get the credentials\n", 844 | "creds = flow.credentials\n", 845 | "\n", 846 | "# Create a Docs API service object\n", 847 | "service = build('docs', 'v1', credentials=creds)\n", 848 | "\n", 849 | "# Create a new document\n", 850 | "document = service.documents().create(body={'title': 'My New Document'}).execute()\n", 851 | "print(f\"Created document with title: {document.get('title')}\")\n", 852 | "\n", 853 | "# Get the document ID\n", 854 | "document_id = document.get('documentId')\n", 855 | "\n", 856 | "# Prepare the content to be inserted\n", 857 | "requests = [\n", 858 | " {\n", 859 | " 'insertText': {\n", 860 | " 'location': {\n", 861 | " 'index': 1,\n", 862 | " },\n", 863 | " 'text': all_markdown\n", 864 | " }\n", 865 | " }\n", 866 | "]\n", 867 | "\n", 868 | "# Execute the request to insert the content\n", 869 | "result = service.documents().batchUpdate(documentId=document_id, body={'requests': requests}).execute()\n", 870 | "\n", 871 | "print(f\"Document content updated. 
You can find it at: https://docs.google.com/document/d/{document_id}/\")\n", 872 | "\n", 873 | "# Clear sensitive variables\n", 874 | "del client_secret_json, client_secret_data, code, creds" 875 | ], 876 | "metadata": { 877 | "id": "iEOCy5XymyhK", 878 | "cellView": "form" 879 | }, 880 | "execution_count": null, 881 | "outputs": [] 882 | } 883 | ] 884 | } --------------------------------------------------------------------------------