├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── cfg.py ├── embed_docs.py ├── requirements.txt └── rtd_scraper ├── __init__.py ├── scrape_rtd.py ├── scrapy.cfg └── tutorial ├── __init__.py ├── middlewares.py ├── settings.py └── spiders ├── __init__.py └── docs_spider.py /.gitignore: -------------------------------------------------------------------------------- 1 | outputs/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Jeremy Pinto 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: RAGTheDocs 3 | emoji: 👀 4 | colorFrom: gray 5 | colorTo: yellow 6 | sdk: gradio 7 | sdk_version: 3.50.2 8 | app_file: app.py 9 | pinned: false 10 | license: mit 11 | --- 12 | 13 | # RAGtheDocs 14 | 15 | ## Introduction 📚 16 | 17 | RAGTheDocs is an open-source library that allows you to **one-click deploy** retrieval augmented generation (RAG) on any readthedocs documentation on [huggingface 🤗 spaces](https://huggingface.co/spaces/jerpint/RAGTheDocs)! 18 | 19 | ## Usage 👉 20 | 21 | 1) Go to the [example space](https://huggingface.co/spaces/jerpint/RAGTheDocs) 22 | 2) Duplicate the space: 23 | 24 | ![image](https://github.com/jerpint/buster/assets/18450628/0c89038c-c3af-4c1f-9d3b-9b4d83db4910) 25 | 26 | 3) Set your environment variables: 27 | * `OPENAI_API_KEY` (required): Needed for the app to work, e.g. `sk-...` 28 | * `READTHEDOCS_URL` (required): The url of the website you are interested in scraping (must be built with 29 | sphinx/readthedocs). e.g. `https://orion.readthedocs.io` 30 | * `READTHEDOCS_VERSION` (optional): This is important if there exist multiple versions of the docs (e.g. `en/v0.2.7` or `en/latest`). If left empty, it will scrape all available versions (there can be many for open-source projects!). 31 | 32 | ## Features 🚀 33 | 34 | - **Web Scraping and embeddings:** RAGtheDocs automatically scrapes and embeds documentation from any website generated by ReadTheDocs/Sphinx using OpenAI embeddings 35 | 36 | - **RAG Interface:** It comes built-in with a gradio UI for users to interact with [Buster 🤖](https://github.com/jerpint/buste) our RAG agent. 
37 | 38 | - **Customization Options:** Tailor RAGtheDocs prompts and settings with customizable settings and options. 39 | 40 | ## Disclaimers ❗ 41 | 42 | * This is a quickly hacked together side-project. This code should be considered experimental at best. 43 | 44 | * This library will automatically call OpenAI APIs for you (for embeddings and chatGPT). 45 | 46 | * Use at your own risk! ⚠️ 47 | 48 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional, Tuple 3 | 4 | import gradio as gr 5 | import pandas as pd 6 | from buster.completers import Completion 7 | 8 | # from embed_docs import embed_rtd_website 9 | # from rtd_scraper.scrape_rtd import scrape_rtd 10 | from embed_docs import embed_documents 11 | import cfg 12 | from cfg import setup_buster 13 | 14 | # Typehint for chatbot history 15 | ChatHistory = list[list[Optional[str], Optional[str]]] 16 | 17 | 18 | # Because this is a one-click deploy app, we will be relying on env. variables being set 19 | openai_api_key = os.getenv("OPENAI_API_KEY") # Mandatory for app to work 20 | readthedocs_url = os.getenv("READTHEDOCS_URL") # Mandatory for app to work as intended 21 | readthedocs_version = os.getenv("READTHEDOCS_VERSION") 22 | 23 | if openai_api_key is None: 24 | print( 25 | "Warning: No OPENAI_API_KEY detected. Set it with 'export OPENAI_API_KEY=sk-...'." 26 | ) 27 | 28 | if readthedocs_url is None: 29 | raise ValueError( 30 | "No READTHEDOCS_URL detected. Set it with e.g. 'export READTHEDOCS_URL=https://orion.readthedocs.io/'" 31 | ) 32 | 33 | if readthedocs_version is None: 34 | print( 35 | """ 36 | Warning: No READTHEDOCS_VERSION detected. If multiple versions of the docs exist, they will all be scraped. 37 | Set it with e.g. 'export READTHEDOCS_VERSION=en/stable' 38 | """ 39 | ) 40 | 41 | 42 | # Override to put it anywhere 43 | save_directory = "outputs/" 44 | 45 | # scrape and embed content from readthedocs website 46 | # You only need to embed the first time the app runs, comment it out to skip 47 | embed_documents( 48 | homepage_url=readthedocs_url, 49 | save_directory=save_directory, 50 | target_version=readthedocs_version, 51 | ) 52 | 53 | # Setup RAG agent 54 | buster = setup_buster(cfg.buster_cfg) 55 | 56 | 57 | # Setup Gradio app 58 | def add_user_question( 59 | user_question: str, chat_history: Optional[ChatHistory] = None 60 | ) -> ChatHistory: 61 | """Adds a user's question to the chat history. 62 | 63 | If no history is provided, the first element of the history will be the user conversation. 
64 | """ 65 | if chat_history is None: 66 | chat_history = [] 67 | chat_history.append([user_question, None]) 68 | return chat_history 69 | 70 | 71 | def format_sources(matched_documents: pd.DataFrame) -> str: 72 | if len(matched_documents) == 0: 73 | return "" 74 | 75 | matched_documents.similarity_to_answer = ( 76 | matched_documents.similarity_to_answer * 100 77 | ) 78 | 79 | # drop duplicate pages (by title), keep highest ranking ones 80 | matched_documents = matched_documents.sort_values( 81 | "similarity_to_answer", ascending=False 82 | ).drop_duplicates("title", keep="first") 83 | 84 | documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}" 85 | document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %" 86 | 87 | documents = "\n".join( 88 | [ 89 | document_template.format(document=document) 90 | for _, document in matched_documents.iterrows() 91 | ] 92 | ) 93 | footnote: str = "I'm a bot 🤖 and not always perfect." 94 | 95 | return documents_answer_template.format(documents=documents, footnote=footnote) 96 | 97 | 98 | def add_sources(history, completion): 99 | if completion.answer_relevant: 100 | formatted_sources = format_sources(completion.matched_documents) 101 | history.append([None, formatted_sources]) 102 | 103 | return history 104 | 105 | 106 | def chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]: 107 | """Answer a user's question using retrieval augmented generation.""" 108 | 109 | # We assume that the question is the user's last interaction 110 | user_input = chat_history[-1][0] 111 | 112 | # Do retrieval + augmented generation with buster 113 | completion = buster.process_input(user_input) 114 | 115 | # Stream tokens one at a time to the user 116 | chat_history[-1][1] = "" 117 | for token in completion.answer_generator: 118 | chat_history[-1][1] += token 119 | 120 | yield chat_history, completion 121 | 122 | 123 | demo = gr.Blocks() 124 | with demo: 125 | with gr.Row(): 126 | gr.Markdown("
<h3><center>RAGTheDocs</center></h3>
") 127 | 128 | gr.Markdown( 129 | """ 130 | ## About 131 | [RAGTheDocs](https://github.com/jerpint/RAGTheDocs) allows you to ask questions about any documentation hosted on readthedocs. 132 | Simply clone this space and set the environment variables: 133 | 134 | * `OPENAI_API_KEY` (required): Needed for the app to work, e.g. `sk-...` 135 | * `READTHEDOCS_URL` (required): The url of the website you are interested in scraping (must be built with 136 | sphinx/readthedocs). e.g. `https://orion.readthedocs.io` 137 | * `READTHEDOCS_VERSION` (optional): This is important if there exist multiple versions of the docs (e.g. `en/v0.2.7` or `en/latest`). If left empty, it will scrape all available versions (there can be many for open-source projects!). 138 | 139 | Try it out by asking a question below 👇 about [orion](https://orion.readthedocs.io/), an open-source hyperparameter optimization library. 140 | 141 | ## How it works 142 | This app uses [Buster 🤖](https://github.com/jerpint/buster) and ChatGPT to search the docs for relevant info and 143 | answer questions. 144 | View the code on the [project homepage](https://github.com/jerpint/RAGTheDocs) 145 | """ 146 | ) 147 | 148 | chatbot = gr.Chatbot() 149 | 150 | with gr.Row(): 151 | question = gr.Textbox( 152 | label="What's your question?", 153 | placeholder="Type your question here...", 154 | lines=1, 155 | ) 156 | submit = gr.Button(value="Send", variant="secondary") 157 | 158 | examples = gr.Examples( 159 | examples=[ 160 | "How can I install the library?", 161 | "What dependencies are required?", 162 | "Give a brief overview of the library.", 163 | ], 164 | inputs=question, 165 | ) 166 | 167 | response = gr.State() 168 | 169 | # fmt: off 170 | gr.on( 171 | triggers=[submit.click, question.submit], 172 | fn=add_user_question, 173 | inputs=[question], 174 | outputs=[chatbot] 175 | ).then( 176 | chat, 177 | inputs=[chatbot], 178 | outputs=[chatbot, response] 179 | ).then( 180 | add_sources, 181 | inputs=[chatbot, response], 182 | outputs=[chatbot] 183 | ) 184 | 185 | 186 | demo.queue(concurrency_count=8) 187 | demo.launch(share=False) 188 | -------------------------------------------------------------------------------- /cfg.py: -------------------------------------------------------------------------------- 1 | from buster.busterbot import Buster, BusterConfig 2 | from buster.completers import ChatGPTCompleter, DocumentAnswerer 3 | from buster.formatters.documents import DocumentsFormatterJSON 4 | from buster.formatters.prompts import PromptFormatter 5 | from buster.retriever import DeepLakeRetriever, Retriever 6 | from buster.tokenizers import GPTTokenizer 7 | from buster.validators import QuestionAnswerValidator, Validator 8 | 9 | buster_cfg = BusterConfig( 10 | retriever_cfg={ 11 | "path": "outputs/deeplake_store", 12 | "top_k": 3, 13 | "thresh": 0.7, 14 | "max_tokens": 2000, 15 | "embedding_model": "text-embedding-ada-002", 16 | }, 17 | documents_answerer_cfg={ 18 | "no_documents_message": "No documents are available for this question.", 19 | }, 20 | completion_cfg={ 21 | "completion_kwargs": { 22 | "model": "gpt-3.5-turbo", 23 | "stream": True, 24 | "temperature": 0, 25 | }, 26 | }, 27 | tokenizer_cfg={ 28 | "model_name": "gpt-3.5-turbo", 29 | }, 30 | documents_formatter_cfg={ 31 | "max_tokens": 3500, 32 | "columns": ["content", "title", "source"], 33 | }, 34 | prompt_formatter_cfg={ 35 | "max_tokens": 3500, 36 | "text_before_docs": ( 37 | "You are a chatbot assistant answering technical questions about artificial intelligence (AI)." 
38 | "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. " 39 | "If the answer is in the documentation, summarize it in a helpful way to the user. " 40 | "If it isn't, simply reply that you cannot answer the question. " 41 | "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. " 42 | "Here is the documentation:\n" 43 | ), 44 | "text_after_docs": ( 45 | "REMEMBER:\n" 46 | "You are a chatbot assistant answering technical questions about artificial intelligence (AI)." 47 | "Here are the rules you must follow:\n" 48 | "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n" 49 | "2) Make sure to format your answers in Markdown format, including code block and snippets.\n" 50 | "3) Do not reference any links, urls or hyperlinks in your answers.\n" 51 | "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n" 52 | "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. " 53 | "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'" 54 | "For example:\n" 55 | "What is the meaning of life for an qa bot?\n" 56 | "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?" 57 | "Now answer the following question:\n" 58 | ), 59 | }, 60 | validator_cfg={ 61 | "unknown_response_templates": [ 62 | "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?", 63 | ], 64 | "unknown_threshold": 0.85, 65 | "embedding_model": "text-embedding-ada-002", 66 | "use_reranking": True, 67 | "invalid_question_response": "This question does not seem relevant to my current knowledge. If you think this is a mistake, you can modify the question validation prompt.", 68 | "check_question_prompt": """You are an chatbot answering questions on python libraries hosted on readthedocs. 69 | 70 | Your job is to determine wether or not a question is valid, and should be answered. 71 | A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid. 72 | 73 | For example: 74 | 75 | Q: How can I install the library? 76 | true 77 | 78 | Q: What is the meaning of life? 79 | false 80 | 81 | A user will submit a question. 
Respond 'true' if it is valid, respond 'false' if it is invalid.""", 82 | "completion_kwargs": { 83 | "model": "gpt-3.5-turbo", 84 | "stream": False, 85 | "temperature": 0, 86 | }, 87 | }, 88 | ) 89 | 90 | 91 | def setup_buster(buster_cfg: BusterConfig): 92 | """initialize buster with a buster_cfg class""" 93 | retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg) 94 | tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg) 95 | document_answerer: DocumentAnswerer = DocumentAnswerer( 96 | completer=ChatGPTCompleter(**buster_cfg.completion_cfg), 97 | documents_formatter=DocumentsFormatterJSON( 98 | tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg 99 | ), 100 | prompt_formatter=PromptFormatter( 101 | tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg 102 | ), 103 | **buster_cfg.documents_answerer_cfg, 104 | ) 105 | validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg) 106 | buster: Buster = Buster( 107 | retriever=retriever, document_answerer=document_answerer, validator=validator 108 | ) 109 | return buster 110 | -------------------------------------------------------------------------------- /embed_docs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from buster.docparser import get_all_documents 5 | from buster.documents_manager import DeepLakeDocumentsManager 6 | from buster.parser import SphinxParser 7 | 8 | from rtd_scraper.scrape_rtd import sanitize_url, run_spider 9 | 10 | # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here... 11 | for name in logging.root.manager.loggerDict: 12 | logger = logging.getLogger(name) 13 | logger.setLevel(logging.INFO) 14 | 15 | 16 | def embed_documents(homepage_url, save_directory, target_version=None): 17 | # adds https:// and trailing slash 18 | homepage_url = sanitize_url(homepage_url) 19 | 20 | # Crawl the website using scrapy 21 | run_spider( 22 | homepage_url, save_directory=save_directory, target_version=target_version 23 | ) 24 | 25 | # # Convert the .html pages into chunks using Buster's SphinxParser 26 | # root_dir is the folder containing the scraped content e.g. 
crawled_outputs/buster.readthedocs.io/ 27 | root_dir = os.path.join(save_directory, homepage_url.split("https://")[1]) 28 | df = get_all_documents( 29 | root_dir=root_dir, 30 | base_url=homepage_url, 31 | parser_cls=SphinxParser, 32 | min_section_length=100, 33 | max_section_length=1000, 34 | ) 35 | df["source"] = "readthedocs" # Add the source column 36 | 37 | # Initialize the DeepLake vector store 38 | vector_store_path = os.path.join(save_directory, "deeplake_store") 39 | dm = DeepLakeDocumentsManager( 40 | vector_store_path=vector_store_path, 41 | overwrite=True, 42 | required_columns=["url", "content", "source", "title"], 43 | ) 44 | 45 | # Add all embeddings to the vector store 46 | dm.batch_add( 47 | df=df, 48 | batch_size=3000, 49 | min_time_interval=60, 50 | num_workers=32, 51 | ) 52 | 53 | 54 | if __name__ == "__main__": 55 | homepage_url = "https://orion.readthedocs.io/" 56 | target_version = "v0.2.7" 57 | save_directory = "outputs/" 58 | embed_documents( 59 | homepage_url=homepage_url, 60 | target_version=target_version, 61 | save_directory=save_directory, 62 | ) 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | buster-doctalk 2 | scrapy 3 | -------------------------------------------------------------------------------- /rtd_scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerpint/RAGTheDocs/1895d54d811a9a21cdc9a6a0d872286bea1d8585/rtd_scraper/__init__.py -------------------------------------------------------------------------------- /rtd_scraper/scrape_rtd.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from scrapy.crawler import CrawlerProcess 5 | from scrapy.utils.project import get_project_settings 6 | 7 | from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider, sanitize_url 8 | 9 | # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here... 
10 | for name in logging.root.manager.loggerDict: 11 | logger = logging.getLogger(name) 12 | logger.setLevel(logging.INFO) 13 | 14 | 15 | def run_spider(homepage_url, save_directory, target_version=None): 16 | process = CrawlerProcess(settings=get_project_settings()) 17 | process.crawl( 18 | DocsSpider, 19 | homepage_url=homepage_url, 20 | save_dir=save_directory, 21 | target_version=target_version, 22 | ) 23 | 24 | # Start the crawling process 25 | process.start() 26 | 27 | # To stop the crawling process gracefully 28 | process.stop() 29 | -------------------------------------------------------------------------------- /rtd_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tutorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tutorial 12 | -------------------------------------------------------------------------------- /rtd_scraper/tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerpint/RAGTheDocs/1895d54d811a9a21cdc9a6a0d872286bea1d8585/rtd_scraper/tutorial/__init__.py -------------------------------------------------------------------------------- /rtd_scraper/tutorial/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | # useful for handling different item types with a single interface 7 | from itemadapter import ItemAdapter, is_item 8 | from scrapy import signals 9 | 10 | 11 | class TutorialSpiderMiddleware: 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, or item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Request or item objects. 43 | pass 44 | 45 | def process_start_requests(self, start_requests, spider): 46 | # Called with the start requests of the spider, and works 47 | # similarly to the process_spider_output() method, except 48 | # that it doesn’t have a response associated. 49 | 50 | # Must return only requests (not items). 
51 | for r in start_requests: 52 | yield r 53 | 54 | def spider_opened(self, spider): 55 | spider.logger.info("Spider opened: %s" % spider.name) 56 | 57 | 58 | class TutorialDownloaderMiddleware: 59 | # Not all methods need to be defined. If a method is not defined, 60 | # scrapy acts as if the downloader middleware does not modify the 61 | # passed objects. 62 | 63 | @classmethod 64 | def from_crawler(cls, crawler): 65 | # This method is used by Scrapy to create your spiders. 66 | s = cls() 67 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 68 | return s 69 | 70 | def process_request(self, request, spider): 71 | # Called for each request that goes through the downloader 72 | # middleware. 73 | 74 | # Must either: 75 | # - return None: continue processing this request 76 | # - or return a Response object 77 | # - or return a Request object 78 | # - or raise IgnoreRequest: process_exception() methods of 79 | # installed downloader middleware will be called 80 | return None 81 | 82 | def process_response(self, request, response, spider): 83 | # Called with the response returned from the downloader. 84 | 85 | # Must either; 86 | # - return a Response object 87 | # - return a Request object 88 | # - or raise IgnoreRequest 89 | return response 90 | 91 | def process_exception(self, request, exception, spider): 92 | # Called when a download handler or a process_request() 93 | # (from other downloader middleware) raises an exception. 94 | 95 | # Must either: 96 | # - return None: continue processing this exception 97 | # - return a Response object: stops process_exception() chain 98 | # - return a Request object: stops process_exception() chain 99 | pass 100 | 101 | def spider_opened(self, spider): 102 | spider.logger.info("Spider opened: %s" % spider.name) 103 | -------------------------------------------------------------------------------- /rtd_scraper/tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for tutorial project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | from scrapy.utils.log import configure_logging 11 | 12 | # Disable default Scrapy log settings. 
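# Passing install_root_handler=False below keeps Scrapy from installing its own root logging handler (which would otherwise flood the app's output); LOG_ENABLED and LOG_LEVEL further down quiet the crawler itself.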
13 | configure_logging(install_root_handler=False) 14 | BOT_NAME = "tutorial" 15 | 16 | SPIDER_MODULES = ["rtd_scraper.tutorial.spiders"] 17 | NEWSPIDER_MODULE = "rtd_scraper.tutorial.spiders" 18 | 19 | # SPIDER_MODULES = ["tutorial.spiders"] 20 | # NEWSPIDER_MODULE = "tutorial.spiders" 21 | 22 | LOG_ENABLED = False 23 | LOG_LEVEL = "INFO" 24 | 25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 26 | # USER_AGENT = "tutorial (+http://www.yourdomain.com)" 27 | 28 | # Obey robots.txt rules 29 | ROBOTSTXT_OBEY = True 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | # CONCURRENT_REQUESTS = 32 33 | 34 | # Configure a delay for requests for the same website (default: 0) 35 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 36 | # See also autothrottle settings and docs 37 | # DOWNLOAD_DELAY = 3 38 | # The download delay setting will honor only one of: 39 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 40 | # CONCURRENT_REQUESTS_PER_IP = 16 41 | 42 | # Disable cookies (enabled by default) 43 | # COOKIES_ENABLED = False 44 | 45 | # Disable Telnet Console (enabled by default) 46 | # TELNETCONSOLE_ENABLED = False 47 | 48 | # Override the default request headers: 49 | # DEFAULT_REQUEST_HEADERS = { 50 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 51 | # "Accept-Language": "en", 52 | # } 53 | 54 | # Enable or disable spider middlewares 55 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 56 | # SPIDER_MIDDLEWARES = { 57 | # "tutorial.middlewares.TutorialSpiderMiddleware": 543, 58 | # } 59 | 60 | # Enable or disable downloader middlewares 61 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 62 | # DOWNLOADER_MIDDLEWARES = { 63 | # "tutorial.middlewares.TutorialDownloaderMiddleware": 543, 64 | # } 65 | 66 | # Enable or disable extensions 67 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 68 | # EXTENSIONS = { 69 | # "scrapy.extensions.telnet.TelnetConsole": None, 70 | # } 71 | 72 | # Configure item pipelines 73 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 74 | # ITEM_PIPELINES = { 75 | # "tutorial.pipelines.TutorialPipeline": 300, 76 | # } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 80 | # AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | # AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | # AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | # AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | # HTTPCACHE_ENABLED = True 94 | # HTTPCACHE_EXPIRATION_SECS = 0 95 | # HTTPCACHE_DIR = "httpcache" 96 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 98 | 99 | # Set settings whose default value is deprecated to a future-proof value 100 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 101 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 102 | FEED_EXPORT_ENCODING = "utf-8" 103 | 
-------------------------------------------------------------------------------- /rtd_scraper/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /rtd_scraper/tutorial/spiders/docs_spider.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from urllib.parse import urlparse 4 | 5 | import scrapy 6 | 7 | logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR) 8 | 9 | from urllib.parse import urlparse 10 | 11 | 12 | def extract_domain(url): 13 | """ 14 | Extract the domain (including subdomains) from a given URL. 15 | 16 | Args: 17 | - url (str): The URL from which the domain needs to be extracted. 18 | 19 | Returns: 20 | - str: The domain (with subdomains) extracted from the URL. 21 | For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'. 22 | 23 | """ 24 | parsed_uri = urlparse(url) 25 | # The netloc attribute will contain the domain name 26 | domain = parsed_uri.netloc 27 | return domain 28 | 29 | 30 | def sanitize_url(url: str) -> str: 31 | """Adds https:// and trailing backslash.""" 32 | if not url.startswith("https://"): 33 | url = "https://" + url 34 | 35 | if not url.endswith("/"): 36 | url = url + "/" 37 | return url 38 | 39 | 40 | class DocsSpider(scrapy.Spider): 41 | name = "docs" 42 | 43 | def __init__( 44 | self, 45 | homepage_url: str, 46 | save_dir="outputs/", 47 | target_version=None, 48 | *args, 49 | **kwargs, 50 | ): 51 | super(DocsSpider, self).__init__(*args, **kwargs) 52 | 53 | homepage_url = sanitize_url(homepage_url) 54 | 55 | self.allowed_domains = [extract_domain(homepage_url)] 56 | self.start_urls = [homepage_url] 57 | self.base_dir = Path(save_dir) 58 | self.target_version = target_version 59 | 60 | def parse(self, response): 61 | parsed_uri = urlparse(response.url) 62 | # Create a Path from the parsed URL. If it ends with '/', we add 'index.html' as the filename. 63 | if parsed_uri.path.endswith("/"): 64 | filepath = ( 65 | self.base_dir 66 | / parsed_uri.netloc 67 | / parsed_uri.path.strip("/") 68 | / "index.html" 69 | ) 70 | else: 71 | filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/") 72 | filepath.parent.mkdir(parents=True, exist_ok=True) 73 | 74 | with open(filepath, "wb") as f: 75 | f.write(response.body) 76 | 77 | # Follow links to other documentation pages only if they contain the target version in the full URL 78 | for href in response.css("a::attr(href)").getall(): 79 | if self.target_version: 80 | # A version was specified, check to see if it's the correct version from url 81 | full_url = response.urljoin(href) # Expand href to a full URL 82 | if self.target_version in full_url: 83 | yield response.follow(href, self.parse) 84 | else: 85 | # no version specified, follow all links 86 | yield response.follow(href, self.parse) 87 | --------------------------------------------------------------------------------
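The flow that `app.py` wires together at startup can also be driven as a plain script. The following is a minimal sketch (not a file in this repository) that reuses the project's own entry points — `embed_documents` from `embed_docs.py` and `setup_buster` / `buster_cfg` from `cfg.py`. The URL, target version and question are placeholders, and it assumes `OPENAI_API_KEY` is already exported and the `requirements.txt` dependencies are installed.

```python
# Sketch only: scrape + embed a readthedocs site, then query it with Buster,
# reusing embed_documents (embed_docs.py) and setup_buster / buster_cfg (cfg.py).
# Assumes OPENAI_API_KEY is already exported; URL, version and question are placeholders.
import cfg
from cfg import setup_buster
from embed_docs import embed_documents

# Scrape and embed once; "outputs/" matches the deeplake_store path expected by cfg.py.
embed_documents(
    homepage_url="https://orion.readthedocs.io/",
    save_directory="outputs/",
    target_version="en/stable",
)

# Optional: tweak the config before building the agent, e.g. retrieve more documents.
cfg.buster_cfg.retriever_cfg["top_k"] = 5

buster = setup_buster(cfg.buster_cfg)

# Retrieval-augmented generation; tokens are streamed because cfg.py sets stream=True.
completion = buster.process_input("How can I install the library?")
answer = "".join(completion.answer_generator)
print(answer)

# Show the retrieved sources when the answer was judged relevant.
if completion.answer_relevant:
    print(completion.matched_documents[["title", "url"]])
```

Because both the embedding step and each question call the OpenAI API, in practice you would run the scrape/embed step once and comment it out on subsequent runs, as the comment in `app.py` suggests.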