├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── cfg.py ├── embed_docs.py ├── requirements.txt └── rtd_scraper ├── __init__.py ├── scrape_rtd.py ├── scrapy.cfg └── tutorial ├── __init__.py ├── middlewares.py ├── settings.py └── spiders ├── __init__.py └── docs_spider.py /.gitignore: -------------------------------------------------------------------------------- 1 | outputs/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Jeremy Pinto 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: RAGTheDocs 3 | emoji: 👀 4 | colorFrom: gray 5 | colorTo: yellow 6 | sdk: gradio 7 | sdk_version: 3.50.2 8 | app_file: app.py 9 | pinned: false 10 | license: mit 11 | --- 12 | 13 | # RAGtheDocs 14 | 15 | ## Introduction 📚 16 | 17 | RAGTheDocs is an open-source library that allows you to **one-click deploy** retrieval augmented generation (RAG) on any readthedocs documentation on [huggingface 🤗 spaces](https://huggingface.co/spaces/jerpint/RAGTheDocs)! 18 | 19 | ## Usage 👉 20 | 21 | 1) Go to the [example space](https://huggingface.co/spaces/jerpint/RAGTheDocs) 22 | 2) Duplicate the space: 23 | 24 | ![image](https://github.com/jerpint/buster/assets/18450628/0c89038c-c3af-4c1f-9d3b-9b4d83db4910) 25 | 26 | 3) Set your environment variables: 27 | * `OPENAI_API_KEY` (required): Needed for the app to work, e.g. `sk-...` 28 | * `READTHEDOCS_URL` (required): The url of the website you are interested in scraping (must be built with 29 | sphinx/readthedocs). e.g. `https://orion.readthedocs.io` 30 | * `READTHEDOCS_VERSION` (optional): This is important if there exist multiple versions of the docs (e.g. `en/v0.2.7` or `en/latest`). If left empty, it will scrape all available versions (there can be many for open-source projects!). 31 | 32 | ## Features 🚀 33 | 34 | - **Web Scraping and embeddings:** RAGtheDocs automatically scrapes and embeds documentation from any website generated by ReadTheDocs/Sphinx using OpenAI embeddings 35 | 36 | - **RAG Interface:** It comes built-in with a gradio UI for users to interact with [Buster 🤖](https://github.com/jerpint/buste) our RAG agent. 
37 | 38 | - **Customization Options:** Tailor RAGtheDocs prompts and settings with customizable settings and options. 39 | 40 | ## Disclaimers ❗ 41 | 42 | * This is a quickly hacked together side-project. This code should be considered experimental at best. 43 | 44 | * This library will automatically call OpenAI APIs for you (for embeddings and chatGPT). 45 | 46 | * Use at your own risk! ⚠️ 47 | 48 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional, Tuple 3 | 4 | import gradio as gr 5 | import pandas as pd 6 | from buster.completers import Completion 7 | 8 | # from embed_docs import embed_rtd_website 9 | # from rtd_scraper.scrape_rtd import scrape_rtd 10 | from embed_docs import embed_documents 11 | import cfg 12 | from cfg import setup_buster 13 | 14 | # Typehint for chatbot history 15 | ChatHistory = list[list[Optional[str], Optional[str]]] 16 | 17 | 18 | # Because this is a one-click deploy app, we will be relying on env. variables being set 19 | openai_api_key = os.getenv("OPENAI_API_KEY") # Mandatory for app to work 20 | readthedocs_url = os.getenv("READTHEDOCS_URL") # Mandatory for app to work as intended 21 | readthedocs_version = os.getenv("READTHEDOCS_VERSION") 22 | 23 | if openai_api_key is None: 24 | print( 25 | "Warning: No OPENAI_API_KEY detected. Set it with 'export OPENAI_API_KEY=sk-...'." 26 | ) 27 | 28 | if readthedocs_url is None: 29 | raise ValueError( 30 | "No READTHEDOCS_URL detected. Set it with e.g. 'export READTHEDOCS_URL=https://orion.readthedocs.io/'" 31 | ) 32 | 33 | if readthedocs_version is None: 34 | print( 35 | """ 36 | Warning: No READTHEDOCS_VERSION detected. If multiple versions of the docs exist, they will all be scraped. 37 | Set it with e.g. 'export READTHEDOCS_VERSION=en/stable' 38 | """ 39 | ) 40 | 41 | 42 | # Override to put it anywhere 43 | save_directory = "outputs/" 44 | 45 | # scrape and embed content from readthedocs website 46 | # You only need to embed the first time the app runs, comment it out to skip 47 | embed_documents( 48 | homepage_url=readthedocs_url, 49 | save_directory=save_directory, 50 | target_version=readthedocs_version, 51 | ) 52 | 53 | # Setup RAG agent 54 | buster = setup_buster(cfg.buster_cfg) 55 | 56 | 57 | # Setup Gradio app 58 | def add_user_question( 59 | user_question: str, chat_history: Optional[ChatHistory] = None 60 | ) -> ChatHistory: 61 | """Adds a user's question to the chat history. 62 | 63 | If no history is provided, the first element of the history will be the user conversation. 
64 | """ 65 | if chat_history is None: 66 | chat_history = [] 67 | chat_history.append([user_question, None]) 68 | return chat_history 69 | 70 | 71 | def format_sources(matched_documents: pd.DataFrame) -> str: 72 | if len(matched_documents) == 0: 73 | return "" 74 | 75 | matched_documents.similarity_to_answer = ( 76 | matched_documents.similarity_to_answer * 100 77 | ) 78 | 79 | # drop duplicate pages (by title), keep highest ranking ones 80 | matched_documents = matched_documents.sort_values( 81 | "similarity_to_answer", ascending=False 82 | ).drop_duplicates("title", keep="first") 83 | 84 | documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}" 85 | document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %" 86 | 87 | documents = "\n".join( 88 | [ 89 | document_template.format(document=document) 90 | for _, document in matched_documents.iterrows() 91 | ] 92 | ) 93 | footnote: str = "I'm a bot 🤖 and not always perfect." 94 | 95 | return documents_answer_template.format(documents=documents, footnote=footnote) 96 | 97 | 98 | def add_sources(history, completion): 99 | if completion.answer_relevant: 100 | formatted_sources = format_sources(completion.matched_documents) 101 | history.append([None, formatted_sources]) 102 | 103 | return history 104 | 105 | 106 | def chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]: 107 | """Answer a user's question using retrieval augmented generation.""" 108 | 109 | # We assume that the question is the user's last interaction 110 | user_input = chat_history[-1][0] 111 | 112 | # Do retrieval + augmented generation with buster 113 | completion = buster.process_input(user_input) 114 | 115 | # Stream tokens one at a time to the user 116 | chat_history[-1][1] = "" 117 | for token in completion.answer_generator: 118 | chat_history[-1][1] += token 119 | 120 | yield chat_history, completion 121 | 122 | 123 | demo = gr.Blocks() 124 | with demo: 125 | with gr.Row(): 126 | gr.Markdown("
<h3><center>RAGTheDocs</center></h3>
") 127 | 128 | gr.Markdown( 129 | """ 130 | ## About 131 | [RAGTheDocs](https://github.com/jerpint/RAGTheDocs) allows you to ask questions about any documentation hosted on readthedocs. 132 | Simply clone this space and set the environment variables: 133 | 134 | * `OPENAI_API_KEY` (required): Needed for the app to work, e.g. `sk-...` 135 | * `READTHEDOCS_URL` (required): The url of the website you are interested in scraping (must be built with 136 | sphinx/readthedocs). e.g. `https://orion.readthedocs.io` 137 | * `READTHEDOCS_VERSION` (optional): This is important if there exist multiple versions of the docs (e.g. `en/v0.2.7` or `en/latest`). If left empty, it will scrape all available versions (there can be many for open-source projects!). 138 | 139 | Try it out by asking a question below 👇 about [orion](https://orion.readthedocs.io/), an open-source hyperparameter optimization library. 140 | 141 | ## How it works 142 | This app uses [Buster 🤖](https://github.com/jerpint/buster) and ChatGPT to search the docs for relevant info and 143 | answer questions. 144 | View the code on the [project homepage](https://github.com/jerpint/RAGTheDocs) 145 | """ 146 | ) 147 | 148 | chatbot = gr.Chatbot() 149 | 150 | with gr.Row(): 151 | question = gr.Textbox( 152 | label="What's your question?", 153 | placeholder="Type your question here...", 154 | lines=1, 155 | ) 156 | submit = gr.Button(value="Send", variant="secondary") 157 | 158 | examples = gr.Examples( 159 | examples=[ 160 | "How can I install the library?", 161 | "What dependencies are required?", 162 | "Give a brief overview of the library.", 163 | ], 164 | inputs=question, 165 | ) 166 | 167 | response = gr.State() 168 | 169 | # fmt: off 170 | gr.on( 171 | triggers=[submit.click, question.submit], 172 | fn=add_user_question, 173 | inputs=[question], 174 | outputs=[chatbot] 175 | ).then( 176 | chat, 177 | inputs=[chatbot], 178 | outputs=[chatbot, response] 179 | ).then( 180 | add_sources, 181 | inputs=[chatbot, response], 182 | outputs=[chatbot] 183 | ) 184 | 185 | 186 | demo.queue(concurrency_count=8) 187 | demo.launch(share=False) 188 | -------------------------------------------------------------------------------- /cfg.py: -------------------------------------------------------------------------------- 1 | from buster.busterbot import Buster, BusterConfig 2 | from buster.completers import ChatGPTCompleter, DocumentAnswerer 3 | from buster.formatters.documents import DocumentsFormatterJSON 4 | from buster.formatters.prompts import PromptFormatter 5 | from buster.retriever import DeepLakeRetriever, Retriever 6 | from buster.tokenizers import GPTTokenizer 7 | from buster.validators import QuestionAnswerValidator, Validator 8 | 9 | buster_cfg = BusterConfig( 10 | retriever_cfg={ 11 | "path": "outputs/deeplake_store", 12 | "top_k": 3, 13 | "thresh": 0.7, 14 | "max_tokens": 2000, 15 | "embedding_model": "text-embedding-ada-002", 16 | }, 17 | documents_answerer_cfg={ 18 | "no_documents_message": "No documents are available for this question.", 19 | }, 20 | completion_cfg={ 21 | "completion_kwargs": { 22 | "model": "gpt-3.5-turbo", 23 | "stream": True, 24 | "temperature": 0, 25 | }, 26 | }, 27 | tokenizer_cfg={ 28 | "model_name": "gpt-3.5-turbo", 29 | }, 30 | documents_formatter_cfg={ 31 | "max_tokens": 3500, 32 | "columns": ["content", "title", "source"], 33 | }, 34 | prompt_formatter_cfg={ 35 | "max_tokens": 3500, 36 | "text_before_docs": ( 37 | "You are a chatbot assistant answering technical questions about artificial intelligence (AI)." 
38 | "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. " 39 | "If the answer is in the documentation, summarize it in a helpful way to the user. " 40 | "If it isn't, simply reply that you cannot answer the question. " 41 | "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. " 42 | "Here is the documentation:\n" 43 | ), 44 | "text_after_docs": ( 45 | "REMEMBER:\n" 46 | "You are a chatbot assistant answering technical questions about artificial intelligence (AI)." 47 | "Here are the rules you must follow:\n" 48 | "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n" 49 | "2) Make sure to format your answers in Markdown format, including code block and snippets.\n" 50 | "3) Do not reference any links, urls or hyperlinks in your answers.\n" 51 | "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n" 52 | "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. " 53 | "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'" 54 | "For example:\n" 55 | "What is the meaning of life for an qa bot?\n" 56 | "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?" 57 | "Now answer the following question:\n" 58 | ), 59 | }, 60 | validator_cfg={ 61 | "unknown_response_templates": [ 62 | "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?", 63 | ], 64 | "unknown_threshold": 0.85, 65 | "embedding_model": "text-embedding-ada-002", 66 | "use_reranking": True, 67 | "invalid_question_response": "This question does not seem relevant to my current knowledge. If you think this is a mistake, you can modify the question validation prompt.", 68 | "check_question_prompt": """You are an chatbot answering questions on python libraries hosted on readthedocs. 69 | 70 | Your job is to determine wether or not a question is valid, and should be answered. 71 | A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid. 72 | 73 | For example: 74 | 75 | Q: How can I install the library? 76 | true 77 | 78 | Q: What is the meaning of life? 79 | false 80 | 81 | A user will submit a question. 
Respond 'true' if it is valid, respond 'false' if it is invalid.""", 82 | "completion_kwargs": { 83 | "model": "gpt-3.5-turbo", 84 | "stream": False, 85 | "temperature": 0, 86 | }, 87 | }, 88 | ) 89 | 90 | 91 | def setup_buster(buster_cfg: BusterConfig): 92 | """initialize buster with a buster_cfg class""" 93 | retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg) 94 | tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg) 95 | document_answerer: DocumentAnswerer = DocumentAnswerer( 96 | completer=ChatGPTCompleter(**buster_cfg.completion_cfg), 97 | documents_formatter=DocumentsFormatterJSON( 98 | tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg 99 | ), 100 | prompt_formatter=PromptFormatter( 101 | tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg 102 | ), 103 | **buster_cfg.documents_answerer_cfg, 104 | ) 105 | validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg) 106 | buster: Buster = Buster( 107 | retriever=retriever, document_answerer=document_answerer, validator=validator 108 | ) 109 | return buster 110 | -------------------------------------------------------------------------------- /embed_docs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from buster.docparser import get_all_documents 5 | from buster.documents_manager import DeepLakeDocumentsManager 6 | from buster.parser import SphinxParser 7 | 8 | from rtd_scraper.scrape_rtd import sanitize_url, run_spider 9 | 10 | # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here... 11 | for name in logging.root.manager.loggerDict: 12 | logger = logging.getLogger(name) 13 | logger.setLevel(logging.INFO) 14 | 15 | 16 | def embed_documents(homepage_url, save_directory, target_version=None): 17 | # adds https:// and trailing slash 18 | homepage_url = sanitize_url(homepage_url) 19 | 20 | # Crawl the website using scrapy 21 | run_spider( 22 | homepage_url, save_directory=save_directory, target_version=target_version 23 | ) 24 | 25 | # # Convert the .html pages into chunks using Buster's SphinxParser 26 | # root_dir is the folder containing the scraped content e.g. 
crawled_outputs/buster.readthedocs.io/ 27 | root_dir = os.path.join(save_directory, homepage_url.split("https://")[1]) 28 | df = get_all_documents( 29 | root_dir=root_dir, 30 | base_url=homepage_url, 31 | parser_cls=SphinxParser, 32 | min_section_length=100, 33 | max_section_length=1000, 34 | ) 35 | df["source"] = "readthedocs" # Add the source column 36 | 37 | # Initialize the DeepLake vector store 38 | vector_store_path = os.path.join(save_directory, "deeplake_store") 39 | dm = DeepLakeDocumentsManager( 40 | vector_store_path=vector_store_path, 41 | overwrite=True, 42 | required_columns=["url", "content", "source", "title"], 43 | ) 44 | 45 | # Add all embeddings to the vector store 46 | dm.batch_add( 47 | df=df, 48 | batch_size=3000, 49 | min_time_interval=60, 50 | num_workers=32, 51 | ) 52 | 53 | 54 | if __name__ == "__main__": 55 | homepage_url = "https://orion.readthedocs.io/" 56 | target_version = "v0.2.7" 57 | save_directory = "outputs/" 58 | embed_documents( 59 | homepage_url=homepage_url, 60 | target_version=target_version, 61 | save_directory=save_directory, 62 | ) 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | buster-doctalk 2 | scrapy 3 | -------------------------------------------------------------------------------- /rtd_scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerpint/RAGTheDocs/1895d54d811a9a21cdc9a6a0d872286bea1d8585/rtd_scraper/__init__.py -------------------------------------------------------------------------------- /rtd_scraper/scrape_rtd.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from scrapy.crawler import CrawlerProcess 5 | from scrapy.utils.project import get_project_settings 6 | 7 | from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider, sanitize_url 8 | 9 | # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here... 
10 | for name in logging.root.manager.loggerDict: 11 | logger = logging.getLogger(name) 12 | logger.setLevel(logging.INFO) 13 | 14 | 15 | def run_spider(homepage_url, save_directory, target_version=None): 16 | process = CrawlerProcess(settings=get_project_settings()) 17 | process.crawl( 18 | DocsSpider, 19 | homepage_url=homepage_url, 20 | save_dir=save_directory, 21 | target_version=target_version, 22 | ) 23 | 24 | # Start the crawling process 25 | process.start() 26 | 27 | # To stop the crawling process gracefully 28 | process.stop() 29 | -------------------------------------------------------------------------------- /rtd_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tutorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tutorial 12 | -------------------------------------------------------------------------------- /rtd_scraper/tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jerpint/RAGTheDocs/1895d54d811a9a21cdc9a6a0d872286bea1d8585/rtd_scraper/tutorial/__init__.py -------------------------------------------------------------------------------- /rtd_scraper/tutorial/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | # useful for handling different item types with a single interface 7 | from itemadapter import ItemAdapter, is_item 8 | from scrapy import signals 9 | 10 | 11 | class TutorialSpiderMiddleware: 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, or item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Request or item objects. 43 | pass 44 | 45 | def process_start_requests(self, start_requests, spider): 46 | # Called with the start requests of the spider, and works 47 | # similarly to the process_spider_output() method, except 48 | # that it doesn’t have a response associated. 49 | 50 | # Must return only requests (not items). 
51 | for r in start_requests: 52 | yield r 53 | 54 | def spider_opened(self, spider): 55 | spider.logger.info("Spider opened: %s" % spider.name) 56 | 57 | 58 | class TutorialDownloaderMiddleware: 59 | # Not all methods need to be defined. If a method is not defined, 60 | # scrapy acts as if the downloader middleware does not modify the 61 | # passed objects. 62 | 63 | @classmethod 64 | def from_crawler(cls, crawler): 65 | # This method is used by Scrapy to create your spiders. 66 | s = cls() 67 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 68 | return s 69 | 70 | def process_request(self, request, spider): 71 | # Called for each request that goes through the downloader 72 | # middleware. 73 | 74 | # Must either: 75 | # - return None: continue processing this request 76 | # - or return a Response object 77 | # - or return a Request object 78 | # - or raise IgnoreRequest: process_exception() methods of 79 | # installed downloader middleware will be called 80 | return None 81 | 82 | def process_response(self, request, response, spider): 83 | # Called with the response returned from the downloader. 84 | 85 | # Must either; 86 | # - return a Response object 87 | # - return a Request object 88 | # - or raise IgnoreRequest 89 | return response 90 | 91 | def process_exception(self, request, exception, spider): 92 | # Called when a download handler or a process_request() 93 | # (from other downloader middleware) raises an exception. 94 | 95 | # Must either: 96 | # - return None: continue processing this exception 97 | # - return a Response object: stops process_exception() chain 98 | # - return a Request object: stops process_exception() chain 99 | pass 100 | 101 | def spider_opened(self, spider): 102 | spider.logger.info("Spider opened: %s" % spider.name) 103 | -------------------------------------------------------------------------------- /rtd_scraper/tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for tutorial project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | from scrapy.utils.log import configure_logging 11 | 12 | # Disable default Scrapy log settings. 
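# Passing install_root_handler=False below keeps Scrapy from installing its own root logging handler (which would otherwise flood the app's output); LOG_ENABLED and LOG_LEVEL further down quiet the crawler itself.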
13 | configure_logging(install_root_handler=False) 14 | BOT_NAME = "tutorial" 15 | 16 | SPIDER_MODULES = ["rtd_scraper.tutorial.spiders"] 17 | NEWSPIDER_MODULE = "rtd_scraper.tutorial.spiders" 18 | 19 | # SPIDER_MODULES = ["tutorial.spiders"] 20 | # NEWSPIDER_MODULE = "tutorial.spiders" 21 | 22 | LOG_ENABLED = False 23 | LOG_LEVEL = "INFO" 24 | 25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 26 | # USER_AGENT = "tutorial (+http://www.yourdomain.com)" 27 | 28 | # Obey robots.txt rules 29 | ROBOTSTXT_OBEY = True 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | # CONCURRENT_REQUESTS = 32 33 | 34 | # Configure a delay for requests for the same website (default: 0) 35 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 36 | # See also autothrottle settings and docs 37 | # DOWNLOAD_DELAY = 3 38 | # The download delay setting will honor only one of: 39 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 40 | # CONCURRENT_REQUESTS_PER_IP = 16 41 | 42 | # Disable cookies (enabled by default) 43 | # COOKIES_ENABLED = False 44 | 45 | # Disable Telnet Console (enabled by default) 46 | # TELNETCONSOLE_ENABLED = False 47 | 48 | # Override the default request headers: 49 | # DEFAULT_REQUEST_HEADERS = { 50 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 51 | # "Accept-Language": "en", 52 | # } 53 | 54 | # Enable or disable spider middlewares 55 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 56 | # SPIDER_MIDDLEWARES = { 57 | # "tutorial.middlewares.TutorialSpiderMiddleware": 543, 58 | # } 59 | 60 | # Enable or disable downloader middlewares 61 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 62 | # DOWNLOADER_MIDDLEWARES = { 63 | # "tutorial.middlewares.TutorialDownloaderMiddleware": 543, 64 | # } 65 | 66 | # Enable or disable extensions 67 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 68 | # EXTENSIONS = { 69 | # "scrapy.extensions.telnet.TelnetConsole": None, 70 | # } 71 | 72 | # Configure item pipelines 73 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 74 | # ITEM_PIPELINES = { 75 | # "tutorial.pipelines.TutorialPipeline": 300, 76 | # } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 80 | # AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | # AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | # AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | # AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | # HTTPCACHE_ENABLED = True 94 | # HTTPCACHE_EXPIRATION_SECS = 0 95 | # HTTPCACHE_DIR = "httpcache" 96 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 98 | 99 | # Set settings whose default value is deprecated to a future-proof value 100 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 101 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 102 | FEED_EXPORT_ENCODING = "utf-8" 103 | 
-------------------------------------------------------------------------------- /rtd_scraper/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /rtd_scraper/tutorial/spiders/docs_spider.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from urllib.parse import urlparse 4 | 5 | import scrapy 6 | 7 | logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR) 8 | 9 | from urllib.parse import urlparse 10 | 11 | 12 | def extract_domain(url): 13 | """ 14 | Extract the domain (including subdomains) from a given URL. 15 | 16 | Args: 17 | - url (str): The URL from which the domain needs to be extracted. 18 | 19 | Returns: 20 | - str: The domain (with subdomains) extracted from the URL. 21 | For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'. 22 | 23 | """ 24 | parsed_uri = urlparse(url) 25 | # The netloc attribute will contain the domain name 26 | domain = parsed_uri.netloc 27 | return domain 28 | 29 | 30 | def sanitize_url(url: str) -> str: 31 | """Adds https:// and trailing backslash.""" 32 | if not url.startswith("https://"): 33 | url = "https://" + url 34 | 35 | if not url.endswith("/"): 36 | url = url + "/" 37 | return url 38 | 39 | 40 | class DocsSpider(scrapy.Spider): 41 | name = "docs" 42 | 43 | def __init__( 44 | self, 45 | homepage_url: str, 46 | save_dir="outputs/", 47 | target_version=None, 48 | *args, 49 | **kwargs, 50 | ): 51 | super(DocsSpider, self).__init__(*args, **kwargs) 52 | 53 | homepage_url = sanitize_url(homepage_url) 54 | 55 | self.allowed_domains = [extract_domain(homepage_url)] 56 | self.start_urls = [homepage_url] 57 | self.base_dir = Path(save_dir) 58 | self.target_version = target_version 59 | 60 | def parse(self, response): 61 | parsed_uri = urlparse(response.url) 62 | # Create a Path from the parsed URL. If it ends with '/', we add 'index.html' as the filename. 63 | if parsed_uri.path.endswith("/"): 64 | filepath = ( 65 | self.base_dir 66 | / parsed_uri.netloc 67 | / parsed_uri.path.strip("/") 68 | / "index.html" 69 | ) 70 | else: 71 | filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/") 72 | filepath.parent.mkdir(parents=True, exist_ok=True) 73 | 74 | with open(filepath, "wb") as f: 75 | f.write(response.body) 76 | 77 | # Follow links to other documentation pages only if they contain the target version in the full URL 78 | for href in response.css("a::attr(href)").getall(): 79 | if self.target_version: 80 | # A version was specified, check to see if it's the correct version from url 81 | full_url = response.urljoin(href) # Expand href to a full URL 82 | if self.target_version in full_url: 83 | yield response.follow(href, self.parse) 84 | else: 85 | # no version specified, follow all links 86 | yield response.follow(href, self.parse) 87 | --------------------------------------------------------------------------------
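The flow that `app.py` wires together at startup can also be driven as a plain script. The following is a minimal sketch (not a file in this repository) that reuses the project's own entry points — `embed_documents` from `embed_docs.py` and `setup_buster` / `buster_cfg` from `cfg.py`. The URL, target version and question are placeholders, and it assumes `OPENAI_API_KEY` is already exported and the `requirements.txt` dependencies are installed.

```python
# Sketch only: scrape + embed a readthedocs site, then query it with Buster,
# reusing embed_documents (embed_docs.py) and setup_buster / buster_cfg (cfg.py).
# Assumes OPENAI_API_KEY is already exported; URL, version and question are placeholders.
import cfg
from cfg import setup_buster
from embed_docs import embed_documents

# Scrape and embed once; "outputs/" matches the deeplake_store path expected by cfg.py.
embed_documents(
    homepage_url="https://orion.readthedocs.io/",
    save_directory="outputs/",
    target_version="en/stable",
)

# Optional: tweak the config before building the agent, e.g. retrieve more documents.
cfg.buster_cfg.retriever_cfg["top_k"] = 5

buster = setup_buster(cfg.buster_cfg)

# Retrieval-augmented generation; tokens are streamed because cfg.py sets stream=True.
completion = buster.process_input("How can I install the library?")
answer = "".join(completion.answer_generator)
print(answer)

# Show the retrieved sources when the answer was judged relevant.
if completion.answer_relevant:
    print(completion.matched_documents[["title", "url"]])
```

Because both the embedding step and each question call the OpenAI API, in practice you would run the scrape/embed step once and comment it out on subsequent runs, as the comment in `app.py` suggests.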