├── .gitignore
├── LICENSE
├── README.md
├── app.py
├── cfg.py
├── embed_docs.py
├── requirements.txt
└── rtd_scraper
├── __init__.py
├── scrape_rtd.py
├── scrapy.cfg
└── tutorial
├── __init__.py
├── middlewares.py
├── settings.py
└── spiders
├── __init__.py
└── docs_spider.py
/.gitignore:
--------------------------------------------------------------------------------
1 | outputs/
2 | __pycache__/
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Jeremy Pinto
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: RAGTheDocs
3 | emoji: 👀
4 | colorFrom: gray
5 | colorTo: yellow
6 | sdk: gradio
7 | sdk_version: 3.50.2
8 | app_file: app.py
9 | pinned: false
10 | license: mit
11 | ---
12 |
13 | # RAGtheDocs
14 |
15 | ## Introduction 📚
16 |
17 | RAGTheDocs is an open-source library that lets you **one-click deploy** retrieval-augmented generation (RAG) on any readthedocs documentation on [huggingface 🤗 spaces](https://huggingface.co/spaces/jerpint/RAGTheDocs)!
18 |
19 | ## Usage 👉
20 |
21 | 1) Go to the [example space](https://huggingface.co/spaces/jerpint/RAGTheDocs)
22 | 2) Duplicate the space:
23 |
24 | 
25 |
26 | 3) Set your environment variables:
27 | * `OPENAI_API_KEY` (required): Needed for the app to work, e.g. `sk-...`
28 | * `READTHEDOCS_URL` (required): The URL of the documentation site you want to scrape (must be built with
29 | sphinx/readthedocs), e.g. `https://orion.readthedocs.io`
30 | * `READTHEDOCS_VERSION` (optional): The docs version to scrape. This matters if multiple versions exist (e.g. `en/v0.2.7` or `en/latest`). If left empty, all available versions will be scraped (there can be many for open-source projects!).
31 |
32 | ## Features 🚀
33 |
34 | - **Web scraping and embeddings:** RAGtheDocs automatically scrapes and embeds documentation from any website generated by ReadTheDocs/Sphinx, using OpenAI embeddings.
35 |
36 | - **RAG interface:** It comes with a built-in Gradio UI for users to interact with [Buster 🤖](https://github.com/jerpint/buster), our RAG agent.
37 |
38 | - **Customization options:** Tailor RAGtheDocs prompts and behavior through its configurable settings.
39 |
40 | ## Disclaimers ❗
41 |
42 | * This is a quickly hacked-together side project. This code should be considered experimental at best.
43 |
44 | * This library will automatically call OpenAI APIs on your behalf (for embeddings and ChatGPT).
45 |
46 | * Use at your own risk! ⚠️
47 |
48 |
--------------------------------------------------------------------------------
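The steps above reduce to a few lines of Python. A minimal sketch of the flow, assuming this repo's modules, the dependencies from `requirements.txt`, and an `OPENAI_API_KEY` available in the environment:

```python
# Minimal sketch of the RAGTheDocs flow (assumes this repo's modules and an OPENAI_API_KEY env var).
from embed_docs import embed_documents
from cfg import buster_cfg, setup_buster

# 1) Scrape a readthedocs site and embed its pages (only needed once; artifacts land in outputs/).
embed_documents(
    homepage_url="https://orion.readthedocs.io",
    save_directory="outputs/",
    target_version="en/stable",
)

# 2) Build the RAG agent from the default config and ask it a question.
buster = setup_buster(buster_cfg)
completion = buster.process_input("How can I install the library?")
print("".join(completion.answer_generator))  # the answer streams token by token
```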
/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Optional, Tuple
3 |
4 | import gradio as gr
5 | import pandas as pd
6 | from buster.completers import Completion
7 |
8 | # from embed_docs import embed_rtd_website
9 | # from rtd_scraper.scrape_rtd import scrape_rtd
10 | from embed_docs import embed_documents
11 | import cfg
12 | from cfg import setup_buster
13 |
14 | # Typehint for chatbot history
15 | ChatHistory = list[list[Optional[str], Optional[str]]]
16 |
17 |
18 | # Because this is a one-click deploy app, we rely on environment variables being set
19 | openai_api_key = os.getenv("OPENAI_API_KEY") # Mandatory for app to work
20 | readthedocs_url = os.getenv("READTHEDOCS_URL") # Mandatory for app to work as intended
21 | readthedocs_version = os.getenv("READTHEDOCS_VERSION")
22 |
23 | if openai_api_key is None:
24 | print(
25 | "Warning: No OPENAI_API_KEY detected. Set it with 'export OPENAI_API_KEY=sk-...'."
26 | )
27 |
28 | if readthedocs_url is None:
29 | raise ValueError(
30 | "No READTHEDOCS_URL detected. Set it with e.g. 'export READTHEDOCS_URL=https://orion.readthedocs.io/'"
31 | )
32 |
33 | if readthedocs_version is None:
34 | print(
35 | """
36 | Warning: No READTHEDOCS_VERSION detected. If multiple versions of the docs exist, they will all be scraped.
37 | Set it with e.g. 'export READTHEDOCS_VERSION=en/stable'
38 | """
39 | )
40 |
41 |
42 | # Override save_directory to store the scraped docs and vector store elsewhere
43 | save_directory = "outputs/"
44 |
45 | # Scrape and embed content from the readthedocs website.
46 | # Embedding is only needed the first time the app runs; comment this out to skip it.
47 | embed_documents(
48 | homepage_url=readthedocs_url,
49 | save_directory=save_directory,
50 | target_version=readthedocs_version,
51 | )
52 |
53 | # Setup RAG agent
54 | buster = setup_buster(cfg.buster_cfg)
55 |
56 |
57 | # Setup Gradio app
58 | def add_user_question(
59 | user_question: str, chat_history: Optional[ChatHistory] = None
60 | ) -> ChatHistory:
61 | """Adds a user's question to the chat history.
62 |
63 | If no history is provided, a new one is created with the user's question as its first entry.
64 | """
65 | if chat_history is None:
66 | chat_history = []
67 | chat_history.append([user_question, None])
68 | return chat_history
69 |
70 |
71 | def format_sources(matched_documents: pd.DataFrame) -> str:
72 | if len(matched_documents) == 0:
73 | return ""
74 |
75 | matched_documents.similarity_to_answer = (
76 | matched_documents.similarity_to_answer * 100
77 | )
78 |
79 | # drop duplicate pages (by title), keep highest ranking ones
80 | matched_documents = matched_documents.sort_values(
81 | "similarity_to_answer", ascending=False
82 | ).drop_duplicates("title", keep="first")
83 |
84 | documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
85 | document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"
86 |
87 | documents = "\n".join(
88 | [
89 | document_template.format(document=document)
90 | for _, document in matched_documents.iterrows()
91 | ]
92 | )
93 | footnote: str = "I'm a bot 🤖 and not always perfect."
94 |
95 | return documents_answer_template.format(documents=documents, footnote=footnote)
96 |
97 |
98 | def add_sources(history, completion):
99 | if completion.answer_relevant:
100 | formatted_sources = format_sources(completion.matched_documents)
101 | history.append([None, formatted_sources])
102 |
103 | return history
104 |
105 |
106 | def chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]:
107 | """Answer a user's question using retrieval augmented generation."""
108 |
109 | # We assume that the question is the user's last interaction
110 | user_input = chat_history[-1][0]
111 |
112 | # Do retrieval + augmented generation with buster
113 | completion = buster.process_input(user_input)
114 |
115 | # Stream tokens one at a time to the user
116 | chat_history[-1][1] = ""
117 | for token in completion.answer_generator:
118 | chat_history[-1][1] += token
119 |
120 | yield chat_history, completion
121 |
122 |
123 | demo = gr.Blocks()
124 | with demo:
125 | with gr.Row():
126 | gr.Markdown("<h1><center>RAGTheDocs</center></h1>")
127 |
128 | gr.Markdown(
129 | """
130 | ## About
131 | [RAGTheDocs](https://github.com/jerpint/RAGTheDocs) allows you to ask questions about any documentation hosted on readthedocs.
132 | Simply duplicate this space and set the environment variables:
133 |
134 | * `OPENAI_API_KEY` (required): Needed for the app to work, e.g. `sk-...`
135 | * `READTHEDOCS_URL` (required): The URL of the documentation site you want to scrape (must be built with
136 | sphinx/readthedocs), e.g. `https://orion.readthedocs.io`
137 | * `READTHEDOCS_VERSION` (optional): The docs version to scrape. This matters if multiple versions exist (e.g. `en/v0.2.7` or `en/latest`). If left empty, all available versions will be scraped (there can be many for open-source projects!).
138 |
139 | Try it out by asking a question below 👇 about [orion](https://orion.readthedocs.io/), an open-source hyperparameter optimization library.
140 |
141 | ## How it works
142 | This app uses [Buster 🤖](https://github.com/jerpint/buster) and ChatGPT to search the docs for relevant info and
143 | answer questions.
144 | View the code on the [project homepage](https://github.com/jerpint/RAGTheDocs)
145 | """
146 | )
147 |
148 | chatbot = gr.Chatbot()
149 |
150 | with gr.Row():
151 | question = gr.Textbox(
152 | label="What's your question?",
153 | placeholder="Type your question here...",
154 | lines=1,
155 | )
156 | submit = gr.Button(value="Send", variant="secondary")
157 |
158 | examples = gr.Examples(
159 | examples=[
160 | "How can I install the library?",
161 | "What dependencies are required?",
162 | "Give a brief overview of the library.",
163 | ],
164 | inputs=question,
165 | )
166 |
167 | response = gr.State()
168 |
169 | # fmt: off
170 | gr.on(
171 | triggers=[submit.click, question.submit],
172 | fn=add_user_question,
173 | inputs=[question],
174 | outputs=[chatbot]
175 | ).then(
176 | chat,
177 | inputs=[chatbot],
178 | outputs=[chatbot, response]
179 | ).then(
180 | add_sources,
181 | inputs=[chatbot, response],
182 | outputs=[chatbot]
183 | )
184 |
185 |
186 | demo.queue(concurrency_count=8)
187 | demo.launch(share=False)
188 |
--------------------------------------------------------------------------------
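`app.py` reads all of its configuration from environment variables at import time. A sketch of setting them programmatically before running it (placeholder values; on Hugging Face Spaces these are typically set as Space secrets instead):

```python
# Sketch: the environment variables app.py expects, with placeholder values.
import os

os.environ["OPENAI_API_KEY"] = "sk-..."                          # required: OpenAI key for embeddings + ChatGPT
os.environ["READTHEDOCS_URL"] = "https://orion.readthedocs.io"   # required: readthedocs site to scrape
os.environ["READTHEDOCS_VERSION"] = "en/stable"                  # optional: omit to scrape every version
```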
/cfg.py:
--------------------------------------------------------------------------------
1 | from buster.busterbot import Buster, BusterConfig
2 | from buster.completers import ChatGPTCompleter, DocumentAnswerer
3 | from buster.formatters.documents import DocumentsFormatterJSON
4 | from buster.formatters.prompts import PromptFormatter
5 | from buster.retriever import DeepLakeRetriever, Retriever
6 | from buster.tokenizers import GPTTokenizer
7 | from buster.validators import QuestionAnswerValidator, Validator
8 |
9 | buster_cfg = BusterConfig(
10 | retriever_cfg={
11 | "path": "outputs/deeplake_store",
12 | "top_k": 3,
13 | "thresh": 0.7,
14 | "max_tokens": 2000,
15 | "embedding_model": "text-embedding-ada-002",
16 | },
17 | documents_answerer_cfg={
18 | "no_documents_message": "No documents are available for this question.",
19 | },
20 | completion_cfg={
21 | "completion_kwargs": {
22 | "model": "gpt-3.5-turbo",
23 | "stream": True,
24 | "temperature": 0,
25 | },
26 | },
27 | tokenizer_cfg={
28 | "model_name": "gpt-3.5-turbo",
29 | },
30 | documents_formatter_cfg={
31 | "max_tokens": 3500,
32 | "columns": ["content", "title", "source"],
33 | },
34 | prompt_formatter_cfg={
35 | "max_tokens": 3500,
36 | "text_before_docs": (
37 | "You are a chatbot assistant answering technical questions about artificial intelligence (AI)."
38 | "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
39 | "If the answer is in the documentation, summarize it in a helpful way to the user. "
40 | "If it isn't, simply reply that you cannot answer the question. "
41 | "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
42 | "Here is the documentation:\n"
43 | ),
44 | "text_after_docs": (
45 | "REMEMBER:\n"
46 | "You are a chatbot assistant answering technical questions about artificial intelligence (AI)."
47 | "Here are the rules you must follow:\n"
48 | "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
49 | "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
50 | "3) Do not reference any links, urls or hyperlinks in your answers.\n"
51 | "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
52 | "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
53 | "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'"
54 | "For example:\n"
55 | "What is the meaning of life for an qa bot?\n"
56 | "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?"
57 | "Now answer the following question:\n"
58 | ),
59 | },
60 | validator_cfg={
61 | "unknown_response_templates": [
62 | "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
63 | ],
64 | "unknown_threshold": 0.85,
65 | "embedding_model": "text-embedding-ada-002",
66 | "use_reranking": True,
67 | "invalid_question_response": "This question does not seem relevant to my current knowledge. If you think this is a mistake, you can modify the question validation prompt.",
68 | "check_question_prompt": """You are an chatbot answering questions on python libraries hosted on readthedocs.
69 |
70 | Your job is to determine wether or not a question is valid, and should be answered.
71 | A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
72 |
73 | For example:
74 |
75 | Q: How can I install the library?
76 | true
77 |
78 | Q: What is the meaning of life?
79 | false
80 |
81 | A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
82 | "completion_kwargs": {
83 | "model": "gpt-3.5-turbo",
84 | "stream": False,
85 | "temperature": 0,
86 | },
87 | },
88 | )
89 |
90 |
91 | def setup_buster(buster_cfg: BusterConfig):
92 | """initialize buster with a buster_cfg class"""
93 | retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
94 | tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
95 | document_answerer: DocumentAnswerer = DocumentAnswerer(
96 | completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
97 | documents_formatter=DocumentsFormatterJSON(
98 | tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
99 | ),
100 | prompt_formatter=PromptFormatter(
101 | tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
102 | ),
103 | **buster_cfg.documents_answerer_cfg,
104 | )
105 | validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
106 | buster: Buster = Buster(
107 | retriever=retriever, document_answerer=document_answerer, validator=validator
108 | )
109 | return buster
110 |
--------------------------------------------------------------------------------
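The README's customization options boil down to editing `buster_cfg` before passing it to `setup_buster`. A sketch of overriding a few fields, assuming the default config above (the model name swap is just an example):

```python
# Sketch: customize the default buster_cfg before building the agent (keys shown exist in cfg.py).
import copy

from cfg import buster_cfg, setup_buster

custom_cfg = copy.deepcopy(buster_cfg)
custom_cfg.retriever_cfg["top_k"] = 5                                # retrieve more documents per question
custom_cfg.retriever_cfg["thresh"] = 0.6                             # loosen the similarity threshold
custom_cfg.completion_cfg["completion_kwargs"]["model"] = "gpt-4"    # example: swap the chat model
custom_cfg.prompt_formatter_cfg["text_before_docs"] = (
    "You are a chatbot answering technical questions about the Orion library. "
    "Only answer using the documentation provided below:\n"
)

buster = setup_buster(custom_cfg)
```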
/embed_docs.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | from buster.docparser import get_all_documents
5 | from buster.documents_manager import DeepLakeDocumentsManager
6 | from buster.parser import SphinxParser
7 |
8 | from rtd_scraper.scrape_rtd import sanitize_url, run_spider
9 |
10 | # Scrapy seems to set every logger to DEBUG, so dial them all back to INFO here.
11 | for name in logging.root.manager.loggerDict:
12 | logger = logging.getLogger(name)
13 | logger.setLevel(logging.INFO)
14 |
15 |
16 | def embed_documents(homepage_url, save_directory, target_version=None):
17 | # adds https:// and trailing slash
18 | homepage_url = sanitize_url(homepage_url)
19 |
20 | # Crawl the website using scrapy
21 | run_spider(
22 | homepage_url, save_directory=save_directory, target_version=target_version
23 | )
24 |
25 | # Convert the .html pages into chunks using Buster's SphinxParser
26 | # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
27 | root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
28 | df = get_all_documents(
29 | root_dir=root_dir,
30 | base_url=homepage_url,
31 | parser_cls=SphinxParser,
32 | min_section_length=100,
33 | max_section_length=1000,
34 | )
35 | df["source"] = "readthedocs" # Add the source column
36 |
37 | # Initialize the DeepLake vector store
38 | vector_store_path = os.path.join(save_directory, "deeplake_store")
39 | dm = DeepLakeDocumentsManager(
40 | vector_store_path=vector_store_path,
41 | overwrite=True,
42 | required_columns=["url", "content", "source", "title"],
43 | )
44 |
45 | # Add all embeddings to the vector store
46 | dm.batch_add(
47 | df=df,
48 | batch_size=3000,
49 | min_time_interval=60,
50 | num_workers=32,
51 | )
52 |
53 |
54 | if __name__ == "__main__":
55 | homepage_url = "https://orion.readthedocs.io/"
56 | target_version = "v0.2.7"
57 | save_directory = "outputs/"
58 | embed_documents(
59 | homepage_url=homepage_url,
60 | target_version=target_version,
61 | save_directory=save_directory,
62 | )
63 |
--------------------------------------------------------------------------------
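`embed_documents` is the only call needed to index a different readthedocs project. A usage sketch (the target URL is hypothetical and must point at a Sphinx-built site):

```python
# Sketch: index another Sphinx/readthedocs site (hypothetical target).
from embed_docs import embed_documents

embed_documents(
    homepage_url="buster.readthedocs.io",   # sanitize_url() adds https:// and the trailing slash
    save_directory="outputs/",
    target_version="en/latest",             # or None to crawl every published version
)
# Chunks are embedded via OpenAI and stored in outputs/deeplake_store,
# the same path cfg.py's retriever_cfg points at.
```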
/requirements.txt:
--------------------------------------------------------------------------------
1 | buster-doctalk
2 | scrapy
3 |
--------------------------------------------------------------------------------
/rtd_scraper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerpint/RAGTheDocs/1895d54d811a9a21cdc9a6a0d872286bea1d8585/rtd_scraper/__init__.py
--------------------------------------------------------------------------------
/rtd_scraper/scrape_rtd.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | from scrapy.crawler import CrawlerProcess
5 | from scrapy.utils.project import get_project_settings
6 |
7 | from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider, sanitize_url
8 |
9 | # Scrapy seems to set every logger to DEBUG, so dial them all back to INFO here.
10 | for name in logging.root.manager.loggerDict:
11 | logger = logging.getLogger(name)
12 | logger.setLevel(logging.INFO)
13 |
14 |
15 | def run_spider(homepage_url, save_directory, target_version=None):
16 | process = CrawlerProcess(settings=get_project_settings())
17 | process.crawl(
18 | DocsSpider,
19 | homepage_url=homepage_url,
20 | save_dir=save_directory,
21 | target_version=target_version,
22 | )
23 |
24 | # Start the crawling process
25 | process.start()
26 |
27 | # To stop the crawling process gracefully
28 | process.stop()
29 |
--------------------------------------------------------------------------------
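`run_spider` can also be used on its own to scrape without embedding. A sketch (note that Scrapy's `CrawlerProcess` and its Twisted reactor can only be started once per Python process):

```python
# Sketch: run the docs spider on its own; raw .html files land under outputs/<domain>/...
from rtd_scraper.scrape_rtd import run_spider

run_spider(
    homepage_url="https://orion.readthedocs.io/",
    save_directory="outputs/",
    target_version="en/stable",   # None follows links to every published version
)
# process.start() inside run_spider blocks until the crawl finishes.
```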
/rtd_scraper/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = tutorial.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = tutorial
12 |
--------------------------------------------------------------------------------
/rtd_scraper/tutorial/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jerpint/RAGTheDocs/1895d54d811a9a21cdc9a6a0d872286bea1d8585/rtd_scraper/tutorial/__init__.py
--------------------------------------------------------------------------------
/rtd_scraper/tutorial/middlewares.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your spider middleware
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5 |
6 | # useful for handling different item types with a single interface
7 | from itemadapter import ItemAdapter, is_item
8 | from scrapy import signals
9 |
10 |
11 | class TutorialSpiderMiddleware:
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, or item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Request or item objects.
43 | pass
44 |
45 | def process_start_requests(self, start_requests, spider):
46 | # Called with the start requests of the spider, and works
47 | # similarly to the process_spider_output() method, except
48 | # that it doesn’t have a response associated.
49 |
50 | # Must return only requests (not items).
51 | for r in start_requests:
52 | yield r
53 |
54 | def spider_opened(self, spider):
55 | spider.logger.info("Spider opened: %s" % spider.name)
56 |
57 |
58 | class TutorialDownloaderMiddleware:
59 | # Not all methods need to be defined. If a method is not defined,
60 | # scrapy acts as if the downloader middleware does not modify the
61 | # passed objects.
62 |
63 | @classmethod
64 | def from_crawler(cls, crawler):
65 | # This method is used by Scrapy to create your spiders.
66 | s = cls()
67 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
68 | return s
69 |
70 | def process_request(self, request, spider):
71 | # Called for each request that goes through the downloader
72 | # middleware.
73 |
74 | # Must either:
75 | # - return None: continue processing this request
76 | # - or return a Response object
77 | # - or return a Request object
78 | # - or raise IgnoreRequest: process_exception() methods of
79 | # installed downloader middleware will be called
80 | return None
81 |
82 | def process_response(self, request, response, spider):
83 | # Called with the response returned from the downloader.
84 |
85 | # Must either:
86 | # - return a Response object
87 | # - return a Request object
88 | # - or raise IgnoreRequest
89 | return response
90 |
91 | def process_exception(self, request, exception, spider):
92 | # Called when a download handler or a process_request()
93 | # (from other downloader middleware) raises an exception.
94 |
95 | # Must either:
96 | # - return None: continue processing this exception
97 | # - return a Response object: stops process_exception() chain
98 | # - return a Request object: stops process_exception() chain
99 | pass
100 |
101 | def spider_opened(self, spider):
102 | spider.logger.info("Spider opened: %s" % spider.name)
103 |
--------------------------------------------------------------------------------
/rtd_scraper/tutorial/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for tutorial project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | from scrapy.utils.log import configure_logging
11 |
12 | # Disable default Scrapy log settings.
13 | configure_logging(install_root_handler=False)
14 | BOT_NAME = "tutorial"
15 |
16 | SPIDER_MODULES = ["rtd_scraper.tutorial.spiders"]
17 | NEWSPIDER_MODULE = "rtd_scraper.tutorial.spiders"
18 |
19 | # SPIDER_MODULES = ["tutorial.spiders"]
20 | # NEWSPIDER_MODULE = "tutorial.spiders"
21 |
22 | LOG_ENABLED = False
23 | LOG_LEVEL = "INFO"
24 |
25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
26 | # USER_AGENT = "tutorial (+http://www.yourdomain.com)"
27 |
28 | # Obey robots.txt rules
29 | ROBOTSTXT_OBEY = True
30 |
31 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
32 | # CONCURRENT_REQUESTS = 32
33 |
34 | # Configure a delay for requests for the same website (default: 0)
35 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
36 | # See also autothrottle settings and docs
37 | # DOWNLOAD_DELAY = 3
38 | # The download delay setting will honor only one of:
39 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
40 | # CONCURRENT_REQUESTS_PER_IP = 16
41 |
42 | # Disable cookies (enabled by default)
43 | # COOKIES_ENABLED = False
44 |
45 | # Disable Telnet Console (enabled by default)
46 | # TELNETCONSOLE_ENABLED = False
47 |
48 | # Override the default request headers:
49 | # DEFAULT_REQUEST_HEADERS = {
50 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
51 | # "Accept-Language": "en",
52 | # }
53 |
54 | # Enable or disable spider middlewares
55 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
56 | # SPIDER_MIDDLEWARES = {
57 | # "tutorial.middlewares.TutorialSpiderMiddleware": 543,
58 | # }
59 |
60 | # Enable or disable downloader middlewares
61 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
62 | # DOWNLOADER_MIDDLEWARES = {
63 | # "tutorial.middlewares.TutorialDownloaderMiddleware": 543,
64 | # }
65 |
66 | # Enable or disable extensions
67 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
68 | # EXTENSIONS = {
69 | # "scrapy.extensions.telnet.TelnetConsole": None,
70 | # }
71 |
72 | # Configure item pipelines
73 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
74 | # ITEM_PIPELINES = {
75 | # "tutorial.pipelines.TutorialPipeline": 300,
76 | # }
77 |
78 | # Enable and configure the AutoThrottle extension (disabled by default)
79 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
80 | # AUTOTHROTTLE_ENABLED = True
81 | # The initial download delay
82 | # AUTOTHROTTLE_START_DELAY = 5
83 | # The maximum download delay to be set in case of high latencies
84 | # AUTOTHROTTLE_MAX_DELAY = 60
85 | # The average number of requests Scrapy should be sending in parallel to
86 | # each remote server
87 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 | # Enable showing throttling stats for every response received:
89 | # AUTOTHROTTLE_DEBUG = False
90 |
91 | # Enable and configure HTTP caching (disabled by default)
92 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 | # HTTPCACHE_ENABLED = True
94 | # HTTPCACHE_EXPIRATION_SECS = 0
95 | # HTTPCACHE_DIR = "httpcache"
96 | # HTTPCACHE_IGNORE_HTTP_CODES = []
97 | # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
98 |
99 | # Set settings whose default value is deprecated to a future-proof value
100 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
101 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
102 | FEED_EXPORT_ENCODING = "utf-8"
103 |
--------------------------------------------------------------------------------
/rtd_scraper/tutorial/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/rtd_scraper/tutorial/spiders/docs_spider.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from urllib.parse import urlparse
4 |
5 | import scrapy
6 |
7 | logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)
8 |
9 | from urllib.parse import urlparse
10 |
11 |
12 | def extract_domain(url):
13 | """
14 | Extract the domain (including subdomains) from a given URL.
15 |
16 | Args:
17 | - url (str): The URL from which the domain needs to be extracted.
18 |
19 | Returns:
20 | - str: The domain (with subdomains) extracted from the URL.
21 | For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'.
22 |
23 | """
24 | parsed_uri = urlparse(url)
25 | # The netloc attribute will contain the domain name
26 | domain = parsed_uri.netloc
27 | return domain
28 |
29 |
30 | def sanitize_url(url: str) -> str:
31 | """Adds https:// and trailing backslash."""
32 | if not url.startswith("https://"):
33 | url = "https://" + url
34 |
35 | if not url.endswith("/"):
36 | url = url + "/"
37 | return url
38 |
39 |
40 | class DocsSpider(scrapy.Spider):
41 | name = "docs"
42 |
43 | def __init__(
44 | self,
45 | homepage_url: str,
46 | save_dir="outputs/",
47 | target_version=None,
48 | *args,
49 | **kwargs,
50 | ):
51 | super(DocsSpider, self).__init__(*args, **kwargs)
52 |
53 | homepage_url = sanitize_url(homepage_url)
54 |
55 | self.allowed_domains = [extract_domain(homepage_url)]
56 | self.start_urls = [homepage_url]
57 | self.base_dir = Path(save_dir)
58 | self.target_version = target_version
59 |
60 | def parse(self, response):
61 | parsed_uri = urlparse(response.url)
62 | # Create a Path from the parsed URL. If it ends with '/', we add 'index.html' as the filename.
63 | if parsed_uri.path.endswith("/"):
64 | filepath = (
65 | self.base_dir
66 | / parsed_uri.netloc
67 | / parsed_uri.path.strip("/")
68 | / "index.html"
69 | )
70 | else:
71 | filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
72 | filepath.parent.mkdir(parents=True, exist_ok=True)
73 |
74 | with open(filepath, "wb") as f:
75 | f.write(response.body)
76 |
77 | # Follow links to other documentation pages; if a target version is set, only follow URLs containing it
78 | for href in response.css("a::attr(href)").getall():
79 | if self.target_version:
80 | # A version was specified, check to see if it's the correct version from url
81 | full_url = response.urljoin(href) # Expand href to a full URL
82 | if self.target_version in full_url:
83 | yield response.follow(href, self.parse)
84 | else:
85 | # no version specified, follow all links
86 | yield response.follow(href, self.parse)
87 |
--------------------------------------------------------------------------------
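The two URL helpers are easy to sanity-check; the expected behaviour, based on the implementations above:

```python
# Quick checks of the URL helpers defined in docs_spider.py.
from rtd_scraper.tutorial.spiders.docs_spider import extract_domain, sanitize_url

assert sanitize_url("orion.readthedocs.io") == "https://orion.readthedocs.io/"
assert sanitize_url("https://orion.readthedocs.io/") == "https://orion.readthedocs.io/"
assert extract_domain("https://orion.readthedocs.io/en/stable/install/core.html") == "orion.readthedocs.io"
```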