├── .gitignore
├── LICENSE
├── README.md
├── app.py
├── ingest_data.py
├── query_data.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Harrison Chase
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Chat-LangChain-ReadTheDocs
 2 | 
 3 | Create a ChatGPT like experience over your ReadTheDocs using [LangChain](https://github.com/hwchase17/langchain).
 4 | 
 5 | 
 6 | ## 📊 Example Data
 7 | This repo uses the [LangChain Documentation](https://langchain.readthedocs.io/en/latest/) as an example.
 8 | 
 9 | ## 🧑 Instructions for ingesting your own ReadTheDocs documentation
10 | 
11 | Run the following command to download html for a given website. Replace `https://langchain.readthedocs.io/en/latest/` with a URL to your website.
12 | 
13 | ```shell
14 | wget -r -A.html https://langchain.readthedocs.io/en/latest/
15 | ```
16 | 
17 | ## Ingest data
18 | 
19 | The only thing that is needed is to be done to ingest data is run `python ingest_data.py`
20 | 
21 | ## Query data
22 | Custom prompts are used to ground the answers in LangChain Documentation files.
23 | 
24 | ## Running the Application
25 | 
26 | By running `python app.py` from the command line you can easily interact with your ChatGPT over your own data.
27 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | from query_data import get_chain
 3 | 
 4 | 
 5 | if __name__ == "__main__":
 6 |     with open("vectorstore.pkl", "rb") as f:
 7 |         vectorstore = pickle.load(f)
 8 |     qa_chain = get_chain(vectorstore)
 9 |     chat_history = []
10 |     print("Chat with your docs!")
11 |     while True:
12 |         print("Human:")
13 |         question = input()
14 |         result = qa_chain({"question": question, "chat_history": chat_history})
15 |         chat_history.append((question, result["answer"]))
16 |         print("AI:")
17 |         print(result["answer"])
18 | 


--------------------------------------------------------------------------------
/ingest_data.py:
--------------------------------------------------------------------------------
 1 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 2 | from langchain.document_loaders import ReadTheDocsLoader
 3 | from langchain.vectorstores.faiss import FAISS
 4 | from langchain.embeddings import OpenAIEmbeddings
 5 | import pickle
 6 | 
 7 | # Load Data
 8 | loader = ReadTheDocsLoader("langchain.readthedocs.io")
 9 | raw_documents = loader.load()
10 | 
11 | # Split text
12 | text_splitter = RecursiveCharacterTextSplitter()
13 | documents = text_splitter.split_documents(raw_documents)
14 | 
15 | 
16 | # Load Data to vectorstore
17 | embeddings = OpenAIEmbeddings()
18 | vectorstore = FAISS.from_documents(documents, embeddings)
19 | 
20 | 
21 | # Save vectorstore
22 | with open("vectorstore.pkl", "wb") as f:
23 |     pickle.dump(vectorstore, f)
24 | 


--------------------------------------------------------------------------------
/query_data.py:
--------------------------------------------------------------------------------
 1 | from langchain.prompts.prompt import PromptTemplate
 2 | from langchain.llms import OpenAI
 3 | from langchain.chains import ChatVectorDBChain
 4 | 
 5 | _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
 6 | You can assume the question about LangChain.
 7 | 
 8 | Chat History:
 9 | {chat_history}
10 | Follow Up Input: {question}
11 | Standalone question:"""
12 | CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
13 | 
14 | template = """You are an AI assistant for the open source library LangChain. The documentation is located at https://langchain.readthedocs.io.
15 | You are given the following extracted parts of a long document and a question. Provide a conversational answer with a hyperlink to the documentation.
16 | You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed.
17 | If the question includes a request for code, provide a code block directly from the documentation.
18 | If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
19 | If the question is not about LangChain, politely inform them that you are tuned to only answer questions about LangChain.
20 | 
21 | Question: {question}
22 | =========
23 | {context}
24 | =========
25 | Answer in Markdown:"""
26 | QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
27 | 
28 | 
29 | def get_chain(vectorstore):
30 |     llm = OpenAI(temperature=0)
31 |     qa_chain = ChatVectorDBChain.from_llm(
32 |         llm,
33 |         vectorstore,
34 |         qa_prompt=QA_PROMPT,
35 |         condense_question_prompt=CONDENSE_QUESTION_PROMPT,
36 |     )
37 |     return qa_chain
38 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | langchain
2 | openai
3 | unstructured
4 | faiss-cpu
5 | bs4
6 | 


--------------------------------------------------------------------------------