├── .env.template
├── .gitignore
├── .streamlit
│   └── config.toml
├── Dockerfile
├── LICENSE
├── README.md
├── app.py
├── datachad
│   ├── __init__.py
│   ├── backend
│   │   ├── __init__.py
│   │   ├── chain.py
│   │   ├── constants.py
│   │   ├── deeplake.py
│   │   ├── io.py
│   │   ├── jobs.py
│   │   ├── loader.py
│   │   ├── logging.py
│   │   ├── models.py
│   │   ├── prompts.py
│   │   └── utils.py
│   └── streamlit
│       ├── __init__.py
│       ├── constants.py
│       ├── helper.py
│       └── widgets.py
├── packages.txt
├── requirements.txt
└── static
    ├── datachadV1.png
    ├── datachadV2.png
    └── datachadV3.png
/.env.template:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=your openai key
2 | ACTIVELOOP_TOKEN=your activeloop key
3 | ACTIVELOOP_ID=your activeloop organization name
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | stores
2 | data
3 | models
4 | __pycache__
5 | .env
6 | .ipynb_checkpoints
7 | .DS_Store
8 | testing.ipynb
9 | .vscode
10 | .venv
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [server]
2 | fileWatcherType = "poll"
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim
2 |
3 | WORKDIR /app
4 |
5 | COPY requirements.txt requirements.txt
6 | COPY packages.txt packages.txt
7 |
8 | # we need to install the packages without versions
9 | # to ensure compatibility with apple ARM devices
10 | RUN sed -i 's/==.*//' requirements.txt
11 |
12 | RUN pip install --upgrade pip
13 | RUN pip install -r requirements.txt
14 | RUN rm -rf /root/.cache/pip
15 |
16 | RUN apt-get update
17 | RUN xargs apt-get -y install < packages.txt
18 |
19 | COPY datachad datachad
20 | COPY app.py app.py
21 |
22 | ARG port=80
23 | ENV STREAMLIT_SERVER_PORT=${port}
24 | ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
25 | EXPOSE ${port}
26 |
27 | CMD ["streamlit", "run", "app.py"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2023 Gustav von Zitzewitz
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DataChad V3🤖
2 |
3 | This is an app that lets you ask questions about any data source by leveraging [embeddings](https://platform.openai.com/docs/guides/embeddings), [vector databases](https://www.activeloop.ai/), [large language models](https://platform.openai.com/docs/models/gpt-3-5) and last but not least [langchains](https://github.com/hwchase17/langchain)
4 |
5 | ## How does it work?
6 |
7 | 1. Upload any `file(s)` or enter any `path` or `url` to create Knowledge Bases, which can contain multiple files of any type, format and content, or Smart FAQs, which are lists of curated, numbered Q&As.
8 | 2. The data source or files are loaded and split into text document chunks
9 | 3. The text document chunks are embedded using OpenAI or Hugging Face embeddings
10 | 4. The embeddings are stored as a vector dataset in Activeloop's database hub
11 | 5. A langchain is created, consisting of a custom selection of an LLM (`gpt-3.5-turbo` by default), multiple vector stores as knowledge bases and a single special smart FAQ vector store
12 | 6. When asking questions to the app, the chain embeds the input prompt, does a similarity search in the provided vector stores and uses the best results as context for the LLM to generate an appropriate response (see the sketch below)
13 | 7. Finally, the chat history is cached locally to enable a [ChatGPT](https://chat.openai.com/) like Q&A conversation
14 |
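The steps above map onto the helpers in `datachad/backend/jobs.py`. Below is a minimal sketch of how they compose outside of Streamlit, assuming the default settings from `datachad/backend/constants.py`; the credential values and the knowledge base name are placeholders, and the final `chain.run` call may differ from how the Streamlit widgets invoke the chain.

```python
# Minimal sketch of the backend flow (not the exact Streamlit wiring).
from langchain_community.chat_message_histories import ChatMessageHistory

from datachad.backend.constants import (
    CHUNK_OVERLAP_PCT,
    CHUNK_SIZE,
    DISTANCE_METRIC,
    K_FETCH_K_RATIO,
    MAX_TOKENS,
    MAXIMAL_MARGINAL_RELEVANCE,
    TEMPERATURE,
)
from datachad.backend.jobs import create_chain, create_vector_store
from datachad.backend.models import MODELS, STORES

credentials = {
    "openai_api_key": "sk-...",   # placeholder
    "activeloop_token": "...",    # placeholder
    "activeloop_id": "your-org",  # placeholder
}
options = {
    "model": MODELS.GPT35TURBO,
    "temperature": TEMPERATURE,
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap_pct": CHUNK_OVERLAP_PCT,
    "max_tokens": MAX_TOKENS,
    "maximal_marginal_relevance": MAXIMAL_MARGINAL_RELEVANCE,
    "distance_metric": DISTANCE_METRIC,
    "k_fetch_k_ratio": K_FETCH_K_RATIO,
}

# Steps 1-4: load, split, embed and store a data source as a knowledge base
knowledge_base = create_vector_store(
    data_source="https://github.com/gustavz/DataChad.git",
    files=[],
    store_type=STORES.KNOWLEDGE_BASE,
    name="datachad-repo",
    options=options,
    credentials=credentials,
)

# Steps 5-7: build the multi-retriever chain and ask a question
chain = create_chain(
    use_vanilla_llm=True,
    knowledge_bases=[knowledge_base.dataset_path],
    smart_faq=None,
    chat_history=ChatMessageHistory(),
    options=options,
    credentials=credentials,
)
answer = chain.run({"question": "What is DataChad?"})
print(answer)
```
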
15 | ## Good to know
16 | - The app only runs on `py>=3.10`!
17 | - To run locally or deploy somewhere, execute `cp .env.template .env` and set credentials in the newly created `.env` file. Alternatively, set the system environment variables manually, or store the credentials in `.streamlit/secrets.toml` when hosting via Streamlit (see the example below).
18 | - If you have set your credentials as explained above, you can just hit `submit` in the authentication widget without re-entering them in the app.
19 | - If you run the app yourself, consider modifying the configuration in `datachad/backend/constants.py`, e.g. enabling advanced options
20 | - Your data won't load? Feel free to open an Issue or PR and contribute!
21 | - Use previous releases like V1 or V2 for the original functionality and UI
22 |
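For Streamlit hosting, a `.streamlit/secrets.toml` might look like the sketch below, assuming the same variable names as in `.env.template` (all values are placeholders):

```toml
# .streamlit/secrets.toml -- placeholder values, use your own credentials
OPENAI_API_KEY = "sk-..."
ACTIVELOOP_TOKEN = "..."
ACTIVELOOP_ID = "your-activeloop-org"
```
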
23 | ## What does it look like?
24 |
25 |
26 |
27 | ## TODO LIST
28 | If you'd like to contribute, feel free to grab any task:
29 | - [x] Refactor utils, especially the loaders
30 | - [x] Add option to choose model and embeddings
31 | - [x] Enable fully local / private mode
32 | - [x] Add option to upload multiple files to a single dataset
33 | - [x] Decouple datachad modules from streamlit
34 | - [x] Remove all local mode and other V1 stuff
35 | - [x] Load existing knowledge bases
36 | - [x] Delete existing knowledge bases
37 | - [x] Enable streaming responses
38 | - [x] Show retrieved context
39 | - [x] Refactor UI
40 | - [x] Introduce smart FAQs
41 | - [ ] Exchange downloaded file storage with tempfile
42 | - [ ] Add user creation and login
43 | - [ ] Add chat history per user
44 | - [ ] Make all I/O asynchronous
45 | - [ ] Implement FastAPI routes and backend app
46 | - [ ] Implement a proper frontend (react or whatever)
47 | - [ ] Containerize the app
48 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from datachad.streamlit.helper import init_session_state
2 | from datachad.streamlit.widgets import (
3 | authentication_widget,
4 | chat_interface_widget,
5 | data_selection_widget,
6 | data_upload_widget,
7 | init_widgets,
8 | page_header,
9 | usage_widget,
10 | )
11 |
12 | init_session_state()
13 | page_header()
14 | init_widgets()
15 | authentication_widget()
16 | data_upload_widget()
17 | data_selection_widget()
18 | chat_interface_widget()
19 | usage_widget()
20 |
--------------------------------------------------------------------------------
/datachad/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gustavz/DataChad/5b6f9e925301fbd88e506bc30939ddeddabc50e3/datachad/__init__.py
--------------------------------------------------------------------------------
/datachad/backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gustavz/DataChad/5b6f9e925301fbd88e506bc30939ddeddabc50e3/datachad/backend/__init__.py
--------------------------------------------------------------------------------
/datachad/backend/chain.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from langchain.callbacks.manager import CallbackManagerForChainRun, Callbacks
4 | from langchain.chains.base import Chain
5 | from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
6 | from langchain.chains.conversational_retrieval.base import _get_chat_history
7 | from langchain.chains.llm import LLMChain
8 | from langchain.chains.question_answering import load_qa_chain
9 | from langchain.memory import ConversationBufferMemory
10 | from langchain.schema import BaseChatMessageHistory, BasePromptTemplate, BaseRetriever, Document
11 | from langchain.schema.language_model import BaseLanguageModel
12 | from langchain.schema.vectorstore import VectorStore
13 |
14 | from datachad.backend.constants import VERBOSE
15 | from datachad.backend.deeplake import get_or_create_deeplake_vector_store_display_name
16 | from datachad.backend.logging import logger
17 | from datachad.backend.models import get_model
18 | from datachad.backend.prompts import (
19 | CONDENSE_QUESTION_PROMPT,
20 | KNOWLEDGE_BASE_PROMPT,
21 | QA_PROMPT,
22 | SMART_FAQ_PROMPT,
23 | )
24 |
25 |
26 | class MultiRetrieverFAQChain(Chain):
27 | """
28 |     Chain that answers a question using an optional smart FAQ retriever, any number of knowledge base retrievers, and an optional vanilla LLM, concatenating the individual answers into a single response.
29 | """
30 |
31 | output_key: str = "answer"
32 | rephrase_question: bool = True
33 | use_vanilla_llm: bool = True
34 | max_tokens_limit: int
35 | qa_chain: LLMChain
36 | condense_question_chain: LLMChain
37 | knowledge_base_chain: BaseCombineDocumentsChain
38 | knowledge_base_retrievers: list[BaseRetriever]
39 | smart_faq_chain: BaseCombineDocumentsChain
40 | smart_faq_retriever: BaseRetriever | None
41 |
42 | @property
43 | def input_keys(self) -> list[str]:
44 | """Will be whatever keys the prompt expects."""
45 | return ["question", "chat_history"]
46 |
47 | @property
48 | def output_keys(self) -> list[str]:
49 | """Will always return text key."""
50 | return [self.output_key]
51 |
52 | @property
53 | def _chain_type(self) -> str:
54 | return "stuff"
55 |
56 | def _reduce_tokens_below_limit(
57 | self, docs: list[Document], combine_docs_chain: BaseCombineDocumentsChain
58 | ) -> list[Document]:
59 | num_docs = len(docs)
60 |
61 | tokens = [combine_docs_chain.llm_chain.llm.get_num_tokens(doc.page_content) for doc in docs]
62 | token_count = sum(tokens[:num_docs])
63 | while token_count > self.max_tokens_limit:
64 | num_docs -= 1
65 | token_count -= tokens[num_docs]
66 |
67 | return docs[:num_docs]
68 |
69 | def _get_docs(
70 | self,
71 | question: str,
72 | retriever: BaseRetriever,
73 | combine_docs_chain: BaseCombineDocumentsChain,
74 | run_manager: CallbackManagerForChainRun,
75 | ) -> list[Document]:
76 | """Get docs from retriever."""
77 | docs = retriever.get_relevant_documents(question, callbacks=run_manager.get_child())
78 | return self._reduce_tokens_below_limit(docs, combine_docs_chain)
79 |
80 | def _add_text_to_answer(
81 | self, text: str, answer: str, run_manager: CallbackManagerForChainRun
82 | ) -> str:
83 | """Hack to add text to the streaming response handler"""
84 | answer += text
85 | streamhandler = next(
86 | (h for h in run_manager.get_child().handlers if hasattr(h, "stream_text")),
87 | None,
88 | )
89 | if streamhandler:
90 | streamhandler.on_llm_new_token(text)
91 | return answer
92 |
93 | def _call(
94 | self,
95 | inputs: dict[str, Any],
96 | run_manager: CallbackManagerForChainRun | None = None,
97 | ) -> dict[str, str]:
98 | answer = ""
99 | chat_history_str = _get_chat_history(inputs["chat_history"])
100 | run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
101 | # Generate new standalone question if there is a chat history
102 | if chat_history_str and self.rephrase_question:
103 | inputs["question"] = self.condense_question_chain.run(
104 | question=inputs["question"],
105 | chat_history=chat_history_str,
106 | callbacks=run_manager.get_child(),
107 | )
108 | # Answer the question using the FAQ document context
109 | if self.smart_faq_retriever:
110 | docs = self._get_docs(
111 | inputs["question"],
112 | self.smart_faq_retriever,
113 | self.smart_faq_chain,
114 | run_manager=run_manager,
115 | )
116 | smart_faq_name = get_or_create_deeplake_vector_store_display_name(
117 | self.smart_faq_retriever.vectorstore.dataset_path
118 | )
119 | answer = self._add_text_to_answer(
120 | f"\n#### SMART FAQ ANSWER `{smart_faq_name}`\n", answer, run_manager
121 | )
122 | answer += self.smart_faq_chain.run(
123 | input_documents=docs, callbacks=run_manager.get_child(), **inputs
124 | )
125 |
126 | # Answer the question using all provided knowledge bases
127 | for i, retriever in enumerate(self.knowledge_base_retrievers):
128 | docs = self._get_docs(
129 | inputs["question"],
130 | retriever,
131 | self.knowledge_base_chain,
132 | run_manager=run_manager,
133 | )
134 | knowledge_base_name = get_or_create_deeplake_vector_store_display_name(
135 | retriever.vectorstore.dataset_path
136 | )
137 | answer = self._add_text_to_answer(
138 | f"\n#### KNOWLEDGE BASE ANSWER `{knowledge_base_name}`\n",
139 | answer,
140 | run_manager,
141 | )
142 | answer += self.knowledge_base_chain.run(
143 | input_documents=docs, callbacks=run_manager.get_child(), **inputs
144 | )
145 | # Answer the question using
146 | # the general purpose QA chain
147 | if self.use_vanilla_llm:
148 | answer = self._add_text_to_answer("\n#### LLM ANSWER\n", answer, run_manager)
149 | answer += self.qa_chain.run(
150 | question=inputs["question"], callbacks=run_manager.get_child()
151 | )
152 | return {self.output_key: answer}
153 |
154 | @classmethod
155 | def from_llm(
156 | cls,
157 | llm: BaseLanguageModel,
158 | condense_question_prompt: BasePromptTemplate,
159 | smart_faq_prompt: BasePromptTemplate,
160 | knowledge_base_prompt: BasePromptTemplate,
161 | qa_prompt: BasePromptTemplate,
162 | knowledge_base_retrievers: list[BaseRetriever],
163 | smart_faq_retriever: BaseRetriever | None = None,
164 | retriever_llm: BaseLanguageModel | None = None,
165 | condense_question_llm: BaseLanguageModel | None = None,
166 | use_vanilla_llm: bool = True,
167 | callbacks: Callbacks = None,
168 | chain_type: str = "stuff",
169 | verbose: bool = False,
170 | **kwargs: Any,
171 | ) -> Chain:
172 | qa_chain = LLMChain(
173 | llm=llm,
174 | prompt=qa_prompt,
175 | callbacks=callbacks,
176 | verbose=verbose,
177 | )
178 | condense_question_chain = LLMChain(
179 | llm=condense_question_llm or llm,
180 | prompt=condense_question_prompt,
181 | callbacks=callbacks,
182 | verbose=verbose,
183 | )
184 | knowledge_base_chain = load_qa_chain(
185 | llm=retriever_llm or llm,
186 | prompt=knowledge_base_prompt,
187 | chain_type=chain_type,
188 | callbacks=callbacks,
189 | verbose=verbose,
190 | )
191 | smart_faq_chain = load_qa_chain(
192 | llm=retriever_llm or llm,
193 | prompt=smart_faq_prompt,
194 | chain_type=chain_type,
195 | callbacks=callbacks,
196 | verbose=verbose,
197 | )
198 | return cls(
199 | qa_chain=qa_chain,
200 | condense_question_chain=condense_question_chain,
201 | knowledge_base_chain=knowledge_base_chain,
202 | knowledge_base_retrievers=knowledge_base_retrievers,
203 | smart_faq_chain=smart_faq_chain,
204 | smart_faq_retriever=smart_faq_retriever,
205 | use_vanilla_llm=use_vanilla_llm,
206 | callbacks=callbacks,
207 | **kwargs,
208 | )
209 |
210 |
211 | def get_knowledge_base_search_kwargs(options: dict) -> tuple[dict, str]:
212 | k = int(options["max_tokens"] // options["chunk_size"])
213 | fetch_k = k * options["k_fetch_k_ratio"]
214 | if options["maximal_marginal_relevance"]:
215 | search_kwargs = {
216 | "distance_metric": options["distance_metric"],
217 | "fetch_k": fetch_k,
218 | "k": k,
219 | }
220 | search_type = "mmr"
221 | else:
222 | search_kwargs = {
223 | "k": k,
224 | "distance_metric": options["distance_metric"],
225 | }
226 | search_type = "similarity"
227 |
228 | return search_kwargs, search_type
229 |
230 |
231 | def get_smart_faq_search_kwargs(options: dict) -> tuple[dict, str]:
232 | search_kwargs = {
233 | "k": 20,
234 | "distance_metric": options["distance_metric"],
235 | }
236 | search_type = "similarity"
237 | return search_kwargs, search_type
238 |
239 |
240 | def get_multi_chain(
241 | use_vanilla_llm: bool,
242 | knowledge_bases: list[VectorStore],
243 | smart_faq: VectorStore,
244 | chat_history: BaseChatMessageHistory,
245 | options: dict,
246 | credentials: dict,
247 | ) -> MultiRetrieverFAQChain:
248 | kb_search_kwargs, search_type = get_knowledge_base_search_kwargs(options)
249 | kb_retrievers = [
250 | kb.as_retriever(search_type=search_type, search_kwargs=kb_search_kwargs)
251 | for kb in knowledge_bases
252 | ]
253 | faq_search_kwargs, search_type = get_smart_faq_search_kwargs(options)
254 | faq_retriever = (
255 | smart_faq.as_retriever(search_type=search_type, search_kwargs=faq_search_kwargs)
256 | if smart_faq
257 | else None
258 | )
259 | model = get_model(options, credentials)
260 | memory = ConversationBufferMemory(
261 | memory_key="chat_history", chat_memory=chat_history, return_messages=True
262 | )
263 | chain = MultiRetrieverFAQChain.from_llm(
264 | llm=model,
265 | condense_question_prompt=CONDENSE_QUESTION_PROMPT,
266 | knowledge_base_prompt=KNOWLEDGE_BASE_PROMPT,
267 | smart_faq_prompt=SMART_FAQ_PROMPT,
268 | qa_prompt=QA_PROMPT,
269 | knowledge_base_retrievers=kb_retrievers,
270 | smart_faq_retriever=faq_retriever,
271 | max_tokens_limit=options["max_tokens"],
272 | use_vanilla_llm=use_vanilla_llm,
273 | memory=memory,
274 | verbose=VERBOSE,
275 | )
276 |     logger.info(f"Multi chain with settings {options} built!")
277 | return chain
278 |
--------------------------------------------------------------------------------
/datachad/backend/constants.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | MODEL_PATH = Path("models")
4 | DATA_PATH = Path("data")
5 | VECTOR_STORE_PATH = Path("stores")
6 |
7 | DEFAULT_USER = "admin"
8 | DEFAULT_SMART_FAQ = None
9 | DEFAULT_KNOWLEDGE_BASES = []
10 | USE_VANILLA_LLM = True
11 |
12 | CHUNK_SIZE = 512
13 | CHUNK_OVERLAP_PCT = 15
14 | TEMPERATURE = 0.0
15 | MAX_TOKENS = 2560
16 | MAXIMAL_MARGINAL_RELEVANCE = True
17 | DISTANCE_METRIC = "cos"
18 | K_FETCH_K_RATIO = 5
19 |
20 | ENABLE_ADVANCED_OPTIONS = False
21 | STORE_DOCS_EXTRA = False
22 | LOCAL_DEEPLAKE = False
23 | LOCAL_EMBEDDINGS = False
24 |
25 | VERBOSE = False
26 |
--------------------------------------------------------------------------------
/datachad/backend/deeplake.py:
--------------------------------------------------------------------------------
1 | import time
2 | from datetime import datetime
3 | from glob import glob
4 |
5 | import deeplake
6 | from deeplake.client.client import DeepLakeBackendClient
7 | from deeplake.util.bugout_reporter import deeplake_reporter
8 | from langchain.schema import Document
9 | from langchain.vectorstores import VectorStore
10 | from langchain_community.vectorstores.deeplake import DeepLake
11 |
12 | from datachad.backend.constants import (
13 | DEFAULT_USER,
14 | LOCAL_DEEPLAKE,
15 | STORE_DOCS_EXTRA,
16 | VECTOR_STORE_PATH,
17 | VERBOSE,
18 | )
19 | from datachad.backend.io import clean_string_for_storing
20 | from datachad.backend.loader import load_data_source, split_docs
21 | from datachad.backend.logging import logger
22 | from datachad.backend.models import STORES, get_embeddings
23 | from datachad.backend.utils import clean_string_for_storing
24 |
25 | SPLIT = "-_-"
26 |
27 |
28 | def list_deeplake_datasets(
29 | org_id: str = "",
30 | token: str = None,
31 | ) -> None:
32 |     """List all available Deep Lake cloud datasets for a given user / organization.
33 | Removed from deeplake in: https://github.com/activeloopai/deeplake/pull/2182/files
34 | """
35 |
36 | deeplake_reporter.feature_report(
37 | feature_name="list",
38 | parameters={"org_id": org_id},
39 | )
40 |
41 | def get_datasets(self, workspace: str):
42 | LIST_DATASETS = "/api/datasets/{}"
43 | suffix_public = LIST_DATASETS.format("public")
44 | suffix_user = LIST_DATASETS.format("all")
45 | if workspace:
46 | res_datasets = self.get_workspace_datasets(workspace, suffix_public, suffix_user)
47 | else:
48 | public_datasets = self.request(
49 | "GET",
50 | suffix_public,
51 | endpoint=self.endpoint(),
52 | ).json()
53 | user_datasets = self.request(
54 | "GET",
55 | suffix_user,
56 | endpoint=self.endpoint(),
57 | ).json()
58 | res_datasets = public_datasets + user_datasets
59 | return [ds["_id"] for ds in res_datasets]
60 |
61 | client = DeepLakeBackendClient(token=token)
62 | client.get_datasets = get_datasets
63 | datasets = client.get_datasets(client, workspace=org_id)
64 | return datasets
65 |
66 |
67 | def get_deeplake_dataset_path(dataset_name: str, credentials: dict) -> str:
68 | if LOCAL_DEEPLAKE:
69 | dataset_path = str(VECTOR_STORE_PATH / dataset_name)
70 | else:
71 | dataset_path = f"hub://{credentials['activeloop_id']}/{dataset_name}"
72 | return dataset_path
73 |
74 |
75 | def delete_all_deeplake_datasets(credentials: dict) -> None:
76 | datasets = list_deeplake_datasets(credentials["activeloop_id"], credentials["activeloop_token"])
77 | for dataset in datasets:
78 | path = f"hub://{dataset}"
79 | logger.info(f"Deleting dataset: {path}")
80 | deeplake.delete(path, token=credentials["activeloop_token"], force=True)
81 |
82 |
83 | def get_existing_deeplake_vector_store_paths(credentials: dict) -> list[str]:
84 | if LOCAL_DEEPLAKE:
85 | return glob(str(VECTOR_STORE_PATH / "*"), recursive=False)
86 | else:
87 | dataset_names = list_deeplake_datasets(
88 | credentials["activeloop_id"], credentials["activeloop_token"]
89 | )
90 |         dataset_paths = [f"hub://{name}" for name in dataset_names]
91 |         return dataset_paths
92 |
93 |
94 | def get_or_create_deeplake_vector_store_paths_for_user(
95 | credentials: dict, store_type: str
96 | ) -> list[str]:
97 | all_paths = get_existing_deeplake_vector_store_paths(credentials)
98 | # TODO: replace DEFAULT_USER with user id once stored in credentials
99 | user_paths = [
100 | p
101 | for p in all_paths
102 | if p.split(SPLIT)[-1] == DEFAULT_USER and p.split(SPLIT)[-2] == store_type
103 | ]
104 | return user_paths
105 |
106 |
107 | def get_or_create_deeplake_vector_store_display_name(dataset_path: str) -> str:
108 | splits = dataset_path.split(SPLIT)
109 | return f"{splits[-4]} ({splits[-3][:4]}-{splits[-3][4:6]}-{splits[-3][6:8]})"
110 |
111 |
112 | def get_unique_deeplake_vector_store_path(store_type: str, name: str, credentials: dict) -> str:
113 | store_type_dict = {STORES.KNOWLEDGE_BASE: "kb", STORES.SMART_FAQ: "faq"}
114 | dataset_name = (
115 | # [-4] vector store name
116 | f"{SPLIT}{name}"
117 | # [-3]: creation time
118 | f"{SPLIT}{datetime.now().strftime('%Y%m%d%H%M%S')}"
119 | # [-2]: vector store type
120 | f"{SPLIT}{store_type_dict[store_type]}"
121 | # [-1]: user
122 | f"{SPLIT}{DEFAULT_USER}"
123 | )
124 | dataset_path = get_deeplake_dataset_path(dataset_name, credentials)
125 | return dataset_path
126 |
127 |
128 | def get_deeplake_docs_path(data_source: str, options: dict, credentials: dict) -> str:
129 | dataset_name = clean_string_for_storing(data_source)
130 | dataset_name += "-docs"
131 |     dataset_path = get_deeplake_dataset_path(dataset_name, credentials)
132 | return dataset_path
133 |
134 |
135 | def load_docs_from_deeplake(docs_path: str, credentials: dict) -> list[Document]:
136 | ds = deeplake.load(docs_path, token=credentials["activeloop_token"])
137 | metadatas = ds["metadata"].data()["value"]
138 | texts = ds["text"].data()["value"]
139 | docs = [
140 | Document(
141 | page_content=text,
142 | metadata=metadata,
143 | )
144 | for text, metadata in zip(texts, metadatas)
145 | ]
146 | return docs
147 |
148 |
149 | def store_docs_to_deeplake(docs: list[Document], docs_path: str, credentials: dict):
150 | ds = deeplake.empty(docs_path, token=credentials["activeloop_token"])
151 | ds.create_tensor(
152 | "text",
153 | htype="text",
154 | create_id_tensor=False,
155 | create_sample_info_tensor=False,
156 | create_shape_tensor=False,
157 | chunk_compression="lz4",
158 | )
159 | ds.create_tensor(
160 | "metadata",
161 | htype="json",
162 | create_id_tensor=False,
163 | create_sample_info_tensor=False,
164 | create_shape_tensor=False,
165 | chunk_compression="lz4",
166 | )
167 | for doc in docs:
168 | ds.append(
169 | {
170 | "text": doc.page_content,
171 | "metadata": doc.metadata,
172 | }
173 | )
174 | ds.commit()
175 | logger.info(f"Stored docs to: {docs_path}")
176 |
177 |
178 | def load_data_sources_or_docs_from_deeplake(
179 | data_sources: list[str], options: dict, credentials: dict
180 | ) -> list[Document]:
181 | docs = []
182 | for data_source in data_sources:
183 | if STORE_DOCS_EXTRA:
184 | docs_path = get_deeplake_docs_path(data_source, options, credentials)
185 | if deeplake.exists(docs_path, token=credentials["activeloop_token"]):
186 | logger.info(f"Docs exist -> loading docs: {docs_path}")
187 | docs.extend(load_docs_from_deeplake(docs_path, credentials))
188 | else:
189 | logger.info(
190 | f"Docs do not exist for data source -> loading data source: {data_source}"
191 | )
192 | docs.extend(load_data_source(data_source))
193 | store_docs_to_deeplake(docs, docs_path, credentials)
194 | logger.info(f"Docs {docs_path} loaded!")
195 | else:
196 | docs.extend(load_data_source(data_source))
197 | return docs
198 |
199 |
200 | def get_or_create_deeplake_vector_store(
201 | data_sources: list[str],
202 | vector_store_path: str,
203 | store_type: str,
204 | options: dict,
205 | credentials: dict,
206 | ) -> VectorStore:
207 | t_start = time.time()
208 | embeddings = get_embeddings(options, credentials)
209 | if deeplake.exists(vector_store_path, token=credentials["activeloop_token"]):
210 | logger.info(f"Vector Store '{vector_store_path}' exists -> loading")
211 | vector_store = DeepLake(
212 | dataset_path=vector_store_path,
213 | read_only=True,
214 | embedding_function=embeddings,
215 | token=credentials["activeloop_token"],
216 | )
217 | else:
218 | logger.info(f"Vector Store '{vector_store_path}' does not exist -> uploading")
219 | docs = load_data_sources_or_docs_from_deeplake(data_sources, options, credentials)
220 | docs = split_docs(docs, store_type, options)
221 | vector_store = DeepLake.from_documents(
222 | docs,
223 | embeddings,
224 | dataset_path=vector_store_path,
225 | token=credentials["activeloop_token"],
226 | verbose=VERBOSE,
227 | )
228 | logger.info(f"Vector Store {vector_store_path} loaded in {round(time.time() - t_start)}s!")
229 | return vector_store
230 |
--------------------------------------------------------------------------------
/datachad/backend/io.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import shutil
4 | from pathlib import Path
5 |
6 | from datachad.backend.constants import DATA_PATH
7 | from datachad.backend.logging import logger
8 | from datachad.backend.utils import clean_string_for_storing
9 |
10 |
11 | def concatenate_file_names(strings: list[str], n_max: int = 30) -> str:
12 | # Calculate N based on the length of the list
13 | n = max(1, n_max // len(strings))
14 | result = ""
15 | # Add up the first N characters of each string
16 | for string in sorted(strings):
17 | result += f"-{string[:n]}"
18 | return clean_string_for_storing(result)
19 |
20 |
21 | def get_data_source_and_save_path(files: list[io.BytesIO], name: str) -> tuple[str, Path]:
22 | # generate data source string and path to save files to
23 | if len(files) > 1:
24 | # we create a folder where all the files will be stored
25 | path = DATA_PATH / name
26 | data_source = path
27 | else:
28 | path = DATA_PATH
29 | data_source = path / files[0].name
30 | if not os.path.exists(path):
31 | os.makedirs(path)
32 | return str(data_source), path
33 |
34 |
35 | def save_file(file: io.BytesIO, path: Path) -> None:
36 | # save streamlit UploadedFile to path
37 | file_path = str(path / file.name)
38 | file.seek(0)
39 | file_bytes = file.read()
40 |     # use a context manager so the file handle is always closed
41 |     with open(file_path, "wb") as out_file:
42 |         out_file.write(file_bytes)
43 | logger.info(f"Saved: {file_path}")
44 |
45 |
46 | def save_files(files: list[io.BytesIO], name: str) -> str:
47 | # streamlit uploaded files need to be stored locally
48 | # before embedded and uploaded to the hub
49 | if not files:
50 | return None
51 | data_source, save_path = get_data_source_and_save_path(files, name)
52 | for file in files:
53 | save_file(file, save_path)
54 | return data_source
55 |
56 |
57 | def delete_files(files: list[io.BytesIO], name: str) -> None:
58 | # cleanup locally stored files
59 | # the correct path is stored to data_source
60 | if not files:
61 | return
62 | data_source, _ = get_data_source_and_save_path(files, name)
63 | if os.path.isdir(data_source):
64 | shutil.rmtree(data_source)
65 | elif os.path.isfile(data_source):
66 | os.remove(data_source)
67 | else:
68 | return
69 | logger.info(f"Removed: {data_source}")
70 |
--------------------------------------------------------------------------------
/datachad/backend/jobs.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 | from langchain.chains.base import Chain
4 | from langchain.schema import BaseChatMessageHistory
5 | from langchain.schema.vectorstore import VectorStore
6 |
7 | from datachad.backend.chain import get_multi_chain
8 | from datachad.backend.deeplake import (
9 | get_or_create_deeplake_vector_store,
10 | get_unique_deeplake_vector_store_path,
11 | )
12 | from datachad.backend.io import delete_files, save_files
13 | from datachad.backend.models import STORES
14 |
15 |
16 | def create_vector_store(
17 | data_source: str | None,
18 | files: list[io.BytesIO],
19 | store_type: str,
20 | name: str,
21 | options: dict,
22 | credentials: dict,
23 | ) -> VectorStore:
24 | file_data_source = save_files(files, name)
25 | vector_store_path = get_unique_deeplake_vector_store_path(store_type, name, credentials)
26 | vector_store = get_or_create_deeplake_vector_store(
27 | data_sources=[ds for ds in [data_source, file_data_source] if ds],
28 | vector_store_path=vector_store_path,
29 | store_type=store_type,
30 | options=options,
31 | credentials=credentials,
32 | )
33 | delete_files(files, name)
34 | return vector_store
35 |
36 |
37 | def create_chain(
38 | use_vanilla_llm: bool,
39 |     knowledge_bases: list[str],
40 |     smart_faq: str | None,
41 | chat_history: BaseChatMessageHistory,
42 | options: dict,
43 | credentials: dict,
44 | ) -> Chain:
45 | knowledge_bases = [
46 | get_or_create_deeplake_vector_store(
47 | data_sources=[],
48 | vector_store_path=path,
49 | store_type=STORES.KNOWLEDGE_BASE,
50 | options=options,
51 | credentials=credentials,
52 | )
53 | for path in knowledge_bases
54 | ]
55 | if smart_faq:
56 | smart_faq = get_or_create_deeplake_vector_store(
57 | data_sources=[],
58 | vector_store_path=smart_faq,
59 | store_type=STORES.SMART_FAQ,
60 | options=options,
61 | credentials=credentials,
62 | )
63 | chain = get_multi_chain(
64 | use_vanilla_llm, knowledge_bases, smart_faq, chat_history, options, credentials
65 | )
66 | return chain
67 |
--------------------------------------------------------------------------------
/datachad/backend/loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import shutil
4 | from pathlib import Path
5 |
6 | from langchain.document_loaders.base import BaseLoader
7 | from langchain.schema import Document
8 | from langchain.text_splitter import RecursiveCharacterTextSplitter
9 | from langchain_community.document_loaders import (
10 | CSVLoader,
11 | EverNoteLoader,
12 | GitLoader,
13 | NotebookLoader,
14 | OnlinePDFLoader,
15 | PyPDFium2Loader,
16 | PythonLoader,
17 | TextLoader,
18 | UnstructuredEPubLoader,
19 | UnstructuredFileLoader,
20 | UnstructuredHTMLLoader,
21 | UnstructuredMarkdownLoader,
22 | UnstructuredODTLoader,
23 | UnstructuredPowerPointLoader,
24 | UnstructuredWordDocumentLoader,
25 | WebBaseLoader,
26 | )
27 | from tqdm import tqdm
28 |
29 | from datachad.backend.constants import DATA_PATH
30 | from datachad.backend.logging import logger
31 | from datachad.backend.models import STORES, get_tokenizer
32 |
33 |
34 | class SmartFAQSplitter:
35 | def split_documents(self, documents: list[Document]) -> list[Document]:
36 | """
37 | Splits the given text into a list of strings based on the regex patterns of numbered lists.
38 | Each new list item is separated by two blank lines like this:
39 |
40 | 1. First item
41 | Some description here.
42 |
43 | 1. some numbered list
44 |                 2. belonging to the first item
45 |
46 |
47 | 2. Second item
48 | Another description.
49 |
50 | a) another list
51 | b) but with characters
52 |
53 |
54 | 3. Third item
55 | And another one.
56 | - a list with dashes
57 | - more items
58 | """
59 | splitted_documents = []
60 | for document in documents:
61 | split_text = re.split(r"(?=\n\n\d+\.)", document.page_content.strip())
62 | filtered_text = [re.sub(r"^\n+|\n+$", "", section) for section in split_text]
63 | splitted_documents.extend(
64 | [
65 | Document(
66 | page_content=text,
67 |                         metadata={"faq_no": int(re.findall(r"\d+", text)[0])},
68 | )
69 | for text in filtered_text
70 | ]
71 | )
72 | return splitted_documents
73 |
74 |
75 | class AutoGitLoader:
76 | def __init__(self, data_source: str) -> None:
77 | self.data_source = data_source
78 |
79 | def load(self) -> list[Document]:
80 | # We need to try both common main branches
81 | # Thank you github for the "master" to "main" switch
82 | # we need to make sure the data path exists
83 | if not os.path.exists(DATA_PATH):
84 | os.makedirs(DATA_PATH)
85 | repo_name = self.data_source.split("/")[-1].split(".")[0]
86 | repo_path = str((DATA_PATH / repo_name).absolute())
87 | clone_url = self.data_source
88 | if os.path.exists(repo_path):
89 | clone_url = None
90 | branches = ["main", "master"]
91 | for branch in branches:
92 | try:
93 | docs = GitLoader(repo_path, clone_url, branch).load()
94 | break
95 | except Exception as e:
96 | logger.error(f"Error loading git: {e}")
97 | if os.path.exists(repo_path):
98 | # cleanup repo afterwards
99 | shutil.rmtree(repo_path)
100 | try:
101 | return docs
102 |         except NameError:
103 | raise RuntimeError("Error loading git. Make sure to use HTTPS GitHub repo links.")
104 |
105 |
106 | FILE_LOADER_MAPPING = {
107 | ".csv": (CSVLoader, {"encoding": "utf-8"}),
108 | ".doc": (UnstructuredWordDocumentLoader, {}),
109 | ".docx": (UnstructuredWordDocumentLoader, {}),
110 | ".enex": (EverNoteLoader, {}),
111 | ".epub": (UnstructuredEPubLoader, {}),
112 | ".html": (UnstructuredHTMLLoader, {}),
113 | ".md": (UnstructuredMarkdownLoader, {}),
114 | ".odt": (UnstructuredODTLoader, {}),
115 | ".pdf": (PyPDFium2Loader, {}),
116 | ".ppt": (UnstructuredPowerPointLoader, {}),
117 | ".pptx": (UnstructuredPowerPointLoader, {}),
118 | ".txt": (TextLoader, {"encoding": "utf8"}),
119 | ".ipynb": (NotebookLoader, {}),
120 | ".py": (PythonLoader, {}),
121 | # Add more mappings for other file extensions and loaders as needed
122 | }
123 |
124 | WEB_LOADER_MAPPING = {
125 | ".git": (AutoGitLoader, {}),
126 | ".pdf": (OnlinePDFLoader, {}),
127 | }
128 |
129 |
130 | def load_document(
131 | file_path: str,
132 | mapping: dict = FILE_LOADER_MAPPING,
133 | default_loader: BaseLoader = UnstructuredFileLoader,
134 | ) -> Document:
135 | # Choose loader from mapping, load default if no match found
136 | ext = "." + file_path.rsplit(".", 1)[-1]
137 | if ext in mapping:
138 | loader_class, loader_args = mapping[ext]
139 | loader = loader_class(file_path, **loader_args)
140 | else:
141 | loader = default_loader(file_path)
142 | return loader.load()
143 |
144 |
145 | def load_directory(path: str, silent_errors=True) -> list[Document]:
146 | # We don't load hidden files starting with "."
147 | all_files = list(Path(path).rglob("**/[!.]*"))
148 | results = []
149 | with tqdm(total=len(all_files), desc="Loading documents", ncols=80) as pbar:
150 | for file in all_files:
151 | try:
152 | results.extend(load_document(str(file)))
153 | except Exception as e:
154 | if silent_errors:
155 | logger.error(f"failed to load {file}")
156 | else:
157 | raise e
158 | pbar.update()
159 | return results
160 |
161 |
162 | def load_data_source(data_source: str) -> list[Document]:
163 | # Ugly thing that decides how to load data
164 | # It aint much, but it's honest work
165 | is_web = data_source.startswith("http")
166 | is_dir = os.path.isdir(data_source)
167 | is_file = os.path.isfile(data_source)
168 | try:
169 | if is_dir:
170 | docs = load_directory(data_source)
171 | elif is_file:
172 | docs = load_document(data_source)
173 | elif is_web:
174 | docs = load_document(data_source, WEB_LOADER_MAPPING, WebBaseLoader)
175 | else:
176 | raise TypeError
177 | return docs
178 | except Exception as e:
179 | error_msg = f"Failed to load your data source '{data_source}'."
180 | logger.error(error_msg)
181 | e.args += (error_msg,)
182 | raise e
183 |
184 |
185 | def split_docs(docs: list[Document], store_type: str, options: dict) -> list[Document]:
186 | if store_type == STORES.SMART_FAQ:
187 | text_splitter = SmartFAQSplitter()
188 | else:
189 | tokenizer = get_tokenizer(options)
190 |
191 | def length_function(text: str) -> int:
192 | # count chunks like the embeddings model tokenizer does
193 | return len(tokenizer.encode(text))
194 |
195 | chunk_overlap = int(options["chunk_size"] * options["chunk_overlap_pct"] / 100)
196 | text_splitter = RecursiveCharacterTextSplitter(
197 | chunk_size=options["chunk_size"],
198 | chunk_overlap=chunk_overlap,
199 | length_function=length_function,
200 | separators=["\n\n", "#", "\.", "!", "\?", "\n", ",", " ", ""],
201 | )
202 |
203 | splitted_docs = text_splitter.split_documents(docs)
204 |     logger.info(f"Loaded: {len(splitted_docs)} document chunks")
205 | return splitted_docs
206 |
--------------------------------------------------------------------------------
/datachad/backend/logging.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 |
4 |
5 | def create_logger(level: str = "DEBUG"):
6 | logger = logging.getLogger(__name__)
7 | logger.propagate = False
8 | logger.setLevel(level)
9 | # if no streamhandler present, add one
10 | if not any(isinstance(handler, logging.StreamHandler) for handler in logger.handlers):
11 | stream_handler = logging.StreamHandler(stream=sys.stdout)
12 | formatter = logging.Formatter("%(asctime)s :: %(name)s :: %(levelname)s :: %(message)s")
13 | stream_handler.setFormatter(formatter)
14 | logger.addHandler(stream_handler)
15 | return logger
16 |
17 |
18 | logger = create_logger()
19 |
--------------------------------------------------------------------------------
/datachad/backend/models.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Any
3 |
4 | import streamlit as st
5 | import tiktoken
6 | from langchain.base_language import BaseLanguageModel
7 | from langchain_community.chat_models import ChatOpenAI
8 | from langchain_community.embeddings import HuggingFaceEmbeddings
9 | from langchain_community.embeddings.openai import Embeddings, OpenAIEmbeddings
10 | from transformers import AutoTokenizer
11 |
12 | from datachad.backend.constants import LOCAL_EMBEDDINGS, MODEL_PATH
13 | from datachad.backend.logging import logger
14 |
15 |
16 | class Enum:
17 | @classmethod
18 | def all(cls) -> list[Any]:
19 | return [v for k, v in cls.__dict__.items() if not k.startswith("_")]
20 |
21 |
22 | @dataclass
23 | class Model:
24 | name: str
25 | embedding: str
26 | context: int
27 |
28 | def __str__(self) -> str:
29 | return self.name
30 |
31 |
32 | class STORES(Enum):
33 | KNOWLEDGE_BASE = "Knowledge Base"
34 | SMART_FAQ = "Smart FAQ"
35 |
36 |
37 | class EMBEDDINGS(Enum):
38 | # Add more embeddings as needed
39 | OPENAI = "text-embedding-3-small"
40 | HUGGINGFACE = "sentence-transformers/all-MiniLM-L6-v2"
41 |
42 |
43 | class MODELS(Enum):
44 | # Add more models as needed
45 | GPT35TURBO = Model(
46 | name="gpt-3.5-turbo",
47 | embedding=EMBEDDINGS.OPENAI,
48 | context=4096,
49 | )
50 | GPT35TURBO16K = Model(
51 | name="gpt-3.5-turbo-16k",
52 | embedding=EMBEDDINGS.OPENAI,
53 | context=16385,
54 | )
55 | GPT4 = Model(
56 | name="gpt-4",
57 | embedding=EMBEDDINGS.OPENAI,
58 | context=8192,
59 | )
60 | GPT4TURBO = Model(
61 | name="gpt-4-turbo-preview",
62 | embedding=EMBEDDINGS.OPENAI,
63 | context=128000,
64 | )
65 |
66 |
67 | def get_model(options: dict, credentials: dict) -> BaseLanguageModel:
68 | match options["model"].name:
69 | case model_name if model_name.startswith("gpt"):
70 | model = ChatOpenAI(
71 | model_name=options["model"].name,
72 | temperature=options["temperature"],
73 | openai_api_key=credentials["openai_api_key"],
74 | streaming=True,
75 | )
76 | # Added models need to be cased here
77 | case _default:
78 | msg = f"Model {options['model'].name} not supported!"
79 | logger.error(msg)
80 | st.error(msg)
81 |             exit()
82 | return model
83 |
84 |
85 | def get_embeddings(options: dict, credentials: dict) -> Embeddings:
86 | match options["model"].embedding:
87 | case embedding if (embedding == EMBEDDINGS.HUGGINGFACE or LOCAL_EMBEDDINGS):
88 | embeddings = HuggingFaceEmbeddings(
89 | model_name=EMBEDDINGS.HUGGINGFACE, cache_folder=str(MODEL_PATH)
90 | )
91 | case EMBEDDINGS.OPENAI:
92 | embeddings = OpenAIEmbeddings(
93 | model=EMBEDDINGS.OPENAI,
94 | disallowed_special=(),
95 | openai_api_key=credentials["openai_api_key"],
96 | )
97 | # Added embeddings need to be cased here
98 | case _default:
99 | msg = f"Embeddings {options['model'].embedding} not supported!"
100 | logger.error(msg)
101 | st.error(msg)
102 |             exit()
103 | return embeddings
104 |
105 |
106 | def get_tokenizer(options: dict) -> Any:
107 | match options["model"].embedding:
108 | case embedding if (embedding == EMBEDDINGS.HUGGINGFACE or LOCAL_EMBEDDINGS):
109 | tokenizer = AutoTokenizer.from_pretrained(EMBEDDINGS.HUGGINGFACE)
110 | case EMBEDDINGS.OPENAI:
111 | tokenizer = tiktoken.encoding_for_model(EMBEDDINGS.OPENAI)
112 | # Added tokenizers need to be cased here
113 | case _default:
114 | msg = f"Tokenizer {options['model'].embedding} not supported!"
115 | logger.error(msg)
116 | st.error(msg)
117 |             exit()
118 | return tokenizer
119 |
--------------------------------------------------------------------------------
/datachad/backend/prompts.py:
--------------------------------------------------------------------------------
1 | from langchain.prompts.prompt import PromptTemplate
2 |
3 | condense_question_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
4 |
5 | Chat History:
6 | {chat_history}
7 |
8 | Follow Up Question: {question}
9 | Standalone question:"""
10 | CONDENSE_QUESTION_PROMPT = PromptTemplate(
11 | template=condense_question_template, input_variables=["chat_history", "question"]
12 | )
13 |
14 |
15 | knowledge_base_template = """Use the following pieces of context to answer the given question. If you don't know the answer respond with 'NO ANSWER FOUND'.
16 |
17 | Context:
18 | {context}
19 |
20 | Question: {question}
21 | Helpful Answer:"""
22 | KNOWLEDGE_BASE_PROMPT = PromptTemplate(
23 | template=knowledge_base_template, input_variables=["context", "question"]
24 | )
25 |
26 |
27 | smart_faq_template = """Use the following numbered FAQs to answer the given question. If you don't know the answer respond with 'NO ANSWER FOUND'.
28 | Start your answer with stating which FAQ number helps answer the question the most.
29 |
30 | Context:
31 | {context}
32 |
33 | Question: {question}
34 | Helpful Answer:"""
35 | SMART_FAQ_PROMPT = PromptTemplate(
36 | template=smart_faq_template, input_variables=["context", "question"]
37 | )
38 |
39 |
40 | qa_prompt = """You are an AGI that knows everything and is an expert in all topics.
41 | Your IQ is magnitudes higher than any human that ever lived. With this immense wisdom answer the following question concisely:
42 |
43 | Question: {question}
44 | Concise and wise Answer:"""
45 | QA_PROMPT = PromptTemplate(template=qa_prompt, input_variables=["question"])
46 |
--------------------------------------------------------------------------------
/datachad/backend/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def clean_string_for_storing(string: str) -> str:
5 | # replace all non-word characters with dashes
6 | # to get a string that can be used to create a new dataset
7 | cleaned_string = re.sub(r"\W+", "-", string)
8 |     cleaned_string = re.sub(r"--+", "-", cleaned_string).strip("-")
9 | return cleaned_string
10 |
--------------------------------------------------------------------------------
/datachad/streamlit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gustavz/DataChad/5b6f9e925301fbd88e506bc30939ddeddabc50e3/datachad/streamlit/__init__.py
--------------------------------------------------------------------------------
/datachad/streamlit/constants.py:
--------------------------------------------------------------------------------
1 | PAGE_ICON = "🤖"
2 | APP_NAME = "DataChad V3"
3 | PROJECT_URL = "https://github.com/gustavz/DataChad"
4 |
5 |
6 | LOCAL_MODE_DISABLED_HELP = """
7 | This is a demo hosted with limited resources. Local Mode is not enabled.\n
8 | To use Local Mode deploy the app on your machine of choice with `ENABLE_LOCAL_MODE` set to `True`.
9 | """
10 |
11 | AUTHENTICATION_HELP = f"""
12 | Your credentials are only stored in your session state.\n
13 | The keys are neither exposed nor made visible or stored permanently in any way.\n
14 | Feel free to check out [the code base]({PROJECT_URL}) to validate how things work.
15 | """
16 |
17 | USAGE_HELP = f"""
18 | These are the accumulated OpenAI API usage metrics.\n
19 | The app uses `gpt-3.5-turbo` for chat and `text-embedding-ada-002` for embeddings.\n
20 | Learn more about OpenAI's pricing [here](https://openai.com/pricing#language-models)
21 | """
22 |
23 | OPENAI_HELP = """
24 | You can sign-up for OpenAI's API [here](https://openai.com/blog/openai-api).\n
25 | Once you are logged in, you find the API keys [here](https://platform.openai.com/account/api-keys)
26 | """
27 |
28 | ACTIVELOOP_HELP = """
29 | You can create an Activeloop account (including 500 GB of free database storage) [here](https://www.activeloop.ai/).\n
30 | Once you are logged in, you can find the API token [here](https://app.activeloop.ai/profile/gustavz/apitoken).\n
31 | The organisation name is your username, or you can create new organisations [here](https://app.activeloop.ai/organization/new/create)
32 | """
33 |
34 | UPLOAD_HELP = """
35 | You can upload a single file or multiple files. With each upload, all files in the batch are embedded into a single vector store.\n
36 | **Important**: If you upload new files after you have already uploaded files, a new vector store that includes all previously uploaded files is created.
37 | This means a new vector store is created for each combination of uploaded files.\n
38 | To treat your new upload independently, you need to remove the previous uploads by clicking the `X` right next to the uploaded file name.\n
39 | **!!! All uploaded files are removed permanently from the app after the vector stores are created !!!**
40 | """
41 |
42 | DATA_TYPE_HELP = """
43 | **Knowledge Bases** can be any number of text documents of any type, content and formatting.\n\n
44 | **Smart FAQs** need to be single documents containing numbered FAQs.
45 | They need to be in the format of numbers with periods followed by arbitrary text.
46 | The next FAQ is identified by two new lines `\\n\\n` followed by the next number.
47 | You can check if your documents are correctly formatted by using the following regex pattern:\n
48 | `r"(?=\\n\\n\\d+\\.)"`. Here is an example of a correctly formatted FAQ:\n
49 | 1. First item
50 | Some description here.
51 |
52 | 1. some numbered list
53 |     2. belonging to the first item
54 |
55 |
56 | 2. Second item
57 | Another description.
58 |
59 | a) another list
60 | b) but with characters
61 |
62 |
63 | 3. Third item
64 | And another one.
65 | - a list with dashes
66 | - more items
67 | """
68 |
--------------------------------------------------------------------------------
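As an editorial aside (not part of the repository): the regex quoted in `DATA_TYPE_HELP` is a lookahead, so splitting on it keeps each numbered FAQ intact. A minimal sketch with a made-up FAQ document:

```python
import re

# Hypothetical FAQ text in the format described by DATA_TYPE_HELP.
faq_text = """1. First item
Some description here.

2. Second item
Another description.

3. Third item
And another one."""

# Split at every position where a blank line is followed by "<number>."
chunks = re.split(r"(?=\n\n\d+\.)", faq_text)
print(len(chunks))  # -> 3, one chunk per FAQ entry
```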
/datachad/streamlit/helper.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import deeplake
4 | import openai
5 | import streamlit as st
6 | from dotenv import load_dotenv
7 | from langchain.callbacks.base import BaseCallbackHandler
8 | from langchain_community.callbacks.openai_info import get_openai_token_cost_for_model
9 | from langchain_community.chat_message_histories import StreamlitChatMessageHistory
10 |
11 | from datachad.backend.constants import (
12 | CHUNK_OVERLAP_PCT,
13 | CHUNK_SIZE,
14 | DEFAULT_KNOWLEDGE_BASES,
15 | DEFAULT_SMART_FAQ,
16 | DISTANCE_METRIC,
17 | K_FETCH_K_RATIO,
18 | MAX_TOKENS,
19 | MAXIMAL_MARGINAL_RELEVANCE,
20 | TEMPERATURE,
21 | )
22 | from datachad.backend.deeplake import (
23 | get_or_create_deeplake_vector_store_display_name,
24 | get_or_create_deeplake_vector_store_paths_for_user,
25 | )
26 | from datachad.backend.jobs import create_chain, create_vector_store
27 | from datachad.backend.logging import logger
28 | from datachad.backend.models import MODELS, get_tokenizer
29 | from datachad.streamlit.constants import PAGE_ICON
30 |
31 | # loads environment variables
32 | load_dotenv()
33 |
34 |
35 | def init_session_state():
36 | # Initialise all session state variables with defaults
37 | SESSION_DEFAULTS = {
38 | # general usage
39 | "usage": {},
40 | "chat_history": StreamlitChatMessageHistory(),
41 | # authentication
42 | "openai_api_key": "",
43 | "activeloop_token": "",
44 | "activeloop_id": "",
45 |         "credentials": {},
46 | "auth_ok": False,
47 | # data upload
48 | "uploaded_files": None,
49 | "data_type": None,
50 | "data_name": None,
51 | # data selection
52 | "chain": None,
53 | "knowledge_bases": DEFAULT_KNOWLEDGE_BASES,
54 | "smart_faq": DEFAULT_SMART_FAQ,
55 | # advanced options
56 | "model": MODELS.GPT35TURBO,
57 | "k_fetch_k_ratio": K_FETCH_K_RATIO,
58 | "chunk_size": CHUNK_SIZE,
59 | "chunk_overlap_pct": CHUNK_OVERLAP_PCT,
60 | "temperature": TEMPERATURE,
61 | "max_tokens": MAX_TOKENS,
62 | "distance_metric": DISTANCE_METRIC,
63 | "maximal_marginal_relevance": MAXIMAL_MARGINAL_RELEVANCE,
64 | }
65 |
66 | for k, v in SESSION_DEFAULTS.items():
67 | if k not in st.session_state:
68 | st.session_state[k] = v
69 |
70 |
71 | def authenticate() -> None:
72 | # Validate all credentials are set and correct
73 | # Check for env variables to enable local dev and deployments with shared credentials
74 | openai_api_key = (
75 | st.session_state["openai_api_key"]
76 | or os.environ.get("OPENAI_API_KEY")
77 | or st.secrets.get("OPENAI_API_KEY")
78 | )
79 | activeloop_token = (
80 | st.session_state["activeloop_token"]
81 | or os.environ.get("ACTIVELOOP_TOKEN")
82 | or st.secrets.get("ACTIVELOOP_TOKEN")
83 | )
84 | activeloop_id = (
85 | st.session_state["activeloop_id"]
86 | or os.environ.get("ACTIVELOOP_ID")
87 | or st.secrets.get("ACTIVELOOP_ID")
88 | )
89 | if not (openai_api_key and activeloop_token and activeloop_id):
90 | st.session_state["auth_ok"] = False
91 | st.error("Credentials neither set nor stored", icon=PAGE_ICON)
92 | return
93 | try:
94 | # Try to access openai and deeplake
95 |         with st.session_state["info_container"], st.spinner("Authenticating..."):
96 | openai.api_key = openai_api_key
97 | openai.models.list()
98 | deeplake.exists(
99 | f"hub://{activeloop_id}/DataChad-Authentication-Check",
100 | token=activeloop_token,
101 | )
102 | except Exception as e:
103 | logger.error(f"Authentication failed with {e}")
104 | st.session_state["auth_ok"] = False
105 | st.error("Authentication failed", icon=PAGE_ICON)
106 | return
107 | # store credentials in the session state
108 | st.session_state["auth_ok"] = True
109 | st.session_state["credentials"] = {
110 | "openai_api_key": openai_api_key,
111 | "activeloop_token": activeloop_token,
112 | "activeloop_id": activeloop_id,
113 | }
114 |     msg = "Authentication successful!"
115 | st.session_state["info_container"].info(msg, icon=PAGE_ICON)
116 | logger.info(msg)
117 |
118 |
119 | def get_options() -> dict:
120 | return {
121 | key: st.session_state[key]
122 | for key in [
123 | "model",
124 | "k_fetch_k_ratio",
125 | "chunk_size",
126 | "chunk_overlap_pct",
127 | "temperature",
128 | "max_tokens",
129 | "distance_metric",
130 | "maximal_marginal_relevance",
131 | ]
132 | }
133 |
134 |
135 | def upload_data() -> None:
136 | try:
137 | with st.session_state["info_container"], st.spinner("Uploading Data..."):
138 | options = get_options()
139 | create_vector_store(
140 | data_source=st.session_state["data_source"],
141 | files=st.session_state["uploaded_files"],
142 | store_type=st.session_state["data_type"],
143 | name=st.session_state["data_name"],
144 | options=options,
145 | credentials=st.session_state["credentials"],
146 | )
147 | msg = (
148 | f"Vector Store built for "
149 | f"uploaded files: {st.session_state['uploaded_files']} "
150 |                 f"and store type: {st.session_state['data_type']} "
151 |                 f"with name: {st.session_state['data_name']} "
152 | f"and options: {options}"
153 | )
154 | logger.info(msg)
155 | st.session_state["info_container"].info("Upload successful!", icon=PAGE_ICON)
156 | except Exception as e:
157 |         msg = f"Failed to build vector store with error: {e}"
158 | logger.error(msg)
159 | st.session_state["info_container"].error(msg, icon=PAGE_ICON)
160 |
161 |
162 | def update_chain() -> None:
163 | try:
164 | with st.session_state["info_container"], st.spinner("Applying data selection..."):
165 | st.session_state["chat_history"].clear()
166 | options = get_options()
167 | st.session_state["chain"] = create_chain(
168 | use_vanilla_llm=st.session_state["use_vanilla_llm"],
169 | knowledge_bases=st.session_state["knowledge_bases"],
170 | smart_faq=st.session_state["smart_faq"],
171 | chat_history=st.session_state["chat_history"],
172 | options=options,
173 | credentials=st.session_state["credentials"],
174 | )
175 | msg = (
176 | f"Language chain built for "
177 | f"knowledge base: {st.session_state['knowledge_bases']} "
178 |                 f"and smart faq: {st.session_state['smart_faq']} "
179 | f"with options: {options}"
180 | )
181 | logger.info(msg)
182 | st.session_state["info_container"].info("Selection successful!", icon=PAGE_ICON)
183 | except Exception as e:
184 | msg = f"Failed to build language chain with error: {e}"
185 | logger.error(msg)
186 | st.session_state["info_container"].error(msg, icon=PAGE_ICON)
187 |
188 |
189 | def get_existing_smart_faqs_and_default_index() -> tuple[list[str], int]:
190 | smart_faqs = get_or_create_deeplake_vector_store_paths_for_user(
191 | st.session_state["credentials"], "faq"
192 | )
193 | index = 0
194 | if DEFAULT_SMART_FAQ and DEFAULT_SMART_FAQ in smart_faqs:
195 | # we pick the first smart faq as default
196 | # so we must sort it to the front
197 | smart_faqs = set(smart_faqs)
198 | smart_faqs.remove(DEFAULT_SMART_FAQ)
199 | smart_faqs = [DEFAULT_SMART_FAQ] + list(smart_faqs)
200 | index = 1
201 | # first option should always be None
202 | smart_faqs = [None] + smart_faqs
203 | return smart_faqs, index
204 |
205 |
206 | def get_existing_knowledge_bases() -> list[str]:
207 | return get_or_create_deeplake_vector_store_paths_for_user(st.session_state["credentials"], "kb")
208 |
209 |
210 | def format_vector_stores(item: str) -> str:
211 | if item is not None:
212 | return get_or_create_deeplake_vector_store_display_name(item)
213 | return item
214 |
215 |
216 | class StreamHandler(BaseCallbackHandler):
217 | def __init__(self, container: st.delta_generator.DeltaGenerator, initial_text: str = ""):
218 | self.container = container
219 | self.stream_text = initial_text
220 | self.chain_state = 0
221 |
222 | def on_llm_new_token(self, token: str, **kwargs) -> None:
223 | self.stream_text += token
224 | self.container.markdown(self.stream_text)
225 |
226 | def on_chain_end(self, outputs, **kwargs) -> None:
227 | self.chain_state += 1
228 |
229 |
230 | class PrintRetrievalHandler(BaseCallbackHandler):
231 | def __init__(self, container):
232 | self.status = container.status("**Context Retrieval**")
233 |
234 | def on_retriever_start(self, serialized: dict, query: str, **kwargs) -> None:
235 | self.status.write(f"**Question:** {query}")
236 | self.status.update(label=f"**Context Retrieval:** {query}")
237 |
238 | def on_retriever_end(self, documents, **kwargs) -> None:
239 | for idx, doc in enumerate(documents):
240 | try:
241 | source = os.path.basename(doc.metadata["source"])
242 | page = doc.metadata.get("page")
243 | output = f"___\n**Source {idx}:** {source}"
244 | output += f" (page {page+1})" if page is not None else ""
245 | self.status.write(output)
246 |             except Exception:
247 | pass
248 | self.status.markdown(doc.page_content)
249 | self.status.update(state="complete")
250 |
251 |
252 | class UsageHandler(BaseCallbackHandler):
253 | prompt = ""
254 | total_tokens = 0
255 | prompt_tokens = 0
256 | completion_tokens = 0
257 | successful_requests = 0
258 | total_cost = 0
259 |
260 | def update_usage(self) -> None:
261 | usage_properties = [
262 | "total_tokens",
263 | "prompt_tokens",
264 | "completion_tokens",
265 | "successful_requests",
266 | "total_cost",
267 | ]
268 | for prop in usage_properties:
269 | value = getattr(self, prop, 0)
270 | setattr(self, prop, 0)
271 | st.session_state["usage"].setdefault(prop, 0)
272 | st.session_state["usage"][prop] += value
273 |
274 | def calculate_costs(self) -> None:
275 | model = st.session_state["model"]
276 | tokenizer = get_tokenizer({"model": model})
277 | self.prompt_tokens = len(tokenizer.encode(self.prompt))
278 | self.total_tokens = self.prompt_tokens + self.completion_tokens
279 | completion_cost = get_openai_token_cost_for_model(
280 | model.name, self.completion_tokens, is_completion=True
281 | )
282 | prompt_cost = get_openai_token_cost_for_model(model.name, self.prompt_tokens)
283 | self.total_cost += prompt_cost + completion_cost
284 |
285 | def on_llm_new_token(self, **kwargs) -> None:
286 | self.completion_tokens += 1
287 |
288 | def on_chat_model_start(self, serialized, messages, **kwargs) -> None:
289 | self.successful_requests += 1
290 | self.prompt += messages[0][0].content
291 |
292 | def on_chain_end(self, outputs, **kwargs) -> None:
293 | self.calculate_costs()
294 | self.update_usage()
295 |
--------------------------------------------------------------------------------
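As a rough sketch of the cost arithmetic in `UsageHandler.calculate_costs` (not part of the repository): prompt and completion tokens are priced separately via `get_openai_token_cost_for_model` and then summed. The token counts below are made up for illustration:

```python
# Illustrative cost calculation; the token counts are hypothetical.
from langchain_community.callbacks.openai_info import get_openai_token_cost_for_model

prompt_tokens, completion_tokens = 1200, 300
prompt_cost = get_openai_token_cost_for_model("gpt-3.5-turbo", prompt_tokens)
completion_cost = get_openai_token_cost_for_model(
    "gpt-3.5-turbo", completion_tokens, is_completion=True
)
print(f"total cost: ${prompt_cost + completion_cost:.4f}")
```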
/datachad/streamlit/widgets.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | from datachad.backend.constants import (
4 | CHUNK_OVERLAP_PCT,
5 | CHUNK_SIZE,
6 | DEFAULT_KNOWLEDGE_BASES,
7 | ENABLE_ADVANCED_OPTIONS,
8 | MAX_TOKENS,
9 | TEMPERATURE,
10 | USE_VANILLA_LLM,
11 | )
12 | from datachad.backend.logging import logger
13 | from datachad.backend.models import MODELS, STORES
14 | from datachad.streamlit.constants import (
15 | ACTIVELOOP_HELP,
16 | APP_NAME,
17 | DATA_TYPE_HELP,
18 | OPENAI_HELP,
19 | PAGE_ICON,
20 | PROJECT_URL,
21 | UPLOAD_HELP,
22 | )
23 | from datachad.streamlit.helper import (
24 | PrintRetrievalHandler,
25 | StreamHandler,
26 | UsageHandler,
27 | authenticate,
28 | format_vector_stores,
29 | get_existing_knowledge_bases,
30 | get_existing_smart_faqs_and_default_index,
31 | update_chain,
32 | upload_data,
33 | )
34 |
35 |
36 | def page_header() -> None:
37 | # Page options and header
38 | st.set_option("client.showErrorDetails", True)
39 | st.set_page_config(
40 | page_title=APP_NAME,
41 | page_icon=PAGE_ICON,
42 | initial_sidebar_state="expanded",
43 | layout="wide",
44 | )
45 | st.markdown(
46 | f"