├── .env.template
├── .gitignore
├── .streamlit
│   └── config.toml
├── Dockerfile
├── LICENSE
├── README.md
├── app.py
├── datachad
│   ├── __init__.py
│   ├── backend
│   │   ├── __init__.py
│   │   ├── chain.py
│   │   ├── constants.py
│   │   ├── deeplake.py
│   │   ├── io.py
│   │   ├── jobs.py
│   │   ├── loader.py
│   │   ├── logging.py
│   │   ├── models.py
│   │   ├── prompts.py
│   │   └── utils.py
│   └── streamlit
│       ├── __init__.py
│       ├── constants.py
│       ├── helper.py
│       └── widgets.py
├── packages.txt
├── requirements.txt
└── static
    ├── datachadV1.png
    ├── datachadV2.png
    └── datachadV3.png
/.env.template:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=your openai key
2 | ACTIVELOOP_TOKEN=your activeloop key
3 | ACTIVELOOP_ID=your activeloop organization name
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | stores
2 | data
3 | models
4 | __pycache__
5 | .env
6 | .ipynb_checkpoints
7 | .DS_Store
8 | testing.ipynb
9 | .vscode
10 | .venv
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [server]
2 | fileWatcherType = "poll"
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim
2 |
3 | WORKDIR /app
4 |
5 | COPY requirements.txt requirements.txt
6 | COPY packages.txt packages.txt
7 |
8 | # we need to install the packages without versions
9 | # to ensure compatibility with apple ARM devices
10 | RUN sed -i 's/==.*//' requirements.txt
11 |
12 | RUN pip install --upgrade pip
13 | RUN pip install -r requirements.txt
14 | RUN rm -rf /root/.cache/pip
15 |
16 | RUN apt-get update
17 | RUN xargs apt-get -y install < packages.txt
18 |
19 | COPY datachad datachad
20 | COPY app.py app.py
21 |
22 | ARG port=80
23 | ENV STREAMLIT_SERVER_PORT=${port}
24 | ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0
25 | EXPOSE ${port}
26 |
27 | CMD ["streamlit", "run", "app.py"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2023 Gustav von Zitzewitz
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DataChad V3🤖
2 |
3 | This is an app that lets you ask questions about any data source by leveraging [embeddings](https://platform.openai.com/docs/guides/embeddings), [vector databases](https://www.activeloop.ai/), [large language models](https://platform.openai.com/docs/models/gpt-3-5) and last but not least [langchains](https://github.com/hwchase17/langchain)
4 |
5 | ## How does it work?
6 |
7 | 1. Upload any `file(s)` or enter any `path` or `url` to create Knowledge Bases, which can contain multiple files of any type, format and content, or Smart FAQs, which are lists of curated, numbered Q&As.
8 | 2. The data source or files are loaded and split into text document chunks
9 | 3. The text document chunks are embedded using OpenAI or Hugging Face embeddings
10 | 4. The embeddings are stored as a vector dataset in Activeloop's database hub
11 | 5. A langchain is created, consisting of a custom selection of an LLM (`gpt-3.5-turbo` by default), multiple vector stores as knowledge bases and a single special smart FAQ vector store
12 | 6. When asking questions to the app, the chain embeds the input prompt, does a similarity search in the provided vector stores and uses the best results as context for the LLM to generate an appropriate response (see the sketch below)
13 | 7. Finally, the chat history is cached locally to enable a [ChatGPT](https://chat.openai.com/) like Q&A conversation
14 |
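The steps above map onto the helpers in `datachad/backend/jobs.py`. Below is a minimal sketch of how they compose outside of Streamlit, assuming the default settings from `datachad/backend/constants.py`; the credential values and the knowledge base name are placeholders, and the final `chain.run` call may differ from how the Streamlit widgets invoke the chain.

```python
# Minimal sketch of the backend flow (not the exact Streamlit wiring).
from langchain_community.chat_message_histories import ChatMessageHistory

from datachad.backend.constants import (
    CHUNK_OVERLAP_PCT,
    CHUNK_SIZE,
    DISTANCE_METRIC,
    K_FETCH_K_RATIO,
    MAX_TOKENS,
    MAXIMAL_MARGINAL_RELEVANCE,
    TEMPERATURE,
)
from datachad.backend.jobs import create_chain, create_vector_store
from datachad.backend.models import MODELS, STORES

credentials = {
    "openai_api_key": "sk-...",   # placeholder
    "activeloop_token": "...",    # placeholder
    "activeloop_id": "your-org",  # placeholder
}
options = {
    "model": MODELS.GPT35TURBO,
    "temperature": TEMPERATURE,
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap_pct": CHUNK_OVERLAP_PCT,
    "max_tokens": MAX_TOKENS,
    "maximal_marginal_relevance": MAXIMAL_MARGINAL_RELEVANCE,
    "distance_metric": DISTANCE_METRIC,
    "k_fetch_k_ratio": K_FETCH_K_RATIO,
}

# Steps 1-4: load, split, embed and store a data source as a knowledge base
knowledge_base = create_vector_store(
    data_source="https://github.com/gustavz/DataChad.git",
    files=[],
    store_type=STORES.KNOWLEDGE_BASE,
    name="datachad-repo",
    options=options,
    credentials=credentials,
)

# Steps 5-7: build the multi-retriever chain and ask a question
chain = create_chain(
    use_vanilla_llm=True,
    knowledge_bases=[knowledge_base.dataset_path],
    smart_faq=None,
    chat_history=ChatMessageHistory(),
    options=options,
    credentials=credentials,
)
answer = chain.run({"question": "What is DataChad?"})
print(answer)
```
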
15 | ## Good to know
16 | - The app only runs on `py>=3.10`!
17 | - To run locally or deploy somewhere, execute `cp .env.template .env` and set credentials in the newly created `.env` file. Alternatively, set the system environment variables manually, or store the credentials in `.streamlit/secrets.toml` when hosting via Streamlit (see the example below).
18 | - If you have set your credentials as explained above, you can just hit `submit` in the authentication widget without re-entering them in the app.
19 | - If you run the app yourself, consider modifying the configuration in `datachad/backend/constants.py`, e.g. enabling advanced options
20 | - Your data won't load? Feel free to open an Issue or PR and contribute!
21 | - Use previous releases like V1 or V2 for the original functionality and UI
22 |
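For Streamlit hosting, a `.streamlit/secrets.toml` might look like the sketch below, assuming the same variable names as in `.env.template` (all values are placeholders):

```toml
# .streamlit/secrets.toml -- placeholder values, use your own credentials
OPENAI_API_KEY = "sk-..."
ACTIVELOOP_TOKEN = "..."
ACTIVELOOP_ID = "your-activeloop-org"
```
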
23 | ## What does it look like?
24 |
25 |
26 |
27 | ## TODO LIST
28 | If you'd like to contribute, feel free to grab any task:
29 | - [x] Refactor utils, especially the loaders
30 | - [x] Add option to choose model and embeddings
31 | - [x] Enable fully local / private mode
32 | - [x] Add option to upload multiple files to a single dataset
33 | - [x] Decouple datachad modules from streamlit
34 | - [x] Remove all local mode and other V1 stuff
35 | - [x] Load existing knowledge bases
36 | - [x] Delete existing knowledge bases
37 | - [x] Enable streaming responses
38 | - [x] Show retrieved context
39 | - [x] Refactor UI
40 | - [x] Introduce smart FAQs
41 | - [ ] Exchange downloaded file storage with tempfile
42 | - [ ] Add user creation and login
43 | - [ ] Add chat history per user
44 | - [ ] Make all I/O asynchronous
45 | - [ ] Implement FastAPI routes and backend app
46 | - [ ] Implement a proper frontend (react or whatever)
47 | - [ ] Containerize the app
48 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from datachad.streamlit.helper import init_session_state
2 | from datachad.streamlit.widgets import (
3 | authentication_widget,
4 | chat_interface_widget,
5 | data_selection_widget,
6 | data_upload_widget,
7 | init_widgets,
8 | page_header,
9 | usage_widget,
10 | )
11 |
12 | init_session_state()
13 | page_header()
14 | init_widgets()
15 | authentication_widget()
16 | data_upload_widget()
17 | data_selection_widget()
18 | chat_interface_widget()
19 | usage_widget()
20 |
--------------------------------------------------------------------------------
/datachad/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gustavz/DataChad/5b6f9e925301fbd88e506bc30939ddeddabc50e3/datachad/__init__.py
--------------------------------------------------------------------------------
/datachad/backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gustavz/DataChad/5b6f9e925301fbd88e506bc30939ddeddabc50e3/datachad/backend/__init__.py
--------------------------------------------------------------------------------
/datachad/backend/chain.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from langchain.callbacks.manager import CallbackManagerForChainRun, Callbacks
4 | from langchain.chains.base import Chain
5 | from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
6 | from langchain.chains.conversational_retrieval.base import _get_chat_history
7 | from langchain.chains.llm import LLMChain
8 | from langchain.chains.question_answering import load_qa_chain
9 | from langchain.memory import ConversationBufferMemory
10 | from langchain.schema import BaseChatMessageHistory, BasePromptTemplate, BaseRetriever, Document
11 | from langchain.schema.language_model import BaseLanguageModel
12 | from langchain.schema.vectorstore import VectorStore
13 |
14 | from datachad.backend.constants import VERBOSE
15 | from datachad.backend.deeplake import get_or_create_deeplake_vector_store_display_name
16 | from datachad.backend.logging import logger
17 | from datachad.backend.models import get_model
18 | from datachad.backend.prompts import (
19 | CONDENSE_QUESTION_PROMPT,
20 | KNOWLEDGE_BASE_PROMPT,
21 | QA_PROMPT,
22 | SMART_FAQ_PROMPT,
23 | )
24 |
25 |
26 | class MultiRetrieverFAQChain(Chain):
27 | """
28 |     Chain that answers a question using an optional smart FAQ retriever, any number of knowledge base retrievers, and an optional vanilla LLM, concatenating the individual answers into a single response.
29 | """
30 |
31 | output_key: str = "answer"
32 | rephrase_question: bool = True
33 | use_vanilla_llm: bool = True
34 | max_tokens_limit: int
35 | qa_chain: LLMChain
36 | condense_question_chain: LLMChain
37 | knowledge_base_chain: BaseCombineDocumentsChain
38 | knowledge_base_retrievers: list[BaseRetriever]
39 | smart_faq_chain: BaseCombineDocumentsChain
40 | smart_faq_retriever: BaseRetriever | None
41 |
42 | @property
43 | def input_keys(self) -> list[str]:
44 | """Will be whatever keys the prompt expects."""
45 | return ["question", "chat_history"]
46 |
47 | @property
48 | def output_keys(self) -> list[str]:
49 | """Will always return text key."""
50 | return [self.output_key]
51 |
52 | @property
53 | def _chain_type(self) -> str:
54 | return "stuff"
55 |
56 | def _reduce_tokens_below_limit(
57 | self, docs: list[Document], combine_docs_chain: BaseCombineDocumentsChain
58 | ) -> list[Document]:
59 | num_docs = len(docs)
60 |
61 | tokens = [combine_docs_chain.llm_chain.llm.get_num_tokens(doc.page_content) for doc in docs]
62 | token_count = sum(tokens[:num_docs])
63 | while token_count > self.max_tokens_limit:
64 | num_docs -= 1
65 | token_count -= tokens[num_docs]
66 |
67 | return docs[:num_docs]
68 |
69 | def _get_docs(
70 | self,
71 | question: str,
72 | retriever: BaseRetriever,
73 | combine_docs_chain: BaseCombineDocumentsChain,
74 | run_manager: CallbackManagerForChainRun,
75 | ) -> list[Document]:
76 | """Get docs from retriever."""
77 | docs = retriever.get_relevant_documents(question, callbacks=run_manager.get_child())
78 | return self._reduce_tokens_below_limit(docs, combine_docs_chain)
79 |
80 | def _add_text_to_answer(
81 | self, text: str, answer: str, run_manager: CallbackManagerForChainRun
82 | ) -> str:
83 | """Hack to add text to the streaming response handler"""
84 | answer += text
85 | streamhandler = next(
86 | (h for h in run_manager.get_child().handlers if hasattr(h, "stream_text")),
87 | None,
88 | )
89 | if streamhandler:
90 | streamhandler.on_llm_new_token(text)
91 | return answer
92 |
93 | def _call(
94 | self,
95 | inputs: dict[str, Any],
96 | run_manager: CallbackManagerForChainRun | None = None,
97 | ) -> dict[str, str]:
98 | answer = ""
99 | chat_history_str = _get_chat_history(inputs["chat_history"])
100 | run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
101 | # Generate new standalone question if there is a chat history
102 | if chat_history_str and self.rephrase_question:
103 | inputs["question"] = self.condense_question_chain.run(
104 | question=inputs["question"],
105 | chat_history=chat_history_str,
106 | callbacks=run_manager.get_child(),
107 | )
108 | # Answer the question using the FAQ document context
109 | if self.smart_faq_retriever:
110 | docs = self._get_docs(
111 | inputs["question"],
112 | self.smart_faq_retriever,
113 | self.smart_faq_chain,
114 | run_manager=run_manager,
115 | )
116 | smart_faq_name = get_or_create_deeplake_vector_store_display_name(
117 | self.smart_faq_retriever.vectorstore.dataset_path
118 | )
119 | answer = self._add_text_to_answer(
120 | f"\n#### SMART FAQ ANSWER `{smart_faq_name}`\n", answer, run_manager
121 | )
122 | answer += self.smart_faq_chain.run(
123 | input_documents=docs, callbacks=run_manager.get_child(), **inputs
124 | )
125 |
126 | # Answer the question using all provided knowledge bases
127 | for i, retriever in enumerate(self.knowledge_base_retrievers):
128 | docs = self._get_docs(
129 | inputs["question"],
130 | retriever,
131 | self.knowledge_base_chain,
132 | run_manager=run_manager,
133 | )
134 | knowledge_base_name = get_or_create_deeplake_vector_store_display_name(
135 | retriever.vectorstore.dataset_path
136 | )
137 | answer = self._add_text_to_answer(
138 | f"\n#### KNOWLEDGE BASE ANSWER `{knowledge_base_name}`\n",
139 | answer,
140 | run_manager,
141 | )
142 | answer += self.knowledge_base_chain.run(
143 | input_documents=docs, callbacks=run_manager.get_child(), **inputs
144 | )
145 | # Answer the question using
146 | # the general purpose QA chain
147 | if self.use_vanilla_llm:
148 | answer = self._add_text_to_answer("\n#### LLM ANSWER\n", answer, run_manager)
149 | answer += self.qa_chain.run(
150 | question=inputs["question"], callbacks=run_manager.get_child()
151 | )
152 | return {self.output_key: answer}
153 |
154 | @classmethod
155 | def from_llm(
156 | cls,
157 | llm: BaseLanguageModel,
158 | condense_question_prompt: BasePromptTemplate,
159 | smart_faq_prompt: BasePromptTemplate,
160 | knowledge_base_prompt: BasePromptTemplate,
161 | qa_prompt: BasePromptTemplate,
162 | knowledge_base_retrievers: list[BaseRetriever],
163 | smart_faq_retriever: BaseRetriever | None = None,
164 | retriever_llm: BaseLanguageModel | None = None,
165 | condense_question_llm: BaseLanguageModel | None = None,
166 | use_vanilla_llm: bool = True,
167 | callbacks: Callbacks = None,
168 | chain_type: str = "stuff",
169 | verbose: bool = False,
170 | **kwargs: Any,
171 | ) -> Chain:
172 | qa_chain = LLMChain(
173 | llm=llm,
174 | prompt=qa_prompt,
175 | callbacks=callbacks,
176 | verbose=verbose,
177 | )
178 | condense_question_chain = LLMChain(
179 | llm=condense_question_llm or llm,
180 | prompt=condense_question_prompt,
181 | callbacks=callbacks,
182 | verbose=verbose,
183 | )
184 | knowledge_base_chain = load_qa_chain(
185 | llm=retriever_llm or llm,
186 | prompt=knowledge_base_prompt,
187 | chain_type=chain_type,
188 | callbacks=callbacks,
189 | verbose=verbose,
190 | )
191 | smart_faq_chain = load_qa_chain(
192 | llm=retriever_llm or llm,
193 | prompt=smart_faq_prompt,
194 | chain_type=chain_type,
195 | callbacks=callbacks,
196 | verbose=verbose,
197 | )
198 | return cls(
199 | qa_chain=qa_chain,
200 | condense_question_chain=condense_question_chain,
201 | knowledge_base_chain=knowledge_base_chain,
202 | knowledge_base_retrievers=knowledge_base_retrievers,
203 | smart_faq_chain=smart_faq_chain,
204 | smart_faq_retriever=smart_faq_retriever,
205 | use_vanilla_llm=use_vanilla_llm,
206 | callbacks=callbacks,
207 | **kwargs,
208 | )
209 |
210 |
211 | def get_knowledge_base_search_kwargs(options: dict) -> tuple[dict, str]:
212 | k = int(options["max_tokens"] // options["chunk_size"])
213 | fetch_k = k * options["k_fetch_k_ratio"]
214 | if options["maximal_marginal_relevance"]:
215 | search_kwargs = {
216 | "distance_metric": options["distance_metric"],
217 | "fetch_k": fetch_k,
218 | "k": k,
219 | }
220 | search_type = "mmr"
221 | else:
222 | search_kwargs = {
223 | "k": k,
224 | "distance_metric": options["distance_metric"],
225 | }
226 | search_type = "similarity"
227 |
228 | return search_kwargs, search_type
229 |
230 |
231 | def get_smart_faq_search_kwargs(options: dict) -> tuple[dict, str]:
232 | search_kwargs = {
233 | "k": 20,
234 | "distance_metric": options["distance_metric"],
235 | }
236 | search_type = "similarity"
237 | return search_kwargs, search_type
238 |
239 |
240 | def get_multi_chain(
241 | use_vanilla_llm: bool,
242 | knowledge_bases: list[VectorStore],
243 | smart_faq: VectorStore,
244 | chat_history: BaseChatMessageHistory,
245 | options: dict,
246 | credentials: dict,
247 | ) -> MultiRetrieverFAQChain:
248 | kb_search_kwargs, search_type = get_knowledge_base_search_kwargs(options)
249 | kb_retrievers = [
250 | kb.as_retriever(search_type=search_type, search_kwargs=kb_search_kwargs)
251 | for kb in knowledge_bases
252 | ]
253 | faq_search_kwargs, search_type = get_smart_faq_search_kwargs(options)
254 | faq_retriever = (
255 | smart_faq.as_retriever(search_type=search_type, search_kwargs=faq_search_kwargs)
256 | if smart_faq
257 | else None
258 | )
259 | model = get_model(options, credentials)
260 | memory = ConversationBufferMemory(
261 | memory_key="chat_history", chat_memory=chat_history, return_messages=True
262 | )
263 | chain = MultiRetrieverFAQChain.from_llm(
264 | llm=model,
265 | condense_question_prompt=CONDENSE_QUESTION_PROMPT,
266 | knowledge_base_prompt=KNOWLEDGE_BASE_PROMPT,
267 | smart_faq_prompt=SMART_FAQ_PROMPT,
268 | qa_prompt=QA_PROMPT,
269 | knowledge_base_retrievers=kb_retrievers,
270 | smart_faq_retriever=faq_retriever,
271 | max_tokens_limit=options["max_tokens"],
272 | use_vanilla_llm=use_vanilla_llm,
273 | memory=memory,
274 | verbose=VERBOSE,
275 | )
276 |     logger.info(f"Multi chain with settings {options} built!")
277 | return chain
278 |
--------------------------------------------------------------------------------
/datachad/backend/constants.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | MODEL_PATH = Path("models")
4 | DATA_PATH = Path("data")
5 | VECTOR_STORE_PATH = Path("stores")
6 |
7 | DEFAULT_USER = "admin"
8 | DEFAULT_SMART_FAQ = None
9 | DEFAULT_KNOWLEDGE_BASES = []
10 | USE_VANILLA_LLM = True
11 |
12 | CHUNK_SIZE = 512
13 | CHUNK_OVERLAP_PCT = 15
14 | TEMPERATURE = 0.0
15 | MAX_TOKENS = 2560
16 | MAXIMAL_MARGINAL_RELEVANCE = True
17 | DISTANCE_METRIC = "cos"
18 | K_FETCH_K_RATIO = 5
19 |
20 | ENABLE_ADVANCED_OPTIONS = False
21 | STORE_DOCS_EXTRA = False
22 | LOCAL_DEEPLAKE = False
23 | LOCAL_EMBEDDINGS = False
24 |
25 | VERBOSE = False
26 |
--------------------------------------------------------------------------------
/datachad/backend/deeplake.py:
--------------------------------------------------------------------------------
1 | import time
2 | from datetime import datetime
3 | from glob import glob
4 |
5 | import deeplake
6 | from deeplake.client.client import DeepLakeBackendClient
7 | from deeplake.util.bugout_reporter import deeplake_reporter
8 | from langchain.schema import Document
9 | from langchain.vectorstores import VectorStore
10 | from langchain_community.vectorstores.deeplake import DeepLake
11 |
12 | from datachad.backend.constants import (
13 | DEFAULT_USER,
14 | LOCAL_DEEPLAKE,
15 | STORE_DOCS_EXTRA,
16 | VECTOR_STORE_PATH,
17 | VERBOSE,
18 | )
19 | from datachad.backend.io import clean_string_for_storing
20 | from datachad.backend.loader import load_data_source, split_docs
21 | from datachad.backend.logging import logger
22 | from datachad.backend.models import STORES, get_embeddings
23 | from datachad.backend.utils import clean_string_for_storing
24 |
25 | SPLIT = "-_-"
26 |
27 |
28 | def list_deeplake_datasets(
29 | org_id: str = "",
30 | token: str = None,
31 | ) -> None:
32 |     """List all available Deep Lake cloud datasets for a given user / organization.
33 | Removed from deeplake in: https://github.com/activeloopai/deeplake/pull/2182/files
34 | """
35 |
36 | deeplake_reporter.feature_report(
37 | feature_name="list",
38 | parameters={"org_id": org_id},
39 | )
40 |
41 | def get_datasets(self, workspace: str):
42 | LIST_DATASETS = "/api/datasets/{}"
43 | suffix_public = LIST_DATASETS.format("public")
44 | suffix_user = LIST_DATASETS.format("all")
45 | if workspace:
46 | res_datasets = self.get_workspace_datasets(workspace, suffix_public, suffix_user)
47 | else:
48 | public_datasets = self.request(
49 | "GET",
50 | suffix_public,
51 | endpoint=self.endpoint(),
52 | ).json()
53 | user_datasets = self.request(
54 | "GET",
55 | suffix_user,
56 | endpoint=self.endpoint(),
57 | ).json()
58 | res_datasets = public_datasets + user_datasets
59 | return [ds["_id"] for ds in res_datasets]
60 |
61 | client = DeepLakeBackendClient(token=token)
62 | client.get_datasets = get_datasets
63 | datasets = client.get_datasets(client, workspace=org_id)
64 | return datasets
65 |
66 |
67 | def get_deeplake_dataset_path(dataset_name: str, credentials: dict) -> str:
68 | if LOCAL_DEEPLAKE:
69 | dataset_path = str(VECTOR_STORE_PATH / dataset_name)
70 | else:
71 | dataset_path = f"hub://{credentials['activeloop_id']}/{dataset_name}"
72 | return dataset_path
73 |
74 |
75 | def delete_all_deeplake_datasets(credentials: dict) -> None:
76 | datasets = list_deeplake_datasets(credentials["activeloop_id"], credentials["activeloop_token"])
77 | for dataset in datasets:
78 | path = f"hub://{dataset}"
79 | logger.info(f"Deleting dataset: {path}")
80 | deeplake.delete(path, token=credentials["activeloop_token"], force=True)
81 |
82 |
83 | def get_existing_deeplake_vector_store_paths(credentials: dict) -> list[str]:
84 | if LOCAL_DEEPLAKE:
85 | return glob(str(VECTOR_STORE_PATH / "*"), recursive=False)
86 | else:
87 | dataset_names = list_deeplake_datasets(
88 | credentials["activeloop_id"], credentials["activeloop_token"]
89 | )
90 |         dataset_paths = [f"hub://{name}" for name in dataset_names]
91 |         return dataset_paths
92 |
93 |
94 | def get_or_create_deeplake_vector_store_paths_for_user(
95 | credentials: dict, store_type: str
96 | ) -> list[str]:
97 | all_paths = get_existing_deeplake_vector_store_paths(credentials)
98 | # TODO: replace DEFAULT_USER with user id once stored in credentials
99 | user_paths = [
100 | p
101 | for p in all_paths
102 | if p.split(SPLIT)[-1] == DEFAULT_USER and p.split(SPLIT)[-2] == store_type
103 | ]
104 | return user_paths
105 |
106 |
107 | def get_or_create_deeplake_vector_store_display_name(dataset_path: str) -> str:
108 | splits = dataset_path.split(SPLIT)
109 | return f"{splits[-4]} ({splits[-3][:4]}-{splits[-3][4:6]}-{splits[-3][6:8]})"
110 |
111 |
112 | def get_unique_deeplake_vector_store_path(store_type: str, name: str, credentials: dict) -> str:
113 | store_type_dict = {STORES.KNOWLEDGE_BASE: "kb", STORES.SMART_FAQ: "faq"}
114 | dataset_name = (
115 | # [-4] vector store name
116 | f"{SPLIT}{name}"
117 | # [-3]: creation time
118 | f"{SPLIT}{datetime.now().strftime('%Y%m%d%H%M%S')}"
119 | # [-2]: vector store type
120 | f"{SPLIT}{store_type_dict[store_type]}"
121 | # [-1]: user
122 | f"{SPLIT}{DEFAULT_USER}"
123 | )
124 | dataset_path = get_deeplake_dataset_path(dataset_name, credentials)
125 | return dataset_path
126 |
127 |
128 | def get_deeplake_docs_path(data_source: str, options: dict, credentials: dict) -> str:
129 | dataset_name = clean_string_for_storing(data_source)
130 | dataset_name += "-docs"
131 |     dataset_path = get_deeplake_dataset_path(dataset_name, credentials)
132 | return dataset_path
133 |
134 |
135 | def load_docs_from_deeplake(docs_path: str, credentials: dict) -> list[Document]:
136 | ds = deeplake.load(docs_path, token=credentials["activeloop_token"])
137 | metadatas = ds["metadata"].data()["value"]
138 | texts = ds["text"].data()["value"]
139 | docs = [
140 | Document(
141 | page_content=text,
142 | metadata=metadata,
143 | )
144 | for text, metadata in zip(texts, metadatas)
145 | ]
146 | return docs
147 |
148 |
149 | def store_docs_to_deeplake(docs: list[Document], docs_path: str, credentials: dict):
150 | ds = deeplake.empty(docs_path, token=credentials["activeloop_token"])
151 | ds.create_tensor(
152 | "text",
153 | htype="text",
154 | create_id_tensor=False,
155 | create_sample_info_tensor=False,
156 | create_shape_tensor=False,
157 | chunk_compression="lz4",
158 | )
159 | ds.create_tensor(
160 | "metadata",
161 | htype="json",
162 | create_id_tensor=False,
163 | create_sample_info_tensor=False,
164 | create_shape_tensor=False,
165 | chunk_compression="lz4",
166 | )
167 | for doc in docs:
168 | ds.append(
169 | {
170 | "text": doc.page_content,
171 | "metadata": doc.metadata,
172 | }
173 | )
174 | ds.commit()
175 | logger.info(f"Stored docs to: {docs_path}")
176 |
177 |
178 | def load_data_sources_or_docs_from_deeplake(
179 | data_sources: list[str], options: dict, credentials: dict
180 | ) -> list[Document]:
181 | docs = []
182 | for data_source in data_sources:
183 | if STORE_DOCS_EXTRA:
184 | docs_path = get_deeplake_docs_path(data_source, options, credentials)
185 | if deeplake.exists(docs_path, token=credentials["activeloop_token"]):
186 | logger.info(f"Docs exist -> loading docs: {docs_path}")
187 | docs.extend(load_docs_from_deeplake(docs_path, credentials))
188 | else:
189 | logger.info(
190 | f"Docs do not exist for data source -> loading data source: {data_source}"
191 | )
192 | docs.extend(load_data_source(data_source))
193 | store_docs_to_deeplake(docs, docs_path, credentials)
194 | logger.info(f"Docs {docs_path} loaded!")
195 | else:
196 | docs.extend(load_data_source(data_source))
197 | return docs
198 |
199 |
200 | def get_or_create_deeplake_vector_store(
201 | data_sources: list[str],
202 | vector_store_path: str,
203 | store_type: str,
204 | options: dict,
205 | credentials: dict,
206 | ) -> VectorStore:
207 | t_start = time.time()
208 | embeddings = get_embeddings(options, credentials)
209 | if deeplake.exists(vector_store_path, token=credentials["activeloop_token"]):
210 | logger.info(f"Vector Store '{vector_store_path}' exists -> loading")
211 | vector_store = DeepLake(
212 | dataset_path=vector_store_path,
213 | read_only=True,
214 | embedding_function=embeddings,
215 | token=credentials["activeloop_token"],
216 | )
217 | else:
218 | logger.info(f"Vector Store '{vector_store_path}' does not exist -> uploading")
219 | docs = load_data_sources_or_docs_from_deeplake(data_sources, options, credentials)
220 | docs = split_docs(docs, store_type, options)
221 | vector_store = DeepLake.from_documents(
222 | docs,
223 | embeddings,
224 | dataset_path=vector_store_path,
225 | token=credentials["activeloop_token"],
226 | verbose=VERBOSE,
227 | )
228 | logger.info(f"Vector Store {vector_store_path} loaded in {round(time.time() - t_start)}s!")
229 | return vector_store
230 |
--------------------------------------------------------------------------------
/datachad/backend/io.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import shutil
4 | from pathlib import Path
5 |
6 | from datachad.backend.constants import DATA_PATH
7 | from datachad.backend.logging import logger
8 | from datachad.backend.utils import clean_string_for_storing
9 |
10 |
11 | def concatenate_file_names(strings: list[str], n_max: int = 30) -> str:
12 | # Calculate N based on the length of the list
13 | n = max(1, n_max // len(strings))
14 | result = ""
15 | # Add up the first N characters of each string
16 | for string in sorted(strings):
17 | result += f"-{string[:n]}"
18 | return clean_string_for_storing(result)
19 |
20 |
21 | def get_data_source_and_save_path(files: list[io.BytesIO], name: str) -> tuple[str, Path]:
22 | # generate data source string and path to save files to
23 | if len(files) > 1:
24 | # we create a folder where all the files will be stored
25 | path = DATA_PATH / name
26 | data_source = path
27 | else:
28 | path = DATA_PATH
29 | data_source = path / files[0].name
30 | if not os.path.exists(path):
31 | os.makedirs(path)
32 | return str(data_source), path
33 |
34 |
35 | def save_file(file: io.BytesIO, path: Path) -> None:
36 | # save streamlit UploadedFile to path
37 | file_path = str(path / file.name)
38 | file.seek(0)
39 | file_bytes = file.read()
40 |     # use a context manager so the file handle is always closed
41 |     with open(file_path, "wb") as out_file:
42 |         out_file.write(file_bytes)
43 | logger.info(f"Saved: {file_path}")
44 |
45 |
46 | def save_files(files: list[io.BytesIO], name: str) -> str:
47 | # streamlit uploaded files need to be stored locally
48 | # before embedded and uploaded to the hub
49 | if not files:
50 | return None
51 | data_source, save_path = get_data_source_and_save_path(files, name)
52 | for file in files:
53 | save_file(file, save_path)
54 | return data_source
55 |
56 |
57 | def delete_files(files: list[io.BytesIO], name: str) -> None:
58 | # cleanup locally stored files
59 | # the correct path is stored to data_source
60 | if not files:
61 | return
62 | data_source, _ = get_data_source_and_save_path(files, name)
63 | if os.path.isdir(data_source):
64 | shutil.rmtree(data_source)
65 | elif os.path.isfile(data_source):
66 | os.remove(data_source)
67 | else:
68 | return
69 | logger.info(f"Removed: {data_source}")
70 |
--------------------------------------------------------------------------------
/datachad/backend/jobs.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 | from langchain.chains.base import Chain
4 | from langchain.schema import BaseChatMessageHistory
5 | from langchain.schema.vectorstore import VectorStore
6 |
7 | from datachad.backend.chain import get_multi_chain
8 | from datachad.backend.deeplake import (
9 | get_or_create_deeplake_vector_store,
10 | get_unique_deeplake_vector_store_path,
11 | )
12 | from datachad.backend.io import delete_files, save_files
13 | from datachad.backend.models import STORES
14 |
15 |
16 | def create_vector_store(
17 | data_source: str | None,
18 | files: list[io.BytesIO],
19 | store_type: str,
20 | name: str,
21 | options: dict,
22 | credentials: dict,
23 | ) -> VectorStore:
24 | file_data_source = save_files(files, name)
25 | vector_store_path = get_unique_deeplake_vector_store_path(store_type, name, credentials)
26 | vector_store = get_or_create_deeplake_vector_store(
27 | data_sources=[ds for ds in [data_source, file_data_source] if ds],
28 | vector_store_path=vector_store_path,
29 | store_type=store_type,
30 | options=options,
31 | credentials=credentials,
32 | )
33 | delete_files(files, name)
34 | return vector_store
35 |
36 |
37 | def create_chain(
38 | use_vanilla_llm: bool,
39 |     knowledge_bases: list[str],
40 |     smart_faq: str | None,
41 | chat_history: BaseChatMessageHistory,
42 | options: dict,
43 | credentials: dict,
44 | ) -> Chain:
45 | knowledge_bases = [
46 | get_or_create_deeplake_vector_store(
47 | data_sources=[],
48 | vector_store_path=path,
49 | store_type=STORES.KNOWLEDGE_BASE,
50 | options=options,
51 | credentials=credentials,
52 | )
53 | for path in knowledge_bases
54 | ]
55 | if smart_faq:
56 | smart_faq = get_or_create_deeplake_vector_store(
57 | data_sources=[],
58 | vector_store_path=smart_faq,
59 | store_type=STORES.SMART_FAQ,
60 | options=options,
61 | credentials=credentials,
62 | )
63 | chain = get_multi_chain(
64 | use_vanilla_llm, knowledge_bases, smart_faq, chat_history, options, credentials
65 | )
66 | return chain
67 |
--------------------------------------------------------------------------------
/datachad/backend/loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import shutil
4 | from pathlib import Path
5 |
6 | from langchain.document_loaders.base import BaseLoader
7 | from langchain.schema import Document
8 | from langchain.text_splitter import RecursiveCharacterTextSplitter
9 | from langchain_community.document_loaders import (
10 | CSVLoader,
11 | EverNoteLoader,
12 | GitLoader,
13 | NotebookLoader,
14 | OnlinePDFLoader,
15 | PyPDFium2Loader,
16 | PythonLoader,
17 | TextLoader,
18 | UnstructuredEPubLoader,
19 | UnstructuredFileLoader,
20 | UnstructuredHTMLLoader,
21 | UnstructuredMarkdownLoader,
22 | UnstructuredODTLoader,
23 | UnstructuredPowerPointLoader,
24 | UnstructuredWordDocumentLoader,
25 | WebBaseLoader,
26 | )
27 | from tqdm import tqdm
28 |
29 | from datachad.backend.constants import DATA_PATH
30 | from datachad.backend.logging import logger
31 | from datachad.backend.models import STORES, get_tokenizer
32 |
33 |
34 | class SmartFAQSplitter:
35 | def split_documents(self, documents: list[Document]) -> list[Document]:
36 | """
37 | Splits the given text into a list of strings based on the regex patterns of numbered lists.
38 | Each new list item is separated by two blank lines like this:
39 |
40 | 1. First item
41 | Some description here.
42 |
43 | 1. some numbered list
44 |                 2. belonging to the first item
45 |
46 |
47 | 2. Second item
48 | Another description.
49 |
50 | a) another list
51 | b) but with characters
52 |
53 |
54 | 3. Third item
55 | And another one.
56 | - a list with dashes
57 | - more items
58 | """
59 | splitted_documents = []
60 | for document in documents:
61 | split_text = re.split(r"(?=\n\n\d+\.)", document.page_content.strip())
62 | filtered_text = [re.sub(r"^\n+|\n+$", "", section) for section in split_text]
63 | splitted_documents.extend(
64 | [
65 | Document(
66 | page_content=text,
67 |                         metadata={"faq_no": int(re.findall(r"\d+", text)[0])},
68 | )
69 | for text in filtered_text
70 | ]
71 | )
72 | return splitted_documents
73 |
74 |
75 | class AutoGitLoader:
76 | def __init__(self, data_source: str) -> None:
77 | self.data_source = data_source
78 |
79 | def load(self) -> list[Document]:
80 | # We need to try both common main branches
81 | # Thank you github for the "master" to "main" switch
82 | # we need to make sure the data path exists
83 | if not os.path.exists(DATA_PATH):
84 | os.makedirs(DATA_PATH)
85 | repo_name = self.data_source.split("/")[-1].split(".")[0]
86 | repo_path = str((DATA_PATH / repo_name).absolute())
87 | clone_url = self.data_source
88 | if os.path.exists(repo_path):
89 | clone_url = None
90 | branches = ["main", "master"]
91 | for branch in branches:
92 | try:
93 | docs = GitLoader(repo_path, clone_url, branch).load()
94 | break
95 | except Exception as e:
96 | logger.error(f"Error loading git: {e}")
97 | if os.path.exists(repo_path):
98 | # cleanup repo afterwards
99 | shutil.rmtree(repo_path)
100 | try:
101 | return docs
102 |         except NameError:
103 | raise RuntimeError("Error loading git. Make sure to use HTTPS GitHub repo links.")
104 |
105 |
106 | FILE_LOADER_MAPPING = {
107 | ".csv": (CSVLoader, {"encoding": "utf-8"}),
108 | ".doc": (UnstructuredWordDocumentLoader, {}),
109 | ".docx": (UnstructuredWordDocumentLoader, {}),
110 | ".enex": (EverNoteLoader, {}),
111 | ".epub": (UnstructuredEPubLoader, {}),
112 | ".html": (UnstructuredHTMLLoader, {}),
113 | ".md": (UnstructuredMarkdownLoader, {}),
114 | ".odt": (UnstructuredODTLoader, {}),
115 | ".pdf": (PyPDFium2Loader, {}),
116 | ".ppt": (UnstructuredPowerPointLoader, {}),
117 | ".pptx": (UnstructuredPowerPointLoader, {}),
118 | ".txt": (TextLoader, {"encoding": "utf8"}),
119 | ".ipynb": (NotebookLoader, {}),
120 | ".py": (PythonLoader, {}),
121 | # Add more mappings for other file extensions and loaders as needed
122 | }
123 |
124 | WEB_LOADER_MAPPING = {
125 | ".git": (AutoGitLoader, {}),
126 | ".pdf": (OnlinePDFLoader, {}),
127 | }
128 |
129 |
130 | def load_document(
131 | file_path: str,
132 | mapping: dict = FILE_LOADER_MAPPING,
133 | default_loader: BaseLoader = UnstructuredFileLoader,
134 | ) -> Document:
135 | # Choose loader from mapping, load default if no match found
136 | ext = "." + file_path.rsplit(".", 1)[-1]
137 | if ext in mapping:
138 | loader_class, loader_args = mapping[ext]
139 | loader = loader_class(file_path, **loader_args)
140 | else:
141 | loader = default_loader(file_path)
142 | return loader.load()
143 |
144 |
145 | def load_directory(path: str, silent_errors=True) -> list[Document]:
146 | # We don't load hidden files starting with "."
147 | all_files = list(Path(path).rglob("**/[!.]*"))
148 | results = []
149 | with tqdm(total=len(all_files), desc="Loading documents", ncols=80) as pbar:
150 | for file in all_files:
151 | try:
152 | results.extend(load_document(str(file)))
153 | except Exception as e:
154 | if silent_errors:
155 | logger.error(f"failed to load {file}")
156 | else:
157 | raise e
158 | pbar.update()
159 | return results
160 |
161 |
162 | def load_data_source(data_source: str) -> list[Document]:
163 | # Ugly thing that decides how to load data
164 | # It aint much, but it's honest work
165 | is_web = data_source.startswith("http")
166 | is_dir = os.path.isdir(data_source)
167 | is_file = os.path.isfile(data_source)
168 | try:
169 | if is_dir:
170 | docs = load_directory(data_source)
171 | elif is_file:
172 | docs = load_document(data_source)
173 | elif is_web:
174 | docs = load_document(data_source, WEB_LOADER_MAPPING, WebBaseLoader)
175 | else:
176 | raise TypeError
177 | return docs
178 | except Exception as e:
179 | error_msg = f"Failed to load your data source '{data_source}'."
180 | logger.error(error_msg)
181 | e.args += (error_msg,)
182 | raise e
183 |
184 |
185 | def split_docs(docs: list[Document], store_type: str, options: dict) -> list[Document]:
186 | if store_type == STORES.SMART_FAQ:
187 | text_splitter = SmartFAQSplitter()
188 | else:
189 | tokenizer = get_tokenizer(options)
190 |
191 | def length_function(text: str) -> int:
192 | # count chunks like the embeddings model tokenizer does
193 | return len(tokenizer.encode(text))
194 |
195 | chunk_overlap = int(options["chunk_size"] * options["chunk_overlap_pct"] / 100)
196 | text_splitter = RecursiveCharacterTextSplitter(
197 | chunk_size=options["chunk_size"],
198 | chunk_overlap=chunk_overlap,
199 | length_function=length_function,
200 | separators=["\n\n", "#", "\.", "!", "\?", "\n", ",", " ", ""],
201 | )
202 |
203 | splitted_docs = text_splitter.split_documents(docs)
204 |     logger.info(f"Loaded: {len(splitted_docs)} document chunks")
205 | return splitted_docs
206 |
--------------------------------------------------------------------------------
/datachad/backend/logging.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 |
4 |
5 | def create_logger(level: str = "DEBUG"):
6 | logger = logging.getLogger(__name__)
7 | logger.propagate = False
8 | logger.setLevel(level)
9 | # if no streamhandler present, add one
10 | if not any(isinstance(handler, logging.StreamHandler) for handler in logger.handlers):
11 | stream_handler = logging.StreamHandler(stream=sys.stdout)
12 | formatter = logging.Formatter("%(asctime)s :: %(name)s :: %(levelname)s :: %(message)s")
13 | stream_handler.setFormatter(formatter)
14 | logger.addHandler(stream_handler)
15 | return logger
16 |
17 |
18 | logger = create_logger()
19 |
--------------------------------------------------------------------------------
/datachad/backend/models.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Any
3 |
4 | import streamlit as st
5 | import tiktoken
6 | from langchain.base_language import BaseLanguageModel
7 | from langchain_community.chat_models import ChatOpenAI
8 | from langchain_community.embeddings import HuggingFaceEmbeddings
9 | from langchain_community.embeddings.openai import Embeddings, OpenAIEmbeddings
10 | from transformers import AutoTokenizer
11 |
12 | from datachad.backend.constants import LOCAL_EMBEDDINGS, MODEL_PATH
13 | from datachad.backend.logging import logger
14 |
15 |
16 | class Enum:
17 | @classmethod
18 | def all(cls) -> list[Any]:
19 | return [v for k, v in cls.__dict__.items() if not k.startswith("_")]
20 |
21 |
22 | @dataclass
23 | class Model:
24 | name: str
25 | embedding: str
26 | context: int
27 |
28 | def __str__(self) -> str:
29 | return self.name
30 |
31 |
32 | class STORES(Enum):
33 | KNOWLEDGE_BASE = "Knowledge Base"
34 | SMART_FAQ = "Smart FAQ"
35 |
36 |
37 | class EMBEDDINGS(Enum):
38 | # Add more embeddings as needed
39 | OPENAI = "text-embedding-3-small"
40 | HUGGINGFACE = "sentence-transformers/all-MiniLM-L6-v2"
41 |
42 |
43 | class MODELS(Enum):
44 | # Add more models as needed
45 | GPT35TURBO = Model(
46 | name="gpt-3.5-turbo",
47 | embedding=EMBEDDINGS.OPENAI,
48 | context=4096,
49 | )
50 | GPT35TURBO16K = Model(
51 | name="gpt-3.5-turbo-16k",
52 | embedding=EMBEDDINGS.OPENAI,
53 | context=16385,
54 | )
55 | GPT4 = Model(
56 | name="gpt-4",
57 | embedding=EMBEDDINGS.OPENAI,
58 | context=8192,
59 | )
60 | GPT4TURBO = Model(
61 | name="gpt-4-turbo-preview",
62 | embedding=EMBEDDINGS.OPENAI,
63 | context=128000,
64 | )
65 |
66 |
67 | def get_model(options: dict, credentials: dict) -> BaseLanguageModel:
68 | match options["model"].name:
69 | case model_name if model_name.startswith("gpt"):
70 | model = ChatOpenAI(
71 | model_name=options["model"].name,
72 | temperature=options["temperature"],
73 | openai_api_key=credentials["openai_api_key"],
74 | streaming=True,
75 | )
76 | # Added models need to be cased here
77 | case _default:
78 | msg = f"Model {options['model'].name} not supported!"
79 | logger.error(msg)
80 | st.error(msg)
81 |             exit()
82 | return model
83 |
84 |
85 | def get_embeddings(options: dict, credentials: dict) -> Embeddings:
86 | match options["model"].embedding:
87 | case embedding if (embedding == EMBEDDINGS.HUGGINGFACE or LOCAL_EMBEDDINGS):
88 | embeddings = HuggingFaceEmbeddings(
89 | model_name=EMBEDDINGS.HUGGINGFACE, cache_folder=str(MODEL_PATH)
90 | )
91 | case EMBEDDINGS.OPENAI:
92 | embeddings = OpenAIEmbeddings(
93 | model=EMBEDDINGS.OPENAI,
94 | disallowed_special=(),
95 | openai_api_key=credentials["openai_api_key"],
96 | )
97 | # Added embeddings need to be cased here
98 | case _default:
99 | msg = f"Embeddings {options['model'].embedding} not supported!"
100 | logger.error(msg)
101 | st.error(msg)
102 |             exit()
103 | return embeddings
104 |
105 |
106 | def get_tokenizer(options: dict) -> Any:
107 | match options["model"].embedding:
108 | case embedding if (embedding == EMBEDDINGS.HUGGINGFACE or LOCAL_EMBEDDINGS):
109 | tokenizer = AutoTokenizer.from_pretrained(EMBEDDINGS.HUGGINGFACE)
110 | case EMBEDDINGS.OPENAI:
111 | tokenizer = tiktoken.encoding_for_model(EMBEDDINGS.OPENAI)
112 | # Added tokenizers need to be cased here
113 | case _default:
114 | msg = f"Tokenizer {options['model'].embedding} not supported!"
115 | logger.error(msg)
116 | st.error(msg)
117 |             exit()
118 | return tokenizer
119 |
--------------------------------------------------------------------------------
/datachad/backend/prompts.py:
--------------------------------------------------------------------------------
1 | from langchain.prompts.prompt import PromptTemplate
2 |
3 | condense_question_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
4 |
5 | Chat History:
6 | {chat_history}
7 |
8 | Follow Up Question: {question}
9 | Standalone question:"""
10 | CONDENSE_QUESTION_PROMPT = PromptTemplate(
11 | template=condense_question_template, input_variables=["chat_history", "question"]
12 | )
13 |
14 |
15 | knowledge_base_template = """Use the following pieces of context to answer the given question. If you don't know the answer respond with 'NO ANSWER FOUND'.
16 |
17 | Context:
18 | {context}
19 |
20 | Question: {question}
21 | Helpful Answer:"""
22 | KNOWLEDGE_BASE_PROMPT = PromptTemplate(
23 | template=knowledge_base_template, input_variables=["context", "question"]
24 | )
25 |
26 |
27 | smart_faq_template = """Use the following numbered FAQs to answer the given question. If you don't know the answer respond with 'NO ANSWER FOUND'.
28 | Start your answer with stating which FAQ number helps answer the question the most.
29 |
30 | Context:
31 | {context}
32 |
33 | Question: {question}
34 | Helpful Answer:"""
35 | SMART_FAQ_PROMPT = PromptTemplate(
36 | template=smart_faq_template, input_variables=["context", "question"]
37 | )
38 |
39 |
40 | qa_prompt = """You are an AGI that knows everything and is an expert in all topics.
41 | Your IQ is magnitudes higher than any human that ever lived. With this immense wisdom answer the following question concisely:
42 |
43 | Question: {question}
44 | Concise and wise Answer:"""
45 | QA_PROMPT = PromptTemplate(template=qa_prompt, input_variables=["question"])
46 |
--------------------------------------------------------------------------------
/datachad/backend/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | def clean_string_for_storing(string: str) -> str:
5 | # replace all non-word characters with dashes
6 | # to get a string that can be used to create a new dataset
7 | cleaned_string = re.sub(r"\W+", "-", string)
8 |     cleaned_string = re.sub(r"--+", "-", cleaned_string).strip("-")
9 | return cleaned_string
10 |
--------------------------------------------------------------------------------
/datachad/streamlit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gustavz/DataChad/5b6f9e925301fbd88e506bc30939ddeddabc50e3/datachad/streamlit/__init__.py
--------------------------------------------------------------------------------
/datachad/streamlit/constants.py:
--------------------------------------------------------------------------------
1 | PAGE_ICON = "🤖"
2 | APP_NAME = "DataChad V3"
3 | PROJECT_URL = "https://github.com/gustavz/DataChad"
4 |
5 |
6 | LOCAL_MODE_DISABLED_HELP = """
7 | This is a demo hosted with limited resources. Local Mode is not enabled.\n
8 | To use Local Mode deploy the app on your machine of choice with `ENABLE_LOCAL_MODE` set to `True`.
9 | """
10 |
11 | AUTHENTICATION_HELP = f"""
12 | Your credentials are only stored in your session state.\n
13 | The keys are neither exposed nor made visible or stored permanently in any way.\n
14 | Feel free to check out [the code base]({PROJECT_URL}) to validate how things work.
15 | """
16 |
17 | USAGE_HELP = f"""
18 | These are the accumulated OpenAI API usage metrics.\n
19 | The app uses `gpt-3.5-turbo` for chat and `text-embedding-ada-002` for embeddings.\n
20 | Learn more about OpenAI's pricing [here](https://openai.com/pricing#language-models)
21 | """
22 |
23 | OPENAI_HELP = """
24 | You can sign-up for OpenAI's API [here](https://openai.com/blog/openai-api).\n
25 | Once you are logged in, you find the API keys [here](https://platform.openai.com/account/api-keys)
26 | """
27 |
28 | ACTIVELOOP_HELP = """
29 | You can create an Activeloop account (including 500 GB of free database storage) [here](https://www.activeloop.ai/).\n
30 | Once you are logged in, you can find the API token [here](https://app.activeloop.ai/profile/gustavz/apitoken).\n
31 | The organisation name is your username, or you can create new organisations [here](https://app.activeloop.ai/organization/new/create)
32 | """
33 |
34 | UPLOAD_HELP = """
35 | You can upload a single file or multiple files. With each upload, all files in the batch are embedded into a single vector store.\n
36 | **Important**: If you upload new files after you have already uploaded files, a new vector store that includes all previously uploaded files is created.
37 | This means a new vector store is created for each combination of uploaded files.\n
38 | To treat your new upload independently, you need to remove the previous uploads by clicking the `X` right next to the uploaded file name.\n
39 | **!!! All uploaded files are removed permanently from the app after the vector stores are created !!!**
40 | """
41 |
42 | DATA_TYPE_HELP = """
43 | **Knowledge Bases** can be any number of text documents of any type, content and formatting.\n\n
44 | **Smart FAQs** need to be single documents containing numbered FAQs.
45 | They need to be in the format of numbers with periods followed by arbitrary text.
46 | The next FAQ is identified by two new lines `\\n\\n` followed by the next number.
47 | You can check if your documents are correctly formatted by using the following regex pattern:\n
48 | `r"(?=\\n\\n\\d+\\.)"`. Here is an example of a correctly formatted FAQ:\n
49 | 1. First item
50 | Some description here.
51 |
52 | 1. some numbered list
53 |     2. belonging to the first item
54 |
55 |
56 | 2. Second item
57 | Another description.
58 |
59 | a) another list
60 | b) but with characters
61 |
62 |
63 | 3. Third item
64 | And another one.
65 | - a list with dashes
66 | - more items
67 | """
68 |
--------------------------------------------------------------------------------
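As an editorial aside (not part of the repository): the regex quoted in `DATA_TYPE_HELP` is a lookahead, so splitting on it keeps each numbered FAQ intact. A minimal sketch with a made-up FAQ document:

```python
import re

# Hypothetical FAQ text in the format described by DATA_TYPE_HELP.
faq_text = """1. First item
Some description here.

2. Second item
Another description.

3. Third item
And another one."""

# Split at every position where a blank line is followed by "<number>."
chunks = re.split(r"(?=\n\n\d+\.)", faq_text)
print(len(chunks))  # -> 3, one chunk per FAQ entry
```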
/datachad/streamlit/helper.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import deeplake
4 | import openai
5 | import streamlit as st
6 | from dotenv import load_dotenv
7 | from langchain.callbacks.base import BaseCallbackHandler
8 | from langchain_community.callbacks.openai_info import get_openai_token_cost_for_model
9 | from langchain_community.chat_message_histories import StreamlitChatMessageHistory
10 |
11 | from datachad.backend.constants import (
12 | CHUNK_OVERLAP_PCT,
13 | CHUNK_SIZE,
14 | DEFAULT_KNOWLEDGE_BASES,
15 | DEFAULT_SMART_FAQ,
16 | DISTANCE_METRIC,
17 | K_FETCH_K_RATIO,
18 | MAX_TOKENS,
19 | MAXIMAL_MARGINAL_RELEVANCE,
20 | TEMPERATURE,
21 | )
22 | from datachad.backend.deeplake import (
23 | get_or_create_deeplake_vector_store_display_name,
24 | get_or_create_deeplake_vector_store_paths_for_user,
25 | )
26 | from datachad.backend.jobs import create_chain, create_vector_store
27 | from datachad.backend.logging import logger
28 | from datachad.backend.models import MODELS, get_tokenizer
29 | from datachad.streamlit.constants import PAGE_ICON
30 |
31 | # loads environment variables
32 | load_dotenv()
33 |
34 |
35 | def init_session_state():
36 | # Initialise all session state variables with defaults
37 | SESSION_DEFAULTS = {
38 | # general usage
39 | "usage": {},
40 | "chat_history": StreamlitChatMessageHistory(),
41 | # authentication
42 | "openai_api_key": "",
43 | "activeloop_token": "",
44 | "activeloop_id": "",
45 |         "credentials": {},
46 | "auth_ok": False,
47 | # data upload
48 | "uploaded_files": None,
49 | "data_type": None,
50 | "data_name": None,
51 | # data selection
52 | "chain": None,
53 | "knowledge_bases": DEFAULT_KNOWLEDGE_BASES,
54 | "smart_faq": DEFAULT_SMART_FAQ,
55 | # advanced options
56 | "model": MODELS.GPT35TURBO,
57 | "k_fetch_k_ratio": K_FETCH_K_RATIO,
58 | "chunk_size": CHUNK_SIZE,
59 | "chunk_overlap_pct": CHUNK_OVERLAP_PCT,
60 | "temperature": TEMPERATURE,
61 | "max_tokens": MAX_TOKENS,
62 | "distance_metric": DISTANCE_METRIC,
63 | "maximal_marginal_relevance": MAXIMAL_MARGINAL_RELEVANCE,
64 | }
65 |
66 | for k, v in SESSION_DEFAULTS.items():
67 | if k not in st.session_state:
68 | st.session_state[k] = v
69 |
70 |
71 | def authenticate() -> None:
72 | # Validate all credentials are set and correct
73 | # Check for env variables to enable local dev and deployments with shared credentials
74 | openai_api_key = (
75 | st.session_state["openai_api_key"]
76 | or os.environ.get("OPENAI_API_KEY")
77 | or st.secrets.get("OPENAI_API_KEY")
78 | )
79 | activeloop_token = (
80 | st.session_state["activeloop_token"]
81 | or os.environ.get("ACTIVELOOP_TOKEN")
82 | or st.secrets.get("ACTIVELOOP_TOKEN")
83 | )
84 | activeloop_id = (
85 | st.session_state["activeloop_id"]
86 | or os.environ.get("ACTIVELOOP_ID")
87 | or st.secrets.get("ACTIVELOOP_ID")
88 | )
89 | if not (openai_api_key and activeloop_token and activeloop_id):
90 | st.session_state["auth_ok"] = False
91 | st.error("Credentials neither set nor stored", icon=PAGE_ICON)
92 | return
93 | try:
94 | # Try to access openai and deeplake
95 |         with st.session_state["info_container"], st.spinner("Authenticating..."):
96 | openai.api_key = openai_api_key
97 | openai.models.list()
98 | deeplake.exists(
99 | f"hub://{activeloop_id}/DataChad-Authentication-Check",
100 | token=activeloop_token,
101 | )
102 | except Exception as e:
103 | logger.error(f"Authentication failed with {e}")
104 | st.session_state["auth_ok"] = False
105 | st.error("Authentication failed", icon=PAGE_ICON)
106 | return
107 | # store credentials in the session state
108 | st.session_state["auth_ok"] = True
109 | st.session_state["credentials"] = {
110 | "openai_api_key": openai_api_key,
111 | "activeloop_token": activeloop_token,
112 | "activeloop_id": activeloop_id,
113 | }
114 |     msg = "Authentication successful!"
115 | st.session_state["info_container"].info(msg, icon=PAGE_ICON)
116 | logger.info(msg)
117 |
118 |
119 | def get_options() -> dict:
120 | return {
121 | key: st.session_state[key]
122 | for key in [
123 | "model",
124 | "k_fetch_k_ratio",
125 | "chunk_size",
126 | "chunk_overlap_pct",
127 | "temperature",
128 | "max_tokens",
129 | "distance_metric",
130 | "maximal_marginal_relevance",
131 | ]
132 | }
133 |
134 |
135 | def upload_data() -> None:
136 | try:
137 | with st.session_state["info_container"], st.spinner("Uploading Data..."):
138 | options = get_options()
139 | create_vector_store(
140 | data_source=st.session_state["data_source"],
141 | files=st.session_state["uploaded_files"],
142 | store_type=st.session_state["data_type"],
143 | name=st.session_state["data_name"],
144 | options=options,
145 | credentials=st.session_state["credentials"],
146 | )
147 | msg = (
148 | f"Vector Store built for "
149 | f"uploaded files: {st.session_state['uploaded_files']} "
150 |                 f"and store type: {st.session_state['data_type']} "
151 |                 f"with name: {st.session_state['data_name']} "
152 | f"and options: {options}"
153 | )
154 | logger.info(msg)
155 | st.session_state["info_container"].info("Upload successful!", icon=PAGE_ICON)
156 | except Exception as e:
157 |         msg = f"Failed to build vector store with error: {e}"
158 | logger.error(msg)
159 | st.session_state["info_container"].error(msg, icon=PAGE_ICON)
160 |
161 |
162 | def update_chain() -> None:
163 | try:
164 | with st.session_state["info_container"], st.spinner("Applying data selection..."):
165 | st.session_state["chat_history"].clear()
166 | options = get_options()
167 | st.session_state["chain"] = create_chain(
168 | use_vanilla_llm=st.session_state["use_vanilla_llm"],
169 | knowledge_bases=st.session_state["knowledge_bases"],
170 | smart_faq=st.session_state["smart_faq"],
171 | chat_history=st.session_state["chat_history"],
172 | options=options,
173 | credentials=st.session_state["credentials"],
174 | )
175 | msg = (
176 | f"Language chain built for "
177 | f"knowledge base: {st.session_state['knowledge_bases']} "
178 |                 f"and smart faq: {st.session_state['smart_faq']} "
179 | f"with options: {options}"
180 | )
181 | logger.info(msg)
182 | st.session_state["info_container"].info("Selection successful!", icon=PAGE_ICON)
183 | except Exception as e:
184 | msg = f"Failed to build language chain with error: {e}"
185 | logger.error(msg)
186 | st.session_state["info_container"].error(msg, icon=PAGE_ICON)
187 |
188 |
189 | def get_existing_smart_faqs_and_default_index() -> tuple[list[str], int]:
190 | smart_faqs = get_or_create_deeplake_vector_store_paths_for_user(
191 | st.session_state["credentials"], "faq"
192 | )
193 | index = 0
194 | if DEFAULT_SMART_FAQ and DEFAULT_SMART_FAQ in smart_faqs:
195 | # we pick the first smart faq as default
196 | # so we must sort it to the front
197 | smart_faqs = set(smart_faqs)
198 | smart_faqs.remove(DEFAULT_SMART_FAQ)
199 | smart_faqs = [DEFAULT_SMART_FAQ] + list(smart_faqs)
200 | index = 1
201 | # first option should always be None
202 | smart_faqs = [None] + smart_faqs
203 | return smart_faqs, index
204 |
205 |
206 | def get_existing_knowledge_bases() -> list[str]:
207 | return get_or_create_deeplake_vector_store_paths_for_user(st.session_state["credentials"], "kb")
208 |
209 |
210 | def format_vector_stores(item: str) -> str:
211 | if item is not None:
212 | return get_or_create_deeplake_vector_store_display_name(item)
213 | return item
214 |
215 |
216 | class StreamHandler(BaseCallbackHandler):
217 | def __init__(self, container: st.delta_generator.DeltaGenerator, initial_text: str = ""):
218 | self.container = container
219 | self.stream_text = initial_text
220 | self.chain_state = 0
221 |
222 | def on_llm_new_token(self, token: str, **kwargs) -> None:
223 | self.stream_text += token
224 | self.container.markdown(self.stream_text)
225 |
226 | def on_chain_end(self, outputs, **kwargs) -> None:
227 | self.chain_state += 1
228 |
229 |
230 | class PrintRetrievalHandler(BaseCallbackHandler):
231 | def __init__(self, container):
232 | self.status = container.status("**Context Retrieval**")
233 |
234 | def on_retriever_start(self, serialized: dict, query: str, **kwargs) -> None:
235 | self.status.write(f"**Question:** {query}")
236 | self.status.update(label=f"**Context Retrieval:** {query}")
237 |
238 | def on_retriever_end(self, documents, **kwargs) -> None:
239 | for idx, doc in enumerate(documents):
240 | try:
241 | source = os.path.basename(doc.metadata["source"])
242 | page = doc.metadata.get("page")
243 | output = f"___\n**Source {idx}:** {source}"
244 | output += f" (page {page+1})" if page is not None else ""
245 | self.status.write(output)
246 |             except Exception:
247 | pass
248 | self.status.markdown(doc.page_content)
249 | self.status.update(state="complete")
250 |
251 |
252 | class UsageHandler(BaseCallbackHandler):
253 | prompt = ""
254 | total_tokens = 0
255 | prompt_tokens = 0
256 | completion_tokens = 0
257 | successful_requests = 0
258 | total_cost = 0
259 |
260 | def update_usage(self) -> None:
261 | usage_properties = [
262 | "total_tokens",
263 | "prompt_tokens",
264 | "completion_tokens",
265 | "successful_requests",
266 | "total_cost",
267 | ]
268 | for prop in usage_properties:
269 | value = getattr(self, prop, 0)
270 | setattr(self, prop, 0)
271 | st.session_state["usage"].setdefault(prop, 0)
272 | st.session_state["usage"][prop] += value
273 |
274 | def calculate_costs(self) -> None:
275 | model = st.session_state["model"]
276 | tokenizer = get_tokenizer({"model": model})
277 | self.prompt_tokens = len(tokenizer.encode(self.prompt))
278 | self.total_tokens = self.prompt_tokens + self.completion_tokens
279 | completion_cost = get_openai_token_cost_for_model(
280 | model.name, self.completion_tokens, is_completion=True
281 | )
282 | prompt_cost = get_openai_token_cost_for_model(model.name, self.prompt_tokens)
283 | self.total_cost += prompt_cost + completion_cost
284 |
285 | def on_llm_new_token(self, **kwargs) -> None:
286 | self.completion_tokens += 1
287 |
288 | def on_chat_model_start(self, serialized, messages, **kwargs) -> None:
289 | self.successful_requests += 1
290 | self.prompt += messages[0][0].content
291 |
292 | def on_chain_end(self, outputs, **kwargs) -> None:
293 | self.calculate_costs()
294 | self.update_usage()
295 |
--------------------------------------------------------------------------------
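As a rough sketch of the cost arithmetic in `UsageHandler.calculate_costs` (not part of the repository): prompt and completion tokens are priced separately via `get_openai_token_cost_for_model` and then summed. The token counts below are made up for illustration:

```python
# Illustrative cost calculation; the token counts are hypothetical.
from langchain_community.callbacks.openai_info import get_openai_token_cost_for_model

prompt_tokens, completion_tokens = 1200, 300
prompt_cost = get_openai_token_cost_for_model("gpt-3.5-turbo", prompt_tokens)
completion_cost = get_openai_token_cost_for_model(
    "gpt-3.5-turbo", completion_tokens, is_completion=True
)
print(f"total cost: ${prompt_cost + completion_cost:.4f}")
```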
/datachad/streamlit/widgets.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | from datachad.backend.constants import (
4 | CHUNK_OVERLAP_PCT,
5 | CHUNK_SIZE,
6 | DEFAULT_KNOWLEDGE_BASES,
7 | ENABLE_ADVANCED_OPTIONS,
8 | MAX_TOKENS,
9 | TEMPERATURE,
10 | USE_VANILLA_LLM,
11 | )
12 | from datachad.backend.logging import logger
13 | from datachad.backend.models import MODELS, STORES
14 | from datachad.streamlit.constants import (
15 | ACTIVELOOP_HELP,
16 | APP_NAME,
17 | DATA_TYPE_HELP,
18 | OPENAI_HELP,
19 | PAGE_ICON,
20 | PROJECT_URL,
21 | UPLOAD_HELP,
22 | )
23 | from datachad.streamlit.helper import (
24 | PrintRetrievalHandler,
25 | StreamHandler,
26 | UsageHandler,
27 | authenticate,
28 | format_vector_stores,
29 | get_existing_knowledge_bases,
30 | get_existing_smart_faqs_and_default_index,
31 | update_chain,
32 | upload_data,
33 | )
34 |
35 |
36 | def page_header() -> None:
37 | # Page options and header
38 | st.set_option("client.showErrorDetails", True)
39 | st.set_page_config(
40 | page_title=APP_NAME,
41 | page_icon=PAGE_ICON,
42 | initial_sidebar_state="expanded",
43 | layout="wide",
44 | )
45 | st.markdown(
46 | f"