├── .github
    └── ISSUE_TEMPLATE
    │   └── default_issue.yml
├── .gitignore
├── .streamlit
    └── config.toml
├── Dockerfile
├── LICENSE
├── README.md
├── README.zh-TW.md
├── app.py
├── components
    ├── __init__.py
    ├── document_processor.py
    ├── response_handler.py
    ├── sidebar.py
    └── theme.py
├── docGPT
    ├── __init__.py
    ├── agent.py
    ├── check_api_key.py
    └── docGPT.py
├── docker-compose.yml
├── model
    ├── __init__.py
    └── data_connection.py
├── requirements.txt
└── static
    └── img
        ├── 2023-07-03-22-38-08.png
        ├── 2023-08-24-15-02-11.png
        ├── 2023-08-29-13-39-00.png
        ├── 2023-09-06-14-56-20.png
        ├── chatbot.png
        ├── chatbot_v2.1.png
        ├── chatbot_v2.png
        ├── docGPT.gif
        ├── repos_logo.png
        └── repos_logo_v1.png


/.github/ISSUE_TEMPLATE/default_issue.yml:
--------------------------------------------------------------------------------
 1 | name: Default Issue
 2 | description: Raise an issue that wouldn't be covered by the other templates.
 3 | title: "Issue: <Please write a comprehensive title after the 'Issue: ' prefix>"
 4 | labels: [Default Issue Template]
 5 | 
 6 | body:
 7 |   - type: textarea
 8 |     attributes:
 9 |       label: "Issue you'd like to raise."
10 |       description: >
11 |         Please describe the issue you'd like to raise as clearly as possible.
12 |         Make sure to include any relevant links or references.
13 | 
14 |   - type: textarea
15 |     attributes:
16 |       label: "Suggestion:"
17 |       description: >
18 |         Please outline a suggestion to improve the issue here.
19 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | .chroma/
  2 | data/
  3 | External_Data_Pipeline/
  4 | PDF/Omren
  5 | config.py
  6 | main.py
  7 | note.md
  8 | 
  9 | # Byte-compiled / optimized / DLL files
 10 | __pycache__/
 11 | *.py[cod]
 12 | *$py.class
 13 | 
 14 | # C extensions
 15 | *.so
 16 | 
 17 | # Distribution / packaging
 18 | .Python
 19 | build/
 20 | develop-eggs/
 21 | dist/
 22 | downloads/
 23 | eggs/
 24 | .eggs/
 25 | lib/
 26 | lib64/
 27 | parts/
 28 | sdist/
 29 | var/
 30 | wheels/
 31 | share/python-wheels/
 32 | *.egg-info/
 33 | .installed.cfg
 34 | *.egg
 35 | MANIFEST
 36 | 
 37 | # PyInstaller
 38 | #  Usually these files are written by a python script from a template
 39 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 40 | *.manifest
 41 | *.spec
 42 | 
 43 | # Installer logs
 44 | pip-log.txt
 45 | pip-delete-this-directory.txt
 46 | 
 47 | # Unit test / coverage reports
 48 | htmlcov/
 49 | .tox/
 50 | .nox/
 51 | .coverage
 52 | .coverage.*
 53 | .cache
 54 | nosetests.xml
 55 | coverage.xml
 56 | *.cover
 57 | *.py,cover
 58 | .hypothesis/
 59 | .pytest_cache/
 60 | cover/
 61 | 
 62 | # Translations
 63 | *.mo
 64 | *.pot
 65 | 
 66 | # Django stuff:
 67 | *.log
 68 | local_settings.py
 69 | db.sqlite3
 70 | db.sqlite3-journal
 71 | 
 72 | # Flask stuff:
 73 | instance/
 74 | .webassets-cache
 75 | 
 76 | # Scrapy stuff:
 77 | .scrapy
 78 | 
 79 | # Sphinx documentation
 80 | docs/_build/
 81 | 
 82 | # PyBuilder
 83 | .pybuilder/
 84 | target/
 85 | 
 86 | # Jupyter Notebook
 87 | .ipynb_checkpoints
 88 | 
 89 | # IPython
 90 | profile_default/
 91 | ipython_config.py
 92 | 
 93 | # pyenv
 94 | #   For a library or package, you might want to ignore these files since the code is
 95 | #   intended to run in multiple environments; otherwise, check them in:
 96 | # .python-version
 97 | 
 98 | # pipenv
 99 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
101 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
102 | #   install all needed dependencies.
103 | Pipfile.lock
104 | Pipfile
105 | 
106 | # poetry
107 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
109 | #   commonly ignored for libraries.
110 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 | 
113 | # pdm
114 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | #   in version control.
118 | #   https://pdm.fming.dev/#use-with-ide
119 | .pdm.toml
120 | 
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122 | __pypackages__/
123 | 
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 | 
128 | # SageMath parsed files
129 | *.sage.py
130 | 
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | ENV/
137 | env.bak/
138 | venv.bak/
139 | 
140 | # Spyder project settings
141 | .spyderproject
142 | .spyproject
143 | 
144 | # Rope project settings
145 | .ropeproject
146 | 
147 | # mkdocs documentation
148 | /site
149 | 
150 | # mypy
151 | .mypy_cache/
152 | .dmypy.json
153 | dmypy.json
154 | 
155 | # Pyre type checker
156 | .pyre/
157 | 
158 | # pytype static type analyzer
159 | .pytype/
160 | 
161 | # Cython debug symbols
162 | cython_debug/
163 | 
164 | # PyCharm
165 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
168 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
169 | #.idea/
170 | 


--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [server]
2 | enableStaticServing = true


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9-slim
 2 | 
 3 | # Set the working directory in the container.
 4 | WORKDIR /app
 5 | 
 6 | # Copy the project's requirements file into the container
 7 | COPY requirements.txt ./requirements.txt
 8 | # Upgrade pip for the latest features and install the project's Python dependencies.
 9 | RUN pip install --upgrade pip && pip install -r requirements.txt
10 | 
11 | # Copy the entire project into the container.
12 | # This may include all code, assets, and configuration files required to run the application.
13 | COPY . /app
14 | 
15 | # Expose port 8501
16 | EXPOSE 8501
17 | 
18 | # Define the default command: launch the app with the Streamlit CLI.
19 | CMD ["streamlit", "run", "app.py"]
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 JunXiang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |     <img style="width: 50%; height: auto;" src="./static/img/repos_logo.png" alt="Chatbot Image">
  3 | </p>
  4 | 
  5 | 
  6 | [English](./README.md) | [中文版](./README.zh-TW.md)
  7 | 
  8 | Free `docGPT` allows you to chat with your documents (`.pdf`, `.docx`, `.csv`, `.txt`), without the need for any keys or fees.
  9 | 
 10 | Additionally, you can follow this document to deploy the app anywhere.
 11 | 
 12 | - Table of Contents
 13 |     - [Introduction](#introduction)
 14 |     - [Features](#features)
 15 |     - [What's LangChain?](#whats-langchain)
 16 |     - [How to Use docGPT?](#how-to-use-docgpt)
 17 |     - [How to Develop a docGPT with Streamlit?](#how-to-develop-a-docgpt-with-streamlit)
 18 |     - [Advanced - How to build a better model in langchain](#advanced---how-to-build-a-better-model-in-langchain)
 19 | 
 20 | * Main Development Software and Packages:
 21 |     * `Python 3.10.11`
 22 |     * `Langchain 0.0.218`
 23 |     * `Streamlit 1.22.0`
 24 |     * [more](./requirements.txt)
 25 | 
 26 | If you like this project, please give it a ⭐`Star` to support the developers~
 27 | 
 28 | ### 📚Introduction
 29 | 
 30 | * Upload a document (`.pdf`, `.docx`, `.csv`, `.txt`) from your local device, or provide a link to one, and query `docGPT` about its content. For example, you can ask GPT to summarize an article.
 31 | 
 32 | * Provide two models:
 33 |   * `gpt4free`
 34 |     * **Completely free, allowing users to use the application without the need for API keys or payments.**
 35 |     * Select the `Provider`. For more details about `gpt4free`, please refer to the [source project](https://github.com/xtekky/gpt4free).
 36 |   * `openai`
 37 |     * **Requires an `openai_api_key`, which you can obtain from [this link](https://platform.openai.com/).**
 38 |     * If you have a `serpapi_key`, AI responses can include Google search results.
 39 | 
 40 | <p align="center">
 41 | <img src="static/img/2023-09-06-14-56-20.png" width="80%">
 42 | </p>
 43 | 
 44 | ---
 45 | 
 46 | ### 🧨Features
 47 | 
 48 | - **`gpt4free` Integration**: Everyone can use `docGPT` for **free** without needing an OpenAI API key.
 49 | - **Support for docx, pdf, csv, and txt files**: Users can upload PDF, Word, CSV, and txt files.
 50 | - **Direct Document URL Input**: Users can input Document `URL` links for parsing without uploading document files (see the demo).
 51 | - **Langchain Agent**: Enables AI to answer current questions and achieve Google search-like functionality.
 52 | - **User-Friendly Environment**: Easy-to-use interface for simple operations.
 53 | 
 54 | ---
 55 | 
 56 | ### 🦜️What's LangChain?
 57 | 
 58 | * LangChain is a framework for developing applications powered by language models. It supports the following applications:
 59 |     1. Connecting LLM models with external data sources.
 60 |     2. Interactive communication with LLM models.
 61 | 
 62 | * For more details about LangChain, refer to the [official documentation](https://github.com/hwchase17/langchain).
 63 | 
 64 | **For questions that ChatGPT can't answer, turn to LangChain!**
 65 | 
 66 | LangChain fills in the gaps left by ChatGPT. Through the following example, you can understand the power of LangChain:
 67 | 
 68 | > In cases where ChatGPT can't solve mathematical problems or answer questions about events after 2020 (e.g., "Who is the president in 2023?"):
 69 | >
 70 | > * For mathematical problems: There's a math-LLM model dedicated to handling math queries.
 71 | > * For modern topics: You can use Google search.
 72 | >
 73 | > To create a comprehensive AI model, we need to combine "ChatGPT," "math-LLM," and "Google search" tools.
 74 | >
 75 | > In the non-AI era, we used `if...else...` to categorize user queries and had users select the question type through UI.
 76 | >
 77 | > In the AI era, users should be able to directly ask questions without preselecting the question type. With LangChain's agent:
 78 | >  * We provide tools to the agent, e.g., `tools = ['chatgpt', 'math-llm', 'google-search']`.
 79 | >  * Tools can include chains designed using LangChain, such as using a retrievalQA chain to answer questions from documents.
 80 | >  * **The agent automatically decides which tool to use based on user queries** (fully automated).
 81 | 
 82 | Through LangChain, you can create a universal AI model or tailor it for business applications.
 83 | 
 84 | 
 85 | ---
 86 | 
 87 | ### 🚩How to Use docGPT?
 88 | 
 89 | 1. 🎬Visit the [application](https://docgpt-app.streamlit.app/).
 90 | 
 91 | 2. 🔑Enter your `API_KEY` (optional in Version 3, as you can use the `gpt4free` free model):
 92 |    - `OpenAI API KEY`: Ensure you have available usage.
 93 |    - `SERPAPI API KEY`: Required if you want to query content not present in the Document.
 94 | 
 95 | 3. 📁Upload a Document file (choose one method)
 96 |     * Method 1: Browse and upload your own `.pdf`, `.docx`, `.csv`, `.txt` file from your local machine.
 97 |     * Method 2: Enter the Document `URL` link directly.
 98 | 
 99 | 4. 🚀Start asking questions!
100 | 
101 | ![docGPT](https://github.com/Lin-jun-xiang/docGPT-streamlit/blob/main/static/img/docGPT.gif?raw=true)
102 | 
103 | > [!WARNING]
104 | > Due to resource limitations in the free version of Streamlit Cloud, the application may experience crashes when used by multiple users simultaneously ([Oh no!](https://github.com/Lin-jun-xiang/docGPT-langchain/issues/2)). If you encounter this problem, feel free to report it in the issue tracker, and the developers will restart the application.
105 | 
106 | ---
107 | 
108 | ### 🧠How to Develop a docGPT with Streamlit?
109 | 
110 | A step-by-step tutorial to quickly build your own chatGPT!
111 | 
112 | First, clone the repository using `git clone https://github.com/Lin-jun-xiang/docGPT-streamlit.git`.
113 | 
114 | There are a few methods:
115 | 
116 | * **Local development without docker**:
117 |     * Download the required packages for development.
118 |         ```
119 |         pip install -r requirements.txt
120 |         ```
121 | 
122 |     * Start the service in the project's root directory.
123 |         ```
124 |         streamlit run ./app.py
125 |         ```
126 | 
127 |     * Start exploring! Your server will now be running at `http://localhost:8501`.
128 | 
129 | * **Local development with docker**:
130 |     * Start the service using Docker Compose
131 |         ```
132 |         docker-compose up
133 |         ```
134 | 
135 |         Your server will now be running at `http://localhost:8501`. You can interact with `docGPT` or run your tests as you would normally.
136 |     
137 |     * To stop the Docker containers, simply run:
138 |         ```
139 |         docker-compose down
140 |         ```
141 | 
142 | * **Streamlit Community Cloud for free** deployment, management, and sharing of applications:
143 |    - Place your application in a public GitHub repository (ensure you have `requirements.txt`).
144 |    - Log in to [share.streamlit.io](https://share.streamlit.io/).
145 |    - Click "Deploy an App," then paste your GitHub URL.
146 |    - Complete deployment and share your [application](https://docgpt-app.streamlit.app/).
147 | 
148 | Due to the limitations of the free version of Streamlit Cloud and its reliance on server resources, `docGPT` may experience some latency. We recommend deploying it locally for a smoother experience.
149 | 
150 | ---
151 | 
152 | ### 💬Advanced - How to build a better model in langchain
153 | 
154 | To build a powerful docGPT model in LangChain, consider these tips to enhance performance:
155 | 
156 | 1. **Language Model**
157 | 
158 |     Select an appropriate LLM model, such as OpenAI's `gpt-3.5-turbo` or other models. Experiment with different models to find the best fit for your use case.
159 | 
160 |     ```python
161 |     # ./docGPT/docGPT.py
162 |     llm = ChatOpenAI(
163 |     temperature=0.2,
164 |     max_tokens=2000,
165 |     model_name='gpt-3.5-turbo'
166 |     )
167 |     ```
168 | 
169 |     Please note that there is no best or worst model. You need to try multiple models to find the one that suits your use case the best. For more OpenAI models, please refer to the [documentation](https://platform.openai.com/docs/models).
170 |     
171 |     (Some models support up to 16,000 tokens!)
172 | 
173 | 2. **PDF Loader**
174 | 
175 |     Choose a suitable PDF loader. Consider using `PyMuPDF` for fast text extraction and `PDFPlumber` for extracting text from tables.
176 |     
177 |     ([official Langchain documentation](https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf))
178 | 
179 |     * `PyPDF`: Simple and easy to use.
180 |     * `PyMuPDF`: Reads the document very **quickly** and provides additional metadata such as page numbers and document dates.
181 |     * `PDFPlumber`: Can **extract text within tables**. Similar to PyMuPDF, it provides metadata but takes longer to parse.
182 | 
183 |     If your document contains multiple tables and important information is within those tables, it is recommended to try `PDFPlumber`, which may give you unexpected results!
184 | 
185 |     Please do not overlook this detail, as without correctly parsing the text from the document, even the most powerful LLM model would be useless!
186 | 
187 | 3. **Tracking Token Usage**
188 | 
189 |     Implement token usage tracking with callbacks in LangChain to monitor token and API key usage during the QA chain process.
190 | 
191 |     When using `chain.run`, you can try using the [method](https://python.langchain.com/docs/modules/model_io/models/llms/how_to/token_usage_tracking) provided by Langchain to track token usage here:
192 | 
193 |     ```python
194 |     from langchain.callbacks import get_openai_callback
195 | 
196 |     with get_openai_callback() as callback:
197 |         response = self.qa_chain.run(query)
198 | 
199 |     print(callback)
200 | 
201 |     # Result of print
202 |     """
203 |     chain...
204 |     ...
205 |     > Finished chain.
206 |     Total Tokens: 1506
207 |     Prompt Tokens: 1350
208 |     Completion Tokens: 156
209 |     Total Cost (USD): $0.03012
210 |     """
211 |     ```
212 | 
213 | <a href="#top">Back to top</a>


--------------------------------------------------------------------------------
/README.zh-TW.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |     <img style="width: 50%; height: auto;" src="./static/img/repos_logo.png" alt="Chatbot Image">
  3 | </p>
  4 | 
  5 | [English](./README.md) | [中文版](./README.zh-TW.md)
  6 | 
  7 | 免費的`docGPT`允許您與您的文件 (`.pdf`, `.docx`, `.csv`, `.txt`) 進行對話，無需任何金鑰或費用。
  8 | 
  9 | 此外，您也可以依照本文件的說明，將程序部署在任何地方。
 10 | 
 11 | - 目錄
 12 |     - [Introduction](#introduction)
 13 |     - [Features](#features)
 14 |     - [What's LangChain?](#whats-langchain)
 15 |     - [How to Use docGPT?](#how-to-use-docgpt)
 16 |     - [How to develope a docGPT with streamlit?](#how-to-develope-a-docgpt-with-streamlit)
 17 |     - [Advanced - How to build a better model in langchain](#advanced---how-to-build-a-better-model-in-langchain)
 18 | 
 19 | * 主要開發軟體與套件:
 20 |     * `Python 3.10.11`
 21 |     * `Langchain 0.0.218`
 22 |     * `Streamlit 1.22.0`
 23 |     * [more](./requirements.txt)
 24 | 
 25 | 如果您喜歡這個專案，請給予⭐`Star`以支持開發者~
 26 | 
 27 | ### 📚Introduction
 28 | 
 29 | * 上傳來自本地的 Document 連結 (`.pdf`, `.docx`, `.csv`, `.txt`)，並且向 `docGPT` 詢問有關 Document 內容。例如: 您可以請 GPT 幫忙總結文章
 30 | * 提供兩種模型選擇:
 31 |   * `gpt4free`
 32 |     * **完全免費，"允許使用者在無需輸入 API 金鑰或付款的情況下使用該應用程序"**
 33 |     * 需選擇 `Provider`。有關 `gpt4free` 的更多詳細信息，請參閱[源專案](https://github.com/xtekky/gpt4free)
 34 |   * `openai`
 35 |     * **須具備** `openai_api_key`，您可以從此[鏈接](https://platform.openai.com/)獲取金鑰
 36 |     * 若具備 `serpapi_key`，AI 的回應可以包括 Google 搜索結果
 37 | 
 38 | <p align="center">
 39 | <img src="static/img/2023-09-06-14-56-20.png" width="80%">
 40 | </p>
 41 | 
 42 | ---
 43 | 
 44 | ### 🧨Features
 45 | 
 46 | - **`gpt4free` 整合**：任何人都可以免費使用 GPT4，無需輸入 OpenAI API 金鑰。
 47 | - **支援 docx, pdf, csv, txt 檔案**: 可以上傳 PDF, Word, CSV, txt 檔
 48 | - **直接輸入 Document 網址**：使用者可以直接輸入 Document URL 進行解析，無需從本地上傳檔案(如下方demo所示)。
 49 | - **Langchain Agent**：AI 能夠回答當前問題，實現類似 Google 搜尋功能。
 50 | - **簡易操作環境**：友善的界面，操作簡便
 51 | 
 52 | ---
 53 | 
 54 | ### 🦜️What's LangChain?
 55 | 
 56 | * LangChain 是一個用於**開發由語言模型支持的應用程序的框架**。它支持以下應用程序
 57 |     1. 將 LLM 模型與外部數據源進行連接
 58 |     2. 允許與 LLM 模型進行交互
 59 | 
 60 | * 有關 langchain 的介紹，建議查看官方文件、[Github源專案](https://github.com/hwchase17/langchain)
 61 | 
 62 | 
 63 | **ChatGPT 無法回答的問題，交給 Langchain 實現!**
 64 | 
 65 | LangChain 填補了 ChatGPT 的不足之處。通過以下示例，您可以理解 LangChain 的威力：
 66 | 
 67 | > 在 ChatGPT 無法解答數學問題或回答 2020 年以後的問題（例如“2023 年的總統是誰？”）的情況下：
 68 | >
 69 | > * 數學問題: 有專門處理數學問題的 math-LLM 模型
 70 | > * 現今問題: 使用 Google 搜索
 71 | >
 72 | > 要創建一個全面的 AI 模型，我們需要結合 "ChatGPT"、"math-LLM" 和 "Google 搜索" 工具。
 73 | >
 74 | > 在非 AI 時代，我們將使用 `if...else...` 將用戶查詢進行分類，讓用戶選擇問題類型（通過 UI）。
 75 | >
 76 | > 在 AI 時代，用戶應能夠直接提問。通過 LangChain 的 agent：
 77 | >
 78 | >  * 我們向 agent 提供工具，例如 `tools = ['chatgpt', 'math-llm', 'google-search']`
 79 | >  * 工具可以包括使用 LangChain 設計的 chains，例如使用 `retrievalQA chain` 回答來自文檔的問題。
 80 | >  * agent 根據用戶查詢自動決定使用哪個工具（完全自動化）。
 81 | 
 82 | 通過 LangChain，您可以創建通用的 AI 模型，也可以為**商業應用**量身定制。
 83 | 
 84 | ---
 85 | 
 86 | ### 🚩How to Use docGPT?
 87 | 
 88 | 1. 🎬前往[應用程序](https://docgpt-app.streamlit.app/)
 89 | 
 90 | 2. 🔑輸入您的 `API_KEY` (在版本 3 中為可選，您可以使用 `gpt4free` 免費模型):
 91 |     * `OpenAI API KEY`: 確保還有可用的使用次數。
 92 |     * `SERPAPI API KEY`: 如果您要查詢 Document 中不存在的內容，則需要使用此金鑰。
 93 | 
 94 | 3. 📁上傳來自本地的 Document 檔案 (選擇一個方法)
 95 |     * 方法一: 從本地機瀏覽並上傳自己的 `.pdf`, `.docx`, `.csv` or `.txt` 檔
 96 |     * 方法二: 輸入 Document URL 連結
 97 | 
 98 | 4. 🚀開始提問 ! 
 99 | 
100 | ![RGB_cleanup](https://github.com/Lin-jun-xiang/docGPT-streamlit/blob/main/static/img/docGPT.gif?raw=true)
101 | 
102 | > [!WARNING]
103 | > 由於免費版 streamlit cloud 資源限制，該程序在多人同時使用時，容易引發崩潰([Oh no!](https://github.com/Lin-jun-xiang/docGPT-langchain/issues/2))，若遇上該問題歡迎到 Issue 提醒開發者，開發者會重啟程序。
104 | 
105 | 
106 | ---
107 | 
108 | ### 🧠How to develope a docGPT with streamlit?
109 | 
110 | 手把手教學，讓您快速建立一個屬於自己的 chatGPT !
111 | 
112 | 首先請進行 `git clone https://github.com/Lin-jun-xiang/docGPT-streamlit.git`
113 | 
114 | 有以下幾種方法:
115 | 
116 | * 於**本地開發方式(不使用docker)**:
117 |     * 下載開發需求套件
118 |         ```
119 |         pip install -r requirements.txt
120 |         ```
121 | 
122 |     * 於專案根目錄啟動服務
123 |         ```
124 |         streamlit run ./app.py
125 |         ```
126 | 
127 |     * 開始體驗! 您的服務會運行在 `http://localhost:8501`.
128 | 
129 | * 於**本地開發方式(使用docker)**:
130 |     * 使用 Docker Compose 啟動服務
131 |         ```
132 |         docker-compose up
133 |         ```
134 | 
135 |         您的服務會運行在 `http://localhost:8501`. 您可以開始使用 `docGPT` 應用程序
136 |     
137 |     * 停止服務運行
138 |         ```
139 |         docker-compose down
140 |         ```
141 | 
142 | * 使用 Streamlit Community **Cloud 免費部署**、管理和共享應用程序
143 |     * 將您的應用程序放在公共 GitHub 存儲庫中（確保有 `requirements.txt`！）
144 |     * 登錄[share.streamlit.io](https://share.streamlit.io/)
145 |     * 單擊“部署應用程序”，然後粘貼您的 GitHub URL
146 |     * 完成部署[應用程序](https://docgpt-app.streamlit.app/)
147 | 
148 | 由於 `docGPT` 是使用 streamlit cloud 免費版部署，受限於設備關係會有不少延遲，建議使用者可以使用本地部署方式來體驗。
149 | 
150 | ---
151 | 
152 | ### 💬Advanced - How to build a better model in langchain
153 | 
154 | 要在 LangChain 中構建功能強大的 docGPT 模型，請考慮以下技巧以改進性能
155 | 
156 | 1. **Language Model**
157 |    
158 |    使用適當的 LLM Model，會讓您事半功倍，例如您可以選擇使用 OpenAI 的 `gpt-3.5-turbo` (預設是 `text-davinci-003`):
159 | 
160 |    ```python
161 |    # ./docGPT/docGPT.py
162 |    llm = ChatOpenAI(
163 |     temperature=0.2,
164 |     max_tokens=2000,
165 |     model_name='gpt-3.5-turbo'
166 |    ) 
167 |    ```
168 | 
169 |    請注意，模型之間並沒有最好與最壞，您需要多試幾個模型，才會發現最適合自己案例的模型，更多 OpenAI model 請[參考](https://platform.openai.com/docs/models)
170 |    
171 |    (部分模型可以使用 16,000 tokens!)
172 | 
173 | 2. **PDF Loader**
174 | 
175 |     在 Python 中有許多解析 PDF 文字的 Loader，每個 Loader 各有優缺點，以下整理三個作者用過的
176 |     
177 |     ([Langchain官方介紹](https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf)):
178 | 
179 |     * `PyPDF`: 簡單易用
180 |     * `PyMuPDF`: 讀取文件**速度非常快速**，除了能解析文字，還能取得頁數、文檔日期...等 MetaData。
181 |     * `PDFPlumber`: 能夠解析出**表格內部文字**，使用方面與 `PyMuPDF` 相似，皆能取得 MetaData，但是解析時間較長。
182 | 
183 |     如果您的文件具有多個表格，且重要資訊存在表格中，建議您嘗試 `PDFPlumber`，它會給您意想不到的結果!
184 |     請不要忽略這個細節，因為沒有正確解析出文件中的文字，即使 LLM 模型再強大也無用! 
185 | 
186 | 3. **Tracking Token Usage**
187 | 
188 |     這個並不能讓模型強大，但是能讓您清楚知道 QA Chain 的過程中，您使用的 tokens、openai api key 的使用量。
189 | 
190 |     當您使用 `chain.run` 時，可以嘗試用 langchain 提供的 [方法](https://python.langchain.com/docs/modules/model_io/models/llms/how_to/token_usage_tracking):
191 | 
192 |     ```python
193 |     from langchain.callbacks import get_openai_callback
194 | 
195 |     with get_openai_callback() as callback:
196 |         response = self.qa_chain.run(query)
197 | 
198 |     print(callback)
199 | 
200 |     # Result of print
201 |     """
202 |     chain...
203 |     ...
204 |     > Finished chain.
205 |     Total Tokens: 1506
206 |     Prompt Tokens: 1350
207 |     Completion Tokens: 156
208 |     Total Cost (USD): $0.03012
209 |     """
210 |     ```
211 | 
212 | <a href="#top">Back to top</a>
213 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | os.chdir(os.path.dirname(os.path.abspath(__file__)))
 4 | os.environ['SERPAPI_API_KEY'] = ''
 5 | 
 6 | import streamlit as st
 7 | from streamlit import logger
 8 | from streamlit_chat import message
 9 | 
10 | from components import get_response, side_bar, theme, upload_and_process_document
11 | from docGPT import create_doc_gpt
12 | 
# Module-level placeholders; both key constants start out empty.
OPENAI_API_KEY, SERPAPI_API_KEY = '', ''
# Holds the object returned by create_doc_gpt once main() has built it.
model = None

app_logger = logger.get_logger(__name__)

# These session values are reset to None every time this module executes.
for _state_key in (
    'openai_api_key',
    'serpapi_api_key',
    'g4f_provider',
    'button_clicked',
):
    st.session_state[_state_key] = None

# Seed the chat history only when it is not already present in the session.
for _history_key, _first_entry in (
    ('response', ['How can I help you?']),
    ('query', ['Hi']),
):
    if _history_key not in st.session_state:
        st.session_state[_history_key] = _first_entry
30 | 
31 | 
def main():
    """Render the page: theme, sidebar, document upload, question box and chat log.

    When ``upload_and_process_document`` yields documents, the module-level
    ``model`` is rebuilt with ``create_doc_gpt``. A non-empty question is sent
    to ``get_response`` only if a model exists and
    ``st.session_state.button_clicked`` is falsy; the question and its answer
    are then appended to the session history, which is drawn newest first.
    """
    global model
    theme()
    side_bar()

    upload_area = st.container()
    with upload_area:
        docs = upload_and_process_document()

        if docs:
            # Metadata of the first chunk, minus the path-like entries.
            doc_metadata = {
                key: value
                for key, value in docs[0].metadata.items()
                if key not in ['source', 'file_path']
            }
            model = create_doc_gpt(
                docs,
                doc_metadata,
                st.session_state.g4f_provider
            )
            app_logger.info(f'{__file__}: Created model: {model}')
            del docs
        st.write('---')

    # Both containers are created up front so the question box is placed
    # above the chat log.
    question_area = st.container()
    chat_area = st.container()

    with question_area:
        query = st.text_input(
            "#### Question:",
            placeholder='Enter your question'
        )

        if model and query and not st.session_state.button_clicked:
            answer = get_response(query, model)
            st.session_state.query.append(query)
            st.session_state.response.append(answer)

    bot_logo = (
        'https://github.com/Lin-jun-xiang/docGPT-streamlit/'
        'blob/main/static/img/chatbot_v2.png?raw=true'
    )
    user_logo = (
        'https://api.dicebear.com/6.x/adventurer/svg?'
        'hair=short16&hairColor=85c2c6&'
        'eyes=variant12&size=100&'
        'mouth=variant26&skinColor=f2d3b1'
    )

    with chat_area:
        if st.session_state['response']:
            # Walk the history backwards so the latest exchange comes first.
            for idx in reversed(range(len(st.session_state['response']))):
                message(
                    st.session_state["response"][idx],
                    key=str(idx),
                    logo=bot_logo
                )
                message(
                    st.session_state['query'][idx],
                    is_user=True,
                    key=str(idx) + '_user',
                    logo=user_logo
                )
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     main()
87 | 


--------------------------------------------------------------------------------
/components/__init__.py:
--------------------------------------------------------------------------------
 1 | from .sidebar import side_bar
 2 | from .document_processor import upload_and_process_document
 3 | from .response_handler import get_response
 4 | from .theme import theme
 5 | 
 6 | __all__ = [
 7 |     'get_response',
 8 |     'side_bar',
 9 |     'theme',
10 |     'upload_and_process_document'
11 | ]
12 | 


--------------------------------------------------------------------------------
/components/document_processor.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import tempfile
 3 | 
 4 | import streamlit as st
 5 | 
 6 | from model import DocumentLoader
 7 | 
 8 | 
 9 | def upload_and_process_document() -> list:
10 |     st.write('#### Upload a Document file')
11 |     browse, url_link = st.tabs(
12 |         ['Drag and drop file (Browse files)', 'Enter document URL link']
13 |     )
14 |     with browse:
15 |         upload_file = st.file_uploader(
16 |             'Browse file (.pdf, .docx, .csv, `.txt`)',
17 |             type=['pdf', 'docx', 'csv', 'txt'],
18 |             label_visibility='hidden'
19 |         )
20 |         filetype = os.path.splitext(upload_file.name)[1].lower() if upload_file else None
21 |         upload_file = upload_file.read() if upload_file else None
22 | 
23 |     with url_link:
24 |         doc_url = st.text_input(
25 |             "Enter document URL Link (.pdf, .docx, .csv, .txt)",
26 |             placeholder='https://www.xxx/uploads/file.pdf',
27 |             label_visibility='hidden'
28 |         )
29 |         if doc_url:
30 |             upload_file, filetype = DocumentLoader.crawl_file(doc_url)
31 | 
32 |     if upload_file and filetype:
33 |         temp_file = tempfile.NamedTemporaryFile(delete=False)
34 |         temp_file.write(upload_file)
35 |         temp_file_path = temp_file.name
36 | 
37 |         docs = DocumentLoader.load_documents(temp_file_path, filetype)
38 |         docs = DocumentLoader.split_documents(
39 |             docs, chunk_size=2000,
40 |             chunk_overlap=200
41 |         )
42 | 
43 |         temp_file.close()
44 |         if temp_file_path:
45 |             os.remove(temp_file_path)
46 | 
47 |         return docs
48 | 


--------------------------------------------------------------------------------
/components/response_handler.py:
--------------------------------------------------------------------------------
 1 | from streamlit import logger
 2 | 
 3 | app_logger = logger.get_logger(__name__)
 4 | 
 5 | def get_response(query: str, model) -> str:
 6 |     app_logger.info(f'\033[36mUser Query: {query}\033[0m')
 7 |     try:
 8 |         if model is not None and query:
 9 |             response = model.run(query)
10 |             app_logger.info(f'\033[36mLLM Response: {response}\033[0m')
11 |             return response
12 |         return (
13 |             'Your model still not created.\n'
14 |             '1. If you are using gpt4free model, '
15 |             'try to re-select a provider. '
16 |             '(Click the "Show Available Providers" button in sidebar)\n'
17 |             '2. If you are using openai model, '
18 |             'try to re-pass openai api key.\n'
19 |             '3. Or you did not pass the file successfully.\n'
20 |             '4. Try to Refresh the page (F5).'
21 |         )
22 |     except Exception as e:
23 |         app_logger.info(f'{__file__}: {e}')
24 |         return (
25 |             'Something wrong in docGPT...\n'
26 |             '1. If you are using gpt4free model, '
27 |             'try to select the different provider. '
28 |             '(Click the "Show Available Providers" button in sidebar)\n'
29 |             '2. If you are using openai model, '
30 |             'check your usage for openai api key.\n'
31 |             '3. Try to Refresh the page (F5).'
32 |         )
33 | 


--------------------------------------------------------------------------------
/components/sidebar.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import os
 3 | 
 4 | import streamlit as st
 5 | 
 6 | from docGPT import GPT4Free
 7 | 
 8 | 
 9 | def side_bar() -> None:
10 |     with st.sidebar:
11 |         with st.expander(':orange[How to use?]'):
12 |             st.markdown(
13 |                 """
14 |                 1. Enter your API keys: (You can use the `gpt4free` free model **without API keys**)
15 |                     * `OpenAI API Key`: Make sure you still have usage left
16 |                     * `SERPAPI API Key`: Optional. If you want to ask questions about content not appearing in the PDF document, you need this key.
17 |                 2. **Upload a Document** file (choose one method):
18 |                     * method1: Browse and upload your own document file from your local machine.
19 |                     * method2: Enter the document URL link directly.
20 |                     
21 |                     (**support documents**: `.pdf`, `.docx`, `.csv`, `.txt`)
22 |                 3. Start asking questions!
23 |                 4. More details.(https://github.com/Lin-jun-xiang/docGPT-streamlit)
24 |                 5. If you have any questions, feel free to leave comments and engage in discussions.(https://github.com/Lin-jun-xiang/docGPT-streamlit/issues)
25 |                 """
26 |             )
27 | 
28 |     with st.sidebar:
29 |         if st.session_state.openai_api_key:
30 |             OPENAI_API_KEY = st.session_state.openai_api_key
31 |             st.sidebar.success('API key loaded form previous input')
32 |         else:
33 |             OPENAI_API_KEY = st.sidebar.text_input(
34 |                 label='#### Your OpenAI API Key 👇',
35 |                 placeholder="sk-...",
36 |                 type="password",
37 |                 key='OPENAI_API_KEY'
38 |             )
39 |             st.session_state.openai_api_key = OPENAI_API_KEY
40 | 
41 |         os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
42 | 
43 |     with st.sidebar:
44 |         if st.session_state.serpapi_api_key:
45 |             SERPAPI_API_KEY = st.session_state.serpapi_api_key
46 |             st.sidebar.success('API key loaded form previous input')
47 |         else:
48 |             SERPAPI_API_KEY = st.sidebar.text_input(
49 |                 label='#### Your SERPAPI API Key 👇',
50 |                 placeholder="...",
51 |                 type="password",
52 |                 key='SERPAPI_API_KEY'
53 |             )
54 |             st.session_state.serpapi_api_key = SERPAPI_API_KEY
55 | 
56 |         os.environ['SERPAPI_API_KEY'] = SERPAPI_API_KEY
57 | 
58 |     with st.sidebar:
59 |         gpt4free = GPT4Free()
60 |         st.session_state.g4f_provider = st.selectbox(
61 |             (
62 |                 "#### Select a provider if you want to use free model. "
63 |                 "([details](https://github.com/xtekky/gpt4free#models))"
64 |             ),
65 |             (['BestProvider'] + list(gpt4free.providers_table.keys()))
66 |         )
67 | 
68 |         st.session_state.button_clicked = st.button(
69 |             'Show Available Providers',
70 |             help='Click to test which providers are currently available.',
71 |             type='primary'
72 |         )
73 |         if st.session_state.button_clicked:
74 |             available_providers = asyncio.run(gpt4free.show_available_providers())
75 |             st.session_state.query.append('What are the available providers right now?')
76 |             st.session_state.response.append(
77 |                 'The current available providers are:\n'
78 |                 f'{available_providers}'
79 |             )
80 | 


--------------------------------------------------------------------------------
/components/theme.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | 
def theme() -> None:
    """Configure the page title and show the chatbot logo."""
    logo_path = './static/img/chatbot_v2.png'
    logo_width = 150

    st.set_page_config(page_title="Document GPT")
    st.image(logo_path, width=logo_width)
7 | 


--------------------------------------------------------------------------------
/docGPT/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import openai
 4 | import streamlit as st
 5 | from langchain.chat_models import ChatOpenAI
 6 | from streamlit import logger
 7 | 
 8 | from .agent import AgentHelper
 9 | from .check_api_key import OpenAiAPI, SerpAPI
10 | from .docGPT import DocGPT, GPT4Free
11 | 
# Seed the openai client with the key from the environment (None if unset).
openai.api_key = os.getenv('OPENAI_API_KEY')
# os.environ only accepts strings: assigning the None that os.getenv returns
# for an unset variable raises TypeError at import time, so only mirror the
# value back when it is actually present.
if os.getenv('SERPAPI_API_KEY') is not None:
    os.environ['SERPAPI_API_KEY'] = os.getenv('SERPAPI_API_KEY')
# Logger for this package, obtained through Streamlit's logger helper.
module_logger = logger.get_logger(__name__)
15 | 
16 | 
@st.cache_resource(ttl=1200, max_entries=3)
def create_doc_gpt(
    _docs: list,
    doc_metadata: str,
    g4f_provider: str
) -> DocGPT:
    """Build the question-answering model for the given documents.

    The result is cached by ``st.cache_resource`` (ttl 1200, at most 3
    entries).

    Selection logic:
      * OpenAI key valid and SerpAPI key valid: an ``AgentHelper`` initialised
        with the document, search and calculator tools.
      * OpenAI key valid only: the ``DocGPT`` instance backed by ChatOpenAI.
      * otherwise: the ``DocGPT`` instance backed by ``GPT4Free`` using
        ``g4f_provider``.

    Any exception raised while building is printed and logged, and the
    function then returns ``None``.
    """
    docGPT = DocGPT(docs=_docs)

    try:
        if not OpenAiAPI.is_valid():
            # No usable OpenAI key: gpt4free model, no agent.
            docGPT.llm = GPT4Free(provider=g4f_provider)
            docGPT.create_qa_chain(chain_type='refine', verbose=False)
            module_logger.info('\033[43mUsing Gpt4free model...\033[0m')
            return docGPT

        # OpenAI model, optionally wrapped in an agent.
        helper = AgentHelper()
        chat_llm = ChatOpenAI(
            temperature=0.2,
            max_tokens=6000,
            model_name='gpt-3.5-turbo-16k'
        )
        docGPT.llm = chat_llm
        helper.llm = chat_llm

        docGPT.create_qa_chain(chain_type='refine', verbose=False)
        document_tool = helper.create_doc_chat(docGPT)
        math_tool = helper.get_calculate_chain

        module_logger.info('\033[43mUsing OpenAI model...\033[0m')

        if not SerpAPI.is_valid():
            return docGPT

        web_search_tool = helper.get_searp_chain
        # A plain LLM tool is deliberately left out: it confuses the agent.
        helper.initialize([document_tool, web_search_tool, math_tool])
        return helper

    except Exception as err:
        print(err)
        module_logger.info(f'{__file__}: {err}')
70 | 


--------------------------------------------------------------------------------
/docGPT/agent.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from typing import Optional
  3 | 
  4 | import openai
  5 | from langchain.agents import AgentType, Tool, initialize_agent
  6 | from langchain.callbacks import get_openai_callback
  7 | from langchain.chains import LLMChain
  8 | from langchain.prompts import PromptTemplate
  9 | 
 10 | openai.api_key = os.getenv('OPENAI_API_KEY')
 11 | os.environ['SERPAPI_API_KEY'] = os.getenv('SERPAPI_API_KEY')
 12 | 
 13 | 
 14 | class AgentHelper:
 15 |     """Add agent to help docGPT can be perfonm better."""
 16 |     def __init__(self) -> None:
 17 |         self._llm = None
 18 |         self.agent_ = None
 19 |         self.tools = []
 20 | 
 21 |     @property
 22 |     def llm(self):
 23 |         return self._llm
 24 | 
 25 |     @llm.setter
 26 |     def llm(self, llm) -> None:
 27 |         self._llm = llm
 28 | 
 29 |     @property
 30 |     def get_calculate_chain(self) -> Tool:
 31 |         from langchain import LLMMathChain
 32 | 
 33 |         llm_math_chain = LLMMathChain.from_llm(llm=self.llm, verbose=True)
 34 |         tool = Tool(
 35 |             name='Calculator',
 36 |             func=llm_math_chain.run,
 37 |             description='useful for when you need to answer questions about math'
 38 |         )
 39 |         return tool
 40 | 
 41 |     @property
 42 |     def get_searp_chain(self) -> Tool:
 43 |         from langchain import SerpAPIWrapper
 44 | 
 45 |         search = SerpAPIWrapper()
 46 |         tool = Tool(
 47 |             name='Search',
 48 |             func=search.run,
 49 |             description='useful for when you need to answer questions about current events'
 50 |         )
 51 |         return tool
 52 | 
 53 |     def create_doc_chat(self, docGPT) -> Tool:
 54 |         """Add a custom docGPT tool"""
 55 |         tool = Tool(
 56 |             name='DocumentGPT',
 57 |             func=docGPT.run,
 58 |             description="""
 59 |             useful for when you need to answer questions from the context of PDF
 60 |             """
 61 |         )
 62 |         return tool
 63 | 
 64 |     def create_llm_chain(self) -> Tool:
 65 |         """Add a llm tool"""
 66 |         prompt = PromptTemplate(
 67 |             input_variables = ['query'],
 68 |             template = '{query}'
 69 |         )
 70 |         llm_chain = LLMChain(llm=self.llm, prompt=prompt)
 71 | 
 72 |         tool = Tool(
 73 |             name='LLM',
 74 |             func=llm_chain.run,
 75 |             description='useful for general purpose queries and logic.'
 76 |         )
 77 |         return tool
 78 | 
 79 |     def initialize(self, tools):
 80 |         for tool in tools:
 81 |             if isinstance(tool, Tool):
 82 |                 self.tools.append(tool)
 83 | 
 84 |         self.agent_ = initialize_agent(
 85 |             self.tools,
 86 |             self.llm,
 87 |             agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
 88 |             verbose=True
 89 |         )
 90 | 
 91 |     def run(self, query: str) -> Optional[str]:
 92 |         response = None
 93 |         with get_openai_callback() as callback:
 94 |             try:
 95 |                 response = self.agent_.run(query)
 96 |             except ValueError as e:
 97 |                 response = 'Something wrong in agent: ' + str(e)
 98 |                 if not response.startswith("Could not parse LLM output: `"):
 99 |                     raise e
100 | 
101 |             print(callback)
102 |         return response
103 | 


--------------------------------------------------------------------------------
/docGPT/check_api_key.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from abc import ABC, abstractmethod
 3 | 
 4 | import openai
 5 | import streamlit as st
 6 | 
 7 | 
 8 | class ApiKey(ABC):
 9 |     """Check the Api key is valid or not"""
10 |     query = 'This is a test.'
11 | 
12 |     @classmethod
13 |     @abstractmethod
14 |     def is_valid(cls):
15 |         pass
16 | 
17 | 
class OpenAiAPI(ApiKey):
    """Validator for the OpenAI API key held in the Streamlit session."""

    @classmethod
    def is_valid(cls) -> str:
        """Probe the OpenAI completion endpoint with a tiny request.

        Returns the completion response when the call succeeds. Returns
        ``None`` (after showing ``st.error``) when no key was entered or the
        request raises.
        """
        if st.session_state['openai_api_key']:
            openai.api_key = os.getenv('OPENAI_API_KEY')
            try:
                return openai.Completion.create(
                    engine='davinci',
                    prompt=cls.query,
                    max_tokens=5
                )
            except Exception as err:
                st.error(
                    '🚨 :red[Your OpenAI API key has a problem.] '
                    '[Check your usage](https://platform.openai.com/account/usage)'
                )
                print(f'Test error\n{err}')
                return None
        else:
            st.error('⚠️ :red[You have not pass OpenAI API key.] Use default model')
            return None
39 | 
40 | 
class SerpAPI(ApiKey):
    """Validator for the SerpAPI key held in the Streamlit session."""

    @classmethod
    def is_valid(cls) -> str:
        """Probe SerpAPI by running the class-level test query.

        Returns the search result when the call succeeds. Returns ``None``
        when no key was entered (after ``st.warning``) or when the search
        raises (after ``st.error``).
        """
        has_key = bool(st.session_state['serpapi_api_key'])
        if not has_key:
            st.warning('⚠️ You have not pass SerpAPI key. (You cannot ask current events.)')
            return None

        # Imported lazily, only once a key is known to be present.
        from langchain import SerpAPIWrapper

        os.environ['SERPAPI_API_KEY'] = os.getenv('SERPAPI_API_KEY')
        try:
            return SerpAPIWrapper().run(cls.query)
        except Exception as err:
            st.error(
                '🚨 :red[Your SerpAPI key has a problem.] '
                '[Check your usage](https://serpapi.com/dashboard)'
            )
            print(f'Test error\n{err}')
            return None
60 | 


--------------------------------------------------------------------------------
/docGPT/docGPT.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import os
  3 | from abc import ABC, abstractmethod
  4 | from typing import List, Optional
  5 | 
  6 | import g4f
  7 | import openai
  8 | from langchain.callbacks import get_openai_callback
  9 | from langchain.callbacks.manager import CallbackManagerForLLMRun
 10 | from langchain.chains import RetrievalQA
 11 | from langchain.embeddings import HuggingFaceEmbeddings
 12 | from langchain.embeddings.openai import OpenAIEmbeddings
 13 | from langchain.llms.base import LLM
 14 | from langchain.prompts import PromptTemplate
 15 | from langchain.vectorstores import FAISS
 16 | from streamlit import logger
 17 | 
# Seed the openai client with the key from the environment (None if unset).
openai.api_key = os.getenv('OPENAI_API_KEY')
# Logger for this module, obtained through Streamlit's logger helper.
module_logger = logger.get_logger(__name__)
 20 | 
 21 | 
 22 | class BaseQaChain(ABC):
 23 |     def __init__(
 24 |         self,
 25 |         chain_type: str,
 26 |         retriever,
 27 |         llm
 28 |     ) -> None:
 29 |         self.chain_type = chain_type
 30 |         self.retriever = retriever
 31 |         self.llm = llm
 32 | 
 33 |     @abstractmethod
 34 |     def create_qa_chain(self):
 35 |         pass
 36 | 
 37 | 
 38 | class RChain(BaseQaChain):
 39 |     def __init__(
 40 |         self,
 41 |         chain_type: str,
 42 |         retriever,
 43 |         llm,
 44 |         chain_type_kwargs: dict
 45 |     ) -> None:
 46 |         super().__init__(chain_type, retriever, llm)
 47 |         self.chain_type_kwargs = chain_type_kwargs
 48 | 
 49 |     @property
 50 |     def create_qa_chain(self) -> RetrievalQA:
 51 |         qa_chain = RetrievalQA.from_chain_type(
 52 |             llm=self.llm,
 53 |             chain_type=self.chain_type,
 54 |             retriever=self.retriever,
 55 |             chain_type_kwargs=self.chain_type_kwargs
 56 |         )
 57 |         return qa_chain
 58 | 
 59 | 
 60 | class CRChain(BaseQaChain):
 61 |     def __init__(
 62 |         self,
 63 |         chain_type: str,
 64 |         retriever,
 65 |         llm,
 66 |     ) -> None:
 67 |         super().__init__(chain_type, retriever, llm)
 68 | 
 69 |     @property
 70 |     def create_qa_chain(self):
 71 |         # TODO: cannot use conversation qa chain
 72 |         from langchain.chains import ConversationalRetrievalChain
 73 |         from langchain.memory import ConversationBufferMemory
 74 | 
 75 |         memory = ConversationBufferMemory(
 76 |             memory_key='chat_history',
 77 |             return_messages=True
 78 |         )
 79 |         qa_chain = ConversationalRetrievalChain.from_llm(
 80 |             llm=self.llm,
 81 |             chain_type=self.chain_type,
 82 |             retriever=self.retriever,
 83 |             memory=memory
 84 |         )
 85 |         return qa_chain    
 86 | 
 87 | 
 88 | class DocGPT:
 89 |     def __init__(self, docs):
 90 |         self.docs = docs
 91 |         self.qa_chain = None
 92 |         self._llm = None
 93 | 
 94 |         self.prompt_template = (
 95 |             "Only answer what is asked. Answer step-by-step.\n"
 96 |             "If the content has sections, please summarize them "
 97 |             "in order and present them in a bulleted format.\n"
 98 |             "Utilize line breaks for better readability.\n"
 99 |             "For example, sequentially summarize the "
100 |             "introduction, methods, results, and so on.\n"
101 |             "Please use Python's newline symbols appropriately to "
102 |             "enhance the readability of the response, "
103 |             "but don't use two newline symbols consecutive.\n\n"
104 |             "{context}\n\n"
105 |             "Question: {question}\n"
106 |         )
107 |         self.prompt = PromptTemplate(
108 |             template=self.prompt_template,
109 |             input_variables=['context', 'question']
110 |         )
111 | 
112 |         self.refine_prompt_template = (
113 |             "The original question is as follows: {question}\n"
114 |             "We have provided an existing answer: {existing_answer}\n"
115 |             "We have the opportunity to refine the existing answer"
116 |             "(only if needed) with some more context below.\n"
117 |             "------------\n"
118 |             "{context_str}\n"
119 |             "------------\n"
120 |             "Given the new context, refine the original answer to better "
121 |             "answer the question. "
122 |             "If the context isn't useful, return the original answer.\n"
123 |             "Please use Python's newline symbols "
124 |             "appropriately to enhance the readability of the response, "
125 |             "but don't use two newline symbols consecutive.\n"
126 |         )
127 |         self.refine_prompt = PromptTemplate(
128 |             template=self.refine_prompt_template,
129 |             input_variables=['question', 'existing_answer', 'context_str']
130 |         )
131 | 
132 |     @property
133 |     def llm(self):
134 |         return self._llm
135 | 
136 |     @llm.setter
137 |     def llm(self, llm) -> None:
138 |         self._llm = llm
139 | 
140 |     def _helper_prompt(self, chain_type: str) -> None:
141 |         # TODO: Bug helper
142 |         if chain_type == 'refine':
143 |             self.prompt_template = self.prompt_template.replace(
144 |                 '{context}', '{context_str}'
145 |             )
146 |             self.prompt.template = self.prompt_template
147 |             for i in range(len(self.prompt.input_variables)):
148 |                 if self.prompt.input_variables[i] == 'context':
149 |                     self.prompt.input_variables[i] = 'context_str'
150 | 
151 |     def _embeddings(self):
152 |         try:
153 |             # If have openai api
154 |             embeddings = OpenAIEmbeddings()
155 |         except:
156 |             embeddings = HuggingFaceEmbeddings(
157 |                 model_name=(
158 |                     'sentence-transformers/'
159 |                     'multi-qa-MiniLM-L6-cos-v1'
160 |                 )
161 |             )
162 | 
163 |         db = FAISS.from_documents(
164 |             documents=self.docs,
165 |             embedding=embeddings
166 |         )
167 |         module_logger.info('embedded...')
168 |         return db
169 | 
170 |     def create_qa_chain(
171 |         self,
172 |         chain_type: str ='stuff',
173 |         verbose: bool = True
174 |     ) -> BaseQaChain:
175 |         # TODO: Bug helper
176 |         self._helper_prompt(chain_type)
177 |         chain_type_kwargs = {
178 |             'question_prompt': self.prompt,
179 |             'verbose': verbose,
180 |             'refine_prompt': self.refine_prompt
181 |         }
182 | 
183 |         db = self._embeddings()
184 |         retriever = db.as_retriever()
185 | 
186 |         self.qa_chain = RChain(
187 |             chain_type=chain_type,
188 |             retriever=retriever,
189 |             llm=self._llm,
190 |             chain_type_kwargs=chain_type_kwargs
191 |         ).create_qa_chain
192 | 
193 |     def run(self, query: str) -> str:
194 |         response = 'Nothing...'
195 |         with get_openai_callback() as callback:
196 |             if isinstance(self.qa_chain, RetrievalQA):
197 |                 response = self.qa_chain.run(query)
198 |             module_logger.info(callback)
199 |         return response
200 | 
201 | 
class GPT4Free(LLM):
    """LangChain ``LLM`` wrapper that sends prompts through ``g4f``.

    ``providers_table`` maps ``'g4f.Provider.<name>'`` strings to the
    provider objects listed in ``g4f.Provider.__all__``. ``provider`` holds
    one of those strings; a value that is not in the table resolves to
    ``None`` and is passed to ``g4f`` as such.
    """

    providers_table = {
        f'g4f.Provider.{provider}': getattr(g4f.Provider, provider)
        for provider in g4f.Provider.__all__
    }
    provider: str = 'g4f.Provider.DeepAi'

    @property
    def _llm_type(self) -> str:
        return 'gpt4free model'

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        """Send ``prompt`` as a single user message and return the completion.

        ``stop`` and ``run_manager`` are accepted but not used here.

        Raises:
            Exception: whatever the provider call raises, after logging it.
                Swallowing the error would hand ``None`` back to the caller
                in place of the promised string.
        """
        try:
            # print(f'\033[36mPromopt: {prompt}\033[0m')
            provider = self.providers_table.get(self.provider, None)
            module_logger.info(
                f'\033[36mProvider: {provider}\033[0m'
            )
            return g4f.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                provider=provider,
                ignored=["ChatBase"]
            )
        except Exception as e:
            module_logger.error(f'{__file__}: call gpt4free error - {e}')
            raise

    async def _test_provider(self, provider: g4f.Provider) -> str:
        """Return the provider's name if a test request succeeds, else ``None``.

        Failures are printed and deliberately not raised, so one broken
        provider does not abort the whole availability check.
        """
        provider_name = provider.__name__
        try:
            await g4f.ChatCompletion.create_async(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": 'Hi, this is test'}],
                provider=provider,
                ignored=["ChatBase"]
            )
            return provider_name
        except Exception as e:
            print(f'{provider_name}: {e}')
            return None

    async def show_available_providers(self) -> list:
        """Test all the providers then find out which are available"""
        tasks = [
            self._test_provider(provider)
            for provider in self.providers_table.values()
        ]
        results = await asyncio.gather(*tasks)

        # Failed probes come back as None; keep only the names.
        return [name for name in results if name is not None]
259 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3'
 2 | 
 3 | services:
 4 |   docgpt:
 5 |     build:
 6 |       context: .
 7 |       dockerfile: Dockerfile
 8 |     ports:
 9 |       - '8501:8501'
10 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_connection import (
2 |     DocumentLoader
3 | )
4 | 


--------------------------------------------------------------------------------
/model/data_connection.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from typing import Iterator, Union
 3 | 
 4 | import requests
 5 | import streamlit as st
 6 | from langchain.document_loaders import (
 7 |     CSVLoader,
 8 |     Docx2txtLoader,
 9 |     PyMuPDFLoader,
10 |     TextLoader,
11 | )
12 | from langchain.text_splitter import RecursiveCharacterTextSplitter
13 | 
14 | 
class DocumentLoader:
    """Static helpers to find, download, load and split documents."""

    @staticmethod
    def get_files(path: str, filetype: str = '.pdf') -> Iterator[str]:
        """Yield the names of files in ``path`` that end with ``filetype``.

        A missing directory is reported with ``print`` and yields nothing.
        """
        try:
            for file_name in os.listdir(f'{path}'):
                if file_name.endswith(filetype):
                    yield file_name
        except FileNotFoundError as e:
            print(f'\033[31m{e}')

    @staticmethod
    def load_documents(
        file: str,
        filetype: str = '.pdf'
    ) -> list:
        """Load ``file`` with the loader matching ``filetype``.

        Supported types: ``.pdf``, ``.docx``, ``.csv``, ``.txt`` (compared
        case-insensitively).

        Returns:
            The result of the loader's ``load()``; an empty list when the
            type is unsupported or loading raises (the error is printed).
        """
        normalized = (filetype or '').lower()
        try:
            if normalized == '.pdf':
                loader = PyMuPDFLoader(file)
            elif normalized == '.docx':
                loader = Docx2txtLoader(file)
            elif normalized == '.csv':
                loader = CSVLoader(file, encoding='utf-8')
            elif normalized == '.txt':
                loader = TextLoader(file, encoding='utf-8')
            else:
                # Say what is wrong instead of tripping over an unbound name.
                print(f'\033[31mUnsupported file type: {filetype}')
                return []

            return loader.load()

        except Exception as e:
            print(f'\033[31m{e}')
            return []

    @staticmethod
    def split_documents(
        document: list,
        chunk_size: int=2000,
        chunk_overlap: int=0
    ) -> list:
        """Split loaded documents with ``RecursiveCharacterTextSplitter``.

        Args:
            document: The value returned by ``load_documents``.
            chunk_size: Forwarded to the splitter.
            chunk_overlap: Forwarded to the splitter.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        return splitter.split_documents(document)

    @staticmethod
    def crawl_file(url: str) -> tuple:
        """Download a supported document from ``url``.

        The file type is taken from the extension of the URL path; if that is
        not a supported type, the extension of the raw URL is searched for a
        supported one (covers links such as ``...?file=a.csv``). Matching is
        case-insensitive and the returned type is always the bare lowercase
        extension, so it can be passed straight to ``load_documents``.

        Returns:
            ``(content_bytes, filetype)`` on success, ``(None, None)`` after
            showing a Streamlit warning when the type is unsupported, the
            request fails, or the status code is not 200.
        """
        from urllib.parse import urlparse

        supported = ('.pdf', '.docx', '.csv', '.txt')
        try:
            path_ext = os.path.splitext(urlparse(url).path)[1].lower()
            if path_ext in supported:
                filetype = path_ext
            else:
                raw_ext = os.path.splitext(url)[1].lower()
                filetype = next(
                    (ext for ext in supported if ext in raw_ext), None
                )

            if filetype is not None:
                # A timeout keeps an unresponsive host from hanging the app.
                response = requests.get(url, timeout=30)
                if response.status_code == 200:
                    return response.content, filetype
        except Exception as e:
            print(f'\033[31m{e}')

        st.warning('Url cannot parse correctly.')
        return None, None
74 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | g4f
 2 | langchain==0.0.218
 3 | openai==0.27.8
 4 | streamlit==1.26.0
 5 | streamlit_chat==0.1.1
 6 | pymupdf==1.22.5
 7 | faiss-cpu==1.7.4
 8 | tiktoken==0.4.0
 9 | tenacity==8.1.0
10 | google-search-results==2.4.2
11 | sentence_transformers
12 | requests
13 | httpx
14 | docx2txt
15 | 


--------------------------------------------------------------------------------
/static/img/2023-07-03-22-38-08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/2023-07-03-22-38-08.png


--------------------------------------------------------------------------------
/static/img/2023-08-24-15-02-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/2023-08-24-15-02-11.png


--------------------------------------------------------------------------------
/static/img/2023-08-29-13-39-00.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/2023-08-29-13-39-00.png


--------------------------------------------------------------------------------
/static/img/2023-09-06-14-56-20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/2023-09-06-14-56-20.png


--------------------------------------------------------------------------------
/static/img/chatbot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/chatbot.png


--------------------------------------------------------------------------------
/static/img/chatbot_v2.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/chatbot_v2.1.png


--------------------------------------------------------------------------------
/static/img/chatbot_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/chatbot_v2.png


--------------------------------------------------------------------------------
/static/img/docGPT.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/docGPT.gif


--------------------------------------------------------------------------------
/static/img/repos_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/repos_logo.png


--------------------------------------------------------------------------------
/static/img/repos_logo_v1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/repos_logo_v1.png


--------------------------------------------------------------------------------