├── .github └── ISSUE_TEMPLATE │ └── default_issue.yml ├── .gitignore ├── .streamlit └── config.toml ├── Dockerfile ├── LICENSE ├── README.md ├── README.zh-TW.md ├── app.py ├── components ├── __init__.py ├── document_processor.py ├── response_handler.py ├── sidebar.py └── theme.py ├── docGPT ├── __init__.py ├── agent.py ├── check_api_key.py └── docGPT.py ├── docker-compose.yml ├── model ├── __init__.py └── data_connection.py ├── requirements.txt └── static └── img ├── 2023-07-03-22-38-08.png ├── 2023-08-24-15-02-11.png ├── 2023-08-29-13-39-00.png ├── 2023-09-06-14-56-20.png ├── chatbot.png ├── chatbot_v2.1.png ├── chatbot_v2.png ├── docGPT.gif ├── repos_logo.png └── repos_logo_v1.png /.github/ISSUE_TEMPLATE/default_issue.yml: -------------------------------------------------------------------------------- 1 | name: Default Issue 2 | description: Raise an issue that wouldn't be covered by the other templates. 3 | title: "Issue: " 4 | labels: [Default Issue Template] 5 | 6 | body: 7 | - type: textarea 8 | attributes: 9 | label: "Issue you'd like to raise." 10 | description: > 11 | Please describe the issue you'd like to raise as clearly as possible. 12 | Make sure to include any relevant links or references. 13 | 14 | - type: textarea 15 | attributes: 16 | label: "Suggestion:" 17 | description: > 18 | Please outline a suggestion to improve the issue here. 
19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .chroma/ 2 | data/ 3 | External_Data_Pipeline/ 4 | PDF/Omren 5 | config.py 6 | main.py 7 | note.md 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | cover/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | .pybuilder/ 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | # For a library or package, you might want to ignore these files since the code is 95 | # intended to run in multiple environments; otherwise, check them in: 96 | # .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock 
in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | Pipfile.lock 104 | Pipfile 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/#use-with-ide 119 | .pdm.toml 120 | 121 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 169 | #.idea/ 170 | -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [server] 2 | enableStaticServing = true -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | # Set the working directory in the container. 4 | WORKDIR /app 5 | 6 | # Copy the project's requirements file into the container 7 | COPY requirements.txt ./requirements.txt 8 | # Upgrade pip for the latest features and install the project's Python dependencies. 9 | RUN pip install --upgrade pip && pip install -r requirements.txt 10 | 11 | # Copy the entire project into the container. 
12 | # This may include all code, assets, and configuration files required to run the application. 13 | COPY . /app 14 | 15 | # Expose port 8501 16 | EXPOSE 8501 17 | 18 | # Define the default command: launch the app with the Streamlit CLI. 19 | CMD ["streamlit", "run", "app.py"] 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 JunXiang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Chatbot Image 3 |

4 | 5 | 6 | [English](./README.md) | [中文版](./README.zh-TW.md) 7 | 8 | Free `docGPT` allows you to chat with your documents (`.pdf`, `.docx`, `.csv`, `.txt`), without the need for any keys or fees. 9 | 10 | Additionally, you can deploy the app anywhere by following this document. 11 | 12 | - Table of Contents 13 | - [Introduction](#introduction) 14 | - [Features](#🧨features) 15 | - [What's LangChain?](#whats-langchain) 16 | - [How to Use docGPT?](#how-to-use-docgpt) 17 | - [How to Develop a docGPT with Streamlit?](#how-to-develop-a-docgpt-with-streamlit) 18 | - [Advanced - How to build a better model in langchain](#advanced---how-to-build-a-better-model-in-langchain) 19 | 20 | * Main Development Software and Packages: 21 | * `Python 3.10.11` 22 | * `Langchain 0.0.218` 23 | * `Streamlit 1.22.0` 24 | * [more](./requirements.txt) 25 | 26 | If you like this project, please give it a ⭐`Star` to support the developers~ 27 | 28 | ### 📚Introduction 29 | 30 | * Upload a Document from your local device (`.pdf`, `.docx`, `.csv`, `.txt`) and query `docGPT` about the content of the Document. For example, you can ask GPT to summarize an article. 31 | 32 | * Two models are provided: 33 | * `gpt4free` 34 | * **Completely free, allowing users to use the application without the need for API keys or payments.** 35 | * Select the `Provider`. For more details about `gpt4free`, please refer to the [source project](https://github.com/xtekky/gpt4free). 36 | * `openai` 37 | * **Requires an `openai_api_key`, which you can obtain from [this link](https://platform.openai.com/).** 38 | * If you have a `serpapi_key`, AI responses can include Google search results. 39 | 40 |

41 | 42 |

43 | 44 | --- 45 | 46 | ### 🧨Features 47 | 48 | - **`gpt4free` Integration**: Everyone can use `docGPT` for **free** without needing an OpenAI API key. 49 | - **Support docx, pdf, csv, txt file**: Users can upload PDF, Word, CSV, txt file. 50 | - **Direct Document URL Input**: Users can input Document `URL` links for parsing without uploading document files(see the demo). 51 | - **Langchain Agent**: Enables AI to answer current questions and achieve Google search-like functionality. 52 | - **User-Friendly Environment**: Easy-to-use interface for simple operations. 53 | 54 | --- 55 | 56 | ### 🦜️What's LangChain? 57 | 58 | * LangChain is a framework for developing applications powered by language models. It supports the following applications: 59 | 1. Connecting LLM models with external data sources. 60 | 2. Interactive communication with LLM models. 61 | 62 | * For more details about LangChain, refer to the [official documentation](https://github.com/hwchase17/langchain). 63 | 64 | **For questions that ChatGPT can't answer, turn to LangChain!** 65 | 66 | LangChain fills in the gaps left by ChatGPT. Through the following example, you can understand the power of LangChain: 67 | 68 | > In cases where ChatGPT can't solve mathematical problems or answer questions about events after 2020 (e.g., "Who is the president in 2023?"): 69 | > 70 | > * For mathematical problems: There's a math-LLM model dedicated to handling math queries. 71 | > * For modern topics: You can use Google search. 72 | > 73 | > To create a comprehensive AI model, we need to combine "ChatGPT," "math-LLM," and "Google search" tools. 74 | > 75 | > In the non-AI era, we used `if...else...` to categorize user queries and had users select the question type through UI. 76 | > 77 | > In the AI era, users should be able to directly ask questions without preselecting the question type. With LangChain's agent: 78 | > * We provide tools to the agent, e.g., `tools = ['chatgpt', 'math-llm', 'google-search']`. 
79 | > * Tools can include chains designed using LangChain, such as using a retrievalQA chain to answer questions from documents. 80 | > * **The agent automatically decides which tool to use based on user queries** (fully automated). 81 | 82 | Through LangChain, you can create a universal AI model or tailor it for business applications. 83 | 84 | 85 | --- 86 | 87 | ### 🚩How to Use docGPT? 88 | 89 | 1. 🎬Visit the [application](https://docgpt-app.streamlit.app/). 90 | 91 | 2. 🔑Enter your `API_KEY` (optional in Version 3, as you can use the `gpt4free` free model): 92 | - `OpenAI API KEY`: Ensure you still have usage quota left. 93 | - `SERPAPI API KEY`: Required if you want to query content not present in the Document. 94 | 95 | 3. 📁Upload a Document file (choose one method) 96 | * Method 1: Browse and upload your own `.pdf`, `.docx`, `.csv`, `.txt` file from your local machine. 97 | * Method 2: Enter the Document `URL` link directly. 98 | 99 | 4. 🚀Start asking questions! 100 | 101 | ![docGPT](https://github.com/Lin-jun-xiang/docGPT-streamlit/blob/main/static/img/docGPT.gif?raw=true) 102 | 103 | > [!WARNING] 104 | > Due to resource limitations in the free version of Streamlit Cloud, the application may experience crashes when used by multiple users simultaneously ([Oh no!](https://github.com/Lin-jun-xiang/docGPT-langchain/issues/2)). If you encounter this problem, feel free to report it in the issue tracker, and the developers will restart the application. 105 | 106 | --- 107 | 108 | ### 🧠How to Develop a docGPT with Streamlit? 109 | 110 | A step-by-step tutorial to quickly build your own chatGPT! 111 | 112 | First, clone the repository using `git clone https://github.com/Lin-jun-xiang/docGPT-streamlit.git`. 113 | 114 | There are a few ways to run it: 115 | 116 | * **Local development without docker**: 117 | * Install the required packages for development. 118 | ``` 119 | pip install -r requirements.txt 120 | ``` 121 | 122 | * Start the service in the project's root directory. 
123 | ``` 124 | streamlit run ./app.py 125 | ``` 126 | 127 | * Start exploring! Your server will now be running at `http://localhost:8501`. 128 | 129 | * **Local development with docker**: 130 | * Start the service using Docker Compose 131 | ``` 132 | docker-compose up 133 | ``` 134 | 135 | Your server will now be running at `http://localhost:8501`. You can interact with `docGPT` or run your tests as you would normally. 136 | 137 | * To stop the Docker containers, simply run: 138 | ``` 139 | docker-compose down 140 | ``` 141 | 142 | * **Streamlit Community Cloud for free** deployment, management, and sharing of applications: 143 | - Place your application in a public GitHub repository (ensure you have `requirements.txt`). 144 | - Log in to [share.streamlit.io](https://share.streamlit.io/). 145 | - Click "Deploy an App," then paste your GitHub URL. 146 | - Complete deployment and share your [application](https://docgpt-app.streamlit.app/). 147 | 148 | Due to the limitations of the free version of Streamlit Cloud and its reliance on server resources, `docGPT` may experience some latency. We recommend deploying it locally for a smoother experience. 149 | 150 | --- 151 | 152 | ### 💬Advanced - How to build a better model in langchain 153 | 154 | To build a powerful docGPT model in LangChain, consider these tips to enhance performance: 155 | 156 | 1. **Language Model** 157 | 158 | Select an appropriate LLM model, such as OpenAI's `gpt-3.5-turbo` or other models. Experiment with different models to find the best fit for your use case. 159 | 160 | ```python 161 | # ./docGPT/docGPT.py 162 | llm = ChatOpenAI( 163 | temperature=0.2, 164 | max_tokens=2000, 165 | model_name='gpt-3.5-turbo' 166 | ) 167 | ``` 168 | 169 | Please note that there is no best or worst model. You need to try multiple models to find the one that suits your use case the best. For more OpenAI models, please refer to the [documentation](https://platform.openai.com/docs/models). 
170 | 171 | (Some models support up to 16,000 tokens!) 172 | 173 | 2. **PDF Loader** 174 | 175 | Choose a suitable PDF loader. Consider using `PyMuPDF` for fast text extraction and `PDFPlumber` for extracting text from tables. 176 | 177 | ([official Langchain documentation](https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf)) 178 | 179 | * `PyPDF`: Simple and easy to use. 180 | * `PyMuPDF`: Reads the document very **quickly** and provides additional metadata such as page numbers and document dates. 181 | * `PDFPlumber`: Can **extract text within tables**. Similar to PyMuPDF, it provides metadata but takes longer to parse. 182 | 183 | If your document contains multiple tables and important information is within those tables, it is recommended to try `PDFPlumber`, which may give you unexpected results! 184 | 185 | Please do not overlook this detail, as without correctly parsing the text from the document, even the most powerful LLM model would be useless! 186 | 187 | 3. **Tracking Token Usage** 188 | 189 | Implement token usage tracking with callbacks in LangChain to monitor token and API key usage during the QA chain process. 190 | 191 | When using `chain.run`, you can try using the [method](https://python.langchain.com/docs/modules/model_io/models/llms/how_to/token_usage_tracking) provided by Langchain to track token usage here: 192 | 193 | ```python 194 | from langchain.callbacks import get_openai_callback 195 | 196 | with get_openai_callback() as callback: 197 | response = self.qa_chain.run(query) 198 | 199 | print(callback) 200 | 201 | # Result of print 202 | """ 203 | chain... 204 | ... 205 | > Finished chain. 
206 | Total Tokens: 1506 207 | Prompt Tokens: 1350 208 | Completion Tokens: 156 209 | Total Cost (USD): $0.03012 210 | """ 211 | ``` 212 | 213 | Back to top 214 | -------------------------------------------------------------------------------- /README.zh-TW.md: -------------------------------------------------------------------------------- 1 |

2 | Chatbot Image 3 |

4 | 5 | [English](./README.md) | [中文版](./README.zh-TW.md) 6 | 7 | 免費的`docGPT`允許您與您的文件 (`.pdf`, `.docx`, `.csv`, `.txt`) 進行對話,無需任何金鑰或費用。 8 | 9 | 此外,您也可以根據該文件操作,將程序部屬在任何地方。 10 | 11 | - 目錄 12 | - [Introduction](#introduction) 13 | - [Features](#🧨features) 14 | - [What's LangChain?](#whats-langchain) 15 | - [How to Use docGPT?](#how-to-use-docgpt) 16 | - [How to develope a docGPT with streamlit?](#how-to-develope-a-docgpt-with-streamlit) 17 | - [Advanced - How to build a better model in langchain](#advanced---how-to-build-a-better-model-in-langchain) 18 | 19 | * 主要開發軟體與套件: 20 | * `Python 3.10.11` 21 | * `Langchain 0.0.218` 22 | * `Streamlit 1.22.0` 23 | * [more](./requirements.txt) 24 | 25 | 如果您喜歡這個專案,請給予⭐`Star`以支持開發者~ 26 | 27 | ### 📚Introduction 28 | 29 | * 上傳來自本地的 Document 連結 (`.pdf`, `.docx`, `.csv`, `.txt`),並且向 `docGPT` 詢問有關 Document 內容。例如: 您可以請 GPT 幫忙總結文章 30 | * 提供兩種模型選擇: 31 | * `gpt4free` 32 | * **完全免費,"允許使用者在無需輸入 API 金鑰或付款的情況下使用該應用程序"** 33 | * 需選擇 `Provider`。有關 `gpt4free` 的更多詳細信息,請參閱[源專案](https://github.com/xtekky/gpt4free) 34 | * `openai` 35 | * **須具備** `openai_api_key`,您可以從此[鏈接](https://platform.openai.com/)獲取金鑰 36 | * 若具備 `serpapi_key`,AI 的回應可以包括 Google 搜索結果 37 | 38 |

39 | 40 |

41 | 42 | --- 43 | 44 | ### 🧨Features 45 | 46 | - **`gpt4free` 整合**:任何人都可以免費使用 GPT4,無需輸入 OpenAI API 金鑰。 47 | - **支援 docx, pdf, csv, txt 檔案**: 可以上傳 PDF, Word, CSV, txt 檔 48 | - **直接輸入 Document 網址**:使用者可以直接輸入 Document URL 進行解析,無需從本地上傳檔案(如下方demo所示)。 49 | - **Langchain Agent**:AI 能夠回答當前問題,實現類似 Google 搜尋功能。 50 | - **簡易操作環境**:友善的界面,操作簡便 51 | 52 | --- 53 | 54 | ### 🦜️What's LangChain? 55 | 56 | * LangChain 是一個用於**開發由語言模型支持的應用程序的框架**。它支持以下應用程序 57 | 1. 將 LLM 模型與外部數據源進行連接 58 | 2. 允許與 LLM 模型進行交互 59 | 60 | * 有關 langchain 的介紹,建議查看官方文件、[Github源專案](https://github.com/hwchase17/langchain) 61 | 62 | 63 | **ChatGPT 無法回答的問題,交給 Langchain 實現!** 64 | 65 | LangChain 填補了 ChatGPT 的不足之處。通過以下示例,您可以理解 LangChain 的威力: 66 | 67 | > 在 ChatGPT 無法解答數學問題或回答 2020 年以後的問題(例如“2023 年的總統是誰?”)的情況下: 68 | > 69 | > * 數學問題: 有專門處理數學問題的 math-LLM 模型 70 | > * 現今問題: 使用 Google 搜索 71 | > 72 | > 要創建一個全面的 AI 模型,我們需要結合 "ChatGPT"、"math-LLM" 和 "Google 搜索" 工具。 73 | > 74 | > 在非 AI 時代,我們將使用 `if...else...` 將用戶查詢進行分類,讓用戶選擇問題類型(通過 UI)。 75 | > 76 | > 在 AI 時代,用戶應能夠直接提問。通過 LangChain 的 agent: 77 | > 78 | > * 我們向 agent 提供工具,例如 `tools = ['chatgpt', 'math-llm', 'google-search']` 79 | > * 工具可以包括使用 LangChain 設計的 chains,例如使用 `retrievalQA chain` 回答來自文檔的問題。 80 | > * agent 根據用戶查詢自動決定使用哪個工具(完全自動化)。 81 | 82 | 通過 LangChain,您可以創建通用的 AI 模型,也可以為**商業應用**量身定制。 83 | 84 | --- 85 | 86 | ### 🚩How to Use docGPT? 87 | 88 | 1. 🎬前往[應用程序](https://docgpt-app.streamlit.app/) 89 | 90 | 2. 🔑輸入您的 `API_KEY` (在版本 3 中為可選,您可以使用 `gpt4free` 免費模型): 91 | * `OpenAI API KEY`: 確保還有可用的使用次數。 92 | * `SERPAPI API KEY`: 如果您要查詢 Document 中不存在的內容,則需要使用此金鑰。 93 | 94 | 3. 📁上傳來自本地的 Document 檔案 (選擇一個方法) 95 | * 方法一: 從本地機瀏覽並上傳自己的 `.pdf`, `.docx`, `.csv` or `.txt` 檔 96 | * 方法二: 輸入 Document URL 連結 97 | 98 | 4. 🚀開始提問 ! 
99 | 100 | ![RGB_cleanup](https://github.com/Lin-jun-xiang/docGPT-streamlit/blob/main/static/img/docGPT.gif?raw=true) 101 | 102 | > [!WARNING] 103 | > 由於免費版 streamlit cloud 資源限制,該程序在多人同時使用時,容易引發崩潰([Oh no!](https://github.com/Lin-jun-xiang/docGPT-langchain/issues/2)),若遇上該問題歡迎到 Issue 提醒開發者,開發者會重啟程序。 104 | 105 | 106 | --- 107 | 108 | ### 🧠How to develope a docGPT with streamlit? 109 | 110 | 手把手教學,讓您快速建立一個屬於自己的 chatGPT ! 111 | 112 | 首先請進行 `git clone https://github.com/Lin-jun-xiang/docGPT-streamlit.git` 113 | 114 | 方法有如下幾種方法: 115 | 116 | * 於**本地開發方式(不使用docker)**: 117 | * 下載開發需求套件 118 | ``` 119 | pip install -r requirements.txt 120 | ``` 121 | 122 | * 於專案根目錄啟動服務 123 | ``` 124 | streamlit run ./app.py 125 | ``` 126 | 127 | * 開始體驗! 您的服務會運行在 `http://localhost:8501`. 128 | 129 | * 於**本地開發方式(使用docker)**: 130 | * 使用 Docker Compose 啟動服務 131 | ``` 132 | docker-compose up 133 | ``` 134 | 135 | 您的服務會運行在 `http://localhost:8501`. 您可以開始使用 `docGPT` 應用程序 136 | 137 | * 停止服務運行 138 | ``` 139 | docker-compose down 140 | ``` 141 | 142 | * 使用 Streamlit Community **Cloud 免費部屬**、管理和共享應用程序 143 | * 將您的應用程序放在公共 GitHub 存儲庫中(確保有 `requirements.txt`!) 144 | * 登錄[share.streamlit.io](https://share.streamlit.io/) 145 | * 單擊“部署應用程序”,然後粘貼您的 GitHub URL 146 | * 完成部屬[應用程序](https://docgpt-app.streamlit.app//) 147 | 148 | 由於 `docGPT` 是使用 streamlit cloud 免費版部屬,受限於設備關係會有不少延遲,建議使用者可以使用本地部屬方式來體驗。 149 | 150 | --- 151 | 152 | ### 💬Advanced - How to build a better model in langchain 153 | 154 | 要在 LangChain 中構建功能強大的 docGPT 模型,請考慮以下技巧以改進性能 155 | 156 | 1. **Language Model** 157 | 158 | 使用適當的 LLM Model,會讓您事半功倍,例如您可以選擇使用 OpenAI 的 `gpt-3.5-turbo` (預設是 `text-davinci-003`): 159 | 160 | ```python 161 | # ./docGPT/docGPT.py 162 | llm = ChatOpenAI( 163 | temperature=0.2, 164 | max_tokens=2000, 165 | model_name='gpt-3.5-turbo' 166 | ) 167 | ``` 168 | 169 | 請注意,模型之間並沒有最好與最壞,您需要多試幾個模型,才會發現最適合自己案例的模型,更多 OpenAI model 請[參考](https://platform.openai.com/docs/models) 170 | 171 | (部分模型可以使用 16,000 tokens!) 172 | 173 | 2. 
**PDF Loader** 174 | 175 | 在 Python 中有許多解析 PDF 文字的 Loader,每個 Loader 各有優缺點,以下整理三個作者用過的 176 | 177 | ([Langchain官方介紹](https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf)): 178 | 179 | * `PyPDF`: 簡單易用 180 | * `PyMuPDF`: 讀取文件**速度非常快速**,除了能解析文字,還能取得頁數、文檔日期...等 MetaData。 181 | * `PDFPlumber`: 能夠解析出**表格內部文字**,使用方面與 `PyMuPDF` 相似,皆能取得 MetaData,但是解析時間較長。 182 | 183 | 如果您的文件具有多個表格,且重要資訊存在表格中,建議您嘗試 `PDFPlumber`,它會給您意想不到的結果! 184 | 請不要忽略這個細節,因為沒有正確解析出文件中的文字,即使 LLM 模型再強大也無用! 185 | 186 | 3. **Tracking Token Usage** 187 | 188 | 這個並不能讓模型強大,但是能讓您清楚知道 QA Chain 的過程中,您使用的 tokens、openai api key 的使用量。 189 | 190 | 當您使用 `chain.run` 時,可以嘗試用 langchain 提供的 [方法](https://python.langchain.com/docs/modules/model_io/models/llms/how_to/token_usage_tracking): 191 | 192 | ```python 193 | from langchain.callbacks import get_openai_callback 194 | 195 | with get_openai_callback() as callback: 196 | response = self.qa_chain.run(query) 197 | 198 | print(callback) 199 | 200 | # Result of print 201 | """ 202 | chain... 203 | ... 204 | > Finished chain. 
205 | Total Tokens: 1506 206 | Prompt Tokens: 1350 207 | Completion Tokens: 156 208 | Total Cost (USD): $0.03012 209 | """ 210 | ``` 211 | 212 | Back to top 213 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 4 | os.environ['SERPAPI_API_KEY'] = '' 5 | 6 | import streamlit as st 7 | from streamlit import logger 8 | from streamlit_chat import message 9 | 10 | from components import get_response, side_bar, theme, upload_and_process_document 11 | from docGPT import create_doc_gpt 12 | 13 | OPENAI_API_KEY = '' 14 | SERPAPI_API_KEY = '' 15 | model = None 16 | 17 | st.session_state.openai_api_key = None 18 | st.session_state.serpapi_api_key = None 19 | st.session_state.g4f_provider = None 20 | st.session_state.button_clicked = None 21 | 22 | 23 | if 'response' not in st.session_state: 24 | st.session_state['response'] = ['How can I help you?'] 25 | 26 | if 'query' not in st.session_state: 27 | st.session_state['query'] = ['Hi'] 28 | 29 | app_logger = logger.get_logger(__name__) 30 | 31 | 32 | def main(): 33 | global model 34 | theme() 35 | side_bar() 36 | 37 | doc_container = st.container() 38 | with doc_container: 39 | docs = upload_and_process_document() 40 | 41 | if docs: 42 | model = create_doc_gpt( 43 | docs, 44 | {k: v for k, v in docs[0].metadata.items() if k not in ['source', 'file_path']}, 45 | st.session_state.g4f_provider 46 | ) 47 | app_logger.info(f'{__file__}: Created model: {model}') 48 | del docs 49 | st.write('---') 50 | 51 | user_container = st.container() 52 | response_container = st.container() 53 | with user_container: 54 | query = st.text_input( 55 | "#### Question:", 56 | placeholder='Enter your question' 57 | ) 58 | 59 | if model and query and query != '' and not st.session_state.button_clicked: 60 | response = get_response(query, model) 61 | 
def upload_and_process_document() -> list:
    """Render the document-upload widgets and return the document as split chunks.

    Two tabs are shown: a file uploader (``pdf``, ``docx``, ``csv``, ``txt``)
    and a text input for a document URL. When a URL is entered, the result of
    ``DocumentLoader.crawl_file`` replaces whatever was uploaded in the first
    tab.

    The raw bytes are written to a temporary file, loaded with
    ``DocumentLoader.load_documents`` and split with
    ``DocumentLoader.split_documents`` (``chunk_size=2000``,
    ``chunk_overlap=200``). The temporary file is always removed afterwards.

    Returns:
        The value returned by ``DocumentLoader.split_documents``, or ``None``
        when there is no content or no file type to work with (for example,
        nothing has been uploaded yet). Callers should test the result for
        truthiness.
    """
    st.write('#### Upload a Document file')
    browse, url_link = st.tabs(
        ['Drag and drop file (Browse files)', 'Enter document URL link']
    )

    with browse:
        uploaded = st.file_uploader(
            'Browse file (.pdf, .docx, .csv, `.txt`)',
            type=['pdf', 'docx', 'csv', 'txt'],
            label_visibility='hidden'
        )
        # The extension (lower-cased, with the leading dot) selects the loader.
        filetype = os.path.splitext(uploaded.name)[1].lower() if uploaded else None
        content = uploaded.read() if uploaded else None

    with url_link:
        doc_url = st.text_input(
            "Enter document URL Link (.pdf, .docx, .csv, .txt)",
            placeholder='https://www.xxx/uploads/file.pdf',
            label_visibility='hidden'
        )
        if doc_url:
            # A URL takes precedence over a file uploaded in the other tab.
            content, filetype = DocumentLoader.crawl_file(doc_url)

    if not (content and filetype):
        return None

    temp_file_path = None
    try:
        # delete=False keeps the file on disk after the handle is closed so the
        # loader can reopen it by path. Leaving the `with` block closes (and
        # therefore flushes) the handle *before* the loader reads the file;
        # otherwise buffered bytes may not have reached the disk yet.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(content)
            temp_file_path = temp_file.name

        docs = DocumentLoader.load_documents(temp_file_path, filetype)
        docs = DocumentLoader.split_documents(
            docs, chunk_size=2000,
            chunk_overlap=200
        )
    finally:
        # Remove the temporary file even when loading or splitting fails.
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    return docs
def side_bar() -> None:
    """Render the sidebar: usage notes, API-key inputs and the gpt4free provider picker.

    Side effects visible in this function:

    * ``st.session_state.openai_api_key`` / ``st.session_state.serpapi_api_key``
      are read; when empty, a password text input is shown and its value is
      stored back into the session state.
    * ``os.environ['OPENAI_API_KEY']`` and ``os.environ['SERPAPI_API_KEY']``
      are set to the resulting values.
    * ``st.session_state.g4f_provider`` is set to the selected provider name
      (``'BestProvider'`` or a key of ``GPT4Free().providers_table``).
    * ``st.session_state.button_clicked`` is set to the state of the
      "Show Available Providers" button; when clicked, the available providers
      are fetched and a question/answer pair is appended to
      ``st.session_state.query`` / ``st.session_state.response``.
    """
    with st.sidebar:
        with st.expander(':orange[How to use?]'):
            st.markdown(
                """
                1. Enter your API keys: (You can use the `gpt4free` free model **without API keys**)
                    * `OpenAI API Key`: Make sure you still have usage left
                    * `SERPAPI API Key`: Optional. If you want to ask questions about content not appearing in the PDF document, you need this key.
                2. **Upload a Document** file (choose one method):
                    * method1: Browse and upload your own document file from your local machine.
                    * method2: Enter the document URL link directly.

                    (**support documents**: `.pdf`, `.docx`, `.csv`, `.txt`)
                3. Start asking questions!
                4. More details.(https://github.com/Lin-jun-xiang/docGPT-streamlit)
                5. If you have any questions, feel free to leave comments and engage in discussions.(https://github.com/Lin-jun-xiang/docGPT-streamlit/issues)
                """
            )

    with st.sidebar:
        # Reuse a key already held in the session; otherwise ask for one.
        if st.session_state.openai_api_key:
            OPENAI_API_KEY = st.session_state.openai_api_key
            st.sidebar.success('API key loaded from previous input')
        else:
            OPENAI_API_KEY = st.sidebar.text_input(
                label='#### Your OpenAI API Key 👇',
                placeholder="sk-...",
                type="password",
                key='OPENAI_API_KEY'
            )
            st.session_state.openai_api_key = OPENAI_API_KEY

        os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

    with st.sidebar:
        # Same flow as above for the SerpAPI key.
        if st.session_state.serpapi_api_key:
            SERPAPI_API_KEY = st.session_state.serpapi_api_key
            st.sidebar.success('API key loaded from previous input')
        else:
            SERPAPI_API_KEY = st.sidebar.text_input(
                label='#### Your SERPAPI API Key 👇',
                placeholder="...",
                type="password",
                key='SERPAPI_API_KEY'
            )
            st.session_state.serpapi_api_key = SERPAPI_API_KEY

        os.environ['SERPAPI_API_KEY'] = SERPAPI_API_KEY

    with st.sidebar:
        gpt4free = GPT4Free()
        st.session_state.g4f_provider = st.selectbox(
            (
                "#### Select a provider if you want to use free model. "
                "([details](https://github.com/xtekky/gpt4free#models))"
            ),
            (['BestProvider'] + list(gpt4free.providers_table.keys()))
        )

        st.session_state.button_clicked = st.button(
            'Show Available Providers',
            help='Click to test which providers are currently available.',
            type='primary'
        )
        if st.session_state.button_clicked:
            # show_available_providers() is a coroutine; run it to completion here.
            available_providers = asyncio.run(gpt4free.show_available_providers())
            # Record the result as a chat exchange in the session's history lists.
            st.session_state.query.append('What are the available providers right now?')
            st.session_state.response.append(
                'The current available providers are:\n'
                f'{available_providers}'
            )
class AgentHelper:
    """Wrap a LangChain agent so that docGPT can perform better.

    Usage: assign ``llm``, build the tools through the ``get_*`` properties
    and ``create_*`` methods, pass them to ``initialize`` and finally call
    ``run`` with the user query.
    """
    def __init__(self) -> None:
        self._llm = None
        self.agent_ = None
        self.tools = []

    @property
    def llm(self):
        """The language model shared by the agent and its LLM-backed tools."""
        return self._llm

    @llm.setter
    def llm(self, llm) -> None:
        self._llm = llm

    @property
    def get_calculate_chain(self) -> 'Tool':
        """Build a 'Calculator' tool backed by an ``LLMMathChain``."""
        from langchain import LLMMathChain

        llm_math_chain = LLMMathChain.from_llm(llm=self.llm, verbose=True)
        return Tool(
            name='Calculator',
            func=llm_math_chain.run,
            description='useful for when you need to answer questions about math'
        )

    @property
    def get_searp_chain(self) -> 'Tool':
        """Build a 'Search' tool backed by ``SerpAPIWrapper``."""
        from langchain import SerpAPIWrapper

        search = SerpAPIWrapper()
        return Tool(
            name='Search',
            func=search.run,
            description='useful for when you need to answer questions about current events'
        )

    def create_doc_chat(self, docGPT) -> 'Tool':
        """Add a custom docGPT tool that answers through ``docGPT.run``."""
        return Tool(
            name='DocumentGPT',
            func=docGPT.run,
            description="""
            useful for when you need to answer questions from the context of PDF
            """
        )

    def create_llm_chain(self) -> 'Tool':
        """Add a plain LLM tool that forwards the query to the model as-is."""
        prompt = PromptTemplate(
            input_variables=['query'],
            template='{query}'
        )
        llm_chain = LLMChain(llm=self.llm, prompt=prompt)

        return Tool(
            name='LLM',
            func=llm_chain.run,
            description='useful for general purpose queries and logic.'
        )

    def initialize(self, tools):
        """Register the given tools and create the zero-shot ReAct agent.

        Entries that are not ``Tool`` instances (for example ``None``
        placeholders for tools that could not be built) are ignored.
        """
        self.tools.extend(tool for tool in tools if isinstance(tool, Tool))

        self.agent_ = initialize_agent(
            self.tools,
            self.llm,
            agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
            verbose=True
        )

    def run(self, query: str) -> Optional[str]:
        """Run the agent on ``query`` and return its answer.

        Args:
            query: The user question.

        Returns:
            The agent's answer. When the agent fails with an output-parsing
            error, a message starting with 'Something wrong in agent: ' is
            returned instead of raising.

        Raises:
            ValueError: Any ``ValueError`` raised by the agent that is not
                an output-parsing error.
        """
        response = None
        with get_openai_callback() as callback:
            try:
                response = self.agent_.run(query)
            except ValueError as e:
                # Test the raw exception text. Testing the prefixed response
                # could never match, so every ValueError was re-raised.
                error_text = str(e)
                if not error_text.startswith("Could not parse LLM output: `"):
                    raise
                response = 'Something wrong in agent: ' + error_text

            print(callback)
        return response
class BaseQaChain(ABC):
    """Abstract holder of the ingredients needed to build a QA chain.

    Attributes are stored as given:
        chain_type: Name of the chain type to build (e.g. 'refine').
        retriever: Retriever supplying documents to the chain.
        llm: Language model the chain runs on.
    """
    def __init__(
        self,
        chain_type: str,
        retriever,
        llm
    ) -> None:
        self.chain_type = chain_type
        self.retriever = retriever
        self.llm = llm

    # Declared as an abstract *property*: the concrete chains in this module
    # implement it as a property and the caller reads it without calling.
    @property
    @abstractmethod
    def create_qa_chain(self):
        """Build and return the QA chain object."""
class DocGPT:
    """Answer questions about a set of documents with a retrieval QA chain.

    Usage: construct with the split documents, assign ``llm``, call
    ``create_qa_chain`` once, then call ``run`` for every query.
    """
    def __init__(self, docs):
        self.docs = docs
        self.qa_chain = None
        self._llm = None

        self.prompt_template = (
            "Only answer what is asked. Answer step-by-step.\n"
            "If the content has sections, please summarize them "
            "in order and present them in a bulleted format.\n"
            "Utilize line breaks for better readability.\n"
            "For example, sequentially summarize the "
            "introduction, methods, results, and so on.\n"
            "Please use Python's newline symbols appropriately to "
            "enhance the readability of the response, "
            "but don't use two newline symbols consecutive.\n\n"
            "{context}\n\n"
            "Question: {question}\n"
        )
        self.prompt = PromptTemplate(
            template=self.prompt_template,
            input_variables=['context', 'question']
        )

        self.refine_prompt_template = (
            "The original question is as follows: {question}\n"
            "We have provided an existing answer: {existing_answer}\n"
            # Trailing space added: the two fragments used to be glued
            # together as "answer(only if needed)".
            "We have the opportunity to refine the existing answer "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{context_str}\n"
            "------------\n"
            "Given the new context, refine the original answer to better "
            "answer the question. "
            "If the context isn't useful, return the original answer.\n"
            "Please use Python's newline symbols "
            "appropriately to enhance the readability of the response, "
            "but don't use two newline symbols consecutive.\n"
        )
        self.refine_prompt = PromptTemplate(
            template=self.refine_prompt_template,
            input_variables=['question', 'existing_answer', 'context_str']
        )

    @property
    def llm(self):
        """The language model used when the QA chain is created."""
        return self._llm

    @llm.setter
    def llm(self, llm) -> None:
        self._llm = llm

    def _helper_prompt(self, chain_type: str) -> None:
        """Rename the question prompt's ``context`` variable for 'refine'.

        For ``chain_type == 'refine'`` the placeholder ``{context}`` is
        replaced by ``{context_str}`` in both the template text and the
        prompt's ``input_variables``. Other chain types leave the prompt
        untouched. Calling it repeatedly is harmless.
        """
        if chain_type != 'refine':
            return

        self.prompt_template = self.prompt_template.replace(
            '{context}', '{context_str}'
        )
        self.prompt.template = self.prompt_template
        # Mutate the list in place, as the prompt object owns it.
        variables = self.prompt.input_variables
        variables[:] = [
            'context_str' if name == 'context' else name
            for name in variables
        ]

    def _chain_type_kwargs(self, chain_type: str, verbose: bool) -> dict:
        """Return the ``chain_type_kwargs`` matching ``chain_type``.

        'stuff' receives the question prompt as ``prompt``; every other
        chain type receives ``question_prompt`` and ``refine_prompt``.
        """
        if chain_type == 'stuff':
            # The refine-only keys were previously passed here as well,
            # which a 'stuff' chain is not expected to accept.
            return {'prompt': self.prompt, 'verbose': verbose}

        # NOTE(review): only 'refine' is known to take these keys; other
        # chain types keep the previous behaviour unchanged - confirm.
        return {
            'question_prompt': self.prompt,
            'verbose': verbose,
            'refine_prompt': self.refine_prompt
        }

    def _embeddings(self):
        """Embed ``self.docs`` and return the resulting FAISS index.

        OpenAI embeddings are tried first; if constructing them fails, a
        HuggingFace sentence-transformers model is used instead.
        """
        try:
            # If have openai api
            embeddings = OpenAIEmbeddings()
        except Exception as e:
            module_logger.info(
                f'OpenAI embeddings unavailable ({e}); '
                'falling back to HuggingFace embeddings'
            )
            embeddings = HuggingFaceEmbeddings(
                model_name=(
                    'sentence-transformers/'
                    'multi-qa-MiniLM-L6-cos-v1'
                )
            )

        db = FAISS.from_documents(
            documents=self.docs,
            embedding=embeddings
        )
        module_logger.info('embedded...')
        return db

    def create_qa_chain(
        self,
        chain_type: str = 'stuff',
        verbose: bool = True
    ) -> None:
        """Build the retrieval QA chain and store it in ``self.qa_chain``.

        Args:
            chain_type: Chain type handed to the retrieval chain.
            verbose: Forwarded to the chain through ``chain_type_kwargs``.
        """
        self._helper_prompt(chain_type)
        chain_type_kwargs = self._chain_type_kwargs(chain_type, verbose)

        retriever = self._embeddings().as_retriever()

        # `create_qa_chain` on RChain is a property, hence no call here.
        self.qa_chain = RChain(
            chain_type=chain_type,
            retriever=retriever,
            llm=self._llm,
            chain_type_kwargs=chain_type_kwargs
        ).create_qa_chain

    def run(self, query: str) -> str:
        """Answer ``query`` with the QA chain.

        Returns 'Nothing...' when no ``RetrievalQA`` chain has been created.
        """
        response = 'Nothing...'
        with get_openai_callback() as callback:
            if isinstance(self.qa_chain, RetrievalQA):
                response = self.qa_chain.run(query)
        module_logger.info(callback)
        return response
class DocumentLoader:
    """Static helpers to locate, download, load and split documents."""

    @staticmethod
    def get_files(path: str, filetype: str = '.pdf') -> Iterator[str]:
        """Yield the names of entries in ``path`` that end with ``filetype``.

        Nothing is yielded (and the error is printed) when ``path`` does not
        exist. The directory is read lazily, on first iteration.
        """
        try:
            file_names = os.listdir(path)
        except FileNotFoundError as e:
            # Reset the colour afterwards so later output is not tinted red.
            print(f'\033[31m{e}\033[0m')
            return

        yield from (
            file_name for file_name in file_names
            if file_name.endswith(filetype)
        )

    @staticmethod
    def load_documents(
        file: str,
        filetype: str = '.pdf'
    ) -> list:
        """Load a PDF, Docx, CSV or plain-text file.

        Args:
            file: Path of the file to load.
            filetype: One of '.pdf', '.docx', '.csv', '.txt'.

        Returns:
            Whatever the matching loader's ``load()`` returns, or an empty
            list when the file type is unsupported or loading fails.
        """
        # Factories are only invoked for a supported type, so an unknown
        # type is reported explicitly instead of via an UnboundLocalError.
        loader_factories = {
            '.pdf': lambda: PyMuPDFLoader(file),
            '.docx': lambda: Docx2txtLoader(file),
            '.csv': lambda: CSVLoader(file, encoding='utf-8'),
            '.txt': lambda: TextLoader(file, encoding='utf-8'),
        }
        make_loader = loader_factories.get(filetype)
        if make_loader is None:
            print(f'\033[31mUnsupported file type: {filetype!r}\033[0m')
            return []

        try:
            return make_loader().load()
        except Exception as e:
            print(f'\033[31m{e}\033[0m')
            return []

    @staticmethod
    def split_documents(
        document: list,
        chunk_size: int = 2000,
        chunk_overlap: int = 0
    ) -> list:
        """Split loaded documents into chunks.

        Args:
            document: The documents returned by ``load_documents``.
            chunk_size: Passed to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )

        return splitter.split_documents(document)

    @staticmethod
    def crawl_file(url: str, timeout: float = 30) -> Union[tuple, None]:
        """Download a supported document from ``url``.

        Args:
            url: Address of a '.pdf', '.docx', '.csv' or '.txt' file.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            ``(content_bytes, filetype)`` on success, where ``filetype`` is
            the bare supported extension (e.g. '.pdf' even when the URL ends
            in '.pdf?download=1'). ``None`` after showing a Streamlit
            warning when the URL is unsupported or the request fails.
        """
        supported = ('.pdf', '.docx', '.csv', '.txt')
        raw_extension = os.path.splitext(url)[1].lower()
        filetype = next(
            (ext for ext in supported if ext in raw_extension), None
        )
        # Validate before downloading so unsupported URLs cost no request.
        if filetype is None:
            st.warning('Url cannot parse correctly.')
            return None

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            st.warning('Url cannot parse correctly.')
            return None

        if response.status_code != 200:
            st.warning('Url cannot parse correctly.')
            return None

        return response.content, filetype
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/2023-08-24-15-02-11.png -------------------------------------------------------------------------------- /static/img/2023-08-29-13-39-00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/2023-08-29-13-39-00.png -------------------------------------------------------------------------------- /static/img/2023-09-06-14-56-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/2023-09-06-14-56-20.png -------------------------------------------------------------------------------- /static/img/chatbot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/chatbot.png -------------------------------------------------------------------------------- /static/img/chatbot_v2.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/chatbot_v2.1.png -------------------------------------------------------------------------------- /static/img/chatbot_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/chatbot_v2.png -------------------------------------------------------------------------------- /static/img/docGPT.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/docGPT.gif -------------------------------------------------------------------------------- /static/img/repos_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/repos_logo.png -------------------------------------------------------------------------------- /static/img/repos_logo_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lin-jun-xiang/docGPT-langchain/f369908cfc5ca100fa231e7189ed2afde29d4d49/static/img/repos_logo_v1.png --------------------------------------------------------------------------------