├── .DS_Store ├── .dockerignore ├── .gitignore ├── .python-version ├── Dockerfile ├── README.md ├── bot ├── app.py ├── backend.py ├── create.py ├── frontend.py └── query.py ├── notebook └── pinecone.ipynb └── requirements.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/teekyboy/gcp_chatbot/45239befd187c558963ff1bcf7bbfb64aaa426a1/.DS_Store -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | *.pyo 4 | *.pyd 5 | *.swp 6 | *.bak 7 | *.log 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | 162 | #config.ini file 163 | config.ini 164 | 165 | #notebook folder 166 | notebook/ 167 | 168 | #personal data 169 | chatbot.json 170 | .python-version 171 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | firebase-bot 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=linux/amd64 python:3.10.11-buster 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | build-essential \ 7 | gcc \ 8 | libpq-dev \ 9 | python3-dev \ 10 | && apt-get clean \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | COPY requirements.txt /app 14 | COPY chatbot.json . 15 | 16 | RUN pip install --no-cache-dir --trusted-host pypi.python.org -r requirements.txt 17 | 18 | COPY . /app 19 | 20 | ENV PORT=8080 21 | 22 | EXPOSE $PORT 23 | 24 | CMD ["sh", "-c", "streamlit run --server.port $PORT bot/app.py"] 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Using Firebase and Pinecone to Create a Question Answering App 2 | 3 | This code enables the creation of a database on Firbase and facilitates answering queries utilizing the said database. The vectors are preserved within Pinecone. 4 | 5 | Queries and responses are recorded in a csv on Firebase. It is no longer necessary for the uploaded files to originate from local uploads. 6 | 7 | Used Google Secret Manager to store all keys. 
8 | -------------------------------------------------------------------------------- /bot/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from backend import app as backend_app 3 | from frontend import main as frontend_app 4 | 5 | st.set_page_config(page_title="My App", page_icon=None, layout="wide", initial_sidebar_state="expanded") 6 | 7 | def main(): 8 | st.sidebar.title("Navigation") 9 | app_mode = st.sidebar.radio("What do you want to do?", ["Load New Files" , "Test My Knowledge"]) 10 | 11 | if app_mode == "Load New Files": 12 | backend_app() 13 | elif app_mode == "Test My Knowledge": 14 | frontend_app() 15 | 16 | 17 | add_custom_css() 18 | 19 | def add_custom_css(): 20 | custom_css = """ 21 | 42 | """ 43 | st.markdown(custom_css, unsafe_allow_html=True) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /bot/backend.py: -------------------------------------------------------------------------------- 1 | import os 2 | from google.cloud import storage 3 | import streamlit as st 4 | import pinecone 5 | from create import load_documents 6 | from langchain.vectorstores import Pinecone 7 | 8 | #for cloud run 9 | os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/app/chatbot.json' 10 | #for local run 11 | #os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../chatbot.json' 12 | 13 | def upload_file_to_gcs(uploaded_file, folder_prefix): 14 | storage_client = storage.Client() 15 | bucket_name = "chatbot.appspot.com" 16 | bucket = storage_client.get_bucket(bucket_name) 17 | blob = bucket.blob(f"{folder_prefix}/{uploaded_file.name}") 18 | blob.upload_from_string(uploaded_file.getvalue(), content_type=uploaded_file.type) 19 | 20 | def create_database(docs, embeddings, PINECONE_API_KEY, PINECONE_ENV, PINECONE_INDEX): 21 | pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV) 22 | index_name = PINECONE_INDEX 23 | 
db = Pinecone.from_documents(docs, embeddings, index_name=index_name) 24 | return db 25 | 26 | def app(): 27 | st.title("Upload New Files") 28 | uploaded_file = st.file_uploader("Choose a file to upload", type=['txt', 'pdf', 'doc', 'docx']) 29 | 30 | if uploaded_file is not None: 31 | file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size} 32 | st.write(file_details) 33 | 34 | if st.button("Upload and Process Documents"): 35 | folder_prefix = "data/input" 36 | upload_file_to_gcs(uploaded_file, folder_prefix) 37 | docs, embeddings, PINECONE_API_KEY, PINECONE_ENV, PINECONE_INDEX = load_documents() 38 | db = create_database(docs, embeddings, PINECONE_API_KEY, PINECONE_ENV, PINECONE_INDEX) 39 | st.success("Database created successfully.") 40 | 41 | if __name__ == "__main__": 42 | app() 43 | -------------------------------------------------------------------------------- /bot/create.py: -------------------------------------------------------------------------------- 1 | from langchain.document_loaders import GCSDirectoryLoader 2 | from langchain.text_splitter import RecursiveCharacterTextSplitter 3 | from langchain.embeddings.openai import OpenAIEmbeddings 4 | import os 5 | from google.cloud import secretmanager_v1 6 | 7 | os.environ['GOOGLE_CLOUD_PROJECT'] = 'chatbot' 8 | 9 | def get_secret(secret_id): 10 | project_id = os.environ['GOOGLE_CLOUD_PROJECT'] 11 | client = secretmanager_v1.SecretManagerServiceClient() 12 | secret_name = f"projects/{project_id}/secrets/{secret_id}/versions/latest" 13 | response = client.access_secret_version(request={"name": secret_name}) 14 | return response.payload.data.decode('UTF-8') 15 | 16 | def load_documents(): 17 | #for cloud run 18 | os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/app/chatbot.json' 19 | #for local run 20 | #os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../chatbot.json' 21 | OPENAI_API_KEY = get_secret('openai_api_key') 22 | PINECONE_API_KEY = 
get_secret('pinecone_api_key') 23 | PINECONE_ENV = get_secret('pinecone_env') 24 | PINECONE_INDEX = get_secret('pinecone_index') 25 | loader = GCSDirectoryLoader(project_name="chatbot", bucket="chatbot.appspot.com", prefix="data/input") 26 | documents = loader.load() 27 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20) 28 | docs = text_splitter.split_documents(documents) 29 | embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY) 30 | return docs, embeddings, PINECONE_API_KEY, PINECONE_ENV, PINECONE_INDEX 31 | -------------------------------------------------------------------------------- /bot/frontend.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from query import run_query, log_question_answer 3 | import os 4 | from langchain.vectorstores import Pinecone 5 | from langchain.embeddings.openai import OpenAIEmbeddings 6 | from google.cloud import secretmanager_v1 7 | 8 | os.environ['GOOGLE_CLOUD_PROJECT'] = 'chatbot' 9 | 10 | def get_secret(secret_id): 11 | project_id = os.environ['GOOGLE_CLOUD_PROJECT'] 12 | client = secretmanager_v1.SecretManagerServiceClient() 13 | secret_name = f"projects/{project_id}/secrets/{secret_id}/versions/latest" 14 | response = client.access_secret_version(request={"name": secret_name}) 15 | return response.payload.data.decode('UTF-8') 16 | 17 | #for cloud run 18 | os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/app/chatbot.json' 19 | #for local run 20 | #os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../chatbot.json' 21 | OPENAI_API_KEY = get_secret('openai_api_key') 22 | PINECONE_API_KEY = get_secret('pinecone_api_key') 23 | PINECONE_ENV = get_secret('pinecone_env') 24 | PINECONE_INDEX = get_secret('pinecone_index') 25 | index_name = PINECONE_INDEX 26 | embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY) 27 | db = Pinecone.from_existing_index(index_name, embeddings) 28 | 29 | def main(): 30 | st.title("What do you 
want to know?") 31 | st.subheader("Ask Questions") 32 | user_input = st.text_input("Enter your question:") 33 | 34 | if st.button("Submit"): 35 | if user_input: 36 | answer = run_query(user_input, db, OPENAI_API_KEY) 37 | st.write(answer) 38 | log_question_answer(user_input, answer) 39 | else: 40 | st.error("Please enter a question before submitting.") 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /bot/query.py: -------------------------------------------------------------------------------- 1 | from langchain.llms import OpenAI 2 | from langchain.chains import RetrievalQA 3 | import pandas as pd 4 | from google.cloud import storage 5 | import os 6 | from langchain.vectorstores import Pinecone 7 | import pinecone 8 | from langchain.embeddings.openai import OpenAIEmbeddings 9 | import io 10 | from google.cloud import secretmanager_v1 11 | 12 | os.environ['GOOGLE_CLOUD_PROJECT'] = 'chatbot' 13 | 14 | def get_secret(secret_id): 15 | project_id = os.environ['GOOGLE_CLOUD_PROJECT'] 16 | client = secretmanager_v1.SecretManagerServiceClient() 17 | secret_name = f"projects/{project_id}/secrets/{secret_id}/versions/latest" 18 | response = client.access_secret_version(request={"name": secret_name}) 19 | return response.payload.data.decode('UTF-8') 20 | 21 | #for cloud run 22 | os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/app/chatbot.json' 23 | #for local run 24 | #os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../chatbot.json' 25 | OPENAI_API_KEY = get_secret('openai_api_key') 26 | PINECONE_API_KEY = get_secret('pinecone_api_key') 27 | PINECONE_ENV = get_secret('pinecone_env') 28 | PINECONE_INDEX = get_secret('pinecone_index') 29 | index_name = PINECONE_INDEX 30 | embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY) 31 | 32 | pinecone.init( 33 | api_key=PINECONE_API_KEY, 34 | environment=PINECONE_ENV 35 | ) 36 | index_name = PINECONE_INDEX 37 | 38 | db = 
Pinecone.from_existing_index(index_name, embeddings) 39 | 40 | def run_query(user_input, db, OPENAI_API_KEY): 41 | llm = OpenAI(temperature=0.5, openai_api_key=OPENAI_API_KEY) 42 | qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever()) 43 | answer = qa.run(user_input) 44 | return answer 45 | 46 | def log_question_answer(query, answer): 47 | prefix = 'data/output/' 48 | log_file = f'{prefix}questions_answers.csv' 49 | <<<<<<< HEAD 50 | bucket_name = 'chatbot.com' 51 | ======= 52 | bucket_name = 'chatbot.appspot.com' 53 | >>>>>>> fabfcc05889160208b1260cc8b26beaa4f76377a 54 | data = {'question': [query], 'answer': [answer]} 55 | df = pd.DataFrame(data) 56 | storage_client = storage.Client() 57 | bucket = storage_client.get_bucket(bucket_name) 58 | blob = storage.Blob(log_file, bucket) 59 | if blob.exists(): 60 | content = blob.download_as_text() 61 | existing_df = pd.read_csv(io.StringIO(content)) 62 | new_df = existing_df.append(df, ignore_index=True) 63 | else: 64 | new_df = df 65 | new_content = new_df.to_csv(index=False) 66 | blob.upload_from_string(new_content, content_type='text/csv') 67 | -------------------------------------------------------------------------------- /notebook/pinecone.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "id": "683953b3", 7 | "metadata": {}, 8 | "source": [ 9 | "# Creating a personal bot that lets you upload files to GC Storage and then stores the question and answer pairs in a CSV file in GC Storage.\n", 10 | "\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "aac9563e", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from langchain.embeddings.openai import OpenAIEmbeddings\n", 21 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 22 | "from langchain.vectorstores import Pinecone\n", 23 | "from 
langchain.document_loaders import GCSDirectoryLoader\n", 24 | "from langchain.chains import RetrievalQA\n", 25 | "from langchain.llms import OpenAI\n", 26 | "import pinecone\n", 27 | "import os\n", 28 | "import configparser\n", 29 | "import pandas as pd\n", 30 | "from google.cloud import storage" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "id": "4f5e0b7b", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '~/gcp/chatbot.json'" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "b306a846", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "config = configparser.ConfigParser()\n", 51 | "config.read('../config.ini')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "216df103", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "OPENAI_API_KEY = config.get('api_key', 'openai')\n", 62 | "PINECONE_API_KEY = config.get('api_key', 'pinecone')\n", 63 | "PINECONE_ENV = config.get('env', 'pinecone')\n", 64 | "PINECONE_INDEX = config.get('index', 'pinecone')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "a3c3999a", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "loader = GCSDirectoryLoader(project_name=\"chatbot\", bucket=\"chatbot.appspot.com\", prefix=\"data/input\")\n", 75 | "documents = loader.load()\n", 76 | "text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)\n", 77 | "docs = text_splitter.split_documents(documents)\n", 78 | "\n", 79 | "embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "6e104aee", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "pinecone.init(\n", 90 | " api_key=PINECONE_API_KEY,\n", 91 | " environment=PINECONE_ENV \n", 92 | ")\n", 93 | "\n", 94 | 
"index_name = PINECONE_INDEX\n", 95 | "\n", 96 | "db = Pinecone.from_documents(docs, embeddings, index_name=index_name)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "626f69ef", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "db = Pinecone.from_existing_index(index_name, embeddings)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "4f373eb2", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "llm = OpenAI(temperature=0.2, openai_api_key=OPENAI_API_KEY)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "ee0fa12c", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "qa = RetrievalQA.from_chain_type(llm=llm, chain_type=\"stuff\", retriever=db.as_retriever())" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "9d059506", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "query = \"What is the meaning of life?\"\n", 137 | "answer = qa.run(query)\n", 138 | "answer" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "a359ed74", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "def log_question_answer(query, answer):\n", 149 | " prefix = 'data/output/'\n", 150 | " log_file = f'{prefix}questions_answers.csv'\n", 151 | " bucket_name = 'chatbot.appspot.com'\n", 152 | " data = {'question': [query], 'answer': [answer]}\n", 153 | "\n", 154 | " df = pd.DataFrame(data)\n", 155 | "\n", 156 | " # Initialize Google Cloud Storage client\n", 157 | " storage_client = storage.Client()\n", 158 | " bucket = storage_client.get_bucket(bucket_name)\n", 159 | "\n", 160 | " # Check if the file exists in the bucket\n", 161 | " blob = storage.Blob(log_file, bucket)\n", 162 | " if blob.exists():\n", 163 | " # Read the existing data and append the new data\n", 164 | " content = 
blob.download_as_text()\n", 165 | " import io  # local import: the notebook's import cell does not load io\n", 166 | " existing_df = pd.read_csv(io.StringIO(content))\n", 167 | " new_df = pd.concat([existing_df, df], ignore_index=True)\n", 168 | " else:\n", 169 | " new_df = df\n", 170 | "\n", 171 | " # Upload the updated data to Google Cloud Storage\n", 172 | " new_content = new_df.to_csv(index=False)\n", 173 | " blob.upload_from_string(new_content, content_type='text/csv')\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "841229a7", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "log_question_answer(query, answer)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "55f33167", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3 (ipykernel)", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.10.6" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 5 216 | } 217 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | altair==4.2.2 4 | antlr4-python3-runtime==4.9.3 5 | anyio==3.6.2 6 | argilla==1.6.0 7 | async-timeout==4.0.2 8 | attrs==23.1.0 9 | backoff==2.2.1 10 | blinker==1.6.2 11 | cachetools==5.3.0 12 | certifi==2022.12.7 13 | cffi==1.15.1 14 | charset-normalizer==3.1.0 15 | click==8.1.3 16 | coloredlogs==15.0.1 17 | commonmark==0.9.1 18 | contourpy==1.0.7 19 | cryptography==40.0.2 20 | cycler==0.11.0 21 | dataclasses-json==0.5.7 22 | decorator==5.1.1 23 | 
Deprecated==1.2.13 24 | dnspython==2.3.0 25 | effdet==0.3.0 26 | entrypoints==0.4 27 | et-xmlfile==1.1.0 28 | fastapi==0.95.1 29 | filelock==3.12.0 30 | flatbuffers==23.3.3 31 | fonttools==4.39.3 32 | frozenlist==1.3.3 33 | fsspec==2023.4.0 34 | gitdb==4.0.10 35 | GitPython==3.1.31 36 | google-api-core==2.11.0 37 | google-auth==2.17.3 38 | google-cloud-core==2.3.2 39 | google-cloud-secret-manager==2.16.1 40 | google-cloud-storage==2.8.0 41 | google-crc32c==1.5.0 42 | google-resumable-media==2.5.0 43 | googleapis-common-protos==1.59.0 44 | grpc-google-iam-v1==0.12.6 45 | grpcio==1.54.0 46 | grpcio-status==1.48.2 47 | h11==0.14.0 48 | httpcore==0.16.3 49 | httpx==0.23.3 50 | huggingface-hub==0.14.1 51 | humanfriendly==10.0 52 | idna==3.4 53 | importlib-metadata==6.6.0 54 | iopath==0.1.10 55 | Jinja2==3.1.2 56 | joblib==1.2.0 57 | jsonschema==4.17.3 58 | kiwisolver==1.4.4 59 | langchain==0.0.153 60 | layoutparser==0.3.4 61 | loguru==0.7.0 62 | lxml==4.9.2 63 | Markdown==3.4.3 64 | markdown-it-py==2.2.0 65 | MarkupSafe==2.1.2 66 | marshmallow==3.19.0 67 | marshmallow-enum==1.5.1 68 | matplotlib==3.7.1 69 | mdurl==0.1.2 70 | monotonic==1.6 71 | mpmath==1.3.0 72 | msg-parser==1.2.0 73 | multidict==6.0.4 74 | mypy-extensions==1.0.0 75 | networkx==3.1 76 | nltk==3.8.1 77 | numexpr==2.8.4 78 | numpy==1.23.5 79 | olefile==0.46 80 | omegaconf==2.3.0 81 | onnxruntime==1.14.1 82 | openai==0.27.5 83 | openapi-schema-pydantic==1.2.4 84 | opencv-python==4.7.0.72 85 | openpyxl==3.1.2 86 | packaging==23.1 87 | pandas==1.5.3 88 | pdf2image==1.16.3 89 | pdfminer.six==20221105 90 | pdfplumber==0.9.0 91 | Pillow==9.5.0 92 | pinecone-client==2.2.1 93 | portalocker==2.7.0 94 | proto-plus==1.22.2 95 | protobuf==3.20.3 96 | pyarrow==11.0.0 97 | pyasn1==0.5.0 98 | pyasn1-modules==0.3.0 99 | pycocotools==2.0.6 100 | pycparser==2.21 101 | pydantic==1.10.7 102 | pydeck==0.8.1b0 103 | Pygments==2.15.1 104 | Pympler==1.0.1 105 | pypandoc==1.11 106 | pyparsing==3.0.9 107 | pyrsistent==0.19.3 108 | 
pytesseract==0.3.10 109 | python-dateutil==2.8.2 110 | python-docx==0.8.11 111 | python-magic==0.4.27 112 | python-multipart==0.0.6 113 | python-pptx==0.6.21 114 | pytz==2023.3 115 | pytz-deprecation-shim==0.1.0.post0 116 | PyYAML==6.0 117 | regex==2023.3.23 118 | requests==2.29.0 119 | rfc3986==1.5.0 120 | rich==13.0.1 121 | rsa==4.9 122 | scipy==1.10.1 123 | six==1.16.0 124 | smmap==5.0.0 125 | sniffio==1.3.0 126 | SQLAlchemy==2.0.11 127 | starlette==0.26.1 128 | streamlit==1.22.0 129 | sympy==1.11.1 130 | tenacity==8.2.2 131 | tiktoken==0.3.3 132 | timm==0.6.13 133 | tokenizers==0.13.3 134 | toml==0.10.2 135 | toolz==0.12.0 136 | torch==2.0.0 137 | torchvision==0.15.1 138 | tornado==6.3.1 139 | tqdm==4.65.0 140 | transformers==4.28.1 141 | typing-inspect==0.8.0 142 | typing_extensions==4.5.0 143 | tzdata==2023.3 144 | tzlocal==4.3 145 | unstructured==0.6.2 146 | unstructured-inference==0.4.2 147 | urllib3==1.26.15 148 | uvicorn==0.22.0 149 | validators==0.20.0 150 | Wand==0.6.11 151 | wrapt==1.14.1 152 | XlsxWriter==3.1.0 153 | yarl==1.9.2 154 | zipp==3.15.0 155 | --------------------------------------------------------------------------------