├── README.md ├── faiss_index ├── index.pkl └── index.faiss ├── requirements.txt ├── LICENSE ├── app.py ├── .gitignore ├── multimodal_gpt_x.py ├── templates └── index.html └── Multimodal_GPT_X.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Multimodal-RAG-using-Langchain 2 | Multimodal RAG using Langchain 3 | -------------------------------------------------------------------------------- /faiss_index/index.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIAnytime/Multimodal-RAG-using-Langchain/HEAD/faiss_index/index.pkl -------------------------------------------------------------------------------- /faiss_index/index.faiss: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AIAnytime/Multimodal-RAG-using-Langchain/HEAD/faiss_index/index.faiss -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | faiss-cpu 2 | langchain 3 | openai 4 | python-dotenv 5 | fastapi 6 | jinja2 7 | python-multipart 8 | uvicorn 9 | tiktoken -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 AI Anytime 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from langchain.chat_models import ChatOpenAI 2 | from langchain.embeddings import OpenAIEmbeddings 3 | from langchain.chains import LLMChain 4 | from langchain.prompts import PromptTemplate 5 | from langchain.schema.messages import HumanMessage, SystemMessage 6 | from langchain.schema.document import Document 7 | from langchain.vectorstores import FAISS 8 | from langchain.retrievers.multi_vector import MultiVectorRetriever 9 | import os 10 | import uuid 11 | import base64 12 | from fastapi import FastAPI, Request, Form, Response, File, UploadFile 13 | from fastapi.responses import HTMLResponse, JSONResponse 14 | from fastapi.templating import Jinja2Templates 15 | from fastapi.encoders import jsonable_encoder 16 | from fastapi.middleware.cors import CORSMiddleware 17 | import json 18 | from dotenv import load_dotenv 19 | load_dotenv() 20 | 21 | app = FastAPI() 22 | templates = Jinja2Templates(directory="templates") 23 | 24 | # Configure CORS 25 | app.add_middleware( 26 | CORSMiddleware, 27 | allow_origins=["*"], 28 | allow_credentials=True, 29 | allow_methods=["*"], 30 | allow_headers=["*"], 31 | ) 32 | 33 | openai_api_key = os.getenv("OPENAI_API_KEY") 34 | embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) 35 | 36 | db = FAISS.load_local("faiss_index", embeddings) 37 | 38 | prompt_template = """You are a vet doctor and an expert in analyzing dog's health. 39 | Answer the question based only on the following context, which can include text, images and tables: 40 | {context} 41 | Question: {question} 42 | Don't answer if you are not sure and decline to answer and say "Sorry, I don't have much information about it." 43 | Just return the helpful answer in as much as detailed possible. 44 | Answer: 45 | """ 46 | 47 | qa_chain = LLMChain(llm=ChatOpenAI(model="gpt-4", openai_api_key = openai_api_key, max_tokens=1024), 48 | prompt=PromptTemplate.from_template(prompt_template)) 49 | 50 | @app.get("/", response_class=HTMLResponse) 51 | async def index(request: Request): 52 | return templates.TemplateResponse("index.html", {"request": request}) 53 | 54 | @app.post("/get_answer") 55 | async def get_answer(question: str = Form(...)): 56 | relevant_docs = db.similarity_search(question) 57 | context = "" 58 | relevant_images = [] 59 | for d in relevant_docs: 60 | if d.metadata['type'] == 'text': 61 | context += '[text]' + d.metadata['original_content'] 62 | elif d.metadata['type'] == 'table': 63 | context += '[table]' + d.metadata['original_content'] 64 | elif d.metadata['type'] == 'image': 65 | context += '[image]' + d.page_content 66 | relevant_images.append(d.metadata['original_content']) 67 | result = qa_chain.run({'context': context, 'question': question}) 68 | return JSONResponse({"relevant_images": relevant_images[0], "result": result}) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /multimodal_gpt_x.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Multimodal GPT-X.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | Original file is located at 7 | https://colab.research.google.com/drive/146odvT3BhpeChmoMzKJL8IsKQJjk_mIR 8 | """ 9 | 10 | !sudo apt install tesseract-ocr -y 11 | !sudo apt install libtesseract-dev -y 12 | !sudo apt-get install poppler-utils -y 13 | 14 | !pip install langchain unstructured[all-docs] pydantic lxml openai chromadb tiktoken opencv-python 15 | 16 | import os 17 | import uuid 18 | import base64 19 | from IPython import display 20 | from unstructured.partition.pdf import partition_pdf 21 | from langchain.chat_models import ChatOpenAI 22 | from langchain.embeddings import OpenAIEmbeddings 23 | from langchain.chains import LLMChain 24 | from langchain.prompts import PromptTemplate 25 | from langchain.schema.messages import HumanMessage, SystemMessage 26 | from langchain.schema.document import Document 27 | from langchain.vectorstores import FAISS 28 | from langchain.retrievers.multi_vector import MultiVectorRetriever 29 | 30 | from google.colab import userdata 31 | openai_api_key = userdata.get('OPENAI_API_KEY') 32 | 33 | output_path = "./images" 34 | 35 | # Get elements 36 | raw_pdf_elements = partition_pdf( 37 | filename="/content/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf", 38 | extract_images_in_pdf=True, 39 | infer_table_structure=True, 40 | chunking_strategy="by_title", 41 | max_characters=4000, 42 | new_after_n_chars=3800, 43 | combine_text_under_n_chars=2000, 44 | extract_image_block_output_dir=output_path, 45 | ) 46 | 47 | # Get text summaries and table summaries 48 | text_elements = [] 49 | table_elements = [] 50 | 51 | text_summaries = [] 52 | table_summaries = [] 53 | 54 | summary_prompt = """ 55 | Summarize the following {element_type}: 56 | {element} 57 | """ 58 | summary_chain = LLMChain( 59 | llm=ChatOpenAI(model="gpt-3.5-turbo", openai_api_key = openai_api_key, max_tokens=1024), 60 | prompt=PromptTemplate.from_template(summary_prompt) 61 | ) 62 | 63 | for e in raw_pdf_elements: 64 | if 'CompositeElement' in repr(e): 65 | text_elements.append(e.text) 66 | summary = summary_chain.run({'element_type': 'text', 'element': e}) 67 | text_summaries.append(summary) 68 | 69 | elif 'Table' in repr(e): 70 | table_elements.append(e.text) 71 | summary = summary_chain.run({'element_type': 'table', 'element': e}) 72 | table_summaries.append(summary) 73 | 74 | # Get image summaries 75 | image_elements = [] 76 | image_summaries = [] 77 | 78 | def encode_image(image_path): 79 | with open(image_path, "rb") as f: 80 | return base64.b64encode(f.read()).decode('utf-8') 81 | 82 | def summarize_image(encoded_image): 83 | prompt = [ 84 | SystemMessage(content="You are a bot that is good at analyzing images related to Dog's health."), 85 | HumanMessage(content=[ 86 | { 87 | "type": "text", 88 | "text": "Describe the contents of this image." 89 | }, 90 | { 91 | "type": "image_url", 92 | "image_url": { 93 | "url": f"data:image/jpeg;base64,{encoded_image}" 94 | }, 95 | }, 96 | ]) 97 | ] 98 | response = ChatOpenAI(model="gpt-4-vision-preview", openai_api_key=openai_api_key, max_tokens=1024).invoke(prompt) 99 | return response.content 100 | 101 | for i in os.listdir(output_path): 102 | if i.endswith(('.png', '.jpg', '.jpeg')): 103 | image_path = os.path.join(output_path, i) 104 | encoded_image = encode_image(image_path) 105 | image_elements.append(encoded_image) 106 | summary = summarize_image(encoded_image) 107 | image_summaries.append(summary) 108 | 109 | !pip install faiss-cpu 110 | 111 | # Create Documents and Vectorstore 112 | documents = [] 113 | retrieve_contents = [] 114 | 115 | for e, s in zip(text_elements, text_summaries): 116 | i = str(uuid.uuid4()) 117 | doc = Document( 118 | page_content = s, 119 | metadata = { 120 | 'id': i, 121 | 'type': 'text', 122 | 'original_content': e 123 | } 124 | ) 125 | retrieve_contents.append((i, e)) 126 | documents.append(doc) 127 | 128 | for e, s in zip(table_elements, table_summaries): 129 | doc = Document( 130 | page_content = s, 131 | metadata = { 132 | 'id': i, 133 | 'type': 'table', 134 | 'original_content': e 135 | } 136 | ) 137 | retrieve_contents.append((i, e)) 138 | documents.append(doc) 139 | 140 | for e, s in zip(image_elements, image_summaries): 141 | doc = Document( 142 | page_content = s, 143 | metadata = { 144 | 'id': i, 145 | 'type': 'image', 146 | 'original_content': e 147 | } 148 | ) 149 | retrieve_contents.append((i, s)) 150 | documents.append(doc) 151 | 152 | vectorstore = FAISS.from_documents(documents=documents, embedding=OpenAIEmbeddings(openai_api_key=openai_api_key)) 153 | 154 | vectorstore.save_local("faiss_index") 155 | 156 | prompt_template = """ 157 | You are a vet doctor and an expert in analyzing dog's health. 158 | Answer the question based only on the following context, which can include text, images and tables: 159 | {context} 160 | Question: {question} 161 | Don't answer if you are not sure and decline to answer and say "Sorry, I don't have much information about it." 162 | Just return the helpful answer in as much as detailed possible. 163 | Answer: 164 | """ 165 | 166 | qa_chain = LLMChain(llm=ChatOpenAI(model="gpt-4", openai_api_key = openai_api_key, max_tokens=1024), 167 | prompt=PromptTemplate.from_template(prompt_template)) 168 | 169 | def answer(question): 170 | relevant_docs = vectorstore.similarity_search(question) 171 | context = "" 172 | relevant_images = [] 173 | for d in relevant_docs: 174 | if d.metadata['type'] == 'text': 175 | context += '[text]' + d.metadata['original_content'] 176 | elif d.metadata['type'] == 'table': 177 | context += '[table]' + d.metadata['original_content'] 178 | elif d.metadata['type'] == 'image': 179 | context += '[image]' + d.page_content 180 | relevant_images.append(d.metadata['original_content']) 181 | result = qa_chain.run({'context': context, 'question': question}) 182 | return result, relevant_images 183 | 184 | result, relevant_images = answer("What is Gingivitis?") 185 | print(result) 186 | 187 | relevant_images[0] 188 | 189 | display.display(display.Image(base64.b64decode(relevant_images[0]))) 190 | 191 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Multimodal RAG App 4 | 5 | 6 | 76 | 77 | 78 |
79 |
80 |
81 |
Multimodal RAG App
82 | 83 |
84 | 85 |
86 |
87 |
88 | Loading... 89 |
90 |
91 |
92 | 93 |
94 |
95 |
96 | 97 |
98 |
99 | 100 |
101 | 102 | 103 | 104 | 164 | 165 | -------------------------------------------------------------------------------- /Multimodal_GPT_X.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | }, 15 | "widgets": { 16 | "application/vnd.jupyter.widget-state+json": { 17 | "1add53568adc46d2b5db1915d8c2213d": { 18 | "model_module": "@jupyter-widgets/controls", 19 | "model_name": "HBoxModel", 20 | "model_module_version": "1.5.0", 21 | "state": { 22 | "_dom_classes": [], 23 | "_model_module": "@jupyter-widgets/controls", 24 | "_model_module_version": "1.5.0", 25 | "_model_name": "HBoxModel", 26 | "_view_count": null, 27 | "_view_module": "@jupyter-widgets/controls", 28 | "_view_module_version": "1.5.0", 29 | "_view_name": "HBoxView", 30 | "box_style": "", 31 | "children": [ 32 | "IPY_MODEL_5094d25d081541fb8517b76d7e2c0981", 33 | "IPY_MODEL_a7779fc06b814d19828a1db85b6023f1", 34 | "IPY_MODEL_f8d0262a97c34430be1018daf58d5865" 35 | ], 36 | "layout": "IPY_MODEL_7dd112470621458ab600fdd9f3806942" 37 | } 38 | }, 39 | "5094d25d081541fb8517b76d7e2c0981": { 40 | "model_module": "@jupyter-widgets/controls", 41 | "model_name": "HTMLModel", 42 | "model_module_version": "1.5.0", 43 | "state": { 44 | "_dom_classes": [], 45 | "_model_module": "@jupyter-widgets/controls", 46 | "_model_module_version": "1.5.0", 47 | "_model_name": "HTMLModel", 48 | "_view_count": null, 49 | "_view_module": "@jupyter-widgets/controls", 50 | "_view_module_version": "1.5.0", 51 | "_view_name": "HTMLView", 52 | "description": "", 53 | "description_tooltip": null, 54 | "layout": "IPY_MODEL_6f9cfbd6b6e043a29f07abb63fcab102", 55 | "placeholder": "​", 56 | "style": "IPY_MODEL_fed331066467473f9e15608afe5909c6", 57 | "value": "config.json: 100%" 58 | } 59 | }, 60 | "a7779fc06b814d19828a1db85b6023f1": { 61 | "model_module": "@jupyter-widgets/controls", 62 | "model_name": "FloatProgressModel", 63 | "model_module_version": "1.5.0", 64 | "state": { 65 | "_dom_classes": [], 66 | "_model_module": "@jupyter-widgets/controls", 67 | "_model_module_version": "1.5.0", 68 | "_model_name": "FloatProgressModel", 69 | "_view_count": null, 70 | "_view_module": "@jupyter-widgets/controls", 71 | "_view_module_version": "1.5.0", 72 | "_view_name": "ProgressView", 73 | "bar_style": "success", 74 | "description": "", 75 | "description_tooltip": null, 76 | "layout": "IPY_MODEL_9ad99d5e37034c5698b34e9781616112", 77 | "max": 1469, 78 | "min": 0, 79 | "orientation": "horizontal", 80 | "style": "IPY_MODEL_adb253cbb29444dba8553d37b324fdea", 81 | "value": 1469 82 | } 83 | }, 84 | "f8d0262a97c34430be1018daf58d5865": { 85 | "model_module": "@jupyter-widgets/controls", 86 | "model_name": "HTMLModel", 87 | "model_module_version": "1.5.0", 88 | "state": { 89 | "_dom_classes": [], 90 | "_model_module": "@jupyter-widgets/controls", 91 | "_model_module_version": "1.5.0", 92 | "_model_name": "HTMLModel", 93 | "_view_count": null, 94 | "_view_module": "@jupyter-widgets/controls", 95 | "_view_module_version": "1.5.0", 96 | "_view_name": "HTMLView", 97 | "description": "", 98 | "description_tooltip": null, 99 | "layout": "IPY_MODEL_090c6f296b01428cbc94af75f188b27c", 100 | "placeholder": "​", 101 | "style": "IPY_MODEL_dcf1df1528f547729376f60a6486cae6", 102 | "value": " 1.47k/1.47k [00:00<00:00, 53.8kB/s]" 103 | } 104 | }, 105 | "7dd112470621458ab600fdd9f3806942": { 106 | "model_module": "@jupyter-widgets/base", 107 | "model_name": "LayoutModel", 108 | "model_module_version": "1.2.0", 109 | "state": { 110 | "_model_module": "@jupyter-widgets/base", 111 | "_model_module_version": "1.2.0", 112 | "_model_name": "LayoutModel", 113 | "_view_count": null, 114 | "_view_module": "@jupyter-widgets/base", 115 | "_view_module_version": "1.2.0", 116 | "_view_name": "LayoutView", 117 | "align_content": null, 118 | "align_items": null, 119 | "align_self": null, 120 | "border": null, 121 | "bottom": null, 122 | "display": null, 123 | "flex": null, 124 | "flex_flow": null, 125 | "grid_area": null, 126 | "grid_auto_columns": null, 127 | "grid_auto_flow": null, 128 | "grid_auto_rows": null, 129 | "grid_column": null, 130 | "grid_gap": null, 131 | "grid_row": null, 132 | "grid_template_areas": null, 133 | "grid_template_columns": null, 134 | "grid_template_rows": null, 135 | "height": null, 136 | "justify_content": null, 137 | "justify_items": null, 138 | "left": null, 139 | "margin": null, 140 | "max_height": null, 141 | "max_width": null, 142 | "min_height": null, 143 | "min_width": null, 144 | "object_fit": null, 145 | "object_position": null, 146 | "order": null, 147 | "overflow": null, 148 | "overflow_x": null, 149 | "overflow_y": null, 150 | "padding": null, 151 | "right": null, 152 | "top": null, 153 | "visibility": null, 154 | "width": null 155 | } 156 | }, 157 | "6f9cfbd6b6e043a29f07abb63fcab102": { 158 | "model_module": "@jupyter-widgets/base", 159 | "model_name": "LayoutModel", 160 | "model_module_version": "1.2.0", 161 | "state": { 162 | "_model_module": "@jupyter-widgets/base", 163 | "_model_module_version": "1.2.0", 164 | "_model_name": "LayoutModel", 165 | "_view_count": null, 166 | "_view_module": "@jupyter-widgets/base", 167 | "_view_module_version": "1.2.0", 168 | "_view_name": "LayoutView", 169 | "align_content": null, 170 | "align_items": null, 171 | "align_self": null, 172 | "border": null, 173 | "bottom": null, 174 | "display": null, 175 | "flex": null, 176 | "flex_flow": null, 177 | "grid_area": null, 178 | "grid_auto_columns": null, 179 | "grid_auto_flow": null, 180 | "grid_auto_rows": null, 181 | "grid_column": null, 182 | "grid_gap": null, 183 | "grid_row": null, 184 | "grid_template_areas": null, 185 | "grid_template_columns": null, 186 | "grid_template_rows": null, 187 | "height": null, 188 | "justify_content": null, 189 | "justify_items": null, 190 | "left": null, 191 | "margin": null, 192 | "max_height": null, 193 | "max_width": null, 194 | "min_height": null, 195 | "min_width": null, 196 | "object_fit": null, 197 | "object_position": null, 198 | "order": null, 199 | "overflow": null, 200 | "overflow_x": null, 201 | "overflow_y": null, 202 | "padding": null, 203 | "right": null, 204 | "top": null, 205 | "visibility": null, 206 | "width": null 207 | } 208 | }, 209 | "fed331066467473f9e15608afe5909c6": { 210 | "model_module": "@jupyter-widgets/controls", 211 | "model_name": "DescriptionStyleModel", 212 | "model_module_version": "1.5.0", 213 | "state": { 214 | "_model_module": "@jupyter-widgets/controls", 215 | "_model_module_version": "1.5.0", 216 | "_model_name": "DescriptionStyleModel", 217 | "_view_count": null, 218 | "_view_module": "@jupyter-widgets/base", 219 | "_view_module_version": "1.2.0", 220 | "_view_name": "StyleView", 221 | "description_width": "" 222 | } 223 | }, 224 | "9ad99d5e37034c5698b34e9781616112": { 225 | "model_module": "@jupyter-widgets/base", 226 | "model_name": "LayoutModel", 227 | "model_module_version": "1.2.0", 228 | "state": { 229 | "_model_module": "@jupyter-widgets/base", 230 | "_model_module_version": "1.2.0", 231 | "_model_name": "LayoutModel", 232 | "_view_count": null, 233 | "_view_module": "@jupyter-widgets/base", 234 | "_view_module_version": "1.2.0", 235 | "_view_name": "LayoutView", 236 | "align_content": null, 237 | "align_items": null, 238 | "align_self": null, 239 | "border": null, 240 | "bottom": null, 241 | "display": null, 242 | "flex": null, 243 | "flex_flow": null, 244 | "grid_area": null, 245 | "grid_auto_columns": null, 246 | "grid_auto_flow": null, 247 | "grid_auto_rows": null, 248 | "grid_column": null, 249 | "grid_gap": null, 250 | "grid_row": null, 251 | "grid_template_areas": null, 252 | "grid_template_columns": null, 253 | "grid_template_rows": null, 254 | "height": null, 255 | "justify_content": null, 256 | "justify_items": null, 257 | "left": null, 258 | "margin": null, 259 | "max_height": null, 260 | "max_width": null, 261 | "min_height": null, 262 | "min_width": null, 263 | "object_fit": null, 264 | "object_position": null, 265 | "order": null, 266 | "overflow": null, 267 | "overflow_x": null, 268 | "overflow_y": null, 269 | "padding": null, 270 | "right": null, 271 | "top": null, 272 | "visibility": null, 273 | "width": null 274 | } 275 | }, 276 | "adb253cbb29444dba8553d37b324fdea": { 277 | "model_module": "@jupyter-widgets/controls", 278 | "model_name": "ProgressStyleModel", 279 | "model_module_version": "1.5.0", 280 | "state": { 281 | "_model_module": "@jupyter-widgets/controls", 282 | "_model_module_version": "1.5.0", 283 | "_model_name": "ProgressStyleModel", 284 | "_view_count": null, 285 | "_view_module": "@jupyter-widgets/base", 286 | "_view_module_version": "1.2.0", 287 | "_view_name": "StyleView", 288 | "bar_color": null, 289 | "description_width": "" 290 | } 291 | }, 292 | "090c6f296b01428cbc94af75f188b27c": { 293 | "model_module": "@jupyter-widgets/base", 294 | "model_name": "LayoutModel", 295 | "model_module_version": "1.2.0", 296 | "state": { 297 | "_model_module": "@jupyter-widgets/base", 298 | "_model_module_version": "1.2.0", 299 | "_model_name": "LayoutModel", 300 | "_view_count": null, 301 | "_view_module": "@jupyter-widgets/base", 302 | "_view_module_version": "1.2.0", 303 | "_view_name": "LayoutView", 304 | "align_content": null, 305 | "align_items": null, 306 | "align_self": null, 307 | "border": null, 308 | "bottom": null, 309 | "display": null, 310 | "flex": null, 311 | "flex_flow": null, 312 | "grid_area": null, 313 | "grid_auto_columns": null, 314 | "grid_auto_flow": null, 315 | "grid_auto_rows": null, 316 | "grid_column": null, 317 | "grid_gap": null, 318 | "grid_row": null, 319 | "grid_template_areas": null, 320 | "grid_template_columns": null, 321 | "grid_template_rows": null, 322 | "height": null, 323 | "justify_content": null, 324 | "justify_items": null, 325 | "left": null, 326 | "margin": null, 327 | "max_height": null, 328 | "max_width": null, 329 | "min_height": null, 330 | "min_width": null, 331 | "object_fit": null, 332 | "object_position": null, 333 | "order": null, 334 | "overflow": null, 335 | "overflow_x": null, 336 | "overflow_y": null, 337 | "padding": null, 338 | "right": null, 339 | "top": null, 340 | "visibility": null, 341 | "width": null 342 | } 343 | }, 344 | "dcf1df1528f547729376f60a6486cae6": { 345 | "model_module": "@jupyter-widgets/controls", 346 | "model_name": "DescriptionStyleModel", 347 | "model_module_version": "1.5.0", 348 | "state": { 349 | "_model_module": "@jupyter-widgets/controls", 350 | "_model_module_version": "1.5.0", 351 | "_model_name": "DescriptionStyleModel", 352 | "_view_count": null, 353 | "_view_module": "@jupyter-widgets/base", 354 | "_view_module_version": "1.2.0", 355 | "_view_name": "StyleView", 356 | "description_width": "" 357 | } 358 | }, 359 | "0da6cfb5f1214e478f996d1657f11977": { 360 | "model_module": "@jupyter-widgets/controls", 361 | "model_name": "HBoxModel", 362 | "model_module_version": "1.5.0", 363 | "state": { 364 | "_dom_classes": [], 365 | "_model_module": "@jupyter-widgets/controls", 366 | "_model_module_version": "1.5.0", 367 | "_model_name": "HBoxModel", 368 | "_view_count": null, 369 | "_view_module": "@jupyter-widgets/controls", 370 | "_view_module_version": "1.5.0", 371 | "_view_name": "HBoxView", 372 | "box_style": "", 373 | "children": [ 374 | "IPY_MODEL_5dd3cd164480493f8c24778b8c534aff", 375 | "IPY_MODEL_d935cf926c0047b68baed7ad340b6524", 376 | "IPY_MODEL_8ca1716d945146fda75297f4b4981145" 377 | ], 378 | "layout": "IPY_MODEL_fec79a20e11f47f1a2e14a5c2461c6fc" 379 | } 380 | }, 381 | "5dd3cd164480493f8c24778b8c534aff": { 382 | "model_module": "@jupyter-widgets/controls", 383 | "model_name": "HTMLModel", 384 | "model_module_version": "1.5.0", 385 | "state": { 386 | "_dom_classes": [], 387 | "_model_module": "@jupyter-widgets/controls", 388 | "_model_module_version": "1.5.0", 389 | "_model_name": "HTMLModel", 390 | "_view_count": null, 391 | "_view_module": "@jupyter-widgets/controls", 392 | "_view_module_version": "1.5.0", 393 | "_view_name": "HTMLView", 394 | "description": "", 395 | "description_tooltip": null, 396 | "layout": "IPY_MODEL_1563d2660f85468a82dae95c55783fa4", 397 | "placeholder": "​", 398 | "style": "IPY_MODEL_dca4cc9b977b4fff94d76449623b9dda", 399 | "value": "model.safetensors: 100%" 400 | } 401 | }, 402 | "d935cf926c0047b68baed7ad340b6524": { 403 | "model_module": "@jupyter-widgets/controls", 404 | "model_name": "FloatProgressModel", 405 | "model_module_version": "1.5.0", 406 | "state": { 407 | "_dom_classes": [], 408 | "_model_module": "@jupyter-widgets/controls", 409 | "_model_module_version": "1.5.0", 410 | "_model_name": "FloatProgressModel", 411 | "_view_count": null, 412 | "_view_module": "@jupyter-widgets/controls", 413 | "_view_module_version": "1.5.0", 414 | "_view_name": "ProgressView", 415 | "bar_style": "success", 416 | "description": "", 417 | "description_tooltip": null, 418 | "layout": "IPY_MODEL_a27dd7076eed4b5aa88291caec14e162", 419 | "max": 115434268, 420 | "min": 0, 421 | "orientation": "horizontal", 422 | "style": "IPY_MODEL_e016e45e12b64c8a8ff1e90cec8ae9fc", 423 | "value": 115434268 424 | } 425 | }, 426 | "8ca1716d945146fda75297f4b4981145": { 427 | "model_module": "@jupyter-widgets/controls", 428 | "model_name": "HTMLModel", 429 | "model_module_version": "1.5.0", 430 | "state": { 431 | "_dom_classes": [], 432 | "_model_module": "@jupyter-widgets/controls", 433 | "_model_module_version": "1.5.0", 434 | "_model_name": "HTMLModel", 435 | "_view_count": null, 436 | "_view_module": "@jupyter-widgets/controls", 437 | "_view_module_version": "1.5.0", 438 | "_view_name": "HTMLView", 439 | "description": "", 440 | "description_tooltip": null, 441 | "layout": "IPY_MODEL_5be15fdcb04e42b09113a68b2eaa21e8", 442 | "placeholder": "​", 443 | "style": "IPY_MODEL_3b45d3eb28c449f89403fcb7bc3d84c3", 444 | "value": " 115M/115M [00:00<00:00, 140MB/s]" 445 | } 446 | }, 447 | "fec79a20e11f47f1a2e14a5c2461c6fc": { 448 | "model_module": "@jupyter-widgets/base", 449 | "model_name": "LayoutModel", 450 | "model_module_version": "1.2.0", 451 | "state": { 452 | "_model_module": "@jupyter-widgets/base", 453 | "_model_module_version": "1.2.0", 454 | "_model_name": "LayoutModel", 455 | "_view_count": null, 456 | "_view_module": "@jupyter-widgets/base", 457 | "_view_module_version": "1.2.0", 458 | "_view_name": "LayoutView", 459 | "align_content": null, 460 | "align_items": null, 461 | "align_self": null, 462 | "border": null, 463 | "bottom": null, 464 | "display": null, 465 | "flex": null, 466 | "flex_flow": null, 467 | "grid_area": null, 468 | "grid_auto_columns": null, 469 | "grid_auto_flow": null, 470 | "grid_auto_rows": null, 471 | "grid_column": null, 472 | "grid_gap": null, 473 | "grid_row": null, 474 | "grid_template_areas": null, 475 | "grid_template_columns": null, 476 | "grid_template_rows": null, 477 | "height": null, 478 | "justify_content": null, 479 | "justify_items": null, 480 | "left": null, 481 | "margin": null, 482 | "max_height": null, 483 | "max_width": null, 484 | "min_height": null, 485 | "min_width": null, 486 | "object_fit": null, 487 | "object_position": null, 488 | "order": null, 489 | "overflow": null, 490 | "overflow_x": null, 491 | "overflow_y": null, 492 | "padding": null, 493 | "right": null, 494 | "top": null, 495 | "visibility": null, 496 | "width": null 497 | } 498 | }, 499 | "1563d2660f85468a82dae95c55783fa4": { 500 | "model_module": "@jupyter-widgets/base", 501 | "model_name": "LayoutModel", 502 | "model_module_version": "1.2.0", 503 | "state": { 504 | "_model_module": "@jupyter-widgets/base", 505 | "_model_module_version": "1.2.0", 506 | "_model_name": "LayoutModel", 507 | "_view_count": null, 508 | "_view_module": "@jupyter-widgets/base", 509 | "_view_module_version": "1.2.0", 510 | "_view_name": "LayoutView", 511 | "align_content": null, 512 | "align_items": null, 513 | "align_self": null, 514 | "border": null, 515 | "bottom": null, 516 | "display": null, 517 | "flex": null, 518 | "flex_flow": null, 519 | "grid_area": null, 520 | "grid_auto_columns": null, 521 | "grid_auto_flow": null, 522 | "grid_auto_rows": null, 523 | "grid_column": null, 524 | "grid_gap": null, 525 | "grid_row": null, 526 | "grid_template_areas": null, 527 | "grid_template_columns": null, 528 | "grid_template_rows": null, 529 | "height": null, 530 | "justify_content": null, 531 | "justify_items": null, 532 | "left": null, 533 | "margin": null, 534 | "max_height": null, 535 | "max_width": null, 536 | "min_height": null, 537 | "min_width": null, 538 | "object_fit": null, 539 | "object_position": null, 540 | "order": null, 541 | "overflow": null, 542 | "overflow_x": null, 543 | "overflow_y": null, 544 | "padding": null, 545 | "right": null, 546 | "top": null, 547 | "visibility": null, 548 | "width": null 549 | } 550 | }, 551 | "dca4cc9b977b4fff94d76449623b9dda": { 552 | "model_module": "@jupyter-widgets/controls", 553 | "model_name": "DescriptionStyleModel", 554 | "model_module_version": "1.5.0", 555 | "state": { 556 | "_model_module": "@jupyter-widgets/controls", 557 | "_model_module_version": "1.5.0", 558 | "_model_name": "DescriptionStyleModel", 559 | "_view_count": null, 560 | "_view_module": "@jupyter-widgets/base", 561 | "_view_module_version": "1.2.0", 562 | "_view_name": "StyleView", 563 | "description_width": "" 564 | } 565 | }, 566 | "a27dd7076eed4b5aa88291caec14e162": { 567 | "model_module": "@jupyter-widgets/base", 568 | "model_name": "LayoutModel", 569 | "model_module_version": "1.2.0", 570 | "state": { 571 | "_model_module": "@jupyter-widgets/base", 572 | "_model_module_version": "1.2.0", 573 | "_model_name": "LayoutModel", 574 | "_view_count": null, 575 | "_view_module": "@jupyter-widgets/base", 576 | "_view_module_version": "1.2.0", 577 | "_view_name": "LayoutView", 578 | "align_content": null, 579 | "align_items": null, 580 | "align_self": null, 581 | "border": null, 582 | "bottom": null, 583 | "display": null, 584 | "flex": null, 585 | "flex_flow": null, 586 | "grid_area": null, 587 | "grid_auto_columns": null, 588 | "grid_auto_flow": null, 589 | "grid_auto_rows": null, 590 | "grid_column": null, 591 | "grid_gap": null, 592 | "grid_row": null, 593 | "grid_template_areas": null, 594 | "grid_template_columns": null, 595 | "grid_template_rows": null, 596 | "height": null, 597 | "justify_content": null, 598 | "justify_items": null, 599 | "left": null, 600 | "margin": null, 601 | "max_height": null, 602 | "max_width": null, 603 | "min_height": null, 604 | "min_width": null, 605 | "object_fit": null, 606 | "object_position": null, 607 | "order": null, 608 | "overflow": null, 609 | "overflow_x": null, 610 | "overflow_y": null, 611 | "padding": null, 612 | "right": null, 613 | "top": null, 614 | "visibility": null, 615 | "width": null 616 | } 617 | }, 618 | "e016e45e12b64c8a8ff1e90cec8ae9fc": { 619 | "model_module": "@jupyter-widgets/controls", 620 | "model_name": "ProgressStyleModel", 621 | "model_module_version": "1.5.0", 622 | "state": { 623 | "_model_module": "@jupyter-widgets/controls", 624 | "_model_module_version": "1.5.0", 625 | "_model_name": "ProgressStyleModel", 626 | "_view_count": null, 627 | "_view_module": "@jupyter-widgets/base", 628 | "_view_module_version": "1.2.0", 629 | "_view_name": "StyleView", 630 | "bar_color": null, 631 | "description_width": "" 632 | } 633 | }, 634 | "5be15fdcb04e42b09113a68b2eaa21e8": { 635 | "model_module": "@jupyter-widgets/base", 636 | "model_name": "LayoutModel", 637 | "model_module_version": "1.2.0", 638 | "state": { 639 | "_model_module": "@jupyter-widgets/base", 640 | "_model_module_version": "1.2.0", 641 | "_model_name": "LayoutModel", 642 | "_view_count": null, 643 | "_view_module": "@jupyter-widgets/base", 644 | "_view_module_version": "1.2.0", 645 | "_view_name": "LayoutView", 646 | "align_content": null, 647 | "align_items": null, 648 | "align_self": null, 649 | "border": null, 650 | "bottom": null, 651 | "display": null, 652 | "flex": null, 653 | "flex_flow": null, 654 | "grid_area": null, 655 | "grid_auto_columns": null, 656 | "grid_auto_flow": null, 657 | "grid_auto_rows": null, 658 | "grid_column": null, 659 | "grid_gap": null, 660 | "grid_row": null, 661 | "grid_template_areas": null, 662 | "grid_template_columns": null, 663 | "grid_template_rows": null, 664 | "height": null, 665 | "justify_content": null, 666 | "justify_items": null, 667 | "left": null, 668 | "margin": null, 669 | "max_height": null, 670 | "max_width": null, 671 | "min_height": null, 672 | "min_width": null, 673 | "object_fit": null, 674 | "object_position": null, 675 | "order": null, 676 | "overflow": null, 677 | "overflow_x": null, 678 | "overflow_y": null, 679 | "padding": null, 680 | "right": null, 681 | "top": null, 682 | "visibility": null, 683 | "width": null 684 | } 685 | }, 686 | "3b45d3eb28c449f89403fcb7bc3d84c3": { 687 | "model_module": "@jupyter-widgets/controls", 688 | "model_name": "DescriptionStyleModel", 689 | "model_module_version": "1.5.0", 690 | "state": { 691 | "_model_module": "@jupyter-widgets/controls", 692 | "_model_module_version": "1.5.0", 693 | "_model_name": "DescriptionStyleModel", 694 | "_view_count": null, 695 | "_view_module": "@jupyter-widgets/base", 696 | "_view_module_version": "1.2.0", 697 | "_view_name": "StyleView", 698 | "description_width": "" 699 | } 700 | }, 701 | "794b59f933a749e88c6ac73694f0d36a": { 702 | "model_module": "@jupyter-widgets/controls", 703 | "model_name": "HBoxModel", 704 | "model_module_version": "1.5.0", 705 | "state": { 706 | "_dom_classes": [], 707 | "_model_module": "@jupyter-widgets/controls", 708 | "_model_module_version": "1.5.0", 709 | "_model_name": "HBoxModel", 710 | "_view_count": null, 711 | "_view_module": "@jupyter-widgets/controls", 712 | "_view_module_version": "1.5.0", 713 | "_view_name": "HBoxView", 714 | "box_style": "", 715 | "children": [ 716 | "IPY_MODEL_c20d2db8870548979209ef73e6e125c3", 717 | "IPY_MODEL_fcc5d303f2d643bba765e27287212067", 718 | "IPY_MODEL_6f2a5ab0f9b44c3ebf2880ddf61317b9" 719 | ], 720 | "layout": "IPY_MODEL_dc6cecb88e1d4dadbf247140fb278dc7" 721 | } 722 | }, 723 | "c20d2db8870548979209ef73e6e125c3": { 724 | "model_module": "@jupyter-widgets/controls", 725 | "model_name": "HTMLModel", 726 | "model_module_version": "1.5.0", 727 | "state": { 728 | "_dom_classes": [], 729 | "_model_module": "@jupyter-widgets/controls", 730 | "_model_module_version": "1.5.0", 731 | "_model_name": "HTMLModel", 732 | "_view_count": null, 733 | "_view_module": "@jupyter-widgets/controls", 734 | "_view_module_version": "1.5.0", 735 | "_view_name": "HTMLView", 736 | "description": "", 737 | "description_tooltip": null, 738 | "layout": "IPY_MODEL_4c0bf2156912436286f313c1c35418ed", 739 | "placeholder": "​", 740 | "style": "IPY_MODEL_9c3e160fb1464a29b2f6673952959219", 741 | "value": "model.safetensors: 100%" 742 | } 743 | }, 744 | "fcc5d303f2d643bba765e27287212067": { 745 | "model_module": "@jupyter-widgets/controls", 746 | "model_name": "FloatProgressModel", 747 | "model_module_version": "1.5.0", 748 | "state": { 749 | "_dom_classes": [], 750 | "_model_module": "@jupyter-widgets/controls", 751 | "_model_module_version": "1.5.0", 752 | "_model_name": "FloatProgressModel", 753 | "_view_count": null, 754 | "_view_module": "@jupyter-widgets/controls", 755 | "_view_module_version": "1.5.0", 756 | "_view_name": "ProgressView", 757 | "bar_style": "success", 758 | "description": "", 759 | "description_tooltip": null, 760 | "layout": "IPY_MODEL_61d9630501324251b8e91e60732bb56f", 761 | "max": 46807446, 762 | "min": 0, 763 | "orientation": "horizontal", 764 | "style": "IPY_MODEL_0d83444850874c0ca8bc6ff02b6dc8a3", 765 | "value": 46807446 766 | } 767 | }, 768 | "6f2a5ab0f9b44c3ebf2880ddf61317b9": { 769 | "model_module": "@jupyter-widgets/controls", 770 | "model_name": "HTMLModel", 771 | "model_module_version": "1.5.0", 772 | "state": { 773 | "_dom_classes": [], 774 | "_model_module": "@jupyter-widgets/controls", 775 | "_model_module_version": "1.5.0", 776 | "_model_name": "HTMLModel", 777 | "_view_count": null, 778 | "_view_module": "@jupyter-widgets/controls", 779 | "_view_module_version": "1.5.0", 780 | "_view_name": "HTMLView", 781 | "description": "", 782 | "description_tooltip": null, 783 | "layout": "IPY_MODEL_e5b7873fc72140a68a2231175564a046", 784 | "placeholder": "​", 785 | "style": "IPY_MODEL_3a56855bd12f4c6992f7e51e64e5f246", 786 | "value": " 46.8M/46.8M [00:00<00:00, 135MB/s]" 787 | } 788 | }, 789 | "dc6cecb88e1d4dadbf247140fb278dc7": { 790 | "model_module": "@jupyter-widgets/base", 791 | "model_name": "LayoutModel", 792 | "model_module_version": "1.2.0", 793 | "state": { 794 | "_model_module": "@jupyter-widgets/base", 795 | "_model_module_version": "1.2.0", 796 | "_model_name": "LayoutModel", 797 | "_view_count": null, 798 | "_view_module": "@jupyter-widgets/base", 799 | "_view_module_version": "1.2.0", 800 | "_view_name": "LayoutView", 801 | "align_content": null, 802 | "align_items": null, 803 | "align_self": null, 804 | "border": null, 805 | "bottom": null, 806 | "display": null, 807 | "flex": null, 808 | "flex_flow": null, 809 | "grid_area": null, 810 | "grid_auto_columns": null, 811 | "grid_auto_flow": null, 812 | "grid_auto_rows": null, 813 | "grid_column": null, 814 | "grid_gap": null, 815 | "grid_row": null, 816 | "grid_template_areas": null, 817 | "grid_template_columns": null, 818 | "grid_template_rows": null, 819 | "height": null, 820 | "justify_content": null, 821 | "justify_items": null, 822 | "left": null, 823 | "margin": null, 824 | "max_height": null, 825 | "max_width": null, 826 | "min_height": null, 827 | "min_width": null, 828 | "object_fit": null, 829 | "object_position": null, 830 | "order": null, 831 | "overflow": null, 832 | "overflow_x": null, 833 | "overflow_y": null, 834 | "padding": null, 835 | "right": null, 836 | "top": null, 837 | "visibility": null, 838 | "width": null 839 | } 840 | }, 841 | "4c0bf2156912436286f313c1c35418ed": { 842 | "model_module": "@jupyter-widgets/base", 843 | "model_name": "LayoutModel", 844 | "model_module_version": "1.2.0", 845 | "state": { 846 | "_model_module": "@jupyter-widgets/base", 847 | "_model_module_version": "1.2.0", 848 | "_model_name": "LayoutModel", 849 | "_view_count": null, 850 | "_view_module": "@jupyter-widgets/base", 851 | "_view_module_version": "1.2.0", 852 | "_view_name": "LayoutView", 853 | "align_content": null, 854 | "align_items": null, 855 | "align_self": null, 856 | "border": null, 857 | "bottom": null, 858 | "display": null, 859 | "flex": null, 860 | "flex_flow": null, 861 | "grid_area": null, 862 | "grid_auto_columns": null, 863 | "grid_auto_flow": null, 864 | "grid_auto_rows": null, 865 | "grid_column": null, 866 | "grid_gap": null, 867 | "grid_row": null, 868 | "grid_template_areas": null, 869 | "grid_template_columns": null, 870 | "grid_template_rows": null, 871 | "height": null, 872 | "justify_content": null, 873 | "justify_items": null, 874 | "left": null, 875 | "margin": null, 876 | "max_height": null, 877 | "max_width": null, 878 | "min_height": null, 879 | "min_width": null, 880 | "object_fit": null, 881 | "object_position": null, 882 | "order": null, 883 | "overflow": null, 884 | "overflow_x": null, 885 | "overflow_y": null, 886 | "padding": null, 887 | "right": null, 888 | "top": null, 889 | "visibility": null, 890 | "width": null 891 | } 892 | }, 893 | "9c3e160fb1464a29b2f6673952959219": { 894 | "model_module": "@jupyter-widgets/controls", 895 | "model_name": "DescriptionStyleModel", 896 | "model_module_version": "1.5.0", 897 | "state": { 898 | "_model_module": "@jupyter-widgets/controls", 899 | "_model_module_version": "1.5.0", 900 | "_model_name": "DescriptionStyleModel", 901 | "_view_count": null, 902 | "_view_module": "@jupyter-widgets/base", 903 | "_view_module_version": "1.2.0", 904 | "_view_name": "StyleView", 905 | "description_width": "" 906 | } 907 | }, 908 | "61d9630501324251b8e91e60732bb56f": { 909 | "model_module": "@jupyter-widgets/base", 910 | "model_name": "LayoutModel", 911 | "model_module_version": "1.2.0", 912 | "state": { 913 | "_model_module": "@jupyter-widgets/base", 914 | "_model_module_version": "1.2.0", 915 | "_model_name": "LayoutModel", 916 | "_view_count": null, 917 | "_view_module": "@jupyter-widgets/base", 918 | "_view_module_version": "1.2.0", 919 | "_view_name": "LayoutView", 920 | "align_content": null, 921 | "align_items": null, 922 | "align_self": null, 923 | "border": null, 924 | "bottom": null, 925 | "display": null, 926 | "flex": null, 927 | "flex_flow": null, 928 | "grid_area": null, 929 | "grid_auto_columns": null, 930 | "grid_auto_flow": null, 931 | "grid_auto_rows": null, 932 | "grid_column": null, 933 | "grid_gap": null, 934 | "grid_row": null, 935 | "grid_template_areas": null, 936 | "grid_template_columns": null, 937 | "grid_template_rows": null, 938 | "height": null, 939 | "justify_content": null, 940 | "justify_items": null, 941 | "left": null, 942 | "margin": null, 943 | "max_height": null, 944 | "max_width": null, 945 | "min_height": null, 946 | "min_width": null, 947 | "object_fit": null, 948 | "object_position": null, 949 | "order": null, 950 | "overflow": null, 951 | "overflow_x": null, 952 | "overflow_y": null, 953 | "padding": null, 954 | "right": null, 955 | "top": null, 956 | "visibility": null, 957 | "width": null 958 | } 959 | }, 960 | "0d83444850874c0ca8bc6ff02b6dc8a3": { 961 | "model_module": "@jupyter-widgets/controls", 962 | "model_name": "ProgressStyleModel", 963 | "model_module_version": "1.5.0", 964 | "state": { 965 | "_model_module": "@jupyter-widgets/controls", 966 | "_model_module_version": "1.5.0", 967 | "_model_name": "ProgressStyleModel", 968 | "_view_count": null, 969 | "_view_module": "@jupyter-widgets/base", 970 | "_view_module_version": "1.2.0", 971 | "_view_name": "StyleView", 972 | "bar_color": null, 973 | "description_width": "" 974 | } 975 | }, 976 | "e5b7873fc72140a68a2231175564a046": { 977 | "model_module": "@jupyter-widgets/base", 978 | "model_name": "LayoutModel", 979 | "model_module_version": "1.2.0", 980 | "state": { 981 | "_model_module": "@jupyter-widgets/base", 982 | "_model_module_version": "1.2.0", 983 | "_model_name": "LayoutModel", 984 | "_view_count": null, 985 | "_view_module": "@jupyter-widgets/base", 986 | "_view_module_version": "1.2.0", 987 | "_view_name": "LayoutView", 988 | "align_content": null, 989 | "align_items": null, 990 | "align_self": null, 991 | "border": null, 992 | "bottom": null, 993 | "display": null, 994 | "flex": null, 995 | "flex_flow": null, 996 | "grid_area": null, 997 | "grid_auto_columns": null, 998 | "grid_auto_flow": null, 999 | "grid_auto_rows": null, 1000 | "grid_column": null, 1001 | "grid_gap": null, 1002 | "grid_row": null, 1003 | "grid_template_areas": null, 1004 | "grid_template_columns": null, 1005 | "grid_template_rows": null, 1006 | "height": null, 1007 | "justify_content": null, 1008 | "justify_items": null, 1009 | "left": null, 1010 | "margin": null, 1011 | "max_height": null, 1012 | "max_width": null, 1013 | "min_height": null, 1014 | "min_width": null, 1015 | "object_fit": null, 1016 | "object_position": null, 1017 | "order": null, 1018 | "overflow": null, 1019 | "overflow_x": null, 1020 | "overflow_y": null, 1021 | "padding": null, 1022 | "right": null, 1023 | "top": null, 1024 | "visibility": null, 1025 | "width": null 1026 | } 1027 | }, 1028 | "3a56855bd12f4c6992f7e51e64e5f246": { 1029 | "model_module": "@jupyter-widgets/controls", 1030 | "model_name": "DescriptionStyleModel", 1031 | "model_module_version": "1.5.0", 1032 | "state": { 1033 | "_model_module": "@jupyter-widgets/controls", 1034 | "_model_module_version": "1.5.0", 1035 | "_model_name": "DescriptionStyleModel", 1036 | "_view_count": null, 1037 | "_view_module": "@jupyter-widgets/base", 1038 | "_view_module_version": "1.2.0", 1039 | "_view_name": "StyleView", 1040 | "description_width": "" 1041 | } 1042 | } 1043 | } 1044 | } 1045 | }, 1046 | "cells": [ 1047 | { 1048 | "cell_type": "code", 1049 | "execution_count": 1, 1050 | "metadata": { 1051 | "colab": { 1052 | "base_uri": "https://localhost:8080/" 1053 | }, 1054 | "id": "X8rjgQb7vWdW", 1055 | "outputId": "2f4558ae-c481-401b-c2e4-f4858c8b045a" 1056 | }, 1057 | "outputs": [ 1058 | { 1059 | "output_type": "stream", 1060 | "name": "stdout", 1061 | "text": [ 1062 | "Reading package lists... Done\n", 1063 | "Building dependency tree... Done\n", 1064 | "Reading state information... Done\n", 1065 | "The following additional packages will be installed:\n", 1066 | " tesseract-ocr-eng tesseract-ocr-osd\n", 1067 | "The following NEW packages will be installed:\n", 1068 | " tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n", 1069 | "0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.\n", 1070 | "Need to get 4,816 kB of archives.\n", 1071 | "After this operation, 15.6 MB of additional disk space will be used.\n", 1072 | "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]\n", 1073 | "Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]\n", 1074 | "Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]\n", 1075 | "Fetched 4,816 kB in 1s (3,232 kB/s)\n", 1076 | "debconf: unable to initialize frontend: Dialog\n", 1077 | "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 3.)\n", 1078 | "debconf: falling back to frontend: Readline\n", 1079 | "debconf: unable to initialize frontend: Readline\n", 1080 | "debconf: (This frontend requires a controlling tty.)\n", 1081 | "debconf: falling back to frontend: Teletype\n", 1082 | "dpkg-preconfigure: unable to re-open stdin: \n", 1083 | "Selecting previously unselected package tesseract-ocr-eng.\n", 1084 | "(Reading database ... 121658 files and directories currently installed.)\n", 1085 | "Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n", 1086 | "Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n", 1087 | "Selecting previously unselected package tesseract-ocr-osd.\n", 1088 | "Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n", 1089 | "Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n", 1090 | "Selecting previously unselected package tesseract-ocr.\n", 1091 | "Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...\n", 1092 | "Unpacking tesseract-ocr (4.1.1-2.1build1) ...\n", 1093 | "Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n", 1094 | "Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n", 1095 | "Setting up tesseract-ocr (4.1.1-2.1build1) ...\n", 1096 | "Processing triggers for man-db (2.10.2-1) ...\n", 1097 | "Reading package lists... Done\n", 1098 | "Building dependency tree... Done\n", 1099 | "Reading state information... Done\n", 1100 | "The following additional packages will be installed:\n", 1101 | " libarchive-dev libleptonica-dev\n", 1102 | "The following NEW packages will be installed:\n", 1103 | " libarchive-dev libleptonica-dev libtesseract-dev\n", 1104 | "0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.\n", 1105 | "Need to get 3,743 kB of archives.\n", 1106 | "After this operation, 16.0 MB of additional disk space will be used.\n", 1107 | "Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1 [581 kB]\n", 1108 | "Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]\n", 1109 | "Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]\n", 1110 | "Fetched 3,743 kB in 1s (2,808 kB/s)\n", 1111 | "debconf: unable to initialize frontend: Dialog\n", 1112 | "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 3.)\n", 1113 | "debconf: falling back to frontend: Readline\n", 1114 | "debconf: unable to initialize frontend: Readline\n", 1115 | "debconf: (This frontend requires a controlling tty.)\n", 1116 | "debconf: falling back to frontend: Teletype\n", 1117 | "dpkg-preconfigure: unable to re-open stdin: \n", 1118 | "Selecting previously unselected package libarchive-dev:amd64.\n", 1119 | "(Reading database ... 121705 files and directories currently installed.)\n", 1120 | "Preparing to unpack .../libarchive-dev_3.6.0-1ubuntu1_amd64.deb ...\n", 1121 | "Unpacking libarchive-dev:amd64 (3.6.0-1ubuntu1) ...\n", 1122 | "Selecting previously unselected package libleptonica-dev.\n", 1123 | "Preparing to unpack .../libleptonica-dev_1.82.0-3build1_amd64.deb ...\n", 1124 | "Unpacking libleptonica-dev (1.82.0-3build1) ...\n", 1125 | "Selecting previously unselected package libtesseract-dev:amd64.\n", 1126 | "Preparing to unpack .../libtesseract-dev_4.1.1-2.1build1_amd64.deb ...\n", 1127 | "Unpacking libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n", 1128 | "Setting up libleptonica-dev (1.82.0-3build1) ...\n", 1129 | "Setting up libarchive-dev:amd64 (3.6.0-1ubuntu1) ...\n", 1130 | "Setting up libtesseract-dev:amd64 (4.1.1-2.1build1) ...\n", 1131 | "Processing triggers for man-db (2.10.2-1) ...\n", 1132 | "Reading package lists... Done\n", 1133 | "Building dependency tree... Done\n", 1134 | "Reading state information... Done\n", 1135 | "The following NEW packages will be installed:\n", 1136 | " poppler-utils\n", 1137 | "0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.\n", 1138 | "Need to get 186 kB of archives.\n", 1139 | "After this operation, 696 kB of additional disk space will be used.\n", 1140 | "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.3 [186 kB]\n", 1141 | "Fetched 186 kB in 1s (265 kB/s)\n", 1142 | "debconf: unable to initialize frontend: Dialog\n", 1143 | "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)\n", 1144 | "debconf: falling back to frontend: Readline\n", 1145 | "debconf: unable to initialize frontend: Readline\n", 1146 | "debconf: (This frontend requires a controlling tty.)\n", 1147 | "debconf: falling back to frontend: Teletype\n", 1148 | "dpkg-preconfigure: unable to re-open stdin: \n", 1149 | "Selecting previously unselected package poppler-utils.\n", 1150 | "(Reading database ... 121838 files and directories currently installed.)\n", 1151 | "Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.3_amd64.deb ...\n", 1152 | "Unpacking poppler-utils (22.02.0-2ubuntu0.3) ...\n", 1153 | "Setting up poppler-utils (22.02.0-2ubuntu0.3) ...\n", 1154 | "Processing triggers for man-db (2.10.2-1) ...\n" 1155 | ] 1156 | } 1157 | ], 1158 | "source": [ 1159 | "!sudo apt install tesseract-ocr -y\n", 1160 | "!sudo apt install libtesseract-dev -y\n", 1161 | "!sudo apt-get install poppler-utils -y" 1162 | ] 1163 | }, 1164 | { 1165 | "cell_type": "code", 1166 | "source": [ 1167 | "!pip install langchain unstructured[all-docs] pydantic lxml openai chromadb tiktoken opencv-python" 1168 | ], 1169 | "metadata": { 1170 | "colab": { 1171 | "base_uri": "https://localhost:8080/", 1172 | "height": 1000 1173 | }, 1174 | "id": "unj2xdPbzDZZ", 1175 | "outputId": "04c3f4e9-e8f8-4fec-cd61-481fac2e4f0f" 1176 | }, 1177 | "execution_count": 2, 1178 | "outputs": [ 1179 | { 1180 | "output_type": "stream", 1181 | "name": "stdout", 1182 | "text": [ 1183 | "Collecting langchain\n", 1184 | " Downloading langchain-0.1.1-py3-none-any.whl (802 kB)\n", 1185 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m802.4/802.4 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1186 | "\u001b[?25hCollecting unstructured[all-docs]\n", 1187 | " Downloading unstructured-0.12.0-py3-none-any.whl (1.8 MB)\n", 1188 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m26.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1189 | "\u001b[?25hRequirement already satisfied: pydantic in /usr/local/lib/python3.10/dist-packages (1.10.13)\n", 1190 | "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (4.9.4)\n", 1191 | "Collecting openai\n", 1192 | " Downloading openai-1.8.0-py3-none-any.whl (222 kB)\n", 1193 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m222.3/222.3 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1194 | "\u001b[?25hCollecting chromadb\n", 1195 | " Downloading chromadb-0.4.22-py3-none-any.whl (509 kB)\n", 1196 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m509.0/509.0 kB\u001b[0m \u001b[31m31.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1197 | "\u001b[?25hCollecting tiktoken\n", 1198 | " Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n", 1199 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m47.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1200 | "\u001b[?25hRequirement already satisfied: opencv-python in /usr/local/lib/python3.10/dist-packages (4.8.0.76)\n", 1201 | "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (6.0.1)\n", 1202 | "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.0.24)\n", 1203 | "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from langchain) (3.9.1)\n", 1204 | "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (4.0.3)\n", 1205 | "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)\n", 1206 | " Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)\n", 1207 | "Collecting jsonpatch<2.0,>=1.33 (from langchain)\n", 1208 | " Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n", 1209 | "Collecting langchain-community<0.1,>=0.0.13 (from langchain)\n", 1210 | " Downloading langchain_community-0.0.13-py3-none-any.whl (1.6 MB)\n", 1211 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m52.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1212 | "\u001b[?25hCollecting langchain-core<0.2,>=0.1.9 (from langchain)\n", 1213 | " Downloading langchain_core-0.1.13-py3-none-any.whl (228 kB)\n", 1214 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m228.7/228.7 kB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1215 | "\u001b[?25hCollecting langsmith<0.1.0,>=0.0.77 (from langchain)\n", 1216 | " Downloading langsmith-0.0.83-py3-none-any.whl (49 kB)\n", 1217 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1218 | "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from langchain) (1.23.5)\n", 1219 | "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from langchain) (2.31.0)\n", 1220 | "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from langchain) (8.2.3)\n", 1221 | "Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (5.2.0)\n", 1222 | "Collecting filetype (from unstructured[all-docs])\n", 1223 | " Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)\n", 1224 | "Collecting python-magic (from unstructured[all-docs])\n", 1225 | " Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)\n", 1226 | "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (3.8.1)\n", 1227 | "Requirement already satisfied: tabulate in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (0.9.0)\n", 1228 | "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (4.11.2)\n", 1229 | "Collecting emoji (from unstructured[all-docs])\n", 1230 | " Downloading emoji-2.10.0-py2.py3-none-any.whl (457 kB)\n", 1231 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m457.9/457.9 kB\u001b[0m \u001b[31m31.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1232 | "\u001b[?25hCollecting python-iso639 (from unstructured[all-docs])\n", 1233 | " Downloading python_iso639-2024.1.2-py3-none-any.whl (274 kB)\n", 1234 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m274.7/274.7 kB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1235 | "\u001b[?25hCollecting langdetect (from unstructured[all-docs])\n", 1236 | " Downloading langdetect-1.0.9.tar.gz (981 kB)\n", 1237 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m981.5/981.5 kB\u001b[0m \u001b[31m52.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1238 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1239 | "Collecting rapidfuzz (from unstructured[all-docs])\n", 1240 | " Downloading rapidfuzz-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n", 1241 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m41.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1242 | "\u001b[?25hCollecting backoff (from unstructured[all-docs])\n", 1243 | " Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n", 1244 | "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (4.5.0)\n", 1245 | "Collecting unstructured-client (from unstructured[all-docs])\n", 1246 | " Downloading unstructured_client-0.15.2-py3-none-any.whl (20 kB)\n", 1247 | "Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (1.14.1)\n", 1248 | "Collecting unstructured.pytesseract>=0.3.12 (from unstructured[all-docs])\n", 1249 | " Downloading unstructured.pytesseract-0.3.12-py3-none-any.whl (14 kB)\n", 1250 | "Collecting pdfminer.six (from unstructured[all-docs])\n", 1251 | " Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)\n", 1252 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m47.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1253 | "\u001b[?25hRequirement already satisfied: markdown in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (3.5.2)\n", 1254 | "Collecting python-pptx<=0.6.23 (from unstructured[all-docs])\n", 1255 | " Downloading python_pptx-0.6.23-py3-none-any.whl (471 kB)\n", 1256 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m471.6/471.6 kB\u001b[0m \u001b[31m42.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1257 | "\u001b[?25hCollecting pypandoc (from unstructured[all-docs])\n", 1258 | " Downloading pypandoc-1.12-py3-none-any.whl (20 kB)\n", 1259 | "Collecting unstructured-inference==0.7.21 (from unstructured[all-docs])\n", 1260 | " Downloading unstructured_inference-0.7.21-py3-none-any.whl (60 kB)\n", 1261 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.0/60.0 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1262 | "\u001b[?25hCollecting pdf2image (from unstructured[all-docs])\n", 1263 | " Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n", 1264 | "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (3.2.1)\n", 1265 | "Collecting pypdf (from unstructured[all-docs])\n", 1266 | " Downloading pypdf-4.0.0-py3-none-any.whl (283 kB)\n", 1267 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m283.9/283.9 kB\u001b[0m \u001b[31m31.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1268 | "\u001b[?25hCollecting msg-parser (from unstructured[all-docs])\n", 1269 | " Downloading msg_parser-1.2.0-py2.py3-none-any.whl (101 kB)\n", 1270 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m101.8/101.8 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1271 | "\u001b[?25hRequirement already satisfied: xlrd in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (2.0.1)\n", 1272 | "Collecting python-docx (from unstructured[all-docs])\n", 1273 | " Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)\n", 1274 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.6/239.6 kB\u001b[0m \u001b[31m26.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1275 | "\u001b[?25hRequirement already satisfied: openpyxl in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (3.1.2)\n", 1276 | "Collecting onnx (from unstructured[all-docs])\n", 1277 | " Downloading onnx-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.7 MB)\n", 1278 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m31.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1279 | "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from unstructured[all-docs]) (1.5.3)\n", 1280 | "Collecting pikepdf (from unstructured[all-docs])\n", 1281 | " Downloading pikepdf-8.11.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)\n", 1282 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m63.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1283 | "\u001b[?25hCollecting layoutparser[layoutmodels,tesseract] (from unstructured-inference==0.7.21->unstructured[all-docs])\n", 1284 | " Downloading layoutparser-0.3.4-py3-none-any.whl (19.2 MB)\n", 1285 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m19.2/19.2 MB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1286 | "\u001b[?25hCollecting python-multipart (from unstructured-inference==0.7.21->unstructured[all-docs])\n", 1287 | " Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)\n", 1288 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1289 | "\u001b[?25hRequirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.21->unstructured[all-docs]) (0.20.2)\n", 1290 | "Collecting onnxruntime<1.16 (from unstructured-inference==0.7.21->unstructured[all-docs])\n", 1291 | " Downloading onnxruntime-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)\n", 1292 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m52.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1293 | "\u001b[?25hRequirement already satisfied: transformers>=4.25.1 in /usr/local/lib/python3.10/dist-packages (from unstructured-inference==0.7.21->unstructured[all-docs]) (4.35.2)\n", 1294 | "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai) (3.7.1)\n", 1295 | "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai) (1.7.0)\n", 1296 | "Collecting httpx<1,>=0.23.0 (from openai)\n", 1297 | " Downloading httpx-0.26.0-py3-none-any.whl (75 kB)\n", 1298 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1299 | "\u001b[?25hRequirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai) (1.3.0)\n", 1300 | "Requirement already satisfied: tqdm>4 in /usr/local/lib/python3.10/dist-packages (from openai) (4.66.1)\n", 1301 | "Collecting typing-extensions (from unstructured[all-docs])\n", 1302 | " Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n", 1303 | "Requirement already satisfied: build>=1.0.3 in /usr/local/lib/python3.10/dist-packages (from chromadb) (1.0.3)\n", 1304 | "Collecting chroma-hnswlib==0.7.3 (from chromadb)\n", 1305 | " Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)\n", 1306 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.4/2.4 MB\u001b[0m \u001b[31m85.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1307 | "\u001b[?25hCollecting fastapi>=0.95.2 (from chromadb)\n", 1308 | " Downloading fastapi-0.109.0-py3-none-any.whl (92 kB)\n", 1309 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m12.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1310 | "\u001b[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)\n", 1311 | " Downloading uvicorn-0.26.0-py3-none-any.whl (60 kB)\n", 1312 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.5/60.5 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1313 | "\u001b[?25hCollecting posthog>=2.4.0 (from chromadb)\n", 1314 | " Downloading posthog-3.3.2-py2.py3-none-any.whl (40 kB)\n", 1315 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.7/40.7 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1316 | "\u001b[?25hCollecting pulsar-client>=3.1.0 (from chromadb)\n", 1317 | " Downloading pulsar_client-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n", 1318 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m49.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1319 | "\u001b[?25hCollecting opentelemetry-api>=1.2.0 (from chromadb)\n", 1320 | " Downloading opentelemetry_api-1.22.0-py3-none-any.whl (57 kB)\n", 1321 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.9/57.9 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1322 | "\u001b[?25hCollecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)\n", 1323 | " Downloading opentelemetry_exporter_otlp_proto_grpc-1.22.0-py3-none-any.whl (18 kB)\n", 1324 | "Collecting opentelemetry-instrumentation-fastapi>=0.41b0 (from chromadb)\n", 1325 | " Downloading opentelemetry_instrumentation_fastapi-0.43b0-py3-none-any.whl (11 kB)\n", 1326 | "Collecting opentelemetry-sdk>=1.2.0 (from chromadb)\n", 1327 | " Downloading opentelemetry_sdk-1.22.0-py3-none-any.whl (105 kB)\n", 1328 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.6/105.6 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1329 | "\u001b[?25hRequirement already satisfied: tokenizers>=0.13.2 in /usr/local/lib/python3.10/dist-packages (from chromadb) (0.15.0)\n", 1330 | "Collecting pypika>=0.48.9 (from chromadb)\n", 1331 | " Downloading PyPika-0.48.9.tar.gz (67 kB)\n", 1332 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.3/67.3 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1333 | "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", 1334 | " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", 1335 | " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", 1336 | "Collecting overrides>=7.3.1 (from chromadb)\n", 1337 | " Downloading overrides-7.5.0-py3-none-any.whl (17 kB)\n", 1338 | "Requirement already satisfied: importlib-resources in /usr/local/lib/python3.10/dist-packages (from chromadb) (6.1.1)\n", 1339 | "Requirement already satisfied: grpcio>=1.58.0 in /usr/local/lib/python3.10/dist-packages (from chromadb) (1.60.0)\n", 1340 | "Collecting bcrypt>=4.0.1 (from chromadb)\n", 1341 | " Downloading bcrypt-4.1.2-cp39-abi3-manylinux_2_28_x86_64.whl (698 kB)\n", 1342 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m698.9/698.9 kB\u001b[0m \u001b[31m55.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1343 | "\u001b[?25hRequirement already satisfied: typer>=0.9.0 in /usr/local/lib/python3.10/dist-packages (from chromadb) (0.9.0)\n", 1344 | "Collecting kubernetes>=28.1.0 (from chromadb)\n", 1345 | " Downloading kubernetes-29.0.0-py2.py3-none-any.whl (1.6 MB)\n", 1346 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m69.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1347 | "\u001b[?25hCollecting mmh3>=4.0.1 (from chromadb)\n", 1348 | " Downloading mmh3-4.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (67 kB)\n", 1349 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.6/67.6 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1350 | "\u001b[?25hRequirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2023.6.3)\n", 1351 | "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n", 1352 | "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.4)\n", 1353 | "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n", 1354 | "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n", 1355 | "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n", 1356 | "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (3.6)\n", 1357 | "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai) (1.2.0)\n", 1358 | "Requirement already satisfied: packaging>=19.0 in /usr/local/lib/python3.10/dist-packages (from build>=1.0.3->chromadb) (23.2)\n", 1359 | "Requirement already satisfied: pyproject_hooks in /usr/local/lib/python3.10/dist-packages (from build>=1.0.3->chromadb) (1.0.0)\n", 1360 | "Requirement already satisfied: tomli>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from build>=1.0.3->chromadb) (2.0.1)\n", 1361 | "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", 1362 | " Downloading marshmallow-3.20.2-py3-none-any.whl (49 kB)\n", 1363 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1364 | "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", 1365 | " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", 1366 | "Collecting starlette<0.36.0,>=0.35.0 (from fastapi>=0.95.2->chromadb)\n", 1367 | " Downloading starlette-0.35.1-py3-none-any.whl (71 kB)\n", 1368 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.1/71.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1369 | "\u001b[?25hRequirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai) (2023.11.17)\n", 1370 | "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)\n", 1371 | " Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)\n", 1372 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1373 | "\u001b[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)\n", 1374 | " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", 1375 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1376 | "\u001b[?25hCollecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1.33->langchain)\n", 1377 | " Downloading jsonpointer-2.4-py2.py3-none-any.whl (7.8 kB)\n", 1378 | "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (1.16.0)\n", 1379 | "Requirement already satisfied: python-dateutil>=2.5.3 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2.8.2)\n", 1380 | "Requirement already satisfied: google-auth>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2.17.3)\n", 1381 | "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (1.7.0)\n", 1382 | "Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (1.3.1)\n", 1383 | "Requirement already satisfied: oauthlib>=3.2.2 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (3.2.2)\n", 1384 | "Requirement already satisfied: urllib3>=1.24.2 in /usr/local/lib/python3.10/dist-packages (from kubernetes>=28.1.0->chromadb) (2.0.7)\n", 1385 | "Collecting coloredlogs (from onnxruntime<1.16->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1386 | " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n", 1387 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1388 | "\u001b[?25hRequirement already satisfied: flatbuffers in /usr/local/lib/python3.10/dist-packages (from onnxruntime<1.16->unstructured-inference==0.7.21->unstructured[all-docs]) (23.5.26)\n", 1389 | "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (from onnxruntime<1.16->unstructured-inference==0.7.21->unstructured[all-docs]) (3.20.3)\n", 1390 | "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from onnxruntime<1.16->unstructured-inference==0.7.21->unstructured[all-docs]) (1.12)\n", 1391 | "Collecting deprecated>=1.2.6 (from opentelemetry-api>=1.2.0->chromadb)\n", 1392 | " Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n", 1393 | "Collecting importlib-metadata<7.0,>=6.0 (from opentelemetry-api>=1.2.0->chromadb)\n", 1394 | " Downloading importlib_metadata-6.11.0-py3-none-any.whl (23 kB)\n", 1395 | "Requirement already satisfied: googleapis-common-protos~=1.52 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb) (1.62.0)\n", 1396 | "Collecting opentelemetry-exporter-otlp-proto-common==1.22.0 (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb)\n", 1397 | " Downloading opentelemetry_exporter_otlp_proto_common-1.22.0-py3-none-any.whl (17 kB)\n", 1398 | "Collecting opentelemetry-proto==1.22.0 (from opentelemetry-exporter-otlp-proto-grpc>=1.2.0->chromadb)\n", 1399 | " Downloading opentelemetry_proto-1.22.0-py3-none-any.whl (50 kB)\n", 1400 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1401 | "\u001b[?25hCollecting opentelemetry-instrumentation-asgi==0.43b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", 1402 | " Downloading opentelemetry_instrumentation_asgi-0.43b0-py3-none-any.whl (14 kB)\n", 1403 | "Collecting opentelemetry-instrumentation==0.43b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", 1404 | " Downloading opentelemetry_instrumentation-0.43b0-py3-none-any.whl (28 kB)\n", 1405 | "Collecting opentelemetry-semantic-conventions==0.43b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", 1406 | " Downloading opentelemetry_semantic_conventions-0.43b0-py3-none-any.whl (36 kB)\n", 1407 | "Collecting opentelemetry-util-http==0.43b0 (from opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", 1408 | " Downloading opentelemetry_util_http-0.43b0-py3-none-any.whl (6.9 kB)\n", 1409 | "Requirement already satisfied: setuptools>=16.0 in /usr/local/lib/python3.10/dist-packages (from opentelemetry-instrumentation==0.43b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb) (67.7.2)\n", 1410 | "Collecting asgiref~=3.0 (from opentelemetry-instrumentation-asgi==0.43b0->opentelemetry-instrumentation-fastapi>=0.41b0->chromadb)\n", 1411 | " Downloading asgiref-3.7.2-py3-none-any.whl (24 kB)\n", 1412 | "Collecting monotonic>=1.5 (from posthog>=2.4.0->chromadb)\n", 1413 | " Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n", 1414 | "Requirement already satisfied: Pillow>=3.3.2 in /usr/local/lib/python3.10/dist-packages (from python-pptx<=0.6.23->unstructured[all-docs]) (9.4.0)\n", 1415 | "Collecting XlsxWriter>=0.5.7 (from python-pptx<=0.6.23->unstructured[all-docs])\n", 1416 | " Downloading XlsxWriter-3.1.9-py3-none-any.whl (154 kB)\n", 1417 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m154.8/154.8 kB\u001b[0m \u001b[31m17.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1418 | "\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->langchain) (3.3.2)\n", 1419 | "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n", 1420 | "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.10/dist-packages (from typer>=0.9.0->chromadb) (8.1.7)\n", 1421 | "Collecting httptools>=0.5.0 (from uvicorn[standard]>=0.18.3->chromadb)\n", 1422 | " Downloading httptools-0.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (341 kB)\n", 1423 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.4/341.4 kB\u001b[0m \u001b[31m32.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1424 | "\u001b[?25hCollecting python-dotenv>=0.13 (from uvicorn[standard]>=0.18.3->chromadb)\n", 1425 | " Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)\n", 1426 | "Collecting uvloop!=0.15.0,!=0.15.1,>=0.14.0 (from uvicorn[standard]>=0.18.3->chromadb)\n", 1427 | " Downloading uvloop-0.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n", 1428 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m55.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1429 | "\u001b[?25hCollecting watchfiles>=0.13 (from uvicorn[standard]>=0.18.3->chromadb)\n", 1430 | " Downloading watchfiles-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", 1431 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m71.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1432 | "\u001b[?25hCollecting websockets>=10.4 (from uvicorn[standard]>=0.18.3->chromadb)\n", 1433 | " Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)\n", 1434 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1435 | "\u001b[?25hRequirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->unstructured[all-docs]) (2.5)\n", 1436 | "Collecting olefile>=0.46 (from msg-parser->unstructured[all-docs])\n", 1437 | " Downloading olefile-0.47-py2.py3-none-any.whl (114 kB)\n", 1438 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.6/114.6 kB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1439 | "\u001b[?25hRequirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->unstructured[all-docs]) (1.3.2)\n", 1440 | "Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.10/dist-packages (from openpyxl->unstructured[all-docs]) (1.1.0)\n", 1441 | "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->unstructured[all-docs]) (2023.3.post1)\n", 1442 | "Requirement already satisfied: cryptography>=36.0.0 in /usr/local/lib/python3.10/dist-packages (from pdfminer.six->unstructured[all-docs]) (41.0.7)\n", 1443 | "Collecting Pillow>=3.3.2 (from python-pptx<=0.6.23->unstructured[all-docs])\n", 1444 | " Downloading pillow-10.2.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)\n", 1445 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m61.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1446 | "\u001b[?25hCollecting jsonpath-python>=1.0.6 (from unstructured-client->unstructured[all-docs])\n", 1447 | " Downloading jsonpath_python-1.0.6-py3-none-any.whl (7.6 kB)\n", 1448 | "Collecting mypy-extensions>=1.0.0 (from unstructured-client->unstructured[all-docs])\n", 1449 | " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", 1450 | "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.10/dist-packages (from cryptography>=36.0.0->pdfminer.six->unstructured[all-docs]) (1.16.0)\n", 1451 | "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (5.3.2)\n", 1452 | "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.3.0)\n", 1453 | "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (4.9)\n", 1454 | "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->unstructured-inference==0.7.21->unstructured[all-docs]) (3.13.1)\n", 1455 | "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub->unstructured-inference==0.7.21->unstructured[all-docs]) (2023.6.0)\n", 1456 | "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata<7.0,>=6.0->opentelemetry-api>=1.2.0->chromadb) (3.17.0)\n", 1457 | "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->unstructured-inference==0.7.21->unstructured[all-docs]) (0.4.1)\n", 1458 | "Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<1.16->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1459 | " Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", 1460 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1461 | "\u001b[?25hRequirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (1.11.4)\n", 1462 | "Collecting iopath (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1463 | " Downloading iopath-0.1.10.tar.gz (42 kB)\n", 1464 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1465 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1466 | "Collecting pdfplumber (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1467 | " Downloading pdfplumber-0.10.3-py3-none-any.whl (48 kB)\n", 1468 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.0/49.0 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1469 | "\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (2.1.0+cu121)\n", 1470 | "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (0.16.0+cu121)\n", 1471 | "Collecting effdet (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1472 | " Downloading effdet-0.4.1-py3-none-any.whl (112 kB)\n", 1473 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.5/112.5 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1474 | "\u001b[?25hCollecting pytesseract (from layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1475 | " Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n", 1476 | "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->onnxruntime<1.16->unstructured-inference==0.7.21->unstructured[all-docs]) (1.3.0)\n", 1477 | "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six->unstructured[all-docs]) (2.21)\n", 1478 | "Requirement already satisfied: pyasn1<0.6.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=28.1.0->chromadb) (0.5.1)\n", 1479 | "Collecting timm>=0.9.2 (from effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1480 | " Downloading timm-0.9.12-py3-none-any.whl (2.2 MB)\n", 1481 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m54.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1482 | "\u001b[?25hRequirement already satisfied: pycocotools>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (2.0.7)\n", 1483 | "Collecting omegaconf>=2.0 (from effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1484 | " Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n", 1485 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1486 | "\u001b[?25hRequirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (3.1.3)\n", 1487 | "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (2.1.0)\n", 1488 | "Collecting portalocker (from iopath->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1489 | " Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n", 1490 | "Collecting pdfminer.six (from unstructured[all-docs])\n", 1491 | " Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)\n", 1492 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m48.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1493 | "\u001b[?25hCollecting pypdfium2>=4.18.0 (from pdfplumber->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1494 | " Downloading pypdfium2-4.26.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n", 1495 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m80.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1496 | "\u001b[?25hCollecting antlr4-python3-runtime==4.9.* (from omegaconf>=2.0->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs])\n", 1497 | " Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)\n", 1498 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.0/117.0 kB\u001b[0m \u001b[31m14.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 1499 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1500 | "Requirement already satisfied: matplotlib>=2.1.0 in /usr/local/lib/python3.10/dist-packages (from pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (3.7.1)\n", 1501 | "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (2.1.3)\n", 1502 | "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (1.2.0)\n", 1503 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (0.12.1)\n", 1504 | "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (4.47.2)\n", 1505 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (1.4.5)\n", 1506 | "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=2.1.0->pycocotools>=2.0.2->effdet->layoutparser[layoutmodels,tesseract]->unstructured-inference==0.7.21->unstructured[all-docs]) (3.1.1)\n", 1507 | "Building wheels for collected packages: pypika, langdetect, iopath, antlr4-python3-runtime\n", 1508 | " Building wheel for pypika (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", 1509 | " Created wheel for pypika: filename=PyPika-0.48.9-py2.py3-none-any.whl size=53723 sha256=4f739ddaff5ad1a04179c77ab7963e81723e45ed7320d6bc7f1678d90d6f2f6b\n", 1510 | " Stored in directory: /root/.cache/pip/wheels/e1/26/51/d0bffb3d2fd82256676d7ad3003faea3bd6dddc9577af665f4\n", 1511 | " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1512 | " Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=f2b3d203b7d31499b31bb1d43464a946fba796a189dafe5e2d4b562c31ff7b0d\n", 1513 | " Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106\n", 1514 | " Building wheel for iopath (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1515 | " Created wheel for iopath: filename=iopath-0.1.10-py3-none-any.whl size=31532 sha256=70559bd955e45d646cc360746c6e2028bf19a1a6a8cef19d6813a75d91d6a42f\n", 1516 | " Stored in directory: /root/.cache/pip/wheels/9a/a3/b6/ac0fcd1b4ed5cfeb3db92e6a0e476cfd48ed0df92b91080c1d\n", 1517 | " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1518 | " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=2efb7713dadaebf976aec87d442066c7f1f39ae603e0a2817156361e132addfe\n", 1519 | " Stored in directory: /root/.cache/pip/wheels/12/93/dd/1f6a127edc45659556564c5730f6d4e300888f4bca2d4c5a88\n", 1520 | "Successfully built pypika langdetect iopath antlr4-python3-runtime\n", 1521 | "Installing collected packages: pypika, monotonic, mmh3, filetype, antlr4-python3-runtime, XlsxWriter, websockets, uvloop, typing-extensions, rapidfuzz, python-multipart, python-magic, python-iso639, python-dotenv, pypdfium2, pypdf, pypandoc, pulsar-client, portalocker, Pillow, overrides, opentelemetry-util-http, opentelemetry-semantic-conventions, opentelemetry-proto, onnx, omegaconf, olefile, mypy-extensions, marshmallow, langdetect, jsonpointer, jsonpath-python, importlib-metadata, humanfriendly, httptools, h11, emoji, deprecated, chroma-hnswlib, bcrypt, backoff, watchfiles, uvicorn, unstructured.pytesseract, typing-inspect, tiktoken, starlette, python-pptx, python-docx, pytesseract, posthog, pikepdf, pdf2image, opentelemetry-exporter-otlp-proto-common, opentelemetry-api, msg-parser, jsonpatch, iopath, httpcore, coloredlogs, asgiref, pdfminer.six, opentelemetry-sdk, opentelemetry-instrumentation, onnxruntime, langsmith, kubernetes, httpx, fastapi, dataclasses-json, unstructured-client, timm, pdfplumber, opentelemetry-instrumentation-asgi, opentelemetry-exporter-otlp-proto-grpc, openai, langchain-core, unstructured, opentelemetry-instrumentation-fastapi, layoutparser, langchain-community, effdet, langchain, chromadb, unstructured-inference\n", 1522 | " Attempting uninstall: typing-extensions\n", 1523 | " Found existing installation: typing_extensions 4.5.0\n", 1524 | " Uninstalling typing_extensions-4.5.0:\n", 1525 | " Successfully uninstalled typing_extensions-4.5.0\n", 1526 | " Attempting uninstall: Pillow\n", 1527 | " Found existing installation: Pillow 9.4.0\n", 1528 | " Uninstalling Pillow-9.4.0:\n", 1529 | " Successfully uninstalled Pillow-9.4.0\n", 1530 | " Attempting uninstall: importlib-metadata\n", 1531 | " Found existing installation: importlib-metadata 7.0.1\n", 1532 | " Uninstalling importlib-metadata-7.0.1:\n", 1533 | " Successfully uninstalled importlib-metadata-7.0.1\n", 1534 | "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", 1535 | "lida 0.0.10 requires kaleido, which is not installed.\n", 1536 | "llmx 0.0.15a0 requires cohere, which is not installed.\n", 1537 | "imageio 2.31.6 requires pillow<10.1.0,>=8.3.2, but you have pillow 10.2.0 which is incompatible.\n", 1538 | "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.9.0 which is incompatible.\u001b[0m\u001b[31m\n", 1539 | "\u001b[0mSuccessfully installed Pillow-10.2.0 XlsxWriter-3.1.9 antlr4-python3-runtime-4.9.3 asgiref-3.7.2 backoff-2.2.1 bcrypt-4.1.2 chroma-hnswlib-0.7.3 chromadb-0.4.22 coloredlogs-15.0.1 dataclasses-json-0.6.3 deprecated-1.2.14 effdet-0.4.1 emoji-2.10.0 fastapi-0.109.0 filetype-1.2.0 h11-0.14.0 httpcore-1.0.2 httptools-0.6.1 httpx-0.26.0 humanfriendly-10.0 importlib-metadata-6.11.0 iopath-0.1.10 jsonpatch-1.33 jsonpath-python-1.0.6 jsonpointer-2.4 kubernetes-29.0.0 langchain-0.1.1 langchain-community-0.0.13 langchain-core-0.1.13 langdetect-1.0.9 langsmith-0.0.83 layoutparser-0.3.4 marshmallow-3.20.2 mmh3-4.1.0 monotonic-1.6 msg-parser-1.2.0 mypy-extensions-1.0.0 olefile-0.47 omegaconf-2.3.0 onnx-1.15.0 onnxruntime-1.15.1 openai-1.8.0 opentelemetry-api-1.22.0 opentelemetry-exporter-otlp-proto-common-1.22.0 opentelemetry-exporter-otlp-proto-grpc-1.22.0 opentelemetry-instrumentation-0.43b0 opentelemetry-instrumentation-asgi-0.43b0 opentelemetry-instrumentation-fastapi-0.43b0 opentelemetry-proto-1.22.0 opentelemetry-sdk-1.22.0 opentelemetry-semantic-conventions-0.43b0 opentelemetry-util-http-0.43b0 overrides-7.5.0 pdf2image-1.17.0 pdfminer.six-20221105 pdfplumber-0.10.3 pikepdf-8.11.2 portalocker-2.8.2 posthog-3.3.2 pulsar-client-3.4.0 pypandoc-1.12 pypdf-4.0.0 pypdfium2-4.26.0 pypika-0.48.9 pytesseract-0.3.10 python-docx-1.1.0 python-dotenv-1.0.0 python-iso639-2024.1.2 python-magic-0.4.27 python-multipart-0.0.6 python-pptx-0.6.23 rapidfuzz-3.6.1 starlette-0.35.1 tiktoken-0.5.2 timm-0.9.12 typing-extensions-4.9.0 typing-inspect-0.9.0 unstructured-0.12.0 unstructured-client-0.15.2 unstructured-inference-0.7.21 unstructured.pytesseract-0.3.12 uvicorn-0.26.0 uvloop-0.19.0 watchfiles-0.21.0 websockets-12.0\n" 1540 | ] 1541 | }, 1542 | { 1543 | "output_type": "display_data", 1544 | "data": { 1545 | "application/vnd.colab-display-data+json": { 1546 | "pip_warning": { 1547 | "packages": [ 1548 | "PIL", 1549 | "pydevd_plugins" 1550 | ] 1551 | } 1552 | } 1553 | }, 1554 | "metadata": {} 1555 | } 1556 | ] 1557 | }, 1558 | { 1559 | "cell_type": "code", 1560 | "source": [ 1561 | "import os\n", 1562 | "import uuid\n", 1563 | "import base64\n", 1564 | "from IPython import display\n", 1565 | "from unstructured.partition.pdf import partition_pdf\n", 1566 | "from langchain.chat_models import ChatOpenAI\n", 1567 | "from langchain.embeddings import OpenAIEmbeddings\n", 1568 | "from langchain.chains import LLMChain\n", 1569 | "from langchain.prompts import PromptTemplate\n", 1570 | "from langchain.schema.messages import HumanMessage, SystemMessage\n", 1571 | "from langchain.schema.document import Document\n", 1572 | "from langchain.vectorstores import FAISS\n", 1573 | "from langchain.retrievers.multi_vector import MultiVectorRetriever" 1574 | ], 1575 | "metadata": { 1576 | "id": "h0FY1Ne8zK11" 1577 | }, 1578 | "execution_count": 1, 1579 | "outputs": [] 1580 | }, 1581 | { 1582 | "cell_type": "code", 1583 | "source": [ 1584 | "from google.colab import userdata\n", 1585 | "openai_api_key = userdata.get('OPENAI_API_KEY')" 1586 | ], 1587 | "metadata": { 1588 | "id": "dnS-u234zK4Z" 1589 | }, 1590 | "execution_count": 2, 1591 | "outputs": [] 1592 | }, 1593 | { 1594 | "cell_type": "code", 1595 | "source": [ 1596 | "output_path = \"./images\"" 1597 | ], 1598 | "metadata": { 1599 | "id": "x5zgAXR2zK7J" 1600 | }, 1601 | "execution_count": 3, 1602 | "outputs": [] 1603 | }, 1604 | { 1605 | "cell_type": "code", 1606 | "source": [ 1607 | "# Get elements\n", 1608 | "raw_pdf_elements = partition_pdf(\n", 1609 | " filename=\"/content/AC-Aids-for-Dogs_Canine-Periodontal-Disease.pdf\",\n", 1610 | " extract_images_in_pdf=True,\n", 1611 | " infer_table_structure=True,\n", 1612 | " chunking_strategy=\"by_title\",\n", 1613 | " max_characters=4000,\n", 1614 | " new_after_n_chars=3800,\n", 1615 | " combine_text_under_n_chars=2000,\n", 1616 | " extract_image_block_output_dir=output_path,\n", 1617 | ")" 1618 | ], 1619 | "metadata": { 1620 | "colab": { 1621 | "base_uri": "https://localhost:8080/", 1622 | "height": 185, 1623 | "referenced_widgets": [ 1624 | "1add53568adc46d2b5db1915d8c2213d", 1625 | "5094d25d081541fb8517b76d7e2c0981", 1626 | "a7779fc06b814d19828a1db85b6023f1", 1627 | "f8d0262a97c34430be1018daf58d5865", 1628 | "7dd112470621458ab600fdd9f3806942", 1629 | "6f9cfbd6b6e043a29f07abb63fcab102", 1630 | "fed331066467473f9e15608afe5909c6", 1631 | "9ad99d5e37034c5698b34e9781616112", 1632 | "adb253cbb29444dba8553d37b324fdea", 1633 | "090c6f296b01428cbc94af75f188b27c", 1634 | "dcf1df1528f547729376f60a6486cae6", 1635 | "0da6cfb5f1214e478f996d1657f11977", 1636 | "5dd3cd164480493f8c24778b8c534aff", 1637 | "d935cf926c0047b68baed7ad340b6524", 1638 | "8ca1716d945146fda75297f4b4981145", 1639 | "fec79a20e11f47f1a2e14a5c2461c6fc", 1640 | "1563d2660f85468a82dae95c55783fa4", 1641 | "dca4cc9b977b4fff94d76449623b9dda", 1642 | "a27dd7076eed4b5aa88291caec14e162", 1643 | "e016e45e12b64c8a8ff1e90cec8ae9fc", 1644 | "5be15fdcb04e42b09113a68b2eaa21e8", 1645 | "3b45d3eb28c449f89403fcb7bc3d84c3", 1646 | "794b59f933a749e88c6ac73694f0d36a", 1647 | "c20d2db8870548979209ef73e6e125c3", 1648 | "fcc5d303f2d643bba765e27287212067", 1649 | "6f2a5ab0f9b44c3ebf2880ddf61317b9", 1650 | "dc6cecb88e1d4dadbf247140fb278dc7", 1651 | "4c0bf2156912436286f313c1c35418ed", 1652 | "9c3e160fb1464a29b2f6673952959219", 1653 | "61d9630501324251b8e91e60732bb56f", 1654 | "0d83444850874c0ca8bc6ff02b6dc8a3", 1655 | "e5b7873fc72140a68a2231175564a046", 1656 | "3a56855bd12f4c6992f7e51e64e5f246" 1657 | ] 1658 | }, 1659 | "id": "0usp5uBazK9h", 1660 | "outputId": "d01e3fa7-06b4-4e82-a04f-415ed8da41fd" 1661 | }, 1662 | "execution_count": 4, 1663 | "outputs": [ 1664 | { 1665 | "output_type": "display_data", 1666 | "data": { 1667 | "text/plain": [ 1668 | "config.json: 0%| | 0.00/1.47k [00:00" 2045 | ] 2046 | }, 2047 | "metadata": {} 2048 | } 2049 | ] 2050 | }, 2051 | { 2052 | "cell_type": "code", 2053 | "source": [], 2054 | "metadata": { 2055 | "id": "YfmC39EK5Bll" 2056 | }, 2057 | "execution_count": null, 2058 | "outputs": [] 2059 | } 2060 | ] 2061 | } --------------------------------------------------------------------------------