├── .gitignore ├── LICENSE ├── README.md ├── backend ├── .dockerignore ├── .env ├── .env.secret ├── Dockerfile ├── app_backend.py └── utils.py ├── frontend ├── Dockerfile └── app_frontend.py └── misc └── screenshot.png /.gitignore: -------------------------------------------------------------------------------- 1 | tmp 2 | pdf 3 | data 4 | notebook 5 | csv 6 | .env* 7 | frontend/flagged 8 | __pycache__ 9 | NOTES* 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-LLM Chat 2 | 3 | ![screenshot](misc/screenshot.png) 4 | 5 | This application provides the ability to query a collection of LLM models via a web 6 | based interface. Optionally users can provide URL(s) in the form of web sites, PDFs or CSVs which will be fed to the LLM of choice for additional context. 
Multi-LLM Chat automatically supports whichever models you install (see the instructions below).

Inference for OpenAI is provided via an Azure OpenAI endpoint; all other models are hosted via Ollama. Running instructions cover both local use (quickstart) and Azure Container Apps.


## Quickstart

This section assumes the following requirements are met:
- You have a host with Docker and CUDA configured (GPU support is needed for local model inference).
- You have your Azure OpenAI credentials and endpoint specified in `.env`.
- You are running the Qdrant vector database locally on port `6333`.

```
# Run the backend
docker run -d -p 8080:8080 -e OPENAI_API_KEY= simonj.azurecr.io/aca-multi-llm-backend

# Run the frontend
docker run -d -p 8088:8088 -e BACKEND_URL=http://172.17.0.1:8080 simonj.azurecr.io/aca-multi-llm-frontend

# Run the model endpoint
# we store models on the host filesystem under ollama-models
docker run -d -p 11434:11434 -v `pwd`/ollama-models:/root/.ollama --gpus=all --name ollama ollama/ollama

# Download models to Ollama
docker exec -it ollama ollama pull phi
docker exec -it ollama ollama pull llama2
docker exec -it ollama ollama pull mixtral
```
You should now be able to access the application at `http://localhost:8088`.


## Application Details

The application is composed of three microservices, each built and run as a Docker container.

### Frontend

The frontend is written in Python and uses [Gradio](https://www.gradio.app). It renders the UI and shuttles input/output between the user and the backend. By default the frontend runs on port `8088` and requires the environment variable `BACKEND_URL` in order to function (the default value is `http://localhost:8080`).

### Backend

The backend is written in Python and uses [FastAPI](https://fastapi.tiangolo.com). It acts as middleware, brokering frontend requests to either Azure OpenAI or the Ollama model endpoint depending on the user's choice. By default the backend listens on port `8080`.

Secrets and settings are handled via environment files and/or command-line environment variables (command-line variables override file-based variables). Secrets should be provided on the command line but can also be added to `.env.secret`.
Non-secret settings can be added to `.env` or similarly provided on the command line.

The backend requires the following settings:
* `AZURE_OPENAI_ENDPOINT=https://.openai.azure.com/`
* `AZURE_OPENAI_API_VERSION=2024-02-15-preview` (March '24 version, but this might change)
* `MODEL_DEPLOYMENT_NAME=` (configured during Azure OpenAI setup)
* `OLLAMA_BASE_URL=http://localhost:11434` (endpoint for Ollama)
* `OPENAI_API_KEY=`
* `QDRANT_ENDPOINT=http://localhost:6333` (where Qdrant runs)
* `QDRANT_COLLECTION=multi-llm-chat` (can mostly be left as is)

For instructions on how to configure Azure OpenAI and retrieve the above values, see the [Azure OpenAI details](https://learn.microsoft.com/en-us/azure/ai-services/openai/chatgpt-quickstart?tabs=command-line%2Cpython&pivots=programming-language-python).
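For reference, the backend's HTTP API (defined in `backend/app_backend.py`) can also be exercised directly: `GET /list_llm`, `POST /set_llm/{name}` and `POST /query_chat`. The sketch below is a minimal example using Python's `requests`, assuming the backend from the quickstart is reachable at `http://localhost:8080`; the question and URL are placeholders.

```python
import requests

BACKEND_URL = "http://localhost:8080"  # assumption: local backend from the quickstart

# list the models the backend currently offers (the installed Ollama models plus "openai")
models = requests.get(f"{BACKEND_URL}/list_llm").json()
print("available models:", models)

# select one of the returned models for subsequent queries
requests.post(f"{BACKEND_URL}/set_llm/{models[0]}")

# ask a question, optionally grounding it in a URL (web page, PDF or CSV)
payload = {"question": "What is this page about?",
           "urls": ["https://example.com"]}
result = requests.post(f"{BACKEND_URL}/query_chat", json=payload).json()

# the answer may be nested one level deeper, exactly as the frontend handles it
answer = result["response"]
if isinstance(answer, dict):
    answer = answer.get("response")
print(answer, "| took", result["time_taken"], "s")
```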
### Model Endpoint

For all models other than OpenAI we use [Ollama](https://ollama.com) running on a GPU. Ollama hosts and manages the different models and conveniently runs in a Docker container. When running locally, make sure to pass the required `--gpus=all` flag when starting the container (see more below). Ollama listens on port `11434`.

For a model to be hosted and offered as part of Multi-LLM Chat, we first need to tell Ollama to download it. We do this by running `ollama pull <modelname>`. Multi-LLM Chat will offer whatever models you pull into Ollama. The full model inventory is available in the [Ollama model library](https://ollama.com/library). For detailed running instructions please see below.
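If you prefer to script the model setup, Ollama also exposes an HTTP API on port `11434`: `GET /api/tags` lists the installed models (this is what the backend's `/list_llm` route calls) and `POST /api/pull` downloads one. A minimal sketch, assuming a local Ollama as in the quickstart; the request fields follow Ollama's API documentation and may change between versions.

```python
import requests

OLLAMA_BASE_URL = "http://localhost:11434"  # assumption: local Ollama from the quickstart

# pull a model; equivalent to `ollama pull phi` on the CLI
# ("stream": False asks Ollama for a single JSON status instead of a stream)
requests.post(f"{OLLAMA_BASE_URL}/api/pull",
              json={"name": "phi", "stream": False},
              timeout=600)

# list the installed models -- the backend's /list_llm reads this same endpoint
tags = requests.get(f"{OLLAMA_BASE_URL}/api/tags").json()
print([m["name"] for m in tags["models"]])
```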
## Running

### Azure Container Apps

To run this application on Azure Container Apps we set up a new environment with a GPU workload profile. While the frontend and backend can run on a consumption profile, it is essential to run the model endpoint (Ollama) on a GPU.

1. Set Environment Variables
Make sure you have GPU quota allocated in whichever location you specify.
```
export ACA_ENV=multi-llm
export RG=llm-playground
export LOCATION=westus3
export AZURE_STORAGE_ACCOUNT=multillmchat`head /dev/random | md5sum | cut -c1-8`
```
1. Create the Container Apps Environment and GPU Workload Profile
The GPU profile will automatically receive the profile name `gpu`.
```
az containerapp env create \
  --name $ACA_ENV \
  --resource-group $RG \
  --location $LOCATION \
  --enable-dedicated-gpu \
  --enable-workload-profiles
```
1. Launch the Qdrant Vector DB Add-on
Storing anything we ingest via the provided URLs in a vector database makes retrieval much faster the second time around. We therefore use Qdrant to store the embeddings for the content we ingest (see the sketch after these steps for a way to inspect the collection directly).
```
az containerapp add-on qdrant create \
  --environment $ACA_ENV \
  --resource-group $RG --name qdrantdb-multi-llm
```
1. Stand Up the Backend
Assuming we provided the majority of our settings in `.env`, we only need to inject the `OPENAI_API_KEY`. The `OLLAMA_BASE_URL` is set to `http://model-endpoint` below; there is no need to change that unless you use a different name for the model endpoint app further down. To point the backend to our Qdrant database we set `QDRANT_ENDPOINT=http://qdrantdb-multi-llm:6333`. Also note: Ollama's usual default port of `11434` is not used because of Container Apps' internal ingress handling.
```
az containerapp create \
  --name backend-service-multi-llm \
  --resource-group $RG \
  --environment $ACA_ENV \
  --cpu 2 --memory 4Gi \
  --image simonj.azurecr.io/aca-multi-llm-backend \
  --min-replicas 1 --max-replicas 1 \
  --target-port 8080 --ingress internal \
  --env-vars \
    OLLAMA_BASE_URL=http://model-endpoint \
    OPENAI_API_KEY= \
    QDRANT_ENDPOINT=http://qdrantdb-multi-llm:6333
```
1. Stand Up the Frontend
No changes should be required unless you've changed the name of the backend app, in which case you will need to adjust `BACKEND_URL` accordingly.
```
az containerapp create \
  --name frontend-service-multi-llm \
  --resource-group $RG \
  --environment $ACA_ENV \
  --cpu 4 --memory 8Gi \
  --image simonj.azurecr.io/aca-multi-llm-frontend \
  --min-replicas 1 --max-replicas 1 \
  --target-port 8088 --ingress external \
  --env-vars \
    BACKEND_URL=http://backend-service-multi-llm
```
1. Stand Up the Model Endpoint
The command below makes sure that Ollama runs on our GPU workload profile.
```
az containerapp create \
  --name model-endpoint \
  --resource-group $RG \
  --environment $ACA_ENV \
  --workload-profile-name gpu \
  --cpu 12 --memory 24Gi \
  --image ollama/ollama \
  --min-replicas 1 --max-replicas 1 \
  --target-port 11434 \
  --ingress internal
```
1. Create Storage for our Models
This step is optional if you don't intend to install too many or overly large models. Each container gets an 8Gi allocation of ephemeral storage, which should be sufficient for a couple of models (for example, Phi-2 needs 1.6Gi, Llama 2 requires 3.8Gi and Mistral needs 4.1Gi; for a more complete list of model sizes see [Ollama's model library reference](https://github.com/ollama/ollama?tab=readme-ov-file#model-library)). Please note that with this approach your models will need to be re-pulled after a container restart.
Otherwise, Ollama needs a fair amount of space to store the models (3-80Gi per model). These numbers exceed the ephemeral storage containers get on Azure Container Apps, so to accommodate this we create dedicated storage to keep the models. We hence have to do the following:
* Create a storage account
* Create a share using the account
* Add the storage to the Container Apps environment
* Mount the storage into the Ollama container
```
# create the account
az storage account create \
  --name $AZURE_STORAGE_ACCOUNT \
  --resource-group $RG \
  --location $LOCATION \
  --sku Premium_LRS \
  --kind FileStorage \
  --https-only true \
  --allow-blob-public-access false \
  --allow-shared-key-access true \
  --default-action Allow \
  --bypass AzureServices

# we create a share (100Gi in size)
az storage share-rm create \
  --resource-group $RG \
  --storage-account $AZURE_STORAGE_ACCOUNT \
  --name model-store \
  --quota 100 \
  --enabled-protocols SMB \
  --access-tier Premium

# we retrieve and store the secret we need next
export AZURE_STORAGE_KEY=`az storage account keys list -g $RG --account-name $AZURE_STORAGE_ACCOUNT --query "[0].value" --output tsv`

# we allocate the share to the ACA environment
az containerapp env storage set \
  --access-mode ReadWrite \
  --azure-file-account-name $AZURE_STORAGE_ACCOUNT \
  --azure-file-account-key $AZURE_STORAGE_KEY \
  --azure-file-share-name model-store \
  --storage-name azurefilesstorage \
  --name $ACA_ENV \
  --resource-group $RG \
  --output table

# now we mount the storage to the Ollama container
az containerapp show \
  --name model-endpoint \
  --resource-group $RG \
  --output yaml > model-endpoint-definition.yaml

# edit the model-endpoint-definition.yaml file to look like this:
template:
  containers:
  - image: ollama/ollama
    name: model-endpoint
    resources:
      cpu: 12.0
      ephemeralStorage: 8Gi
      memory: 24Gi
    volumeMounts:                 << new section start
    - volumeName: model-storage
      mountPath: /root/.ollama    << new section end
  initContainers: null
  revisionSuffix: ''
  scale:
    maxReplicas: 1
    minReplicas: 1
    rules: null
  serviceBinds: null
  terminationGracePeriodSeconds: null
  volumes:                        << new section start
  - name: model-storage
    storageName: azurefilesstorage
    storageType: AzureFile        << new section end

# now reapply the yaml file to update the application/model-endpoint
az containerapp update \
  --name model-endpoint \
  --resource-group $RG \
  --yaml model-endpoint-definition.yaml
```
1. Download the Models
After the model container comes up we need to ensure our models get deployed. Currently this is a manual process in which we instruct Ollama to pull the models and make them available for eventual inference requests via its API. To download the models we use the `ollama` CLI; from the Azure CLI we can run the commands below. Alternatively, we can run the same commands from the portal's web console.
```
az containerapp exec \
  --name model-endpoint \
  --resource-group $RG \
  --container model-endpoint \
  --command 'ollama pull phi'

az containerapp exec \
  --name model-endpoint \
  --resource-group $RG \
  --container model-endpoint \
  --command 'ollama pull llama2'
```
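As a sanity check for the Qdrant add-on referenced in the steps above, you can inspect the collection directly and see whether a previously ingested URL already has embeddings stored. The sketch below mirrors what `is_vector_in_db` in `backend/utils.py` does; the endpoint URL and the example document URL are assumptions, so point them at your own Qdrant instance and an already ingested page.

```python
import qdrant_client
from qdrant_client.http.models import FieldCondition, Filter, MatchValue

# assumptions: a locally reachable Qdrant and the default collection name
client = qdrant_client.QdrantClient(url="http://localhost:6333")

# web pages are stored under the "document_id" metadata key;
# downloaded PDFs/CSVs are stored under "file_path" instead
points, _ = client.scroll(
    collection_name="multi-llm-chat",
    scroll_filter=Filter(must=[
        FieldCondition(key="document_id", match=MatchValue(value="https://example.com"))
    ]),
)
print("already ingested" if points else "not ingested yet")
```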
## Limitations & Todo
* Multi-LLM Chat is not intended for production use.
* Dependencies should be moved from the Dockerfiles into a `requirements.txt` (a sketch of what this could look like follows the backend Dockerfile below).
* Logging should be moved to a proper logging library.
* The RAG setup is currently fairly naive.
* A troubleshooting section should be added to the documentation.
--------------------------------------------------------------------------------
/backend/.dockerignore:
--------------------------------------------------------------------------------
./data
./notebook
*secret*
./pdf
./csv
--------------------------------------------------------------------------------
/backend/.env:
--------------------------------------------------------------------------------
AZURE_OPENAI_ENDPOINT=https://.openai.azure.com/
AZURE_OPENAI_API_VERSION=2024-02-15-preview
MODEL_DEPLOYMENT_NAME=
OLLAMA_BASE_URL=http://172.17.0.1:11434
QDRANT_ENDPOINT=http://172.17.0.1:6333
QDRANT_COLLECTION=multi-llm-chat
--------------------------------------------------------------------------------
/backend/.env.secret:
--------------------------------------------------------------------------------
OPENAI_API_KEY=
--------------------------------------------------------------------------------
/backend/Dockerfile:
--------------------------------------------------------------------------------
# Use an official Python runtime as a parent image
FROM python:3.12-slim-bullseye

RUN apt-get update && apt-get install -y \
    libxml2-dev \
    libxslt-dev \
    build-essential \
    python3-numpy

RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
    fastapi \
    uvicorn \
    llama-index-llms-azure-openai \
    llama-index-embeddings-huggingface \
    llama-index-llms-ollama \
    pydantic-settings \
    llama-index-readers-web \
    llama-index \
    llama-index-vector-stores-qdrant \
    trafilatura \
    qdrant-client \
    ollama

WORKDIR /app

ADD . /app

# Expose port 8080 to the world outside this container
EXPOSE 8080

# Run app_backend.py via uvicorn when the container launches
CMD ["uvicorn", "app_backend:app", "--host", "0.0.0.0", "--port", "8080"]
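As called out in the Limitations section above, the dependencies installed here would ideally move into a `requirements.txt` that the Dockerfile installs with `pip install -r requirements.txt`. A possible sketch, simply mirroring the package list above (unpinned, as in the original Dockerfile):

```
fastapi
uvicorn
llama-index
llama-index-llms-azure-openai
llama-index-llms-ollama
llama-index-embeddings-huggingface
llama-index-readers-web
llama-index-vector-stores-qdrant
pydantic-settings
trafilatura
qdrant-client
ollama
```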
--------------------------------------------------------------------------------
/backend/app_backend.py:
--------------------------------------------------------------------------------
from llama_index.core import SummaryIndex

# from IPython.display import Markdown, display
import os
import time
import json

from llama_index.core.llms import ChatMessage

import requests

from dotenv import dotenv_values
from pydantic_settings import BaseSettings

from typing import List
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, HttpUrl

import utils

# download and install dependencies
#OllamaQueryEnginePack = download_llama_pack(
#    "OllamaQueryEnginePack", "./ollama_pack"
#)

class Settings(BaseSettings):
    MODEL_DEPLOYMENT_NAME: str = "None"
    AZURE_OPENAI_ENDPOINT: str = "None"
    OPENAI_API_KEY: str = "None"
    AZURE_OPENAI_API_VERSION: str = "None"
    OLLAMA_BASE_URL: str = "None"
    QDRANT_ENDPOINT: str = "None"
    QDRANT_COLLECTION: str = "None"

    # ignore any other env variables
    class Config:
        extra = "ignore"


app = FastAPI()

@app.on_event("startup")
async def startup_event():
    config = {
        **dotenv_values(".env"),         # load shared development variables
        **dotenv_values(".env.secret"),  # load sensitive variables
        **os.environ,                    # override loaded values with environment variables
    }
    print("Using the following settings:")
    mysettings = Settings(**config)
    print("+------------------------------------+")
    print("MODEL_DEPLOYMENT_NAME: %s" % mysettings.MODEL_DEPLOYMENT_NAME)
    print("AZURE_OPENAI_ENDPOINT: %s" % mysettings.AZURE_OPENAI_ENDPOINT)
    print("OPENAI_API_KEY: ....%s" % mysettings.OPENAI_API_KEY[-5:])
    print("AZURE_OPENAI_API_VERSION: %s" % mysettings.AZURE_OPENAI_API_VERSION)
    print("OLLAMA_BASE_URL: %s" % mysettings.OLLAMA_BASE_URL)
    print("QDRANT_ENDPOINT: %s" % mysettings.QDRANT_ENDPOINT)
    print("QDRANT_COLLECTION: %s" % mysettings.QDRANT_COLLECTION)
    print("+------------------------------------+")
    app.state.settings = mysettings


@app.get("/list_llm")
async def list_llm():
    response = requests.get(f"{app.state.settings.OLLAMA_BASE_URL}/api/tags")
    model_list = []
    if response.status_code == 200:
        models = response.json()['models']
        for model in models:
            model_list.append(model['name'].replace(":latest", ""))
        print("Found the following Ollama models: %s" % model_list)
        # always support openai
        model_list.append("openai")
        return model_list
    else:
        raise HTTPException(status_code=400, detail="Could not fetch LLM list")


@app.post("/set_llm/{name}")
async def set_llm(name: str):
    llm, embed_model = utils.get_llm(name, app.state.settings)
    #print(dir(llm))
    print("Model %s loaded" % llm.model)
    app.state.llm = llm
    app.state.embed_model = embed_model
    return {"message": f"Set llm to {name}"}


class Query(BaseModel):
    question: str
    urls: List[HttpUrl]
@app.post("/query_chat")
async def query_chat(query: Query):
    # check if an llm instance exists
    if not hasattr(app.state, 'llm'):
        raise HTTPException(status_code=400, detail="Please choose an LLM")
    end, start = 0, 0
    response = "Error: No response found"
    if query.urls:
        # we only support one URL for now because otherwise merging the fetched documents is harder
        urls = [str(u) for u in query.urls][:1]
        app.state.dbclient = utils.get_dbclient(app.state.settings)
        query_engine = utils.get_query_engine(urls, app.state.dbclient, app.state.settings.QDRANT_COLLECTION, app.state.embed_model)
        start = time.time()
        print("Querying LLM...")
        response = query_engine.query(query.question)
        end = time.time()
    else:
        llm = app.state.llm
        print("No documents to index... querying LLM...")
        start = time.time()
        response = llm.chat([ChatMessage(role="user", content=query.question)])
        response = {"response": response.message.content}
        end = time.time()
    print("Query complete...")
    exec_time = end - start
    print(f"The query took {exec_time:.1f} seconds to execute.")
    print(response)
    return {"response": response, "time_taken": f"{exec_time:.1f}"}
--------------------------------------------------------------------------------
/backend/utils.py:
--------------------------------------------------------------------------------
from llama_index.core import SummaryIndex
from llama_index.core import VectorStoreIndex
import os
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter, FilterCondition, MetadataFilter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
import qdrant_client
from qdrant_client.http.models import MatchValue, FieldCondition, Filter
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core.llms import ChatMessage

from llama_index.readers.web import TrafilaturaWebReader
from trafilatura.spider import focused_crawler
import time
import requests
from urllib.parse import urlparse
from llama_index.core import SimpleDirectoryReader
from llama_index.llms.ollama import Ollama

from typing import List
from fastapi import HTTPException


def get_llm(name="openai", settings=None):
    if not settings:
        raise HTTPException(status_code=400, detail="Please provide settings")
    # we use the same HuggingFace embedding model for all LLMs
    #Settings.embed_model = OpenAIEmbedding()
    Settings.embed_model = HuggingFaceEmbedding(
        model_name="BAAI/bge-small-en-v1.5"
    )

    if name == "openai":
        llm = AzureOpenAI(
            engine=settings.MODEL_DEPLOYMENT_NAME,
            model="gpt-35-turbo",
            temperature=0.0,
            azure_endpoint=settings.AZURE_OPENAI_ENDPOINT,
            api_key=settings.OPENAI_API_KEY,
            api_version=settings.AZURE_OPENAI_API_VERSION)
    # all other models we get from Ollama
    else:
        llm = Ollama(
            model=name,
            request_timeout=180.0,
            base_url=settings.OLLAMA_BASE_URL,
            temperature=0.0)

    Settings.llm = llm
    return llm, Settings.embed_model
def get_fetch_urls(urls, skip_reading=False):
    fetch_list = []
    # BUG: OR condition is not working
    # https://github.com/run-llama/llama_index/issues?q=is%3Aissue+is%3Aopen+FilterCondition.OR
    # hence we have to turn crawling off since we can't get the docs later
    """
    for url in urls:
        to_visit, known_urls = focused_crawler(url, max_seen_urls=10, max_known_urls=100000)
        print(f"visit ${to_visit} known ${known_urls}")
        # this is a workaround for now because the to_visit list seems empty so we grab some from known
        if not to_visit:
            for url in known_urls:
                response = requests.head(url)
                if response.status_code == 200: fetch_list.append(url)
        fetch_list = fetch_list + list(to_visit)
        # we want to make sure the original URL is definitely in the list
        if not url in fetch_list:
            print(fetch_list)
            print("adding %s to the list" % url)
            fetch_list.insert(0, url)
    # if we skip reading we know we already have the parent in the vector db
    # we just needed the other URLs as query keys
    if skip_reading:
        return fetch_list
    """
    fetch_list.append(urls[0][1])
    print(f"fetch_list: {fetch_list}")
    documents = []
    reader = TrafilaturaWebReader()
    td = None
    for url in fetch_list[0:10]:
        try:
            td = reader.load_data([url])
        except Exception:
            print("error with %s" % url)
            # skip URLs that failed to load instead of appending stale/None results
            continue
        documents = documents + td
    print("fetched a total of %s documents" % len(documents))
    return documents


def _get_file_destination(url, type, askey=False):
    path = urlparse(url).path
    name = path.split("/")[-1]
    if not askey:
        return os.path.join(".", type, name)
    else:
        return os.path.join(type, name)


def get_download_files(urls):
    for dir in ["pdf", "csv"]:
        if not os.path.exists(os.path.join(".", dir)):
            os.makedirs(dir)
    all_files = []
    for url in urls:
        type = url[0]
        furl = url[1]
        response = requests.get(furl)
        if response.status_code == 200:
            destination = _get_file_destination(furl, type)
            fd = os.open(destination, os.O_WRONLY | os.O_CREAT)
            with open(fd, 'wb') as output_file:
                output_file.write(response.content)
            all_files.append(destination)
        else:
            print(f"ERROR: response status was {response.status_code}")
    if all_files:
        reader = SimpleDirectoryReader(input_files=all_files, num_files_limit=10)
        return reader.load_data()
    else:
        return None


def determine_types(urls):
    def _get_content_type(c_url):
        response = requests.head(c_url)
        return response.headers.get('Content-Type')
    fetch_urls = []
    for url in urls:
        content_type = _get_content_type(url)
        if 'text/csv' in content_type:
            print(f'URL {url} points to a CSV file.')
            fetch_urls.append(['csv', url])
        elif 'application/pdf' in content_type:
            print(f'URL {url} points to a PDF file.')
            fetch_urls.append(['pdf', url])
        elif 'text/html' in content_type:
            # some pdfs apparently have text/html content type
            if url.endswith('.pdf'):
                print(f'URL {url} points to a PDF file.')
                fetch_urls.append(['pdf', url])
            else:
                print(f"URL {url} points to a web page.")
                fetch_urls.append(['web', url])
        else:
            print('This URL points to %s of content.' % content_type)
    return fetch_urls
def ingest_urls(url_types):
    print("Categorized URLs as follows: %s" % url_types)

    download_urls = [url for url in url_types if url[0] != "web"]
    if download_urls:
        print("Downloading the following urls: %s" % download_urls)
        return get_download_files(download_urls)

    fetch_urls = [url for url in url_types if url[0] == "web"]
    if fetch_urls:
        print("Fetching the following urls: %s" % fetch_urls)
        return get_fetch_urls(fetch_urls)
    return None


def get_dbclient(settings):
    client = qdrant_client.QdrantClient(
        url=settings.QDRANT_ENDPOINT
    )
    return client


def is_vector_in_db(qclient, collection_name, key, value):
    print(f"Checking if {key} with value {value} is in the vector store")
    try:
        result = qclient.scroll(collection_name=collection_name,
                                scroll_filter=Filter(must=[
                                    FieldCondition(key=key, match=MatchValue(value=value))
                                ]))
        return result[0]
    except Exception:
        return None


def get_preseeded_query_engine(type, urls, index):
    filters = []
    print(f"Getting preseeded query engine for {type} with urls {urls}")
    if type == "web":
        for url in urls:
            filter = MetadataFilter(key="document_id", value=url)
            filters.append(filter)
    else:
        for url in urls:
            file_location = _get_file_destination(url, type, askey=True)
            filter = MetadataFilter(key="file_path", value=file_location)
            filters.append(filter)
    query_engine = index.as_query_engine(
        vector_store_query_mode="default",
        filters=MetadataFilters(filters=filters))
    return query_engine


def get_query_engine(urls, dbclient, collection_name, embed_model=None):
    # see if the urls are already in the vector store
    filter = None
    url_and_types = determine_types(urls)
    vector_store = QdrantVectorStore(client=dbclient, collection_name=collection_name)
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

    if url_and_types:
        type = url_and_types[0][0]
        urls = [url_and_types[0][1]]
        # we query qdrant for metadata because with query_engine we don't know
        # if we have documents until we inference with the llm
        in_db = None
        if type == "web":
            in_db = is_vector_in_db(dbclient, collection_name, "document_id", urls[0])
        elif type == "csv" or type == "pdf":
            in_db = is_vector_in_db(dbclient, collection_name, "file_path", _get_file_destination(urls[0], type, askey=True))
        if in_db:
            print(f"Found documents in the vector store for {type} with urls {urls}")
            return get_preseeded_query_engine(type, urls, index)

    # we didn't find anything in the vector db so we fetch and index it anew
    docs = ingest_urls(url_and_types)
    start = time.time()
    vector_store = QdrantVectorStore(client=dbclient, collection_name=collection_name)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(docs, embed_model=embed_model, storage_context=storage_context)
    end = time.time()
    exec_time = end - start
    print(f"Indexing/embedding took {exec_time:.1f} seconds")
    return index.as_query_engine()
--------------------------------------------------------------------------------
/frontend/Dockerfile:
--------------------------------------------------------------------------------
# Use an official Python runtime as a parent image
FROM python:3.11-slim-bookworm
#FROM nvidia/cuda:12.0.1-runtime-ubuntu22.04

RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    python3 -m pip install --no-cache-dir \
    gradio \
    requests

WORKDIR /app
#ENV HF_HOME=${WORKDIR}/.cache

# Add the current directory contents into the container at /app
ADD . /app

# Expose port 8088 to the world outside this container
EXPOSE 8088

# Run app_frontend.py when the container launches
CMD ["python3", "app_frontend.py"]
--------------------------------------------------------------------------------
/frontend/app_frontend.py:
--------------------------------------------------------------------------------
import gradio as gr
import requests
import os

model_endpoint = os.getenv('BACKEND_URL', 'http://localhost:8080')


def ask_question(question, model, urls):
    requests.post(f"{model_endpoint}/set_llm/{model}")
    if urls:
        urls_list = [url.strip() for url in urls.split(',')]
    else:
        urls_list = []

    api_payload = {
        "question": question,
        "urls": urls_list
    }
    response = requests.post("%s/query_chat" % model_endpoint, json=api_payload)
    res = response.json()
    # the llm response is nested; other nodes are source_nodes, template
    answer = res.get('response', "Error: No response found")
    if isinstance(answer, dict):
        answer = answer.get("response", "Error: No response found")
    time = res.get('time_taken', "Error: No time found")
    return answer, time


def get_model_info():
    # currently unused; the backend does not expose an /info endpoint
    response = requests.get("%s/info" % model_endpoint)
    info = response.json()
    return info


def list_llm():
    response = requests.get(f"{model_endpoint}/list_llm")
    models = response.json()
    return models


question_input = gr.Textbox("What would you like to know?", label="Ask a question!", lines=1)
prompt_output = gr.Textbox("Here's my answer!", label="Query Result", lines=10)
time_output = gr.Textbox("0.0", label="Inference Time (seconds)", lines=1)
model_selection = gr.Radio(choices=list_llm(), label="Model", value="openai")
url_input = gr.Textbox(lines=1, placeholder="accepted types are CSV, web, PDF", label="Ingest URL")

iface = gr.Interface(
    fn=ask_question,
    inputs=[question_input, model_selection, url_input],
    outputs=[prompt_output, time_output],
    title="Multi-LLM Chat",
    description="Pick your model, ask a question, and get an answer!",
    theme=gr.themes.Soft(),
    allow_flagging="never",
    css='footer{display:none !important}')
iface.launch(inline=True, share=False, server_name="0.0.0.0", server_port=8088)

# description="Type a question and get an answer from the LLM. " + str(get_model_info())
--------------------------------------------------------------------------------
/misc/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simonjj/multi-llm-chat/963a3770d6db8fc4f82ab96410db499bcf1dd8d6/misc/screenshot.png
--------------------------------------------------------------------------------