├── .dockerignore ├── .env.example ├── .github └── pull_request_template.md ├── .gitignore ├── .well-known ├── ai-plugin.json ├── logo.png └── openapi.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── assets └── example.png ├── datastore ├── __init__.py ├── datastore.py ├── factory.py └── providers │ ├── __init__.py │ ├── analyticdb_datastore.py │ ├── azurecosmosdb_datastore.py │ ├── azuresearch_datastore.py │ ├── chroma_datastore.py │ ├── elasticsearch_datastore.py │ ├── llama_datastore.py │ ├── milvus_datastore.py │ ├── mongodb_atlas_datastore.py │ ├── pgvector_datastore.py │ ├── pinecone_datastore.py │ ├── postgres_datastore.py │ ├── qdrant_datastore.py │ ├── redis_datastore.py │ ├── supabase_datastore.py │ ├── weaviate_datastore.py │ └── zilliz_datastore.py ├── docs ├── deployment │ ├── flyio.md │ ├── heroku.md │ ├── other-options.md │ ├── removing-unused-dependencies.md │ ├── render-thumbnail.png │ └── render.md ├── deprecated │ └── plugins.md └── providers │ ├── analyticdb │ └── setup.md │ ├── azurecosmosdb │ └── setup.md │ ├── azuresearch │ └── setup.md │ ├── chroma │ └── setup.md │ ├── elasticsearch │ └── setup.md │ ├── llama │ └── setup.md │ ├── milvus │ └── setup.md │ ├── mongodb │ └── setup.md │ ├── pinecone │ └── setup.md │ ├── postgres │ └── setup.md │ ├── qdrant │ └── setup.md │ ├── redis │ └── setup.md │ ├── supabase │ └── setup.md │ ├── weaviate │ └── setup.md │ └── zilliz │ └── setup.md ├── examples ├── authentication-methods │ ├── no-auth │ │ ├── ai-plugin.json │ │ └── main.py │ ├── oauth │ │ └── ai-plugin.json │ ├── service-http │ │ └── ai-plugin.json │ └── user-http │ │ └── ai-plugin.json ├── docker │ ├── elasticsearch │ │ ├── README.md │ │ └── docker-compose.yaml │ ├── milvus │ │ └── docker-compose.yaml │ ├── qdrant │ │ ├── README.md │ │ ├── docker-compose.yaml │ │ ├── documents.json │ │ └── queries.json │ └── redis │ │ └── docker-compose.yml ├── function-calling │ └── README.md ├── memory │ ├── README.md │ ├── ai-plugin.json │ ├── main.py │ └── openapi.yaml └── providers │ ├── azurecosmosdb │ └── semantic-search.ipynb │ ├── elasticsearch │ └── search.ipynb │ ├── mongodb │ └── semantic-search.ipynb │ ├── pinecone │ └── semantic-search.ipynb │ ├── redis │ └── semantic-search-and-filter.ipynb │ └── supabase │ ├── .gitignore │ ├── config.toml │ ├── migrations │ └── 20230414142107_init_pg_vector.sql │ └── seed.sql ├── local_server ├── ai-plugin.json ├── logo.png ├── main.py └── openapi.yaml ├── models ├── api.py └── models.py ├── poetry.lock ├── pyproject.toml ├── scripts ├── process_json │ ├── README.md │ ├── example.json │ └── process_json.py ├── process_jsonl │ ├── README.md │ ├── example.jsonl │ └── process_jsonl.py └── process_zip │ ├── README.md │ ├── example.zip │ └── process_zip.py ├── server └── main.py ├── services ├── chunks.py ├── date.py ├── extract_metadata.py ├── file.py ├── openai.py └── pii_detection.py └── tests ├── __init__.py └── datastore └── providers ├── analyticdb └── test_analyticdb_datastore.py ├── azurecosmosdb └── test_azurecosmosdb_datastore.py ├── azuresearch └── test_azuresearch_datastore.py ├── chroma └── test_chroma_datastore.py ├── elasticsearch └── test_elasticsearch_datastore.py ├── llama └── test_llama_datastore.py ├── milvus └── test_milvus_datastore.py ├── mongodb_atlas ├── test_integration.py └── test_mongodb_datastore.py ├── postgres └── test_postgres_datastore.py ├── qdrant └── test_qdrant_datastore.py ├── redis └── test_redis_datastore.py ├── supabase └── test_supabase_datastore.py ├── weaviate ├── docker-compose.yml 
└── test_weaviate_datastore.py └── zilliz └── test_zilliz_datastore.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Ignore files that are already ignored by git 2 | .gitignore 3 | 4 | scripts/ 5 | tests/ 6 | examples/ 7 | local_server/ 8 | assets/ 9 | *.md 10 | *.pyc 11 | .dockerignore 12 | Dockerfile 13 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Core environment variables 2 | DATASTORE="" 3 | BEARER_TOKEN="" 4 | OPENAI_API_KEY="" 5 | EMBEDDING_DIMENSION=256 # edit this value based on the dimension of the embeddings you want to use 6 | EMBEDDING_MODEL="text-embedding-3-large" # edit this value based on the model you want to use e.g. text-embedding-3-small, text-embedding-ada-002 7 | 8 | # Optional environment variables for Azure OpenAI 9 | OPENAI_API_BASE="https://.openai.azure.com/" 10 | OPENAI_API_TYPE="azure" 11 | OPENAI_EMBEDDINGMODEL_DEPLOYMENTID="" 12 | OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID="" 13 | OPENAI_COMPLETIONMODEL_DEPLOYMENTID="" 14 | OPENAI_EMBEDDING_BATCH_SIZE="" 15 | 16 | # Pinecone configuration 17 | PINECONE_API_KEY="" 18 | PINECONE_ENVIRONMENT="" 19 | PINECONE_INDEX="" 20 | 21 | # Weaviate configuration 22 | WEAVIATE_URL="" 23 | WEAVIATE_API_KEY="" 24 | WEAVIATE_CLASS="" 25 | 26 | # Zilliz configuration 27 | ZILLIZ_COLLECTION="" 28 | ZILLIZ_URI="" 29 | ZILLIZ_USER="" 30 | ZILLIZ_PASSWORD="" 31 | 32 | # Milvus configuration 33 | MILVUS_COLLECTION="" 34 | MILVUS_HOST="" 35 | MILVUS_PORT="" 36 | MILVUS_USER="" 37 | MILVUS_PASSWORD="" 38 | 39 | # Qdrant configuration 40 | QDRANT_URL="" 41 | QDRANT_PORT="" 42 | QDRANT_GRPC_PORT="" 43 | QDRANT_API_KEY="" 44 | QDRANT_COLLECTION="" 45 | 46 | # AnalyticDB configuration 47 | PG_HOST="" 48 | PG_PORT="" 49 | PG_USER="" 50 | PG_PASSWORD="" 51 | PG_DATABASE="" 52 | PG_COLLECTION="" 53 | 54 | # Redis configuration 55 | REDIS_HOST="" 56 | REDIS_PORT="" 57 | REDIS_PASSWORD="" 58 | REDIS_INDEX_NAME="" 59 | REDIS_DOC_PREFIX="" 60 | REDIS_DISTANCE_METRIC="" 61 | REDIS_INDEX_TYPE="" 62 | 63 | # Llama configuration 64 | LLAMA_INDEX_TYPE="" 65 | LLAMA_INDEX_JSON_PATH="" 66 | LLAMA_QUERY_KWARGS_JSON_PATH="" 67 | LLAMA_RESPONSE_MODE="" 68 | 69 | # Chroma configuration 70 | CHROMA_COLLECTION="" 71 | CHROMA_IN_MEMORY="" 72 | CHROMA_PERSISTENCE_DIR="" 73 | CHROMA_HOST="" 74 | CHROMA_PORT="" 75 | 76 | # Azure Cognitive Search configuration 77 | AZURESEARCH_SERVICE="" 78 | AZURESEARCH_INDEX="" 79 | AZURESEARCH_API_KEY="" # (optional, uses key-free managed identity if not set) 80 | 81 | # Azure CosmosDB Mongo vCore configuration 82 | AZCOSMOS_API="" 83 | AZCOSMOS_CONNSTR="" 84 | AZCOSMOS_DATABASE_NAME="" 85 | AZCOSMOS_CONTAINER_NAME="" 86 | 87 | # Supabase configuration 88 | SUPABASE_URL="" 89 | SUPABASE_ANON_KEY="" 90 | 91 | # Postgres configuration 92 | PG_HOST="" 93 | PG_PORT="" 94 | PG_USER="" 95 | PG_PASSWORD="" 96 | PG_DB="" 97 | 98 | # Elasticsearch configuration 99 | ELASTICSEARCH_URL="" # (either specify host or cloud_id) 100 | ELASTICSEARCH_CLOUD_ID="" 101 | ELASTICSEARCH_USERNAME="" 102 | ELASTICSEARCH_PASSWORD="" 103 | ELASTICSEARCH_API_KEY="" 104 | ELASTICSEARCH_INDEX="" 105 | ELASTICSEARCH_REPLICAS="" 106 | ELASTICSEARCH_SHARDS="" -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## 
Pull Request (PR) Checklist 2 | If you'd like to contribute, please follow the checklist below when submitting a PR. This will help us review and merge your changes faster! Thank you for contributing! 3 | 4 | 1. **Type of PR**: Indicate the type of PR by adding a label in square brackets at the beginning of the title, such as `[Bugfix]`, `[Feature]`, `[Enhancement]`, `[Refactor]`, or `[Documentation]`. 5 | 6 | 2. **Short Description**: Provide a brief, informative description of the PR that explains the changes made. 7 | 8 | 3. **Issue(s) Linked**: Mention any related issue(s) by using the keyword `Fixes` or `Closes` followed by the respective issue number(s) (e.g., Fixes #123, Closes #456). 9 | 10 | 4. **Branch**: Ensure that you have created a new branch for the changes, and it is based on the latest version of the `main` branch. 11 | 12 | 5. **Code Changes**: Make sure the code changes are minimal, focused, and relevant to the issue or feature being addressed. 13 | 14 | 6. **Commit Messages**: Write clear and concise commit messages that explain the purpose of each commit. 15 | 16 | 7. **Tests**: Include unit tests and/or integration tests for any new code or changes to existing code. Make sure all tests pass before submitting the PR. 17 | 18 | 8. **Documentation**: Update relevant documentation (e.g., README, inline comments, or external documentation) to reflect any changes made. 19 | 20 | 9. **Review Requested**: Request a review from at least one other contributor or maintainer of the repository. 21 | 22 | 10. **Video Submission** (For Complex/Large PRs): If your PR introduces significant changes, complexities, or a large number of lines of code, submit a brief video walkthrough along with the PR. The video should explain the purpose of the changes, the logic behind them, and how they address the issue or add the proposed feature. This will help reviewers to better understand your contribution and expedite the review process. 23 | 24 | ## Pull Request Naming Convention 25 | 26 | Use the following naming convention for your PR branches: 27 | 28 | ``` 29 | /- 30 | ``` 31 | 32 | - ``: The type of PR, such as `bugfix`, `feature`, `enhancement`, `refactor`, or `docs`. Multiple types are ok and should appear as , 33 | - ``: A brief description of the changes made, using hyphens to separate words. 34 | - ``: The issue number associated with the changes made (if applicable). 35 | 36 | Example: 37 | 38 | ``` 39 | feature/advanced-chunking-strategy-123 40 | ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # .vscode files 30 | .vscode/* 31 | 32 | # Pycharm 33 | .idea/ 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | myvenv/ 118 | 119 | # Exception for .env.example 120 | !.env.example 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # macOS .DS_Store files 141 | .DS_Store -------------------------------------------------------------------------------- /.well-known/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. 
Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "user_http", 9 | "authorization_type": "bearer" 10 | }, 11 | "api": { 12 | "type": "openapi", 13 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 14 | "has_user_authentication": false 15 | }, 16 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 17 | "contact_email": "hello@contact.com", 18 | "legal_info_url": "http://example.com/legal-info" 19 | } 20 | -------------------------------------------------------------------------------- /.well-known/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/.well-known/logo.png -------------------------------------------------------------------------------- /.well-known/openapi.yaml: -------------------------------------------------------------------------------- 1 | openapi: 3.0.2 2 | info: 3 | title: Retrieval Plugin API 4 | description: A retrieval API for querying and filtering documents based on natural language queries and metadata 5 | version: 1.0.0 6 | servers: 7 | - url: https://your-app-url.com 8 | paths: 9 | /query: 10 | post: 11 | summary: Query 12 | description: Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. Split queries if ResponseTooLargeError occurs. 13 | operationId: query_query_post 14 | requestBody: 15 | content: 16 | application/json: 17 | schema: 18 | $ref: "#/components/schemas/QueryRequest" 19 | required: true 20 | responses: 21 | "200": 22 | description: Successful Response 23 | content: 24 | application/json: 25 | schema: 26 | $ref: "#/components/schemas/QueryResponse" 27 | "422": 28 | description: Validation Error 29 | content: 30 | application/json: 31 | schema: 32 | $ref: "#/components/schemas/HTTPValidationError" 33 | security: 34 | - HTTPBearer: [] 35 | components: 36 | schemas: 37 | DocumentChunkMetadata: 38 | title: DocumentChunkMetadata 39 | type: object 40 | properties: 41 | source: 42 | $ref: "#/components/schemas/Source" 43 | source_id: 44 | title: Source Id 45 | type: string 46 | url: 47 | title: Url 48 | type: string 49 | created_at: 50 | title: Created At 51 | type: string 52 | author: 53 | title: Author 54 | type: string 55 | document_id: 56 | title: Document Id 57 | type: string 58 | DocumentChunkWithScore: 59 | title: DocumentChunkWithScore 60 | required: 61 | - text 62 | - metadata 63 | - score 64 | type: object 65 | properties: 66 | id: 67 | title: Id 68 | type: string 69 | text: 70 | title: Text 71 | type: string 72 | metadata: 73 | $ref: "#/components/schemas/DocumentChunkMetadata" 74 | embedding: 75 | title: Embedding 76 | type: array 77 | items: 78 | type: number 79 | score: 80 | title: Score 81 | type: number 82 | DocumentMetadataFilter: 83 | title: DocumentMetadataFilter 84 | type: object 85 | properties: 86 | document_id: 87 | title: Document Id 88 | type: string 89 | source: 90 | $ref: "#/components/schemas/Source" 91 | source_id: 92 | title: Source Id 93 | type: string 94 | author: 95 | title: Author 96 | type: string 97 | start_date: 98 | title: Start Date 99 | type: string 100 | end_date: 101 | title: End Date 102 | type: string 103 | HTTPValidationError: 104 | title: HTTPValidationError 105 | type: object 106 
| properties: 107 | detail: 108 | title: Detail 109 | type: array 110 | items: 111 | $ref: "#/components/schemas/ValidationError" 112 | Query: 113 | title: Query 114 | required: 115 | - query 116 | type: object 117 | properties: 118 | query: 119 | title: Query 120 | type: string 121 | filter: 122 | $ref: "#/components/schemas/DocumentMetadataFilter" 123 | top_k: 124 | title: Top K 125 | type: integer 126 | default: 3 127 | QueryRequest: 128 | title: QueryRequest 129 | required: 130 | - queries 131 | type: object 132 | properties: 133 | queries: 134 | title: Queries 135 | type: array 136 | items: 137 | $ref: "#/components/schemas/Query" 138 | QueryResponse: 139 | title: QueryResponse 140 | required: 141 | - results 142 | type: object 143 | properties: 144 | results: 145 | title: Results 146 | type: array 147 | items: 148 | $ref: "#/components/schemas/QueryResult" 149 | QueryResult: 150 | title: QueryResult 151 | required: 152 | - query 153 | - results 154 | type: object 155 | properties: 156 | query: 157 | title: Query 158 | type: string 159 | results: 160 | title: Results 161 | type: array 162 | items: 163 | $ref: "#/components/schemas/DocumentChunkWithScore" 164 | Source: 165 | title: Source 166 | enum: 167 | - email 168 | - file 169 | - chat 170 | type: string 171 | description: An enumeration. 172 | ValidationError: 173 | title: ValidationError 174 | required: 175 | - loc 176 | - msg 177 | - type 178 | type: object 179 | properties: 180 | loc: 181 | title: Location 182 | type: array 183 | items: 184 | anyOf: 185 | - type: string 186 | - type: integer 187 | msg: 188 | title: Message 189 | type: string 190 | type: 191 | title: Error Type 192 | type: string 193 | securitySchemes: 194 | HTTPBearer: 195 | type: http 196 | scheme: bearer 197 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM python:3.10 as requirements-stage 3 | 4 | WORKDIR /tmp 5 | 6 | RUN pip install poetry 7 | 8 | COPY ./pyproject.toml ./poetry.lock* /tmp/ 9 | 10 | 11 | RUN poetry export -f requirements.txt --output requirements.txt --without-hashes 12 | 13 | FROM python:3.10 14 | 15 | WORKDIR /code 16 | 17 | COPY --from=requirements-stage /tmp/requirements.txt /code/requirements.txt 18 | 19 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt 20 | 21 | COPY . /code/ 22 | 23 | # Heroku uses PORT, Azure App Services uses WEBSITES_PORT, Fly.io uses 8080 by default 24 | CMD ["sh", "-c", "uvicorn server.main:app --host 0.0.0.0 --port ${PORT:-${WEBSITES_PORT:-8080}}"] 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Heroku 2 | # make heroku-login 3 | # make heroku-push 4 | 5 | HEROKU_APP = 6 | 7 | heroku-push: 8 | docker buildx build --platform linux/amd64 -t ${HEROKU_APP} . 9 | docker tag ${HEROKU_APP} registry.heroku.com/${HEROKU_APP}/web 10 | docker push registry.heroku.com/${HEROKU_APP}/web 11 | heroku container:release web -a ${HEROKU_APP} 12 | 13 | heroku-login: 14 | heroku container:login 15 | -------------------------------------------------------------------------------- /assets/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/assets/example.png -------------------------------------------------------------------------------- /datastore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/datastore/__init__.py -------------------------------------------------------------------------------- /datastore/datastore.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, List, Optional 3 | import asyncio 4 | 5 | from models.models import ( 6 | Document, 7 | DocumentChunk, 8 | DocumentMetadataFilter, 9 | Query, 10 | QueryResult, 11 | QueryWithEmbedding, 12 | ) 13 | from services.chunks import get_document_chunks 14 | from services.openai import get_embeddings 15 | 16 | 17 | class DataStore(ABC): 18 | async def upsert( 19 | self, documents: List[Document], chunk_token_size: Optional[int] = None 20 | ) -> List[str]: 21 | """ 22 | Takes in a list of documents and inserts them into the database. 23 | First deletes all the existing vectors with the document id (if necessary, depends on the vector db), then inserts the new ones. 24 | Return a list of document ids. 25 | """ 26 | # Delete any existing vectors for documents with the input document ids 27 | await asyncio.gather( 28 | *[ 29 | self.delete( 30 | filter=DocumentMetadataFilter( 31 | document_id=document.id, 32 | ), 33 | delete_all=False, 34 | ) 35 | for document in documents 36 | if document.id 37 | ] 38 | ) 39 | 40 | chunks = get_document_chunks(documents, chunk_token_size) 41 | 42 | return await self._upsert(chunks) 43 | 44 | @abstractmethod 45 | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]: 46 | """ 47 | Takes in a list of document chunks and inserts them into the database. 48 | Return a list of document ids. 
49 | """ 50 | 51 | raise NotImplementedError 52 | 53 | async def query(self, queries: List[Query]) -> List[QueryResult]: 54 | """ 55 | Takes in a list of queries and filters and returns a list of query results with matching document chunks and scores. 56 | """ 57 | # get a list of just the queries from the Query list 58 | query_texts = [query.query for query in queries] 59 | query_embeddings = get_embeddings(query_texts) 60 | # hydrate the queries with embeddings 61 | queries_with_embeddings = [ 62 | QueryWithEmbedding(**query.dict(), embedding=embedding) 63 | for query, embedding in zip(queries, query_embeddings) 64 | ] 65 | return await self._query(queries_with_embeddings) 66 | 67 | @abstractmethod 68 | async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]: 69 | """ 70 | Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores. 71 | """ 72 | raise NotImplementedError 73 | 74 | @abstractmethod 75 | async def delete( 76 | self, 77 | ids: Optional[List[str]] = None, 78 | filter: Optional[DocumentMetadataFilter] = None, 79 | delete_all: Optional[bool] = None, 80 | ) -> bool: 81 | """ 82 | Removes vectors by ids, filter, or everything in the datastore. 83 | Multiple parameters can be used at once. 84 | Returns whether the operation was successful. 85 | """ 86 | raise NotImplementedError 87 | -------------------------------------------------------------------------------- /datastore/factory.py: -------------------------------------------------------------------------------- 1 | from datastore.datastore import DataStore 2 | import os 3 | 4 | 5 | async def get_datastore() -> DataStore: 6 | datastore = os.environ.get("DATASTORE") 7 | assert datastore is not None 8 | 9 | match datastore: 10 | case "chroma": 11 | from datastore.providers.chroma_datastore import ChromaDataStore 12 | 13 | return ChromaDataStore() 14 | case "llama": 15 | from datastore.providers.llama_datastore import LlamaDataStore 16 | 17 | return LlamaDataStore() 18 | 19 | case "pinecone": 20 | from datastore.providers.pinecone_datastore import PineconeDataStore 21 | 22 | return PineconeDataStore() 23 | case "weaviate": 24 | from datastore.providers.weaviate_datastore import WeaviateDataStore 25 | 26 | return WeaviateDataStore() 27 | case "milvus": 28 | from datastore.providers.milvus_datastore import MilvusDataStore 29 | 30 | return MilvusDataStore() 31 | case "zilliz": 32 | from datastore.providers.zilliz_datastore import ZillizDataStore 33 | 34 | return ZillizDataStore() 35 | case "redis": 36 | from datastore.providers.redis_datastore import RedisDataStore 37 | 38 | return await RedisDataStore.init() 39 | case "azurecosmosdb": 40 | from datastore.providers.azurecosmosdb_datastore import ( 41 | AzureCosmosDBDataStore, 42 | ) 43 | 44 | return await AzureCosmosDBDataStore.create() 45 | case "qdrant": 46 | from datastore.providers.qdrant_datastore import QdrantDataStore 47 | 48 | return QdrantDataStore() 49 | case "azuresearch": 50 | from datastore.providers.azuresearch_datastore import AzureSearchDataStore 51 | 52 | return AzureSearchDataStore() 53 | case "supabase": 54 | from datastore.providers.supabase_datastore import SupabaseDataStore 55 | 56 | return SupabaseDataStore() 57 | case "postgres": 58 | from datastore.providers.postgres_datastore import PostgresDataStore 59 | 60 | return PostgresDataStore() 61 | case "analyticdb": 62 | from datastore.providers.analyticdb_datastore import AnalyticDBDataStore 63 | 64 | return 
AnalyticDBDataStore() 65 | case "elasticsearch": 66 | from datastore.providers.elasticsearch_datastore import ( 67 | ElasticsearchDataStore, 68 | ) 69 | 70 | return ElasticsearchDataStore() 71 | case "mongodb": 72 | from datastore.providers.mongodb_atlas_datastore import ( 73 | MongoDBAtlasDataStore, 74 | ) 75 | 76 | return MongoDBAtlasDataStore() 77 | case _: 78 | raise ValueError( 79 | f"Unsupported vector database: {datastore}. " 80 | f"Try one of the following: llama, elasticsearch, pinecone, weaviate, milvus, zilliz, redis, azuresearch, or qdrant" 81 | ) 82 | -------------------------------------------------------------------------------- /datastore/providers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/datastore/providers/__init__.py -------------------------------------------------------------------------------- /datastore/providers/postgres_datastore.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, List 3 | from datetime import datetime 4 | import numpy as np 5 | 6 | from psycopg2 import connect 7 | from psycopg2.extras import DictCursor 8 | from pgvector.psycopg2 import register_vector 9 | 10 | from services.date import to_unix_timestamp 11 | from datastore.providers.pgvector_datastore import PGClient, PgVectorDataStore 12 | from models.models import ( 13 | DocumentMetadataFilter, 14 | ) 15 | 16 | PG_HOST = os.environ.get("PG_HOST", "localhost") 17 | PG_PORT = int(os.environ.get("PG_PORT", 5432)) 18 | PG_DB = os.environ.get("PG_DB", "postgres") 19 | PG_USER = os.environ.get("PG_USER", "postgres") 20 | PG_PASSWORD = os.environ.get("PG_PASSWORD", "postgres") 21 | 22 | 23 | # class that implements the DataStore interface for Postgres Datastore provider 24 | class PostgresDataStore(PgVectorDataStore): 25 | def create_db_client(self): 26 | return PostgresClient() 27 | 28 | 29 | class PostgresClient(PGClient): 30 | def __init__(self) -> None: 31 | super().__init__() 32 | self.client = connect( 33 | dbname=PG_DB, user=PG_USER, password=PG_PASSWORD, host=PG_HOST, port=PG_PORT 34 | ) 35 | register_vector(self.client) 36 | 37 | def __del__(self): 38 | # close the connection when the client is destroyed 39 | self.client.close() 40 | 41 | async def upsert(self, table: str, json: dict[str, Any]): 42 | """ 43 | Takes in a list of documents and inserts them into the table. 
44 | """ 45 | with self.client.cursor() as cur: 46 | if not json.get("created_at"): 47 | json["created_at"] = datetime.now() 48 | json["embedding"] = np.array(json["embedding"]) 49 | cur.execute( 50 | f"INSERT INTO {table} (id, content, embedding, document_id, source, source_id, url, author, created_at) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO UPDATE SET content = %s, embedding = %s, document_id = %s, source = %s, source_id = %s, url = %s, author = %s, created_at = %s", 51 | ( 52 | json["id"], 53 | json["content"], 54 | json["embedding"], 55 | json["document_id"], 56 | json["source"], 57 | json["source_id"], 58 | json["url"], 59 | json["author"], 60 | json["created_at"], 61 | json["content"], 62 | json["embedding"], 63 | json["document_id"], 64 | json["source"], 65 | json["source_id"], 66 | json["url"], 67 | json["author"], 68 | json["created_at"], 69 | ), 70 | ) 71 | self.client.commit() 72 | 73 | async def rpc(self, function_name: str, params: dict[str, Any]): 74 | """ 75 | Calls a stored procedure in the database with the given parameters. 76 | """ 77 | data = [] 78 | params["in_embedding"] = np.array(params["in_embedding"]) 79 | with self.client.cursor(cursor_factory=DictCursor) as cur: 80 | cur.callproc(function_name, params) 81 | rows = cur.fetchall() 82 | self.client.commit() 83 | for row in rows: 84 | row["created_at"] = to_unix_timestamp(row["created_at"]) 85 | data.append(dict(row)) 86 | return data 87 | 88 | async def delete_like(self, table: str, column: str, pattern: str): 89 | """ 90 | Deletes rows in the table that match the pattern. 91 | """ 92 | with self.client.cursor() as cur: 93 | cur.execute( 94 | f"DELETE FROM {table} WHERE {column} LIKE %s", 95 | (f"%{pattern}%",), 96 | ) 97 | self.client.commit() 98 | 99 | async def delete_in(self, table: str, column: str, ids: List[str]): 100 | """ 101 | Deletes rows in the table that match the ids. 102 | """ 103 | with self.client.cursor() as cur: 104 | cur.execute( 105 | f"DELETE FROM {table} WHERE {column} IN %s", 106 | (tuple(ids),), 107 | ) 108 | self.client.commit() 109 | 110 | async def delete_by_filters(self, table: str, filter: DocumentMetadataFilter): 111 | """ 112 | Deletes rows in the table that match the filter. 
113 | """ 114 | 115 | filters = "WHERE" 116 | if filter.document_id: 117 | filters += f" document_id = '{filter.document_id}' AND" 118 | if filter.source: 119 | filters += f" source = '{filter.source}' AND" 120 | if filter.source_id: 121 | filters += f" source_id = '{filter.source_id}' AND" 122 | if filter.author: 123 | filters += f" author = '{filter.author}' AND" 124 | if filter.start_date: 125 | filters += f" created_at >= '{filter.start_date}' AND" 126 | if filter.end_date: 127 | filters += f" created_at <= '{filter.end_date}' AND" 128 | filters = filters[:-4] 129 | 130 | with self.client.cursor() as cur: 131 | cur.execute(f"DELETE FROM {table} {filters}") 132 | self.client.commit() 133 | -------------------------------------------------------------------------------- /datastore/providers/supabase_datastore.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, List 3 | from datetime import datetime 4 | 5 | from supabase import Client 6 | 7 | from datastore.providers.pgvector_datastore import PGClient, PgVectorDataStore 8 | from models.models import ( 9 | DocumentMetadataFilter, 10 | ) 11 | 12 | SUPABASE_URL = os.environ.get("SUPABASE_URL") 13 | assert SUPABASE_URL is not None, "SUPABASE_URL is not set" 14 | SUPABASE_ANON_KEY = os.environ.get("SUPABASE_ANON_KEY") 15 | # use service role key if you want this app to be able to bypass your Row Level Security policies 16 | SUPABASE_SERVICE_ROLE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") 17 | assert ( 18 | SUPABASE_ANON_KEY is not None or SUPABASE_SERVICE_ROLE_KEY is not None 19 | ), "SUPABASE_ANON_KEY or SUPABASE_SERVICE_ROLE_KEY must be set" 20 | 21 | 22 | # class that implements the DataStore interface for Supabase Datastore provider 23 | class SupabaseDataStore(PgVectorDataStore): 24 | def create_db_client(self): 25 | return SupabaseClient() 26 | 27 | 28 | class SupabaseClient(PGClient): 29 | def __init__(self) -> None: 30 | super().__init__() 31 | if not SUPABASE_SERVICE_ROLE_KEY: 32 | self.client = Client(SUPABASE_URL, SUPABASE_ANON_KEY) 33 | else: 34 | self.client = Client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY) 35 | 36 | async def upsert(self, table: str, json: dict[str, Any]): 37 | """ 38 | Takes in a list of documents and inserts them into the table. 39 | """ 40 | if "created_at" in json: 41 | json["created_at"] = json["created_at"][0].isoformat() 42 | 43 | self.client.table(table).upsert(json).execute() 44 | 45 | async def rpc(self, function_name: str, params: dict[str, Any]): 46 | """ 47 | Calls a stored procedure in the database with the given parameters. 48 | """ 49 | if "in_start_date" in params: 50 | params["in_start_date"] = params["in_start_date"].isoformat() 51 | if "in_end_date" in params: 52 | params["in_end_date"] = params["in_end_date"].isoformat() 53 | 54 | response = self.client.rpc(function_name, params=params).execute() 55 | return response.data 56 | 57 | async def delete_like(self, table: str, column: str, pattern: str): 58 | """ 59 | Deletes rows in the table that match the pattern. 60 | """ 61 | self.client.table(table).delete().like(column, pattern).execute() 62 | 63 | async def delete_in(self, table: str, column: str, ids: List[str]): 64 | """ 65 | Deletes rows in the table that match the ids. 66 | """ 67 | self.client.table(table).delete().in_(column, ids).execute() 68 | 69 | async def delete_by_filters(self, table: str, filter: DocumentMetadataFilter): 70 | """ 71 | Deletes rows in the table that match the filter. 
72 | """ 73 | builder = self.client.table(table).delete() 74 | if filter.document_id: 75 | builder = builder.eq( 76 | "document_id", 77 | filter.document_id, 78 | ) 79 | if filter.source: 80 | builder = builder.eq("source", filter.source) 81 | if filter.source_id: 82 | builder = builder.eq("source_id", filter.source_id) 83 | if filter.author: 84 | builder = builder.eq("author", filter.author) 85 | if filter.start_date: 86 | builder = builder.gte( 87 | "created_at", 88 | filter.start_date[0].isoformat(), 89 | ) 90 | if filter.end_date: 91 | builder = builder.lte( 92 | "created_at", 93 | filter.end_date[0].isoformat(), 94 | ) 95 | builder.execute() 96 | -------------------------------------------------------------------------------- /datastore/providers/zilliz_datastore.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from loguru import logger 4 | from typing import Optional 5 | from pymilvus import ( 6 | connections, 7 | ) 8 | from uuid import uuid4 9 | 10 | from datastore.providers.milvus_datastore import ( 11 | MilvusDataStore, 12 | ) 13 | 14 | 15 | ZILLIZ_COLLECTION = os.environ.get("ZILLIZ_COLLECTION") or "c" + uuid4().hex 16 | ZILLIZ_URI = os.environ.get("ZILLIZ_URI") 17 | ZILLIZ_USER = os.environ.get("ZILLIZ_USER") 18 | ZILLIZ_PASSWORD = os.environ.get("ZILLIZ_PASSWORD") 19 | ZILLIZ_USE_SECURITY = False if ZILLIZ_PASSWORD is None else True 20 | 21 | ZILLIZ_CONSISTENCY_LEVEL = os.environ.get("ZILLIZ_CONSISTENCY_LEVEL") 22 | 23 | 24 | class ZillizDataStore(MilvusDataStore): 25 | def __init__(self, create_new: Optional[bool] = False): 26 | """Create a Zilliz DataStore. 27 | 28 | The Zilliz Datastore allows for storing your indexes and metadata within a Zilliz Cloud instance. 29 | 30 | Args: 31 | create_new (Optional[bool], optional): Whether to overwrite if collection already exists. Defaults to True. 
32 | """ 33 | # Overwrite the default consistency level by MILVUS_CONSISTENCY_LEVEL 34 | self._consistency_level = ZILLIZ_CONSISTENCY_LEVEL or "Bounded" 35 | self._create_connection() 36 | 37 | self._create_collection(ZILLIZ_COLLECTION, create_new) # type: ignore 38 | self._create_index() 39 | 40 | def _create_connection(self): 41 | # Check if the connection already exists 42 | try: 43 | i = [ 44 | connections.get_connection_addr(x[0]) 45 | for x in connections.list_connections() 46 | ].index({"address": ZILLIZ_URI, "user": ZILLIZ_USER}) 47 | self.alias = connections.list_connections()[i][0] 48 | except ValueError: 49 | # Connect to the Zilliz instance using the passed in Environment variables 50 | self.alias = uuid4().hex 51 | connections.connect(alias=self.alias, uri=ZILLIZ_URI, user=ZILLIZ_USER, password=ZILLIZ_PASSWORD, secure=ZILLIZ_USE_SECURITY) # type: ignore 52 | logger.info("Connect to zilliz cloud server") 53 | 54 | def _create_index(self): 55 | try: 56 | # If no index on the collection, create one 57 | if len(self.col.indexes) == 0: 58 | self.index_params = { 59 | "metric_type": "IP", 60 | "index_type": "AUTOINDEX", 61 | "params": {}, 62 | } 63 | self.col.create_index("embedding", index_params=self.index_params) 64 | 65 | self.col.load() 66 | self.search_params = {"metric_type": "IP", "params": {}} 67 | except Exception as e: 68 | logger.error("Failed to create index, error: {}".format(e)) 69 | -------------------------------------------------------------------------------- /docs/deployment/flyio.md: -------------------------------------------------------------------------------- 1 | # Deploying to Fly.io 2 | 3 | ## Removing Unused Dependencies 4 | 5 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 6 | 7 | Find the packages you can remove for each vector database provider [here](removing-unused-dependencies.md). 8 | 9 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 10 | 11 | ## Deployment 12 | 13 | To deploy the Docker container from this repository to Fly.io, follow 14 | these steps: 15 | 16 | [Install Docker](https://docs.docker.com/engine/install/) on your local machine if it is not already installed. 17 | 18 | Install the [Fly.io CLI](https://fly.io/docs/getting-started/installing-flyctl/) on your local machine. 
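If you prefer to install the CLI from the terminal, it is typically available via a one-line install script or Homebrew; the commands below are illustrative, so confirm the current instructions for your platform in the linked Fly.io docs:

```
# Illustrative install commands — check the linked Fly.io docs for your platform
curl -L https://fly.io/install.sh | sh   # install script (Linux/macOS)
brew install flyctl                      # Homebrew (macOS)
```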
19 | 20 | Clone the repository from GitHub: 21 | 22 | ``` 23 | git clone https://github.com/openai/chatgpt-retrieval-plugin.git 24 | ``` 25 | 26 | Navigate to the cloned repository directory: 27 | 28 | ``` 29 | cd path/to/chatgpt-retrieval-plugin 30 | ``` 31 | 32 | Log in to the Fly.io CLI: 33 | 34 | ``` 35 | flyctl auth login 36 | ``` 37 | 38 | Create and launch your Fly.io app: 39 | 40 | ``` 41 | flyctl launch 42 | ``` 43 | 44 | Follow the instructions in your terminal: 45 | 46 | - Choose your app name 47 | - Choose your app region 48 | - Don't add any databases 49 | - Don't deploy yet (if you do, the first deploy might fail as the environment variables are not yet set) 50 | 51 | Set the required environment variables: 52 | 53 | ``` 54 | flyctl secrets set DATASTORE=your_datastore \ 55 | OPENAI_API_KEY=your_openai_api_key \ 56 | BEARER_TOKEN=your_bearer_token \ 57 | 58 | ``` 59 | 60 | Alternatively, you could set environment variables in the [Fly.io Console](https://fly.io/dashboard). 61 | 62 | At this point, you can change the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml) to the url for your Fly.io app, which will be `https://your-app-name.fly.dev`. 63 | 64 | Deploy your app with: 65 | 66 | ``` 67 | flyctl deploy 68 | ``` 69 | 70 | After completing these steps, your Docker container should be deployed to Fly.io and running with the necessary environment variables set. You can view your app by running: 71 | 72 | ``` 73 | flyctl open 74 | ``` 75 | 76 | which will open your app url. You should be able to find the OpenAPI schema at `/.well-known/openapi.yaml` and the manifest at `/.well-known/ai-plugin.json`. 77 | 78 | To view your app logs: 79 | 80 | ``` 81 | flyctl logs 82 | ``` 83 | 84 | Now, make sure you have changed the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml), and redeploy with `flyctl deploy`. This url will be `https://.fly.dev`. 85 | 86 | **Debugging tips:** 87 | Fly.io uses port 8080 by default. 88 | 89 | If your app fails to deploy, check if the environment variables are set correctly, and then check if your port is configured correctly. You could also try using the [`-e` flag](https://fly.io/docs/flyctl/launch/) with the `flyctl launch` command to set the environment variables at launch. 90 | -------------------------------------------------------------------------------- /docs/deployment/heroku.md: -------------------------------------------------------------------------------- 1 | # Deploying to Heroku 2 | 3 | ## Removing Unused Dependencies 4 | 5 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 6 | 7 | Find the packages you can remove for each vector database provider [here](removing-unused-dependencies.md). 8 | 9 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 
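For reference, the export step the Dockerfile performs is equivalent to the command below; you can also run it locally if you want to inspect the generated `requirements.txt` after trimming `pyproject.toml`:

```
# Same export the Dockerfile runs in its requirements stage
poetry export -f requirements.txt --output requirements.txt --without-hashes
```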
10 | 11 | ## Deployment 12 | 13 | To deploy the Docker container from this repository to Heroku and set the required environment variables, follow these steps: 14 | 15 | [Install Docker](https://docs.docker.com/engine/install/) on your local machine if it is not already installed. 16 | 17 | Install the [Heroku CLI](https://devcenter.heroku.com/articles/heroku-cli) on your local machine. 18 | 19 | Clone the repository from GitHub: 20 | 21 | ``` 22 | git clone https://github.com/openai/chatgpt-retrieval-plugin.git 23 | ``` 24 | 25 | Navigate to the cloned repository directory: 26 | 27 | ``` 28 | cd path/to/chatgpt-retrieval-plugin 29 | ``` 30 | 31 | Log in to the Heroku CLI: 32 | 33 | ``` 34 | heroku login 35 | ``` 36 | 37 | Create a Heroku app: 38 | 39 | ``` 40 | heroku create [app-name] 41 | ``` 42 | 43 | Log in to the Heroku Container Registry: 44 | 45 | ``` 46 | heroku container:login 47 | ``` 48 | 49 | Alternatively, you can use a command from the Makefile to log in to the Heroku Container Registry by running: 50 | 51 | ``` 52 | make heroku-login 53 | ``` 54 | 55 | Build the Docker image using the Dockerfile: 56 | 57 | ``` 58 | docker buildx build --platform linux/amd64 -t [image-name] . 59 | ``` 60 | 61 | (Replace `[image-name]` with the name you want to give your Docker image) 62 | 63 | Push the Docker image to the Heroku Container Registry, and release the newly pushed image to your Heroku app. 64 | 65 | ``` 66 | docker tag [image-name] registry.heroku.com/[app-name]/web 67 | docker push registry.heroku.com/[app-name]/web 68 | heroku container:release web -a [app-name] 69 | ``` 70 | 71 | Alternatively, you can use a command from the to push the Docker image to the Heroku Container Registry by running: 72 | 73 | ``` 74 | make heroku-push 75 | ``` 76 | 77 | **Note:** You will need to edit the Makefile and replace `` with your actual app name. 78 | 79 | Set the required environment variables for your Heroku app: 80 | 81 | ``` 82 | heroku config:set DATASTORE=your_datastore \ 83 | OPENAI_API_KEY=your_openai_api_key \ 84 | BEARER_TOKEN=your_bearer_token \ 85 | \ 86 | -a [app-name] 87 | ``` 88 | 89 | You could also set environment variables in the [Heroku Console](https://dashboard.heroku.com/apps). 90 | 91 | After completing these steps, your Docker container should be deployed to Heroku and running with the necessary environment variables set. You can view your app by running: 92 | 93 | ``` 94 | heroku open -a [app-name] 95 | ``` 96 | 97 | which will open your app url. You should be able to find the OpenAPI schema at `/.well-known/openapi.yaml` and the manifest at `/.well-known/ai-plugin.json`. 98 | 99 | To view your app logs: 100 | 101 | ``` 102 | heroku logs --tail -a [app-name] 103 | ``` 104 | 105 | Now make sure to change the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml), and redeploy with `make heroku-push`. This url will be `https://your-app-name.herokuapp.com`. 106 | -------------------------------------------------------------------------------- /docs/deployment/other-options.md: -------------------------------------------------------------------------------- 1 | # Other Deployment Options 2 | 3 | Some possible other options for deploying the app are: 4 | 5 | - **Azure Container Apps**: This is a cloud platform that allows you to deploy and manage web apps using Docker containers. 
You can use the Azure CLI or the Azure Portal to create and configure your app service, and then push your Docker image to a container registry and deploy it to your app service. You can also set environment variables and scale your app using the Azure Portal. Learn more [here](https://learn.microsoft.com/en-us/azure/container-apps/get-started-existing-container-image-portal?pivots=container-apps-private-registry). 6 | - **Google Cloud Run**: This is a serverless platform that allows you to run stateless web apps using Docker containers. You can use the Google Cloud Console or the gcloud command-line tool to create and deploy your Cloud Run service, and then push your Docker image to the Google Container Registry and deploy it to your service. You can also set environment variables and scale your app using the Google Cloud Console. Learn more [here](https://cloud.google.com/run/docs/quickstarts/build-and-deploy). 7 | - **AWS Elastic Container Service**: This is a cloud platform that allows you to run and manage web apps using Docker containers. You can use the AWS CLI or the AWS Management Console to create and configure your ECS cluster, and then push your Docker image to the Amazon Elastic Container Registry and deploy it to your cluster. You can also set environment variables and scale your app using the AWS Management Console. Learn more [here](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/docker-basics.html). 8 | 9 | After you create your app, make sure to change the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml), and redeploy. 10 | 11 | ## Removing Unused Dependencies 12 | 13 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 14 | 15 | Find the packages you can remove for each vector database provider [here](removing_unused_dependencies.md). 16 | 17 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 18 | -------------------------------------------------------------------------------- /docs/deployment/removing-unused-dependencies.md: -------------------------------------------------------------------------------- 1 | # Removing Unused Dependencies 2 | 3 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 4 | 5 | Here are the packages you can remove for each vector database provider: 6 | 7 | - **Pinecone:** Remove `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity`, `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 8 | - **Weaviate:** Remove `pinecone-client`, `pymilvus`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, `psycopg2cffi`. 
9 | - **Zilliz:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 10 | - **Milvus:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 11 | - **Qdrant:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 12 | - **Redis:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 13 | - **LlamaIndex:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `chromadb`, `redis`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 14 | - **Chroma:**: Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `llama-index`, `redis`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 15 | - **Azure Cognitive Search**: Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `llama-index`, `redis` and `chromadb`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 16 | - **Supabase:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `llama-index`, `azure-identity` and `azure-search-documents`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 17 | - **Postgres:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, and `psycopg2cffi`. 18 | - **AnalyticDB:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, and `psycopg2`+`pgvector`. 19 | 20 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 21 | -------------------------------------------------------------------------------- /docs/deployment/render-thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/docs/deployment/render-thumbnail.png -------------------------------------------------------------------------------- /docs/deployment/render.md: -------------------------------------------------------------------------------- 1 | # Deploying to Render 2 | 3 | ## Removing Unused Dependencies 4 | 5 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 6 | 7 | Find the packages you can remove for each vector database provider [here](removing-unused-dependencies.md). 
8 | 9 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 10 | 11 | ## Deployment 12 | 13 | Render maintains a [fork](https://github.com/render-examples/chatgpt-retrieval-plugin/) of this repository with a few small changes that facilitate easy deployment. The source code is unchanged. To deploy both the Docker container from this repository and a self-hosted Weaviate database to back it, just click the button below. Enter your OpenAI API key when prompted. 14 | 15 | [Deploy to Render](https://render.com/deploy?repo=https://github.com/render-examples/chatgpt-retrieval-plugin/tree/main) 16 | 17 | The bearer token will be randomly generated for you. You can view it in in the "Environment" tab on the [Render dashboard](https://dashboard.render.com) page for your server. For more guidance, consult the [README in Render's fork](https://github.com/render-examples/chatgpt-retrieval-plugin/blob/main/README.md), [Render's documentation](https://render.com/docs), or the screen recording linked below. 18 | 19 | [![Deploy to Render screen recording](render-thumbnail.png)](https://vimeo.com/823610578) 20 | -------------------------------------------------------------------------------- /docs/deprecated/plugins.md: -------------------------------------------------------------------------------- 1 | ## Plugins (deprecated) 2 | 3 | Plugins are chat extensions designed specifically for language models like ChatGPT, enabling them to access up-to-date information, run computations, or interact with third-party services in response to a user's request. They unlock a wide range of potential use cases and enhance the capabilities of language models. 4 | 5 | Developers can create a plugin by exposing an API through their website and providing a standardized manifest file that describes the API. ChatGPT consumes these files and allows the AI models to make calls to the API defined by the developer. 6 | 7 | A plugin consists of: 8 | 9 | - An API 10 | - An API schema (OpenAPI JSON or YAML format) 11 | - A manifest (JSON file) that defines relevant metadata for the plugin 12 | 13 | The Retrieval Plugin already contains all of these components. Read the Chat Plugins blogpost [here](https://openai.com/blog/chatgpt-plugins), and find the docs [here](https://platform.openai.com/docs/plugins/introduction). 14 | 15 | To access the plugins model, navigate [here](https://chat.openai.com/?model=gpt-4-plugins). 16 | 17 | ### Testing a Localhost Plugin in ChatGPT 18 | 19 | To test a localhost plugin in ChatGPT, use the provided [`local_server/main.py`](/local_server/main.py) file, which is specifically configured for localhost testing with CORS settings, no authentication and routes for the manifest, OpenAPI schema and logo. 20 | 21 | Follow these steps to test your localhost plugin: 22 | 23 | 1. Run the localhost server using the `poetry run dev` command. This starts the server at the default address (e.g. `localhost:3333`). 24 | 25 | 2. Visit [ChatGPT](https://chat.openai.com/), select "Plugins" from the model picker, click on the plugins picker, and click on "Plugin store" at the bottom of the list. 26 | 27 | 3. Choose "Develop your own plugin" and enter your localhost URL (e.g. `localhost:3333`) when prompted. 28 | 29 | 4. Your localhost plugin is now enabled for your ChatGPT session. 
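As a quick sanity check before connecting the plugin to ChatGPT, you can query the local server directly. The request below is an illustrative example based on the OpenAPI schema in this repository (the query text is made up; the default port is 3333, and the local server runs without authentication):

```
# Illustrative request against the local server's /query endpoint
curl -X POST http://localhost:3333/query \
  -H "Content-Type: application/json" \
  -d '{"queries": [{"query": "What did I write about project deadlines?", "top_k": 3}]}'
```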
30 | 31 | For more information, refer to the [OpenAI documentation](https://platform.openai.com/docs/plugins/getting-started/openapi-definition). 32 | 33 | ## Installing a Developer Plugin 34 | 35 | To install a developer plugin, follow the steps below: 36 | 37 | - First, create your developer plugin by deploying it to your preferred hosting platform (e.g. Fly.io, Heroku, etc.) and updating the plugin URL in the manifest file and OpenAPI schema. 38 | 39 | - Go to [ChatGPT](https://chat.openai.com/) and select "Plugins" from the model picker. 40 | 41 | - From the plugins picker, scroll to the bottom and click on "Plugin store." 42 | 43 | - Go to "Develop your own plugin" and follow the instructions provided. You will need to enter the domain where your plugin is deployed. 44 | 45 | - Follow the instructions based on the authentication type you have chosen for your plugin (e.g. if your plugin uses Service Level HTTP, you will have to paste in your access token, then paste the new access token you receive from the plugin flow into your [ai-plugin.json](/.well-known/ai-plugin.json) file and redeploy your app). 46 | 47 | - Next, you must add your plugin. Go to the "Plugin store" again and click on "Install an unverified plugin." 48 | 49 | - Follow the instructions provided, which will require you to enter the domain where your plugin is deployed. 50 | 51 | - Follow the instructions based on the authentication type you have chosen for your plugin (e.g. if your plugin uses User Level HTTP, you will have to paste in your bearer token). 52 | 53 | After completing these steps, your developer plugin should be installed and ready to use in ChatGPT. 54 | -------------------------------------------------------------------------------- /docs/providers/analyticdb/setup.md: -------------------------------------------------------------------------------- 1 | # AnalyticDB 2 | 3 | [AnalyticDB](https://www.alibabacloud.com/help/en/analyticdb-for-postgresql/latest/product-introduction-overview) is a distributed cloud-native vector database designed for storing documents and vector embeddings. It is a high-performance vector database that is fully compatible with PostgreSQL syntax, making it easy to use. Managed by Alibaba Cloud, AnalyticDB offers a powerful vector compute engine, processing billions of data vectors and providing a wide range of features, including indexing algorithms, structured and unstructured data capabilities, real-time updates, distance metrics, scalar filtering, and time travel searches. Additionally, it offers full OLAP database functionality and an SLA commitment for production use. 4 | 5 | ## Install Requirements 6 | 7 | Run the following command to install the required packages, including the `psycopg2cffi` package: 8 | 9 | ``` 10 | poetry install --extras "postgresql" 11 | ``` 12 | 13 | If you encounter the `Error: pg_config executable not found.` issue, you need to install the PostgreSQL development package on your system. Follow the instructions for your specific Linux distribution: 14 | 15 | 1. Debian-based systems (e.g., Ubuntu): 16 | 17 | ```bash 18 | sudo apt-get update 19 | sudo apt-get install libpq-dev 20 | ``` 21 | 22 | 2. RHEL-based systems (e.g., CentOS, Fedora): 23 | 24 | ```bash 25 | sudo yum install postgresql-devel 26 | ``` 27 | 28 | 3. Arch-based systems (e.g., Manjaro, Arch Linux): 29 | 30 | ```bash 31 | sudo pacman -S postgresql-libs 32 | ``` 33 | 34 | 4. 
macOS: 35 | 36 | ```bash 37 | brew install postgresql 38 | ``` 39 | 40 | After installing the required package, try to install `psycopg2cffi` again. If the `pg_config` executable is still not found, add its location to your system's `PATH` variable. You can typically find the `pg_config` executable in the `bin` directory of your PostgreSQL installation, for example `/usr/pgsql-13/bin/pg_config`. To add it to your `PATH` variable, use the following command (replace the path with the correct one for your system): 41 | 42 | ```bash 43 | export PATH=$PATH:/usr/pgsql-13/bin 44 | ``` 45 | 46 | Now, try installing `psycopg2cffi` again using Poetry. 47 | 48 | **Environment Variables:** 49 | 50 | | Name | Required | Description | Default | 51 | | ---------------- | -------- | ----------------------------------- | ----------------- | 52 | | `DATASTORE` | Yes | Datastore name, set to `analyticdb` | | 53 | | `BEARER_TOKEN` | Yes | Secret token | | 54 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 55 | | `PG_HOST` | Yes | AnalyticDB instance URL | `localhost` | 56 | | `PG_USER` | Yes | Database user | `user` | 57 | | `PG_PASSWORD` | Yes | Database password | `password` | 58 | | `PG_PORT` | Optional | Port for AnalyticDB communication | `5432` | 59 | | `PG_DATABASE` | Optional | Database name | `postgres` | 60 | | `PG_COLLECTION` | Optional | AnalyticDB relation name | `document_chunks` | 61 | 62 | ## AnalyticDB Cloud 63 | 64 | For a hosted [AnalyticDB Cloud](https://cloud.qdrant.io/) version, provide the AnalyticDB instance URL: 65 | 66 | **Example:** 67 | 68 | ```bash 69 | PG_HOST="https://YOUR-CLUSTER-URL.gpdb.rds.aliyuncs.com" 70 | PG_USER="YOUR-USER-NAME" 71 | PG_PASSWORD="YOUR-PASSWORD" 72 | ``` 73 | 74 | The other parameters are optional and can be changed if needed. 75 | 76 | ## Running AnalyticDB Integration Tests 77 | 78 | A suite of integration tests verifies the AnalyticDB integration. Launch the test suite with this command: 79 | 80 | ```bash 81 | pytest ./tests/datastore/providers/analyticdb/test_analyticdb_datastore.py 82 | ``` 83 | -------------------------------------------------------------------------------- /docs/providers/azurecosmosdb/setup.md: -------------------------------------------------------------------------------- 1 | # Azure Cosmos DB 2 | 3 | [Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) Azure Cosmos DB is a fully managed NoSQL and relational database for modern app development. Using Azure Cosmos DB for MongoDB vCore, you can store vector embeddings in your documents and perform [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) on a fully managed MongoDB-compatible database service. 4 | 5 | Learn more about Azure Cosmos DB for MongoDB vCore [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/). 6 | 7 | ## Environment variables 8 | 9 | | Name | Required | Description | Default | 10 | | ---------------------------- | -------- |-------------------------------------------------------------------------| ------------------- | 11 | | `DATASTORE` | Yes | Datastore name, set to `azurecosmosdb` | | 12 | | `BEARER_TOKEN` | Yes | Secret token | | 13 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 14 | | `AZCOSMOS_API` | Yes | Name of the API you're connecting to. Currently supported `mongo-vcore` | | 15 | | `AZCOSMOS_CONNSTR` | Yes | The connection string to your account. 
| | 16 | | `AZCOSMOS_DATABASE_NAME` | Yes | The database where the data is stored/queried | | 17 | | `AZCOSMOS_CONTAINER_NAME` | Yes | The container where the data is stored/queried | | 18 | 19 | ## Indexing 20 | On first insert, the datastore will create the collection and index if necessary on the field `embedding`. Currently hybrid search is not yet supported. 21 | -------------------------------------------------------------------------------- /docs/providers/azuresearch/setup.md: -------------------------------------------------------------------------------- 1 | # Azure Cognitive Search 2 | 3 | [Azure Cognitive Search](https://azure.microsoft.com/products/search/) is a complete retrieval cloud service that supports vector search, text search, and hybrid (vectors + text combined to yield the best of the two approaches). Azure Cognitive Search also offers an [optional L2 re-ranking step](https://learn.microsoft.com/azure/search/semantic-search-overview) to further improve results quality. 4 | 5 | You can find the Azure Cognitive Search documentation [here](https://learn.microsoft.com/azure/search/search-what-is-azure-search). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/). 6 | 7 | ## Signing up for vector search 8 | 9 | Azure Cognitive Search supports searching using pure vectors, pure text, or hybrid mode where both are combined. For the vector-based cases, you'll need to sign up for vector search private preview. To sign up, please fill in this form: https://aka.ms/VectorSearchSignUp 10 | 11 | ## Environment variables 12 | 13 | | Name | Required | Description | Default | 14 | | ----------------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------- | 15 | | `DATASTORE` | Yes | Datastore name, set to `azuresearch` | | 16 | | `BEARER_TOKEN` | Yes | Secret token | | 17 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 18 | | `AZURESEARCH_SERVICE` | Yes | Name of your search service | | 19 | | `AZURESEARCH_INDEX` | Yes | Name of your search index | | 20 | | `AZURESEARCH_API_KEY` | No | Your API key, if using key-based auth instead of Azure managed identity | Uses managed identity | 21 | | `AZURESEARCH_DISABLE_HYBRID` | No | Disable hybrid search and only use vector similarity | Use hybrid search | 22 | | `AZURESEARCH_SEMANTIC_CONFIG` | No | Enable L2 re-ranking with this configuration name [see re-ranking below](#re-ranking) | L2 not enabled | 23 | | `AZURESEARCH_LANGUAGE` | No | If using L2 re-ranking, language for queries/documents (valid values [listed here](https://learn.microsoft.com/rest/api/searchservice/preview-api/search-documents#queryLanguage)) | `en-us` | 24 | | `AZURESEARCH_DIMENSIONS` | No | Vector size for embeddings | 256, or other | 25 | 26 | ## Authentication Options 27 | 28 | - API key: this is enabled by default; you can obtain the key in the Azure Portal or using the Azure CLI. 29 | - Managed identity: If the plugin is running in Azure, you can enable managed identity for the host and give that identity access to the service, without having to manage keys (avoiding secret storage, rotation, etc.). More details [here](https://learn.microsoft.com/azure/search/search-security-rbac). 30 | 31 | ## Re-ranking 32 | 33 | Azure Cognitive Search offers the option to enable a second (L2) ranking step after retrieval to further improve results quality. 
This only applies when using text or hybrid search. Since it has latency and cost implications, if you want to try this option you need to explicitly [enable "semantic search"](https://learn.microsoft.com/azure/search/semantic-search-overview#enable-semantic-search) in your Cognitive Search service, and [create a semantic search configuration](https://learn.microsoft.com/azure/search/semantic-how-to-query-request#2---create-a-semantic-configuration) for your index. 34 | 35 | ## Using existing search indexes 36 | 37 | If an existing index has fields that align with what's needed by the retrieval plugin but just differ in names, you can map your fields to the plugin fields using the following environment variables: 38 | 39 | | Plugin field name | Environment variable to override it | 40 | | ----------------- | ----------------------------------- | 41 | | id | AZURESEARCH_FIELDS_ID | 42 | | text | AZURESEARCH_FIELDS_TEXT | 43 | | embedding | AZURESEARCH_FIELDS_EMBEDDING | 44 | | document_id | AZURESEARCH_FIELDS_DOCUMENT_ID | 45 | | source | AZURESEARCH_FIELDS_SOURCE | 46 | | source_id | AZURESEARCH_FIELDS_SOURCE_ID | 47 | | url | AZURESEARCH_FIELDS_URL | 48 | | created_at | AZURESEARCH_FIELDS_CREATED_AT | 49 | | author | AZURESEARCH_FIELDS_AUTHOR | 50 | -------------------------------------------------------------------------------- /docs/providers/chroma/setup.md: -------------------------------------------------------------------------------- 1 | [Chroma](https://trychroma.com) is an AI-native open-source embedding database designed to make it easy to work with embeddings. Chroma runs in-memory, or in a client-server setup. 2 | 3 | Install Chroma by running `pip install chromadb`. Once installed, the core API consists of four essential commands for creating collections, adding embeddings, documents, and metadata, and querying embeddings to find similar documents. Get started with Chroma by visiting the [Getting Started](https://docs.trychroma.com) page on their documentation website, or explore the open-source code on their [GitHub repository](https://github.com/chroma-core/chroma). 4 | 5 | **Chroma Environment Variables** 6 | 7 | To set up Chroma and start using it as your vector database provider, you need to define some environment variables to connect to your Chroma instance. 8 | 9 | **Chroma Datastore Environment Variables** 10 | 11 | Chroma runs _in-memory_ by default, with local persistence. It can also run in [self-hosted](https://docs.trychroma.com/usage-guide#running-chroma-in-clientserver-mode) client-server mode, with a fully managed hosted version coming soon. 12 | 13 | | Name | Required | Description | Default | 14 | | ------------------------ | -------- | -------------------------------------------------------------------------------------------------- | ---------------- | 15 | | `DATASTORE` | Yes | Datastore name. Set this to `chroma` | | 16 | | `BEARER_TOKEN` | Yes | Your secret token for authenticating requests to the API | | 17 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key for generating embeddings | | 18 | | `CHROMA_COLLECTION` | Optional | Your chosen Chroma collection name to store your embeddings | openaiembeddings | 19 | | `CHROMA_IN_MEMORY` | Optional | If set to `True`, ignore `CHROMA_HOST` and `CHROMA_PORT` and just use an in-memory Chroma instance | `True` | 20 | | `CHROMA_PERSISTENCE_DIR` | Optional | If set, and `CHROMA_IN_MEMORY` is set, persist to and load from this directory. 
| `openai` | 21 | 22 | To run Chroma in self-hosted client-server mode, set the following variables: 23 | 24 | | Name | Required | Description | Default | 25 | | ------------- | -------- | --------------------------------------------------- | ------------------ | 26 | | `CHROMA_HOST` | Optional | Your Chroma instance host address (see notes below) | `http://127.0.0.1` | 27 | | `CHROMA_PORT` | Optional | Your Chroma port number | `8000` | 28 | 29 | > For **self-hosted instances**, if your instance is not at 127.0.0.1:8000, set `CHROMA_HOST` and `CHROMA_PORT` accordingly. For example: `CHROMA_HOST=http://localhost/` and `CHROMA_PORT=8080`. 30 | -------------------------------------------------------------------------------- /docs/providers/elasticsearch/setup.md: -------------------------------------------------------------------------------- 1 | # Elasticsearch 2 | 3 | Elasticsearch is a search engine based on the Lucene library. It provides a distributed, full-text and vector search engine with an HTTP web interface and schema-free JSON documents. To use Elasticsearch as your vector database, start by [installing Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html) or signing up for a free trial of [Elastic Cloud](https://www.elastic.co/cloud/). 4 | 5 | The app will create an Elasticsearch index for you automatically when you run it for the first time. Just pick a name for your index and set it as an environment variable. 6 | 7 | **Environment Variables:** 8 | 9 | | Name | Required | Description | 10 | | --------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------- | 11 | | `DATASTORE` | Yes | Datastore name, set this to `elasticsearch` | 12 | | `BEARER_TOKEN` | Yes | Your secret token for authenticating requests to the API | 13 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key for generating embeddings with the OpenAI embeddings model | 14 | | `ELASTICSEARCH_INDEX` | Yes | Your chosen Elasticsearch index name. **Note:** Index name must consist of lowercase alphanumeric characters or '-' | 15 | 16 | **Connection Environment Variables:** 17 | Depending on your Elasticsearch setup, you may need to set one of the following environment variables to connect to your Elasticsearch instance. If you are using Elastic Cloud, you can connect via `ELASTICSEARCH_CLOUD_ID`. If you are using a local instance of Elasticsearch, you will need to set `ELASTICSEARCH_URL`. 18 | 19 | You can authenticate to Elasticsearch using either `ELASTICSEARCH_USERNAME` and `ELASTICSEARCH_PASSWORD` or `ELASTICSEARCH_API_KEY`. If you are using Elastic Cloud, you can find this in Kibana. 20 | 21 | | Name | Required | Description | 22 | | ------------------------ | -------- | --------------------------------------------------------------------------------------------------- | 23 | | `ELASTICSEARCH_URL` | Yes | Your Elasticsearch URL. If installed locally, this would be https://localhost:9200 | 24 | | `ELASTICSEARCH_CLOUD_ID` | Yes | Your cloud ID, linked to your deployment. This can be found in the deployment's console | 25 | | `ELASTICSEARCH_USERNAME` | Yes | Your username for authenticating requests to the API. Commonly 'elastic'. | 26 | | `ELASTICSEARCH_PASSWORD` | Yes | Your password for authenticating requests to the API | 27 | | `ELASTICSEARCH_API_KEY` | Yes | Alternatively, you can authenticate using an API key.
This can be created in Kibana stack management | 28 | 29 | ## Running Elasticsearch Integration Tests 30 | 31 | A suite of integration tests is available to verify the Elasticsearch integration. To run the tests, run the docker compose found in the `examples/docker/elasticsearch` folder with `docker-compose up`. This will start Elasticsearch in single-node mode with security disabled, listening on `http://localhost:9200`. 32 | 33 | Then, launch the test suite with this command: 34 | 35 | ```bash 36 | pytest ./tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py 37 | ``` 38 | -------------------------------------------------------------------------------- /docs/providers/llama/setup.md: -------------------------------------------------------------------------------- 1 | 2 | # LlamaIndex 3 | 4 | [LlamaIndex](https://github.com/jerryjliu/llama_index) is a central interface to connect your LLMs with external data. 5 | It provides a suite of in-memory indices over your unstructured and structured data for use with ChatGPT. 6 | Unlike standard vector databases, LlamaIndex supports a wide range of indexing strategies (e.g. tree, keyword table, knowledge graph) optimized for different use-cases. 7 | It is lightweight, easy to use, and requires no additional deployment. 8 | All you need to do is specify a few environment variables (and optionally point to an existing saved index JSON file). 9 | Note that metadata filters in queries are not yet supported. 10 | 11 | ## Setup 12 | Currently, LlamaIndex requires no additional deployment 13 | and runs as a part of the Retrieval Plugin. 14 | It is very easy to set up and great for quick prototyping 15 | with ChatGPT and your external data. 16 | 17 | **Retrieval App Environment Variables** 18 | 19 | | Name | Required | Description | 20 | |------------------|----------|-------------------------------------| 21 | | `DATASTORE` | Yes | Datastore name. Set this to `llama` | 22 | | `BEARER_TOKEN` | Yes | Your secret token | 23 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 24 | 25 | **Llama Datastore Environment Variables** 26 | 27 | | Name | Required | Description | Default | 28 | |--------------------------------|----------|--------------------------------------|---------------| 29 | | `LLAMA_INDEX_TYPE` | Optional | Index type (see below for details) | `simple_dict` | 30 | | `LLAMA_INDEX_JSON_PATH` | Optional | Path to saved index JSON file | None | 31 | | `LLAMA_QUERY_KWARGS_JSON_PATH` | Optional | Path to saved query kwargs JSON file | None | 32 | | `LLAMA_RESPONSE_MODE` | Optional | Response mode for query | `no_text` | 33 | 34 | 35 | **Different Index Types** 36 | By default, we use a `GPTVectorStoreIndex` to store document chunks in memory, 37 | and retrieve top-k nodes by embedding similarity. 38 | Different index types are optimized for different data and query use-cases. 39 | See this guide on [How Each Index Works](https://gpt-index.readthedocs.io/en/latest/guides/primer/index_guide.html) to learn more. 40 | You can configure the index type via the `LLAMA_INDEX_TYPE` variable; see [here](https://gpt-index.readthedocs.io/en/latest/reference/indices/composability_query.html#gpt_index.data_structs.struct_type.IndexStructType) for the full list of accepted index type identifiers. 41 | 42 | 43 | Read more details on [readthedocs](https://gpt-index.readthedocs.io/en/latest/), 44 | and engage with the community on [discord](https://discord.com/invite/dGcwcsnxhU).
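As a rough illustration of the configuration above, the following sketch selects the LlamaIndex datastore via the same environment variables and loads it through the plugin's datastore factory. It is not an official example and assumes you run it from the project root with the plugin's dependencies installed; the key values shown are placeholders.

```python
# Illustrative sketch: pick the LlamaIndex datastore via environment variables
# and load it through the plugin's datastore factory.
import asyncio
import os

os.environ["DATASTORE"] = "llama"
os.environ["OPENAI_API_KEY"] = "sk-..."          # placeholder, use your real key
os.environ["LLAMA_INDEX_TYPE"] = "simple_dict"   # optional, this is the default
# os.environ["LLAMA_INDEX_JSON_PATH"] = "/path/to/saved_index.json"  # optional

from datastore.factory import get_datastore


async def main() -> None:
    datastore = await get_datastore()
    print(f"Loaded datastore: {type(datastore).__name__}")


asyncio.run(main())
```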
45 | 46 | ## Running Tests 47 | You can launch the test suite with this command: 48 | 49 | ```bash 50 | pytest ./tests/datastore/providers/llama/test_llama_datastore.py 51 | ``` 52 | -------------------------------------------------------------------------------- /docs/providers/milvus/setup.md: -------------------------------------------------------------------------------- 1 | # Milvus 2 | 3 | [Milvus](https://milvus.io/) is the open-source, cloud-native vector database that scales to billions of vectors. It's the open-source version of Zilliz. It supports: 4 | 5 | - Various indexing algorithms and distance metrics 6 | - Scalar filtering and time travel searches 7 | - Rollback and snapshots 8 | - Multi-language SDKs 9 | - Storage and compute separation 10 | - Cloud scalability 11 | - A developer-first community with multi-language support 12 | 13 | Visit the [Github](https://github.com/milvus-io/milvus) to learn more. 14 | 15 | ## Deploying the Database 16 | 17 | You can deploy and manage Milvus using Docker Compose, Helm, K8's Operator, or Ansible. Follow the instructions [here](https://milvus.io/docs) to get started. 18 | 19 | **Environment Variables:** 20 | 21 | | Name | Required | Description | 22 | |----------------------------| -------- |----------------------------------------------------------------------------------------------------------------------------------------------| 23 | | `DATASTORE` | Yes | Datastore name, set to `milvus` | 24 | | `BEARER_TOKEN` | Yes | Your bearer token | 25 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 26 | | `MILVUS_COLLECTION` | Optional | Milvus collection name, defaults to a random UUID | 27 | | `MILVUS_HOST` | Optional | Milvus host IP, defaults to `localhost` | 28 | | `MILVUS_PORT` | Optional | Milvus port, defaults to `19530` | 29 | | `MILVUS_USER` | Optional | Milvus username if RBAC is enabled, defaults to `None` | 30 | | `MILVUS_PASSWORD` | Optional | Milvus password if required, defaults to `None` | 31 | | `MILVUS_INDEX_PARAMS` | Optional | Custom index options for the collection, defaults to `{"metric_type": "IP", "index_type": "HNSW", "params": {"M": 8, "efConstruction": 64}}` | 32 | | `MILVUS_SEARCH_PARAMS` | Optional | Custom search options for the collection, defaults to `{"metric_type": "IP", "params": {"ef": 10}}` | 33 | | `MILVUS_CONSISTENCY_LEVEL` | Optional | Data consistency level for the collection, defaults to `Bounded` | 34 | 35 | ## Running Milvus Integration Tests 36 | 37 | A suite of integration tests is available to verify the Milvus integration. To run the tests, run the milvus docker compose found in the examples folder. 38 | 39 | Then, launch the test suite with this command: 40 | 41 | ```bash 42 | pytest ./tests/datastore/providers/milvus/test_milvus_datastore.py 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/providers/pinecone/setup.md: -------------------------------------------------------------------------------- 1 | # Pinecone 2 | 3 | [Pinecone](https://www.pinecone.io) is a managed vector database built for speed, scale, and shipping to production sooner. To use Pinecone as your vector database provider, first get an API key by [signing up for an account](https://app.pinecone.io/). You can access your API key from the "API Keys" section in the sidebar of your dashboard. Pinecone also supports hybrid search and at the time of writing is the only datastore to support SPLADE sparse vectors natively. 
4 | 5 | A full Jupyter notebook walkthrough for the Pinecone flavor of the retrieval plugin can be found [here](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/examples/providers/pinecone/semantic-search.ipynb). There is also a [video walkthrough here](https://youtu.be/hpePPqKxNq8). 6 | 7 | The app will create a Pinecone index for you automatically when you run it for the first time. Just pick a name for your index and set it as an environment variable. 8 | 9 | **Environment Variables:** 10 | 11 | | Name | Required | Description | 12 | | ---------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------- | 13 | | `DATASTORE` | Yes | Datastore name, set this to `pinecone` | 14 | | `BEARER_TOKEN` | Yes | Your secret token for authenticating requests to the API | 15 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key for generating embeddings with one of the OpenAI embeddings models | 16 | | `PINECONE_API_KEY` | Yes | Your Pinecone API key, found in the [Pinecone console](https://app.pinecone.io/) | 17 | | `PINECONE_ENVIRONMENT` | Yes | Your Pinecone environment, found in the [Pinecone console](https://app.pinecone.io/), e.g. `us-west1-gcp`, `us-east-1-aws`, etc. | 18 | | `PINECONE_INDEX` | Yes | Your chosen Pinecone index name. **Note:** Index name must consist of lower case alphanumeric characters or '-' | 19 | 20 | If you want to create your own index with custom configurations, you can do so using the Pinecone SDK, API, or web interface ([see docs](https://docs.pinecone.io/docs/manage-indexes)). Make sure to use a dimensionality of 256 (or another dimension) for the embeddings and avoid indexing on the text field in the metadata, as this will reduce the performance significantly. 21 | 22 | ```python 23 | # Creating index with Pinecone SDK - use only if you wish to create the index manually. 24 | 25 | import os, pinecone 26 | 27 | pinecone.init(api_key=os.environ['PINECONE_API_KEY'], 28 | environment=os.environ['PINECONE_ENVIRONMENT']) 29 | 30 | EMBEDDING_DIMENSION = int(os.environ.get("EMBEDDING_DIMENSION", 256)) 31 | 32 | pinecone.create_index(name=os.environ['PINECONE_INDEX'], 33 | dimension=EMBEDDING_DIMENSION, 34 | metric='cosine', 35 | metadata_config={ 36 | "indexed": ['source', 'source_id', 'url', 'created_at', 'author', 'document_id']}) 37 | ``` 38 | -------------------------------------------------------------------------------- /docs/providers/postgres/setup.md: -------------------------------------------------------------------------------- 1 | # Postgres 2 | 3 | Postgres Database offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension. To use pgvector, you will need to set up a PostgreSQL database with the pgvector extension enabled or use a managed solution that provides pgvector. For a hosted/managed solution, you can use any of the cloud vendors which support [pgvector](https://github.com/pgvector/pgvector#hosted-postgres). 4 | 5 | - The database needs the `pgvector` extension. 6 | - To apply required migrations you may use any tool you are more familiar with like [pgAdmin](https://www.pgadmin.org/), [DBeaver](https://dbeaver.io/), [DataGrip](https://www.jetbrains.com/datagrip/), or `psql` cli. 7 | 8 | **Retrieval App Environment Variables** 9 | 10 | | Name | Required | Description | 11 | | ---------------- | -------- | -------------------------------------- | 12 | | `DATASTORE` | Yes | Datastore name. 
Set this to `postgres` | 13 | | `BEARER_TOKEN` | Yes | Your secret token | 14 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 15 | 16 | **Postgres Datastore Environment Variables** 17 | 18 | | Name | Required | Description | Default | 19 | | ------------- | -------- | ----------------- | ---------- | 20 | | `PG_HOST` | Optional | Postgres host | localhost | 21 | | `PG_PORT` | Optional | Postgres port | `5432` | 22 | | `PG_PASSWORD` | Optional | Postgres password | `postgres` | 23 | | `PG_USER` | Optional | Postgres username | `postgres` | 24 | | `PG_DB` | Optional | Postgres database | `postgres` | 25 | 26 | ## Postgres Datastore local development & testing 27 | 28 | In order to test your changes to the Postgres Datastore, you can run the following: 29 | 30 | 1. You can run a local or self-hosted instance of PostgreSQL with `pgvector` enabled using Docker. 31 | 32 | ```bash 33 | docker pull ankane/pgvector 34 | ``` 35 | 36 | ```bash 37 | docker run --name pgvector -e POSTGRES_PASSWORD=mysecretpassword -p 5432:5432 -d ankane/pgvector 38 | ``` 39 | 40 | Check the PostgreSQL [official docker image](https://github.com/docker-library/docs/blob/master/postgres/README.md) for more options. 41 | 42 | 2. Apply migrations using any tool you like, such as [pgAdmin](https://www.pgadmin.org/), [DBeaver](https://dbeaver.io/), [DataGrip](https://www.jetbrains.com/datagrip/), or the `psql` CLI. 43 | 44 | ```bash 45 | # apply migrations using psql cli 46 | psql -h localhost -p 5432 -U postgres -d postgres -f examples/providers/supabase/migrations/20230414142107_init_pg_vector.sql 47 | ``` 48 | 49 | 3. Export the environment variables required for the Postgres Datastore 50 | 51 | ```bash 52 | export PG_HOST=localhost 53 | export PG_PORT=5432 54 | export PG_PASSWORD=mysecretpassword 55 | ``` 56 | 57 | 4. Run the Postgres datastore tests from the project's root directory 58 | 59 | ```bash 60 | # Run the Postgres datastore tests 61 | # go to project's root directory and run 62 | poetry run pytest -s ./tests/datastore/providers/postgres/test_postgres_datastore.py 63 | ``` 64 | 65 | 5. Before going to production, don't forget to set the password for the `postgres` user to something more secure and to apply the migrations. 66 | 67 | 6. You may want to remove RLS (Row Level Security) from the `documents` table. RLS is not required for this setup, but it may be useful if you want to separate documents by user or group of users, or if you want to grant different users permission to insert or query documents. RLS is especially important if you plan to use PostgREST. To remove it, delete the following statement from the `20230414142107_init_pg_vector.sql` migration file: `alter table documents enable row level security;`. 68 | 69 | ## Indexes for Postgres 70 | 71 | By default, pgvector performs exact nearest neighbor search. To speed up the vector comparison, you may want to create indexes for the `embedding` column in the `documents` table. You should do this **only** after a few thousand records are inserted. 72 | 73 | As the datastore uses inner product for similarity search, you can add an index as follows: 74 | 75 | ```sql 76 | create index on documents using ivfflat (embedding vector_ip_ops) with (lists = 100); 77 | ``` 78 | 79 | To choose the `lists` constant, a good place to start is records / 1000 for up to 1M records and sqrt(records) for more than 1M records. 80 | 81 | For more information about indexes, see [pgvector docs](https://github.com/pgvector/pgvector#indexing).
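As a small illustration of that heuristic (not part of the repository's tooling), the sketch below computes a starting value for `lists` from the number of rows in the `documents` table:

```python
import math


def suggested_ivfflat_lists(num_records: int) -> int:
    """Starting point for the ivfflat `lists` parameter, following the guidance
    above: records / 1000 up to 1M records, sqrt(records) beyond that."""
    if num_records <= 1_000_000:
        return max(1, num_records // 1000)
    return max(1, math.isqrt(num_records))


# e.g. 200_000 rows -> 200 lists, 4_000_000 rows -> 2000 lists
print(suggested_ivfflat_lists(200_000), suggested_ivfflat_lists(4_000_000))
```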
82 | -------------------------------------------------------------------------------- /docs/providers/qdrant/setup.md: -------------------------------------------------------------------------------- 1 | # Qdrant 2 | 3 | [Qdrant](https://qdrant.tech/) is a vector database that can store documents and vector embeddings. It can run as a self-hosted version or a managed [Qdrant Cloud](https://cloud.qdrant.io/) 4 | solution. The configuration is almost identical for both options, except for the API key that [Qdrant Cloud](https://cloud.qdrant.io/) provides. 5 | 6 | **Environment Variables:** 7 | 8 | | Name | Required | Description | Default | 9 | | ------------------- | -------- | ----------------------------------------------------------- | ------------------ | 10 | | `DATASTORE` | Yes | Datastore name, set to `qdrant` | | 11 | | `BEARER_TOKEN` | Yes | Secret token | | 12 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 13 | | `QDRANT_URL` | Yes | Qdrant instance URL | `http://localhost` | 14 | | `QDRANT_PORT` | Optional | TCP port for Qdrant HTTP communication | `6333` | 15 | | `QDRANT_GRPC_PORT` | Optional | TCP port for Qdrant GRPC communication | `6334` | 16 | | `QDRANT_API_KEY` | Optional | Qdrant API key for [Qdrant Cloud](https://cloud.qdrant.io/) | | 17 | | `QDRANT_COLLECTION` | Optional | Qdrant collection name | `document_chunks` | 18 | 19 | ## Qdrant Cloud 20 | 21 | For a hosted [Qdrant Cloud](https://cloud.qdrant.io/) version, provide the Qdrant instance 22 | URL and the API key from the [Qdrant Cloud UI](https://cloud.qdrant.io/). 23 | 24 | **Example:** 25 | 26 | ```bash 27 | QDRANT_URL="https://YOUR-CLUSTER-URL.aws.cloud.qdrant.io" 28 | QDRANT_API_KEY="" 29 | ``` 30 | 31 | The other parameters are optional and can be changed if needed. 32 | 33 | ## Self-hosted Qdrant Instance 34 | 35 | For a self-hosted version, use Docker containers or the official Helm chart for deployment. The only 36 | required parameter is the `QDRANT_URL` that points to the Qdrant server URL. 37 | 38 | **Example:** 39 | 40 | ```bash 41 | QDRANT_URL="http://YOUR_HOST.example.com:6333" 42 | ``` 43 | 44 | The other parameters are optional and can be changed if needed. 45 | 46 | ## Running Qdrant Integration Tests 47 | 48 | A suite of integration tests verifies the Qdrant integration. To run it, start a local Qdrant instance in a Docker container. 49 | 50 | ```bash 51 | docker run -p "6333:6333" -p "6334:6334" qdrant/qdrant:v1.0.3 52 | ``` 53 | 54 | Then, launch the test suite with this command: 55 | 56 | ```bash 57 | pytest ./tests/datastore/providers/qdrant/test_qdrant_datastore.py 58 | ``` 59 | -------------------------------------------------------------------------------- /docs/providers/redis/setup.md: -------------------------------------------------------------------------------- 1 | # Redis 2 | 3 | [Redis](https://redis.com/solutions/use-cases/vector-database/) is a real-time data platform that supports a variety of use cases for everyday applications as well as AI/ML workloads. Use Redis as a low-latency vector engine by creating a Redis database with the [Redis Stack docker container](/examples/docker/redis/docker-compose.yml). For a hosted/managed solution, try [Redis Cloud](https://app.redislabs.com/#/). See more helpful examples of Redis as a vector database [here](https://github.com/RedisVentures/redis-ai-resources). 4 | 5 | - The database **needs the RediSearch module (>=v2.6) and RedisJSON**, which are included in the self-hosted docker compose above. 
6 | - Run the App with the Redis docker image: `docker compose up -d` in [this dir](/examples/docker/redis/). 7 | - The app automatically creates a Redis vector search index on the first run. Optionally, create a custom index with a specific name and set it as an environment variable (see below). 8 | - To enable more hybrid searching capabilities, adjust the document schema [here](/datastore/providers/redis_datastore.py). 9 | 10 | **Environment Variables:** 11 | 12 | | Name | Required | Description | Default | 13 | | ----------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------- | ----------- | 14 | | `DATASTORE` | Yes | Datastore name, set to `redis` | | 15 | | `BEARER_TOKEN` | Yes | Secret token | | 16 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 17 | | `REDIS_HOST` | Optional | Redis host url | `localhost` | 18 | | `REDIS_PORT` | Optional | Redis port | `6379` | 19 | | `REDIS_PASSWORD` | Optional | Redis password | none | 20 | | `REDIS_INDEX_NAME` | Optional | Redis vector index name | `index` | 21 | | `REDIS_DOC_PREFIX` | Optional | Redis key prefix for the index | `doc` | 22 | | `REDIS_DISTANCE_METRIC` | Optional | Vector similarity distance metric | `COSINE` | 23 | | `REDIS_INDEX_TYPE` | Optional | [Vector index algorithm type](https://redis.io/docs/stack/search/reference/vectors/#creation-attributes-per-algorithm) | `FLAT` | 24 | 25 | 26 | ## Redis Datastore development & testing 27 | In order to test your changes to the Redis Datastore, you can run the following commands: 28 | 29 | ```bash 30 | # Run the Redis stack docker image 31 | docker run -it --rm -p 6379:6379 redis/redis-stack-server:latest 32 | ``` 33 | 34 | ```bash 35 | # Run the Redis datastore tests 36 | poetry run pytest -s ./tests/datastore/providers/redis/test_redis_datastore.py 37 | ``` -------------------------------------------------------------------------------- /docs/providers/supabase/setup.md: -------------------------------------------------------------------------------- 1 | # Supabase 2 | 3 | [Supabase](https://supabase.com/blog/openai-embeddings-postgres-vector) offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension for Postgres Database. [You can use Supabase CLI](https://github.com/supabase/cli) to set up a whole Supabase stack locally or in the cloud or you can also use docker-compose, k8s and other options available. For a hosted/managed solution, try [Supabase.com](https://supabase.com/) and unlock the full power of Postgres with built-in authentication, storage, auto APIs, and Realtime features. See more helpful examples of Supabase & pgvector as a vector database [here](https://github.com/supabase-community/nextjs-openai-doc-search). 4 | 5 | - The database needs the `pgvector` extension, which is included in [Supabase distribution of Postgres](https://github.com/supabase/postgres). 6 | - It is possible to provide a Postgres connection string and an app will add `documents` table, query Postgres function, and `pgvector` extension automatically. 7 | - But it is recommended to separate the migration process from an app. And execute the migration script in a different pipeline by using SQL statements from `_init_db()` function in [Supabase datastore provider](/datastore/providers/supabase_datastore.py). 
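Once the migration that creates the `documents` table has been applied, a short connectivity check with the Supabase Python client can confirm the table is reachable. This is an illustrative sketch only; it assumes a running Supabase stack and uses the datastore environment variables described below.

```python
# Illustrative connectivity check, not part of the plugin itself.
import os

from supabase import create_client

client = create_client(
    os.environ["SUPABASE_URL"],               # e.g. http://localhost:54321
    os.environ["SUPABASE_SERVICE_ROLE_KEY"],  # or SUPABASE_ANON_KEY
)

# On a fresh database this should return an empty result rather than raise.
result = client.table("documents").select("id").limit(1).execute()
print(result.data)
```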
8 | 9 | **Retrieval App Environment Variables** 10 | 11 | | Name | Required | Description | 12 | | ---------------- | -------- | -------------------------------------- | 13 | | `DATASTORE` | Yes | Datastore name. Set this to `supabase` | 14 | | `BEARER_TOKEN` | Yes | Your secret token | 15 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 16 | 17 | **Supabase Datastore Environment Variables** 18 | 19 | | Name | Required | Description | Default | 20 | | --------------------------- | -------- | ------------------------------------------------------------------------------ | ------- | 21 | | `SUPABASE_URL` | Yes | Supabase Project URL | | 22 | | `SUPABASE_ANON_KEY` | Optional | Supabase Project API anon key | | 23 | | `SUPABASE_SERVICE_ROLE_KEY` | Optional | Supabase Project API service key, will be used if provided instead of anon key | | 24 | 25 | ## Supabase Datastore local development & testing 26 | 27 | In order to test your changes to the Supabase Datastore, you can run the following commands: 28 | 29 | 1. Install [Supabase CLI](https://github.com/supabase/cli) and [Docker](https://docs.docker.com/get-docker/) 30 | 31 | 2. Run the Supabase `start` command from `examples/providers` directory. Config for Supabase local setup is available in `examples/providers/supabase` directory with required migrations. 32 | 33 | ```bash 34 | # Run the Supabase stack using cli in docker 35 | # go to examples/providers and run supabase start 36 | cd examples/providers 37 | supabase start 38 | ``` 39 | 40 | 3. Supabase `start` will download docker images and launch Supabase stack locally. You will see similar output: 41 | 42 | ```bash 43 | Applying migration 20230414142107_init_pg_vector.sql... 44 | Seeding data supabase/seed.sql... 45 | Started supabase local development setup. 46 | 47 | API URL: http://localhost:54321 48 | DB URL: postgresql://postgres:postgres@localhost:54322/postgres 49 | Studio URL: http://localhost:54323 50 | Inbucket URL: http://localhost:54324 51 | JWT secret: super-secret-jwt-token-with-at-least-32-characters-long 52 | anon key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0 53 | service_role key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU 54 | ``` 55 | 56 | 4. Export environment variables required for the Supabase Datastore 57 | 58 | ```bash 59 | export SUPABASE_URL=http://localhost:54321 60 | export SUPABASE_SERVICE_ROLE_KEY='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU' 61 | ``` 62 | 63 | 5. Run the Supabase datastore tests from the project's root directory 64 | 65 | ```bash 66 | # Run the Supabase datastore tests 67 | # go to project's root directory and run 68 | poetry run pytest -s ./tests/datastore/providers/supabase/test_supabase_datastore.py 69 | ``` 70 | 71 | 6. When you go to prod (if cloud hosted) it is recommended to link your supabase project with the local setup from `examples/providers/supabase`. All migrations will be synced with the cloud project after you run `supabase db push`. Or you can manually apply migrations from `examples/providers/supabase/migrations` directory. 72 | 73 | 7. You might want to add RLS policies to the `documents` table. 
Or you can just continue using it on the server side only with the service role key. But you should not use service role key on the client side in any case. 74 | 75 | ## Indexes for Postgres 76 | 77 | By default, pgvector performs exact nearest neighbor search. To speed up the vector comparison, you may want to create indexes for the `embedding` column in the `documents` table. You should do this **only** after a few thousand records are inserted. 78 | 79 | As datasotre is using inner product for similarity search, you can add index as follows: 80 | 81 | ```sql 82 | create index on documents using ivfflat (embedding vector_ip_ops) with (lists = 100); 83 | ``` 84 | 85 | To choose `lists` constant - a good place to start is records / 1000 for up to 1M records and sqrt(records) for over 1M records 86 | 87 | For more information about indexes, see [pgvector docs](https://github.com/pgvector/pgvector#indexing). 88 | -------------------------------------------------------------------------------- /docs/providers/weaviate/setup.md: -------------------------------------------------------------------------------- 1 | # Weaviate 2 | 3 | ## Set up a Weaviate Instance 4 | 5 | [Weaviate](https://weaviate.io/) is an open-source vector search engine designed to scale seamlessly into billions of data objects. This implementation supports hybrid search out-of-the-box (meaning it will perform better for keyword searches). 6 | 7 | You can run Weaviate in 4 ways: 8 | 9 | - **SaaS** – with [Weaviate Cloud Services (WCS)](https://weaviate.io/pricing). 10 | 11 | WCS is a fully managed service that takes care of hosting, scaling, and updating your Weaviate instance. You can try it out for free with a sandbox that lasts for 30 days. 12 | 13 | To set up a SaaS Weaviate instance with WCS: 14 | 15 | 1. Navigate to [Weaviate Cloud Console](https://console.weaviate.io/). 16 | 2. Register or sign in to your WCS account. 17 | 3. Create a new cluster with the following settings: 18 | - `Name` – a unique name for your cluster. The name will become part of the URL used to access this instance. 19 | - `Subscription Tier` – Sandbox for a free trial, or contact [hello@weaviate.io](mailto:hello@weaviate.io) for other options. 20 | - `Weaviate Version` - The latest version by default. 21 | - `OIDC Authentication` – Enabled by default. This requires a username and password to access your instance. 22 | 4. Wait for a few minutes until your cluster is ready. You will see a green tick ✔️ when it's done. Copy your cluster URL. 23 | 24 | - **Hybrid SaaS** 25 | 26 | > If you need to keep your data on-premise for security or compliance reasons, Weaviate also offers a Hybrid SaaS option: Weaviate runs within your cloud instances, but the cluster is managed remotely by Weaviate. This gives you the benefits of a managed service without sending data to an external party. 27 | 28 | The Weaviate Hybrid SaaS is a custom solution. If you are interested in this option, please reach out to [hello@weaviate.io](mailto:hello@weaviate.io). 29 | 30 | - **Self-hosted** – with a Docker container 31 | 32 | To set up a Weaviate instance with Docker: 33 | 34 | 1. [Install Docker](https://docs.docker.com/engine/install/) on your local machine if it is not already installed. 35 | 2. [Install the Docker Compose Plugin](https://docs.docker.com/compose/install/) 36 | 3. 
Download a `docker-compose.yml` file with this `curl` command: 37 | 38 | ``` 39 | curl -o docker-compose.yml "https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?modules=standalone&runtime=docker-compose&weaviate_version=v1.18.0" 40 | ``` 41 | 42 | Alternatively, you can use Weaviate's docker compose [configuration tool](https://weaviate.io/developers/weaviate/installation/docker-compose) to generate your own `docker-compose.yml` file. 43 | 44 | 4. Run `docker compose up -d` to spin up a Weaviate instance. 45 | 46 | > To shut it down, run `docker compose down`. 47 | 48 | - **Self-hosted** – with a Kubernetes cluster 49 | 50 | To configure a self-hosted instance with Kubernetes, follow Weaviate's [documentation](https://weaviate.io/developers/weaviate/installation/kubernetes). 51 | 52 | ## Configure Weaviate Environment Variables 53 | 54 | You need to set some environment variables to connect to your Weaviate instance. 55 | 56 | **Retrieval App Environment Variables** 57 | 58 | | Name | Required | Description | 59 | | ---------------- | -------- |--------------------------------------------------------------------------------------| 60 | | `DATASTORE` | Yes | Datastore name. Set this to `weaviate` | 61 | | `BEARER_TOKEN` | Yes | Your [secret token](/README.md#general-environment-variables) (not the Weaviate one) | 62 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 63 | 64 | **Weaviate Datastore Environment Variables** 65 | 66 | | Name | Required | Description | Default | 67 | |------------------| -------- | ------------------------------------------------------------------ | ------------------ | 68 | | `WEAVIATE_URL` | Optional | Your weaviate instance's url/WCS endpoint | `http://localhost:8080` | | 69 | | `WEAVIATE_CLASS` | Optional | Your chosen Weaviate class/collection name to store your documents | OpenAIDocument | 70 | 71 | **Weaviate Auth Environment Variables** 72 | 73 | If using WCS instances, set the following environment variables: 74 | 75 | | Name | Required | Description | 76 | | ------------------- | -------- | ------------------------------ | 77 | | `WEAVIATE_API_KEY` | Yes | Your API key WCS | 78 | 79 | Learn more about accessing your [WCS API key](https://weaviate.io/developers/wcs/guides/authentication#access-api-keys). -------------------------------------------------------------------------------- /docs/providers/zilliz/setup.md: -------------------------------------------------------------------------------- 1 | # Zilliz 2 | 3 | [Zilliz](https://zilliz.com) is a managed cloud-native vector database designed for the billion scale. Zilliz offers many key features, such as: 4 | 5 | - Multiple indexing algorithms 6 | - Multiple distance metrics 7 | - Scalar filtering 8 | - Time travel searches 9 | - Rollback and with snapshots 10 | - Full RBAC 11 | - 99.9% uptime 12 | - Separated storage and compute 13 | - Multi-language SDK's 14 | 15 | Find more information [here](https://zilliz.com). 16 | 17 | **Self Hosted vs SaaS** 18 | 19 | Zilliz is a SaaS database, but offers an open-source solution, Milvus. Both options offer fast searches at the billion scale, but Zilliz handles data management for you. It automatically scales compute and storage resources and creates optimal indexes for your data. See the comparison [here](https://zilliz.com/doc/about_zilliz_cloud). 20 | 21 | ## Deploying the Database 22 | 23 | Zilliz Cloud is deployable in a few simple steps. First, create an account [here](https://cloud.zilliz.com/signup). 
Once you have an account set up, follow the guide [here](https://zilliz.com/doc/quick_start) to set up a database and get the parameters needed for this application. 24 | 25 | Environment Variables: 26 | 27 | | Name | Required | Description | 28 | |----------------------------| -------- |------------------------------------------------------------------| 29 | | `DATASTORE` | Yes | Datastore name, set to `zilliz` | 30 | | `BEARER_TOKEN` | Yes | Your secret token | 31 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 32 | | `ZILLIZ_COLLECTION` | Optional | Zilliz collection name. Defaults to a random UUID | 33 | | `ZILLIZ_URI` | Yes | URI for the Zilliz instance | 34 | | `ZILLIZ_USER` | Yes | Zilliz username | 35 | | `ZILLIZ_PASSWORD` | Yes | Zilliz password | 36 | | `ZILLIZ_CONSISTENCY_LEVEL` | Optional | Data consistency level for the collection, defaults to `Bounded` | 37 | 38 | ## Running Zilliz Integration Tests 39 | 40 | A suite of integration tests is available to verify the Zilliz integration. To run the tests, create a Zilliz database and update the environment variables. 41 | 42 | Then, launch the test suite with this command: 43 | 44 | ```bash 45 | pytest ./tests/datastore/providers/zilliz/test_zilliz_datastore.py 46 | ``` 47 | -------------------------------------------------------------------------------- /examples/authentication-methods/no-auth/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "none" 9 | }, 10 | "api": { 11 | "type": "openapi", 12 | "url": "https://your-app-url.com/.well-known/openapi.yaml" 13 | }, 14 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 15 | "contact_email": "hello@contact.com", 16 | "legal_info_url": "hello@legal.com" 17 | } 18 | 19 | -------------------------------------------------------------------------------- /examples/authentication-methods/no-auth/main.py: -------------------------------------------------------------------------------- 1 | # This is a version of the main.py file found in ../../../server/main.py without authentication. 2 | # Copy and paste this into the main file at ../../../server/main.py if you choose to use no authentication for your retrieval plugin. 
3 | from typing import Optional 4 | import uvicorn 5 | from fastapi import FastAPI, File, Form, HTTPException, Body, UploadFile 6 | from fastapi.staticfiles import StaticFiles 7 | from loguru import logger 8 | 9 | from models.api import ( 10 | DeleteRequest, 11 | DeleteResponse, 12 | QueryRequest, 13 | QueryResponse, 14 | UpsertRequest, 15 | UpsertResponse, 16 | ) 17 | from datastore.factory import get_datastore 18 | from services.file import get_document_from_file 19 | 20 | from models.models import DocumentMetadata, Source 21 | 22 | 23 | app = FastAPI() 24 | app.mount("/.well-known", StaticFiles(directory=".well-known"), name="static") 25 | 26 | # Create a sub-application, in order to access just the query endpoints in the OpenAPI schema, found at http://0.0.0.0:8000/sub/openapi.json when the app is running locally 27 | sub_app = FastAPI( 28 | title="Retrieval Plugin API", 29 | description="A retrieval API for querying and filtering documents based on natural language queries and metadata", 30 | version="1.0.0", 31 | servers=[{"url": "https://your-app-url.com"}], 32 | ) 33 | app.mount("/sub", sub_app) 34 | 35 | 36 | @app.post( 37 | "/upsert-file", 38 | response_model=UpsertResponse, 39 | ) 40 | async def upsert_file( 41 | file: UploadFile = File(...), 42 | metadata: Optional[str] = Form(None), 43 | ): 44 | try: 45 | metadata_obj = ( 46 | DocumentMetadata.parse_raw(metadata) 47 | if metadata 48 | else DocumentMetadata(source=Source.file) 49 | ) 50 | except: 51 | metadata_obj = DocumentMetadata(source=Source.file) 52 | 53 | document = await get_document_from_file(file, metadata_obj) 54 | 55 | try: 56 | ids = await datastore.upsert([document]) 57 | return UpsertResponse(ids=ids) 58 | except Exception as e: 59 | logger.error(e) 60 | raise HTTPException(status_code=500, detail=f"str({e})") 61 | 62 | 63 | @app.post( 64 | "/upsert", 65 | response_model=UpsertResponse, 66 | ) 67 | async def upsert( 68 | request: UpsertRequest = Body(...), 69 | ): 70 | try: 71 | ids = await datastore.upsert(request.documents) 72 | return UpsertResponse(ids=ids) 73 | except Exception as e: 74 | logger.error(e) 75 | raise HTTPException(status_code=500, detail="Internal Service Error") 76 | 77 | 78 | @app.post( 79 | "/query", 80 | response_model=QueryResponse, 81 | ) 82 | async def query_main( 83 | request: QueryRequest = Body(...), 84 | ): 85 | try: 86 | results = await datastore.query( 87 | request.queries, 88 | ) 89 | return QueryResponse(results=results) 90 | except Exception as e: 91 | logger.error(e) 92 | raise HTTPException(status_code=500, detail="Internal Service Error") 93 | 94 | 95 | @sub_app.post( 96 | "/query", 97 | response_model=QueryResponse, 98 | description="Accepts search query objects with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. 
Split queries if ResponseTooLargeError occurs.", 99 | ) 100 | async def query( 101 | request: QueryRequest = Body(...), 102 | ): 103 | try: 104 | results = await datastore.query( 105 | request.queries, 106 | ) 107 | return QueryResponse(results=results) 108 | except Exception as e: 109 | logger.error(e) 110 | raise HTTPException(status_code=500, detail="Internal Service Error") 111 | 112 | 113 | @app.delete( 114 | "/delete", 115 | response_model=DeleteResponse, 116 | ) 117 | async def delete( 118 | request: DeleteRequest = Body(...), 119 | ): 120 | if not (request.ids or request.filter or request.delete_all): 121 | raise HTTPException( 122 | status_code=400, 123 | detail="One of ids, filter, or delete_all is required", 124 | ) 125 | try: 126 | success = await datastore.delete( 127 | ids=request.ids, 128 | filter=request.filter, 129 | delete_all=request.delete_all, 130 | ) 131 | return DeleteResponse(success=success) 132 | except Exception as e: 133 | logger.error(e) 134 | raise HTTPException(status_code=500, detail="Internal Service Error") 135 | 136 | 137 | @app.on_event("startup") 138 | async def startup(): 139 | global datastore 140 | datastore = await get_datastore() 141 | 142 | 143 | def start(): 144 | uvicorn.run("server.main:app", host="0.0.0.0", port=8000, reload=True) 145 | -------------------------------------------------------------------------------- /examples/authentication-methods/oauth/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth" : { 8 | "type":"oauth", 9 | "client_url":"e.g. https:///oauth/v2/authorize", 10 | "authorization_url":"e.g. https:///api/oauth.v2.access", 11 | "scope":"search:read", 12 | "authorization_content_type":"application/x-www-form-urlencoded", 13 | "verification_tokens":{ 14 | "openai":"" 15 | } 16 | }, 17 | "api":{ 18 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 19 | "has_user_authentication":true, 20 | "type":"openapi" 21 | }, 22 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 23 | "contact_email": "hello@contact.com", 24 | "legal_info_url": "hello@legal.com" 25 | } 26 | -------------------------------------------------------------------------------- /examples/authentication-methods/service-http/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. 
Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth":{ 8 | "type":"service_http", 9 | "authorization_type":"bearer", 10 | "verification_tokens":{ 11 | "openai":"" 12 | } 13 | }, 14 | "api":{ 15 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 16 | "has_user_authentication":false, 17 | "type":"openapi" 18 | }, 19 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 20 | "contact_email": "hello@contact.com", 21 | "legal_info_url": "hello@legal.com" 22 | } 23 | -------------------------------------------------------------------------------- /examples/authentication-methods/user-http/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "user_http", 9 | "authorization_type": "bearer" 10 | }, 11 | "api": { 12 | "type": "openapi", 13 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 14 | "has_user_authentication": false 15 | }, 16 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 17 | "contact_email": "hello@contact.com", 18 | "legal_info_url": "hello@legal.com" 19 | } -------------------------------------------------------------------------------- /examples/docker/elasticsearch/README.md: -------------------------------------------------------------------------------- 1 | ## Running Elasticsearch 2 | 3 | ```bash 4 | docker-compose up -d 5 | ``` 6 | 7 | should now be running at http://localhost:9200 8 | -------------------------------------------------------------------------------- /examples/docker/elasticsearch/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | elasticsearch: 5 | image: docker.elastic.co/elasticsearch/elasticsearch:8.8.2 6 | container_name: elasticsearch 7 | environment: 8 | - discovery.type=single-node 9 | - node.name=elasticsearch 10 | - xpack.security.enabled=false 11 | ulimits: 12 | memlock: 13 | soft: -1 14 | hard: -1 15 | ports: 16 | - "9200:9200" 17 | networks: 18 | - esnet 19 | volumes: 20 | - esdata:/usr/share/elasticsearch/data 21 | 22 | networks: 23 | esnet: 24 | 25 | volumes: 26 | esdata: 27 | driver: local 28 | -------------------------------------------------------------------------------- /examples/docker/milvus/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.5' 2 | 3 | services: 4 | etcd: 5 | container_name: milvus-etcd 6 | image: quay.io/coreos/etcd:v3.5.0 7 | environment: 8 | - ETCD_AUTO_COMPACTION_MODE=revision 9 | - ETCD_AUTO_COMPACTION_RETENTION=1000 10 | - ETCD_QUOTA_BACKEND_BYTES=4294967296 11 | - ETCD_SNAPSHOT_COUNT=50000 12 | volumes: 13 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd 14 | command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd 15 | 16 | minio: 17 | container_name: milvus-minio 18 | image: minio/minio:RELEASE.2023-03-20T20-16-18Z 19 | environment: 20 | 
MINIO_ACCESS_KEY: minioadmin 21 | MINIO_SECRET_KEY: minioadmin 22 | volumes: 23 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data 24 | command: minio server /minio_data 25 | healthcheck: 26 | test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] 27 | interval: 30s 28 | timeout: 20s 29 | retries: 3 30 | 31 | standalone: 32 | container_name: milvus-standalone 33 | image: milvusdb/milvus:v2.2.5 34 | command: ["milvus", "run", "standalone"] 35 | environment: 36 | ETCD_ENDPOINTS: etcd:2379 37 | MINIO_ADDRESS: minio:9000 38 | volumes: 39 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus 40 | ports: 41 | - "19530:19530" 42 | - "9091:9091" 43 | depends_on: 44 | - "etcd" 45 | - "minio" 46 | 47 | networks: 48 | default: 49 | name: milvus -------------------------------------------------------------------------------- /examples/docker/qdrant/README.md: -------------------------------------------------------------------------------- 1 | # Running the Retrieval Plugin with Qdrant in Docker Containers 2 | 3 | To set up the ChatGPT retrieval plugin with a single instance of a Qdrant vector database, follow these steps: 4 | 5 | ## Set Environment Variables 6 | 7 | Set the following environment variables: 8 | 9 | ```bash 10 | # Provide your own OpenAI API key in order to start. 11 | export OPENAI_API_KEY="" 12 | # This is an example of a minimal token generated by https://jwt.io/ 13 | export BEARER_TOKEN="" 14 | ``` 15 | 16 | ## Run Qdrant and the Retrieval Plugin in Docker Containers 17 | 18 | Both Docker containers might be launched with docker-compose: 19 | 20 | ```bash 21 | docker-compose up -d 22 | ``` 23 | 24 | ## Store the Documents 25 | 26 | Store an initial batch of documents by calling the `/upsert` endpoint: 27 | 28 | ```bash 29 | curl -X POST \ 30 | -H "Content-type: application/json" \ 31 | -H "Authorization: Bearer $BEARER_TOKEN" \ 32 | --data-binary '@documents.json' \ 33 | "http://localhost:80/upsert" 34 | ``` 35 | 36 | ## Send a Test Query 37 | 38 | You can query Qdrant to find relevant document chunks by calling the `/query` endpoint: 39 | 40 | ```bash 41 | curl -X POST \ 42 | -H "Content-type: application/json" \ 43 | -H "Authorization: Bearer $BEARER_TOKEN" \ 44 | --data-binary '@queries.json' \ 45 | "http://localhost:80/query" 46 | ``` 47 | -------------------------------------------------------------------------------- /examples/docker/qdrant/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | retrieval-app: 3 | build: 4 | context: ../../../ 5 | dockerfile: Dockerfile 6 | image: openai/chatgpt-retrieval-plugin 7 | ports: 8 | - "80:80" 9 | depends_on: 10 | - qdrant 11 | environment: 12 | DATASTORE: "qdrant" 13 | QDRANT_URL: "http://qdrant" 14 | BEARER_TOKEN: "${BEARER_TOKEN}" 15 | OPENAI_API_KEY: "${OPENAI_API_KEY}" 16 | qdrant: 17 | image: qdrant/qdrant:v1.0.3 -------------------------------------------------------------------------------- /examples/docker/qdrant/documents.json: -------------------------------------------------------------------------------- 1 | { 2 | "documents": [ 3 | { 4 | "id": "openai", 5 | "text": "OpenAI is an AI research and deployment company. 
Our mission is to ensure that artificial general intelligence benefits all of humanity.", 6 | "metadata": { 7 | "created_at": "2023-03-14" 8 | } 9 | }, 10 | { 11 | "id": "chatgpt", 12 | "text": "ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response. The dialogue format makes it possible for ChatGPT to answer followup questions, admit its mistakes, challenge incorrect premises, and reject inappropriate requests." 13 | }, 14 | { 15 | "id": "qdrant", 16 | "text": "Qdrant is a vector similarity engine & vector database. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more!", 17 | "metadata": { 18 | "created_at": "2023-03-14", 19 | "author": "Kacper Łukawski" 20 | } 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /examples/docker/qdrant/queries.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries": [ 3 | { 4 | "query": "What vector database should I use?" 5 | } 6 | ] 7 | } -------------------------------------------------------------------------------- /examples/docker/redis/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | redis: 5 | image: redis/redis-stack-server:latest 6 | ports: 7 | - "6379:6379" 8 | volumes: 9 | - redis_data:/data 10 | healthcheck: 11 | test: ["CMD", "redis-cli", "-h", "localhost", "-p", "6379", "ping"] 12 | interval: 2s 13 | timeout: 1m30s 14 | retries: 5 15 | start_period: 5s 16 | 17 | volumes: 18 | redis_data: -------------------------------------------------------------------------------- /examples/memory/README.md: -------------------------------------------------------------------------------- 1 | # ChatGPT Retrieval Plugin with Memory 2 | 3 | This example demonstrates how to give ChatGPT the ability to remember information from conversations and store it in the retrieval plugin for later use. By allowing the model to access the `/upsert` endpoint, it can save snippets from the conversation to the vector database and retrieve them when needed. 4 | 5 | ## Setup 6 | 7 | To enable ChatGPT to save information from conversations, follow these steps: 8 | 9 | - Navigate to the "Configure" tab in the [create GPT page](https://chat.openai.com/gpts/editor), and copy the contents of [openapi.yaml](openapi.yaml) into the custom actions section. This will give the custom GPT access to both the Retrieval Plugin's query and upsert endpoints. 10 | 11 | **Optional:** If you make any changes to the plugin instructions or metadata models, you can also copy the contents of [main.py](main.py) into the main [main.py](../../server/main.py) file. This will allow you to access the openapi.json at `http://0.0.0.0:8000/sub/openapi.json` when you run the app locally. You can convert from JSON to YAML format with [Swagger Editor](https://editor.swagger.io/). Alternatively, you can replace the openapi.yaml file with an openapi.json file. 12 | 13 | After completing these steps, your custom GPT will be able to access your plugin's `/upsert` endpoint and save snippets from the conversation to the vector database. This enables the model to remember information from previous conversations and retrieve it when needed. 
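For reference, when the custom GPT saves a snippet it is simply calling the plugin's `/upsert` endpoint. The sketch below is illustrative and not part of the plugin code: the URL and bearer token are placeholders for your own deployment, and the `Authorization` header can be dropped if your server does not enforce bearer auth.

```python
# Illustrative sketch of the /upsert call the custom GPT triggers when saving a snippet.
# The URL and token below are placeholders for your own deployment.
import httpx

PLUGIN_URL = "https://your-app-url.com"  # placeholder
BEARER_TOKEN = "<your-bearer-token>"     # placeholder

payload = {
    "documents": [
        {
            "text": "The user prefers short, bullet-point summaries.",
            "metadata": {"source": "chat", "author": "assistant"},
        }
    ]
}

response = httpx.post(
    f"{PLUGIN_URL}/upsert",
    json=payload,
    headers={"Authorization": f"Bearer {BEARER_TOKEN}"},
)
print(response.json())  # e.g. {"ids": ["<assigned-document-id>"]}
```

The `ids` in the response are the document ids assigned by the datastore; the stored snippets are surfaced again later through the `/query` endpoint.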
14 | -------------------------------------------------------------------------------- /examples/memory/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information, or asks you to save information for later.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "user_http", 9 | "authorization_type": "bearer" 10 | }, 11 | "api": { 12 | "type": "openapi", 13 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 14 | "has_user_authentication": false 15 | }, 16 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 17 | "contact_email": "hello@contact.com", 18 | "legal_info_url": "hello@legal.com" 19 | } -------------------------------------------------------------------------------- /examples/providers/supabase/.gitignore: -------------------------------------------------------------------------------- 1 | # Supabase 2 | .branches 3 | .temp 4 | -------------------------------------------------------------------------------- /examples/providers/supabase/config.toml: -------------------------------------------------------------------------------- 1 | # A string used to distinguish different Supabase projects on the same host. Defaults to the working 2 | # directory name when running `supabase init`. 3 | project_id = "providers" 4 | 5 | [api] 6 | # Port to use for the API URL. 7 | port = 54321 8 | # Schemas to expose in your API. Tables, views and stored procedures in this schema will get API 9 | # endpoints. public and storage are always included. 10 | schemas = ["public", "storage", "graphql_public"] 11 | # Extra schemas to add to the search_path of every request. public is always included. 12 | extra_search_path = ["public", "extensions"] 13 | # The maximum number of rows returns from a view, table, or stored procedure. Limits payload size 14 | # for accidental or malicious requests. 15 | max_rows = 1000 16 | 17 | [db] 18 | # Port to use for the local database URL. 19 | port = 54322 20 | # The database major version to use. This has to be the same as your remote database's. Run `SHOW 21 | # server_version;` on the remote database to check. 22 | major_version = 15 23 | 24 | [studio] 25 | # Port to use for Supabase Studio. 26 | port = 54323 27 | 28 | # Email testing server. Emails sent with the local dev setup are not actually sent - rather, they 29 | # are monitored, and you can view the emails that would have been sent from the web interface. 30 | [inbucket] 31 | # Port to use for the email testing server web interface. 32 | port = 54324 33 | smtp_port = 54325 34 | pop3_port = 54326 35 | 36 | [storage] 37 | # The maximum file size allowed (e.g. "5MB", "500KB"). 38 | file_size_limit = "50MiB" 39 | 40 | [auth] 41 | # The base URL of your website. Used as an allow-list for redirects and for constructing URLs used 42 | # in emails. 43 | site_url = "http://localhost:3000" 44 | # A list of *exact* URLs that auth providers are permitted to redirect to post authentication. 45 | additional_redirect_urls = ["https://localhost:3000"] 46 | # How long tokens are valid for, in seconds. 
Defaults to 3600 (1 hour), maximum 604,800 seconds (one 47 | # week). 48 | jwt_expiry = 3600 49 | # Allow/disallow new user signups to your project. 50 | enable_signup = true 51 | 52 | [auth.email] 53 | # Allow/disallow new user signups via email to your project. 54 | enable_signup = true 55 | # If enabled, a user will be required to confirm any email change on both the old, and new email 56 | # addresses. If disabled, only the new email is required to confirm. 57 | double_confirm_changes = true 58 | # If enabled, users need to confirm their email address before signing in. 59 | enable_confirmations = false 60 | 61 | # Use an external OAuth provider. The full list of providers are: `apple`, `azure`, `bitbucket`, 62 | # `discord`, `facebook`, `github`, `gitlab`, `google`, `keycloak`, `linkedin`, `notion`, `twitch`, 63 | # `twitter`, `slack`, `spotify`, `workos`, `zoom`. 64 | [auth.external.apple] 65 | enabled = false 66 | client_id = "" 67 | secret = "" 68 | # Overrides the default auth redirectUrl. 69 | redirect_uri = "" 70 | # Overrides the default auth provider URL. Used to support self-hosted gitlab, single-tenant Azure, 71 | # or any other third-party OIDC providers. 72 | url = "" 73 | -------------------------------------------------------------------------------- /examples/providers/supabase/migrations/20230414142107_init_pg_vector.sql: -------------------------------------------------------------------------------- 1 | create extension vector; 2 | 3 | create table if not exists documents ( 4 | id text primary key default gen_random_uuid()::text, 5 | source text, 6 | source_id text, 7 | content text, 8 | document_id text, 9 | author text, 10 | url text, 11 | created_at timestamptz default now(), 12 | embedding vector(256) -- 256 is the default dimension, change depending on dimensionality of your chosen embeddings model 13 | ); 14 | 15 | create index ix_documents_document_id on documents using btree ( document_id ); 16 | create index ix_documents_source on documents using btree ( source ); 17 | create index ix_documents_source_id on documents using btree ( source_id ); 18 | create index ix_documents_author on documents using btree ( author ); 19 | create index ix_documents_created_at on documents using brin ( created_at ); 20 | 21 | alter table documents enable row level security; 22 | 23 | create or replace function match_page_sections(in_embedding vector(256) -- 256 is the default dimension, change depending on dimensionality of your chosen embeddings model 24 | , in_match_count int default 3 25 | , in_document_id text default '%%' 26 | , in_source_id text default '%%' 27 | , in_source text default '%%' 28 | , in_author text default '%%' 29 | , in_start_date timestamptz default '-infinity' 30 | , in_end_date timestamptz default 'infinity') 31 | returns table (id text 32 | , source text 33 | , source_id text 34 | , document_id text 35 | , url text 36 | , created_at timestamptz 37 | , author text 38 | , content text 39 | , embedding vector(256) -- 256 is the default dimension, change depending on dimensionality of your chosen embeddings model 40 | , similarity float) 41 | language plpgsql 42 | as $$ 43 | #variable_conflict use_variable 44 | begin 45 | return query 46 | select 47 | documents.id, 48 | documents.source, 49 | documents.source_id, 50 | documents.document_id, 51 | documents.url, 52 | documents.created_at, 53 | documents.author, 54 | documents.content, 55 | documents.embedding, 56 | (documents.embedding <#> in_embedding) * -1 as similarity 57 | from documents 58 | 59 | 
where in_start_date <= documents.created_at and 60 | documents.created_at <= in_end_date and 61 | (documents.source_id like in_source_id or documents.source_id is null) and 62 | (documents.source like in_source or documents.source is null) and 63 | (documents.author like in_author or documents.author is null) and 64 | (documents.document_id like in_document_id or documents.document_id is null) 65 | 66 | order by documents.embedding <#> in_embedding 67 | 68 | limit in_match_count; 69 | end; 70 | $$; -------------------------------------------------------------------------------- /examples/providers/supabase/seed.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/examples/providers/supabase/seed.sql -------------------------------------------------------------------------------- /local_server/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "none" 9 | }, 10 | "api": { 11 | "type": "openapi", 12 | "url": "http://localhost:3333/.well-known/openapi.yaml" 13 | }, 14 | "logo_url": "http://localhost:3333/.well-known/logo.png", 15 | "contact_email": "hello@contact.com", 16 | "legal_info_url": "hello@legal.com" 17 | } 18 | 19 | -------------------------------------------------------------------------------- /local_server/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/local_server/logo.png -------------------------------------------------------------------------------- /local_server/main.py: -------------------------------------------------------------------------------- 1 | # This is a version of the main.py file found in ../../../server/main.py for testing the plugin locally. 2 | # Use the command `poetry run dev` to run this. 
3 | from typing import Optional 4 | import uvicorn 5 | from fastapi import FastAPI, File, Form, HTTPException, Body, UploadFile 6 | from loguru import logger 7 | 8 | from models.api import ( 9 | DeleteRequest, 10 | DeleteResponse, 11 | QueryRequest, 12 | QueryResponse, 13 | UpsertRequest, 14 | UpsertResponse, 15 | ) 16 | from datastore.factory import get_datastore 17 | from services.file import get_document_from_file 18 | 19 | from starlette.responses import FileResponse 20 | 21 | from models.models import DocumentMetadata, Source 22 | from fastapi.middleware.cors import CORSMiddleware 23 | 24 | 25 | app = FastAPI() 26 | 27 | PORT = 3333 28 | 29 | origins = [ 30 | f"http://localhost:{PORT}", 31 | "https://chat.openai.com", 32 | ] 33 | 34 | app.add_middleware( 35 | CORSMiddleware, 36 | allow_origins=origins, 37 | allow_credentials=True, 38 | allow_methods=["*"], 39 | allow_headers=["*"], 40 | ) 41 | 42 | 43 | @app.route("/.well-known/ai-plugin.json") 44 | async def get_manifest(request): 45 | file_path = "./local_server/ai-plugin.json" 46 | simple_headers = {} 47 | simple_headers["Access-Control-Allow-Private-Network"] = "true" 48 | return FileResponse(file_path, media_type="application/json", headers=simple_headers) 49 | 50 | 51 | @app.route("/.well-known/logo.png") 52 | async def get_logo(request): 53 | file_path = "./local_server/logo.png" 54 | return FileResponse(file_path, media_type="image/png") 55 | 56 | 57 | @app.route("/.well-known/openapi.yaml") 58 | async def get_openapi(request): 59 | file_path = "./local_server/openapi.yaml" 60 | return FileResponse(file_path, media_type="text/yaml") 61 | 62 | 63 | @app.post( 64 | "/upsert-file", 65 | response_model=UpsertResponse, 66 | ) 67 | async def upsert_file( 68 | file: UploadFile = File(...), 69 | metadata: Optional[str] = Form(None), 70 | ): 71 | try: 72 | metadata_obj = ( 73 | DocumentMetadata.parse_raw(metadata) 74 | if metadata 75 | else DocumentMetadata(source=Source.file) 76 | ) 77 | except Exception: 78 | metadata_obj = DocumentMetadata(source=Source.file) 79 | 80 | document = await get_document_from_file(file, metadata_obj) 81 | 82 | try: 83 | ids = await datastore.upsert([document]) 84 | return UpsertResponse(ids=ids) 85 | except Exception as e: 86 | logger.error(e) 87 | raise HTTPException(status_code=500, detail=str(e)) 88 | 89 | 90 | @app.post( 91 | "/upsert", 92 | response_model=UpsertResponse, 93 | ) 94 | async def upsert( 95 | request: UpsertRequest = Body(...), 96 | ): 97 | try: 98 | ids = await datastore.upsert(request.documents) 99 | return UpsertResponse(ids=ids) 100 | except Exception as e: 101 | logger.error(e) 102 | raise HTTPException(status_code=500, detail="Internal Service Error") 103 | 104 | 105 | @app.post("/query", response_model=QueryResponse) 106 | async def query_main(request: QueryRequest = Body(...)): 107 | try: 108 | results = await datastore.query( 109 | request.queries, 110 | ) 111 | return QueryResponse(results=results) 112 | except Exception as e: 113 | logger.error(e) 114 | raise HTTPException(status_code=500, detail="Internal Service Error") 115 | 116 | 117 | @app.delete( 118 | "/delete", 119 | response_model=DeleteResponse, 120 | ) 121 | async def delete( 122 | request: DeleteRequest = Body(...), 123 | ): 124 | if not (request.ids or request.filter or request.delete_all): 125 | raise HTTPException( 126 | status_code=400, 127 | detail="One of ids, filter, or delete_all is required", 128 | ) 129 | try: 130 | success = await datastore.delete( 131 | ids=request.ids, 132 | filter=request.filter, 133 | 
delete_all=request.delete_all, 134 | ) 135 | return DeleteResponse(success=success) 136 | except Exception as e: 137 | logger.error(e) 138 | raise HTTPException(status_code=500, detail="Internal Service Error") 139 | 140 | 141 | @app.on_event("startup") 142 | async def startup(): 143 | global datastore 144 | datastore = await get_datastore() 145 | 146 | 147 | def start(): 148 | uvicorn.run("local_server.main:app", host="localhost", port=PORT, reload=True) 149 | -------------------------------------------------------------------------------- /local_server/openapi.yaml: -------------------------------------------------------------------------------- 1 | openapi: 3.0.2 2 | info: 3 | title: Retrieval Plugin API 4 | description: A retrieval API for querying and filtering documents based on natural language queries and metadata 5 | version: 1.0.0 6 | servers: 7 | - url: http://localhost:3333 8 | paths: 9 | /query: 10 | post: 11 | summary: Query 12 | description: Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. Split queries if ResponseTooLargeError occurs. 13 | operationId: query_query_post 14 | requestBody: 15 | content: 16 | application/json: 17 | schema: 18 | $ref: "#/components/schemas/QueryRequest" 19 | required: true 20 | responses: 21 | "200": 22 | description: Successful Response 23 | content: 24 | application/json: 25 | schema: 26 | $ref: "#/components/schemas/QueryResponse" 27 | "422": 28 | description: Validation Error 29 | content: 30 | application/json: 31 | schema: 32 | $ref: "#/components/schemas/HTTPValidationError" 33 | components: 34 | schemas: 35 | DocumentChunkMetadata: 36 | title: DocumentChunkMetadata 37 | type: object 38 | properties: 39 | source: 40 | $ref: "#/components/schemas/Source" 41 | source_id: 42 | title: Source Id 43 | type: string 44 | url: 45 | title: Url 46 | type: string 47 | created_at: 48 | title: Created At 49 | type: string 50 | author: 51 | title: Author 52 | type: string 53 | document_id: 54 | title: Document Id 55 | type: string 56 | DocumentChunkWithScore: 57 | title: DocumentChunkWithScore 58 | required: 59 | - text 60 | - metadata 61 | - score 62 | type: object 63 | properties: 64 | id: 65 | title: Id 66 | type: string 67 | text: 68 | title: Text 69 | type: string 70 | metadata: 71 | $ref: "#/components/schemas/DocumentChunkMetadata" 72 | embedding: 73 | title: Embedding 74 | type: array 75 | items: 76 | type: number 77 | score: 78 | title: Score 79 | type: number 80 | DocumentMetadataFilter: 81 | title: DocumentMetadataFilter 82 | type: object 83 | properties: 84 | document_id: 85 | title: Document Id 86 | type: string 87 | source: 88 | $ref: "#/components/schemas/Source" 89 | source_id: 90 | title: Source Id 91 | type: string 92 | author: 93 | title: Author 94 | type: string 95 | start_date: 96 | title: Start Date 97 | type: string 98 | end_date: 99 | title: End Date 100 | type: string 101 | HTTPValidationError: 102 | title: HTTPValidationError 103 | type: object 104 | properties: 105 | detail: 106 | title: Detail 107 | type: array 108 | items: 109 | $ref: "#/components/schemas/ValidationError" 110 | Query: 111 | title: Query 112 | required: 113 | - query 114 | type: object 115 | properties: 116 | query: 117 | title: Query 118 | type: string 119 | filter: 120 | $ref: "#/components/schemas/DocumentMetadataFilter" 121 | top_k: 122 | title: Top K 123 | type: integer 124 | default: 3 125 | QueryRequest: 126 | 
title: QueryRequest 127 | required: 128 | - queries 129 | type: object 130 | properties: 131 | queries: 132 | title: Queries 133 | type: array 134 | items: 135 | $ref: "#/components/schemas/Query" 136 | QueryResponse: 137 | title: QueryResponse 138 | required: 139 | - results 140 | type: object 141 | properties: 142 | results: 143 | title: Results 144 | type: array 145 | items: 146 | $ref: "#/components/schemas/QueryResult" 147 | QueryResult: 148 | title: QueryResult 149 | required: 150 | - query 151 | - results 152 | type: object 153 | properties: 154 | query: 155 | title: Query 156 | type: string 157 | results: 158 | title: Results 159 | type: array 160 | items: 161 | $ref: "#/components/schemas/DocumentChunkWithScore" 162 | Source: 163 | title: Source 164 | enum: 165 | - email 166 | - file 167 | - chat 168 | type: string 169 | description: An enumeration. 170 | ValidationError: 171 | title: ValidationError 172 | required: 173 | - loc 174 | - msg 175 | - type 176 | type: object 177 | properties: 178 | loc: 179 | title: Location 180 | type: array 181 | items: 182 | anyOf: 183 | - type: string 184 | - type: integer 185 | msg: 186 | title: Message 187 | type: string 188 | type: 189 | title: Error Type 190 | type: string 191 | -------------------------------------------------------------------------------- /models/api.py: -------------------------------------------------------------------------------- 1 | from models.models import ( 2 | Document, 3 | DocumentMetadataFilter, 4 | Query, 5 | QueryResult, 6 | ) 7 | from pydantic import BaseModel 8 | from typing import List, Optional 9 | 10 | 11 | class UpsertRequest(BaseModel): 12 | documents: List[Document] 13 | 14 | 15 | class UpsertResponse(BaseModel): 16 | ids: List[str] 17 | 18 | 19 | class QueryRequest(BaseModel): 20 | queries: List[Query] 21 | 22 | 23 | class QueryResponse(BaseModel): 24 | results: List[QueryResult] 25 | 26 | 27 | class DeleteRequest(BaseModel): 28 | ids: Optional[List[str]] = None 29 | filter: Optional[DocumentMetadataFilter] = None 30 | delete_all: Optional[bool] = False 31 | 32 | 33 | class DeleteResponse(BaseModel): 34 | success: bool 35 | -------------------------------------------------------------------------------- /models/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Optional 3 | from enum import Enum 4 | 5 | 6 | class Source(str, Enum): 7 | email = "email" 8 | file = "file" 9 | chat = "chat" 10 | 11 | 12 | class DocumentMetadata(BaseModel): 13 | source: Optional[Source] = None 14 | source_id: Optional[str] = None 15 | url: Optional[str] = None 16 | created_at: Optional[str] = None 17 | author: Optional[str] = None 18 | 19 | 20 | class DocumentChunkMetadata(DocumentMetadata): 21 | document_id: Optional[str] = None 22 | 23 | 24 | class DocumentChunk(BaseModel): 25 | id: Optional[str] = None 26 | text: str 27 | metadata: DocumentChunkMetadata 28 | embedding: Optional[List[float]] = None 29 | 30 | 31 | class DocumentChunkWithScore(DocumentChunk): 32 | score: float 33 | 34 | 35 | class Document(BaseModel): 36 | id: Optional[str] = None 37 | text: str 38 | metadata: Optional[DocumentMetadata] = None 39 | 40 | 41 | class DocumentWithChunks(Document): 42 | chunks: List[DocumentChunk] 43 | 44 | 45 | class DocumentMetadataFilter(BaseModel): 46 | document_id: Optional[str] = None 47 | source: Optional[Source] = None 48 | source_id: Optional[str] = None 49 | author: Optional[str] = None 50 | start_date: 
Optional[str] = None # any date string format 51 | end_date: Optional[str] = None # any date string format 52 | 53 | 54 | class Query(BaseModel): 55 | query: str 56 | filter: Optional[DocumentMetadataFilter] = None 57 | top_k: Optional[int] = 3 58 | 59 | 60 | class QueryWithEmbedding(Query): 61 | embedding: List[float] 62 | 63 | 64 | class QueryResult(BaseModel): 65 | query: str 66 | results: List[DocumentChunkWithScore] 67 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "chatgpt-retrieval-plugin" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["isafulf "] 6 | readme = "README.md" 7 | packages = [{include = "server"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.10" 11 | fastapi = "^0.92.0" 12 | uvicorn = "^0.20.0" 13 | openai = "^0.27.5" 14 | python-dotenv = "^0.21.1" 15 | pydantic = "^1.10.5" 16 | tenacity = "^8.2.1" 17 | tiktoken = "^0.2.0" 18 | numpy = "^1.24.2" 19 | docx2txt = "^0.8" 20 | PyPDF2 = "^3.0.1" 21 | python-pptx = "^0.6.21" 22 | python-multipart = "^0.0.6" 23 | arrow = "^1.2.3" 24 | chromadb = "^0.3.25" 25 | pinecone-client = "^2.1.0" 26 | weaviate-client = "^3.12.0" 27 | pymilvus = "^2.2.2" 28 | qdrant-client = {version = "^1.0.4", python = "<3.12"} 29 | redis = "4.5.4" 30 | supabase = "^1.0.2" 31 | psycopg2 = "^2.9.5" 32 | llama-index = "0.5.4" 33 | azure-identity = "^1.12.0" 34 | azure-search-documents = "11.4.0b8" 35 | pgvector = "^0.1.7" 36 | psycopg2cffi = {version = "^2.9.0", optional = true} 37 | loguru = "^0.7.0" 38 | elasticsearch = "8.8.2" 39 | pymongo = "^4.3.3" 40 | motor = "^3.3.2" 41 | 42 | [tool.poetry.scripts] 43 | start = "server.main:start" 44 | dev = "local_server.main:start" 45 | 46 | [tool.poetry.extras] 47 | postgresql = ["psycopg2cffi"] 48 | 49 | [tool.poetry.group.dev.dependencies] 50 | httpx = "^0.23.3" 51 | pytest = "^7.2.1" 52 | pytest-cov = "^4.0.0" 53 | pytest-asyncio = "^0.20.3" 54 | 55 | [build-system] 56 | requires = ["poetry-core"] 57 | build-backend = "poetry.core.masonry.api" 58 | 59 | [tool.pytest.ini_options] 60 | pythonpath = [ 61 | "." 62 | ] 63 | asyncio_mode="auto" 64 | -------------------------------------------------------------------------------- /scripts/process_json/README.md: -------------------------------------------------------------------------------- 1 | ## Process a JSON File 2 | 3 | This script is a utility to process a file dump of documents in a JSON format and store them in the vector database with some metadata. It can also optionally screen the documents for personally identifiable information (PII) using a language model, and skip them if detected. Additionally, the script can extract metadata from the document using a language model. You can customize the PII detection function in [`services/pii_detection`](../../services/pii_detection.py) and the metadata extraction function in [`services/extract_metadata`](../../services/extract_metadata.py) for your use case. 4 | 5 | ## Usage 6 | 7 | To run this script from the terminal, navigate to this folder and use the following command: 8 | 9 | ``` 10 | python process_json.py --filepath path/to/file_dump.json --custom_metadata '{"source": "file"}' --screen_for_pii True --extract_metadata True 11 | ``` 12 | 13 | where: 14 | 15 | - `path/to/file_dump.json` is the name or path to the file dump to be processed. 
The format of this JSON file should be a list of JSON objects, where each object represents a document. The JSON object should have a subset of the following fields: `id`, `text`, `source`, `source_id`, `url`, `created_at`, and `author`. The `text` field is required, while the rest are optional and will be used to populate the metadata of the document. If the `id` field is not specified, a random UUID will be generated for the document. 16 | - `--custom_metadata` is an optional JSON string of key-value pairs to update the metadata of the documents. For example, `{"source": "file"}` will add a `source` field with the value `file` to the metadata of each document. The default value is an empty JSON object (`{}`). 17 | - `--screen_for_pii` is an optional boolean flag to indicate whether to use the PII detection function or not. If set to `True`, the script will use the `screen_text_for_pii` function from the [`services/pii_detection`](../../services/pii_detection.py) module to check if the document text contains any PII using a language model. If PII is detected, the script will print a warning and skip the document. The default value is `False`. 18 | - `--extract_metadata` is an optional boolean flag to indicate whether to try to extract metadata from the document using a language model. If set to `True`, the script will use the `extract_metadata_from_document` function from the [`services/extract_metadata`](../../services/extract_metadata.py) module to extract metadata from the document text and update the metadata object accordingly. The default value is`False`. 19 | 20 | The script will load the JSON file as a list of dictionaries, iterate over the data, create document objects, and batch upsert them into the database. It will also print some progress messages and error messages if any, as well as the number and content of the skipped items due to errors or PII detection. 21 | 22 | You can use `python process_json.py -h` to get a summary of the options and their descriptions. 23 | 24 | Test the script with the example file, [example.json](example.json). 
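For orientation, the sketch below is illustrative only (it is not part of the script) and shows how a single record like those in example.json maps onto the plugin's `Document` and `DocumentMetadata` models from `models/models.py`; `process_json.py` performs this mapping for every item in the file.

```python
# Illustrative mapping of one JSON record onto the plugin's models.
# Assumes the repository root is on PYTHONPATH so that `models` is importable.
from models.models import Document, DocumentMetadata, Source

item = {
    "id": "123",
    "text": "This is a document about something",
    "source": "file",
    "source_id": "https://example.com/doc1",
    "url": "https://example.com/doc1",
    "created_at": "2021-01-01T12:00:00Z",
    "author": "Alice",
}

document = Document(
    id=item.get("id"),  # the script generates a random UUID when this is missing
    text=item["text"],  # required; items without text are skipped
    metadata=DocumentMetadata(
        source=Source(item["source"]) if item.get("source") else None,
        source_id=item.get("source_id"),
        url=item.get("url"),
        created_at=item.get("created_at"),
        author=item.get("author"),
    ),
)
```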
25 | -------------------------------------------------------------------------------- /scripts/process_json/example.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "123", 4 | "text": "This is a document about something", 5 | "source": "file", 6 | "source_id": "https://example.com/doc1", 7 | "url": "https://example.com/doc1", 8 | "created_at": "2021-01-01T12:00:00Z", 9 | "author": "Alice" 10 | }, 11 | { 12 | "text": "This is another document about something else", 13 | "source": "file", 14 | "source_id": "doc2.txt", 15 | "author": "Bob" 16 | }, 17 | { 18 | "id": "456", 19 | "text": "This is Alice's phone number: 123-456-7890", 20 | "source": "email", 21 | "source_id": "567", 22 | "created_at": "2021-01-02T13:00:00Z", 23 | "author": "Alice" 24 | } 25 | ] -------------------------------------------------------------------------------- /scripts/process_json/process_json.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import json 3 | import argparse 4 | import asyncio 5 | 6 | from loguru import logger 7 | from models.models import Document, DocumentMetadata 8 | from datastore.datastore import DataStore 9 | from datastore.factory import get_datastore 10 | from services.extract_metadata import extract_metadata_from_document 11 | from services.pii_detection import screen_text_for_pii 12 | 13 | DOCUMENT_UPSERT_BATCH_SIZE = 50 14 | 15 | 16 | async def process_json_dump( 17 | filepath: str, 18 | datastore: DataStore, 19 | custom_metadata: dict, 20 | screen_for_pii: bool, 21 | extract_metadata: bool, 22 | ): 23 | # load the json file as a list of dictionaries 24 | with open(filepath) as json_file: 25 | data = json.load(json_file) 26 | 27 | documents = [] 28 | skipped_items = [] 29 | # iterate over the data and create document objects 30 | for item in data: 31 | if len(documents) % 20 == 0: 32 | logger.info(f"Processed {len(documents)} documents") 33 | 34 | try: 35 | # get the id, text, source, source_id, url, created_at and author from the item 36 | # use default values if not specified 37 | id = item.get("id", None) 38 | text = item.get("text", None) 39 | source = item.get("source", None) 40 | source_id = item.get("source_id", None) 41 | url = item.get("url", None) 42 | created_at = item.get("created_at", None) 43 | author = item.get("author", None) 44 | 45 | if not text: 46 | logger.info("No document text, skipping...") 47 | continue 48 | 49 | # create a metadata object with the source, source_id, url, created_at and author 50 | metadata = DocumentMetadata( 51 | source=source, 52 | source_id=source_id, 53 | url=url, 54 | created_at=created_at, 55 | author=author, 56 | ) 57 | logger.info("metadata: ", str(metadata)) 58 | 59 | # update metadata with custom values 60 | for key, value in custom_metadata.items(): 61 | if hasattr(metadata, key): 62 | setattr(metadata, key, value) 63 | 64 | # screen for pii if requested 65 | if screen_for_pii: 66 | pii_detected = screen_text_for_pii(text) 67 | # if pii detected, print a warning and skip the document 68 | if pii_detected: 69 | logger.info("PII detected in document, skipping") 70 | skipped_items.append(item) # add the skipped item to the list 71 | continue 72 | 73 | # extract metadata if requested 74 | if extract_metadata: 75 | # extract metadata from the document text 76 | extracted_metadata = extract_metadata_from_document( 77 | f"Text: {text}; Metadata: {str(metadata)}" 78 | ) 79 | # get a Metadata object from the extracted metadata 80 | 
metadata = DocumentMetadata(**extracted_metadata) 81 | 82 | # create a document object with the id or a random id, text and metadata 83 | document = Document( 84 | id=id or str(uuid.uuid4()), 85 | text=text, 86 | metadata=metadata, 87 | ) 88 | documents.append(document) 89 | except Exception as e: 90 | # log the error and continue with the next item 91 | logger.error(f"Error processing {item}: {e}") 92 | skipped_items.append(item) # add the skipped item to the list 93 | 94 | # do this in batches, the upsert method already batches documents but this allows 95 | # us to add more descriptive logging 96 | for i in range(0, len(documents), DOCUMENT_UPSERT_BATCH_SIZE): 97 | # Get the text of the chunks in the current batch 98 | batch_documents = documents[i : i + DOCUMENT_UPSERT_BATCH_SIZE] 99 | logger.info(f"Upserting batch of {len(batch_documents)} documents, batch {i}") 100 | logger.info("documents: ", documents) 101 | await datastore.upsert(batch_documents) 102 | 103 | # print the skipped items 104 | logger.info(f"Skipped {len(skipped_items)} items due to errors or PII detection") 105 | for item in skipped_items: 106 | logger.info(item) 107 | 108 | 109 | async def main(): 110 | # parse the command-line arguments 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument("--filepath", required=True, help="The path to the json dump") 113 | parser.add_argument( 114 | "--custom_metadata", 115 | default="{}", 116 | help="A JSON string of key-value pairs to update the metadata of the documents", 117 | ) 118 | parser.add_argument( 119 | "--screen_for_pii", 120 | default=False, 121 | type=bool, 122 | help="A boolean flag to indicate whether to try the PII detection function (using a language model)", 123 | ) 124 | parser.add_argument( 125 | "--extract_metadata", 126 | default=False, 127 | type=bool, 128 | help="A boolean flag to indicate whether to try to extract metadata from the document (using a language model)", 129 | ) 130 | args = parser.parse_args() 131 | 132 | # get the arguments 133 | filepath = args.filepath 134 | custom_metadata = json.loads(args.custom_metadata) 135 | screen_for_pii = args.screen_for_pii 136 | extract_metadata = args.extract_metadata 137 | 138 | # initialize the db instance once as a global variable 139 | datastore = await get_datastore() 140 | # process the json dump 141 | await process_json_dump( 142 | filepath, datastore, custom_metadata, screen_for_pii, extract_metadata 143 | ) 144 | 145 | 146 | if __name__ == "__main__": 147 | asyncio.run(main()) 148 | -------------------------------------------------------------------------------- /scripts/process_jsonl/README.md: -------------------------------------------------------------------------------- 1 | ## Process a JSONL File 2 | 3 | This script is a utility to process a file dump of documents in a JSONL format and store them in the vector database with some metadata. It can also optionally screen the documents for personally identifiable information (PII) using a language model, and skip them if detected. Additionally, the script can extract metadata from the document using a language model. You can customize the PII detection function in [`services/pii_detection`](../../services/pii_detection.py) and the metadata extraction function in [`services/extract_metadata`](../../services/extract_metadata.py) for your use case. 
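For example, a minimal regex-based stand-in for `screen_text_for_pii` might look like the sketch below; the shipped function uses a language model, and the patterns here are illustrative assumptions rather than an exhaustive PII check.

```python
# Sketch of a lightweight, regex-based replacement for
# services/pii_detection.screen_text_for_pii (the default uses a language model).
# The patterns are deliberately simple and only illustrative.
import re

EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
PHONE_RE = re.compile(r"\+?\d[\d\s().-]{7,}\d")


def screen_text_for_pii(text: str) -> bool:
    """Return True if the text appears to contain an email address or phone number."""
    return bool(EMAIL_RE.search(text) or PHONE_RE.search(text))
```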
4 | 5 | ## Usage 6 | 7 | To run this script from the terminal, navigate to this folder and use the following command: 8 | 9 | ``` 10 | python process_jsonl.py --filepath path/to/file_dump.jsonl --custom_metadata '{"source": "email"}' --screen_for_pii True --extract_metadata True 11 | ``` 12 | 13 | where: 14 | 15 | - `path/to/file_dump.jsonl` is the name or path to the file dump to be processed. The format of this JSONL file should be a newline-delimited JSON file, where each line is a valid JSON object representing a document. The JSON object should have a subset of the following fields: `id`, `text`, `source`, `source_id`, `url`, `created_at`, and `author`. The `text` field is required, while the rest are optional and will be used to populate the metadata of the document. If the `id` field is not specified, a random UUID will be generated for the document. 16 | - `--custom_metadata` is an optional JSON string of key-value pairs to update the metadata of the documents. For example, `{"source": "file"}` will add a `source` field with the value `file` to the metadata of each document. The default value is an empty JSON object (`{}`). 17 | - `--screen_for_pii` is an optional boolean flag to indicate whether to use the PII detection function or not. If set to `True`, the script will use the `screen_text_for_pii` function from the [`services/pii_detection`](../../services/pii_detection.py) module to check if the document text contains any PII using a language model. If PII is detected, the script will print a warning and skip the document. The default value is `False`. 18 | - `--extract_metadata` is an optional boolean flag to indicate whether to try to extract metadata from the document using a language model. If set to `True`, the script will use the `extract_metadata_from_document` function from the [`services/extract_metadata`](../../services/extract_metadata.py) module to extract metadata from the document text and update the metadata object accordingly. The default value is`False`. 19 | 20 | The script will open the JSONL file as a generator of dictionaries, iterate over the data, create document objects, and batch upsert them into the database. It will also print some progress messages and error messages if any, as well as the number and content of the skipped items due to errors, PII detection, or metadata extraction issues. 21 | 22 | You can use `python process_jsonl.py -h` to get a summary of the options and their descriptions. 23 | 24 | Test the script with the example file, [example.jsonl](example.jsonl). 25 | -------------------------------------------------------------------------------- /scripts/process_jsonl/example.jsonl: -------------------------------------------------------------------------------- 1 | {"id": "4", "text": "This document only has an ID and text. The other fields are missing."} 2 | {"text": "This document has no ID, but it has text and a source.", "source": "email"} 3 | {"id": "6", "text": "This document has an ID, text, and author, but no source information.", "author": "John Doe"} 4 | {"text": "This document has text, a source, and a URL, but no ID or author.", "source": "file", "url": "https://example.com/file/2"} 5 | {"id": "8", "text": "This document has an ID, text, source, and created_at timestamp, but no author or URL.", "source": "chat", "created_at": "2022-01-04T00:00:00"} 6 | {"id": "9", "text": "This document contains PII. 
John Smith's email address is john.smith@example.com and his phone number is +1 (555) 123-4567.", "source": "email", "source_id": "email_2", "url": "https://example.com/email/2", "created_at": "2022-01-05T00:00:00", "author": "John Smith"} -------------------------------------------------------------------------------- /scripts/process_jsonl/process_jsonl.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import json 3 | import argparse 4 | import asyncio 5 | 6 | from loguru import logger 7 | from models.models import Document, DocumentMetadata 8 | from datastore.datastore import DataStore 9 | from datastore.factory import get_datastore 10 | from services.extract_metadata import extract_metadata_from_document 11 | from services.pii_detection import screen_text_for_pii 12 | 13 | DOCUMENT_UPSERT_BATCH_SIZE = 50 14 | 15 | 16 | async def process_jsonl_dump( 17 | filepath: str, 18 | datastore: DataStore, 19 | custom_metadata: dict, 20 | screen_for_pii: bool, 21 | extract_metadata: bool, 22 | ): 23 | # open the jsonl file as a generator of dictionaries 24 | with open(filepath) as jsonl_file: 25 | data = [json.loads(line) for line in jsonl_file] 26 | 27 | documents = [] 28 | skipped_items = [] 29 | # iterate over the data and create document objects 30 | for item in data: 31 | if len(documents) % 20 == 0: 32 | logger.info(f"Processed {len(documents)} documents") 33 | 34 | try: 35 | # get the id, text, source, source_id, url, created_at and author from the item 36 | # use default values if not specified 37 | id = item.get("id", None) 38 | text = item.get("text", None) 39 | source = item.get("source", None) 40 | source_id = item.get("source_id", None) 41 | url = item.get("url", None) 42 | created_at = item.get("created_at", None) 43 | author = item.get("author", None) 44 | 45 | if not text: 46 | logger.info("No document text, skipping...") 47 | continue 48 | 49 | # create a metadata object with the source, source_id, url, created_at and author 50 | metadata = DocumentMetadata( 51 | source=source, 52 | source_id=source_id, 53 | url=url, 54 | created_at=created_at, 55 | author=author, 56 | ) 57 | 58 | # update metadata with custom values 59 | for key, value in custom_metadata.items(): 60 | if hasattr(metadata, key): 61 | setattr(metadata, key, value) 62 | 63 | # screen for pii if requested 64 | if screen_for_pii: 65 | pii_detected = screen_text_for_pii(text) 66 | # if pii detected, print a warning and skip the document 67 | if pii_detected: 68 | logger.info("PII detected in document, skipping") 69 | skipped_items.append(item) # add the skipped item to the list 70 | continue 71 | 72 | # extract metadata if requested 73 | if extract_metadata: 74 | # extract metadata from the document text 75 | extracted_metadata = extract_metadata_from_document( 76 | f"Text: {text}; Metadata: {str(metadata)}" 77 | ) 78 | # get a Metadata object from the extracted metadata 79 | metadata = DocumentMetadata(**extracted_metadata) 80 | 81 | # create a document object with the id, text and metadata 82 | document = Document( 83 | id=id, 84 | text=text, 85 | metadata=metadata, 86 | ) 87 | documents.append(document) 88 | except Exception as e: 89 | # log the error and continue with the next item 90 | logger.error(f"Error processing {item}: {e}") 91 | skipped_items.append(item) # add the skipped item to the list 92 | 93 | # do this in batches, the upsert method already batches documents but this allows 94 | # us to add more descriptive logging 95 | for i in range(0, 
len(documents), DOCUMENT_UPSERT_BATCH_SIZE): 96 | # Get the text of the chunks in the current batch 97 | batch_documents = documents[i : i + DOCUMENT_UPSERT_BATCH_SIZE] 98 | logger.info(f"Upserting batch of {len(batch_documents)} documents, batch {i}") 99 | await datastore.upsert(batch_documents) 100 | 101 | # print the skipped items 102 | logger.info(f"Skipped {len(skipped_items)} items due to errors or PII detection") 103 | for item in skipped_items: 104 | logger.info(item) 105 | 106 | 107 | async def main(): 108 | # parse the command-line arguments 109 | parser = argparse.ArgumentParser() 110 | parser.add_argument("--filepath", required=True, help="The path to the jsonl dump") 111 | parser.add_argument( 112 | "--custom_metadata", 113 | default="{}", 114 | help="A JSON string of key-value pairs to update the metadata of the documents", 115 | ) 116 | parser.add_argument( 117 | "--screen_for_pii", 118 | default=False, 119 | type=bool, 120 | help="A boolean flag to indicate whether to try the PII detection function (using a language model)", 121 | ) 122 | parser.add_argument( 123 | "--extract_metadata", 124 | default=False, 125 | type=bool, 126 | help="A boolean flag to indicate whether to try to extract metadata from the document (using a language model)", 127 | ) 128 | args = parser.parse_args() 129 | 130 | # get the arguments 131 | filepath = args.filepath 132 | custom_metadata = json.loads(args.custom_metadata) 133 | screen_for_pii = args.screen_for_pii 134 | extract_metadata = args.extract_metadata 135 | 136 | # initialize the db instance once as a global variable 137 | datastore = await get_datastore() 138 | # process the jsonl dump 139 | await process_jsonl_dump( 140 | filepath, datastore, custom_metadata, screen_for_pii, extract_metadata 141 | ) 142 | 143 | 144 | if __name__ == "__main__": 145 | asyncio.run(main()) 146 | -------------------------------------------------------------------------------- /scripts/process_zip/README.md: -------------------------------------------------------------------------------- 1 | ## Process a ZIP File 2 | 3 | This script is a utility to process a file dump of documents in a zip file and store them in the vector database with some metadata. It can also optionally screen the documents for personally identifiable information (PII) using a language model, and skip them if detected. Additionally, the script can extract metadata from the document using a language model. You can customize the PII detection function in [`services/pii_detection`](../../services/pii_detection.py) and the metadata extraction function in [`services/extract_metadata`](../../services/extract_metadata.py) for your use case. 4 | 5 | ## Usage 6 | 7 | To run this script from the terminal, navigate to this folder and use the following command: 8 | 9 | ``` 10 | python process_zip.py --filepath path/to/file_dump.zip --custom_metadata '{"source": "email"}' --screen_for_pii True --extract_metadata True 11 | ``` 12 | 13 | where: 14 | 15 | - `path/to/file_dump.zip` is the name or path to the file dump to be processed. The format of this zip file should be a zip file containing of docx, pdf, txt, md and pptx files (any internal folder structure is acceptable). 16 | - `--custom_metadata` is an optional JSON string of key-value pairs to update the metadata of the documents. For example, `{"source": "file"}` will add a `source` field with the value `file` to the metadata of each document. The default value is an empty JSON object (`{}`). 
17 | - `--screen_for_pii` is an optional boolean flag to indicate whether to use the PII detection function or not. If set to `True`, the script will use the `screen_text_for_pii` function from the [`services/pii_detection`](../../services/pii_detection.py) module to check if the document text contains any PII using a language model. If PII is detected, the script will print a warning and skip the document. The default value is `False`. 18 | - `--extract_metadata` is an optional boolean flag to indicate whether to try to extract metadata from the document using a language model. If set to `True`, the script will use the `extract_metadata_from_document` function from the [`services/extract_metadata`](../../services/extract_metadata.py) module to extract metadata from the document text and update the metadata object accordingly. The default value is`False`. 19 | 20 | The script will extract the files from the zip file into a temporary directory named `dump`, process each file and store the document text and metadata in the database, and then delete the temporary directory and its contents. It will also print some progress messages and error messages if any. 21 | 22 | You can use `python process_zip.py -h` to get a summary of the options and their descriptions. 23 | 24 | Test the script with the example file, [example.zip](example.zip). 25 | -------------------------------------------------------------------------------- /scripts/process_zip/example.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/scripts/process_zip/example.zip -------------------------------------------------------------------------------- /scripts/process_zip/process_zip.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import zipfile 3 | import os 4 | import json 5 | import argparse 6 | import asyncio 7 | 8 | from loguru import logger 9 | from models.models import Document, DocumentMetadata, Source 10 | from datastore.datastore import DataStore 11 | from datastore.factory import get_datastore 12 | from services.extract_metadata import extract_metadata_from_document 13 | from services.file import extract_text_from_filepath 14 | from services.pii_detection import screen_text_for_pii 15 | 16 | DOCUMENT_UPSERT_BATCH_SIZE = 50 17 | 18 | 19 | async def process_file_dump( 20 | filepath: str, 21 | datastore: DataStore, 22 | custom_metadata: dict, 23 | screen_for_pii: bool, 24 | extract_metadata: bool, 25 | ): 26 | # create a ZipFile object and extract all the files into a directory named 'dump' 27 | with zipfile.ZipFile(filepath) as zip_file: 28 | zip_file.extractall("dump") 29 | 30 | documents = [] 31 | skipped_files = [] 32 | # use os.walk to traverse the dump directory and its subdirectories 33 | for root, dirs, files in os.walk("dump"): 34 | for filename in files: 35 | if len(documents) % 20 == 0: 36 | logger.info(f"Processed {len(documents)} documents") 37 | 38 | filepath = os.path.join(root, filename) 39 | 40 | try: 41 | extracted_text = extract_text_from_filepath(filepath) 42 | logger.info(f"extracted_text from {filepath}") 43 | 44 | # create a metadata object with the source and source_id fields 45 | metadata = DocumentMetadata( 46 | source=Source.file, 47 | source_id=filename, 48 | ) 49 | 50 | # update metadata with custom values 51 | for key, value in custom_metadata.items(): 52 | if hasattr(metadata, key): 53 | 
setattr(metadata, key, value) 54 | 55 | # screen for pii if requested 56 | if screen_for_pii: 57 | pii_detected = screen_text_for_pii(extracted_text) 58 | # if pii detected, print a warning and skip the document 59 | if pii_detected: 60 | logger.info("PII detected in document, skipping") 61 | skipped_files.append( 62 | filepath 63 | ) # add the skipped file to the list 64 | continue 65 | 66 | # extract metadata if requested 67 | if extract_metadata: 68 | # extract metadata from the document text 69 | extracted_metadata = extract_metadata_from_document( 70 | f"Text: {extracted_text}; Metadata: {str(metadata)}" 71 | ) 72 | # get a Metadata object from the extracted metadata 73 | metadata = DocumentMetadata(**extracted_metadata) 74 | 75 | # create a document object with a random id, text and metadata 76 | document = Document( 77 | id=str(uuid.uuid4()), 78 | text=extracted_text, 79 | metadata=metadata, 80 | ) 81 | documents.append(document) 82 | except Exception as e: 83 | # log the error and continue with the next file 84 | logger.error(f"Error processing {filepath}: {e}") 85 | skipped_files.append(filepath) # add the skipped file to the list 86 | 87 | # do this in batches, the upsert method already batches documents but this allows 88 | # us to add more descriptive logging 89 | for i in range(0, len(documents), DOCUMENT_UPSERT_BATCH_SIZE): 90 | # Get the documents in the current batch 91 | batch_documents = documents[i : i + DOCUMENT_UPSERT_BATCH_SIZE] 92 | logger.info(f"Upserting batch of {len(batch_documents)} documents, batch {i}") 93 | logger.info(f"documents: {batch_documents}") 94 | await datastore.upsert(batch_documents) 95 | 96 | # delete all files in the dump directory 97 | for root, dirs, files in os.walk("dump", topdown=False): 98 | for filename in files: 99 | filepath = os.path.join(root, filename) 100 | os.remove(filepath) 101 | for dirname in dirs: 102 | dirpath = os.path.join(root, dirname) 103 | os.rmdir(dirpath) 104 | 105 | # delete the dump directory 106 | os.rmdir("dump") 107 | 108 | # print the skipped files 109 | logger.info(f"Skipped {len(skipped_files)} files due to errors or PII detection") 110 | for file in skipped_files: 111 | logger.info(file) 112 | 113 | 114 | async def main(): 115 | # parse the command-line arguments 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument("--filepath", required=True, help="The path to the file dump") 118 | parser.add_argument( 119 | "--custom_metadata", 120 | default="{}", 121 | help="A JSON string of key-value pairs to update the metadata of the documents", 122 | ) 123 | parser.add_argument( 124 | "--screen_for_pii", 125 | default=False, 126 | type=bool, 127 | help="A boolean flag to indicate whether to try the PII detection function (using a language model)", 128 | ) 129 | parser.add_argument( 130 | "--extract_metadata", 131 | default=False, 132 | type=bool, 133 | help="A boolean flag to indicate whether to try to extract metadata from the document (using a language model)", 134 | ) 135 | args = parser.parse_args() 136 | 137 | # get the arguments 138 | filepath = args.filepath 139 | custom_metadata = json.loads(args.custom_metadata) 140 | screen_for_pii = args.screen_for_pii 141 | extract_metadata = args.extract_metadata 142 | 143 | # initialize the db instance once as a global variable 144 | datastore = await get_datastore() 145 | # process the file dump 146 | await process_file_dump( 147 | filepath, datastore, custom_metadata, screen_for_pii, extract_metadata 148 | ) 149 | 150 | 151 | if __name__
== "__main__": 152 | asyncio.run(main()) 153 | -------------------------------------------------------------------------------- /server/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | import uvicorn 4 | from fastapi import FastAPI, File, Form, HTTPException, Depends, Body, UploadFile 5 | from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials 6 | from fastapi.staticfiles import StaticFiles 7 | from loguru import logger 8 | 9 | from models.api import ( 10 | DeleteRequest, 11 | DeleteResponse, 12 | QueryRequest, 13 | QueryResponse, 14 | UpsertRequest, 15 | UpsertResponse, 16 | ) 17 | from datastore.factory import get_datastore 18 | from services.file import get_document_from_file 19 | 20 | from models.models import DocumentMetadata, Source 21 | 22 | bearer_scheme = HTTPBearer() 23 | BEARER_TOKEN = os.environ.get("BEARER_TOKEN") 24 | assert BEARER_TOKEN is not None 25 | 26 | 27 | def validate_token(credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme)): 28 | if credentials.scheme != "Bearer" or credentials.credentials != BEARER_TOKEN: 29 | raise HTTPException(status_code=401, detail="Invalid or missing token") 30 | return credentials 31 | 32 | 33 | app = FastAPI(dependencies=[Depends(validate_token)]) 34 | app.mount("/.well-known", StaticFiles(directory=".well-known"), name="static") 35 | 36 | # Create a sub-application, in order to access just the query endpoint in an OpenAPI schema, found at http://0.0.0.0:8000/sub/openapi.json when the app is running locally 37 | sub_app = FastAPI( 38 | title="Retrieval Plugin API", 39 | description="A retrieval API for querying and filtering documents based on natural language queries and metadata", 40 | version="1.0.0", 41 | servers=[{"url": "https://your-app-url.com"}], 42 | dependencies=[Depends(validate_token)], 43 | ) 44 | app.mount("/sub", sub_app) 45 | 46 | 47 | @app.post( 48 | "/upsert-file", 49 | response_model=UpsertResponse, 50 | ) 51 | async def upsert_file( 52 | file: UploadFile = File(...), 53 | metadata: Optional[str] = Form(None), 54 | ): 55 | try: 56 | metadata_obj = ( 57 | DocumentMetadata.parse_raw(metadata) 58 | if metadata 59 | else DocumentMetadata(source=Source.file) 60 | ) 61 | except: 62 | metadata_obj = DocumentMetadata(source=Source.file) 63 | 64 | document = await get_document_from_file(file, metadata_obj) 65 | 66 | try: 67 | ids = await datastore.upsert([document]) 68 | return UpsertResponse(ids=ids) 69 | except Exception as e: 70 | logger.error(e) 71 | raise HTTPException(status_code=500, detail=f"str({e})") 72 | 73 | 74 | @app.post( 75 | "/upsert", 76 | response_model=UpsertResponse, 77 | ) 78 | async def upsert( 79 | request: UpsertRequest = Body(...), 80 | ): 81 | try: 82 | ids = await datastore.upsert(request.documents) 83 | return UpsertResponse(ids=ids) 84 | except Exception as e: 85 | logger.error(e) 86 | raise HTTPException(status_code=500, detail="Internal Service Error") 87 | 88 | 89 | @app.post( 90 | "/query", 91 | response_model=QueryResponse, 92 | ) 93 | async def query_main( 94 | request: QueryRequest = Body(...), 95 | ): 96 | try: 97 | results = await datastore.query( 98 | request.queries, 99 | ) 100 | return QueryResponse(results=results) 101 | except Exception as e: 102 | logger.error(e) 103 | raise HTTPException(status_code=500, detail="Internal Service Error") 104 | 105 | 106 | @sub_app.post( 107 | "/query", 108 | response_model=QueryResponse, 109 | # NOTE: We are describing the shape of 
the API endpoint input due to a current limitation in parsing arrays of objects from OpenAPI schemas. This will not be necessary in the future. 110 | description="Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. Split queries if ResponseTooLargeError occurs.", 111 | ) 112 | async def query( 113 | request: QueryRequest = Body(...), 114 | ): 115 | try: 116 | results = await datastore.query( 117 | request.queries, 118 | ) 119 | return QueryResponse(results=results) 120 | except Exception as e: 121 | logger.error(e) 122 | raise HTTPException(status_code=500, detail="Internal Service Error") 123 | 124 | 125 | @app.delete( 126 | "/delete", 127 | response_model=DeleteResponse, 128 | ) 129 | async def delete( 130 | request: DeleteRequest = Body(...), 131 | ): 132 | if not (request.ids or request.filter or request.delete_all): 133 | raise HTTPException( 134 | status_code=400, 135 | detail="One of ids, filter, or delete_all is required", 136 | ) 137 | try: 138 | success = await datastore.delete( 139 | ids=request.ids, 140 | filter=request.filter, 141 | delete_all=request.delete_all, 142 | ) 143 | return DeleteResponse(success=success) 144 | except Exception as e: 145 | logger.error(e) 146 | raise HTTPException(status_code=500, detail="Internal Service Error") 147 | 148 | 149 | @app.on_event("startup") 150 | async def startup(): 151 | global datastore 152 | datastore = await get_datastore() 153 | 154 | 155 | def start(): 156 | uvicorn.run("server.main:app", host="0.0.0.0", port=8000, reload=True) 157 | -------------------------------------------------------------------------------- /services/date.py: -------------------------------------------------------------------------------- 1 | import arrow 2 | from loguru import logger 3 | 4 | 5 | def to_unix_timestamp(date_str: str) -> int: 6 | """ 7 | Convert a date string to a unix timestamp (seconds since epoch). 8 | 9 | Args: 10 | date_str: The date string to convert. 11 | 12 | Returns: 13 | The unix timestamp corresponding to the date string. 14 | 15 | If the date string cannot be parsed as a valid date format, returns the current unix timestamp and prints a warning. 
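For example, to_unix_timestamp("2021-01-21T10:00:00-02:00") should return 1611230400, while an unparseable string falls back to the current time.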
16 | """ 17 | # Try to parse the date string using arrow, which supports many common date formats 18 | try: 19 | date_obj = arrow.get(date_str) 20 | return int(date_obj.timestamp()) 21 | except arrow.parser.ParserError: 22 | # If the parsing fails, return the current unix timestamp and print a warning 23 | logger.info(f"Invalid date format: {date_str}") 24 | return int(arrow.now().timestamp()) 25 | -------------------------------------------------------------------------------- /services/extract_metadata.py: -------------------------------------------------------------------------------- 1 | from models.models import Source 2 | from services.openai import get_chat_completion 3 | import json 4 | from typing import Dict 5 | import os 6 | from loguru import logger 7 | 8 | 9 | def extract_metadata_from_document(text: str) -> Dict[str, str]: 10 | sources = Source.__members__.keys() 11 | sources_string = ", ".join(sources) 12 | # This prompt is just an example, change it to fit your use case 13 | messages = [ 14 | { 15 | "role": "system", 16 | "content": f""" 17 | Given a document from a user, try to extract the following metadata: 18 | - source: string, one of {sources_string} 19 | - url: string or don't specify 20 | - created_at: string or don't specify 21 | - author: string or don't specify 22 | 23 | Respond with a JSON containing the extracted metadata in key value pairs. If you don't find a metadata field, don't specify it. 24 | """, 25 | }, 26 | {"role": "user", "content": text}, 27 | ] 28 | 29 | # NOTE: Azure Open AI requires deployment id 30 | # Read environment variable - if not set - not used 31 | completion = get_chat_completion( 32 | messages, 33 | "gpt-4", 34 | # os.environ.get("OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID") 35 | ) # TODO: change to your preferred model name 36 | 37 | logger.info(f"completion: {completion}") 38 | 39 | try: 40 | metadata = json.loads(completion) 41 | except Exception as e: 42 | logger.error(f"Error parsing completion: {e}") 43 | metadata = {} 44 | 45 | return metadata 46 | -------------------------------------------------------------------------------- /services/file.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import BufferedReader 3 | from typing import Optional 4 | from fastapi import UploadFile 5 | import mimetypes 6 | from PyPDF2 import PdfReader 7 | import docx2txt 8 | import csv 9 | import pptx 10 | from loguru import logger 11 | 12 | from models.models import Document, DocumentMetadata 13 | 14 | 15 | async def get_document_from_file( 16 | file: UploadFile, metadata: DocumentMetadata 17 | ) -> Document: 18 | extracted_text = await extract_text_from_form_file(file) 19 | 20 | doc = Document(text=extracted_text, metadata=metadata) 21 | 22 | return doc 23 | 24 | 25 | def extract_text_from_filepath(filepath: str, mimetype: Optional[str] = None) -> str: 26 | """Return the text content of a file given its filepath.""" 27 | 28 | if mimetype is None: 29 | # Get the mimetype of the file based on its extension 30 | mimetype, _ = mimetypes.guess_type(filepath) 31 | 32 | if not mimetype: 33 | if filepath.endswith(".md"): 34 | mimetype = "text/markdown" 35 | else: 36 | raise Exception("Unsupported file type") 37 | 38 | try: 39 | with open(filepath, "rb") as file: 40 | extracted_text = extract_text_from_file(file, mimetype) 41 | except Exception as e: 42 | logger.error(e) 43 | raise e 44 | 45 | return extracted_text 46 | 47 | 48 | def extract_text_from_file(file: BufferedReader, mimetype: str) 
-> str: 49 | if mimetype == "application/pdf": 50 | # Extract text from pdf using PyPDF2 51 | reader = PdfReader(file) 52 | extracted_text = " ".join([page.extract_text() for page in reader.pages]) 53 | elif mimetype == "text/plain" or mimetype == "text/markdown": 54 | # Read text from plain text file 55 | extracted_text = file.read().decode("utf-8") 56 | elif ( 57 | mimetype 58 | == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" 59 | ): 60 | # Extract text from docx using docx2txt 61 | extracted_text = docx2txt.process(file) 62 | elif mimetype == "text/csv": 63 | # Extract text from csv using csv module 64 | extracted_text = "" 65 | decoded_buffer = (line.decode("utf-8") for line in file) 66 | reader = csv.reader(decoded_buffer) 67 | for row in reader: 68 | extracted_text += " ".join(row) + "\n" 69 | elif ( 70 | mimetype 71 | == "application/vnd.openxmlformats-officedocument.presentationml.presentation" 72 | ): 73 | # Extract text from pptx using python-pptx 74 | extracted_text = "" 75 | presentation = pptx.Presentation(file) 76 | for slide in presentation.slides: 77 | for shape in slide.shapes: 78 | if shape.has_text_frame: 79 | for paragraph in shape.text_frame.paragraphs: 80 | for run in paragraph.runs: 81 | extracted_text += run.text + " " 82 | extracted_text += "\n" 83 | else: 84 | # Unsupported file type 85 | raise ValueError("Unsupported file type: {}".format(mimetype)) 86 | 87 | return extracted_text 88 | 89 | 90 | # Extract text from a file based on its mimetype 91 | async def extract_text_from_form_file(file: UploadFile): 92 | """Return the text content of a file.""" 93 | # get the file body from the upload file object 94 | mimetype = file.content_type 95 | logger.info(f"mimetype: {mimetype}") 96 | logger.info(f"file.file: {file.file}") 97 | logger.info(f"file: {file}") 98 | 99 | file_stream = await file.read() 100 | 101 | temp_file_path = "/tmp/temp_file" 102 | 103 | # write the file to a temporary location 104 | with open(temp_file_path, "wb") as f: 105 | f.write(file_stream) 106 | 107 | try: 108 | extracted_text = extract_text_from_filepath(temp_file_path, mimetype) 109 | except Exception as e: 110 | logger.error(e) 111 | os.remove(temp_file_path) 112 | raise e 113 | 114 | # remove file from temp location 115 | os.remove(temp_file_path) 116 | 117 | return extracted_text 118 | -------------------------------------------------------------------------------- /services/openai.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import openai 3 | import os 4 | from loguru import logger 5 | 6 | from tenacity import retry, wait_random_exponential, stop_after_attempt 7 | 8 | EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large") 9 | EMBEDDING_DIMENSION = int(os.environ.get("EMBEDDING_DIMENSION", 256)) 10 | 11 | 12 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3)) 13 | def get_embeddings(texts: List[str]) -> List[List[float]]: 14 | """ 15 | Embed texts using the configured OpenAI embedding model (EMBEDDING_MODEL). 16 | 17 | Args: 18 | texts: The list of texts to embed. 19 | 20 | Returns: 21 | A list of embeddings, each of which is a list of floats. 22 | 23 | Raises: 24 | Exception: If the OpenAI API call fails.
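Example (illustrative): with the defaults above, get_embeddings(["hello world"]) should return a list containing one embedding of EMBEDDING_DIMENSION (256) floats; the Azure deployment path returns whatever dimension that deployment is configured for.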
25 | """ 26 | # Call the OpenAI API to get the embeddings 27 | # NOTE: Azure Open AI requires deployment id 28 | deployment = os.environ.get("OPENAI_EMBEDDINGMODEL_DEPLOYMENTID") 29 | 30 | response = {} 31 | if deployment is None: 32 | response = openai.Embedding.create(input=texts, model=EMBEDDING_MODEL, dimensions=EMBEDDING_DIMENSION) 33 | else: 34 | response = openai.Embedding.create(input=texts, deployment_id=deployment) 35 | 36 | # Extract the embedding data from the response 37 | data = response["data"] # type: ignore 38 | 39 | # Return the embeddings as a list of lists of floats 40 | return [result["embedding"] for result in data] 41 | 42 | 43 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3)) 44 | def get_chat_completion( 45 | messages, 46 | model="gpt-3.5-turbo", # use "gpt-4" for better results 47 | deployment_id=None, 48 | ): 49 | """ 50 | Generate a chat completion using OpenAI's chat completion API. 51 | 52 | Args: 53 | messages: The list of messages in the chat history. 54 | model: The name of the model to use for the completion. Default is gpt-3.5-turbo, which is a fast, cheap and versatile model. Use gpt-4 for higher quality but slower results. 55 | 56 | Returns: 57 | A string containing the chat completion. 58 | 59 | Raises: 60 | Exception: If the OpenAI API call fails. 61 | """ 62 | # call the OpenAI chat completion API with the given messages 63 | # Note: Azure Open AI requires deployment id 64 | response = {} 65 | if deployment_id == None: 66 | response = openai.ChatCompletion.create( 67 | model=model, 68 | messages=messages, 69 | ) 70 | else: 71 | response = openai.ChatCompletion.create( 72 | deployment_id=deployment_id, 73 | messages=messages, 74 | ) 75 | 76 | choices = response["choices"] # type: ignore 77 | completion = choices[0].message.content.strip() 78 | logger.info(f"Completion: {completion}") 79 | return completion 80 | -------------------------------------------------------------------------------- /services/pii_detection.py: -------------------------------------------------------------------------------- 1 | import os 2 | from services.openai import get_chat_completion 3 | 4 | 5 | def screen_text_for_pii(text: str) -> bool: 6 | # This prompt is just an example, change it to fit your use case 7 | messages = [ 8 | { 9 | "role": "system", 10 | "content": f""" 11 | You can only respond with the word "True" or "False", where your answer indicates whether the text in the user's message contains PII. 12 | Do not explain your answer, and do not use punctuation. 13 | Your task is to identify whether the text extracted from your company files 14 | contains sensitive PII information that should not be shared with the broader company. Here are some things to look out for: 15 | - An email address that identifies a specific person in either the local-part or the domain 16 | - The postal address of a private residence (must include at least a street name) 17 | - The postal address of a public place (must include either a street name or business name) 18 | - Notes about hiring decisions with mentioned names of candidates. The user will send a document for you to analyze. 
19 | """, 20 | }, 21 | {"role": "user", "content": text}, 22 | ] 23 | 24 | completion = get_chat_completion( 25 | messages, deployment_id=os.environ.get("OPENAI_COMPLETIONMODEL_DEPLOYMENTID") 26 | ) 27 | 28 | if completion.startswith("True"): 29 | return True 30 | 31 | return False 32 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/tests/__init__.py -------------------------------------------------------------------------------- /tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from typing import Dict, List 3 | from dotenv import dotenv_values 4 | 5 | from datastore.datastore import DataStore 6 | from datastore.providers.azurecosmosdb_datastore import AzureCosmosDBDataStore 7 | from models.models import ( 8 | DocumentChunk, 9 | DocumentChunkMetadata, 10 | QueryWithEmbedding, 11 | ) 12 | import os 13 | 14 | num_lists = 1 15 | similarity = "COS" 16 | 17 | EMBEDDING_DIMENSION = int(os.environ.get("EMBEDDING_DIMENSION", 256)) 18 | 19 | 20 | def create_embedding(non_zero_pos: int) -> List[float]: 21 | # create a vector with a single non-zero value of dimension EMBEDDING_DIMENSION 22 | vector = [0.0] * EMBEDDING_DIMENSION 23 | vector[non_zero_pos - 1] = 1.0 24 | return vector 25 | 26 | 27 | @pytest.fixture 28 | def azure_cosmos_db_settings_from_dot_env() -> dict: 29 | """ 30 | Reads the Azure CosmosDB environment variables for the .env file. 31 | 32 | Returns: 33 | dict: The Azure CosmosDB environment variables 34 | """ 35 | config = dotenv_values(".env") 36 | env_variables = { 37 | "DATASTORE": "azurecosmosdb", 38 | "AZCOSMOS_API": config.get( 39 | ("AZCOSMOS_API") 40 | ), # Right now CosmosDB only supports vector search in Mongo vCore. 
41 | "AZCOSMOS_CONNSTR": config.get("AZCOSMOS_CONNSTR"), 42 | "AZCOSMOS_DATABASE_NAME": config.get("AZCOSMOS_DATABASE_NAME"), 43 | "AZCOSMOS_CONTAINER_NAME": config.get("AZCOSMOS_CONTAINER_NAME"), 44 | } 45 | 46 | return env_variables 47 | 48 | 49 | @pytest.fixture 50 | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]: 51 | first_doc_chunks = [ 52 | DocumentChunk( 53 | id=f"first-doc-{i}", 54 | text=f"Lorem ipsum {i}", 55 | metadata=DocumentChunkMetadata(), 56 | embedding=create_embedding(i), 57 | ) 58 | for i in range(4, 7) 59 | ] 60 | return { 61 | "first-doc": first_doc_chunks, 62 | } 63 | 64 | 65 | @pytest.fixture 66 | def queries() -> List[QueryWithEmbedding]: 67 | queries = [ 68 | QueryWithEmbedding( 69 | query="Query 1", 70 | top_k=1, 71 | embedding=create_embedding(4), 72 | ), 73 | QueryWithEmbedding( 74 | query="Query 2", 75 | top_k=2, 76 | embedding=create_embedding(5), 77 | ), 78 | ] 79 | return queries 80 | 81 | 82 | @pytest.fixture 83 | async def azurecosmosdb_datastore() -> DataStore: 84 | return await AzureCosmosDBDataStore.create( 85 | num_lists=num_lists, similarity=similarity 86 | ) 87 | 88 | 89 | @pytest.mark.asyncio 90 | async def test_upsert( 91 | azurecosmosdb_datastore: AzureCosmosDBDataStore, 92 | initial_document_chunks: Dict[str, List[DocumentChunk]], 93 | ) -> None: 94 | """Test basic upsert.""" 95 | doc_ids = await azurecosmosdb_datastore._upsert(initial_document_chunks) 96 | assert doc_ids == [ 97 | f"doc:{doc_id}:chunk:{chunk.id}" 98 | for doc_id, chunk_list in initial_document_chunks.items() 99 | for chunk in chunk_list 100 | ] 101 | 102 | 103 | @pytest.mark.asyncio 104 | async def test_query( 105 | azurecosmosdb_datastore: AzureCosmosDBDataStore, 106 | initial_document_chunks: Dict[str, List[DocumentChunk]], 107 | queries: List[QueryWithEmbedding], 108 | ) -> None: 109 | """Test basic query.""" 110 | await azurecosmosdb_datastore.delete(delete_all=True) 111 | # insert to prepare for the test 112 | await azurecosmosdb_datastore._upsert(initial_document_chunks) 113 | 114 | query_results = await azurecosmosdb_datastore._query(queries) 115 | assert len(query_results) == len(queries) 116 | 117 | query_0_results = query_results[0].results 118 | query_1_results = query_results[1].results 119 | 120 | assert len(query_0_results) == 1 121 | assert len(query_1_results) == 2 122 | 123 | # NOTE: this is the correct behavior 124 | assert query_0_results[0].id == "doc:first-doc:chunk:first-doc-4" 125 | assert query_1_results[0].id == "doc:first-doc:chunk:first-doc-5" 126 | assert query_1_results[1].id == "doc:first-doc:chunk:first-doc-4" 127 | 128 | 129 | @pytest.mark.asyncio 130 | async def test_delete(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None: 131 | await azurecosmosdb_datastore.delete(delete_all=True) 132 | chunk1 = DocumentChunk( 133 | id="deleteChunk1", 134 | text="delete text 1", 135 | embedding=[1] * EMBEDDING_DIMENSION, 136 | metadata=DocumentChunkMetadata(), 137 | ) 138 | chunk2 = DocumentChunk( 139 | id="deleteChunk2", 140 | text="delete text 2", 141 | embedding=[1] * EMBEDDING_DIMENSION, 142 | metadata=DocumentChunkMetadata(), 143 | ) 144 | # insert to prepare for test 145 | await azurecosmosdb_datastore._upsert( 146 | {"deleteDoc1": [chunk1], "deleteDoc2": [chunk2]} 147 | ) 148 | 149 | query_embedding = [1] * EMBEDDING_DIMENSION 150 | query = QueryWithEmbedding( 151 | query="Query for delete", 152 | embedding=query_embedding, 153 | ) 154 | results = await azurecosmosdb_datastore._query([query]) 155 | 156 | assert 
len(results[0].results) == 2 157 | assert results[0].results[0].id == "doc:deleteDoc1:chunk:deleteChunk1" 158 | assert results[0].results[1].id == "doc:deleteDoc2:chunk:deleteChunk2" 159 | 160 | await azurecosmosdb_datastore.delete(ids=["doc:deleteDoc1:chunk:deleteChunk1"]) 161 | results_after_delete = await azurecosmosdb_datastore._query([query]) 162 | 163 | assert len(results_after_delete[0].results) == 1 164 | assert results_after_delete[0].results[0].id == "doc:deleteDoc2:chunk:deleteChunk2" 165 | 166 | 167 | @pytest.mark.asyncio 168 | async def test_delete_all(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None: 169 | await azurecosmosdb_datastore.delete(delete_all=True) 170 | chunk = DocumentChunk( 171 | id="deleteChunk", 172 | text="delete text", 173 | embedding=[1] * EMBEDDING_DIMENSION, 174 | metadata=DocumentChunkMetadata(), 175 | ) 176 | await azurecosmosdb_datastore._upsert({"deleteDoc": [chunk]}) 177 | 178 | query_embedding = [1] * EMBEDDING_DIMENSION 179 | query = QueryWithEmbedding( 180 | query="delete query", 181 | embedding=query_embedding, 182 | top_k=1, 183 | ) 184 | results = await azurecosmosdb_datastore._query([query]) 185 | 186 | assert len(results) == 1 187 | assert len(results[0].results) == 1 188 | assert results[0].results[0].id == "doc:deleteDoc:chunk:deleteChunk" 189 | 190 | await azurecosmosdb_datastore.delete(delete_all=True) 191 | results_after_delete = await azurecosmosdb_datastore._query([query]) 192 | 193 | assert len(results_after_delete[0].results) == 0 194 | -------------------------------------------------------------------------------- /tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from models.models import ( 3 | DocumentChunkMetadata, 4 | DocumentMetadataFilter, 5 | DocumentChunk, 6 | QueryWithEmbedding, 7 | Source, 8 | ) 9 | from datastore.providers.elasticsearch_datastore import ( 10 | ElasticsearchDataStore, 11 | ) 12 | import time 13 | import os 14 | 15 | DIM_SIZE = int(os.environ.get("EMBEDDING_DIMENSION", 256)) 16 | 17 | 18 | @pytest.fixture 19 | def elasticsearch_datastore(): 20 | return ElasticsearchDataStore() 21 | 22 | 23 | def sample_embedding(one_element_poz: int): 24 | embedding = [0] * DIM_SIZE 25 | embedding[one_element_poz % DIM_SIZE] = 1 26 | return embedding 27 | 28 | 29 | def sample_embeddings(num: int, one_element_start: int = 0): 30 | embeddings = [] 31 | for x in range(num): 32 | embedding = [0] * DIM_SIZE 33 | embedding[(x + one_element_start) % DIM_SIZE] = 1 34 | embeddings.append(embedding) 35 | return embeddings 36 | 37 | 38 | @pytest.fixture 39 | def document_chunk_one(): 40 | doc_id = "abc" 41 | doc_chunks = [] 42 | 43 | ids = ["123", "456", "789"] 44 | texts = [ 45 | "Aenean euismod bibendum laoreet", 46 | "Vivamus non enim vitae tortor", 47 | "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae", 48 | ] 49 | sources = [Source.email, Source.file, Source.chat] 50 | created_ats = [ 51 | "1929-10-28T09:30:00-05:00", 52 | "2009-01-03T16:39:57-08:00", 53 | "2021-01-21T10:00:00-02:00", 54 | ] 55 | authors = ["Fred Smith", "Bob Doe", "Appleton Doe"] 56 | 57 | embeddings = sample_embeddings(len(texts)) 58 | 59 | for i in range(3): 60 | chunk = DocumentChunk( 61 | id=ids[i], 62 | text=texts[i], 63 | metadata=DocumentChunkMetadata( 64 | document_id=doc_id, 65 | source=sources[i], 66 | created_at=created_ats[i], 67 | author=authors[i], 68 | ), 69 |
embedding=embeddings[i], # type: ignore 70 | ) 71 | 72 | doc_chunks.append(chunk) 73 | 74 | return {doc_id: doc_chunks} 75 | 76 | 77 | async def test_upsert(elasticsearch_datastore, document_chunk_one): 78 | await elasticsearch_datastore.delete(delete_all=True) 79 | res = await elasticsearch_datastore._upsert(document_chunk_one) 80 | assert res == list(document_chunk_one.keys()) 81 | time.sleep(1) 82 | 83 | results = elasticsearch_datastore.client.search( 84 | index=elasticsearch_datastore.index_name, query={"match_all": {}} 85 | ) 86 | assert results["hits"]["total"]["value"] == 3 87 | elasticsearch_datastore.client.indices.delete( 88 | index=elasticsearch_datastore.index_name 89 | ) 90 | 91 | 92 | async def test_upsert_query_all(elasticsearch_datastore, document_chunk_one): 93 | await elasticsearch_datastore.delete(delete_all=True) 94 | res = await elasticsearch_datastore._upsert(document_chunk_one) 95 | assert res == list(document_chunk_one.keys()) 96 | time.sleep(1) 97 | 98 | query = QueryWithEmbedding( 99 | query="Aenean", 100 | top_k=10, 101 | embedding=sample_embedding(0), # type: ignore 102 | ) 103 | query_results = await elasticsearch_datastore._query(queries=[query]) 104 | 105 | assert 1 == len(query_results) 106 | assert 3 == len(query_results[0].results) 107 | 108 | 109 | async def test_delete_with_document_id(elasticsearch_datastore, document_chunk_one): 110 | await elasticsearch_datastore.delete(delete_all=True) 111 | res = await elasticsearch_datastore._upsert(document_chunk_one) 112 | time.sleep(1) 113 | assert res == list(document_chunk_one.keys()) 114 | await elasticsearch_datastore.delete([res[0]]) 115 | time.sleep(1) 116 | 117 | query = QueryWithEmbedding( 118 | query="Aenean", 119 | top_k=9, 120 | embedding=sample_embedding(0), # type: ignore 121 | ) 122 | query_results = await elasticsearch_datastore._query(queries=[query]) 123 | 124 | assert 1 == len(query_results) 125 | assert 0 == len(query_results[0].results) 126 | 127 | elasticsearch_datastore.client.indices.delete( 128 | index=elasticsearch_datastore.index_name 129 | ) 130 | 131 | 132 | async def test_delete_with_source_filter(elasticsearch_datastore, document_chunk_one): 133 | await elasticsearch_datastore.delete(delete_all=True) 134 | res = await elasticsearch_datastore._upsert(document_chunk_one) 135 | assert res == list(document_chunk_one.keys()) 136 | time.sleep(1) 137 | 138 | await elasticsearch_datastore.delete( 139 | filter=DocumentMetadataFilter( 140 | source=Source.email, 141 | ) 142 | ) 143 | 144 | time.sleep(1) 145 | 146 | query = QueryWithEmbedding( 147 | query="Aenean", 148 | top_k=9, 149 | embedding=sample_embedding(0), # type: ignore 150 | ) 151 | query_results = await elasticsearch_datastore._query(queries=[query]) 152 | 153 | assert 1 == len(query_results) 154 | assert 2 == len(query_results[0].results) 155 | assert "456" == query_results[0].results[0].id 156 | 157 | elasticsearch_datastore.client.indices.delete( 158 | index=elasticsearch_datastore.index_name 159 | ) 160 | -------------------------------------------------------------------------------- /tests/datastore/providers/llama/test_llama_datastore.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | import pytest 3 | from datastore.providers.llama_datastore import LlamaDataStore 4 | from models.models import DocumentChunk, DocumentChunkMetadata, QueryWithEmbedding 5 | 6 | 7 | def create_embedding(non_zero_pos: int, size: int) -> List[float]: 8 | vector = [0.0] * size 
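# one-hot embedding: the next line sets a single coordinate to 1.0 so each test chunk gets a distinct, easily ranked vector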
9 | vector[non_zero_pos % size] = 1.0 10 | return vector 11 | 12 | 13 | @pytest.fixture 14 | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]: 15 | first_doc_chunks = [ 16 | DocumentChunk( 17 | id=f"first-doc-{i}", 18 | text=f"Lorem ipsum {i}", 19 | metadata=DocumentChunkMetadata(), 20 | embedding=create_embedding(i, 5), 21 | ) 22 | for i in range(4, 7) 23 | ] 24 | return { 25 | "first-doc": first_doc_chunks, 26 | } 27 | 28 | 29 | @pytest.fixture 30 | def queries() -> List[QueryWithEmbedding]: 31 | queries = [ 32 | QueryWithEmbedding( 33 | query="Query 1", 34 | top_k=1, 35 | embedding=create_embedding(4, 5), 36 | ), 37 | QueryWithEmbedding( 38 | query="Query 2", 39 | top_k=2, 40 | embedding=create_embedding(5, 5), 41 | ), 42 | ] 43 | return queries 44 | 45 | 46 | @pytest.fixture 47 | def llama_datastore() -> LlamaDataStore: 48 | return LlamaDataStore() 49 | 50 | 51 | @pytest.mark.asyncio 52 | async def test_upsert( 53 | llama_datastore: LlamaDataStore, 54 | initial_document_chunks: Dict[str, List[DocumentChunk]], 55 | ) -> None: 56 | """Test basic upsert.""" 57 | doc_ids = await llama_datastore._upsert(initial_document_chunks) 58 | assert doc_ids == [doc_id for doc_id in initial_document_chunks] 59 | 60 | 61 | @pytest.mark.asyncio 62 | async def test_query( 63 | llama_datastore: LlamaDataStore, 64 | initial_document_chunks: Dict[str, List[DocumentChunk]], 65 | queries: List[QueryWithEmbedding], 66 | ) -> None: 67 | """Test basic query.""" 68 | # insert to prepare for test 69 | await llama_datastore._upsert(initial_document_chunks) 70 | 71 | query_results = await llama_datastore._query(queries) 72 | assert len(query_results) == len(queries) 73 | 74 | query_0_results = query_results[0].results 75 | query_1_results = query_results[1].results 76 | 77 | assert len(query_0_results) == 1 78 | assert len(query_1_results) == 2 79 | 80 | # NOTE: this is the correct behavior 81 | assert query_0_results[0].id == "first-doc-4" 82 | assert query_1_results[0].id == "first-doc-5" 83 | assert query_1_results[1].id == "first-doc-4" 84 | 85 | 86 | @pytest.mark.asyncio 87 | async def test_delete( 88 | llama_datastore: LlamaDataStore, 89 | initial_document_chunks: Dict[str, List[DocumentChunk]], 90 | ) -> None: 91 | # insert to prepare for test 92 | await llama_datastore._upsert(initial_document_chunks) 93 | 94 | is_success = llama_datastore.delete(["first-doc"]) 95 | assert is_success 96 | -------------------------------------------------------------------------------- /tests/datastore/providers/mongodb_atlas/test_integration.py: -------------------------------------------------------------------------------- 1 | """Integration Tests of ChatGPT Retrieval Plugin 2 | with MongoDB Atlas Vector Datastore and OPENAI Embedding model. 3 | 4 | As described in docs/providers/mongodb/setup.md, to run this, one must 5 | have a running MongoDB Atlas Cluster, and 6 | provide a valid OPENAI_API_KEY. 7 | """ 8 | 9 | import os 10 | from time import sleep 11 | 12 | import openai 13 | import pytest 14 | from fastapi.testclient import TestClient 15 | from httpx import Response 16 | from pymongo import MongoClient 17 | 18 | from server.main import app 19 | 20 | 21 | @pytest.fixture(scope="session") 22 | def documents(): 23 | """ List of documents represents data to be embedded in the datastore. 24 | Minimum requirements fpr Documents in the /upsert endpoint's UpsertRequest. 
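Only a "text" field is supplied for each document here; the ids returned by /upsert (checked in test_upsert below) are generated server-side.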
25 | """ 26 | return [ 27 | {"text": "The quick brown fox jumped over the slimy green toad."}, 28 | {"text": "The big brown bear jumped over the lazy dog."}, 29 | {"text": "Toads are frogs."}, 30 | {"text": "Green toads are basically red frogs."}, 31 | ] 32 | 33 | 34 | @pytest.fixture(scope="session", autouse=True) 35 | def client(): 36 | """TestClient makes requests to FastAPI service.""" 37 | endpoint_url = "http://127.0.0.1:8000" 38 | headers = {"Authorization": f"Bearer {os.environ['BEARER_TOKEN']}"} 39 | with TestClient(app=app, base_url=endpoint_url, headers=headers) as client: 40 | yield client 41 | 42 | 43 | @pytest.fixture(scope="session") 44 | def delete(client) -> bool: 45 | """Drop existing documents from the collection""" 46 | response = client.request("DELETE", "/delete", json={"delete_all": True}) 47 | sleep(2) 48 | return response 49 | 50 | 51 | @pytest.fixture(scope="session") 52 | def upsert(delete, documents, client) -> bool: 53 | """Upload documents to the datastore via plugin's REST API.""" 54 | response = client.post("/upsert", json={"documents": documents}) 55 | sleep(2) # At this point, the Vector Search Index is being built 56 | return response 57 | 58 | 59 | def test_delete(delete) -> None: 60 | """Simply confirm that delete fixture ran successfully""" 61 | assert delete.status_code == 200 62 | assert delete.json()['success'] 63 | 64 | 65 | def test_upsert(upsert) -> None: 66 | """Simply confirm that upsert fixture has run successfully""" 67 | assert upsert.status_code == 200 68 | assert len(upsert.json()['ids']) == 4 69 | 70 | 71 | def test_query(upsert, client) -> None: # upsert, 72 | """Test queries produce reasonable results, 73 | now that datastore contains embedded data which has been indexed 74 | """ 75 | question = "What did the fox jump over?" 76 | n_requested = 2 # top N results per query 77 | got_response = False 78 | retries = 5 79 | query_result = {} 80 | while retries and not got_response: 81 | response = client.post("/query", json={'queries': [{"query": question, "top_k": n_requested}]}) 82 | assert isinstance(response, Response) 83 | assert response.status_code == 200 84 | assert len(response.json()) == 1 85 | query_result = response.json()['results'][0] 86 | if len(query_result['results']) == n_requested: 87 | got_response = True 88 | else: 89 | retries -= 1 90 | sleep(5) 91 | 92 | assert got_response # we got n_requested responses 93 | assert query_result['query'] == question 94 | answers = [] 95 | scores = [] 96 | for result in query_result['results']: 97 | answers.append(result['text']) 98 | scores.append(round(result['score'], 2)) 99 | assert 0.8 < scores[0] < 0.9 100 | assert answers[0] == "The quick brown fox jumped over the slimy green toad." 
101 | 102 | 103 | def test_required_vars() -> None: 104 | """Confirm that the environment has all it needs""" 105 | required_vars = {'BEARER_TOKEN', 'OPENAI_API_KEY', 'DATASTORE', 'EMBEDDING_DIMENSION', 'EMBEDDING_MODEL', 106 | 'MONGODB_COLLECTION', 'MONGODB_DATABASE', 'MONGODB_INDEX', 'MONGODB_URI'} 107 | assert os.environ["DATASTORE"] == 'mongodb' 108 | missing = required_vars - set(os.environ) 109 | assert len(missing) == 0 110 | 111 | 112 | def test_mongodb_connection() -> None: 113 | """Confirm that the connection to the datastore works.""" 114 | client = MongoClient(os.environ["MONGODB_URI"]) 115 | assert client.admin.command('ping')['ok'] 116 | 117 | 118 | def test_openai_connection() -> None: 119 | """Check that we can call OpenAI Embedding models.""" 120 | openai.api_key = os.environ["OPENAI_API_KEY"] 121 | models = openai.Model.list() 122 | model_names = [model["id"] for model in models['data']] 123 | for model_name in model_names: 124 | try: 125 | response = openai.Embedding.create(input=["Some input text"], model=model_name) 126 | assert len(response['data'][0]['embedding']) >= int(os.environ['EMBEDDING_DIMENSION']) 127 | except: 128 | pass # Not all models are for text embedding. 129 | -------------------------------------------------------------------------------- /tests/datastore/providers/redis/test_redis_datastore.py: -------------------------------------------------------------------------------- 1 | from datastore.providers.redis_datastore import RedisDataStore 2 | from models.models import ( 3 | DocumentChunk, 4 | DocumentChunkMetadata, 5 | QueryWithEmbedding, 6 | Source, 7 | DocumentMetadataFilter, 8 | ) 9 | import pytest 10 | import redis.asyncio as redis 11 | import numpy as np 12 | 13 | NUM_TEST_DOCS = 10 14 | 15 | 16 | @pytest.fixture 17 | async def redis_datastore(): 18 | return await RedisDataStore.init(dim=5) 19 | 20 | 21 | def create_embedding(i, dim): 22 | vec = np.array([0.1] * dim).astype(np.float64).tolist() 23 | vec[dim - 1] = i + 1 / 10 24 | return vec 25 | 26 | 27 | def create_document_chunk(i, dim): 28 | return DocumentChunk( 29 | id=f"first-doc_{i}", 30 | text=f"Lorem ipsum {i}", 31 | embedding=create_embedding(i, dim), 32 | metadata=DocumentChunkMetadata( 33 | source=Source.file, created_at="1970-01-01", document_id="docs" 34 | ), 35 | ) 36 | 37 | 38 | def create_document_chunks(n, dim): 39 | docs = [create_document_chunk(i, dim) for i in range(n)] 40 | return {"docs": docs} 41 | 42 | 43 | @pytest.mark.asyncio 44 | async def test_redis_upsert_query(redis_datastore): 45 | docs = create_document_chunks(NUM_TEST_DOCS, 5) 46 | await redis_datastore._upsert(docs) 47 | query = QueryWithEmbedding( 48 | query="Lorem ipsum 0", 49 | top_k=5, 50 | embedding=create_embedding(0, 5), 51 | ) 52 | query_results = await redis_datastore._query(queries=[query]) 53 | assert 1 == len(query_results) 54 | for i in range(5): 55 | assert f"Lorem ipsum {i}" == query_results[0].results[i].text 56 | assert "docs" == query_results[0].results[i].id 57 | 58 | 59 | @pytest.mark.asyncio 60 | async def test_redis_filter_query(redis_datastore): 61 | query = QueryWithEmbedding( 62 | query="Lorem ipsum 0", 63 | filter=DocumentMetadataFilter(document_id="docs"), 64 | top_k=5, 65 | embedding=create_embedding(0, 5), 66 | ) 67 | query_results = await redis_datastore._query(queries=[query]) 68 | print(query_results) 69 | assert 1 == len(query_results) 70 | assert "docs" == query_results[0].results[0].id 71 | 72 | 73 | @pytest.mark.asyncio 74 | async def 
test_redis_delete_docs(redis_datastore): 75 | res = await redis_datastore.delete(ids=["docs"]) 76 | assert res 77 | -------------------------------------------------------------------------------- /tests/datastore/providers/weaviate/docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3.4' 3 | services: 4 | weaviate: 5 | command: 6 | - --host 7 | - 0.0.0.0 8 | - --port 9 | - '8080' 10 | - --scheme 11 | - http 12 | image: semitechnologies/weaviate:1.18.0 13 | ports: 14 | - 8080:8080 15 | restart: on-failure:0 16 | environment: 17 | QUERY_DEFAULTS_LIMIT: 25 18 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 19 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 20 | DEFAULT_VECTORIZER_MODULE: 'none' 21 | ENABLE_MODULES: '' 22 | CLUSTER_HOSTNAME: 'node1' 23 | LOG_LEVEL: debug 24 | AUTOSCHEMA_ENABLED: 'false' 25 | ... -------------------------------------------------------------------------------- /tests/datastore/providers/zilliz/test_zilliz_datastore.py: -------------------------------------------------------------------------------- 1 | # from pathlib import Path 2 | # from dotenv import find_dotenv, load_dotenv 3 | # env_path = Path(".") / "zilliz.env" 4 | # load_dotenv(dotenv_path=env_path, verbose=True) 5 | 6 | import pytest 7 | 8 | from datastore.providers.zilliz_datastore import ( 9 | ZillizDataStore, 10 | ) 11 | 12 | from datastore.providers.milvus_datastore import ( 13 | EMBEDDING_FIELD, 14 | ) 15 | 16 | # Note: Only do basic test here, the ZillizDataStore is derived from MilvusDataStore. 17 | 18 | 19 | @pytest.fixture 20 | def zilliz_datastore(): 21 | return ZillizDataStore() 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_zilliz(zilliz_datastore): 26 | assert True == zilliz_datastore.col.has_index() 27 | index_list = [x.to_dict() for x in zilliz_datastore.col.indexes] 28 | for index in index_list: 29 | if index["index_name"] == EMBEDDING_FIELD: 30 | assert "AUTOINDEX" == index["index_param"]["index_type"] 31 | --------------------------------------------------------------------------------