├── .dockerignore ├── .env.example ├── .github └── pull_request_template.md ├── .gitignore ├── .well-known ├── ai-plugin.json ├── logo.png └── openapi.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── assets └── example.png ├── datastore ├── __init__.py ├── datastore.py ├── factory.py └── providers │ ├── __init__.py │ ├── analyticdb_datastore.py │ ├── azurecosmosdb_datastore.py │ ├── azuresearch_datastore.py │ ├── chroma_datastore.py │ ├── elasticsearch_datastore.py │ ├── llama_datastore.py │ ├── milvus_datastore.py │ ├── mongodb_atlas_datastore.py │ ├── pgvector_datastore.py │ ├── pinecone_datastore.py │ ├── postgres_datastore.py │ ├── qdrant_datastore.py │ ├── redis_datastore.py │ ├── supabase_datastore.py │ ├── weaviate_datastore.py │ └── zilliz_datastore.py ├── docs ├── deployment │ ├── flyio.md │ ├── heroku.md │ ├── other-options.md │ ├── removing-unused-dependencies.md │ ├── render-thumbnail.png │ └── render.md ├── deprecated │ └── plugins.md └── providers │ ├── analyticdb │ └── setup.md │ ├── azurecosmosdb │ └── setup.md │ ├── azuresearch │ └── setup.md │ ├── chroma │ └── setup.md │ ├── elasticsearch │ └── setup.md │ ├── llama │ └── setup.md │ ├── milvus │ └── setup.md │ ├── mongodb │ └── setup.md │ ├── pinecone │ └── setup.md │ ├── postgres │ └── setup.md │ ├── qdrant │ └── setup.md │ ├── redis │ └── setup.md │ ├── supabase │ └── setup.md │ ├── weaviate │ └── setup.md │ └── zilliz │ └── setup.md ├── examples ├── authentication-methods │ ├── no-auth │ │ ├── ai-plugin.json │ │ └── main.py │ ├── oauth │ │ └── ai-plugin.json │ ├── service-http │ │ └── ai-plugin.json │ └── user-http │ │ └── ai-plugin.json ├── docker │ ├── elasticsearch │ │ ├── README.md │ │ └── docker-compose.yaml │ ├── milvus │ │ └── docker-compose.yaml │ ├── qdrant │ │ ├── README.md │ │ ├── docker-compose.yaml │ │ ├── documents.json │ │ └── queries.json │ └── redis │ │ └── docker-compose.yml ├── function-calling │ └── README.md ├── memory │ ├── README.md │ ├── ai-plugin.json │ ├── main.py │ └── openapi.yaml └── providers │ ├── azurecosmosdb │ └── semantic-search.ipynb │ ├── elasticsearch │ └── search.ipynb │ ├── mongodb │ └── semantic-search.ipynb │ ├── pinecone │ └── semantic-search.ipynb │ ├── redis │ └── semantic-search-and-filter.ipynb │ └── supabase │ ├── .gitignore │ ├── config.toml │ ├── migrations │ └── 20230414142107_init_pg_vector.sql │ └── seed.sql ├── local_server ├── ai-plugin.json ├── logo.png ├── main.py └── openapi.yaml ├── models ├── api.py └── models.py ├── poetry.lock ├── pyproject.toml ├── scripts ├── process_json │ ├── README.md │ ├── example.json │ └── process_json.py ├── process_jsonl │ ├── README.md │ ├── example.jsonl │ └── process_jsonl.py └── process_zip │ ├── README.md │ ├── example.zip │ └── process_zip.py ├── server └── main.py ├── services ├── chunks.py ├── date.py ├── extract_metadata.py ├── file.py ├── openai.py └── pii_detection.py └── tests ├── __init__.py └── datastore └── providers ├── analyticdb └── test_analyticdb_datastore.py ├── azurecosmosdb └── test_azurecosmosdb_datastore.py ├── azuresearch └── test_azuresearch_datastore.py ├── chroma └── test_chroma_datastore.py ├── elasticsearch └── test_elasticsearch_datastore.py ├── llama └── test_llama_datastore.py ├── milvus └── test_milvus_datastore.py ├── mongodb_atlas ├── test_integration.py └── test_mongodb_datastore.py ├── postgres └── test_postgres_datastore.py ├── qdrant └── test_qdrant_datastore.py ├── redis └── test_redis_datastore.py ├── supabase └── test_supabase_datastore.py ├── weaviate ├── docker-compose.yml 
└── test_weaviate_datastore.py └── zilliz └── test_zilliz_datastore.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Ignore files that are already ignored by git 2 | .gitignore 3 | 4 | scripts/ 5 | tests/ 6 | examples/ 7 | local_server/ 8 | assets/ 9 | *.md 10 | *.pyc 11 | .dockerignore 12 | Dockerfile 13 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Core environment variables 2 | DATASTORE="" 3 | BEARER_TOKEN="" 4 | OPENAI_API_KEY="" 5 | EMBEDDING_DIMENSION=256 # edit this value based on the dimension of the embeddings you want to use 6 | EMBEDDING_MODEL="text-embedding-3-large" # edit this value based on the model you want to use e.g. text-embedding-3-small, text-embedding-ada-002 7 | 8 | # Optional environment variables for Azure OpenAI 9 | OPENAI_API_BASE="https://.openai.azure.com/" 10 | OPENAI_API_TYPE="azure" 11 | OPENAI_EMBEDDINGMODEL_DEPLOYMENTID="" 12 | OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID="" 13 | OPENAI_COMPLETIONMODEL_DEPLOYMENTID="" 14 | OPENAI_EMBEDDING_BATCH_SIZE="" 15 | 16 | # Pinecone configuration 17 | PINECONE_API_KEY="" 18 | PINECONE_ENVIRONMENT="" 19 | PINECONE_INDEX="" 20 | 21 | # Weaviate configuration 22 | WEAVIATE_URL="" 23 | WEAVIATE_API_KEY="" 24 | WEAVIATE_CLASS="" 25 | 26 | # Zilliz configuration 27 | ZILLIZ_COLLECTION="" 28 | ZILLIZ_URI="" 29 | ZILLIZ_USER="" 30 | ZILLIZ_PASSWORD="" 31 | 32 | # Milvus configuration 33 | MILVUS_COLLECTION="" 34 | MILVUS_HOST="" 35 | MILVUS_PORT="" 36 | MILVUS_USER="" 37 | MILVUS_PASSWORD="" 38 | 39 | # Qdrant configuration 40 | QDRANT_URL="" 41 | QDRANT_PORT="" 42 | QDRANT_GRPC_PORT="" 43 | QDRANT_API_KEY="" 44 | QDRANT_COLLECTION="" 45 | 46 | # AnalyticDB configuration 47 | PG_HOST="" 48 | PG_PORT="" 49 | PG_USER="" 50 | PG_PASSWORD="" 51 | PG_DATABASE="" 52 | PG_COLLECTION="" 53 | 54 | # Redis configuration 55 | REDIS_HOST="" 56 | REDIS_PORT="" 57 | REDIS_PASSWORD="" 58 | REDIS_INDEX_NAME="" 59 | REDIS_DOC_PREFIX="" 60 | REDIS_DISTANCE_METRIC="" 61 | REDIS_INDEX_TYPE="" 62 | 63 | # Llama configuration 64 | LLAMA_INDEX_TYPE="" 65 | LLAMA_INDEX_JSON_PATH="" 66 | LLAMA_QUERY_KWARGS_JSON_PATH="" 67 | LLAMA_RESPONSE_MODE="" 68 | 69 | # Chroma configuration 70 | CHROMA_COLLECTION="" 71 | CHROMA_IN_MEMORY="" 72 | CHROMA_PERSISTENCE_DIR="" 73 | CHROMA_HOST="" 74 | CHROMA_PORT="" 75 | 76 | # Azure Cognitive Search configuration 77 | AZURESEARCH_SERVICE="" 78 | AZURESEARCH_INDEX="" 79 | AZURESEARCH_API_KEY="" # (optional, uses key-free managed identity if not set) 80 | 81 | # Azure CosmosDB Mongo vCore configuration 82 | AZCOSMOS_API="" 83 | AZCOSMOS_CONNSTR="" 84 | AZCOSMOS_DATABASE_NAME="" 85 | AZCOSMOS_CONTAINER_NAME="" 86 | 87 | # Supabase configuration 88 | SUPABASE_URL="" 89 | SUPABASE_ANON_KEY="" 90 | 91 | # Postgres configuration 92 | PG_HOST="" 93 | PG_PORT="" 94 | PG_USER="" 95 | PG_PASSWORD="" 96 | PG_DB="" 97 | 98 | # Elasticsearch configuration 99 | ELASTICSEARCH_URL="" # (either specify host or cloud_id) 100 | ELASTICSEARCH_CLOUD_ID="" 101 | ELASTICSEARCH_USERNAME="" 102 | ELASTICSEARCH_PASSWORD="" 103 | ELASTICSEARCH_API_KEY="" 104 | ELASTICSEARCH_INDEX="" 105 | ELASTICSEARCH_REPLICAS="" 106 | ELASTICSEARCH_SHARDS="" -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## 
Pull Request (PR) Checklist 2 | If you'd like to contribute, please follow the checklist below when submitting a PR. This will help us review and merge your changes faster! Thank you for contributing! 3 | 4 | 1. **Type of PR**: Indicate the type of PR by adding a label in square brackets at the beginning of the title, such as `[Bugfix]`, `[Feature]`, `[Enhancement]`, `[Refactor]`, or `[Documentation]`. 5 | 6 | 2. **Short Description**: Provide a brief, informative description of the PR that explains the changes made. 7 | 8 | 3. **Issue(s) Linked**: Mention any related issue(s) by using the keyword `Fixes` or `Closes` followed by the respective issue number(s) (e.g., Fixes #123, Closes #456). 9 | 10 | 4. **Branch**: Ensure that you have created a new branch for the changes, and it is based on the latest version of the `main` branch. 11 | 12 | 5. **Code Changes**: Make sure the code changes are minimal, focused, and relevant to the issue or feature being addressed. 13 | 14 | 6. **Commit Messages**: Write clear and concise commit messages that explain the purpose of each commit. 15 | 16 | 7. **Tests**: Include unit tests and/or integration tests for any new code or changes to existing code. Make sure all tests pass before submitting the PR. 17 | 18 | 8. **Documentation**: Update relevant documentation (e.g., README, inline comments, or external documentation) to reflect any changes made. 19 | 20 | 9. **Review Requested**: Request a review from at least one other contributor or maintainer of the repository. 21 | 22 | 10. **Video Submission** (For Complex/Large PRs): If your PR introduces significant changes, complexities, or a large number of lines of code, submit a brief video walkthrough along with the PR. The video should explain the purpose of the changes, the logic behind them, and how they address the issue or add the proposed feature. This will help reviewers to better understand your contribution and expedite the review process. 23 | 24 | ## Pull Request Naming Convention 25 | 26 | Use the following naming convention for your PR branches: 27 | 28 | ``` 29 | /- 30 | ``` 31 | 32 | - ``: The type of PR, such as `bugfix`, `feature`, `enhancement`, `refactor`, or `docs`. Multiple types are ok and should appear as , 33 | - ``: A brief description of the changes made, using hyphens to separate words. 34 | - ``: The issue number associated with the changes made (if applicable). 35 | 36 | Example: 37 | 38 | ``` 39 | feature/advanced-chunking-strategy-123 40 | ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # .vscode files 30 | .vscode/* 31 | 32 | # Pycharm 33 | .idea/ 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | myvenv/ 118 | 119 | # Exception for .env.example 120 | !.env.example 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # Pyre type checker 138 | .pyre/ 139 | 140 | # macOS .DS_Store files 141 | .DS_Store -------------------------------------------------------------------------------- /.well-known/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. 
Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "user_http", 9 | "authorization_type": "bearer" 10 | }, 11 | "api": { 12 | "type": "openapi", 13 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 14 | "has_user_authentication": false 15 | }, 16 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 17 | "contact_email": "hello@contact.com", 18 | "legal_info_url": "http://example.com/legal-info" 19 | } 20 | -------------------------------------------------------------------------------- /.well-known/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/.well-known/logo.png -------------------------------------------------------------------------------- /.well-known/openapi.yaml: -------------------------------------------------------------------------------- 1 | openapi: 3.0.2 2 | info: 3 | title: Retrieval Plugin API 4 | description: A retrieval API for querying and filtering documents based on natural language queries and metadata 5 | version: 1.0.0 6 | servers: 7 | - url: https://your-app-url.com 8 | paths: 9 | /query: 10 | post: 11 | summary: Query 12 | description: Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. Split queries if ResponseTooLargeError occurs. 13 | operationId: query_query_post 14 | requestBody: 15 | content: 16 | application/json: 17 | schema: 18 | $ref: "#/components/schemas/QueryRequest" 19 | required: true 20 | responses: 21 | "200": 22 | description: Successful Response 23 | content: 24 | application/json: 25 | schema: 26 | $ref: "#/components/schemas/QueryResponse" 27 | "422": 28 | description: Validation Error 29 | content: 30 | application/json: 31 | schema: 32 | $ref: "#/components/schemas/HTTPValidationError" 33 | security: 34 | - HTTPBearer: [] 35 | components: 36 | schemas: 37 | DocumentChunkMetadata: 38 | title: DocumentChunkMetadata 39 | type: object 40 | properties: 41 | source: 42 | $ref: "#/components/schemas/Source" 43 | source_id: 44 | title: Source Id 45 | type: string 46 | url: 47 | title: Url 48 | type: string 49 | created_at: 50 | title: Created At 51 | type: string 52 | author: 53 | title: Author 54 | type: string 55 | document_id: 56 | title: Document Id 57 | type: string 58 | DocumentChunkWithScore: 59 | title: DocumentChunkWithScore 60 | required: 61 | - text 62 | - metadata 63 | - score 64 | type: object 65 | properties: 66 | id: 67 | title: Id 68 | type: string 69 | text: 70 | title: Text 71 | type: string 72 | metadata: 73 | $ref: "#/components/schemas/DocumentChunkMetadata" 74 | embedding: 75 | title: Embedding 76 | type: array 77 | items: 78 | type: number 79 | score: 80 | title: Score 81 | type: number 82 | DocumentMetadataFilter: 83 | title: DocumentMetadataFilter 84 | type: object 85 | properties: 86 | document_id: 87 | title: Document Id 88 | type: string 89 | source: 90 | $ref: "#/components/schemas/Source" 91 | source_id: 92 | title: Source Id 93 | type: string 94 | author: 95 | title: Author 96 | type: string 97 | start_date: 98 | title: Start Date 99 | type: string 100 | end_date: 101 | title: End Date 102 | type: string 103 | HTTPValidationError: 104 | title: HTTPValidationError 105 | type: object 106 
| properties: 107 | detail: 108 | title: Detail 109 | type: array 110 | items: 111 | $ref: "#/components/schemas/ValidationError" 112 | Query: 113 | title: Query 114 | required: 115 | - query 116 | type: object 117 | properties: 118 | query: 119 | title: Query 120 | type: string 121 | filter: 122 | $ref: "#/components/schemas/DocumentMetadataFilter" 123 | top_k: 124 | title: Top K 125 | type: integer 126 | default: 3 127 | QueryRequest: 128 | title: QueryRequest 129 | required: 130 | - queries 131 | type: object 132 | properties: 133 | queries: 134 | title: Queries 135 | type: array 136 | items: 137 | $ref: "#/components/schemas/Query" 138 | QueryResponse: 139 | title: QueryResponse 140 | required: 141 | - results 142 | type: object 143 | properties: 144 | results: 145 | title: Results 146 | type: array 147 | items: 148 | $ref: "#/components/schemas/QueryResult" 149 | QueryResult: 150 | title: QueryResult 151 | required: 152 | - query 153 | - results 154 | type: object 155 | properties: 156 | query: 157 | title: Query 158 | type: string 159 | results: 160 | title: Results 161 | type: array 162 | items: 163 | $ref: "#/components/schemas/DocumentChunkWithScore" 164 | Source: 165 | title: Source 166 | enum: 167 | - email 168 | - file 169 | - chat 170 | type: string 171 | description: An enumeration. 172 | ValidationError: 173 | title: ValidationError 174 | required: 175 | - loc 176 | - msg 177 | - type 178 | type: object 179 | properties: 180 | loc: 181 | title: Location 182 | type: array 183 | items: 184 | anyOf: 185 | - type: string 186 | - type: integer 187 | msg: 188 | title: Message 189 | type: string 190 | type: 191 | title: Error Type 192 | type: string 193 | securitySchemes: 194 | HTTPBearer: 195 | type: http 196 | scheme: bearer 197 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM python:3.10 as requirements-stage 3 | 4 | WORKDIR /tmp 5 | 6 | RUN pip install poetry 7 | 8 | COPY ./pyproject.toml ./poetry.lock* /tmp/ 9 | 10 | 11 | RUN poetry export -f requirements.txt --output requirements.txt --without-hashes 12 | 13 | FROM python:3.10 14 | 15 | WORKDIR /code 16 | 17 | COPY --from=requirements-stage /tmp/requirements.txt /code/requirements.txt 18 | 19 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt 20 | 21 | COPY . /code/ 22 | 23 | # Heroku uses PORT, Azure App Services uses WEBSITES_PORT, Fly.io uses 8080 by default 24 | CMD ["sh", "-c", "uvicorn server.main:app --host 0.0.0.0 --port ${PORT:-${WEBSITES_PORT:-8080}}"] 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Heroku 2 | # make heroku-login 3 | # make heroku-push 4 | 5 | HEROKU_APP = 6 | 7 | heroku-push: 8 | docker buildx build --platform linux/amd64 -t ${HEROKU_APP} . 9 | docker tag ${HEROKU_APP} registry.heroku.com/${HEROKU_APP}/web 10 | docker push registry.heroku.com/${HEROKU_APP}/web 11 | heroku container:release web -a ${HEROKU_APP} 12 | 13 | heroku-login: 14 | heroku container:login 15 | -------------------------------------------------------------------------------- /assets/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/assets/example.png -------------------------------------------------------------------------------- /datastore/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/datastore/__init__.py -------------------------------------------------------------------------------- /datastore/datastore.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, List, Optional 3 | import asyncio 4 | 5 | from models.models import ( 6 | Document, 7 | DocumentChunk, 8 | DocumentMetadataFilter, 9 | Query, 10 | QueryResult, 11 | QueryWithEmbedding, 12 | ) 13 | from services.chunks import get_document_chunks 14 | from services.openai import get_embeddings 15 | 16 | 17 | class DataStore(ABC): 18 | async def upsert( 19 | self, documents: List[Document], chunk_token_size: Optional[int] = None 20 | ) -> List[str]: 21 | """ 22 | Takes in a list of documents and inserts them into the database. 23 | First deletes all the existing vectors with the document id (if necessary, depends on the vector db), then inserts the new ones. 24 | Return a list of document ids. 25 | """ 26 | # Delete any existing vectors for documents with the input document ids 27 | await asyncio.gather( 28 | *[ 29 | self.delete( 30 | filter=DocumentMetadataFilter( 31 | document_id=document.id, 32 | ), 33 | delete_all=False, 34 | ) 35 | for document in documents 36 | if document.id 37 | ] 38 | ) 39 | 40 | chunks = get_document_chunks(documents, chunk_token_size) 41 | 42 | return await self._upsert(chunks) 43 | 44 | @abstractmethod 45 | async def _upsert(self, chunks: Dict[str, List[DocumentChunk]]) -> List[str]: 46 | """ 47 | Takes in a list of document chunks and inserts them into the database. 48 | Return a list of document ids. 
49 | """ 50 | 51 | raise NotImplementedError 52 | 53 | async def query(self, queries: List[Query]) -> List[QueryResult]: 54 | """ 55 | Takes in a list of queries and filters and returns a list of query results with matching document chunks and scores. 56 | """ 57 | # get a list of just the queries from the Query list 58 | query_texts = [query.query for query in queries] 59 | query_embeddings = get_embeddings(query_texts) 60 | # hydrate the queries with embeddings 61 | queries_with_embeddings = [ 62 | QueryWithEmbedding(**query.dict(), embedding=embedding) 63 | for query, embedding in zip(queries, query_embeddings) 64 | ] 65 | return await self._query(queries_with_embeddings) 66 | 67 | @abstractmethod 68 | async def _query(self, queries: List[QueryWithEmbedding]) -> List[QueryResult]: 69 | """ 70 | Takes in a list of queries with embeddings and filters and returns a list of query results with matching document chunks and scores. 71 | """ 72 | raise NotImplementedError 73 | 74 | @abstractmethod 75 | async def delete( 76 | self, 77 | ids: Optional[List[str]] = None, 78 | filter: Optional[DocumentMetadataFilter] = None, 79 | delete_all: Optional[bool] = None, 80 | ) -> bool: 81 | """ 82 | Removes vectors by ids, filter, or everything in the datastore. 83 | Multiple parameters can be used at once. 84 | Returns whether the operation was successful. 85 | """ 86 | raise NotImplementedError 87 | -------------------------------------------------------------------------------- /datastore/factory.py: -------------------------------------------------------------------------------- 1 | from datastore.datastore import DataStore 2 | import os 3 | 4 | 5 | async def get_datastore() -> DataStore: 6 | datastore = os.environ.get("DATASTORE") 7 | assert datastore is not None 8 | 9 | match datastore: 10 | case "chroma": 11 | from datastore.providers.chroma_datastore import ChromaDataStore 12 | 13 | return ChromaDataStore() 14 | case "llama": 15 | from datastore.providers.llama_datastore import LlamaDataStore 16 | 17 | return LlamaDataStore() 18 | 19 | case "pinecone": 20 | from datastore.providers.pinecone_datastore import PineconeDataStore 21 | 22 | return PineconeDataStore() 23 | case "weaviate": 24 | from datastore.providers.weaviate_datastore import WeaviateDataStore 25 | 26 | return WeaviateDataStore() 27 | case "milvus": 28 | from datastore.providers.milvus_datastore import MilvusDataStore 29 | 30 | return MilvusDataStore() 31 | case "zilliz": 32 | from datastore.providers.zilliz_datastore import ZillizDataStore 33 | 34 | return ZillizDataStore() 35 | case "redis": 36 | from datastore.providers.redis_datastore import RedisDataStore 37 | 38 | return await RedisDataStore.init() 39 | case "azurecosmosdb": 40 | from datastore.providers.azurecosmosdb_datastore import ( 41 | AzureCosmosDBDataStore, 42 | ) 43 | 44 | return await AzureCosmosDBDataStore.create() 45 | case "qdrant": 46 | from datastore.providers.qdrant_datastore import QdrantDataStore 47 | 48 | return QdrantDataStore() 49 | case "azuresearch": 50 | from datastore.providers.azuresearch_datastore import AzureSearchDataStore 51 | 52 | return AzureSearchDataStore() 53 | case "supabase": 54 | from datastore.providers.supabase_datastore import SupabaseDataStore 55 | 56 | return SupabaseDataStore() 57 | case "postgres": 58 | from datastore.providers.postgres_datastore import PostgresDataStore 59 | 60 | return PostgresDataStore() 61 | case "analyticdb": 62 | from datastore.providers.analyticdb_datastore import AnalyticDBDataStore 63 | 64 | return 
AnalyticDBDataStore() 65 | case "elasticsearch": 66 | from datastore.providers.elasticsearch_datastore import ( 67 | ElasticsearchDataStore, 68 | ) 69 | 70 | return ElasticsearchDataStore() 71 | case "mongodb": 72 | from datastore.providers.mongodb_atlas_datastore import ( 73 | MongoDBAtlasDataStore, 74 | ) 75 | 76 | return MongoDBAtlasDataStore() 77 | case _: 78 | raise ValueError( 79 | f"Unsupported vector database: {datastore}. " 80 | f"Try one of the following: llama, elasticsearch, pinecone, weaviate, milvus, zilliz, redis, azuresearch, or qdrant" 81 | ) 82 | -------------------------------------------------------------------------------- /datastore/providers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/datastore/providers/__init__.py -------------------------------------------------------------------------------- /datastore/providers/postgres_datastore.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, List 3 | from datetime import datetime 4 | import numpy as np 5 | 6 | from psycopg2 import connect 7 | from psycopg2.extras import DictCursor 8 | from pgvector.psycopg2 import register_vector 9 | 10 | from services.date import to_unix_timestamp 11 | from datastore.providers.pgvector_datastore import PGClient, PgVectorDataStore 12 | from models.models import ( 13 | DocumentMetadataFilter, 14 | ) 15 | 16 | PG_HOST = os.environ.get("PG_HOST", "localhost") 17 | PG_PORT = int(os.environ.get("PG_PORT", 5432)) 18 | PG_DB = os.environ.get("PG_DB", "postgres") 19 | PG_USER = os.environ.get("PG_USER", "postgres") 20 | PG_PASSWORD = os.environ.get("PG_PASSWORD", "postgres") 21 | 22 | 23 | # class that implements the DataStore interface for Postgres Datastore provider 24 | class PostgresDataStore(PgVectorDataStore): 25 | def create_db_client(self): 26 | return PostgresClient() 27 | 28 | 29 | class PostgresClient(PGClient): 30 | def __init__(self) -> None: 31 | super().__init__() 32 | self.client = connect( 33 | dbname=PG_DB, user=PG_USER, password=PG_PASSWORD, host=PG_HOST, port=PG_PORT 34 | ) 35 | register_vector(self.client) 36 | 37 | def __del__(self): 38 | # close the connection when the client is destroyed 39 | self.client.close() 40 | 41 | async def upsert(self, table: str, json: dict[str, Any]): 42 | """ 43 | Takes in a list of documents and inserts them into the table. 
44 | """ 45 | with self.client.cursor() as cur: 46 | if not json.get("created_at"): 47 | json["created_at"] = datetime.now() 48 | json["embedding"] = np.array(json["embedding"]) 49 | cur.execute( 50 | f"INSERT INTO {table} (id, content, embedding, document_id, source, source_id, url, author, created_at) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id) DO UPDATE SET content = %s, embedding = %s, document_id = %s, source = %s, source_id = %s, url = %s, author = %s, created_at = %s", 51 | ( 52 | json["id"], 53 | json["content"], 54 | json["embedding"], 55 | json["document_id"], 56 | json["source"], 57 | json["source_id"], 58 | json["url"], 59 | json["author"], 60 | json["created_at"], 61 | json["content"], 62 | json["embedding"], 63 | json["document_id"], 64 | json["source"], 65 | json["source_id"], 66 | json["url"], 67 | json["author"], 68 | json["created_at"], 69 | ), 70 | ) 71 | self.client.commit() 72 | 73 | async def rpc(self, function_name: str, params: dict[str, Any]): 74 | """ 75 | Calls a stored procedure in the database with the given parameters. 76 | """ 77 | data = [] 78 | params["in_embedding"] = np.array(params["in_embedding"]) 79 | with self.client.cursor(cursor_factory=DictCursor) as cur: 80 | cur.callproc(function_name, params) 81 | rows = cur.fetchall() 82 | self.client.commit() 83 | for row in rows: 84 | row["created_at"] = to_unix_timestamp(row["created_at"]) 85 | data.append(dict(row)) 86 | return data 87 | 88 | async def delete_like(self, table: str, column: str, pattern: str): 89 | """ 90 | Deletes rows in the table that match the pattern. 91 | """ 92 | with self.client.cursor() as cur: 93 | cur.execute( 94 | f"DELETE FROM {table} WHERE {column} LIKE %s", 95 | (f"%{pattern}%",), 96 | ) 97 | self.client.commit() 98 | 99 | async def delete_in(self, table: str, column: str, ids: List[str]): 100 | """ 101 | Deletes rows in the table that match the ids. 102 | """ 103 | with self.client.cursor() as cur: 104 | cur.execute( 105 | f"DELETE FROM {table} WHERE {column} IN %s", 106 | (tuple(ids),), 107 | ) 108 | self.client.commit() 109 | 110 | async def delete_by_filters(self, table: str, filter: DocumentMetadataFilter): 111 | """ 112 | Deletes rows in the table that match the filter. 
113 | """ 114 | 115 | filters = "WHERE" 116 | if filter.document_id: 117 | filters += f" document_id = '{filter.document_id}' AND" 118 | if filter.source: 119 | filters += f" source = '{filter.source}' AND" 120 | if filter.source_id: 121 | filters += f" source_id = '{filter.source_id}' AND" 122 | if filter.author: 123 | filters += f" author = '{filter.author}' AND" 124 | if filter.start_date: 125 | filters += f" created_at >= '{filter.start_date}' AND" 126 | if filter.end_date: 127 | filters += f" created_at <= '{filter.end_date}' AND" 128 | filters = filters[:-4] 129 | 130 | with self.client.cursor() as cur: 131 | cur.execute(f"DELETE FROM {table} {filters}") 132 | self.client.commit() 133 | -------------------------------------------------------------------------------- /datastore/providers/supabase_datastore.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, List 3 | from datetime import datetime 4 | 5 | from supabase import Client 6 | 7 | from datastore.providers.pgvector_datastore import PGClient, PgVectorDataStore 8 | from models.models import ( 9 | DocumentMetadataFilter, 10 | ) 11 | 12 | SUPABASE_URL = os.environ.get("SUPABASE_URL") 13 | assert SUPABASE_URL is not None, "SUPABASE_URL is not set" 14 | SUPABASE_ANON_KEY = os.environ.get("SUPABASE_ANON_KEY") 15 | # use service role key if you want this app to be able to bypass your Row Level Security policies 16 | SUPABASE_SERVICE_ROLE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY") 17 | assert ( 18 | SUPABASE_ANON_KEY is not None or SUPABASE_SERVICE_ROLE_KEY is not None 19 | ), "SUPABASE_ANON_KEY or SUPABASE_SERVICE_ROLE_KEY must be set" 20 | 21 | 22 | # class that implements the DataStore interface for Supabase Datastore provider 23 | class SupabaseDataStore(PgVectorDataStore): 24 | def create_db_client(self): 25 | return SupabaseClient() 26 | 27 | 28 | class SupabaseClient(PGClient): 29 | def __init__(self) -> None: 30 | super().__init__() 31 | if not SUPABASE_SERVICE_ROLE_KEY: 32 | self.client = Client(SUPABASE_URL, SUPABASE_ANON_KEY) 33 | else: 34 | self.client = Client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY) 35 | 36 | async def upsert(self, table: str, json: dict[str, Any]): 37 | """ 38 | Takes in a list of documents and inserts them into the table. 39 | """ 40 | if "created_at" in json: 41 | json["created_at"] = json["created_at"][0].isoformat() 42 | 43 | self.client.table(table).upsert(json).execute() 44 | 45 | async def rpc(self, function_name: str, params: dict[str, Any]): 46 | """ 47 | Calls a stored procedure in the database with the given parameters. 48 | """ 49 | if "in_start_date" in params: 50 | params["in_start_date"] = params["in_start_date"].isoformat() 51 | if "in_end_date" in params: 52 | params["in_end_date"] = params["in_end_date"].isoformat() 53 | 54 | response = self.client.rpc(function_name, params=params).execute() 55 | return response.data 56 | 57 | async def delete_like(self, table: str, column: str, pattern: str): 58 | """ 59 | Deletes rows in the table that match the pattern. 60 | """ 61 | self.client.table(table).delete().like(column, pattern).execute() 62 | 63 | async def delete_in(self, table: str, column: str, ids: List[str]): 64 | """ 65 | Deletes rows in the table that match the ids. 66 | """ 67 | self.client.table(table).delete().in_(column, ids).execute() 68 | 69 | async def delete_by_filters(self, table: str, filter: DocumentMetadataFilter): 70 | """ 71 | Deletes rows in the table that match the filter. 
72 | """ 73 | builder = self.client.table(table).delete() 74 | if filter.document_id: 75 | builder = builder.eq( 76 | "document_id", 77 | filter.document_id, 78 | ) 79 | if filter.source: 80 | builder = builder.eq("source", filter.source) 81 | if filter.source_id: 82 | builder = builder.eq("source_id", filter.source_id) 83 | if filter.author: 84 | builder = builder.eq("author", filter.author) 85 | if filter.start_date: 86 | builder = builder.gte( 87 | "created_at", 88 | filter.start_date[0].isoformat(), 89 | ) 90 | if filter.end_date: 91 | builder = builder.lte( 92 | "created_at", 93 | filter.end_date[0].isoformat(), 94 | ) 95 | builder.execute() 96 | -------------------------------------------------------------------------------- /datastore/providers/zilliz_datastore.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from loguru import logger 4 | from typing import Optional 5 | from pymilvus import ( 6 | connections, 7 | ) 8 | from uuid import uuid4 9 | 10 | from datastore.providers.milvus_datastore import ( 11 | MilvusDataStore, 12 | ) 13 | 14 | 15 | ZILLIZ_COLLECTION = os.environ.get("ZILLIZ_COLLECTION") or "c" + uuid4().hex 16 | ZILLIZ_URI = os.environ.get("ZILLIZ_URI") 17 | ZILLIZ_USER = os.environ.get("ZILLIZ_USER") 18 | ZILLIZ_PASSWORD = os.environ.get("ZILLIZ_PASSWORD") 19 | ZILLIZ_USE_SECURITY = False if ZILLIZ_PASSWORD is None else True 20 | 21 | ZILLIZ_CONSISTENCY_LEVEL = os.environ.get("ZILLIZ_CONSISTENCY_LEVEL") 22 | 23 | 24 | class ZillizDataStore(MilvusDataStore): 25 | def __init__(self, create_new: Optional[bool] = False): 26 | """Create a Zilliz DataStore. 27 | 28 | The Zilliz Datastore allows for storing your indexes and metadata within a Zilliz Cloud instance. 29 | 30 | Args: 31 | create_new (Optional[bool], optional): Whether to overwrite if collection already exists. Defaults to True. 
32 | """ 33 | # Overwrite the default consistency level by MILVUS_CONSISTENCY_LEVEL 34 | self._consistency_level = ZILLIZ_CONSISTENCY_LEVEL or "Bounded" 35 | self._create_connection() 36 | 37 | self._create_collection(ZILLIZ_COLLECTION, create_new) # type: ignore 38 | self._create_index() 39 | 40 | def _create_connection(self): 41 | # Check if the connection already exists 42 | try: 43 | i = [ 44 | connections.get_connection_addr(x[0]) 45 | for x in connections.list_connections() 46 | ].index({"address": ZILLIZ_URI, "user": ZILLIZ_USER}) 47 | self.alias = connections.list_connections()[i][0] 48 | except ValueError: 49 | # Connect to the Zilliz instance using the passed in Environment variables 50 | self.alias = uuid4().hex 51 | connections.connect(alias=self.alias, uri=ZILLIZ_URI, user=ZILLIZ_USER, password=ZILLIZ_PASSWORD, secure=ZILLIZ_USE_SECURITY) # type: ignore 52 | logger.info("Connect to zilliz cloud server") 53 | 54 | def _create_index(self): 55 | try: 56 | # If no index on the collection, create one 57 | if len(self.col.indexes) == 0: 58 | self.index_params = { 59 | "metric_type": "IP", 60 | "index_type": "AUTOINDEX", 61 | "params": {}, 62 | } 63 | self.col.create_index("embedding", index_params=self.index_params) 64 | 65 | self.col.load() 66 | self.search_params = {"metric_type": "IP", "params": {}} 67 | except Exception as e: 68 | logger.error("Failed to create index, error: {}".format(e)) 69 | -------------------------------------------------------------------------------- /docs/deployment/flyio.md: -------------------------------------------------------------------------------- 1 | # Deploying to Fly.io 2 | 3 | ## Removing Unused Dependencies 4 | 5 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 6 | 7 | Find the packages you can remove for each vector database provider [here](removing-unused-dependencies.md). 8 | 9 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 10 | 11 | ## Deployment 12 | 13 | To deploy the Docker container from this repository to Fly.io, follow 14 | these steps: 15 | 16 | [Install Docker](https://docs.docker.com/engine/install/) on your local machine if it is not already installed. 17 | 18 | Install the [Fly.io CLI](https://fly.io/docs/getting-started/installing-flyctl/) on your local machine. 
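If you prefer to install the CLI from the terminal, it is typically available via a one-line install script or Homebrew; the commands below are illustrative, so confirm the current instructions for your platform in the linked Fly.io docs:

```
# Illustrative install commands — check the linked Fly.io docs for your platform
curl -L https://fly.io/install.sh | sh   # install script (Linux/macOS)
brew install flyctl                      # Homebrew (macOS)
```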
19 | 20 | Clone the repository from GitHub: 21 | 22 | ``` 23 | git clone https://github.com/openai/chatgpt-retrieval-plugin.git 24 | ``` 25 | 26 | Navigate to the cloned repository directory: 27 | 28 | ``` 29 | cd path/to/chatgpt-retrieval-plugin 30 | ``` 31 | 32 | Log in to the Fly.io CLI: 33 | 34 | ``` 35 | flyctl auth login 36 | ``` 37 | 38 | Create and launch your Fly.io app: 39 | 40 | ``` 41 | flyctl launch 42 | ``` 43 | 44 | Follow the instructions in your terminal: 45 | 46 | - Choose your app name 47 | - Choose your app region 48 | - Don't add any databases 49 | - Don't deploy yet (if you do, the first deploy might fail as the environment variables are not yet set) 50 | 51 | Set the required environment variables: 52 | 53 | ``` 54 | flyctl secrets set DATASTORE=your_datastore \ 55 | OPENAI_API_KEY=your_openai_api_key \ 56 | BEARER_TOKEN=your_bearer_token \ 57 | 58 | ``` 59 | 60 | Alternatively, you could set environment variables in the [Fly.io Console](https://fly.io/dashboard). 61 | 62 | At this point, you can change the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml) to the url for your Fly.io app, which will be `https://your-app-name.fly.dev`. 63 | 64 | Deploy your app with: 65 | 66 | ``` 67 | flyctl deploy 68 | ``` 69 | 70 | After completing these steps, your Docker container should be deployed to Fly.io and running with the necessary environment variables set. You can view your app by running: 71 | 72 | ``` 73 | flyctl open 74 | ``` 75 | 76 | which will open your app url. You should be able to find the OpenAPI schema at `/.well-known/openapi.yaml` and the manifest at `/.well-known/ai-plugin.json`. 77 | 78 | To view your app logs: 79 | 80 | ``` 81 | flyctl logs 82 | ``` 83 | 84 | Now, make sure you have changed the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml), and redeploy with `flyctl deploy`. This url will be `https://.fly.dev`. 85 | 86 | **Debugging tips:** 87 | Fly.io uses port 8080 by default. 88 | 89 | If your app fails to deploy, check if the environment variables are set correctly, and then check if your port is configured correctly. You could also try using the [`-e` flag](https://fly.io/docs/flyctl/launch/) with the `flyctl launch` command to set the environment variables at launch. 90 | -------------------------------------------------------------------------------- /docs/deployment/heroku.md: -------------------------------------------------------------------------------- 1 | # Deploying to Heroku 2 | 3 | ## Removing Unused Dependencies 4 | 5 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 6 | 7 | Find the packages you can remove for each vector database provider [here](removing-unused-dependencies.md). 8 | 9 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 
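For reference, the export step the Dockerfile performs is equivalent to the command below; you can also run it locally if you want to inspect the generated `requirements.txt` after trimming `pyproject.toml`:

```
# Same export the Dockerfile runs in its requirements stage
poetry export -f requirements.txt --output requirements.txt --without-hashes
```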
10 | 11 | ## Deployment 12 | 13 | To deploy the Docker container from this repository to Heroku and set the required environment variables, follow these steps: 14 | 15 | [Install Docker](https://docs.docker.com/engine/install/) on your local machine if it is not already installed. 16 | 17 | Install the [Heroku CLI](https://devcenter.heroku.com/articles/heroku-cli) on your local machine. 18 | 19 | Clone the repository from GitHub: 20 | 21 | ``` 22 | git clone https://github.com/openai/chatgpt-retrieval-plugin.git 23 | ``` 24 | 25 | Navigate to the cloned repository directory: 26 | 27 | ``` 28 | cd path/to/chatgpt-retrieval-plugin 29 | ``` 30 | 31 | Log in to the Heroku CLI: 32 | 33 | ``` 34 | heroku login 35 | ``` 36 | 37 | Create a Heroku app: 38 | 39 | ``` 40 | heroku create [app-name] 41 | ``` 42 | 43 | Log in to the Heroku Container Registry: 44 | 45 | ``` 46 | heroku container:login 47 | ``` 48 | 49 | Alternatively, you can use a command from the Makefile to log in to the Heroku Container Registry by running: 50 | 51 | ``` 52 | make heroku-login 53 | ``` 54 | 55 | Build the Docker image using the Dockerfile: 56 | 57 | ``` 58 | docker buildx build --platform linux/amd64 -t [image-name] . 59 | ``` 60 | 61 | (Replace `[image-name]` with the name you want to give your Docker image) 62 | 63 | Push the Docker image to the Heroku Container Registry, and release the newly pushed image to your Heroku app. 64 | 65 | ``` 66 | docker tag [image-name] registry.heroku.com/[app-name]/web 67 | docker push registry.heroku.com/[app-name]/web 68 | heroku container:release web -a [app-name] 69 | ``` 70 | 71 | Alternatively, you can use a command from the to push the Docker image to the Heroku Container Registry by running: 72 | 73 | ``` 74 | make heroku-push 75 | ``` 76 | 77 | **Note:** You will need to edit the Makefile and replace `` with your actual app name. 78 | 79 | Set the required environment variables for your Heroku app: 80 | 81 | ``` 82 | heroku config:set DATASTORE=your_datastore \ 83 | OPENAI_API_KEY=your_openai_api_key \ 84 | BEARER_TOKEN=your_bearer_token \ 85 | \ 86 | -a [app-name] 87 | ``` 88 | 89 | You could also set environment variables in the [Heroku Console](https://dashboard.heroku.com/apps). 90 | 91 | After completing these steps, your Docker container should be deployed to Heroku and running with the necessary environment variables set. You can view your app by running: 92 | 93 | ``` 94 | heroku open -a [app-name] 95 | ``` 96 | 97 | which will open your app url. You should be able to find the OpenAPI schema at `/.well-known/openapi.yaml` and the manifest at `/.well-known/ai-plugin.json`. 98 | 99 | To view your app logs: 100 | 101 | ``` 102 | heroku logs --tail -a [app-name] 103 | ``` 104 | 105 | Now make sure to change the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml), and redeploy with `make heroku-push`. This url will be `https://your-app-name.herokuapp.com`. 106 | -------------------------------------------------------------------------------- /docs/deployment/other-options.md: -------------------------------------------------------------------------------- 1 | # Other Deployment Options 2 | 3 | Some possible other options for deploying the app are: 4 | 5 | - **Azure Container Apps**: This is a cloud platform that allows you to deploy and manage web apps using Docker containers. 
You can use the Azure CLI or the Azure Portal to create and configure your app service, and then push your Docker image to a container registry and deploy it to your app service. You can also set environment variables and scale your app using the Azure Portal. Learn more [here](https://learn.microsoft.com/en-us/azure/container-apps/get-started-existing-container-image-portal?pivots=container-apps-private-registry). 6 | - **Google Cloud Run**: This is a serverless platform that allows you to run stateless web apps using Docker containers. You can use the Google Cloud Console or the gcloud command-line tool to create and deploy your Cloud Run service, and then push your Docker image to the Google Container Registry and deploy it to your service. You can also set environment variables and scale your app using the Google Cloud Console. Learn more [here](https://cloud.google.com/run/docs/quickstarts/build-and-deploy). 7 | - **AWS Elastic Container Service**: This is a cloud platform that allows you to run and manage web apps using Docker containers. You can use the AWS CLI or the AWS Management Console to create and configure your ECS cluster, and then push your Docker image to the Amazon Elastic Container Registry and deploy it to your cluster. You can also set environment variables and scale your app using the AWS Management Console. Learn more [here](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/docker-basics.html). 8 | 9 | After you create your app, make sure to change the plugin url in your plugin manifest file [here](/.well-known/ai-plugin.json), and in your OpenAPI schema [here](/.well-known/openapi.yaml), and redeploy. 10 | 11 | ## Removing Unused Dependencies 12 | 13 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 14 | 15 | Find the packages you can remove for each vector database provider [here](removing_unused_dependencies.md). 16 | 17 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 18 | -------------------------------------------------------------------------------- /docs/deployment/removing-unused-dependencies.md: -------------------------------------------------------------------------------- 1 | # Removing Unused Dependencies 2 | 3 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 4 | 5 | Here are the packages you can remove for each vector database provider: 6 | 7 | - **Pinecone:** Remove `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity`, `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 8 | - **Weaviate:** Remove `pinecone-client`, `pymilvus`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, `psycopg2cffi`. 
9 | - **Zilliz:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 10 | - **Milvus:** Remove `pinecone-client`, `weaviate-client`, `qdrant-client`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 11 | - **Qdrant:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `redis`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 12 | - **Redis:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `chromadb`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 13 | - **LlamaIndex:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `chromadb`, `redis`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 14 | - **Chroma:**: Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `llama-index`, `redis`, `azure-identity` and `azure-search-documents`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 15 | - **Azure Cognitive Search**: Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `llama-index`, `redis` and `chromadb`, `supabase`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 16 | - **Supabase:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `llama-index`, `azure-identity` and `azure-search-documents`, `psycopg2`+`pgvector`, and `psycopg2cffi`. 17 | - **Postgres:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, and `psycopg2cffi`. 18 | - **AnalyticDB:** Remove `pinecone-client`, `weaviate-client`, `pymilvus`, `qdrant-client`, `redis`, `llama-index`, `azure-identity` and `azure-search-documents`, `supabase`, and `psycopg2`+`pgvector`. 19 | 20 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 21 | -------------------------------------------------------------------------------- /docs/deployment/render-thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/docs/deployment/render-thumbnail.png -------------------------------------------------------------------------------- /docs/deployment/render.md: -------------------------------------------------------------------------------- 1 | # Deploying to Render 2 | 3 | ## Removing Unused Dependencies 4 | 5 | Before deploying your app, you might want to remove unused dependencies from your [pyproject.toml](/pyproject.toml) file to reduce the size of your app and improve its performance. Depending on the vector database provider you choose, you can remove the packages that are not needed for your specific provider. 6 | 7 | Find the packages you can remove for each vector database provider [here](removing-unused-dependencies.md). 
8 | 9 | After removing the unnecessary packages from the `pyproject.toml` file, you don't need to run `poetry lock` and `poetry install` manually. The provided Dockerfile takes care of installing the required dependencies using the `requirements.txt` file generated by the `poetry export` command. 10 | 11 | ## Deployment 12 | 13 | Render maintains a [fork](https://github.com/render-examples/chatgpt-retrieval-plugin/) of this repository with a few small changes that facilitate easy deployment. The source code is unchanged. To deploy both the Docker container from this repository and a self-hosted Weaviate database to back it, just click the button below. Enter your OpenAI API key when prompted. 14 | 15 | [Deploy to Render](https://render.com/deploy?repo=https://github.com/render-examples/chatgpt-retrieval-plugin/tree/main) 16 | 17 | The bearer token will be randomly generated for you. You can view it in in the "Environment" tab on the [Render dashboard](https://dashboard.render.com) page for your server. For more guidance, consult the [README in Render's fork](https://github.com/render-examples/chatgpt-retrieval-plugin/blob/main/README.md), [Render's documentation](https://render.com/docs), or the screen recording linked below. 18 | 19 | [![Deploy to Render screen recording](render-thumbnail.png)](https://vimeo.com/823610578) 20 | -------------------------------------------------------------------------------- /docs/deprecated/plugins.md: -------------------------------------------------------------------------------- 1 | ## Plugins (deprecated) 2 | 3 | Plugins are chat extensions designed specifically for language models like ChatGPT, enabling them to access up-to-date information, run computations, or interact with third-party services in response to a user's request. They unlock a wide range of potential use cases and enhance the capabilities of language models. 4 | 5 | Developers can create a plugin by exposing an API through their website and providing a standardized manifest file that describes the API. ChatGPT consumes these files and allows the AI models to make calls to the API defined by the developer. 6 | 7 | A plugin consists of: 8 | 9 | - An API 10 | - An API schema (OpenAPI JSON or YAML format) 11 | - A manifest (JSON file) that defines relevant metadata for the plugin 12 | 13 | The Retrieval Plugin already contains all of these components. Read the Chat Plugins blogpost [here](https://openai.com/blog/chatgpt-plugins), and find the docs [here](https://platform.openai.com/docs/plugins/introduction). 14 | 15 | To access the plugins model, navigate [here](https://chat.openai.com/?model=gpt-4-plugins). 16 | 17 | ### Testing a Localhost Plugin in ChatGPT 18 | 19 | To test a localhost plugin in ChatGPT, use the provided [`local_server/main.py`](/local_server/main.py) file, which is specifically configured for localhost testing with CORS settings, no authentication and routes for the manifest, OpenAPI schema and logo. 20 | 21 | Follow these steps to test your localhost plugin: 22 | 23 | 1. Run the localhost server using the `poetry run dev` command. This starts the server at the default address (e.g. `localhost:3333`). 24 | 25 | 2. Visit [ChatGPT](https://chat.openai.com/), select "Plugins" from the model picker, click on the plugins picker, and click on "Plugin store" at the bottom of the list. 26 | 27 | 3. Choose "Develop your own plugin" and enter your localhost URL (e.g. `localhost:3333`) when prompted. 28 | 29 | 4. Your localhost plugin is now enabled for your ChatGPT session. 
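As a quick sanity check before connecting the plugin to ChatGPT, you can query the local server directly. The request below is an illustrative example based on the OpenAPI schema in this repository (the query text is made up; the default port is 3333, and the local server runs without authentication):

```
# Illustrative request against the local server's /query endpoint
curl -X POST http://localhost:3333/query \
  -H "Content-Type: application/json" \
  -d '{"queries": [{"query": "What did I write about project deadlines?", "top_k": 3}]}'
```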
30 | 31 | For more information, refer to the [OpenAI documentation](https://platform.openai.com/docs/plugins/getting-started/openapi-definition). 32 | 33 | ## Installing a Developer Plugin 34 | 35 | To install a developer plugin, follow the steps below: 36 | 37 | - First, create your developer plugin by deploying it to your preferred hosting platform (e.g. Fly.io, Heroku, etc.) and updating the plugin URL in the manifest file and OpenAPI schema. 38 | 39 | - Go to [ChatGPT](https://chat.openai.com/) and select "Plugins" from the model picker. 40 | 41 | - From the plugins picker, scroll to the bottom and click on "Plugin store." 42 | 43 | - Go to "Develop your own plugin" and follow the instructions provided. You will need to enter the domain where your plugin is deployed. 44 | 45 | - Follow the instructions based on the authentication type you have chosen for your plugin (e.g. if your plugin uses Service Level HTTP, you will have to paste in your access token, then paste the new access token you receive from the plugin flow into your [ai-plugin.json](/.well-known/ai-plugin.json) file and redeploy your app). 46 | 47 | - Next, you must add your plugin. Go to the "Plugin store" again and click on "Install an unverified plugin." 48 | 49 | - Follow the instructions provided, which will require you to enter the domain where your plugin is deployed. 50 | 51 | - Follow the instructions based on the authentication type you have chosen for your plugin (e.g. if your plugin uses User Level HTTP, you will have to paste in your bearer token). 52 | 53 | After completing these steps, your developer plugin should be installed and ready to use in ChatGPT. 54 | -------------------------------------------------------------------------------- /docs/providers/analyticdb/setup.md: -------------------------------------------------------------------------------- 1 | # AnalyticDB 2 | 3 | [AnalyticDB](https://www.alibabacloud.com/help/en/analyticdb-for-postgresql/latest/product-introduction-overview) is a distributed cloud-native vector database designed for storing documents and vector embeddings. It is a high-performance vector database that is fully compatible with PostgreSQL syntax, making it easy to use. Managed by Alibaba Cloud, AnalyticDB offers a powerful vector compute engine, processing billions of data vectors and providing a wide range of features, including indexing algorithms, structured and unstructured data capabilities, real-time updates, distance metrics, scalar filtering, and time travel searches. Additionally, it offers full OLAP database functionality and an SLA commitment for production use. 4 | 5 | ## Install Requirements 6 | 7 | Run the following command to install the required packages, including the `psycopg2cffi` package: 8 | 9 | ``` 10 | poetry install --extras "postgresql" 11 | ``` 12 | 13 | If you encounter the `Error: pg_config executable not found.` issue, you need to install the PostgreSQL development package on your system. Follow the instructions for your specific Linux distribution: 14 | 15 | 1. Debian-based systems (e.g., Ubuntu): 16 | 17 | ```bash 18 | sudo apt-get update 19 | sudo apt-get install libpq-dev 20 | ``` 21 | 22 | 2. RHEL-based systems (e.g., CentOS, Fedora): 23 | 24 | ```bash 25 | sudo yum install postgresql-devel 26 | ``` 27 | 28 | 3. Arch-based systems (e.g., Manjaro, Arch Linux): 29 | 30 | ```bash 31 | sudo pacman -S postgresql-libs 32 | ``` 33 | 34 | 4. 
macOS: 35 | 36 | ```bash 37 | brew install postgresql 38 | ``` 39 | 40 | After installing the required package, try to install `psycopg2cffi` again. If the `pg_config` executable is still not found, add its location to your system's `PATH` variable. You can typically find the `pg_config` executable in the `bin` directory of your PostgreSQL installation, for example `/usr/pgsql-13/bin/pg_config`. To add it to your `PATH` variable, use the following command (replace the path with the correct one for your system): 41 | 42 | ```bash 43 | export PATH=$PATH:/usr/pgsql-13/bin 44 | ``` 45 | 46 | Now, try installing `psycopg2cffi` again using Poetry. 47 | 48 | **Environment Variables:** 49 | 50 | | Name | Required | Description | Default | 51 | | ---------------- | -------- | ----------------------------------- | ----------------- | 52 | | `DATASTORE` | Yes | Datastore name, set to `analyticdb` | | 53 | | `BEARER_TOKEN` | Yes | Secret token | | 54 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 55 | | `PG_HOST` | Yes | AnalyticDB instance URL | `localhost` | 56 | | `PG_USER` | Yes | Database user | `user` | 57 | | `PG_PASSWORD` | Yes | Database password | `password` | 58 | | `PG_PORT` | Optional | Port for AnalyticDB communication | `5432` | 59 | | `PG_DATABASE` | Optional | Database name | `postgres` | 60 | | `PG_COLLECTION` | Optional | AnalyticDB relation name | `document_chunks` | 61 | 62 | ## AnalyticDB Cloud 63 | 64 | For a hosted [AnalyticDB Cloud](https://cloud.qdrant.io/) version, provide the AnalyticDB instance URL: 65 | 66 | **Example:** 67 | 68 | ```bash 69 | PG_HOST="https://YOUR-CLUSTER-URL.gpdb.rds.aliyuncs.com" 70 | PG_USER="YOUR-USER-NAME" 71 | PG_PASSWORD="YOUR-PASSWORD" 72 | ``` 73 | 74 | The other parameters are optional and can be changed if needed. 75 | 76 | ## Running AnalyticDB Integration Tests 77 | 78 | A suite of integration tests verifies the AnalyticDB integration. Launch the test suite with this command: 79 | 80 | ```bash 81 | pytest ./tests/datastore/providers/analyticdb/test_analyticdb_datastore.py 82 | ``` 83 | -------------------------------------------------------------------------------- /docs/providers/azurecosmosdb/setup.md: -------------------------------------------------------------------------------- 1 | # Azure Cosmos DB 2 | 3 | [Azure Cosmos DB](https://azure.microsoft.com/en-us/products/cosmos-db/) Azure Cosmos DB is a fully managed NoSQL and relational database for modern app development. Using Azure Cosmos DB for MongoDB vCore, you can store vector embeddings in your documents and perform [vector similarity search](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/vector-search) on a fully managed MongoDB-compatible database service. 4 | 5 | Learn more about Azure Cosmos DB for MongoDB vCore [here](https://learn.microsoft.com/azure/cosmos-db/mongodb/vcore/). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/). 6 | 7 | ## Environment variables 8 | 9 | | Name | Required | Description | Default | 10 | | ---------------------------- | -------- |-------------------------------------------------------------------------| ------------------- | 11 | | `DATASTORE` | Yes | Datastore name, set to `azurecosmosdb` | | 12 | | `BEARER_TOKEN` | Yes | Secret token | | 13 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 14 | | `AZCOSMOS_API` | Yes | Name of the API you're connecting to. Currently supported `mongo-vcore` | | 15 | | `AZCOSMOS_CONNSTR` | Yes | The connection string to your account. 
| | 16 | | `AZCOSMOS_DATABASE_NAME` | Yes | The database where the data is stored/queried | | 17 | | `AZCOSMOS_CONTAINER_NAME` | Yes | The container where the data is stored/queried | | 18 | 19 | ## Indexing 20 | On first insert, the datastore will create the collection and index if necessary on the field `embedding`. Currently hybrid search is not yet supported. 21 | -------------------------------------------------------------------------------- /docs/providers/azuresearch/setup.md: -------------------------------------------------------------------------------- 1 | # Azure Cognitive Search 2 | 3 | [Azure Cognitive Search](https://azure.microsoft.com/products/search/) is a complete retrieval cloud service that supports vector search, text search, and hybrid (vectors + text combined to yield the best of the two approaches). Azure Cognitive Search also offers an [optional L2 re-ranking step](https://learn.microsoft.com/azure/search/semantic-search-overview) to further improve results quality. 4 | 5 | You can find the Azure Cognitive Search documentation [here](https://learn.microsoft.com/azure/search/search-what-is-azure-search). If you don't have an Azure account, you can start setting one up [here](https://azure.microsoft.com/). 6 | 7 | ## Signing up for vector search 8 | 9 | Azure Cognitive Search supports searching using pure vectors, pure text, or hybrid mode where both are combined. For the vector-based cases, you'll need to sign up for vector search private preview. To sign up, please fill in this form: https://aka.ms/VectorSearchSignUp 10 | 11 | ## Environment variables 12 | 13 | | Name | Required | Description | Default | 14 | | ----------------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------- | 15 | | `DATASTORE` | Yes | Datastore name, set to `azuresearch` | | 16 | | `BEARER_TOKEN` | Yes | Secret token | | 17 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 18 | | `AZURESEARCH_SERVICE` | Yes | Name of your search service | | 19 | | `AZURESEARCH_INDEX` | Yes | Name of your search index | | 20 | | `AZURESEARCH_API_KEY` | No | Your API key, if using key-based auth instead of Azure managed identity | Uses managed identity | 21 | | `AZURESEARCH_DISABLE_HYBRID` | No | Disable hybrid search and only use vector similarity | Use hybrid search | 22 | | `AZURESEARCH_SEMANTIC_CONFIG` | No | Enable L2 re-ranking with this configuration name [see re-ranking below](#re-ranking) | L2 not enabled | 23 | | `AZURESEARCH_LANGUAGE` | No | If using L2 re-ranking, language for queries/documents (valid values [listed here](https://learn.microsoft.com/rest/api/searchservice/preview-api/search-documents#queryLanguage)) | `en-us` | 24 | | `AZURESEARCH_DIMENSIONS` | No | Vector size for embeddings | 256, or other | 25 | 26 | ## Authentication Options 27 | 28 | - API key: this is enabled by default; you can obtain the key in the Azure Portal or using the Azure CLI. 29 | - Managed identity: If the plugin is running in Azure, you can enable managed identity for the host and give that identity access to the service, without having to manage keys (avoiding secret storage, rotation, etc.). More details [here](https://learn.microsoft.com/azure/search/search-security-rbac). 30 | 31 | ## Re-ranking 32 | 33 | Azure Cognitive Search offers the option to enable a second (L2) ranking step after retrieval to further improve results quality. 
This only applies when using text or hybrid search. Since it has latency and cost implications, if you want to try this option you need to explicitly [enable "semantic search"](https://learn.microsoft.com/azure/search/semantic-search-overview#enable-semantic-search) in your Cognitive Search service, and [create a semantic search configuration](https://learn.microsoft.com/azure/search/semantic-how-to-query-request#2---create-a-semantic-configuration) for your index. 34 | 35 | ## Using existing search indexes 36 | 37 | If an existing index has fields that align with what's needed by the retrieval plugin but just differ in names, you can map your fields to the plugin fields using the following environment variables: 38 | 39 | | Plugin field name | Environment variable to override it | 40 | | ----------------- | ----------------------------------- | 41 | | id | AZURESEARCH_FIELDS_ID | 42 | | text | AZURESEARCH_FIELDS_TEXT | 43 | | embedding | AZURESEARCH_FIELDS_EMBEDDING | 44 | | document_id | AZURESEARCH_FIELDS_DOCUMENT_ID | 45 | | source | AZURESEARCH_FIELDS_SOURCE | 46 | | source_id | AZURESEARCH_FIELDS_SOURCE_ID | 47 | | url | AZURESEARCH_FIELDS_URL | 48 | | created_at | AZURESEARCH_FIELDS_CREATED_AT | 49 | | author | AZURESEARCH_FIELDS_AUTHOR | 50 | -------------------------------------------------------------------------------- /docs/providers/chroma/setup.md: -------------------------------------------------------------------------------- 1 | [Chroma](https://trychroma.com) is an AI-native open-source embedding database designed to make it easy to work with embeddings. Chroma runs in-memory, or in a client-server setup. 2 | 3 | Install Chroma by running `pip install chromadb`. Once installed, the core API consists of four essential commands for creating collections, adding embeddings, documents, and metadata, and querying embeddings to find similar documents. Get started with Chroma by visiting the [Getting Started](https://docs.trychroma.com) page on their documentation website, or explore the open-source code on their [GitHub repository](https://github.com/chroma-core/chroma). 4 | 5 | **Chroma Environment Variables** 6 | 7 | To set up Chroma and start using it as your vector database provider, you need to define some environment variables to connect to your Chroma instance. 8 | 9 | **Chroma Datastore Environment Variables** 10 | 11 | Chroma runs _in-memory_ by default, with local persistence. It can also run in [self-hosted](https://docs.trychroma.com/usage-guide#running-chroma-in-clientserver-mode) client-server mode, with a fully managed hosted version coming soon. 12 | 13 | | Name | Required | Description | Default | 14 | | ------------------------ | -------- | -------------------------------------------------------------------------------------------------- | ---------------- | 15 | | `DATASTORE` | Yes | Datastore name. Set this to `chroma` | | 16 | | `BEARER_TOKEN` | Yes | Your secret token for authenticating requests to the API | | 17 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key for generating embeddings | | 18 | | `CHROMA_COLLECTION` | Optional | Your chosen Chroma collection name to store your embeddings | openaiembeddings | 19 | | `CHROMA_IN_MEMORY` | Optional | If set to `True`, ignore `CHROMA_HOST` and `CHROMA_PORT` and just use an in-memory Chroma instance | `True` | 20 | | `CHROMA_PERSISTENCE_DIR` | Optional | If set, and `CHROMA_IN_MEMORY` is set, persist to and load from this directory. 
| `openai` | 21 | 22 | To run Chroma in self-hosted client-server mode, set the following variables: 23 | 24 | | Name | Required | Description | Default | 25 | | ------------- | -------- | --------------------------------------------------- | ------------------ | 26 | | `CHROMA_HOST` | Optional | Your Chroma instance host address (see notes below) | `http://127.0.0.1` | 27 | | `CHROMA_PORT` | Optional | Your Chroma port number | `8000` | 28 | 29 | > For **self-hosted instances**, if your instance is not at 127.0.0.1:8000, set `CHROMA_HOST` and `CHROMA_PORT` accordingly. For example: `CHROMA_HOST=http://localhost/` and `CHROMA_PORT=8080`. 30 | -------------------------------------------------------------------------------- /docs/providers/elasticsearch/setup.md: -------------------------------------------------------------------------------- 1 | # Elasticsearch 2 | 3 | Elasticsearch is a search engine based on the Lucene library. It provides a distributed, full-text and vector search engine with an HTTP web interface and schema-free JSON documents. To use Elasticsearch as your vector database, start by [installing Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html) or signing up for a free trial of [Elastic Cloud](https://www.elastic.co/cloud/). 4 | 5 | The app will create an Elasticsearch index for you automatically when you run it for the first time. Just pick a name for your index and set it as an environment variable. 6 | 7 | **Environment Variables:** 8 | 9 | | Name | Required | Description | 10 | | --------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------- | 11 | | `DATASTORE` | Yes | Datastore name, set this to `elasticsearch` | 12 | | `BEARER_TOKEN` | Yes | Your secret token for authenticating requests to the API | 13 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key for generating embeddings with the OpenAI embeddings model | 14 | | `ELASTICSEARCH_INDEX` | Yes | Your chosen Elasticsearch index name. **Note:** Index name must consist of lowercase alphanumeric characters or '-' | 15 | 16 | **Connection Environment Variables:** 17 | Depending on your Elasticsearch setup, you may need to set one of the following environment variables to connect to your Elasticsearch instance. If you are using Elastic Cloud, you can connect via `ELASTICSEARCH_CLOUD_ID`. If you are using a local instance of Elasticsearch, you will need to set `ELASTICSEARCH_URL`. 18 | 19 | You can authenticate to Elasticsearch using either `ELASTICSEARCH_USERNAME` and `ELASTICSEARCH_PASSWORD` or `ELASTICSEARCH_API_KEY`. If you are using Elastic Cloud, you can find this in Kibana. 20 | 21 | | Name | Required | Description | 22 | | ------------------------ | -------- | --------------------------------------------------------------------------------------------------- | 23 | | `ELASTICSEARCH_URL` | Yes | Your Elasticsearch URL. If installed locally, this would be https://localhost:9200 | 24 | | `ELASTICSEARCH_CLOUD_ID` | Yes | Your cloud ID, linked to your deployment. This can be found in the deployment's console | 25 | | `ELASTICSEARCH_USERNAME` | Yes | Your username for authenticating requests to the API. Commonly 'elastic'. | 26 | | `ELASTICSEARCH_PASSWORD` | Yes | Your password for authenticating requests to the API | 27 | | `ELASTICSEARCH_API_KEY` | Yes | Alternatively, you can authenticate using an API key.
This can be created in Kibana stack management | 28 | 29 | ## Running Elasticsearch Integration Tests 30 | 31 | A suite of integration tests is available to verify the Elasticsearch integration. To run the tests, run the docker compose found in the `examples/docker/elasticsearch` folder with `docker-compose up`. This will start Elasticsearch in single-node mode with security disabled, listening on `http://localhost:9200`. 32 | 33 | Then, launch the test suite with this command: 34 | 35 | ```bash 36 | pytest ./tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py 37 | ``` 38 | -------------------------------------------------------------------------------- /docs/providers/llama/setup.md: -------------------------------------------------------------------------------- 1 | 2 | # LlamaIndex 3 | 4 | [LlamaIndex](https://github.com/jerryjliu/llama_index) is a central interface to connect your LLMs with external data. 5 | It provides a suite of in-memory indices over your unstructured and structured data for use with ChatGPT. 6 | Unlike standard vector databases, LlamaIndex supports a wide range of indexing strategies (e.g. tree, keyword table, knowledge graph) optimized for different use-cases. 7 | It is lightweight, easy to use, and requires no additional deployment. 8 | All you need to do is specify a few environment variables (and optionally point to an existing saved index JSON file). 9 | Note that metadata filters in queries are not yet supported. 10 | 11 | ## Setup 12 | Currently, LlamaIndex requires no additional deployment 13 | and runs as a part of the Retrieval Plugin. 14 | It is very easy to set up and great for quick prototyping 15 | with ChatGPT and your external data. 16 | 17 | **Retrieval App Environment Variables** 18 | 19 | | Name | Required | Description | 20 | |------------------|----------|-------------------------------------| 21 | | `DATASTORE` | Yes | Datastore name. Set this to `llama` | 22 | | `BEARER_TOKEN` | Yes | Your secret token | 23 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 24 | 25 | **Llama Datastore Environment Variables** 26 | 27 | | Name | Required | Description | Default | 28 | |--------------------------------|----------|--------------------------------------|---------------| 29 | | `LLAMA_INDEX_TYPE` | Optional | Index type (see below for details) | `simple_dict` | 30 | | `LLAMA_INDEX_JSON_PATH` | Optional | Path to saved index JSON file | None | 31 | | `LLAMA_QUERY_KWARGS_JSON_PATH` | Optional | Path to saved query kwargs JSON file | None | 32 | | `LLAMA_RESPONSE_MODE` | Optional | Response mode for query | `no_text` | 33 | 34 | 35 | **Different Index Types** 36 | By default, we use a `GPTVectorStoreIndex` to store document chunks in memory, 37 | and retrieve top-k nodes by embedding similarity. 38 | Different index types are optimized for different data and query use-cases. 39 | See this guide on [How Each Index Works](https://gpt-index.readthedocs.io/en/latest/guides/primer/index_guide.html) to learn more. 40 | You can configure the index type via the `LLAMA_INDEX_TYPE` variable; see [here](https://gpt-index.readthedocs.io/en/latest/reference/indices/composability_query.html#gpt_index.data_structs.struct_type.IndexStructType) for the full list of accepted index type identifiers. 41 | 42 | 43 | Read more details on [readthedocs](https://gpt-index.readthedocs.io/en/latest/), 44 | and engage with the community on [discord](https://discord.com/invite/dGcwcsnxhU).
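As a rough illustration of the configuration above, the following sketch selects the LlamaIndex datastore via the same environment variables and loads it through the plugin's datastore factory. It is not an official example and assumes you run it from the project root with the plugin's dependencies installed; the key values shown are placeholders.

```python
# Illustrative sketch: pick the LlamaIndex datastore via environment variables
# and load it through the plugin's datastore factory.
import asyncio
import os

os.environ["DATASTORE"] = "llama"
os.environ["OPENAI_API_KEY"] = "sk-..."          # placeholder, use your real key
os.environ["LLAMA_INDEX_TYPE"] = "simple_dict"   # optional, this is the default
# os.environ["LLAMA_INDEX_JSON_PATH"] = "/path/to/saved_index.json"  # optional

from datastore.factory import get_datastore


async def main() -> None:
    datastore = await get_datastore()
    print(f"Loaded datastore: {type(datastore).__name__}")


asyncio.run(main())
```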
45 | 46 | ## Running Tests 47 | You can launch the test suite with this command: 48 | 49 | ```bash 50 | pytest ./tests/datastore/providers/llama/test_llama_datastore.py 51 | ``` 52 | -------------------------------------------------------------------------------- /docs/providers/milvus/setup.md: -------------------------------------------------------------------------------- 1 | # Milvus 2 | 3 | [Milvus](https://milvus.io/) is the open-source, cloud-native vector database that scales to billions of vectors. It's the open-source version of Zilliz. It supports: 4 | 5 | - Various indexing algorithms and distance metrics 6 | - Scalar filtering and time travel searches 7 | - Rollback and snapshots 8 | - Multi-language SDKs 9 | - Storage and compute separation 10 | - Cloud scalability 11 | - A developer-first community with multi-language support 12 | 13 | Visit the [Github](https://github.com/milvus-io/milvus) to learn more. 14 | 15 | ## Deploying the Database 16 | 17 | You can deploy and manage Milvus using Docker Compose, Helm, K8's Operator, or Ansible. Follow the instructions [here](https://milvus.io/docs) to get started. 18 | 19 | **Environment Variables:** 20 | 21 | | Name | Required | Description | 22 | |----------------------------| -------- |----------------------------------------------------------------------------------------------------------------------------------------------| 23 | | `DATASTORE` | Yes | Datastore name, set to `milvus` | 24 | | `BEARER_TOKEN` | Yes | Your bearer token | 25 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 26 | | `MILVUS_COLLECTION` | Optional | Milvus collection name, defaults to a random UUID | 27 | | `MILVUS_HOST` | Optional | Milvus host IP, defaults to `localhost` | 28 | | `MILVUS_PORT` | Optional | Milvus port, defaults to `19530` | 29 | | `MILVUS_USER` | Optional | Milvus username if RBAC is enabled, defaults to `None` | 30 | | `MILVUS_PASSWORD` | Optional | Milvus password if required, defaults to `None` | 31 | | `MILVUS_INDEX_PARAMS` | Optional | Custom index options for the collection, defaults to `{"metric_type": "IP", "index_type": "HNSW", "params": {"M": 8, "efConstruction": 64}}` | 32 | | `MILVUS_SEARCH_PARAMS` | Optional | Custom search options for the collection, defaults to `{"metric_type": "IP", "params": {"ef": 10}}` | 33 | | `MILVUS_CONSISTENCY_LEVEL` | Optional | Data consistency level for the collection, defaults to `Bounded` | 34 | 35 | ## Running Milvus Integration Tests 36 | 37 | A suite of integration tests is available to verify the Milvus integration. To run the tests, run the milvus docker compose found in the examples folder. 38 | 39 | Then, launch the test suite with this command: 40 | 41 | ```bash 42 | pytest ./tests/datastore/providers/milvus/test_milvus_datastore.py 43 | ``` 44 | -------------------------------------------------------------------------------- /docs/providers/pinecone/setup.md: -------------------------------------------------------------------------------- 1 | # Pinecone 2 | 3 | [Pinecone](https://www.pinecone.io) is a managed vector database built for speed, scale, and shipping to production sooner. To use Pinecone as your vector database provider, first get an API key by [signing up for an account](https://app.pinecone.io/). You can access your API key from the "API Keys" section in the sidebar of your dashboard. Pinecone also supports hybrid search and at the time of writing is the only datastore to support SPLADE sparse vectors natively. 
4 | 5 | A full Jupyter notebook walkthrough for the Pinecone flavor of the retrieval plugin can be found [here](https://github.com/openai/chatgpt-retrieval-plugin/blob/main/examples/providers/pinecone/semantic-search.ipynb). There is also a [video walkthrough here](https://youtu.be/hpePPqKxNq8). 6 | 7 | The app will create a Pinecone index for you automatically when you run it for the first time. Just pick a name for your index and set it as an environment variable. 8 | 9 | **Environment Variables:** 10 | 11 | | Name | Required | Description | 12 | | ---------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------- | 13 | | `DATASTORE` | Yes | Datastore name, set this to `pinecone` | 14 | | `BEARER_TOKEN` | Yes | Your secret token for authenticating requests to the API | 15 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key for generating embeddings with one of the OpenAI embeddings models | 16 | | `PINECONE_API_KEY` | Yes | Your Pinecone API key, found in the [Pinecone console](https://app.pinecone.io/) | 17 | | `PINECONE_ENVIRONMENT` | Yes | Your Pinecone environment, found in the [Pinecone console](https://app.pinecone.io/), e.g. `us-west1-gcp`, `us-east-1-aws`, etc. | 18 | | `PINECONE_INDEX` | Yes | Your chosen Pinecone index name. **Note:** Index name must consist of lower case alphanumeric characters or '-' | 19 | 20 | If you want to create your own index with custom configurations, you can do so using the Pinecone SDK, API, or web interface ([see docs](https://docs.pinecone.io/docs/manage-indexes)). Make sure to use a dimensionality of 256 (or another dimension) for the embeddings and avoid indexing on the text field in the metadata, as this will reduce the performance significantly. 21 | 22 | ```python 23 | # Creating index with Pinecone SDK - use only if you wish to create the index manually. 24 | 25 | import os, pinecone 26 | 27 | pinecone.init(api_key=os.environ['PINECONE_API_KEY'], 28 | environment=os.environ['PINECONE_ENVIRONMENT']) 29 | 30 | EMBEDDING_DIMENSION = int(os.environ.get("EMBEDDING_DIMENSION", 256)) 31 | 32 | pinecone.create_index(name=os.environ['PINECONE_INDEX'], 33 | dimension=EMBEDDING_DIMENSION, 34 | metric='cosine', 35 | metadata_config={ 36 | "indexed": ['source', 'source_id', 'url', 'created_at', 'author', 'document_id']}) 37 | ``` 38 | -------------------------------------------------------------------------------- /docs/providers/postgres/setup.md: -------------------------------------------------------------------------------- 1 | # Postgres 2 | 3 | Postgres Database offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension. To use pgvector, you will need to set up a PostgreSQL database with the pgvector extension enabled or use a managed solution that provides pgvector. For a hosted/managed solution, you can use any of the cloud vendors which support [pgvector](https://github.com/pgvector/pgvector#hosted-postgres). 4 | 5 | - The database needs the `pgvector` extension. 6 | - To apply required migrations you may use any tool you are more familiar with like [pgAdmin](https://www.pgadmin.org/), [DBeaver](https://dbeaver.io/), [DataGrip](https://www.jetbrains.com/datagrip/), or `psql` cli. 7 | 8 | **Retrieval App Environment Variables** 9 | 10 | | Name | Required | Description | 11 | | ---------------- | -------- | -------------------------------------- | 12 | | `DATASTORE` | Yes | Datastore name. 
Set this to `postgres` | 13 | | `BEARER_TOKEN` | Yes | Your secret token | 14 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 15 | 16 | **Postgres Datastore Environment Variables** 17 | 18 | | Name | Required | Description | Default | 19 | | ------------- | -------- | ----------------- | ---------- | 20 | | `PG_HOST` | Optional | Postgres host | localhost | 21 | | `PG_PORT` | Optional | Postgres port | `5432` | 22 | | `PG_PASSWORD` | Optional | Postgres password | `postgres` | 23 | | `PG_USER` | Optional | Postgres username | `postgres` | 24 | | `PG_DB` | Optional | Postgres database | `postgres` | 25 | 26 | ## Postgres Datastore local development & testing 27 | 28 | In order to test your changes to the Postgres Datastore, you can run the following: 29 | 30 | 1. You can run a local or self-hosted instance of PostgreSQL with `pgvector` enabled using Docker. 31 | 32 | ```bash 33 | docker pull ankane/pgvector 34 | ``` 35 | 36 | ```bash 37 | docker run --name pgvector -e POSTGRES_PASSWORD=mysecretpassword -p 5432:5432 -d ankane/pgvector 38 | ``` 39 | 40 | Check the PostgreSQL [official docker image](https://github.com/docker-library/docs/blob/master/postgres/README.md) for more options. 41 | 42 | 2. Apply migrations using any tool you like, such as [pgAdmin](https://www.pgadmin.org/), [DBeaver](https://dbeaver.io/), [DataGrip](https://www.jetbrains.com/datagrip/), or the `psql` CLI. 43 | 44 | ```bash 45 | # apply migrations using psql cli 46 | psql -h localhost -p 5432 -U postgres -d postgres -f examples/providers/supabase/migrations/20230414142107_init_pg_vector.sql 47 | ``` 48 | 49 | 3. Export the environment variables required for the Postgres Datastore 50 | 51 | ```bash 52 | export PG_HOST=localhost 53 | export PG_PORT=5432 54 | export PG_PASSWORD=mysecretpassword 55 | ``` 56 | 57 | 4. Run the Postgres datastore tests from the project's root directory 58 | 59 | ```bash 60 | # Run the Postgres datastore tests 61 | # go to project's root directory and run 62 | poetry run pytest -s ./tests/datastore/providers/postgres/test_postgres_datastore.py 63 | ``` 64 | 65 | 5. Before going to production, don't forget to set the password for the `postgres` user to something more secure and to apply the migrations. 66 | 67 | 6. You may want to remove RLS (Row Level Security) from the `documents` table. RLS is not required for this setup, but it may be useful if you want to separate documents by user or group of users, or if you want to grant different users permission to insert or query documents. RLS is especially important if you plan to use PostgREST. To remove it, delete the following statement from the `20230414142107_init_pg_vector.sql` migration file: `alter table documents enable row level security;`. 68 | 69 | ## Indexes for Postgres 70 | 71 | By default, pgvector performs exact nearest neighbor search. To speed up the vector comparison, you may want to create indexes for the `embedding` column in the `documents` table. You should do this **only** after a few thousand records are inserted. 72 | 73 | As the datastore uses inner product for similarity search, you can add an index as follows: 74 | 75 | ```sql 76 | create index on documents using ivfflat (embedding vector_ip_ops) with (lists = 100); 77 | ``` 78 | 79 | To choose the `lists` constant, a good place to start is records / 1000 for up to 1M records and sqrt(records) for more than 1M records. 80 | 81 | For more information about indexes, see [pgvector docs](https://github.com/pgvector/pgvector#indexing).
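As a small illustration of that heuristic (not part of the repository's tooling), the sketch below computes a starting value for `lists` from the number of rows in the `documents` table:

```python
import math


def suggested_ivfflat_lists(num_records: int) -> int:
    """Starting point for the ivfflat `lists` parameter, following the guidance
    above: records / 1000 up to 1M records, sqrt(records) beyond that."""
    if num_records <= 1_000_000:
        return max(1, num_records // 1000)
    return max(1, math.isqrt(num_records))


# e.g. 200_000 rows -> 200 lists, 4_000_000 rows -> 2000 lists
print(suggested_ivfflat_lists(200_000), suggested_ivfflat_lists(4_000_000))
```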
82 | -------------------------------------------------------------------------------- /docs/providers/qdrant/setup.md: -------------------------------------------------------------------------------- 1 | # Qdrant 2 | 3 | [Qdrant](https://qdrant.tech/) is a vector database that can store documents and vector embeddings. It can run as a self-hosted version or a managed [Qdrant Cloud](https://cloud.qdrant.io/) 4 | solution. The configuration is almost identical for both options, except for the API key that [Qdrant Cloud](https://cloud.qdrant.io/) provides. 5 | 6 | **Environment Variables:** 7 | 8 | | Name | Required | Description | Default | 9 | | ------------------- | -------- | ----------------------------------------------------------- | ------------------ | 10 | | `DATASTORE` | Yes | Datastore name, set to `qdrant` | | 11 | | `BEARER_TOKEN` | Yes | Secret token | | 12 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 13 | | `QDRANT_URL` | Yes | Qdrant instance URL | `http://localhost` | 14 | | `QDRANT_PORT` | Optional | TCP port for Qdrant HTTP communication | `6333` | 15 | | `QDRANT_GRPC_PORT` | Optional | TCP port for Qdrant GRPC communication | `6334` | 16 | | `QDRANT_API_KEY` | Optional | Qdrant API key for [Qdrant Cloud](https://cloud.qdrant.io/) | | 17 | | `QDRANT_COLLECTION` | Optional | Qdrant collection name | `document_chunks` | 18 | 19 | ## Qdrant Cloud 20 | 21 | For a hosted [Qdrant Cloud](https://cloud.qdrant.io/) version, provide the Qdrant instance 22 | URL and the API key from the [Qdrant Cloud UI](https://cloud.qdrant.io/). 23 | 24 | **Example:** 25 | 26 | ```bash 27 | QDRANT_URL="https://YOUR-CLUSTER-URL.aws.cloud.qdrant.io" 28 | QDRANT_API_KEY="" 29 | ``` 30 | 31 | The other parameters are optional and can be changed if needed. 32 | 33 | ## Self-hosted Qdrant Instance 34 | 35 | For a self-hosted version, use Docker containers or the official Helm chart for deployment. The only 36 | required parameter is the `QDRANT_URL` that points to the Qdrant server URL. 37 | 38 | **Example:** 39 | 40 | ```bash 41 | QDRANT_URL="http://YOUR_HOST.example.com:6333" 42 | ``` 43 | 44 | The other parameters are optional and can be changed if needed. 45 | 46 | ## Running Qdrant Integration Tests 47 | 48 | A suite of integration tests verifies the Qdrant integration. To run it, start a local Qdrant instance in a Docker container. 49 | 50 | ```bash 51 | docker run -p "6333:6333" -p "6334:6334" qdrant/qdrant:v1.0.3 52 | ``` 53 | 54 | Then, launch the test suite with this command: 55 | 56 | ```bash 57 | pytest ./tests/datastore/providers/qdrant/test_qdrant_datastore.py 58 | ``` 59 | -------------------------------------------------------------------------------- /docs/providers/redis/setup.md: -------------------------------------------------------------------------------- 1 | # Redis 2 | 3 | [Redis](https://redis.com/solutions/use-cases/vector-database/) is a real-time data platform that supports a variety of use cases for everyday applications as well as AI/ML workloads. Use Redis as a low-latency vector engine by creating a Redis database with the [Redis Stack docker container](/examples/docker/redis/docker-compose.yml). For a hosted/managed solution, try [Redis Cloud](https://app.redislabs.com/#/). See more helpful examples of Redis as a vector database [here](https://github.com/RedisVentures/redis-ai-resources). 4 | 5 | - The database **needs the RediSearch module (>=v2.6) and RedisJSON**, which are included in the self-hosted docker compose above. 
6 | - Run the App with the Redis docker image: `docker compose up -d` in [this dir](/examples/docker/redis/). 7 | - The app automatically creates a Redis vector search index on the first run. Optionally, create a custom index with a specific name and set it as an environment variable (see below). 8 | - To enable more hybrid searching capabilities, adjust the document schema [here](/datastore/providers/redis_datastore.py). 9 | 10 | **Environment Variables:** 11 | 12 | | Name | Required | Description | Default | 13 | | ----------------------- | -------- | ---------------------------------------------------------------------------------------------------------------------- | ----------- | 14 | | `DATASTORE` | Yes | Datastore name, set to `redis` | | 15 | | `BEARER_TOKEN` | Yes | Secret token | | 16 | | `OPENAI_API_KEY` | Yes | OpenAI API key | | 17 | | `REDIS_HOST` | Optional | Redis host url | `localhost` | 18 | | `REDIS_PORT` | Optional | Redis port | `6379` | 19 | | `REDIS_PASSWORD` | Optional | Redis password | none | 20 | | `REDIS_INDEX_NAME` | Optional | Redis vector index name | `index` | 21 | | `REDIS_DOC_PREFIX` | Optional | Redis key prefix for the index | `doc` | 22 | | `REDIS_DISTANCE_METRIC` | Optional | Vector similarity distance metric | `COSINE` | 23 | | `REDIS_INDEX_TYPE` | Optional | [Vector index algorithm type](https://redis.io/docs/stack/search/reference/vectors/#creation-attributes-per-algorithm) | `FLAT` | 24 | 25 | 26 | ## Redis Datastore development & testing 27 | In order to test your changes to the Redis Datastore, you can run the following commands: 28 | 29 | ```bash 30 | # Run the Redis stack docker image 31 | docker run -it --rm -p 6379:6379 redis/redis-stack-server:latest 32 | ``` 33 | 34 | ```bash 35 | # Run the Redis datastore tests 36 | poetry run pytest -s ./tests/datastore/providers/redis/test_redis_datastore.py 37 | ``` -------------------------------------------------------------------------------- /docs/providers/supabase/setup.md: -------------------------------------------------------------------------------- 1 | # Supabase 2 | 3 | [Supabase](https://supabase.com/blog/openai-embeddings-postgres-vector) offers an easy and efficient way to store vectors via [pgvector](https://github.com/pgvector/pgvector) extension for Postgres Database. [You can use Supabase CLI](https://github.com/supabase/cli) to set up a whole Supabase stack locally or in the cloud or you can also use docker-compose, k8s and other options available. For a hosted/managed solution, try [Supabase.com](https://supabase.com/) and unlock the full power of Postgres with built-in authentication, storage, auto APIs, and Realtime features. See more helpful examples of Supabase & pgvector as a vector database [here](https://github.com/supabase-community/nextjs-openai-doc-search). 4 | 5 | - The database needs the `pgvector` extension, which is included in [Supabase distribution of Postgres](https://github.com/supabase/postgres). 6 | - It is possible to provide a Postgres connection string and an app will add `documents` table, query Postgres function, and `pgvector` extension automatically. 7 | - But it is recommended to separate the migration process from an app. And execute the migration script in a different pipeline by using SQL statements from `_init_db()` function in [Supabase datastore provider](/datastore/providers/supabase_datastore.py). 
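Once the migration that creates the `documents` table has been applied, a short connectivity check with the Supabase Python client can confirm the table is reachable. This is an illustrative sketch only; it assumes a running Supabase stack and uses the datastore environment variables described below.

```python
# Illustrative connectivity check, not part of the plugin itself.
import os

from supabase import create_client

client = create_client(
    os.environ["SUPABASE_URL"],               # e.g. http://localhost:54321
    os.environ["SUPABASE_SERVICE_ROLE_KEY"],  # or SUPABASE_ANON_KEY
)

# On a fresh database this should return an empty result rather than raise.
result = client.table("documents").select("id").limit(1).execute()
print(result.data)
```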
8 | 9 | **Retrieval App Environment Variables** 10 | 11 | | Name | Required | Description | 12 | | ---------------- | -------- | -------------------------------------- | 13 | | `DATASTORE` | Yes | Datastore name. Set this to `supabase` | 14 | | `BEARER_TOKEN` | Yes | Your secret token | 15 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 16 | 17 | **Supabase Datastore Environment Variables** 18 | 19 | | Name | Required | Description | Default | 20 | | --------------------------- | -------- | ------------------------------------------------------------------------------ | ------- | 21 | | `SUPABASE_URL` | Yes | Supabase Project URL | | 22 | | `SUPABASE_ANON_KEY` | Optional | Supabase Project API anon key | | 23 | | `SUPABASE_SERVICE_ROLE_KEY` | Optional | Supabase Project API service key, will be used if provided instead of anon key | | 24 | 25 | ## Supabase Datastore local development & testing 26 | 27 | In order to test your changes to the Supabase Datastore, you can run the following commands: 28 | 29 | 1. Install [Supabase CLI](https://github.com/supabase/cli) and [Docker](https://docs.docker.com/get-docker/) 30 | 31 | 2. Run the Supabase `start` command from `examples/providers` directory. Config for Supabase local setup is available in `examples/providers/supabase` directory with required migrations. 32 | 33 | ```bash 34 | # Run the Supabase stack using cli in docker 35 | # go to examples/providers and run supabase start 36 | cd examples/providers 37 | supabase start 38 | ``` 39 | 40 | 3. Supabase `start` will download docker images and launch Supabase stack locally. You will see similar output: 41 | 42 | ```bash 43 | Applying migration 20230414142107_init_pg_vector.sql... 44 | Seeding data supabase/seed.sql... 45 | Started supabase local development setup. 46 | 47 | API URL: http://localhost:54321 48 | DB URL: postgresql://postgres:postgres@localhost:54322/postgres 49 | Studio URL: http://localhost:54323 50 | Inbucket URL: http://localhost:54324 51 | JWT secret: super-secret-jwt-token-with-at-least-32-characters-long 52 | anon key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6ImFub24iLCJleHAiOjE5ODM4MTI5OTZ9.CRXP1A7WOeoJeXxjNni43kdQwgnWNReilDMblYTn_I0 53 | service_role key: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU 54 | ``` 55 | 56 | 4. Export environment variables required for the Supabase Datastore 57 | 58 | ```bash 59 | export SUPABASE_URL=http://localhost:54321 60 | export SUPABASE_SERVICE_ROLE_KEY='eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU' 61 | ``` 62 | 63 | 5. Run the Supabase datastore tests from the project's root directory 64 | 65 | ```bash 66 | # Run the Supabase datastore tests 67 | # go to project's root directory and run 68 | poetry run pytest -s ./tests/datastore/providers/supabase/test_supabase_datastore.py 69 | ``` 70 | 71 | 6. When you go to prod (if cloud hosted) it is recommended to link your supabase project with the local setup from `examples/providers/supabase`. All migrations will be synced with the cloud project after you run `supabase db push`. Or you can manually apply migrations from `examples/providers/supabase/migrations` directory. 72 | 73 | 7. You might want to add RLS policies to the `documents` table. 
Or you can just continue using it on the server side only with the service role key. But you should not use service role key on the client side in any case. 74 | 75 | ## Indexes for Postgres 76 | 77 | By default, pgvector performs exact nearest neighbor search. To speed up the vector comparison, you may want to create indexes for the `embedding` column in the `documents` table. You should do this **only** after a few thousand records are inserted. 78 | 79 | As datasotre is using inner product for similarity search, you can add index as follows: 80 | 81 | ```sql 82 | create index on documents using ivfflat (embedding vector_ip_ops) with (lists = 100); 83 | ``` 84 | 85 | To choose `lists` constant - a good place to start is records / 1000 for up to 1M records and sqrt(records) for over 1M records 86 | 87 | For more information about indexes, see [pgvector docs](https://github.com/pgvector/pgvector#indexing). 88 | -------------------------------------------------------------------------------- /docs/providers/weaviate/setup.md: -------------------------------------------------------------------------------- 1 | # Weaviate 2 | 3 | ## Set up a Weaviate Instance 4 | 5 | [Weaviate](https://weaviate.io/) is an open-source vector search engine designed to scale seamlessly into billions of data objects. This implementation supports hybrid search out-of-the-box (meaning it will perform better for keyword searches). 6 | 7 | You can run Weaviate in 4 ways: 8 | 9 | - **SaaS** – with [Weaviate Cloud Services (WCS)](https://weaviate.io/pricing). 10 | 11 | WCS is a fully managed service that takes care of hosting, scaling, and updating your Weaviate instance. You can try it out for free with a sandbox that lasts for 30 days. 12 | 13 | To set up a SaaS Weaviate instance with WCS: 14 | 15 | 1. Navigate to [Weaviate Cloud Console](https://console.weaviate.io/). 16 | 2. Register or sign in to your WCS account. 17 | 3. Create a new cluster with the following settings: 18 | - `Name` – a unique name for your cluster. The name will become part of the URL used to access this instance. 19 | - `Subscription Tier` – Sandbox for a free trial, or contact [hello@weaviate.io](mailto:hello@weaviate.io) for other options. 20 | - `Weaviate Version` - The latest version by default. 21 | - `OIDC Authentication` – Enabled by default. This requires a username and password to access your instance. 22 | 4. Wait for a few minutes until your cluster is ready. You will see a green tick ✔️ when it's done. Copy your cluster URL. 23 | 24 | - **Hybrid SaaS** 25 | 26 | > If you need to keep your data on-premise for security or compliance reasons, Weaviate also offers a Hybrid SaaS option: Weaviate runs within your cloud instances, but the cluster is managed remotely by Weaviate. This gives you the benefits of a managed service without sending data to an external party. 27 | 28 | The Weaviate Hybrid SaaS is a custom solution. If you are interested in this option, please reach out to [hello@weaviate.io](mailto:hello@weaviate.io). 29 | 30 | - **Self-hosted** – with a Docker container 31 | 32 | To set up a Weaviate instance with Docker: 33 | 34 | 1. [Install Docker](https://docs.docker.com/engine/install/) on your local machine if it is not already installed. 35 | 2. [Install the Docker Compose Plugin](https://docs.docker.com/compose/install/) 36 | 3. 
Download a `docker-compose.yml` file with this `curl` command: 37 | 38 | ``` 39 | curl -o docker-compose.yml "https://configuration.weaviate.io/v2/docker-compose/docker-compose.yml?modules=standalone&runtime=docker-compose&weaviate_version=v1.18.0" 40 | ``` 41 | 42 | Alternatively, you can use Weaviate's docker compose [configuration tool](https://weaviate.io/developers/weaviate/installation/docker-compose) to generate your own `docker-compose.yml` file. 43 | 44 | 4. Run `docker compose up -d` to spin up a Weaviate instance. 45 | 46 | > To shut it down, run `docker compose down`. 47 | 48 | - **Self-hosted** – with a Kubernetes cluster 49 | 50 | To configure a self-hosted instance with Kubernetes, follow Weaviate's [documentation](https://weaviate.io/developers/weaviate/installation/kubernetes). 51 | 52 | ## Configure Weaviate Environment Variables 53 | 54 | You need to set some environment variables to connect to your Weaviate instance. 55 | 56 | **Retrieval App Environment Variables** 57 | 58 | | Name | Required | Description | 59 | | ---------------- | -------- |--------------------------------------------------------------------------------------| 60 | | `DATASTORE` | Yes | Datastore name. Set this to `weaviate` | 61 | | `BEARER_TOKEN` | Yes | Your [secret token](/README.md#general-environment-variables) (not the Weaviate one) | 62 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 63 | 64 | **Weaviate Datastore Environment Variables** 65 | 66 | | Name | Required | Description | Default | 67 | |------------------| -------- | ------------------------------------------------------------------ | ------------------ | 68 | | `WEAVIATE_URL` | Optional | Your weaviate instance's url/WCS endpoint | `http://localhost:8080` | | 69 | | `WEAVIATE_CLASS` | Optional | Your chosen Weaviate class/collection name to store your documents | OpenAIDocument | 70 | 71 | **Weaviate Auth Environment Variables** 72 | 73 | If using WCS instances, set the following environment variables: 74 | 75 | | Name | Required | Description | 76 | | ------------------- | -------- | ------------------------------ | 77 | | `WEAVIATE_API_KEY` | Yes | Your API key WCS | 78 | 79 | Learn more about accessing your [WCS API key](https://weaviate.io/developers/wcs/guides/authentication#access-api-keys). -------------------------------------------------------------------------------- /docs/providers/zilliz/setup.md: -------------------------------------------------------------------------------- 1 | # Zilliz 2 | 3 | [Zilliz](https://zilliz.com) is a managed cloud-native vector database designed for the billion scale. Zilliz offers many key features, such as: 4 | 5 | - Multiple indexing algorithms 6 | - Multiple distance metrics 7 | - Scalar filtering 8 | - Time travel searches 9 | - Rollback and with snapshots 10 | - Full RBAC 11 | - 99.9% uptime 12 | - Separated storage and compute 13 | - Multi-language SDK's 14 | 15 | Find more information [here](https://zilliz.com). 16 | 17 | **Self Hosted vs SaaS** 18 | 19 | Zilliz is a SaaS database, but offers an open-source solution, Milvus. Both options offer fast searches at the billion scale, but Zilliz handles data management for you. It automatically scales compute and storage resources and creates optimal indexes for your data. See the comparison [here](https://zilliz.com/doc/about_zilliz_cloud). 20 | 21 | ## Deploying the Database 22 | 23 | Zilliz Cloud is deployable in a few simple steps. First, create an account [here](https://cloud.zilliz.com/signup). 
Once you have an account set up, follow the guide [here](https://zilliz.com/doc/quick_start) to set up a database and get the parameters needed for this application. 24 | 25 | Environment Variables: 26 | 27 | | Name | Required | Description | 28 | |----------------------------| -------- |------------------------------------------------------------------| 29 | | `DATASTORE` | Yes | Datastore name, set to `zilliz` | 30 | | `BEARER_TOKEN` | Yes | Your secret token | 31 | | `OPENAI_API_KEY` | Yes | Your OpenAI API key | 32 | | `ZILLIZ_COLLECTION` | Optional | Zilliz collection name. Defaults to a random UUID | 33 | | `ZILLIZ_URI` | Yes | URI for the Zilliz instance | 34 | | `ZILLIZ_USER` | Yes | Zilliz username | 35 | | `ZILLIZ_PASSWORD` | Yes | Zilliz password | 36 | | `ZILLIZ_CONSISTENCY_LEVEL` | Optional | Data consistency level for the collection, defaults to `Bounded` | 37 | 38 | ## Running Zilliz Integration Tests 39 | 40 | A suite of integration tests is available to verify the Zilliz integration. To run the tests, create a Zilliz database and update the environment variables. 41 | 42 | Then, launch the test suite with this command: 43 | 44 | ```bash 45 | pytest ./tests/datastore/providers/zilliz/test_zilliz_datastore.py 46 | ``` 47 | -------------------------------------------------------------------------------- /examples/authentication-methods/no-auth/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "none" 9 | }, 10 | "api": { 11 | "type": "openapi", 12 | "url": "https://your-app-url.com/.well-known/openapi.yaml" 13 | }, 14 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 15 | "contact_email": "hello@contact.com", 16 | "legal_info_url": "hello@legal.com" 17 | } 18 | 19 | -------------------------------------------------------------------------------- /examples/authentication-methods/no-auth/main.py: -------------------------------------------------------------------------------- 1 | # This is a version of the main.py file found in ../../../server/main.py without authentication. 2 | # Copy and paste this into the main file at ../../../server/main.py if you choose to use no authentication for your retrieval plugin. 
3 | from typing import Optional 4 | import uvicorn 5 | from fastapi import FastAPI, File, Form, HTTPException, Body, UploadFile 6 | from fastapi.staticfiles import StaticFiles 7 | from loguru import logger 8 | 9 | from models.api import ( 10 | DeleteRequest, 11 | DeleteResponse, 12 | QueryRequest, 13 | QueryResponse, 14 | UpsertRequest, 15 | UpsertResponse, 16 | ) 17 | from datastore.factory import get_datastore 18 | from services.file import get_document_from_file 19 | 20 | from models.models import DocumentMetadata, Source 21 | 22 | 23 | app = FastAPI() 24 | app.mount("/.well-known", StaticFiles(directory=".well-known"), name="static") 25 | 26 | # Create a sub-application, in order to access just the query endpoints in the OpenAPI schema, found at http://0.0.0.0:8000/sub/openapi.json when the app is running locally 27 | sub_app = FastAPI( 28 | title="Retrieval Plugin API", 29 | description="A retrieval API for querying and filtering documents based on natural language queries and metadata", 30 | version="1.0.0", 31 | servers=[{"url": "https://your-app-url.com"}], 32 | ) 33 | app.mount("/sub", sub_app) 34 | 35 | 36 | @app.post( 37 | "/upsert-file", 38 | response_model=UpsertResponse, 39 | ) 40 | async def upsert_file( 41 | file: UploadFile = File(...), 42 | metadata: Optional[str] = Form(None), 43 | ): 44 | try: 45 | metadata_obj = ( 46 | DocumentMetadata.parse_raw(metadata) 47 | if metadata 48 | else DocumentMetadata(source=Source.file) 49 | ) 50 | except: 51 | metadata_obj = DocumentMetadata(source=Source.file) 52 | 53 | document = await get_document_from_file(file, metadata_obj) 54 | 55 | try: 56 | ids = await datastore.upsert([document]) 57 | return UpsertResponse(ids=ids) 58 | except Exception as e: 59 | logger.error(e) 60 | raise HTTPException(status_code=500, detail=f"str({e})") 61 | 62 | 63 | @app.post( 64 | "/upsert", 65 | response_model=UpsertResponse, 66 | ) 67 | async def upsert( 68 | request: UpsertRequest = Body(...), 69 | ): 70 | try: 71 | ids = await datastore.upsert(request.documents) 72 | return UpsertResponse(ids=ids) 73 | except Exception as e: 74 | logger.error(e) 75 | raise HTTPException(status_code=500, detail="Internal Service Error") 76 | 77 | 78 | @app.post( 79 | "/query", 80 | response_model=QueryResponse, 81 | ) 82 | async def query_main( 83 | request: QueryRequest = Body(...), 84 | ): 85 | try: 86 | results = await datastore.query( 87 | request.queries, 88 | ) 89 | return QueryResponse(results=results) 90 | except Exception as e: 91 | logger.error(e) 92 | raise HTTPException(status_code=500, detail="Internal Service Error") 93 | 94 | 95 | @sub_app.post( 96 | "/query", 97 | response_model=QueryResponse, 98 | description="Accepts search query objects with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. 
Split queries if ResponseTooLargeError occurs.", 99 | ) 100 | async def query( 101 | request: QueryRequest = Body(...), 102 | ): 103 | try: 104 | results = await datastore.query( 105 | request.queries, 106 | ) 107 | return QueryResponse(results=results) 108 | except Exception as e: 109 | logger.error(e) 110 | raise HTTPException(status_code=500, detail="Internal Service Error") 111 | 112 | 113 | @app.delete( 114 | "/delete", 115 | response_model=DeleteResponse, 116 | ) 117 | async def delete( 118 | request: DeleteRequest = Body(...), 119 | ): 120 | if not (request.ids or request.filter or request.delete_all): 121 | raise HTTPException( 122 | status_code=400, 123 | detail="One of ids, filter, or delete_all is required", 124 | ) 125 | try: 126 | success = await datastore.delete( 127 | ids=request.ids, 128 | filter=request.filter, 129 | delete_all=request.delete_all, 130 | ) 131 | return DeleteResponse(success=success) 132 | except Exception as e: 133 | logger.error(e) 134 | raise HTTPException(status_code=500, detail="Internal Service Error") 135 | 136 | 137 | @app.on_event("startup") 138 | async def startup(): 139 | global datastore 140 | datastore = await get_datastore() 141 | 142 | 143 | def start(): 144 | uvicorn.run("server.main:app", host="0.0.0.0", port=8000, reload=True) 145 | -------------------------------------------------------------------------------- /examples/authentication-methods/oauth/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth" : { 8 | "type":"oauth", 9 | "client_url":"e.g. https:///oauth/v2/authorize", 10 | "authorization_url":"e.g. https:///api/oauth.v2.access", 11 | "scope":"search:read", 12 | "authorization_content_type":"application/x-www-form-urlencoded", 13 | "verification_tokens":{ 14 | "openai":"" 15 | } 16 | }, 17 | "api":{ 18 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 19 | "has_user_authentication":true, 20 | "type":"openapi" 21 | }, 22 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 23 | "contact_email": "hello@contact.com", 24 | "legal_info_url": "hello@legal.com" 25 | } 26 | -------------------------------------------------------------------------------- /examples/authentication-methods/service-http/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. 
Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth":{ 8 | "type":"service_http", 9 | "authorization_type":"bearer", 10 | "verification_tokens":{ 11 | "openai":"" 12 | } 13 | }, 14 | "api":{ 15 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 16 | "has_user_authentication":false, 17 | "type":"openapi" 18 | }, 19 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 20 | "contact_email": "hello@contact.com", 21 | "legal_info_url": "hello@legal.com" 22 | } 23 | -------------------------------------------------------------------------------- /examples/authentication-methods/user-http/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "user_http", 9 | "authorization_type": "bearer" 10 | }, 11 | "api": { 12 | "type": "openapi", 13 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 14 | "has_user_authentication": false 15 | }, 16 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 17 | "contact_email": "hello@contact.com", 18 | "legal_info_url": "hello@legal.com" 19 | } -------------------------------------------------------------------------------- /examples/docker/elasticsearch/README.md: -------------------------------------------------------------------------------- 1 | ## Running Elasticsearch 2 | 3 | ```bash 4 | docker-compose up -d 5 | ``` 6 | 7 | should now be running at http://localhost:9200 8 | -------------------------------------------------------------------------------- /examples/docker/elasticsearch/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | elasticsearch: 5 | image: docker.elastic.co/elasticsearch/elasticsearch:8.8.2 6 | container_name: elasticsearch 7 | environment: 8 | - discovery.type=single-node 9 | - node.name=elasticsearch 10 | - xpack.security.enabled=false 11 | ulimits: 12 | memlock: 13 | soft: -1 14 | hard: -1 15 | ports: 16 | - "9200:9200" 17 | networks: 18 | - esnet 19 | volumes: 20 | - esdata:/usr/share/elasticsearch/data 21 | 22 | networks: 23 | esnet: 24 | 25 | volumes: 26 | esdata: 27 | driver: local 28 | -------------------------------------------------------------------------------- /examples/docker/milvus/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.5' 2 | 3 | services: 4 | etcd: 5 | container_name: milvus-etcd 6 | image: quay.io/coreos/etcd:v3.5.0 7 | environment: 8 | - ETCD_AUTO_COMPACTION_MODE=revision 9 | - ETCD_AUTO_COMPACTION_RETENTION=1000 10 | - ETCD_QUOTA_BACKEND_BYTES=4294967296 11 | - ETCD_SNAPSHOT_COUNT=50000 12 | volumes: 13 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd 14 | command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd 15 | 16 | minio: 17 | container_name: milvus-minio 18 | image: minio/minio:RELEASE.2023-03-20T20-16-18Z 19 | environment: 20 | 
MINIO_ACCESS_KEY: minioadmin 21 | MINIO_SECRET_KEY: minioadmin 22 | volumes: 23 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data 24 | command: minio server /minio_data 25 | healthcheck: 26 | test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] 27 | interval: 30s 28 | timeout: 20s 29 | retries: 3 30 | 31 | standalone: 32 | container_name: milvus-standalone 33 | image: milvusdb/milvus:v2.2.5 34 | command: ["milvus", "run", "standalone"] 35 | environment: 36 | ETCD_ENDPOINTS: etcd:2379 37 | MINIO_ADDRESS: minio:9000 38 | volumes: 39 | - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus 40 | ports: 41 | - "19530:19530" 42 | - "9091:9091" 43 | depends_on: 44 | - "etcd" 45 | - "minio" 46 | 47 | networks: 48 | default: 49 | name: milvus -------------------------------------------------------------------------------- /examples/docker/qdrant/README.md: -------------------------------------------------------------------------------- 1 | # Running the Retrieval Plugin with Qdrant in Docker Containers 2 | 3 | To set up the ChatGPT retrieval plugin with a single instance of a Qdrant vector database, follow these steps: 4 | 5 | ## Set Environment Variables 6 | 7 | Set the following environment variables: 8 | 9 | ```bash 10 | # Provide your own OpenAI API key in order to start. 11 | export OPENAI_API_KEY="" 12 | # This is an example of a minimal token generated by https://jwt.io/ 13 | export BEARER_TOKEN="" 14 | ``` 15 | 16 | ## Run Qdrant and the Retrieval Plugin in Docker Containers 17 | 18 | Both Docker containers might be launched with docker-compose: 19 | 20 | ```bash 21 | docker-compose up -d 22 | ``` 23 | 24 | ## Store the Documents 25 | 26 | Store an initial batch of documents by calling the `/upsert` endpoint: 27 | 28 | ```bash 29 | curl -X POST \ 30 | -H "Content-type: application/json" \ 31 | -H "Authorization: Bearer $BEARER_TOKEN" \ 32 | --data-binary '@documents.json' \ 33 | "http://localhost:80/upsert" 34 | ``` 35 | 36 | ## Send a Test Query 37 | 38 | You can query Qdrant to find relevant document chunks by calling the `/query` endpoint: 39 | 40 | ```bash 41 | curl -X POST \ 42 | -H "Content-type: application/json" \ 43 | -H "Authorization: Bearer $BEARER_TOKEN" \ 44 | --data-binary '@queries.json' \ 45 | "http://localhost:80/query" 46 | ``` 47 | -------------------------------------------------------------------------------- /examples/docker/qdrant/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | retrieval-app: 3 | build: 4 | context: ../../../ 5 | dockerfile: Dockerfile 6 | image: openai/chatgpt-retrieval-plugin 7 | ports: 8 | - "80:80" 9 | depends_on: 10 | - qdrant 11 | environment: 12 | DATASTORE: "qdrant" 13 | QDRANT_URL: "http://qdrant" 14 | BEARER_TOKEN: "${BEARER_TOKEN}" 15 | OPENAI_API_KEY: "${OPENAI_API_KEY}" 16 | qdrant: 17 | image: qdrant/qdrant:v1.0.3 -------------------------------------------------------------------------------- /examples/docker/qdrant/documents.json: -------------------------------------------------------------------------------- 1 | { 2 | "documents": [ 3 | { 4 | "id": "openai", 5 | "text": "OpenAI is an AI research and deployment company. 
Our mission is to ensure that artificial general intelligence benefits all of humanity.", 6 | "metadata": { 7 | "created_at": "2023-03-14" 8 | } 9 | }, 10 | { 11 | "id": "chatgpt", 12 | "text": "ChatGPT is a sibling model to InstructGPT, which is trained to follow an instruction in a prompt and provide a detailed response. The dialogue format makes it possible for ChatGPT to answer followup questions, admit its mistakes, challenge incorrect premises, and reject inappropriate requests." 13 | }, 14 | { 15 | "id": "qdrant", 16 | "text": "Qdrant is a vector similarity engine & vector database. It deploys as an API service providing search for the nearest high-dimensional vectors. With Qdrant, embeddings or neural network encoders can be turned into full-fledged applications for matching, searching, recommending, and much more!", 17 | "metadata": { 18 | "created_at": "2023-03-14", 19 | "author": "Kacper Łukawski" 20 | } 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /examples/docker/qdrant/queries.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries": [ 3 | { 4 | "query": "What vector database should I use?" 5 | } 6 | ] 7 | } -------------------------------------------------------------------------------- /examples/docker/redis/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | redis: 5 | image: redis/redis-stack-server:latest 6 | ports: 7 | - "6379:6379" 8 | volumes: 9 | - redis_data:/data 10 | healthcheck: 11 | test: ["CMD", "redis-cli", "-h", "localhost", "-p", "6379", "ping"] 12 | interval: 2s 13 | timeout: 1m30s 14 | retries: 5 15 | start_period: 5s 16 | 17 | volumes: 18 | redis_data: -------------------------------------------------------------------------------- /examples/memory/README.md: -------------------------------------------------------------------------------- 1 | # ChatGPT Retrieval Plugin with Memory 2 | 3 | This example demonstrates how to give ChatGPT the ability to remember information from conversations and store it in the retrieval plugin for later use. By allowing the model to access the `/upsert` endpoint, it can save snippets from the conversation to the vector database and retrieve them when needed. 4 | 5 | ## Setup 6 | 7 | To enable ChatGPT to save information from conversations, follow these steps: 8 | 9 | - Navigate to the "Configure" tab in the [create GPT page](https://chat.openai.com/gpts/editor), and copy the contents of [openapi.yaml](openapi.yaml) into the custom actions section. This will give the custom GPT access to both the Retrieval Plugin's query and upsert endpoints. 10 | 11 | **Optional:** If you make any changes to the plugin instructions or metadata models, you can also copy the contents of [main.py](main.py) into the main [main.py](../../server/main.py) file. This will allow you to access the openapi.json at `http://0.0.0.0:8000/sub/openapi.json` when you run the app locally. You can convert from JSON to YAML format with [Swagger Editor](https://editor.swagger.io/). Alternatively, you can replace the openapi.yaml file with an openapi.json file. 12 | 13 | After completing these steps, your custom GPT will be able to access your plugin's `/upsert` endpoint and save snippets from the conversation to the vector database. This enables the model to remember information from previous conversations and retrieve it when needed. 
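For reference, when the custom GPT saves a snippet it is simply calling the plugin's `/upsert` endpoint. The sketch below is illustrative and not part of the plugin code: the URL and bearer token are placeholders for your own deployment, and the `Authorization` header can be dropped if your server does not enforce bearer auth.

```python
# Illustrative sketch of the /upsert call the custom GPT triggers when saving a snippet.
# The URL and token below are placeholders for your own deployment.
import httpx

PLUGIN_URL = "https://your-app-url.com"  # placeholder
BEARER_TOKEN = "<your-bearer-token>"     # placeholder

payload = {
    "documents": [
        {
            "text": "The user prefers short, bullet-point summaries.",
            "metadata": {"source": "chat", "author": "assistant"},
        }
    ]
}

response = httpx.post(
    f"{PLUGIN_URL}/upsert",
    json=payload,
    headers={"Authorization": f"Bearer {BEARER_TOKEN}"},
)
print(response.json())  # e.g. {"ids": ["<assigned-document-id>"]}
```

The `ids` in the response are the document ids assigned by the datastore; the stored snippets are surfaced again later through the `/query` endpoint.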
14 | -------------------------------------------------------------------------------- /examples/memory/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information, or asks you to save information for later.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "user_http", 9 | "authorization_type": "bearer" 10 | }, 11 | "api": { 12 | "type": "openapi", 13 | "url": "https://your-app-url.com/.well-known/openapi.yaml", 14 | "has_user_authentication": false 15 | }, 16 | "logo_url": "https://your-app-url.com/.well-known/logo.png", 17 | "contact_email": "hello@contact.com", 18 | "legal_info_url": "hello@legal.com" 19 | } -------------------------------------------------------------------------------- /examples/providers/supabase/.gitignore: -------------------------------------------------------------------------------- 1 | # Supabase 2 | .branches 3 | .temp 4 | -------------------------------------------------------------------------------- /examples/providers/supabase/config.toml: -------------------------------------------------------------------------------- 1 | # A string used to distinguish different Supabase projects on the same host. Defaults to the working 2 | # directory name when running `supabase init`. 3 | project_id = "providers" 4 | 5 | [api] 6 | # Port to use for the API URL. 7 | port = 54321 8 | # Schemas to expose in your API. Tables, views and stored procedures in this schema will get API 9 | # endpoints. public and storage are always included. 10 | schemas = ["public", "storage", "graphql_public"] 11 | # Extra schemas to add to the search_path of every request. public is always included. 12 | extra_search_path = ["public", "extensions"] 13 | # The maximum number of rows returns from a view, table, or stored procedure. Limits payload size 14 | # for accidental or malicious requests. 15 | max_rows = 1000 16 | 17 | [db] 18 | # Port to use for the local database URL. 19 | port = 54322 20 | # The database major version to use. This has to be the same as your remote database's. Run `SHOW 21 | # server_version;` on the remote database to check. 22 | major_version = 15 23 | 24 | [studio] 25 | # Port to use for Supabase Studio. 26 | port = 54323 27 | 28 | # Email testing server. Emails sent with the local dev setup are not actually sent - rather, they 29 | # are monitored, and you can view the emails that would have been sent from the web interface. 30 | [inbucket] 31 | # Port to use for the email testing server web interface. 32 | port = 54324 33 | smtp_port = 54325 34 | pop3_port = 54326 35 | 36 | [storage] 37 | # The maximum file size allowed (e.g. "5MB", "500KB"). 38 | file_size_limit = "50MiB" 39 | 40 | [auth] 41 | # The base URL of your website. Used as an allow-list for redirects and for constructing URLs used 42 | # in emails. 43 | site_url = "http://localhost:3000" 44 | # A list of *exact* URLs that auth providers are permitted to redirect to post authentication. 45 | additional_redirect_urls = ["https://localhost:3000"] 46 | # How long tokens are valid for, in seconds. 
Defaults to 3600 (1 hour), maximum 604,800 seconds (one 47 | # week). 48 | jwt_expiry = 3600 49 | # Allow/disallow new user signups to your project. 50 | enable_signup = true 51 | 52 | [auth.email] 53 | # Allow/disallow new user signups via email to your project. 54 | enable_signup = true 55 | # If enabled, a user will be required to confirm any email change on both the old, and new email 56 | # addresses. If disabled, only the new email is required to confirm. 57 | double_confirm_changes = true 58 | # If enabled, users need to confirm their email address before signing in. 59 | enable_confirmations = false 60 | 61 | # Use an external OAuth provider. The full list of providers are: `apple`, `azure`, `bitbucket`, 62 | # `discord`, `facebook`, `github`, `gitlab`, `google`, `keycloak`, `linkedin`, `notion`, `twitch`, 63 | # `twitter`, `slack`, `spotify`, `workos`, `zoom`. 64 | [auth.external.apple] 65 | enabled = false 66 | client_id = "" 67 | secret = "" 68 | # Overrides the default auth redirectUrl. 69 | redirect_uri = "" 70 | # Overrides the default auth provider URL. Used to support self-hosted gitlab, single-tenant Azure, 71 | # or any other third-party OIDC providers. 72 | url = "" 73 | -------------------------------------------------------------------------------- /examples/providers/supabase/migrations/20230414142107_init_pg_vector.sql: -------------------------------------------------------------------------------- 1 | create extension vector; 2 | 3 | create table if not exists documents ( 4 | id text primary key default gen_random_uuid()::text, 5 | source text, 6 | source_id text, 7 | content text, 8 | document_id text, 9 | author text, 10 | url text, 11 | created_at timestamptz default now(), 12 | embedding vector(256) -- 256 is the default dimension, change depending on dimensionality of your chosen embeddings model 13 | ); 14 | 15 | create index ix_documents_document_id on documents using btree ( document_id ); 16 | create index ix_documents_source on documents using btree ( source ); 17 | create index ix_documents_source_id on documents using btree ( source_id ); 18 | create index ix_documents_author on documents using btree ( author ); 19 | create index ix_documents_created_at on documents using brin ( created_at ); 20 | 21 | alter table documents enable row level security; 22 | 23 | create or replace function match_page_sections(in_embedding vector(256) -- 256 is the default dimension, change depending on dimensionality of your chosen embeddings model 24 | , in_match_count int default 3 25 | , in_document_id text default '%%' 26 | , in_source_id text default '%%' 27 | , in_source text default '%%' 28 | , in_author text default '%%' 29 | , in_start_date timestamptz default '-infinity' 30 | , in_end_date timestamptz default 'infinity') 31 | returns table (id text 32 | , source text 33 | , source_id text 34 | , document_id text 35 | , url text 36 | , created_at timestamptz 37 | , author text 38 | , content text 39 | , embedding vector(256) -- 256 is the default dimension, change depending on dimensionality of your chosen embeddings model 40 | , similarity float) 41 | language plpgsql 42 | as $$ 43 | #variable_conflict use_variable 44 | begin 45 | return query 46 | select 47 | documents.id, 48 | documents.source, 49 | documents.source_id, 50 | documents.document_id, 51 | documents.url, 52 | documents.created_at, 53 | documents.author, 54 | documents.content, 55 | documents.embedding, 56 | (documents.embedding <#> in_embedding) * -1 as similarity 57 | from documents 58 | 59 | 
where in_start_date <= documents.created_at and 60 | documents.created_at <= in_end_date and 61 | (documents.source_id like in_source_id or documents.source_id is null) and 62 | (documents.source like in_source or documents.source is null) and 63 | (documents.author like in_author or documents.author is null) and 64 | (documents.document_id like in_document_id or documents.document_id is null) 65 | 66 | order by documents.embedding <#> in_embedding 67 | 68 | limit in_match_count; 69 | end; 70 | $$; -------------------------------------------------------------------------------- /examples/providers/supabase/seed.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/examples/providers/supabase/seed.sql -------------------------------------------------------------------------------- /local_server/ai-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": "v1", 3 | "name_for_model": "retrieval", 4 | "name_for_human": "Retrieval Plugin", 5 | "description_for_model": "Plugin for searching through the user's documents (such as files, emails, and more) to find answers to questions and retrieve relevant information. Use it whenever a user asks something that might be found in their personal information.", 6 | "description_for_human": "Search through your documents.", 7 | "auth": { 8 | "type": "none" 9 | }, 10 | "api": { 11 | "type": "openapi", 12 | "url": "http://localhost:3333/.well-known/openapi.yaml" 13 | }, 14 | "logo_url": "http://localhost:3333/.well-known/logo.png", 15 | "contact_email": "hello@contact.com", 16 | "legal_info_url": "hello@legal.com" 17 | } 18 | 19 | -------------------------------------------------------------------------------- /local_server/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/local_server/logo.png -------------------------------------------------------------------------------- /local_server/main.py: -------------------------------------------------------------------------------- 1 | # This is a version of the main.py file found in ../../../server/main.py for testing the plugin locally. 2 | # Use the command `poetry run dev` to run this. 
3 | from typing import Optional 4 | import uvicorn 5 | from fastapi import FastAPI, File, Form, HTTPException, Body, UploadFile 6 | from loguru import logger 7 | 8 | from models.api import ( 9 | DeleteRequest, 10 | DeleteResponse, 11 | QueryRequest, 12 | QueryResponse, 13 | UpsertRequest, 14 | UpsertResponse, 15 | ) 16 | from datastore.factory import get_datastore 17 | from services.file import get_document_from_file 18 | 19 | from starlette.responses import FileResponse 20 | 21 | from models.models import DocumentMetadata, Source 22 | from fastapi.middleware.cors import CORSMiddleware 23 | 24 | 25 | app = FastAPI() 26 | 27 | PORT = 3333 28 | 29 | origins = [ 30 | f"http://localhost:{PORT}", 31 | "https://chat.openai.com", 32 | ] 33 | 34 | app.add_middleware( 35 | CORSMiddleware, 36 | allow_origins=origins, 37 | allow_credentials=True, 38 | allow_methods=["*"], 39 | allow_headers=["*"], 40 | ) 41 | 42 | 43 | @app.route("/.well-known/ai-plugin.json") 44 | async def get_manifest(request): 45 | file_path = "./local_server/ai-plugin.json" 46 | simple_headers = {} 47 | simple_headers["Access-Control-Allow-Private-Network"] = "true" 48 | return FileResponse(file_path, media_type="application/json", headers=simple_headers) 49 | 50 | 51 | @app.route("/.well-known/logo.png") 52 | async def get_logo(request): 53 | file_path = "./local_server/logo.png" 54 | return FileResponse(file_path, media_type="image/png") 55 | 56 | 57 | @app.route("/.well-known/openapi.yaml") 58 | async def get_openapi(request): 59 | file_path = "./local_server/openapi.yaml" 60 | return FileResponse(file_path, media_type="text/yaml") 61 | 62 | 63 | @app.post( 64 | "/upsert-file", 65 | response_model=UpsertResponse, 66 | ) 67 | async def upsert_file( 68 | file: UploadFile = File(...), 69 | metadata: Optional[str] = Form(None), 70 | ): 71 | try: 72 | metadata_obj = ( 73 | DocumentMetadata.parse_raw(metadata) 74 | if metadata 75 | else DocumentMetadata(source=Source.file) 76 | ) 77 | except Exception: 78 | metadata_obj = DocumentMetadata(source=Source.file) 79 | 80 | document = await get_document_from_file(file, metadata_obj) 81 | 82 | try: 83 | ids = await datastore.upsert([document]) 84 | return UpsertResponse(ids=ids) 85 | except Exception as e: 86 | logger.error(e) 87 | raise HTTPException(status_code=500, detail=str(e)) 88 | 89 | 90 | @app.post( 91 | "/upsert", 92 | response_model=UpsertResponse, 93 | ) 94 | async def upsert( 95 | request: UpsertRequest = Body(...), 96 | ): 97 | try: 98 | ids = await datastore.upsert(request.documents) 99 | return UpsertResponse(ids=ids) 100 | except Exception as e: 101 | logger.error(e) 102 | raise HTTPException(status_code=500, detail="Internal Service Error") 103 | 104 | 105 | @app.post("/query", response_model=QueryResponse) 106 | async def query_main(request: QueryRequest = Body(...)): 107 | try: 108 | results = await datastore.query( 109 | request.queries, 110 | ) 111 | return QueryResponse(results=results) 112 | except Exception as e: 113 | logger.error(e) 114 | raise HTTPException(status_code=500, detail="Internal Service Error") 115 | 116 | 117 | @app.delete( 118 | "/delete", 119 | response_model=DeleteResponse, 120 | ) 121 | async def delete( 122 | request: DeleteRequest = Body(...), 123 | ): 124 | if not (request.ids or request.filter or request.delete_all): 125 | raise HTTPException( 126 | status_code=400, 127 | detail="One of ids, filter, or delete_all is required", 128 | ) 129 | try: 130 | success = await datastore.delete( 131 | ids=request.ids, 132 | filter=request.filter, 133 | 
delete_all=request.delete_all, 134 | ) 135 | return DeleteResponse(success=success) 136 | except Exception as e: 137 | logger.error(e) 138 | raise HTTPException(status_code=500, detail="Internal Service Error") 139 | 140 | 141 | @app.on_event("startup") 142 | async def startup(): 143 | global datastore 144 | datastore = await get_datastore() 145 | 146 | 147 | def start(): 148 | uvicorn.run("local_server.main:app", host="localhost", port=PORT, reload=True) 149 | -------------------------------------------------------------------------------- /local_server/openapi.yaml: -------------------------------------------------------------------------------- 1 | openapi: 3.0.2 2 | info: 3 | title: Retrieval Plugin API 4 | description: A retrieval API for querying and filtering documents based on natural language queries and metadata 5 | version: 1.0.0 6 | servers: 7 | - url: http://localhost:3333 8 | paths: 9 | /query: 10 | post: 11 | summary: Query 12 | description: Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. Split queries if ResponseTooLargeError occurs. 13 | operationId: query_query_post 14 | requestBody: 15 | content: 16 | application/json: 17 | schema: 18 | $ref: "#/components/schemas/QueryRequest" 19 | required: true 20 | responses: 21 | "200": 22 | description: Successful Response 23 | content: 24 | application/json: 25 | schema: 26 | $ref: "#/components/schemas/QueryResponse" 27 | "422": 28 | description: Validation Error 29 | content: 30 | application/json: 31 | schema: 32 | $ref: "#/components/schemas/HTTPValidationError" 33 | components: 34 | schemas: 35 | DocumentChunkMetadata: 36 | title: DocumentChunkMetadata 37 | type: object 38 | properties: 39 | source: 40 | $ref: "#/components/schemas/Source" 41 | source_id: 42 | title: Source Id 43 | type: string 44 | url: 45 | title: Url 46 | type: string 47 | created_at: 48 | title: Created At 49 | type: string 50 | author: 51 | title: Author 52 | type: string 53 | document_id: 54 | title: Document Id 55 | type: string 56 | DocumentChunkWithScore: 57 | title: DocumentChunkWithScore 58 | required: 59 | - text 60 | - metadata 61 | - score 62 | type: object 63 | properties: 64 | id: 65 | title: Id 66 | type: string 67 | text: 68 | title: Text 69 | type: string 70 | metadata: 71 | $ref: "#/components/schemas/DocumentChunkMetadata" 72 | embedding: 73 | title: Embedding 74 | type: array 75 | items: 76 | type: number 77 | score: 78 | title: Score 79 | type: number 80 | DocumentMetadataFilter: 81 | title: DocumentMetadataFilter 82 | type: object 83 | properties: 84 | document_id: 85 | title: Document Id 86 | type: string 87 | source: 88 | $ref: "#/components/schemas/Source" 89 | source_id: 90 | title: Source Id 91 | type: string 92 | author: 93 | title: Author 94 | type: string 95 | start_date: 96 | title: Start Date 97 | type: string 98 | end_date: 99 | title: End Date 100 | type: string 101 | HTTPValidationError: 102 | title: HTTPValidationError 103 | type: object 104 | properties: 105 | detail: 106 | title: Detail 107 | type: array 108 | items: 109 | $ref: "#/components/schemas/ValidationError" 110 | Query: 111 | title: Query 112 | required: 113 | - query 114 | type: object 115 | properties: 116 | query: 117 | title: Query 118 | type: string 119 | filter: 120 | $ref: "#/components/schemas/DocumentMetadataFilter" 121 | top_k: 122 | title: Top K 123 | type: integer 124 | default: 3 125 | QueryRequest: 126 | 
title: QueryRequest 127 | required: 128 | - queries 129 | type: object 130 | properties: 131 | queries: 132 | title: Queries 133 | type: array 134 | items: 135 | $ref: "#/components/schemas/Query" 136 | QueryResponse: 137 | title: QueryResponse 138 | required: 139 | - results 140 | type: object 141 | properties: 142 | results: 143 | title: Results 144 | type: array 145 | items: 146 | $ref: "#/components/schemas/QueryResult" 147 | QueryResult: 148 | title: QueryResult 149 | required: 150 | - query 151 | - results 152 | type: object 153 | properties: 154 | query: 155 | title: Query 156 | type: string 157 | results: 158 | title: Results 159 | type: array 160 | items: 161 | $ref: "#/components/schemas/DocumentChunkWithScore" 162 | Source: 163 | title: Source 164 | enum: 165 | - email 166 | - file 167 | - chat 168 | type: string 169 | description: An enumeration. 170 | ValidationError: 171 | title: ValidationError 172 | required: 173 | - loc 174 | - msg 175 | - type 176 | type: object 177 | properties: 178 | loc: 179 | title: Location 180 | type: array 181 | items: 182 | anyOf: 183 | - type: string 184 | - type: integer 185 | msg: 186 | title: Message 187 | type: string 188 | type: 189 | title: Error Type 190 | type: string 191 | -------------------------------------------------------------------------------- /models/api.py: -------------------------------------------------------------------------------- 1 | from models.models import ( 2 | Document, 3 | DocumentMetadataFilter, 4 | Query, 5 | QueryResult, 6 | ) 7 | from pydantic import BaseModel 8 | from typing import List, Optional 9 | 10 | 11 | class UpsertRequest(BaseModel): 12 | documents: List[Document] 13 | 14 | 15 | class UpsertResponse(BaseModel): 16 | ids: List[str] 17 | 18 | 19 | class QueryRequest(BaseModel): 20 | queries: List[Query] 21 | 22 | 23 | class QueryResponse(BaseModel): 24 | results: List[QueryResult] 25 | 26 | 27 | class DeleteRequest(BaseModel): 28 | ids: Optional[List[str]] = None 29 | filter: Optional[DocumentMetadataFilter] = None 30 | delete_all: Optional[bool] = False 31 | 32 | 33 | class DeleteResponse(BaseModel): 34 | success: bool 35 | -------------------------------------------------------------------------------- /models/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import List, Optional 3 | from enum import Enum 4 | 5 | 6 | class Source(str, Enum): 7 | email = "email" 8 | file = "file" 9 | chat = "chat" 10 | 11 | 12 | class DocumentMetadata(BaseModel): 13 | source: Optional[Source] = None 14 | source_id: Optional[str] = None 15 | url: Optional[str] = None 16 | created_at: Optional[str] = None 17 | author: Optional[str] = None 18 | 19 | 20 | class DocumentChunkMetadata(DocumentMetadata): 21 | document_id: Optional[str] = None 22 | 23 | 24 | class DocumentChunk(BaseModel): 25 | id: Optional[str] = None 26 | text: str 27 | metadata: DocumentChunkMetadata 28 | embedding: Optional[List[float]] = None 29 | 30 | 31 | class DocumentChunkWithScore(DocumentChunk): 32 | score: float 33 | 34 | 35 | class Document(BaseModel): 36 | id: Optional[str] = None 37 | text: str 38 | metadata: Optional[DocumentMetadata] = None 39 | 40 | 41 | class DocumentWithChunks(Document): 42 | chunks: List[DocumentChunk] 43 | 44 | 45 | class DocumentMetadataFilter(BaseModel): 46 | document_id: Optional[str] = None 47 | source: Optional[Source] = None 48 | source_id: Optional[str] = None 49 | author: Optional[str] = None 50 | start_date: 
Optional[str] = None # any date string format 51 | end_date: Optional[str] = None # any date string format 52 | 53 | 54 | class Query(BaseModel): 55 | query: str 56 | filter: Optional[DocumentMetadataFilter] = None 57 | top_k: Optional[int] = 3 58 | 59 | 60 | class QueryWithEmbedding(Query): 61 | embedding: List[float] 62 | 63 | 64 | class QueryResult(BaseModel): 65 | query: str 66 | results: List[DocumentChunkWithScore] 67 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "chatgpt-retrieval-plugin" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["isafulf "] 6 | readme = "README.md" 7 | packages = [{include = "server"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.10" 11 | fastapi = "^0.92.0" 12 | uvicorn = "^0.20.0" 13 | openai = "^0.27.5" 14 | python-dotenv = "^0.21.1" 15 | pydantic = "^1.10.5" 16 | tenacity = "^8.2.1" 17 | tiktoken = "^0.2.0" 18 | numpy = "^1.24.2" 19 | docx2txt = "^0.8" 20 | PyPDF2 = "^3.0.1" 21 | python-pptx = "^0.6.21" 22 | python-multipart = "^0.0.6" 23 | arrow = "^1.2.3" 24 | chromadb = "^0.3.25" 25 | pinecone-client = "^2.1.0" 26 | weaviate-client = "^3.12.0" 27 | pymilvus = "^2.2.2" 28 | qdrant-client = {version = "^1.0.4", python = "<3.12"} 29 | redis = "4.5.4" 30 | supabase = "^1.0.2" 31 | psycopg2 = "^2.9.5" 32 | llama-index = "0.5.4" 33 | azure-identity = "^1.12.0" 34 | azure-search-documents = "11.4.0b8" 35 | pgvector = "^0.1.7" 36 | psycopg2cffi = {version = "^2.9.0", optional = true} 37 | loguru = "^0.7.0" 38 | elasticsearch = "8.8.2" 39 | pymongo = "^4.3.3" 40 | motor = "^3.3.2" 41 | 42 | [tool.poetry.scripts] 43 | start = "server.main:start" 44 | dev = "local_server.main:start" 45 | 46 | [tool.poetry.extras] 47 | postgresql = ["psycopg2cffi"] 48 | 49 | [tool.poetry.group.dev.dependencies] 50 | httpx = "^0.23.3" 51 | pytest = "^7.2.1" 52 | pytest-cov = "^4.0.0" 53 | pytest-asyncio = "^0.20.3" 54 | 55 | [build-system] 56 | requires = ["poetry-core"] 57 | build-backend = "poetry.core.masonry.api" 58 | 59 | [tool.pytest.ini_options] 60 | pythonpath = [ 61 | "." 62 | ] 63 | asyncio_mode="auto" 64 | -------------------------------------------------------------------------------- /scripts/process_json/README.md: -------------------------------------------------------------------------------- 1 | ## Process a JSON File 2 | 3 | This script is a utility to process a file dump of documents in a JSON format and store them in the vector database with some metadata. It can also optionally screen the documents for personally identifiable information (PII) using a language model, and skip them if detected. Additionally, the script can extract metadata from the document using a language model. You can customize the PII detection function in [`services/pii_detection`](../../services/pii_detection.py) and the metadata extraction function in [`services/extract_metadata`](../../services/extract_metadata.py) for your use case. 4 | 5 | ## Usage 6 | 7 | To run this script from the terminal, navigate to this folder and use the following command: 8 | 9 | ``` 10 | python process_json.py --filepath path/to/file_dump.json --custom_metadata '{"source": "file"}' --screen_for_pii True --extract_metadata True 11 | ``` 12 | 13 | where: 14 | 15 | - `path/to/file_dump.json` is the name or path to the file dump to be processed. 
The format of this JSON file should be a list of JSON objects, where each object represents a document. The JSON object should have a subset of the following fields: `id`, `text`, `source`, `source_id`, `url`, `created_at`, and `author`. The `text` field is required, while the rest are optional and will be used to populate the metadata of the document. If the `id` field is not specified, a random UUID will be generated for the document. 16 | - `--custom_metadata` is an optional JSON string of key-value pairs to update the metadata of the documents. For example, `{"source": "file"}` will add a `source` field with the value `file` to the metadata of each document. The default value is an empty JSON object (`{}`). 17 | - `--screen_for_pii` is an optional boolean flag to indicate whether to use the PII detection function or not. If set to `True`, the script will use the `screen_text_for_pii` function from the [`services/pii_detection`](../../services/pii_detection.py) module to check if the document text contains any PII using a language model. If PII is detected, the script will print a warning and skip the document. The default value is `False`. 18 | - `--extract_metadata` is an optional boolean flag to indicate whether to try to extract metadata from the document using a language model. If set to `True`, the script will use the `extract_metadata_from_document` function from the [`services/extract_metadata`](../../services/extract_metadata.py) module to extract metadata from the document text and update the metadata object accordingly. The default value is`False`. 19 | 20 | The script will load the JSON file as a list of dictionaries, iterate over the data, create document objects, and batch upsert them into the database. It will also print some progress messages and error messages if any, as well as the number and content of the skipped items due to errors or PII detection. 21 | 22 | You can use `python process_json.py -h` to get a summary of the options and their descriptions. 23 | 24 | Test the script with the example file, [example.json](example.json). 
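For orientation, the sketch below is illustrative only (it is not part of the script) and shows how a single record like those in example.json maps onto the plugin's `Document` and `DocumentMetadata` models from `models/models.py`; `process_json.py` performs this mapping for every item in the file.

```python
# Illustrative mapping of one JSON record onto the plugin's models.
# Assumes the repository root is on PYTHONPATH so that `models` is importable.
from models.models import Document, DocumentMetadata, Source

item = {
    "id": "123",
    "text": "This is a document about something",
    "source": "file",
    "source_id": "https://example.com/doc1",
    "url": "https://example.com/doc1",
    "created_at": "2021-01-01T12:00:00Z",
    "author": "Alice",
}

document = Document(
    id=item.get("id"),  # the script generates a random UUID when this is missing
    text=item["text"],  # required; items without text are skipped
    metadata=DocumentMetadata(
        source=Source(item["source"]) if item.get("source") else None,
        source_id=item.get("source_id"),
        url=item.get("url"),
        created_at=item.get("created_at"),
        author=item.get("author"),
    ),
)
```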
25 | -------------------------------------------------------------------------------- /scripts/process_json/example.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "123", 4 | "text": "This is a document about something", 5 | "source": "file", 6 | "source_id": "https://example.com/doc1", 7 | "url": "https://example.com/doc1", 8 | "created_at": "2021-01-01T12:00:00Z", 9 | "author": "Alice" 10 | }, 11 | { 12 | "text": "This is another document about something else", 13 | "source": "file", 14 | "source_id": "doc2.txt", 15 | "author": "Bob" 16 | }, 17 | { 18 | "id": "456", 19 | "text": "This is Alice's phone number: 123-456-7890", 20 | "source": "email", 21 | "source_id": "567", 22 | "created_at": "2021-01-02T13:00:00Z", 23 | "author": "Alice" 24 | } 25 | ] -------------------------------------------------------------------------------- /scripts/process_json/process_json.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import json 3 | import argparse 4 | import asyncio 5 | 6 | from loguru import logger 7 | from models.models import Document, DocumentMetadata 8 | from datastore.datastore import DataStore 9 | from datastore.factory import get_datastore 10 | from services.extract_metadata import extract_metadata_from_document 11 | from services.pii_detection import screen_text_for_pii 12 | 13 | DOCUMENT_UPSERT_BATCH_SIZE = 50 14 | 15 | 16 | async def process_json_dump( 17 | filepath: str, 18 | datastore: DataStore, 19 | custom_metadata: dict, 20 | screen_for_pii: bool, 21 | extract_metadata: bool, 22 | ): 23 | # load the json file as a list of dictionaries 24 | with open(filepath) as json_file: 25 | data = json.load(json_file) 26 | 27 | documents = [] 28 | skipped_items = [] 29 | # iterate over the data and create document objects 30 | for item in data: 31 | if len(documents) % 20 == 0: 32 | logger.info(f"Processed {len(documents)} documents") 33 | 34 | try: 35 | # get the id, text, source, source_id, url, created_at and author from the item 36 | # use default values if not specified 37 | id = item.get("id", None) 38 | text = item.get("text", None) 39 | source = item.get("source", None) 40 | source_id = item.get("source_id", None) 41 | url = item.get("url", None) 42 | created_at = item.get("created_at", None) 43 | author = item.get("author", None) 44 | 45 | if not text: 46 | logger.info("No document text, skipping...") 47 | continue 48 | 49 | # create a metadata object with the source, source_id, url, created_at and author 50 | metadata = DocumentMetadata( 51 | source=source, 52 | source_id=source_id, 53 | url=url, 54 | created_at=created_at, 55 | author=author, 56 | ) 57 | logger.info("metadata: ", str(metadata)) 58 | 59 | # update metadata with custom values 60 | for key, value in custom_metadata.items(): 61 | if hasattr(metadata, key): 62 | setattr(metadata, key, value) 63 | 64 | # screen for pii if requested 65 | if screen_for_pii: 66 | pii_detected = screen_text_for_pii(text) 67 | # if pii detected, print a warning and skip the document 68 | if pii_detected: 69 | logger.info("PII detected in document, skipping") 70 | skipped_items.append(item) # add the skipped item to the list 71 | continue 72 | 73 | # extract metadata if requested 74 | if extract_metadata: 75 | # extract metadata from the document text 76 | extracted_metadata = extract_metadata_from_document( 77 | f"Text: {text}; Metadata: {str(metadata)}" 78 | ) 79 | # get a Metadata object from the extracted metadata 80 | 
metadata = DocumentMetadata(**extracted_metadata) 81 | 82 | # create a document object with the id or a random id, text and metadata 83 | document = Document( 84 | id=id or str(uuid.uuid4()), 85 | text=text, 86 | metadata=metadata, 87 | ) 88 | documents.append(document) 89 | except Exception as e: 90 | # log the error and continue with the next item 91 | logger.error(f"Error processing {item}: {e}") 92 | skipped_items.append(item) # add the skipped item to the list 93 | 94 | # do this in batches, the upsert method already batches documents but this allows 95 | # us to add more descriptive logging 96 | for i in range(0, len(documents), DOCUMENT_UPSERT_BATCH_SIZE): 97 | # Get the text of the chunks in the current batch 98 | batch_documents = documents[i : i + DOCUMENT_UPSERT_BATCH_SIZE] 99 | logger.info(f"Upserting batch of {len(batch_documents)} documents, batch {i}") 100 | logger.info("documents: ", documents) 101 | await datastore.upsert(batch_documents) 102 | 103 | # print the skipped items 104 | logger.info(f"Skipped {len(skipped_items)} items due to errors or PII detection") 105 | for item in skipped_items: 106 | logger.info(item) 107 | 108 | 109 | async def main(): 110 | # parse the command-line arguments 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument("--filepath", required=True, help="The path to the json dump") 113 | parser.add_argument( 114 | "--custom_metadata", 115 | default="{}", 116 | help="A JSON string of key-value pairs to update the metadata of the documents", 117 | ) 118 | parser.add_argument( 119 | "--screen_for_pii", 120 | default=False, 121 | type=bool, 122 | help="A boolean flag to indicate whether to try the PII detection function (using a language model)", 123 | ) 124 | parser.add_argument( 125 | "--extract_metadata", 126 | default=False, 127 | type=bool, 128 | help="A boolean flag to indicate whether to try to extract metadata from the document (using a language model)", 129 | ) 130 | args = parser.parse_args() 131 | 132 | # get the arguments 133 | filepath = args.filepath 134 | custom_metadata = json.loads(args.custom_metadata) 135 | screen_for_pii = args.screen_for_pii 136 | extract_metadata = args.extract_metadata 137 | 138 | # initialize the db instance once as a global variable 139 | datastore = await get_datastore() 140 | # process the json dump 141 | await process_json_dump( 142 | filepath, datastore, custom_metadata, screen_for_pii, extract_metadata 143 | ) 144 | 145 | 146 | if __name__ == "__main__": 147 | asyncio.run(main()) 148 | -------------------------------------------------------------------------------- /scripts/process_jsonl/README.md: -------------------------------------------------------------------------------- 1 | ## Process a JSONL File 2 | 3 | This script is a utility to process a file dump of documents in a JSONL format and store them in the vector database with some metadata. It can also optionally screen the documents for personally identifiable information (PII) using a language model, and skip them if detected. Additionally, the script can extract metadata from the document using a language model. You can customize the PII detection function in [`services/pii_detection`](../../services/pii_detection.py) and the metadata extraction function in [`services/extract_metadata`](../../services/extract_metadata.py) for your use case. 
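For example, a minimal regex-based stand-in for `screen_text_for_pii` might look like the sketch below; the shipped function uses a language model, and the patterns here are illustrative assumptions rather than an exhaustive PII check.

```python
# Sketch of a lightweight, regex-based replacement for
# services/pii_detection.screen_text_for_pii (the default uses a language model).
# The patterns are deliberately simple and only illustrative.
import re

EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
PHONE_RE = re.compile(r"\+?\d[\d\s().-]{7,}\d")


def screen_text_for_pii(text: str) -> bool:
    """Return True if the text appears to contain an email address or phone number."""
    return bool(EMAIL_RE.search(text) or PHONE_RE.search(text))
```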
4 | 5 | ## Usage 6 | 7 | To run this script from the terminal, navigate to this folder and use the following command: 8 | 9 | ``` 10 | python process_jsonl.py --filepath path/to/file_dump.jsonl --custom_metadata '{"source": "email"}' --screen_for_pii True --extract_metadata True 11 | ``` 12 | 13 | where: 14 | 15 | - `path/to/file_dump.jsonl` is the name or path to the file dump to be processed. The format of this JSONL file should be a newline-delimited JSON file, where each line is a valid JSON object representing a document. The JSON object should have a subset of the following fields: `id`, `text`, `source`, `source_id`, `url`, `created_at`, and `author`. The `text` field is required, while the rest are optional and will be used to populate the metadata of the document. If the `id` field is not specified, a random UUID will be generated for the document. 16 | - `--custom_metadata` is an optional JSON string of key-value pairs to update the metadata of the documents. For example, `{"source": "file"}` will add a `source` field with the value `file` to the metadata of each document. The default value is an empty JSON object (`{}`). 17 | - `--screen_for_pii` is an optional boolean flag to indicate whether to use the PII detection function or not. If set to `True`, the script will use the `screen_text_for_pii` function from the [`services/pii_detection`](../../services/pii_detection.py) module to check if the document text contains any PII using a language model. If PII is detected, the script will print a warning and skip the document. The default value is `False`. 18 | - `--extract_metadata` is an optional boolean flag to indicate whether to try to extract metadata from the document using a language model. If set to `True`, the script will use the `extract_metadata_from_document` function from the [`services/extract_metadata`](../../services/extract_metadata.py) module to extract metadata from the document text and update the metadata object accordingly. The default value is`False`. 19 | 20 | The script will open the JSONL file as a generator of dictionaries, iterate over the data, create document objects, and batch upsert them into the database. It will also print some progress messages and error messages if any, as well as the number and content of the skipped items due to errors, PII detection, or metadata extraction issues. 21 | 22 | You can use `python process_jsonl.py -h` to get a summary of the options and their descriptions. 23 | 24 | Test the script with the example file, [example.jsonl](example.jsonl). 25 | -------------------------------------------------------------------------------- /scripts/process_jsonl/example.jsonl: -------------------------------------------------------------------------------- 1 | {"id": "4", "text": "This document only has an ID and text. The other fields are missing."} 2 | {"text": "This document has no ID, but it has text and a source.", "source": "email"} 3 | {"id": "6", "text": "This document has an ID, text, and author, but no source information.", "author": "John Doe"} 4 | {"text": "This document has text, a source, and a URL, but no ID or author.", "source": "file", "url": "https://example.com/file/2"} 5 | {"id": "8", "text": "This document has an ID, text, source, and created_at timestamp, but no author or URL.", "source": "chat", "created_at": "2022-01-04T00:00:00"} 6 | {"id": "9", "text": "This document contains PII. 
John Smith's email address is john.smith@example.com and his phone number is +1 (555) 123-4567.", "source": "email", "source_id": "email_2", "url": "https://example.com/email/2", "created_at": "2022-01-05T00:00:00", "author": "John Smith"} -------------------------------------------------------------------------------- /scripts/process_jsonl/process_jsonl.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import json 3 | import argparse 4 | import asyncio 5 | 6 | from loguru import logger 7 | from models.models import Document, DocumentMetadata 8 | from datastore.datastore import DataStore 9 | from datastore.factory import get_datastore 10 | from services.extract_metadata import extract_metadata_from_document 11 | from services.pii_detection import screen_text_for_pii 12 | 13 | DOCUMENT_UPSERT_BATCH_SIZE = 50 14 | 15 | 16 | async def process_jsonl_dump( 17 | filepath: str, 18 | datastore: DataStore, 19 | custom_metadata: dict, 20 | screen_for_pii: bool, 21 | extract_metadata: bool, 22 | ): 23 | # open the jsonl file as a generator of dictionaries 24 | with open(filepath) as jsonl_file: 25 | data = [json.loads(line) for line in jsonl_file] 26 | 27 | documents = [] 28 | skipped_items = [] 29 | # iterate over the data and create document objects 30 | for item in data: 31 | if len(documents) % 20 == 0: 32 | logger.info(f"Processed {len(documents)} documents") 33 | 34 | try: 35 | # get the id, text, source, source_id, url, created_at and author from the item 36 | # use default values if not specified 37 | id = item.get("id", None) 38 | text = item.get("text", None) 39 | source = item.get("source", None) 40 | source_id = item.get("source_id", None) 41 | url = item.get("url", None) 42 | created_at = item.get("created_at", None) 43 | author = item.get("author", None) 44 | 45 | if not text: 46 | logger.info("No document text, skipping...") 47 | continue 48 | 49 | # create a metadata object with the source, source_id, url, created_at and author 50 | metadata = DocumentMetadata( 51 | source=source, 52 | source_id=source_id, 53 | url=url, 54 | created_at=created_at, 55 | author=author, 56 | ) 57 | 58 | # update metadata with custom values 59 | for key, value in custom_metadata.items(): 60 | if hasattr(metadata, key): 61 | setattr(metadata, key, value) 62 | 63 | # screen for pii if requested 64 | if screen_for_pii: 65 | pii_detected = screen_text_for_pii(text) 66 | # if pii detected, print a warning and skip the document 67 | if pii_detected: 68 | logger.info("PII detected in document, skipping") 69 | skipped_items.append(item) # add the skipped item to the list 70 | continue 71 | 72 | # extract metadata if requested 73 | if extract_metadata: 74 | # extract metadata from the document text 75 | extracted_metadata = extract_metadata_from_document( 76 | f"Text: {text}; Metadata: {str(metadata)}" 77 | ) 78 | # get a Metadata object from the extracted metadata 79 | metadata = DocumentMetadata(**extracted_metadata) 80 | 81 | # create a document object with the id, text and metadata 82 | document = Document( 83 | id=id, 84 | text=text, 85 | metadata=metadata, 86 | ) 87 | documents.append(document) 88 | except Exception as e: 89 | # log the error and continue with the next item 90 | logger.error(f"Error processing {item}: {e}") 91 | skipped_items.append(item) # add the skipped item to the list 92 | 93 | # do this in batches, the upsert method already batches documents but this allows 94 | # us to add more descriptive logging 95 | for i in range(0, 
len(documents), DOCUMENT_UPSERT_BATCH_SIZE): 96 | # Get the text of the chunks in the current batch 97 | batch_documents = documents[i : i + DOCUMENT_UPSERT_BATCH_SIZE] 98 | logger.info(f"Upserting batch of {len(batch_documents)} documents, batch {i}") 99 | await datastore.upsert(batch_documents) 100 | 101 | # print the skipped items 102 | logger.info(f"Skipped {len(skipped_items)} items due to errors or PII detection") 103 | for item in skipped_items: 104 | logger.info(item) 105 | 106 | 107 | async def main(): 108 | # parse the command-line arguments 109 | parser = argparse.ArgumentParser() 110 | parser.add_argument("--filepath", required=True, help="The path to the jsonl dump") 111 | parser.add_argument( 112 | "--custom_metadata", 113 | default="{}", 114 | help="A JSON string of key-value pairs to update the metadata of the documents", 115 | ) 116 | parser.add_argument( 117 | "--screen_for_pii", 118 | default=False, 119 | type=bool, 120 | help="A boolean flag to indicate whether to try the PII detection function (using a language model)", 121 | ) 122 | parser.add_argument( 123 | "--extract_metadata", 124 | default=False, 125 | type=bool, 126 | help="A boolean flag to indicate whether to try to extract metadata from the document (using a language model)", 127 | ) 128 | args = parser.parse_args() 129 | 130 | # get the arguments 131 | filepath = args.filepath 132 | custom_metadata = json.loads(args.custom_metadata) 133 | screen_for_pii = args.screen_for_pii 134 | extract_metadata = args.extract_metadata 135 | 136 | # initialize the db instance once as a global variable 137 | datastore = await get_datastore() 138 | # process the jsonl dump 139 | await process_jsonl_dump( 140 | filepath, datastore, custom_metadata, screen_for_pii, extract_metadata 141 | ) 142 | 143 | 144 | if __name__ == "__main__": 145 | asyncio.run(main()) 146 | -------------------------------------------------------------------------------- /scripts/process_zip/README.md: -------------------------------------------------------------------------------- 1 | ## Process a ZIP File 2 | 3 | This script is a utility to process a file dump of documents in a zip file and store them in the vector database with some metadata. It can also optionally screen the documents for personally identifiable information (PII) using a language model, and skip them if detected. Additionally, the script can extract metadata from the document using a language model. You can customize the PII detection function in [`services/pii_detection`](../../services/pii_detection.py) and the metadata extraction function in [`services/extract_metadata`](../../services/extract_metadata.py) for your use case. 4 | 5 | ## Usage 6 | 7 | To run this script from the terminal, navigate to this folder and use the following command: 8 | 9 | ``` 10 | python process_zip.py --filepath path/to/file_dump.zip --custom_metadata '{"source": "email"}' --screen_for_pii True --extract_metadata True 11 | ``` 12 | 13 | where: 14 | 15 | - `path/to/file_dump.zip` is the name or path to the file dump to be processed. The format of this zip file should be a zip file containing of docx, pdf, txt, md and pptx files (any internal folder structure is acceptable). 16 | - `--custom_metadata` is an optional JSON string of key-value pairs to update the metadata of the documents. For example, `{"source": "file"}` will add a `source` field with the value `file` to the metadata of each document. The default value is an empty JSON object (`{}`). 
17 | - `--screen_for_pii` is an optional boolean flag to indicate whether to use the PII detection function or not. If set to `True`, the script will use the `screen_text_for_pii` function from the [`services/pii_detection`](../../services/pii_detection.py) module to check if the document text contains any PII using a language model. If PII is detected, the script will print a warning and skip the document. The default value is `False`. 18 | - `--extract_metadata` is an optional boolean flag to indicate whether to try to extract metadata from the document using a language model. If set to `True`, the script will use the `extract_metadata_from_document` function from the [`services/extract_metadata`](../../services/extract_metadata.py) module to extract metadata from the document text and update the metadata object accordingly. The default value is`False`. 19 | 20 | The script will extract the files from the zip file into a temporary directory named `dump`, process each file and store the document text and metadata in the database, and then delete the temporary directory and its contents. It will also print some progress messages and error messages if any. 21 | 22 | You can use `python process_zip.py -h` to get a summary of the options and their descriptions. 23 | 24 | Test the script with the example file, [example.zip](example.zip). 25 | -------------------------------------------------------------------------------- /scripts/process_zip/example.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/scripts/process_zip/example.zip -------------------------------------------------------------------------------- /scripts/process_zip/process_zip.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import zipfile 3 | import os 4 | import json 5 | import argparse 6 | import asyncio 7 | 8 | from loguru import logger 9 | from models.models import Document, DocumentMetadata, Source 10 | from datastore.datastore import DataStore 11 | from datastore.factory import get_datastore 12 | from services.extract_metadata import extract_metadata_from_document 13 | from services.file import extract_text_from_filepath 14 | from services.pii_detection import screen_text_for_pii 15 | 16 | DOCUMENT_UPSERT_BATCH_SIZE = 50 17 | 18 | 19 | async def process_file_dump( 20 | filepath: str, 21 | datastore: DataStore, 22 | custom_metadata: dict, 23 | screen_for_pii: bool, 24 | extract_metadata: bool, 25 | ): 26 | # create a ZipFile object and extract all the files into a directory named 'dump' 27 | with zipfile.ZipFile(filepath) as zip_file: 28 | zip_file.extractall("dump") 29 | 30 | documents = [] 31 | skipped_files = [] 32 | # use os.walk to traverse the dump directory and its subdirectories 33 | for root, dirs, files in os.walk("dump"): 34 | for filename in files: 35 | if len(documents) % 20 == 0: 36 | logger.info(f"Processed {len(documents)} documents") 37 | 38 | filepath = os.path.join(root, filename) 39 | 40 | try: 41 | extracted_text = extract_text_from_filepath(filepath) 42 | logger.info(f"extracted_text from {filepath}") 43 | 44 | # create a metadata object with the source and source_id fields 45 | metadata = DocumentMetadata( 46 | source=Source.file, 47 | source_id=filename, 48 | ) 49 | 50 | # update metadata with custom values 51 | for key, value in custom_metadata.items(): 52 | if hasattr(metadata, key): 53 | 
setattr(metadata, key, value) 54 | 55 | # screen for pii if requested 56 | if screen_for_pii: 57 | pii_detected = screen_text_for_pii(extracted_text) 58 | # if pii detected, print a warning and skip the document 59 | if pii_detected: 60 | logger.info("PII detected in document, skipping") 61 | skipped_files.append( 62 | filepath 63 | ) # add the skipped file to the list 64 | continue 65 | 66 | # extract metadata if requested 67 | if extract_metadata: 68 | # extract metadata from the document text 69 | extracted_metadata = extract_metadata_from_document( 70 | f"Text: {extracted_text}; Metadata: {str(metadata)}" 71 | ) 72 | # get a Metadata object from the extracted metadata 73 | metadata = DocumentMetadata(**extracted_metadata) 74 | 75 | # create a document object with a random id, text and metadata 76 | document = Document( 77 | id=str(uuid.uuid4()), 78 | text=extracted_text, 79 | metadata=metadata, 80 | ) 81 | documents.append(document) 82 | except Exception as e: 83 | # log the error and continue with the next file 84 | logger.error(f"Error processing {filepath}: {e}") 85 | skipped_files.append(filepath) # add the skipped file to the list 86 | 87 | # do this in batches, the upsert method already batches documents but this allows 88 | # us to add more descriptive logging 89 | for i in range(0, len(documents), DOCUMENT_UPSERT_BATCH_SIZE): 90 | # Get the documents in the current batch 91 | batch_documents = documents[i : i + DOCUMENT_UPSERT_BATCH_SIZE] 92 | logger.info(f"Upserting batch of {len(batch_documents)} documents, batch {i}") 93 | logger.info(f"documents: {batch_documents}") 94 | await datastore.upsert(batch_documents) 95 | 96 | # delete all files in the dump directory 97 | for root, dirs, files in os.walk("dump", topdown=False): 98 | for filename in files: 99 | filepath = os.path.join(root, filename) 100 | os.remove(filepath) 101 | for dirname in dirs: 102 | dirpath = os.path.join(root, dirname) 103 | os.rmdir(dirpath) 104 | 105 | # delete the dump directory 106 | os.rmdir("dump") 107 | 108 | # print the skipped files 109 | logger.info(f"Skipped {len(skipped_files)} files due to errors or PII detection") 110 | for file in skipped_files: 111 | logger.info(file) 112 | 113 | 114 | async def main(): 115 | # parse the command-line arguments 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument("--filepath", required=True, help="The path to the file dump") 118 | parser.add_argument( 119 | "--custom_metadata", 120 | default="{}", 121 | help="A JSON string of key-value pairs to update the metadata of the documents", 122 | ) 123 | parser.add_argument( 124 | "--screen_for_pii", 125 | default=False, 126 | type=bool, 127 | help="A boolean flag to indicate whether to try the PII detection function (using a language model)", 128 | ) 129 | parser.add_argument( 130 | "--extract_metadata", 131 | default=False, 132 | type=bool, 133 | help="A boolean flag to indicate whether to try to extract metadata from the document (using a language model)", 134 | ) 135 | args = parser.parse_args() 136 | 137 | # get the arguments 138 | filepath = args.filepath 139 | custom_metadata = json.loads(args.custom_metadata) 140 | screen_for_pii = args.screen_for_pii 141 | extract_metadata = args.extract_metadata 142 | 143 | # initialize the db instance once as a global variable 144 | datastore = await get_datastore() 145 | # process the file dump 146 | await process_file_dump( 147 | filepath, datastore, custom_metadata, screen_for_pii, extract_metadata 148 | ) 149 | 150 | 151 | if __name__
== "__main__": 152 | asyncio.run(main()) 153 | -------------------------------------------------------------------------------- /server/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | import uvicorn 4 | from fastapi import FastAPI, File, Form, HTTPException, Depends, Body, UploadFile 5 | from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials 6 | from fastapi.staticfiles import StaticFiles 7 | from loguru import logger 8 | 9 | from models.api import ( 10 | DeleteRequest, 11 | DeleteResponse, 12 | QueryRequest, 13 | QueryResponse, 14 | UpsertRequest, 15 | UpsertResponse, 16 | ) 17 | from datastore.factory import get_datastore 18 | from services.file import get_document_from_file 19 | 20 | from models.models import DocumentMetadata, Source 21 | 22 | bearer_scheme = HTTPBearer() 23 | BEARER_TOKEN = os.environ.get("BEARER_TOKEN") 24 | assert BEARER_TOKEN is not None 25 | 26 | 27 | def validate_token(credentials: HTTPAuthorizationCredentials = Depends(bearer_scheme)): 28 | if credentials.scheme != "Bearer" or credentials.credentials != BEARER_TOKEN: 29 | raise HTTPException(status_code=401, detail="Invalid or missing token") 30 | return credentials 31 | 32 | 33 | app = FastAPI(dependencies=[Depends(validate_token)]) 34 | app.mount("/.well-known", StaticFiles(directory=".well-known"), name="static") 35 | 36 | # Create a sub-application, in order to access just the query endpoint in an OpenAPI schema, found at http://0.0.0.0:8000/sub/openapi.json when the app is running locally 37 | sub_app = FastAPI( 38 | title="Retrieval Plugin API", 39 | description="A retrieval API for querying and filtering documents based on natural language queries and metadata", 40 | version="1.0.0", 41 | servers=[{"url": "https://your-app-url.com"}], 42 | dependencies=[Depends(validate_token)], 43 | ) 44 | app.mount("/sub", sub_app) 45 | 46 | 47 | @app.post( 48 | "/upsert-file", 49 | response_model=UpsertResponse, 50 | ) 51 | async def upsert_file( 52 | file: UploadFile = File(...), 53 | metadata: Optional[str] = Form(None), 54 | ): 55 | try: 56 | metadata_obj = ( 57 | DocumentMetadata.parse_raw(metadata) 58 | if metadata 59 | else DocumentMetadata(source=Source.file) 60 | ) 61 | except: 62 | metadata_obj = DocumentMetadata(source=Source.file) 63 | 64 | document = await get_document_from_file(file, metadata_obj) 65 | 66 | try: 67 | ids = await datastore.upsert([document]) 68 | return UpsertResponse(ids=ids) 69 | except Exception as e: 70 | logger.error(e) 71 | raise HTTPException(status_code=500, detail=f"str({e})") 72 | 73 | 74 | @app.post( 75 | "/upsert", 76 | response_model=UpsertResponse, 77 | ) 78 | async def upsert( 79 | request: UpsertRequest = Body(...), 80 | ): 81 | try: 82 | ids = await datastore.upsert(request.documents) 83 | return UpsertResponse(ids=ids) 84 | except Exception as e: 85 | logger.error(e) 86 | raise HTTPException(status_code=500, detail="Internal Service Error") 87 | 88 | 89 | @app.post( 90 | "/query", 91 | response_model=QueryResponse, 92 | ) 93 | async def query_main( 94 | request: QueryRequest = Body(...), 95 | ): 96 | try: 97 | results = await datastore.query( 98 | request.queries, 99 | ) 100 | return QueryResponse(results=results) 101 | except Exception as e: 102 | logger.error(e) 103 | raise HTTPException(status_code=500, detail="Internal Service Error") 104 | 105 | 106 | @sub_app.post( 107 | "/query", 108 | response_model=QueryResponse, 109 | # NOTE: We are describing the shape of 
the API endpoint input due to a current limitation in parsing arrays of objects from OpenAPI schemas. This will not be necessary in the future. 110 | description="Accepts search query objects array each with query and optional filter. Break down complex questions into sub-questions. Refine results by criteria, e.g. time / source, don't do this often. Split queries if ResponseTooLargeError occurs.", 111 | ) 112 | async def query( 113 | request: QueryRequest = Body(...), 114 | ): 115 | try: 116 | results = await datastore.query( 117 | request.queries, 118 | ) 119 | return QueryResponse(results=results) 120 | except Exception as e: 121 | logger.error(e) 122 | raise HTTPException(status_code=500, detail="Internal Service Error") 123 | 124 | 125 | @app.delete( 126 | "/delete", 127 | response_model=DeleteResponse, 128 | ) 129 | async def delete( 130 | request: DeleteRequest = Body(...), 131 | ): 132 | if not (request.ids or request.filter or request.delete_all): 133 | raise HTTPException( 134 | status_code=400, 135 | detail="One of ids, filter, or delete_all is required", 136 | ) 137 | try: 138 | success = await datastore.delete( 139 | ids=request.ids, 140 | filter=request.filter, 141 | delete_all=request.delete_all, 142 | ) 143 | return DeleteResponse(success=success) 144 | except Exception as e: 145 | logger.error(e) 146 | raise HTTPException(status_code=500, detail="Internal Service Error") 147 | 148 | 149 | @app.on_event("startup") 150 | async def startup(): 151 | global datastore 152 | datastore = await get_datastore() 153 | 154 | 155 | def start(): 156 | uvicorn.run("server.main:app", host="0.0.0.0", port=8000, reload=True) 157 | -------------------------------------------------------------------------------- /services/date.py: -------------------------------------------------------------------------------- 1 | import arrow 2 | from loguru import logger 3 | 4 | 5 | def to_unix_timestamp(date_str: str) -> int: 6 | """ 7 | Convert a date string to a unix timestamp (seconds since epoch). 8 | 9 | Args: 10 | date_str: The date string to convert. 11 | 12 | Returns: 13 | The unix timestamp corresponding to the date string. 14 | 15 | If the date string cannot be parsed as a valid date format, returns the current unix timestamp and prints a warning. 
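For example, to_unix_timestamp("2021-01-21T10:00:00-02:00") should return 1611230400, while an unparseable string falls back to the current time.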
16 | """ 17 | # Try to parse the date string using arrow, which supports many common date formats 18 | try: 19 | date_obj = arrow.get(date_str) 20 | return int(date_obj.timestamp()) 21 | except arrow.parser.ParserError: 22 | # If the parsing fails, return the current unix timestamp and print a warning 23 | logger.info(f"Invalid date format: {date_str}") 24 | return int(arrow.now().timestamp()) 25 | -------------------------------------------------------------------------------- /services/extract_metadata.py: -------------------------------------------------------------------------------- 1 | from models.models import Source 2 | from services.openai import get_chat_completion 3 | import json 4 | from typing import Dict 5 | import os 6 | from loguru import logger 7 | 8 | 9 | def extract_metadata_from_document(text: str) -> Dict[str, str]: 10 | sources = Source.__members__.keys() 11 | sources_string = ", ".join(sources) 12 | # This prompt is just an example, change it to fit your use case 13 | messages = [ 14 | { 15 | "role": "system", 16 | "content": f""" 17 | Given a document from a user, try to extract the following metadata: 18 | - source: string, one of {sources_string} 19 | - url: string or don't specify 20 | - created_at: string or don't specify 21 | - author: string or don't specify 22 | 23 | Respond with a JSON containing the extracted metadata in key value pairs. If you don't find a metadata field, don't specify it. 24 | """, 25 | }, 26 | {"role": "user", "content": text}, 27 | ] 28 | 29 | # NOTE: Azure Open AI requires deployment id 30 | # Read environment variable - if not set - not used 31 | completion = get_chat_completion( 32 | messages, 33 | "gpt-4", 34 | # os.environ.get("OPENAI_METADATA_EXTRACTIONMODEL_DEPLOYMENTID") 35 | ) # TODO: change to your preferred model name 36 | 37 | logger.info(f"completion: {completion}") 38 | 39 | try: 40 | metadata = json.loads(completion) 41 | except Exception as e: 42 | logger.error(f"Error parsing completion: {e}") 43 | metadata = {} 44 | 45 | return metadata 46 | -------------------------------------------------------------------------------- /services/file.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import BufferedReader 3 | from typing import Optional 4 | from fastapi import UploadFile 5 | import mimetypes 6 | from PyPDF2 import PdfReader 7 | import docx2txt 8 | import csv 9 | import pptx 10 | from loguru import logger 11 | 12 | from models.models import Document, DocumentMetadata 13 | 14 | 15 | async def get_document_from_file( 16 | file: UploadFile, metadata: DocumentMetadata 17 | ) -> Document: 18 | extracted_text = await extract_text_from_form_file(file) 19 | 20 | doc = Document(text=extracted_text, metadata=metadata) 21 | 22 | return doc 23 | 24 | 25 | def extract_text_from_filepath(filepath: str, mimetype: Optional[str] = None) -> str: 26 | """Return the text content of a file given its filepath.""" 27 | 28 | if mimetype is None: 29 | # Get the mimetype of the file based on its extension 30 | mimetype, _ = mimetypes.guess_type(filepath) 31 | 32 | if not mimetype: 33 | if filepath.endswith(".md"): 34 | mimetype = "text/markdown" 35 | else: 36 | raise Exception("Unsupported file type") 37 | 38 | try: 39 | with open(filepath, "rb") as file: 40 | extracted_text = extract_text_from_file(file, mimetype) 41 | except Exception as e: 42 | logger.error(e) 43 | raise e 44 | 45 | return extracted_text 46 | 47 | 48 | def extract_text_from_file(file: BufferedReader, mimetype: str) 
-> str: 49 | if mimetype == "application/pdf": 50 | # Extract text from pdf using PyPDF2 51 | reader = PdfReader(file) 52 | extracted_text = " ".join([page.extract_text() for page in reader.pages]) 53 | elif mimetype == "text/plain" or mimetype == "text/markdown": 54 | # Read text from plain text file 55 | extracted_text = file.read().decode("utf-8") 56 | elif ( 57 | mimetype 58 | == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" 59 | ): 60 | # Extract text from docx using docx2txt 61 | extracted_text = docx2txt.process(file) 62 | elif mimetype == "text/csv": 63 | # Extract text from csv using csv module 64 | extracted_text = "" 65 | decoded_buffer = (line.decode("utf-8") for line in file) 66 | reader = csv.reader(decoded_buffer) 67 | for row in reader: 68 | extracted_text += " ".join(row) + "\n" 69 | elif ( 70 | mimetype 71 | == "application/vnd.openxmlformats-officedocument.presentationml.presentation" 72 | ): 73 | # Extract text from pptx using python-pptx 74 | extracted_text = "" 75 | presentation = pptx.Presentation(file) 76 | for slide in presentation.slides: 77 | for shape in slide.shapes: 78 | if shape.has_text_frame: 79 | for paragraph in shape.text_frame.paragraphs: 80 | for run in paragraph.runs: 81 | extracted_text += run.text + " " 82 | extracted_text += "\n" 83 | else: 84 | # Unsupported file type 85 | raise ValueError("Unsupported file type: {}".format(mimetype)) 86 | 87 | return extracted_text 88 | 89 | 90 | # Extract text from a file based on its mimetype 91 | async def extract_text_from_form_file(file: UploadFile): 92 | """Return the text content of a file.""" 93 | # get the file body from the upload file object 94 | mimetype = file.content_type 95 | logger.info(f"mimetype: {mimetype}") 96 | logger.info(f"file.file: {file.file}") 97 | logger.info(f"file: {file}") 98 | 99 | file_stream = await file.read() 100 | 101 | temp_file_path = "/tmp/temp_file" 102 | 103 | # write the file to a temporary location 104 | with open(temp_file_path, "wb") as f: 105 | f.write(file_stream) 106 | 107 | try: 108 | extracted_text = extract_text_from_filepath(temp_file_path, mimetype) 109 | except Exception as e: 110 | logger.error(e) 111 | os.remove(temp_file_path) 112 | raise e 113 | 114 | # remove file from temp location 115 | os.remove(temp_file_path) 116 | 117 | return extracted_text 118 | -------------------------------------------------------------------------------- /services/openai.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import openai 3 | import os 4 | from loguru import logger 5 | 6 | from tenacity import retry, wait_random_exponential, stop_after_attempt 7 | 8 | EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large") 9 | EMBEDDING_DIMENSION = int(os.environ.get("EMBEDDING_DIMENSION", 256)) 10 | 11 | 12 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3)) 13 | def get_embeddings(texts: List[str]) -> List[List[float]]: 14 | """ 15 | Embed texts using the configured OpenAI embedding model (EMBEDDING_MODEL). 16 | 17 | Args: 18 | texts: The list of texts to embed. 19 | 20 | Returns: 21 | A list of embeddings, each of which is a list of floats. 22 | 23 | Raises: 24 | Exception: If the OpenAI API call fails.
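Example (illustrative): with the defaults above, get_embeddings(["hello world"]) should return a list containing one embedding of EMBEDDING_DIMENSION (256) floats; the Azure deployment path returns whatever dimension that deployment is configured for.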
25 | """ 26 | # Call the OpenAI API to get the embeddings 27 | # NOTE: Azure Open AI requires deployment id 28 | deployment = os.environ.get("OPENAI_EMBEDDINGMODEL_DEPLOYMENTID") 29 | 30 | response = {} 31 | if deployment is None: 32 | response = openai.Embedding.create(input=texts, model=EMBEDDING_MODEL, dimensions=EMBEDDING_DIMENSION) 33 | else: 34 | response = openai.Embedding.create(input=texts, deployment_id=deployment) 35 | 36 | # Extract the embedding data from the response 37 | data = response["data"] # type: ignore 38 | 39 | # Return the embeddings as a list of lists of floats 40 | return [result["embedding"] for result in data] 41 | 42 | 43 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(3)) 44 | def get_chat_completion( 45 | messages, 46 | model="gpt-3.5-turbo", # use "gpt-4" for better results 47 | deployment_id=None, 48 | ): 49 | """ 50 | Generate a chat completion using OpenAI's chat completion API. 51 | 52 | Args: 53 | messages: The list of messages in the chat history. 54 | model: The name of the model to use for the completion. Default is gpt-3.5-turbo, which is a fast, cheap and versatile model. Use gpt-4 for higher quality but slower results. 55 | 56 | Returns: 57 | A string containing the chat completion. 58 | 59 | Raises: 60 | Exception: If the OpenAI API call fails. 61 | """ 62 | # call the OpenAI chat completion API with the given messages 63 | # Note: Azure Open AI requires deployment id 64 | response = {} 65 | if deployment_id == None: 66 | response = openai.ChatCompletion.create( 67 | model=model, 68 | messages=messages, 69 | ) 70 | else: 71 | response = openai.ChatCompletion.create( 72 | deployment_id=deployment_id, 73 | messages=messages, 74 | ) 75 | 76 | choices = response["choices"] # type: ignore 77 | completion = choices[0].message.content.strip() 78 | logger.info(f"Completion: {completion}") 79 | return completion 80 | -------------------------------------------------------------------------------- /services/pii_detection.py: -------------------------------------------------------------------------------- 1 | import os 2 | from services.openai import get_chat_completion 3 | 4 | 5 | def screen_text_for_pii(text: str) -> bool: 6 | # This prompt is just an example, change it to fit your use case 7 | messages = [ 8 | { 9 | "role": "system", 10 | "content": f""" 11 | You can only respond with the word "True" or "False", where your answer indicates whether the text in the user's message contains PII. 12 | Do not explain your answer, and do not use punctuation. 13 | Your task is to identify whether the text extracted from your company files 14 | contains sensitive PII information that should not be shared with the broader company. Here are some things to look out for: 15 | - An email address that identifies a specific person in either the local-part or the domain 16 | - The postal address of a private residence (must include at least a street name) 17 | - The postal address of a public place (must include either a street name or business name) 18 | - Notes about hiring decisions with mentioned names of candidates. The user will send a document for you to analyze. 
19 | """, 20 | }, 21 | {"role": "user", "content": text}, 22 | ] 23 | 24 | completion = get_chat_completion( 25 | messages, deployment_id=os.environ.get("OPENAI_COMPLETIONMODEL_DEPLOYMENTID") 26 | ) 27 | 28 | if completion.startswith("True"): 29 | return True 30 | 31 | return False 32 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/b28ddce58474441da332d4e15c6dd60ddaa953ab/tests/__init__.py -------------------------------------------------------------------------------- /tests/datastore/providers/azurecosmosdb/test_azurecosmosdb_datastore.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from typing import Dict, List 3 | from dotenv import dotenv_values 4 | 5 | from datastore.datastore import DataStore 6 | from datastore.providers.azurecosmosdb_datastore import AzureCosmosDBDataStore 7 | from models.models import ( 8 | DocumentChunk, 9 | DocumentChunkMetadata, 10 | QueryWithEmbedding, 11 | ) 12 | import os 13 | 14 | num_lists = 1 15 | similarity = "COS" 16 | 17 | EMBEDDING_DIMENSION = int(os.environ.get("EMBEDDING_DIMENSION", 256)) 18 | 19 | 20 | def create_embedding(non_zero_pos: int) -> List[float]: 21 | # create a vector with a single non-zero value of dimension EMBEDDING_DIMENSION 22 | vector = [0.0] * EMBEDDING_DIMENSION 23 | vector[non_zero_pos - 1] = 1.0 24 | return vector 25 | 26 | 27 | @pytest.fixture 28 | def azure_cosmos_db_settings_from_dot_env() -> dict: 29 | """ 30 | Reads the Azure CosmosDB environment variables for the .env file. 31 | 32 | Returns: 33 | dict: The Azure CosmosDB environment variables 34 | """ 35 | config = dotenv_values(".env") 36 | env_variables = { 37 | "DATASTORE": "azurecosmosdb", 38 | "AZCOSMOS_API": config.get( 39 | ("AZCOSMOS_API") 40 | ), # Right now CosmosDB only supports vector search in Mongo vCore. 
41 | "AZCOSMOS_CONNSTR": config.get("AZCOSMOS_CONNSTR"), 42 | "AZCOSMOS_DATABASE_NAME": config.get("AZCOSMOS_DATABASE_NAME"), 43 | "AZCOSMOS_CONTAINER_NAME": config.get("AZCOSMOS_CONTAINER_NAME"), 44 | } 45 | 46 | return env_variables 47 | 48 | 49 | @pytest.fixture 50 | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]: 51 | first_doc_chunks = [ 52 | DocumentChunk( 53 | id=f"first-doc-{i}", 54 | text=f"Lorem ipsum {i}", 55 | metadata=DocumentChunkMetadata(), 56 | embedding=create_embedding(i), 57 | ) 58 | for i in range(4, 7) 59 | ] 60 | return { 61 | "first-doc": first_doc_chunks, 62 | } 63 | 64 | 65 | @pytest.fixture 66 | def queries() -> List[QueryWithEmbedding]: 67 | queries = [ 68 | QueryWithEmbedding( 69 | query="Query 1", 70 | top_k=1, 71 | embedding=create_embedding(4), 72 | ), 73 | QueryWithEmbedding( 74 | query="Query 2", 75 | top_k=2, 76 | embedding=create_embedding(5), 77 | ), 78 | ] 79 | return queries 80 | 81 | 82 | @pytest.fixture 83 | async def azurecosmosdb_datastore() -> DataStore: 84 | return await AzureCosmosDBDataStore.create( 85 | num_lists=num_lists, similarity=similarity 86 | ) 87 | 88 | 89 | @pytest.mark.asyncio 90 | async def test_upsert( 91 | azurecosmosdb_datastore: AzureCosmosDBDataStore, 92 | initial_document_chunks: Dict[str, List[DocumentChunk]], 93 | ) -> None: 94 | """Test basic upsert.""" 95 | doc_ids = await azurecosmosdb_datastore._upsert(initial_document_chunks) 96 | assert doc_ids == [ 97 | f"doc:{doc_id}:chunk:{chunk.id}" 98 | for doc_id, chunk_list in initial_document_chunks.items() 99 | for chunk in chunk_list 100 | ] 101 | 102 | 103 | @pytest.mark.asyncio 104 | async def test_query( 105 | azurecosmosdb_datastore: AzureCosmosDBDataStore, 106 | initial_document_chunks: Dict[str, List[DocumentChunk]], 107 | queries: List[QueryWithEmbedding], 108 | ) -> None: 109 | """Test basic query.""" 110 | await azurecosmosdb_datastore.delete(delete_all=True) 111 | # insert to prepare for the test 112 | await azurecosmosdb_datastore._upsert(initial_document_chunks) 113 | 114 | query_results = await azurecosmosdb_datastore._query(queries) 115 | assert len(query_results) == len(queries) 116 | 117 | query_0_results = query_results[0].results 118 | query_1_results = query_results[1].results 119 | 120 | assert len(query_0_results) == 1 121 | assert len(query_1_results) == 2 122 | 123 | # NOTE: this is the correct behavior 124 | assert query_0_results[0].id == "doc:first-doc:chunk:first-doc-4" 125 | assert query_1_results[0].id == "doc:first-doc:chunk:first-doc-5" 126 | assert query_1_results[1].id == "doc:first-doc:chunk:first-doc-4" 127 | 128 | 129 | @pytest.mark.asyncio 130 | async def test_delete(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None: 131 | await azurecosmosdb_datastore.delete(delete_all=True) 132 | chunk1 = DocumentChunk( 133 | id="deleteChunk1", 134 | text="delete text 1", 135 | embedding=[1] * EMBEDDING_DIMENSION, 136 | metadata=DocumentChunkMetadata(), 137 | ) 138 | chunk2 = DocumentChunk( 139 | id="deleteChunk2", 140 | text="delete text 2", 141 | embedding=[1] * EMBEDDING_DIMENSION, 142 | metadata=DocumentChunkMetadata(), 143 | ) 144 | # insert to prepare for test 145 | await azurecosmosdb_datastore._upsert( 146 | {"deleteDoc1": [chunk1], "deleteDoc2": [chunk2]} 147 | ) 148 | 149 | query_embedding = [1] * EMBEDDING_DIMENSION 150 | query = QueryWithEmbedding( 151 | query="Query for delete", 152 | embedding=query_embedding, 153 | ) 154 | results = await azurecosmosdb_datastore._query([query]) 155 | 156 | assert 
len(results[0].results) == 2 157 | assert results[0].results[0].id == "doc:deleteDoc1:chunk:deleteChunk1" 158 | assert results[0].results[1].id == "doc:deleteDoc2:chunk:deleteChunk2" 159 | 160 | await azurecosmosdb_datastore.delete(ids=["doc:deleteDoc1:chunk:deleteChunk1"]) 161 | results_after_delete = await azurecosmosdb_datastore._query([query]) 162 | 163 | assert len(results_after_delete[0].results) == 1 164 | assert results_after_delete[0].results[0].id == "doc:deleteDoc2:chunk:deleteChunk2" 165 | 166 | 167 | @pytest.mark.asyncio 168 | async def test_delete_all(azurecosmosdb_datastore: AzureCosmosDBDataStore) -> None: 169 | await azurecosmosdb_datastore.delete(delete_all=True) 170 | chunk = DocumentChunk( 171 | id="deleteChunk", 172 | text="delete text", 173 | embedding=[1] * EMBEDDING_DIMENSION, 174 | metadata=DocumentChunkMetadata(), 175 | ) 176 | await azurecosmosdb_datastore._upsert({"deleteDoc": [chunk]}) 177 | 178 | query_embedding = [1] * EMBEDDING_DIMENSION 179 | query = QueryWithEmbedding( 180 | query="delete query", 181 | embedding=query_embedding, 182 | top_k=1, 183 | ) 184 | results = await azurecosmosdb_datastore._query([query]) 185 | 186 | assert len(results) == 1 187 | assert len(results[0].results) == 1 188 | assert results[0].results[0].id == "doc:deleteDoc:chunk:deleteChunk" 189 | 190 | await azurecosmosdb_datastore.delete(delete_all=True) 191 | results_after_delete = await azurecosmosdb_datastore._query([query]) 192 | 193 | assert len(results_after_delete[0].results) == 0 194 | -------------------------------------------------------------------------------- /tests/datastore/providers/elasticsearch/test_elasticsearch_datastore.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from models.models import ( 3 | DocumentChunkMetadata, 4 | DocumentMetadataFilter, 5 | DocumentChunk, 6 | QueryWithEmbedding, 7 | Source, 8 | ) 9 | from datastore.providers.elasticsearch_datastore import ( 10 | ElasticsearchDataStore, 11 | ) 12 | import time 13 | import os 14 | 15 | DIM_SIZE = int(os.environ.get("EMBEDDING_DIMENSION", 256)) 16 | 17 | 18 | @pytest.fixture 19 | def elasticsearch_datastore(): 20 | return ElasticsearchDataStore() 21 | 22 | 23 | def sample_embedding(one_element_poz: int): 24 | embedding = [0] * DIM_SIZE 25 | embedding[one_element_poz % DIM_SIZE] = 1 26 | return embedding 27 | 28 | 29 | def sample_embeddings(num: int, one_element_start: int = 0): 30 | embeddings = [] 31 | for x in range(num): 32 | embedding = [0] * DIM_SIZE 33 | embedding[(x + one_element_start) % DIM_SIZE] = 1 34 | embeddings.append(embedding) 35 | return embeddings 36 | 37 | 38 | @pytest.fixture 39 | def document_chunk_one(): 40 | doc_id = "abc" 41 | doc_chunks = [] 42 | 43 | ids = ["123", "456", "789"] 44 | texts = [ 45 | "Aenean euismod bibendum laoreet", 46 | "Vivamus non enim vitae tortor", 47 | "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae", 48 | ] 49 | sources = [Source.email, Source.file, Source.chat] 50 | created_ats = [ 51 | "1929-10-28T09:30:00-05:00", 52 | "2009-01-03T16:39:57-08:00", 53 | "2021-01-21T10:00:00-02:00", 54 | ] 55 | authors = ["Fred Smith", "Bob Doe", "Appleton Doe"] 56 | 57 | embeddings = sample_embeddings(len(texts)) 58 | 59 | for i in range(3): 60 | chunk = DocumentChunk( 61 | id=ids[i], 62 | text=texts[i], 63 | metadata=DocumentChunkMetadata( 64 | document_id=doc_id, 65 | source=sources[i], 66 | created_at=created_ats[i], 67 | author=authors[i], 68 | ), 69 |
embedding=embeddings[i], # type: ignore 70 | ) 71 | 72 | doc_chunks.append(chunk) 73 | 74 | return {doc_id: doc_chunks} 75 | 76 | 77 | async def test_upsert(elasticsearch_datastore, document_chunk_one): 78 | await elasticsearch_datastore.delete(delete_all=True) 79 | res = await elasticsearch_datastore._upsert(document_chunk_one) 80 | assert res == list(document_chunk_one.keys()) 81 | time.sleep(1) 82 | 83 | results = elasticsearch_datastore.client.search( 84 | index=elasticsearch_datastore.index_name, query={"match_all": {}} 85 | ) 86 | assert results["hits"]["total"]["value"] == 3 87 | elasticsearch_datastore.client.indices.delete( 88 | index=elasticsearch_datastore.index_name 89 | ) 90 | 91 | 92 | async def test_upsert_query_all(elasticsearch_datastore, document_chunk_one): 93 | await elasticsearch_datastore.delete(delete_all=True) 94 | res = await elasticsearch_datastore._upsert(document_chunk_one) 95 | assert res == list(document_chunk_one.keys()) 96 | time.sleep(1) 97 | 98 | query = QueryWithEmbedding( 99 | query="Aenean", 100 | top_k=10, 101 | embedding=sample_embedding(0), # type: ignore 102 | ) 103 | query_results = await elasticsearch_datastore._query(queries=[query]) 104 | 105 | assert 1 == len(query_results) 106 | assert 3 == len(query_results[0].results) 107 | 108 | 109 | async def test_delete_with_document_id(elasticsearch_datastore, document_chunk_one): 110 | await elasticsearch_datastore.delete(delete_all=True) 111 | res = await elasticsearch_datastore._upsert(document_chunk_one) 112 | time.sleep(1) 113 | assert res == list(document_chunk_one.keys()) 114 | await elasticsearch_datastore.delete([res[0]]) 115 | time.sleep(1) 116 | 117 | query = QueryWithEmbedding( 118 | query="Aenean", 119 | top_k=9, 120 | embedding=sample_embedding(0), # type: ignore 121 | ) 122 | query_results = await elasticsearch_datastore._query(queries=[query]) 123 | 124 | assert 1 == len(query_results) 125 | assert 0 == len(query_results[0].results) 126 | 127 | elasticsearch_datastore.client.indices.delete( 128 | index=elasticsearch_datastore.index_name 129 | ) 130 | 131 | 132 | async def test_delete_with_source_filter(elasticsearch_datastore, document_chunk_one): 133 | await elasticsearch_datastore.delete(delete_all=True) 134 | res = await elasticsearch_datastore._upsert(document_chunk_one) 135 | assert res == list(document_chunk_one.keys()) 136 | time.sleep(1) 137 | 138 | await elasticsearch_datastore.delete( 139 | filter=DocumentMetadataFilter( 140 | source=Source.email, 141 | ) 142 | ) 143 | 144 | time.sleep(1) 145 | 146 | query = QueryWithEmbedding( 147 | query="Aenean", 148 | top_k=9, 149 | embedding=sample_embedding(0), # type: ignore 150 | ) 151 | query_results = await elasticsearch_datastore._query(queries=[query]) 152 | 153 | assert 1 == len(query_results) 154 | assert 2 == len(query_results[0].results) 155 | assert "456" == query_results[0].results[0].id 156 | 157 | elasticsearch_datastore.client.indices.delete( 158 | index=elasticsearch_datastore.index_name 159 | ) 160 | -------------------------------------------------------------------------------- /tests/datastore/providers/llama/test_llama_datastore.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | import pytest 3 | from datastore.providers.llama_datastore import LlamaDataStore 4 | from models.models import DocumentChunk, DocumentChunkMetadata, QueryWithEmbedding 5 | 6 | 7 | def create_embedding(non_zero_pos: int, size: int) -> List[float]: 8 | vector = [0.0] * size 
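# one-hot embedding: the next line sets a single coordinate to 1.0 so each test chunk gets a distinct, easily ranked vector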
9 | vector[non_zero_pos % size] = 1.0 10 | return vector 11 | 12 | 13 | @pytest.fixture 14 | def initial_document_chunks() -> Dict[str, List[DocumentChunk]]: 15 | first_doc_chunks = [ 16 | DocumentChunk( 17 | id=f"first-doc-{i}", 18 | text=f"Lorem ipsum {i}", 19 | metadata=DocumentChunkMetadata(), 20 | embedding=create_embedding(i, 5), 21 | ) 22 | for i in range(4, 7) 23 | ] 24 | return { 25 | "first-doc": first_doc_chunks, 26 | } 27 | 28 | 29 | @pytest.fixture 30 | def queries() -> List[QueryWithEmbedding]: 31 | queries = [ 32 | QueryWithEmbedding( 33 | query="Query 1", 34 | top_k=1, 35 | embedding=create_embedding(4, 5), 36 | ), 37 | QueryWithEmbedding( 38 | query="Query 2", 39 | top_k=2, 40 | embedding=create_embedding(5, 5), 41 | ), 42 | ] 43 | return queries 44 | 45 | 46 | @pytest.fixture 47 | def llama_datastore() -> LlamaDataStore: 48 | return LlamaDataStore() 49 | 50 | 51 | @pytest.mark.asyncio 52 | async def test_upsert( 53 | llama_datastore: LlamaDataStore, 54 | initial_document_chunks: Dict[str, List[DocumentChunk]], 55 | ) -> None: 56 | """Test basic upsert.""" 57 | doc_ids = await llama_datastore._upsert(initial_document_chunks) 58 | assert doc_ids == [doc_id for doc_id in initial_document_chunks] 59 | 60 | 61 | @pytest.mark.asyncio 62 | async def test_query( 63 | llama_datastore: LlamaDataStore, 64 | initial_document_chunks: Dict[str, List[DocumentChunk]], 65 | queries: List[QueryWithEmbedding], 66 | ) -> None: 67 | """Test basic query.""" 68 | # insert to prepare for test 69 | await llama_datastore._upsert(initial_document_chunks) 70 | 71 | query_results = await llama_datastore._query(queries) 72 | assert len(query_results) == len(queries) 73 | 74 | query_0_results = query_results[0].results 75 | query_1_results = query_results[1].results 76 | 77 | assert len(query_0_results) == 1 78 | assert len(query_1_results) == 2 79 | 80 | # NOTE: this is the correct behavior 81 | assert query_0_results[0].id == "first-doc-4" 82 | assert query_1_results[0].id == "first-doc-5" 83 | assert query_1_results[1].id == "first-doc-4" 84 | 85 | 86 | @pytest.mark.asyncio 87 | async def test_delete( 88 | llama_datastore: LlamaDataStore, 89 | initial_document_chunks: Dict[str, List[DocumentChunk]], 90 | ) -> None: 91 | # insert to prepare for test 92 | await llama_datastore._upsert(initial_document_chunks) 93 | 94 | is_success = llama_datastore.delete(["first-doc"]) 95 | assert is_success 96 | -------------------------------------------------------------------------------- /tests/datastore/providers/mongodb_atlas/test_integration.py: -------------------------------------------------------------------------------- 1 | """Integration Tests of ChatGPT Retrieval Plugin 2 | with MongoDB Atlas Vector Datastore and OPENAI Embedding model. 3 | 4 | As described in docs/providers/mongodb/setup.md, to run this, one must 5 | have a running MongoDB Atlas Cluster, and 6 | provide a valid OPENAI_API_KEY. 7 | """ 8 | 9 | import os 10 | from time import sleep 11 | 12 | import openai 13 | import pytest 14 | from fastapi.testclient import TestClient 15 | from httpx import Response 16 | from pymongo import MongoClient 17 | 18 | from server.main import app 19 | 20 | 21 | @pytest.fixture(scope="session") 22 | def documents(): 23 | """ List of documents represents data to be embedded in the datastore. 24 | Minimum requirements fpr Documents in the /upsert endpoint's UpsertRequest. 
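Only a "text" field is supplied for each document here; the ids returned by /upsert (checked in test_upsert below) are generated server-side.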
25 | """ 26 | return [ 27 | {"text": "The quick brown fox jumped over the slimy green toad."}, 28 | {"text": "The big brown bear jumped over the lazy dog."}, 29 | {"text": "Toads are frogs."}, 30 | {"text": "Green toads are basically red frogs."}, 31 | ] 32 | 33 | 34 | @pytest.fixture(scope="session", autouse=True) 35 | def client(): 36 | """TestClient makes requests to FastAPI service.""" 37 | endpoint_url = "http://127.0.0.1:8000" 38 | headers = {"Authorization": f"Bearer {os.environ['BEARER_TOKEN']}"} 39 | with TestClient(app=app, base_url=endpoint_url, headers=headers) as client: 40 | yield client 41 | 42 | 43 | @pytest.fixture(scope="session") 44 | def delete(client) -> bool: 45 | """Drop existing documents from the collection""" 46 | response = client.request("DELETE", "/delete", json={"delete_all": True}) 47 | sleep(2) 48 | return response 49 | 50 | 51 | @pytest.fixture(scope="session") 52 | def upsert(delete, documents, client) -> bool: 53 | """Upload documents to the datastore via plugin's REST API.""" 54 | response = client.post("/upsert", json={"documents": documents}) 55 | sleep(2) # At this point, the Vector Search Index is being built 56 | return response 57 | 58 | 59 | def test_delete(delete) -> None: 60 | """Simply confirm that delete fixture ran successfully""" 61 | assert delete.status_code == 200 62 | assert delete.json()['success'] 63 | 64 | 65 | def test_upsert(upsert) -> None: 66 | """Simply confirm that upsert fixture has run successfully""" 67 | assert upsert.status_code == 200 68 | assert len(upsert.json()['ids']) == 4 69 | 70 | 71 | def test_query(upsert, client) -> None: # upsert, 72 | """Test queries produce reasonable results, 73 | now that datastore contains embedded data which has been indexed 74 | """ 75 | question = "What did the fox jump over?" 76 | n_requested = 2 # top N results per query 77 | got_response = False 78 | retries = 5 79 | query_result = {} 80 | while retries and not got_response: 81 | response = client.post("/query", json={'queries': [{"query": question, "top_k": n_requested}]}) 82 | assert isinstance(response, Response) 83 | assert response.status_code == 200 84 | assert len(response.json()) == 1 85 | query_result = response.json()['results'][0] 86 | if len(query_result['results']) == n_requested: 87 | got_response = True 88 | else: 89 | retries -= 1 90 | sleep(5) 91 | 92 | assert got_response # we got n_requested responses 93 | assert query_result['query'] == question 94 | answers = [] 95 | scores = [] 96 | for result in query_result['results']: 97 | answers.append(result['text']) 98 | scores.append(round(result['score'], 2)) 99 | assert 0.8 < scores[0] < 0.9 100 | assert answers[0] == "The quick brown fox jumped over the slimy green toad." 
101 | 102 | 103 | def test_required_vars() -> None: 104 | """Confirm that the environment has all it needs""" 105 | required_vars = {'BEARER_TOKEN', 'OPENAI_API_KEY', 'DATASTORE', 'EMBEDDING_DIMENSION', 'EMBEDDING_MODEL', 106 | 'MONGODB_COLLECTION', 'MONGODB_DATABASE', 'MONGODB_INDEX', 'MONGODB_URI'} 107 | assert os.environ["DATASTORE"] == 'mongodb' 108 | missing = required_vars - set(os.environ) 109 | assert len(missing) == 0 110 | 111 | 112 | def test_mongodb_connection() -> None: 113 | """Confirm that the connection to the datastore works.""" 114 | client = MongoClient(os.environ["MONGODB_URI"]) 115 | assert client.admin.command('ping')['ok'] 116 | 117 | 118 | def test_openai_connection() -> None: 119 | """Check that we can call OpenAI Embedding models.""" 120 | openai.api_key = os.environ["OPENAI_API_KEY"] 121 | models = openai.Model.list() 122 | model_names = [model["id"] for model in models['data']] 123 | for model_name in model_names: 124 | try: 125 | response = openai.Embedding.create(input=["Some input text"], model=model_name) 126 | assert len(response['data'][0]['embedding']) >= int(os.environ['EMBEDDING_DIMENSION']) 127 | except: 128 | pass # Not all models are for text embedding. 129 | -------------------------------------------------------------------------------- /tests/datastore/providers/redis/test_redis_datastore.py: -------------------------------------------------------------------------------- 1 | from datastore.providers.redis_datastore import RedisDataStore 2 | from models.models import ( 3 | DocumentChunk, 4 | DocumentChunkMetadata, 5 | QueryWithEmbedding, 6 | Source, 7 | DocumentMetadataFilter, 8 | ) 9 | import pytest 10 | import redis.asyncio as redis 11 | import numpy as np 12 | 13 | NUM_TEST_DOCS = 10 14 | 15 | 16 | @pytest.fixture 17 | async def redis_datastore(): 18 | return await RedisDataStore.init(dim=5) 19 | 20 | 21 | def create_embedding(i, dim): 22 | vec = np.array([0.1] * dim).astype(np.float64).tolist() 23 | vec[dim - 1] = i + 1 / 10 24 | return vec 25 | 26 | 27 | def create_document_chunk(i, dim): 28 | return DocumentChunk( 29 | id=f"first-doc_{i}", 30 | text=f"Lorem ipsum {i}", 31 | embedding=create_embedding(i, dim), 32 | metadata=DocumentChunkMetadata( 33 | source=Source.file, created_at="1970-01-01", document_id="docs" 34 | ), 35 | ) 36 | 37 | 38 | def create_document_chunks(n, dim): 39 | docs = [create_document_chunk(i, dim) for i in range(n)] 40 | return {"docs": docs} 41 | 42 | 43 | @pytest.mark.asyncio 44 | async def test_redis_upsert_query(redis_datastore): 45 | docs = create_document_chunks(NUM_TEST_DOCS, 5) 46 | await redis_datastore._upsert(docs) 47 | query = QueryWithEmbedding( 48 | query="Lorem ipsum 0", 49 | top_k=5, 50 | embedding=create_embedding(0, 5), 51 | ) 52 | query_results = await redis_datastore._query(queries=[query]) 53 | assert 1 == len(query_results) 54 | for i in range(5): 55 | assert f"Lorem ipsum {i}" == query_results[0].results[i].text 56 | assert "docs" == query_results[0].results[i].id 57 | 58 | 59 | @pytest.mark.asyncio 60 | async def test_redis_filter_query(redis_datastore): 61 | query = QueryWithEmbedding( 62 | query="Lorem ipsum 0", 63 | filter=DocumentMetadataFilter(document_id="docs"), 64 | top_k=5, 65 | embedding=create_embedding(0, 5), 66 | ) 67 | query_results = await redis_datastore._query(queries=[query]) 68 | print(query_results) 69 | assert 1 == len(query_results) 70 | assert "docs" == query_results[0].results[0].id 71 | 72 | 73 | @pytest.mark.asyncio 74 | async def 
test_redis_delete_docs(redis_datastore): 75 | res = await redis_datastore.delete(ids=["docs"]) 76 | assert res 77 | -------------------------------------------------------------------------------- /tests/datastore/providers/weaviate/docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3.4' 3 | services: 4 | weaviate: 5 | command: 6 | - --host 7 | - 0.0.0.0 8 | - --port 9 | - '8080' 10 | - --scheme 11 | - http 12 | image: semitechnologies/weaviate:1.18.0 13 | ports: 14 | - 8080:8080 15 | restart: on-failure:0 16 | environment: 17 | QUERY_DEFAULTS_LIMIT: 25 18 | AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' 19 | PERSISTENCE_DATA_PATH: '/var/lib/weaviate' 20 | DEFAULT_VECTORIZER_MODULE: 'none' 21 | ENABLE_MODULES: '' 22 | CLUSTER_HOSTNAME: 'node1' 23 | LOG_LEVEL: debug 24 | AUTOSCHEMA_ENABLED: 'false' 25 | ... -------------------------------------------------------------------------------- /tests/datastore/providers/zilliz/test_zilliz_datastore.py: -------------------------------------------------------------------------------- 1 | # from pathlib import Path 2 | # from dotenv import find_dotenv, load_dotenv 3 | # env_path = Path(".") / "zilliz.env" 4 | # load_dotenv(dotenv_path=env_path, verbose=True) 5 | 6 | import pytest 7 | 8 | from datastore.providers.zilliz_datastore import ( 9 | ZillizDataStore, 10 | ) 11 | 12 | from datastore.providers.milvus_datastore import ( 13 | EMBEDDING_FIELD, 14 | ) 15 | 16 | # Note: Only do basic test here, the ZillizDataStore is derived from MilvusDataStore. 17 | 18 | 19 | @pytest.fixture 20 | def zilliz_datastore(): 21 | return ZillizDataStore() 22 | 23 | 24 | @pytest.mark.asyncio 25 | async def test_zilliz(zilliz_datastore): 26 | assert True == zilliz_datastore.col.has_index() 27 | index_list = [x.to_dict() for x in zilliz_datastore.col.indexes] 28 | for index in index_list: 29 | if index["index_name"] == EMBEDDING_FIELD: 30 | assert "AUTOINDEX" == index["index_param"]["index_type"] 31 | --------------------------------------------------------------------------------