├── .env.template
├── .github
    └── workflows
    │   └── WebApp.yml
├── .gitignore
├── .vscode
    ├── extensions.json
    ├── launch.json
    ├── settings.json
    └── tasks.json
├── BatchProcess.Dockerfile
├── BatchProcess.Dockerfile.dockerignore
├── LICENSE
├── README.md
├── WebApp.Dockerfile
├── WebApp.Dockerfile.dockerignore
├── code
    ├── .funcignore
    ├── ApiQnA
    │   ├── __init__.py
    │   └── function.json
    ├── BatchPushResults
    │   ├── __init__.py
    │   └── function.json
    ├── BatchStartProcessing
    │   ├── __init__.py
    │   └── function.json
    ├── OpenAI_Queries.py
    ├── embeddings_text.csv
    ├── environment.yml
    ├── host.json
    ├── images
    │   └── microsoft.png
    ├── pages
    │   ├── 00_Chat.py
    │   ├── 01_Add_Document.py
    │   ├── 02_Document_Management.py
    │   ├── 04_Index_Management.py
    │   ├── 10_Sandbox.py
    │   ├── 10_Utils - Document_Summary.py
    │   ├── 11_Utils - Conversation_Data_Extraction.py
    │   └── 12_Utils - Prompt Exploration.py
    ├── requirements.txt
    └── utilities
    │   ├── __init__.py
    │   ├── azureblobstorage.py
    │   ├── azuresearch.py
    │   ├── customprompt.py
    │   ├── formrecognizer.py
    │   ├── helper.py
    │   ├── pgvector.py
    │   ├── redis.py
    │   └── translator.py
├── demo
    ├── .dockerignore
    ├── Dockerfile
    ├── architecture.png
    ├── demo.py
    ├── helper.py
    ├── microsoft.png
    └── requirements.txt
├── docker-compose.yml
├── docs
    ├── architecture.png
    ├── architecture_acre.png
    ├── architecture_acs.png
    ├── architecture_pg.png
    └── architecture_redis.png
└── infrastructure
    ├── deployment.json
    ├── deploymentACRE.json
    ├── deployment_ACS.json
    ├── deployment_azcn.json
    ├── deployment_pg.json
    └── deployment_pg_azcn.json


/.env.template:
--------------------------------------------------------------------------------
 1 | OPENAI_ENGINE=text-davinci-003
 2 | OPENAI_DEPLOYMENT_TYPE=Text
 3 | OPENAI_EMBEDDINGS_ENGINE=text-embedding-ada-002
 4 | OPENAI_EMBEDDINGS_ENGINE_DOC=text-embedding-ada-002
 5 | OPENAI_EMBEDDINGS_ENGINE_QUERY=text-embedding-ada-002
 6 | OPENAI_API_BASE=https://YOUR_AZURE_OPENAI_RESOURCE.openai.azure.com/
 7 | OPENAI_API_KEY=YOUR_AZURE_OPENAI_API_KEY
 8 | OPENAI_TEMPERATURE=0.7
 9 | OPENAI_MAX_TOKENS=-1
10 | AZURE_CLOUD=AzureCloud # AzureCloud or AzureChinaCloud
11 | VECTOR_STORE_TYPE=AzureSearch
12 | AZURE_SEARCH_SERVICE_NAME=YOUR_AZURE_SEARCH_SERVICE_NAME
13 | AZURE_SEARCH_ADMIN_KEY=YOUR_AZURE_SEARCH_ADMIN_KEY
14 | REDIS_ADDRESS=api
15 | REDIS_PORT=6379
16 | REDIS_PASSWORD=redis-stack-password
17 | REDIS_ARGS=--requirepass $REDIS_PASSWORD
18 | BLOB_ACCOUNT_NAME=YOUR_AZURE_BLOB_STORAGE_ACCOUNT_NAME
19 | BLOB_ACCOUNT_KEY=YOUR_AZURE_BLOB_STORAGE_ACCOUNT_KEY
20 | BLOB_CONTAINER_NAME=YOUR_AZURE_BLOB_STORAGE_CONTAINER_NAME
21 | QUEUE_NAME=doc-processing
22 | FORM_RECOGNIZER_ENDPOINT=YOUR_AZURE_FORM_RECOGNIZER_ENDPOINT
23 | FORM_RECOGNIZER_KEY=YOUR_AZURE_FORM_RECOGNIZER_KEY
24 | CHUNK_SIZE=500
25 | CHUNK_OVERLAP=100
26 | TRANSLATE_ENDPOINT=YOUR_AZURE_TRANSLATE_ENDPOINT
27 | TRANSLATE_KEY=YOUR_AZURE_TRANSLATE_KEY
28 | TRANSLATE_REGION=YOUR_AZURE_TRANSLATE_REGION
29 | VNET_DEPLOYMENT=false
30 | NUMBER_OF_EMBEDDINGS_FOR_QNA=3
31 | CONVERT_ADD_EMBEDDINGS_URL=http://batch/api/BatchStartProcessing
32 | AzureWebJobsStorage=AZURE_BLOB_STORAGE_CONNECTION_STRING_FOR_AZURE_FUNCTION_EXECUTION
33 | CHAT_AI_AVATAR_STYLE=thumbs
34 | CHAT_AI_SEED=Lucy
35 | CHAT_USER_AVATAR_STYLE=thumbs
36 | CHAT_USER_SEED=Bubba
37 | 


--------------------------------------------------------------------------------
/.github/workflows/WebApp.yml:
--------------------------------------------------------------------------------
 1 | name: WebApp Docker Image
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 | 
 7 | jobs:
 8 | 
 9 |   build:
10 | 
11 |     runs-on: ubuntu-latest
12 | 
13 |     steps:
14 |     
15 |     - name: Docker Login
16 |       uses: docker/login-action@v2.1.0
17 |       with:
18 |         username: fruocco
19 |         # Password or personal access token used to log against the Docker registry
20 |         password: ${{ secrets.DOCKER_TOKEN }}
21 |     
22 |     - uses: actions/checkout@v3
23 |     - name: Build the Docker image
24 |       run:
25 |         docker build . --file WebApp.Dockerfile --tag fruocco/oai-embeddings:$(date +'%Y-%m-%d')_$GITHUB_RUN_NUMBER;
26 |         docker tag fruocco/oai-embeddings:$(date +'%Y-%m-%d')_$GITHUB_RUN_NUMBER fruocco/oai-embeddings:latest;
27 |         docker push fruocco/oai-embeddings:$(date +'%Y-%m-%d')_$GITHUB_RUN_NUMBER;
28 |         docker push fruocco/oai-embeddings:latest;
29 |         
30 |         docker build . --file BatchProcess.Dockerfile --tag fruocco/oai-batch:$(date +'%Y-%m-%d')_$GITHUB_RUN_NUMBER;
31 |         docker tag fruocco/oai-batch:$(date +'%Y-%m-%d')_$GITHUB_RUN_NUMBER fruocco/oai-batch:latest;
32 |         docker push fruocco/oai-batch:$(date +'%Y-%m-%d')_$GITHUB_RUN_NUMBER;
33 |         docker push fruocco/oai-batch:latest;
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .venv
2 | code/embeddings_text.csv
3 | code/utilities/__pycache__
4 | .env
5 | __pycache__
6 | .vscode
7 | WebApp.Dockerfile
8 | BatchProcess.Dockerfile
9 | .gitignore


--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": [
3 |     "ms-azuretools.vscode-azurefunctions",
4 |     "ms-python.python"
5 |   ]
6 | }


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": "0.2.0",
 3 |     "configurations": [
 4 |         {
 5 |             "name": "Attach to Python Functions",
 6 |             "type": "python",
 7 |             "request": "attach",
 8 |             "port": 9091,
 9 |             "preLaunchTask": "func: host start"
10 |         }
11 |     ]
12 | }


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "azureFunctions.deploySubpath": "code",
3 |     "azureFunctions.scmDoBuildDuringDeployment": true,
4 |     "azureFunctions.pythonVenv": ".venv",
5 |     "azureFunctions.projectLanguage": "Python",
6 |     "azureFunctions.projectRuntime": "~4",
7 |     "debug.internalConsoleOptions": "neverOpen"
8 | }


--------------------------------------------------------------------------------
/.vscode/tasks.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"version": "2.0.0",
 3 | 	"tasks": [
 4 | 		{
 5 | 			"type": "func",
 6 | 			"label": "func: host start",
 7 | 			"command": "host start",
 8 | 			"problemMatcher": "$func-python-watch",
 9 | 			"isBackground": true,
10 | 			"dependsOn": "pip install (functions)",
11 | 			"options": {
12 | 				"cwd": "${workspaceFolder}/code"
13 | 			}
14 | 		},
15 | 		{
16 | 			"label": "pip install (functions)",
17 | 			"type": "shell",
18 | 			"osx": {
19 | 				"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
20 | 			},
21 | 			"windows": {
22 | 				"command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
23 | 			},
24 | 			"linux": {
25 | 				"command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
26 | 			},
27 | 			"problemMatcher": [],
28 | 			"options": {
29 | 				"cwd": "${workspaceFolder}/code"
30 | 			}
31 | 		}
32 | 	]
33 | }


--------------------------------------------------------------------------------
/BatchProcess.Dockerfile:
--------------------------------------------------------------------------------
 1 | # To enable ssh & remote debugging on app service change the base image to the one below
 2 | # FROM mcr.microsoft.com/azure-functions/python:4-python3.9-appservice
 3 | FROM mcr.microsoft.com/azure-functions/python:4-python3.9
 4 | 
 5 | ENV AzureWebJobsScriptRoot=/home/site/wwwroot \
 6 |     AzureFunctionsJobHost__Logging__Console__IsEnabled=true
 7 | 
 8 | COPY ./code/requirements.txt /
 9 | RUN pip install -r /requirements.txt
10 | 
11 | COPY ./code /home/site/wwwroot


--------------------------------------------------------------------------------
/BatchProcess.Dockerfile.dockerignore:
--------------------------------------------------------------------------------
1 | code/__pycache__
2 | code/images
3 | code/pages
4 | code/.funcignore
5 | code/environment.yml
6 | code/host.json
7 | code/local.settings.json
8 | code/OpenAI_Queries.py


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE


--------------------------------------------------------------------------------
/WebApp.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9.10-slim-buster
2 | RUN apt-get update && apt-get install python-tk python3-tk tk-dev -y
3 | COPY ./code/requirements.txt /usr/local/src/myscripts/requirements.txt
4 | WORKDIR /usr/local/src/myscripts
5 | RUN pip install -r requirements.txt
6 | COPY ./code/ /usr/local/src/myscripts
7 | EXPOSE 80
8 | CMD ["streamlit", "run", "OpenAI_Queries.py", "--server.port", "80", "--server.enableXsrfProtection", "false"]


--------------------------------------------------------------------------------
/WebApp.Dockerfile.dockerignore:
--------------------------------------------------------------------------------
1 | code/BatchStartProcessing
2 | code/BatchPushResults
3 | code/Dockerfile
4 | code/__pycache__
5 | code/environment.yml
6 | code/host.json
7 | code/local.settings.json


--------------------------------------------------------------------------------
/code/.funcignore:
--------------------------------------------------------------------------------
1 | .git*
2 | .vscode
3 | __azurite_db*__.json
4 | __blobstorage__
5 | __queuestorage__
6 | local.settings.json
7 | test
8 | .venv


--------------------------------------------------------------------------------
/code/ApiQnA/__init__.py:
--------------------------------------------------------------------------------
 1 | import azure.functions
 2 | from dotenv import load_dotenv
 3 | load_dotenv()
 4 | 
 5 | import os
 6 | from utilities.helper import LLMHelper
 7 | 
 8 | def main(req: azure.functions.HttpRequest) -> str:
 9 |     # Get data from POST request
10 |     try:
11 |         req_body = req.get_json()
12 |     except ValueError:
13 |         pass
14 |     else:
15 |         question = req_body.get('question')
16 |         history = req_body.get('history', [])
17 |         custom_prompt = req_body.get('custom_prompt', "")
18 |         custom_temperature = float(req_body.get('custom_temperature', os.getenv("OPENAI_TEMPERATURE", 0.7)))
19 |     # Create LLMHelper object
20 |     llm_helper = LLMHelper(custom_prompt=custom_prompt, temperature=custom_temperature)
21 |     # Get answer
22 |     data = {}
23 |     data['question'], data['response'], data['context'], data["sources"] = llm_helper.get_semantic_answer_lang_chain(question, history)
24 |     # Return answer
25 |     return f'{data}'


--------------------------------------------------------------------------------
/code/ApiQnA/function.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "scriptFile": "__init__.py",
 3 |     "bindings": [
 4 |         {
 5 |             "authLevel": "anonymous",
 6 |             "type": "httpTrigger",
 7 |             "direction": "in",
 8 |             "name": "req",
 9 |             "methods": [
10 |                 "post"
11 |             ]
12 |         },
13 |         {
14 |             "type": "http",
15 |             "direction": "out",
16 |             "name": "$return"
17 |         }
18 |     ]
19 | }


--------------------------------------------------------------------------------
/code/BatchPushResults/__init__.py:
--------------------------------------------------------------------------------
 1 | import logging, json
 2 | import azure.functions as func
 3 | from utilities.helper import LLMHelper
 4 | 
 5 | def main(msg: func.QueueMessage) -> None:
 6 |     logging.info('Python queue trigger function processed a queue item: %s',
 7 |                  msg.get_body().decode('utf-8'))
 8 | 
 9 |     # Set up LLM Helper
10 |     llm_helper = LLMHelper()
11 |     # Get the file name from the message
12 |     file_name = json.loads(msg.get_body().decode('utf-8'))['filename']
13 |     # Generate the SAS URL for the file
14 |     file_sas = llm_helper.blob_client.get_blob_sas(file_name)
15 | 
16 |     # Check the file extension
17 |     if file_name.endswith('.txt'):
18 |         # Add the text to the embeddings
19 |         llm_helper.add_embeddings_lc(file_sas)
20 |     else:
21 |         # Get OCR with Layout API and then add embeddigns
22 |         llm_helper.convert_file_and_add_embeddings(file_sas , file_name)
23 | 
24 |     llm_helper.blob_client.upsert_blob_metadata(file_name, {'embeddings_added': 'true'})
25 | 


--------------------------------------------------------------------------------
/code/BatchPushResults/function.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "scriptFile": "__init__.py",
 3 |   "bindings": [
 4 |     {
 5 |       "name": "msg",
 6 |       "type": "queueTrigger",
 7 |       "direction": "in",
 8 |       "queueName": "doc-processing",
 9 |       "connection": "AzureWebJobsStorage"
10 |     }
11 |   ]
12 | }


--------------------------------------------------------------------------------
/code/BatchStartProcessing/__init__.py:
--------------------------------------------------------------------------------
 1 | import logging, json, os
 2 | import azure.functions as func
 3 | from azure.storage.queue import QueueClient, BinaryBase64EncodePolicy
 4 | from utilities.helper import LLMHelper
 5 | 
 6 | queue_name = os.environ['QUEUE_NAME']
 7 | 
 8 | def main(req: func.HttpRequest) -> func.HttpResponse:
 9 |     logging.info('Requested to start processing all documents received')
10 |     # Set up LLM Helper
11 |     llm_helper = LLMHelper()
12 |     # Get all files from Blob Storage
13 |     files_data = llm_helper.blob_client.get_all_files()
14 |     # Filter out files that have already been processed
15 |     files_data = list(filter(lambda x : not x['embeddings_added'], files_data)) if req.params.get('process_all') != 'true' else files_data
16 |     files_data = list(map(lambda x: {'filename': x['filename']}, files_data))
17 |     # Create the QueueClient object
18 |     queue_client = QueueClient.from_connection_string(llm_helper.blob_client.connect_str, queue_name, message_encode_policy=BinaryBase64EncodePolicy())
19 |     # Send a message to the queue for each file
20 |     for fd in files_data:
21 |         queue_client.send_message(json.dumps(fd).encode('utf-8'))
22 |  
23 |     return func.HttpResponse(f"Conversion started successfully for {len(files_data)} documents.", status_code=200)


--------------------------------------------------------------------------------
/code/BatchStartProcessing/function.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "scriptFile": "__init__.py",
 3 |   "bindings": [
 4 |     {
 5 |       "authLevel": "anonymous",
 6 |       "type": "httpTrigger",
 7 |       "direction": "in",
 8 |       "name": "req",
 9 |       "methods": [
10 |         "get",
11 |         "post"
12 |       ]
13 |     },
14 |     {
15 |       "type": "http",
16 |       "direction": "out",
17 |       "name": "$return"
18 |     }
19 |   ]
20 | }


--------------------------------------------------------------------------------
/code/OpenAI_Queries.py:
--------------------------------------------------------------------------------
  1 | from dotenv import load_dotenv
  2 | load_dotenv()
  3 | 
  4 | import streamlit as st
  5 | import os
  6 | import traceback
  7 | from utilities.helper import LLMHelper
  8 | import regex as re
  9 | 
 10 | import logging
 11 | logger = logging.getLogger('azure.core.pipeline.policies.http_logging_policy').setLevel(logging.WARNING)
 12 | 
 13 | def check_deployment():
 14 |     # Check if the deployment is working
 15 |     #\ 1. Check if the llm is working
 16 |     try:
 17 |         llm_helper = LLMHelper()
 18 |         llm_helper.get_completion("Generate a joke!")
 19 |         st.success("LLM is working!")
 20 |     except Exception as e:
 21 |         st.error(f"""LLM is not working.  
 22 |             Please check you have a deployment name {llm_helper.deployment_name} in your Azure OpenAI resource {llm_helper.api_base}.  
 23 |             If you are using an Instructions based deployment (text-davinci-003), please check you have an environment variable OPENAI_DEPLOYMENT_TYPE=Text or delete the environment variable OPENAI_DEPLOYMENT_TYPE.  
 24 |             If you are using a Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4), please check you have an environment variable OPENAI_DEPLOYMENT_TYPE=Chat.  
 25 |             Then restart your application.
 26 |             """)
 27 |         st.error(traceback.format_exc())
 28 |     #\ 2. Check if the embedding is working
 29 |     try:
 30 |         llm_helper = LLMHelper()
 31 |         llm_helper.embeddings.embed_documents(texts=["This is a test"])
 32 |         st.success("Embedding is working!")
 33 |     except Exception as e:
 34 |         st.error(f"""Embedding model is not working.  
 35 |             Please check you have a deployment named "text-embedding-ada-002" for "text-embedding-ada-002" model in your Azure OpenAI resource {llm_helper.api_base}.  
 36 |             Then restart your application.
 37 |             """)
 38 |         st.error(traceback.format_exc())
 39 |     #\ 3. Check if the translation is working
 40 |     try:
 41 |         llm_helper = LLMHelper()
 42 |         llm_helper.translator.translate("This is a test", "it")
 43 |         st.success("Translation is working!")
 44 |     except Exception as e:
 45 |         st.error(f"""Translation model is not working.  
 46 |             Please check your Azure Translator key in the App Settings.  
 47 |             Then restart your application.  
 48 |             """)
 49 |         st.error(traceback.format_exc())
 50 |     #\ 4. Check if the VectorStore is working with previous version of data
 51 |     try:
 52 |         llm_helper = LLMHelper()
 53 |         if llm_helper.vector_store_type == "AzureSearch":
 54 |             try:
 55 |                 llm_helper.vector_store.index_exists()
 56 |                 st.success("Azure Cognitive Search is working!")
 57 |             except Exception as e:
 58 |                 st.error("""Azure Cognitive Search is not working.  
 59 |                     Please check your Azure Cognitive Search service name and service key in the App Settings.  
 60 |                     Then restart your application.  
 61 |                     """)
 62 |                 st.error(traceback.format_exc())
 63 |         elif llm_helper.vector_store_type == "PGVector":
 64 |             try:
 65 |                 llm_helper.vector_store.__post_init__()
 66 |                 st.success("PGVector is working!")
 67 |             except Exception as e:
 68 |                 st.error("""PGVector is not working.  
 69 |                     Please check your Azure PostgreSQL server, database, user name and password in the App Settings.
 70 |                     Make sure the network settings(firewall rule) allow your app to access the Azure PostgreSQL service.
 71 |                     Then restart your application.  
 72 |                     """)
 73 |                 st.error(traceback.format_exc())
 74 |         else:
 75 |             if llm_helper.vector_store.check_existing_index("embeddings-index"):
 76 |                 st.warning("""Seems like you're using a Redis with an old data structure.  
 77 |                 If you want to use the new data structure, you can start using the app and go to "Add Document" -> "Add documents in Batch" and click on "Convert all files and add embeddings" to reprocess your documents.  
 78 |                 To remove this working, please delete the index "embeddings-index" from your Redis.  
 79 |                 If you prefer to use the old data structure, please change your Web App container image to point to the docker image: fruocco/oai-embeddings:2023-03-27_25. 
 80 |                 """)
 81 |             else:
 82 |                 st.success("Redis is working!")
 83 |     except Exception as e:
 84 |         st.error(f"""Redis is not working. 
 85 |             Please check your Redis connection string in the App Settings.  
 86 |             Then restart your application.
 87 |             """)
 88 |         st.error(traceback.format_exc())
 89 | 
 90 | 
 91 | def check_variables_in_prompt():
 92 |     # Check if "summaries" is present in the string custom_prompt
 93 |     if "{summaries}" not in st.session_state.custom_prompt:
 94 |         st.warning("""Your custom prompt doesn't contain the variable "{summaries}".  
 95 |         This variable is used to add the content of the documents retrieved from the VectorStore to the prompt.  
 96 |         Please add it to your custom prompt to use the app.  
 97 |         Reverting to default prompt.
 98 |         """)
 99 |         st.session_state.custom_prompt = ""
100 |     if "{question}" not in st.session_state.custom_prompt:
101 |         st.warning("""Your custom prompt doesn't contain the variable "{question}".  
102 |         This variable is used to add the user's question to the prompt.  
103 |         Please add it to your custom prompt to use the app.  
104 |         Reverting to default prompt.  
105 |         """)
106 |         st.session_state.custom_prompt = ""
107 |     
108 | 
# Callback invoked when the user selects a proposed follow-up question
def ask_followup_question(followup_question):
    """Store the chosen follow-up as the asked question and bump the input key.

    Incrementing ``input_message_key`` changes the key under which the text
    input widget is created.
    """
    st.session_state.askedquestion = followup_question
    st.session_state['input_message_key'] += 1
113 | 
def questionAsked():
    """Copy the current text-input widget value into ``askedquestion``."""
    widget_key = "input" + str(st.session_state['input_message_key'])
    st.session_state.askedquestion = st.session_state[widget_key]
116 | 
@st.cache_data()
def get_languages():
    """Return the translator's available languages (cached by Streamlit).

    Uses the module-level ``llm_helper``, so it must be called after that
    helper has been created.
    """
    translator = llm_helper.translator
    return translator.get_available_languages()
120 | 
# Main page script: seeds session state, renders the settings and the question
# box, asks the LLM when a question is pending, then shows the answer, the
# follow-up questions, the sources and an optional translation. Any exception
# is rendered on the page as a traceback.
try:

    default_prompt = ""
    default_question = ""
    default_answer = ""

    # Seed every session-state key the page reads below
    if 'question' not in st.session_state:
        st.session_state['question'] = default_question
    if 'response' not in st.session_state:
        st.session_state['response'] = default_answer
    if 'context' not in st.session_state:
        st.session_state['context'] = ""
    if 'custom_prompt' not in st.session_state:
        st.session_state['custom_prompt'] = ""
    if 'custom_temperature' not in st.session_state:
        st.session_state['custom_temperature'] = float(os.getenv("OPENAI_TEMPERATURE", 0.7))

    if 'sources' not in st.session_state:
        st.session_state['sources'] = ""
    if 'followup_questions' not in st.session_state:
        st.session_state['followup_questions'] = []
    if 'input_message_key' not in st.session_state:
        st.session_state ['input_message_key'] = 1
    if 'askedquestion' not in st.session_state:
        st.session_state.askedquestion = default_question

    # Set page layout to wide screen and menu item
    menu_items = {
        'Get help': None,
        'Report a bug': None,
        # Escaped form of the original triple-quoted text (tabs included)
        'About': "\n\t ## Embeddings App\n\t Embedding testing application.\n\t",
    }
    st.set_page_config(layout="wide", menu_items=menu_items)

    llm_helper = LLMHelper(custom_prompt=st.session_state.custom_prompt, temperature=st.session_state.custom_temperature)

    # Get available languages for translation
    available_languages = get_languages()

    # Custom prompt variables
    custom_prompt_placeholder = """{summaries}  
    Please reply to the question using only the text above.  
    Question: {question}  
    Answer:"""
    custom_prompt_help = """You can configure a custom prompt by adding the variables {summaries} and {question} to the prompt.  
    {summaries} will be replaced with the content of the documents retrieved from the VectorStore.  
    {question} will be replaced with the user's question.
        """

    col1, col2, col3 = st.columns([1,2,1])
    with col1:
        st.image(os.path.join('images','microsoft.png'))

    col1, col2, col3 = st.columns([2,2,2])
    with col1:
        st.button("Check deployment", on_click=check_deployment)
    with col3:
        with st.expander("Settings"):
            # model = st.selectbox(
            #     "OpenAI GPT-3 Model",
            #     [os.environ['OPENAI_ENGINE']]
            # )
            # st.tokens_response = st.slider("Tokens response length", 100, 500, 400)
            st.slider("Temperature", min_value=0.0, max_value=1.0, step=0.1, key='custom_temperature')
            st.text_area("Custom Prompt", key='custom_prompt', on_change=check_variables_in_prompt, placeholder= custom_prompt_placeholder,help=custom_prompt_help, height=150)
            st.selectbox("Language", [None] + list(available_languages.keys()), key='translation_language')


    question = st.text_input("Azure OpenAI Semantic Answer", value=st.session_state['askedquestion'], key="input"+str(st.session_state ['input_message_key']), on_change=questionAsked)

    # Answer the question if any
    if st.session_state.askedquestion != '':
        st.session_state['question'] = st.session_state.askedquestion
        # Clear the pending question so it is answered only once
        st.session_state.askedquestion = ""
        st.session_state['question'], \
        st.session_state['response'], \
        st.session_state['context'], \
        st.session_state['sources'] = llm_helper.get_semantic_answer_lang_chain(st.session_state['question'], [])
        st.session_state['response'], followup_questions_list = llm_helper.extract_followupquestions(st.session_state['response'])
        st.session_state['followup_questions'] = followup_questions_list

    sourceList = []

    # Display the sources and context - even if the page is reloaded
    if st.session_state['sources'] or st.session_state['context']:
        st.session_state['response'], sourceList, matchedSourcesList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['response'], st.session_state['sources'])
        st.write("<br>", unsafe_allow_html=True)
        st.markdown("Answer: " + st.session_state['response'])

    # Display proposed follow-up questions which can be clicked on to ask that question automatically
    if len(st.session_state['followup_questions']) > 0:
        st.write("<br>", unsafe_allow_html=True)
        st.markdown('**Proposed follow-up questions:**')
    with st.container():
        for questionId, followup_question in enumerate(st.session_state['followup_questions']):
            if followup_question:
                # Button label: the question with unescaped single quotes backslash-escaped
                str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question)
                st.button(str_followup_question, key=1000+questionId, on_click=ask_followup_question, args=(followup_question, ))

    if st.session_state['sources'] or st.session_state['context']:
        # Buttons to display the context used to answer
        st.write("<br>", unsafe_allow_html=True)
        st.markdown('**Document sources:**')
        for id in range(len(sourceList)):
            st.markdown(f"[{id+1}] {sourceList[id]}")

        # Details on the question and answer context
        st.write("<br><br>", unsafe_allow_html=True)
        with st.expander("Question and Answer Context"):
            context = st.session_state['context']
            # The context is seeded as "" above, while the loop below needs a
            # mapping; skip every empty value ("" / None / [] / {}) rather
            # than calling .keys() on the empty-string default.
            if context:
                for content_source in context.keys():
                    st.markdown(f"#### {content_source}")
                    for context_text in context[content_source]:
                        st.markdown(f"{context_text}")

            st.markdown(f"SOURCES: {st.session_state['sources']}")

    # Optional translation of the answer into the language picked in Settings
    if st.session_state['translation_language']:
        st.write(f"Translation to other languages, 翻译成其他语言, النص باللغة العربية")
        st.write(f"{llm_helper.translator.translate(st.session_state['response'], available_languages[st.session_state['translation_language']])}")

except Exception:
    st.error(traceback.format_exc())
251 | 


--------------------------------------------------------------------------------
/code/embeddings_text.csv:
--------------------------------------------------------------------------------
1 | text,davinci_search


--------------------------------------------------------------------------------
/code/environment.yml:
--------------------------------------------------------------------------------
1 | name: openai-qna-env
2 | channels:
3 |   - conda-forge
4 |   - defaults
5 | dependencies:
6 |   - python=3.10
7 |   - pip
8 |   - pip:
9 |     - -r requirements.txt


--------------------------------------------------------------------------------
/code/host.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "version": "2.0",
 3 |   "logging": {
 4 |     "applicationInsights": {
 5 |       "samplingSettings": {
 6 |         "isEnabled": true,
 7 |         "excludedTypes": "Request"
 8 |       }
 9 |     }
10 |   },
11 |   "extensionBundle": {
12 |     "id": "Microsoft.Azure.Functions.ExtensionBundle",
13 |     "version": "[3.*, 4.0.0)"
14 |   }
15 | }


--------------------------------------------------------------------------------
/code/images/microsoft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-open-ai-embeddings-qna/96ce23acbe613a8031a5f7e5b02cc87d17a512c4/code/images/microsoft.png


--------------------------------------------------------------------------------
/code/pages/00_Chat.py:
--------------------------------------------------------------------------------
import os
import traceback
from random import randint

import regex as re
import streamlit as st
from streamlit_chat import message

from utilities.helper import LLMHelper
 7 | 
 8 | def clear_chat_data():
 9 |     st.session_state['chat_history'] = []
10 |     st.session_state['chat_source_documents'] = []
11 |     st.session_state['chat_askedquestion'] = ''
12 |     st.session_state['chat_question'] = ''
13 |     st.session_state['chat_followup_questions'] = []
14 |     answer_with_citations = ""
15 | 
def questionAsked():
    """Move the text typed in the current input widget into the chat state.

    The widget key is "input" followed by the current 'input_message_key'
    counter. Its value is stored as the asked question, the widget value is
    blanked, and the asked question is mirrored into 'chat_question'.
    """
    state = st.session_state
    widget_key = f"input{state['input_message_key']}"
    state.chat_askedquestion = state[widget_key]
    state[widget_key] = ""
    state.chat_question = state.chat_askedquestion
20 | 
# Callback run when the user selects one of the proposed follow-up questions
def ask_followup_question(followup_question):
    """Queue ``followup_question`` as the asked question and bump the input-widget key counter."""
    state = st.session_state
    state.chat_askedquestion = followup_question
    state['input_message_key'] += 1
25 | 
try:
    # Seed every chat-related session-state entry on the first run so the rest
    # of the script can read them unconditionally.
    if 'chat_question' not in st.session_state:
        st.session_state['chat_question'] = ''
    if 'chat_askedquestion' not in st.session_state:
        st.session_state.chat_askedquestion = ''
    if 'chat_history' not in st.session_state:
        st.session_state['chat_history'] = []
    if 'chat_source_documents' not in st.session_state:
        st.session_state['chat_source_documents'] = []
    if 'chat_followup_questions' not in st.session_state:
        st.session_state['chat_followup_questions'] = []
    if 'input_message_key' not in st.session_state:
        st.session_state['input_message_key'] = 1

    # Initialize Chat Icons (overridable through environment variables)
    ai_avatar_style = os.getenv("CHAT_AI_AVATAR_STYLE", "thumbs")
    ai_seed = os.getenv("CHAT_AI_SEED", "Lucy")
    user_avatar_style = os.getenv("CHAT_USER_AVATAR_STYLE", "thumbs")
    user_seed = os.getenv("CHAT_USER_SEED", "Bubba")

    llm_helper = LLMHelper()

    # Chat controls. The text-input key embeds 'input_message_key', so it
    # changes whenever ask_followup_question increments that counter.
    clear_chat = st.button("Clear chat", key="clear_chat", on_click=clear_chat_data)
    input_text = st.text_input("You: ", placeholder="type your question", key="input"+str(st.session_state['input_message_key']), on_change=questionAsked)

    # If a question is asked execute the request to get the result, context, sources and up to 3 follow-up questions proposals
    if st.session_state.chat_askedquestion:
        st.session_state['chat_question'] = st.session_state.chat_askedquestion
        # Consume the pending question so it is not asked again on the next rerun
        st.session_state.chat_askedquestion = ""
        st.session_state['chat_question'], result, context, sources = llm_helper.get_semantic_answer_lang_chain(st.session_state['chat_question'], st.session_state['chat_history'])
        result, chat_followup_questions_list = llm_helper.extract_followupquestions(result)
        st.session_state['chat_history'].append((st.session_state['chat_question'], result))
        st.session_state['chat_source_documents'].append(sources)
        st.session_state['chat_followup_questions'] = chat_followup_questions_list

    # Displays the chat history, newest entry first
    if st.session_state['chat_history']:
        latest_index = len(st.session_state['chat_history']) - 1
        for i in range(latest_index, -1, -1):

            # This history entry is the latest one - also show follow-up questions, buttons to access source(s) context(s)
            if i == latest_index:
                answer_with_citations, sourceList, matchedSourcesList, linkList, filenameList = llm_helper.get_links_filenames(st.session_state['chat_history'][i][1], st.session_state['chat_source_documents'][i])
                # Keep the question, replace the stored answer by the version returned by get_links_filenames
                st.session_state['chat_history'][i] = st.session_state['chat_history'][i][:1] + (answer_with_citations,)

                # Display proposed follow-up questions which can be clicked on to ask that question automatically
                if len(st.session_state['chat_followup_questions']) > 0:
                    st.markdown('**Proposed follow-up questions:**')
                with st.container():
                    for questionId, followup_question in enumerate(st.session_state['chat_followup_questions']):
                        if followup_question:
                            # Backslash-escape single quotes that are not already escaped
                            str_followup_question = re.sub(r"(^|[^\\\\])'", r"\1\\'", followup_question)
                            # Deterministic key: random keys could collide and raise a duplicate-widget error
                            st.button(str_followup_question, key=f"followup_question_{questionId}", on_click=ask_followup_question, args=(followup_question, ))

            # Rewrite "$^{...}$" markers as "(...)"
            answer_with_citations = re.sub(r'\$\^\{(.*?)\}\$', r'(\1)', st.session_state['chat_history'][i][1]) # message() does not get Latex nor html
            message(answer_with_citations ,key=str(i)+'answers', avatar_style=ai_avatar_style, seed=ai_seed)
            st.markdown(f'\n\nSources: {st.session_state["chat_source_documents"][i]}')
            message(st.session_state['chat_history'][i][0], is_user=True, key=str(i)+'user' + '_user', avatar_style=user_avatar_style, seed=user_seed)

except Exception:
    # Show the full traceback in the page instead of crashing the script
    st.error(traceback.format_exc())
97 | 


--------------------------------------------------------------------------------
/code/pages/01_Add_Document.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import os, json, re, io
  3 | from os import path
  4 | import requests
  5 | import mimetypes
  6 | import traceback
  7 | import chardet
  8 | from utilities.helper import LLMHelper
  9 | import uuid
 10 | from redis.exceptions import ResponseError 
 11 | from urllib import parse
 12 |     
 13 | def upload_text_and_embeddings():
 14 |     file_name = f"{uuid.uuid4()}.txt"
 15 |     source_url = llm_helper.blob_client.upload_file(st.session_state['doc_text'], file_name=file_name, content_type='text/plain; charset=utf-8')
 16 |     llm_helper.add_embeddings_lc(source_url) 
 17 |     st.success("Embeddings added successfully.")
 18 | 
 19 | def remote_convert_files_and_add_embeddings(process_all=False):
 20 |     url = os.getenv('CONVERT_ADD_EMBEDDINGS_URL')
 21 |     if process_all:
 22 |         url = f"{url}?process_all=true"
 23 |     try:
 24 |         response = requests.post(url)
 25 |         if response.status_code == 200:
 26 |             st.success(f"{response.text}\nPlease note this is an asynchronous process and may take a few minutes to complete.")
 27 |         else:
 28 |             st.error(f"Error: {response.text}")
 29 |     except Exception as e:
 30 |         st.error(traceback.format_exc())
 31 | 
def delete_row():
    """Delete the document identified by ``st.session_state['data_to_drop']``.

    NOTE(review): ``redisembeddings`` is neither defined nor imported in this
    file and nothing in this file calls ``delete_row``; as written the call
    below would presumably raise NameError. Confirm whether this is dead code.
    """
    # Bare expression statement: the value is read but not used here.
    st.session_state['data_to_drop'] 
    redisembeddings.delete_document(st.session_state['data_to_drop'])
 35 | 
 36 | def add_urls():
 37 |     urls = st.session_state['urls'].split('\n')
 38 |     for url in urls:
 39 |         if url:
 40 |             llm_helper.add_embeddings_lc(url)
 41 |             st.success(f"Embeddings added successfully for {url}")
 42 | 
 43 | def upload_file(bytes_data: bytes, file_name: str):
 44 |     # Upload a new file
 45 |     st.session_state['filename'] = file_name
 46 |     content_type = mimetypes.MimeTypes().guess_type(file_name)[0]
 47 |     charset = f"; charset={chardet.detect(bytes_data)['encoding']}" if content_type == 'text/plain' else ''
 48 |     st.session_state['file_url'] = llm_helper.blob_client.upload_file(bytes_data, st.session_state['filename'], content_type=content_type+charset)
 49 | 
 50 | 
try:
    # Set page layout to wide screen and menu item
    menu_items = {
	'Get help': None,
	'Report a bug': None,
	'About': '''
	 ## Embeddings App
	 Embedding testing application.
	'''
    }
    st.set_page_config(layout="wide", menu_items=menu_items)

    llm_helper = LLMHelper()

    # --- Section 1: upload one document and add its embeddings in this run ---
    with st.expander("Add a single document to the knowledge base", expanded=True):
        st.write("For heavy or long PDF, please use the 'Add documents in batch' option below.")
        st.checkbox("Translate document to English", key="translate")
        uploaded_file = st.file_uploader("Upload a document to add it to the knowledge base", type=['pdf','jpeg','jpg','png', 'txt'])
        if uploaded_file is not None:
            # To read file as bytes:
            bytes_data = uploaded_file.getvalue()

            # upload_file() stores the name in session_state['filename'], so a file
            # whose name matches the stored one is not processed again.
            if st.session_state.get('filename', '') != uploaded_file.name:
                upload_file(bytes_data, uploaded_file.name)
                converted_filename = ''
                if uploaded_file.name.endswith('.txt'):
                    # Add the text to the embeddings
                    llm_helper.add_embeddings_lc(st.session_state['file_url'])

                else:
                    # Convert the file and add its embeddings; the helper returns the converted file name
                    converted_filename = llm_helper.convert_file_and_add_embeddings(st.session_state['file_url'], st.session_state['filename'], st.session_state['translate'])

                # Record the processing state on the blob; converted_filename stays '' for .txt files
                llm_helper.blob_client.upsert_blob_metadata(uploaded_file.name, {'converted': 'true', 'embeddings_added': 'true', 'converted_filename': parse.quote(converted_filename)})
                st.success(f"File {uploaded_file.name} embeddings added to the knowledge base.")

            # pdf_display = f'<iframe src="{st.session_state["file_url"]}" width="700" height="1000" type="application/pdf"></iframe>'

    # --- Section 2: free text typed by the user ---
    with st.expander("Add text to the knowledge base", expanded=False):
        col1, col2 = st.columns([3,1])
        with col1: 
            st.session_state['doc_text'] = st.text_area("Add a new text content and them click on 'Compute Embeddings'", height=600)

        with col2:
            # Disabled select box with a single entry: the value of get_embeddings_model()['doc']
            st.session_state['embeddings_model'] = st.selectbox('Embeddings models', [llm_helper.get_embeddings_model()['doc']], disabled=True)
            st.button("Compute Embeddings", on_click=upload_text_and_embeddings)

    # --- Section 3: upload several files, then trigger the remote conversion endpoint ---
    with st.expander("Add documents in Batch", expanded=False):
        uploaded_files = st.file_uploader("Upload a document to add it to the Azure Storage Account", type=['pdf','jpeg','jpg','png', 'txt'], accept_multiple_files=True)
        if uploaded_files is not None:
            for up in uploaded_files:
                # To read file as bytes:
                bytes_data = up.getvalue()

                # NOTE(review): session_state['filename'] holds only the most recently uploaded name,
                # so with several files this check looks like it only skips that one — confirm intent.
                if st.session_state.get('filename', '') != up.name:
                    # Upload a new file
                    upload_file(bytes_data, up.name)
                    if up.name.endswith('.txt'):
                        # Flag the .txt blob as converted in its metadata (no embeddings are added here)
                        llm_helper.blob_client.upsert_blob_metadata(up.name, {'converted': "true"})

        col1, col2, col3 = st.columns([2,2,2])
        with col1:
            st.button("Convert new files and add embeddings", on_click=remote_convert_files_and_add_embeddings)
        with col3:
            # args=(True,) calls the callback with process_all=True
            st.button("Convert all files and add embeddings", on_click=remote_convert_files_and_add_embeddings, args=(True,))

    # --- Section 4: newline-separated URLs ---
    with st.expander("Add URLs to the knowledge base", expanded=True):
        col1, col2 = st.columns([3,1])
        with col1: 
            st.session_state['urls'] = st.text_area("Add a URLs and than click on 'Compute Embeddings'", placeholder="PLACE YOUR URLS HERE SEPARATED BY A NEW LINE", height=100)

        with col2:
            st.selectbox('Embeddings models', [llm_helper.get_embeddings_model()['doc']], disabled=True, key="embeddings_model_url")
            st.button("Compute Embeddings", on_click=add_urls, key="add_url")

    # --- Section 5: table of what is currently stored ---
    with st.expander("View documents in the knowledge base", expanded=False):
        # Query RediSearch to get all the embeddings
        try:
            data = llm_helper.get_all_documents(k=1000)
            if len(data) == 0:
                st.warning("No embeddings found. Copy paste your data in the text input and click on 'Compute Embeddings' or drag-and-drop documents.")
            else:
                st.dataframe(data, use_container_width=True)
        except Exception as e:
            # A redis ResponseError is reported as "nothing stored yet"; anything else shows the traceback
            if isinstance(e, ResponseError):
                st.warning("No embeddings found. Copy paste your data in the text input and click on 'Compute Embeddings' or drag-and-drop documents.")
            else:
                st.error(traceback.format_exc())


except Exception as e:
    # Show the full traceback in the page instead of crashing the script
    st.error(traceback.format_exc())
144 | 


--------------------------------------------------------------------------------
/code/pages/02_Document_Management.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import os
  3 | import traceback
  4 | from utilities.helper import LLMHelper
  5 | import streamlit.components.v1 as components
  6 | from urllib import parse
  7 | 
  8 | def delete_embeddings_of_file(file_to_delete):
  9 |     # Query RediSearch to get all the embeddings - lazy loading
 10 |     if 'data_files_embeddings' not in st.session_state:
 11 |         st.session_state['data_files_embeddings'] = llm_helper.get_all_documents(k=1000)
 12 | 
 13 |     if st.session_state['data_files_embeddings'].shape[0] == 0:
 14 |         return
 15 | 
 16 |     for converted_file_extension in ['.txt']:
 17 |         file_to_delete = 'converted/' + file_to_delete + converted_file_extension
 18 | 
 19 |         embeddings_to_delete = st.session_state['data_files_embeddings'][st.session_state['data_files_embeddings']['filename'] == file_to_delete]['key'].tolist()
 20 |         embeddings_to_delete = list(map(lambda x: f"{x}", embeddings_to_delete))
 21 |         if len(embeddings_to_delete) > 0:
 22 |             llm_helper.vector_store.delete_keys(embeddings_to_delete)
 23 |             # remove all embeddings lines for the filename from session state
 24 |             st.session_state['data_files_embeddings'] = st.session_state['data_files_embeddings'].drop(st.session_state['data_files_embeddings'][st.session_state['data_files_embeddings']['filename'] == file_to_delete].index)
 25 | 
 26 | def delete_file_and_embeddings(filename=''):
 27 |     # Query RediSearch to get all the embeddings - lazy loading
 28 |     if 'data_files_embeddings' not in st.session_state:
 29 |         st.session_state['data_files_embeddings'] = llm_helper.get_all_documents(k=1000)
 30 | 
 31 |     if filename == '':
 32 |         filename = st.session_state['file_and_embeddings_to_drop'] # get the current selected filename
 33 |     
 34 |     file_dict = next((d for d in st.session_state['data_files'] if d['filename'] == filename), None)
 35 | 
 36 |     if len(file_dict) > 0:
 37 |         # delete source file
 38 |         source_file = file_dict['filename']
 39 |         try:
 40 |             llm_helper.blob_client.delete_file(source_file)
 41 |         except Exception as e:
 42 |             st.error(f"Error deleting file: {source_file} - {e}")
 43 | 
 44 |         # delete converted file
 45 |         if file_dict['converted']:
 46 |             converted_file = 'converted/' + file_dict['filename'] + '.txt'
 47 |             try:
 48 |                 llm_helper.blob_client.delete_file(converted_file)
 49 |             except Exception as e:
 50 |                 st.error(f"Error deleting file : {converted_file} - {e}")
 51 | 
 52 |         # delete embeddings
 53 |         if file_dict['embeddings_added']:
 54 |             delete_embeddings_of_file(parse.quote(filename))
 55 |     
 56 |     # update the list of filenames to remove the deleted filename
 57 |     st.session_state['data_files'] = [d for d in st.session_state['data_files'] if d['filename'] != '{filename}']
 58 | 
 59 | 
 60 | def delete_all_files_and_embeddings():
 61 |     files_list = st.session_state['data_files']
 62 |     for filename_dict in files_list:
 63 |         delete_file_and_embeddings(filename_dict['filename'])
 64 | 
try:
    # Set page layout to wide screen and menu item
    menu_items = {
	'Get help': None,
	'Report a bug': None,
	'About': '''
	 ## Embeddings App

	Document Reader Sample Demo.
	'''
    }
    st.set_page_config(layout="wide", menu_items=menu_items)

    # Inject CSS that hides the elements matched by #MainMenu and footer
    hide_streamlit_style = """
                <style>
                #MainMenu {visibility: hidden;}
                footer {visibility: hidden;}
                </style>
                """
    st.markdown(hide_streamlit_style, unsafe_allow_html=True) 

    llm_helper = LLMHelper()


    # Reload the file list and the embeddings table on every run of the script
    st.session_state['data_files'] = llm_helper.blob_client.get_all_files()
    st.session_state['data_files_embeddings'] = llm_helper.get_all_documents(k=1000)

    if len(st.session_state['data_files']) == 0:
        st.warning("No files found. Go to the 'Add Document' tab to insert your docs.")

    else:
        st.dataframe(st.session_state['data_files'], use_container_width=True)

        # Empty text elements, used as vertical spacing
        st.text("")
        st.text("")
        st.text("")

        filenames_list = [d['filename'] for d in st.session_state['data_files']]
        # The selection is stored under 'file_and_embeddings_to_drop', which
        # delete_file_and_embeddings reads when called without an argument
        st.selectbox("Select filename to delete", filenames_list, key="file_and_embeddings_to_drop")

        st.text("")
        st.button("Delete file and its embeddings", on_click=delete_file_and_embeddings)
        st.text("")
        st.text("")

        # The bulk-delete button is only offered when more than one file is listed
        if len(st.session_state['data_files']) > 1:
            st.button("Delete all files (with their embeddings)", type="secondary", on_click=delete_all_files_and_embeddings, args=None, kwargs=None)

except Exception as e:
    # Show the full traceback in the page instead of crashing the script
    st.error(traceback.format_exc())
115 | 


--------------------------------------------------------------------------------
/code/pages/04_Index_Management.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import os
 3 | import traceback
 4 | from utilities.helper import LLMHelper
 5 | 
 6 | def delete_embedding():
 7 |     llm_helper.vector_store.delete_keys([f"{st.session_state['embedding_to_drop']}"])
 8 |     if 'data_embeddings' in st.session_state:
 9 |         del st.session_state['data_embeddings'] 
10 | 
def delete_file_embeddings():
    """Delete all embeddings whose ``filename`` equals ``st.session_state['file_to_drop']``.

    Does nothing when the embeddings table in ``st.session_state['data_embeddings']``
    is empty or has no row for that file. Otherwise the matching keys are sent
    to ``llm_helper.vector_store.delete_keys`` and the rows are dropped from
    the table kept in the session state.
    """
    embeddings = st.session_state['data_embeddings']
    if embeddings.shape[0] == 0:
        return

    target_file = st.session_state['file_to_drop']
    matching_rows = embeddings[embeddings['filename'] == target_file]
    keys_to_delete = [f"{key}" for key in matching_rows['key'].tolist()]
    if not keys_to_delete:
        return

    llm_helper.vector_store.delete_keys(keys_to_delete)
    # remove all embeddings lines for the filename from session state
    st.session_state['data_embeddings'] = embeddings.drop(matching_rows.index)
20 | 
def delete_all():
    """Send every key of ``st.session_state['data_embeddings']``, as strings, to ``delete_keys``."""
    all_keys = [f"{key}" for key in st.session_state['data_embeddings'].key.tolist()]
    llm_helper.vector_store.delete_keys(all_keys)
25 | 
26 | 
27 | 
try:
    # Set page layout to wide screen and menu item
    menu_items = {
	'Get help': None,
	'Report a bug': None,
	'About': '''
	 ## Embeddings App
	 Embedding testing application.
	'''
    }
    st.set_page_config(layout="wide", menu_items=menu_items)

    llm_helper = LLMHelper()

    # Query RediSearch to get all the embeddings
    st.session_state['data_embeddings'] = llm_helper.get_all_documents(k=1000)

    nb_embeddings = len(st.session_state['data_embeddings'])

    if nb_embeddings == 0:
        st.warning("No embeddings found. Go to the 'Add Document' tab to insert your docs.")
    else:
        st.dataframe(st.session_state['data_embeddings'], use_container_width=True)
        # Empty text elements, used as vertical spacing
        st.text("")
        st.text("")
        # Offer the whole table as a UTF-8 encoded CSV download (index column omitted)
        st.download_button("Download data", st.session_state['data_embeddings'].to_csv(index=False).encode('utf-8'), "embeddings.csv", "text/csv", key='download-embeddings')

        st.text("")
        st.text("")
        col1, col2, col3 = st.columns([3,1,3])
        with col1:
            # The selection is stored under 'embedding_to_drop', read by delete_embedding
            st.selectbox("Embedding id to delete", st.session_state['data_embeddings'].get('key',[]), key="embedding_to_drop")
            st.text("")
            st.button("Delete embedding", on_click=delete_embedding)
        with col2:
            st.text("")
        with col3:
            # set() removes duplicate file names; the selection is stored under 'file_to_drop'
            st.selectbox("File name to delete its embeddings", set(st.session_state['data_embeddings'].get('filename',[])), key="file_to_drop")
            st.text("")
            st.button("Delete file embeddings", on_click=delete_file_embeddings)

        st.text("")
        st.text("")
        st.button("Delete all embeddings", type="secondary", on_click=delete_all)

except Exception as e:
    # Show the full traceback in the page instead of crashing the script
    st.error(traceback.format_exc())
75 | 


--------------------------------------------------------------------------------
/code/pages/10_Sandbox.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import traceback
 3 | from utilities.helper import LLMHelper
 4 | 
 5 | def clear_summary():
 6 |     st.session_state['summary'] = ""
 7 | 
 8 | def get_custom_prompt():
 9 |     customtext = st.session_state['customtext']
10 |     customprompt = "{}".format(customtext)
11 |     return customprompt
12 | 
def customcompletion():
    """Request a completion (``max_tokens=500``) for the custom prompt and store it in ``st.session_state['result']``."""
    completion = llm_helper.get_completion(get_custom_prompt(), max_tokens=500)
    # Same encode/decode round-trip as before, written as two steps
    encoded = completion.encode()
    st.session_state['result'] = encoded.decode()
16 | 
try:
    # Set page layout to wide screen and menu item
    menu_items = {
    'Get help': None,
    'Report a bug': None,
    'About': '''
     ## Embeddings App
     Embedding testing application.
    '''
    }
    st.set_page_config(layout="wide", menu_items=menu_items)

    st.markdown("## Bring your own prompt")

    llm_helper = LLMHelper()

    # displaying a box for a custom prompt, pre-filled with an example prompt;
    # its content is stored under 'customtext', which get_custom_prompt reads
    st.session_state['customtext'] = st.text_area(label="Prompt",value='Legal clause: The Company and the Founders will provide the Investors with customary representations and warranties examples of which are set out in Appendix 4 and the Founders will provide the Investors with customary non-competition, non-solicitation and confidentiality undertakings.\n \n Plain English: The company and its founders will provide the usual assurances and guarantees on facts about the business. The founders will also agree not to work for competitors, poach employees or customers when they leave the company, and respect confidentiality. \n \n Legal clause: In the event of an initial public offering of the Companys shares on a US stock \n exchange the Investors shall be entitled to registration rights customary in transactions of this type (including two demand rights and unlimited shelf and piggy-back rights), with the expenses paid by the Company. \n \n Plain English: If the Company does an IPO in the USA, investors have the usual rights to include \n their shares in the public offering and the costs of doing this will be covered by the Company. \n \n Legal clause: Upon liquidation of the Company, the Series A Shareholders will receive in preference to all other shareholders an amount in respect of each Series A Share equal to one times the Original Issue Price (the "Liquidation Preference"), plus all accrued but unpaid dividends. To the extent that the Company has assets remaining after the distribution of that amount, the Series A Shareholders will participate with the holders of Ordinary Shares pro rata to the number of shares held on an as converted basis. \n \n Plain English:', height=400)
    st.button(label="Test with your own prompt", on_click=customcompletion)
    # display the completion stored under 'result' by customcompletion, or an empty box if there is none yet
    result = ""
    if 'result' in st.session_state:
        result = st.session_state['result']
    st.text_area(label="OpenAI result", value=result, height=200)

except Exception as e:
    # Show the full traceback in the page instead of crashing the script
    st.error(traceback.format_exc())
44 | 


--------------------------------------------------------------------------------
/code/pages/10_Utils - Document_Summary.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | from utilities.helper import LLMHelper
 3 | import os
 4 | import traceback
 5 | 
 6 | def summarize():
 7 |     response = llm_helper.get_completion(get_prompt())
 8 |     st.session_state['summary'] = response
 9 | 
def clear_summary():
    """Blank the 'summary' entry of the Streamlit session state."""
    empty_summary = ""
    st.session_state['summary'] = empty_summary
12 | 
def get_prompt():
    """Build the summarization prompt for the current text and summary type.

    Reads the text from ``st.session_state['text']`` and the summary type from
    the module-level ``summary_type``. When the text is ``None`` or empty, the
    literal ``{}`` is inserted in its place.

    Returns:
        The prompt string for the selected summary type.

    Raises:
        ValueError: if ``summary_type`` is not one of the supported types
            (previously this surfaced as an UnboundLocalError).
    """
    text = st.session_state['text']
    if text is None or text == '':
        text = '{}'

    # One template per supported summary type; "{}" is replaced by the text
    templates = {
        "Basic Summary": "Summarize the following text:\n\n{}\n\nSummary:",
        "Bullet Points": "Summarize the following text into bullet points:\n\n{}\n\nSummary:",
        "Explain it to a second grader": "Explain the following text to a second grader:\n\n{}\n\nSummary:",
    }
    if summary_type not in templates:
        raise ValueError(f"Unsupported summary type: {summary_type!r}")

    return templates[summary_type].format(text)
25 | 
try:
    # Set page layout to wide screen and menu item
    menu_items = {
    'Get help': None,
    'Report a bug': None,
    'About': '''
     ## Embeddings App
     Embedding testing application.
    '''
    }
    st.set_page_config(layout="wide", menu_items=menu_items)

    llm_helper = LLMHelper()

    st.markdown("## Summarization")
    # radio buttons for summary type; get_prompt() reads this module-level variable
    summary_type = st.radio(
        "Select a type of summarization",
        ["Basic Summary", "Bullet Points", "Explain it to a second grader"],
        key="visibility"
    )
    # text area for user to input text, pre-filled with an example paragraph
    st.session_state['text'] = st.text_area(label="Enter some text to summarize",value='A neutron star is the collapsed core of a massive supergiant star, which had a total mass of between 10 and 25 solar masses, possibly more if the star was especially metal-rich.[1] Neutron stars are the smallest and densest stellar objects, excluding black holes and hypothetical white holes, quark stars, and strange stars.[2] Neutron stars have a radius on the order of 10 kilometres (6.2 mi) and a mass of about 1.4 solar masses.[3] They result from the supernova explosion of a massive star, combined with gravitational collapse, that compresses the core past white dwarf star density to that of atomic nuclei.', height=200)
    st.button(label="Summarize", on_click=summarize)

    # if summary doesn't exist in the state, make it an empty string
    summary = ""
    if 'summary' in st.session_state:
        summary = st.session_state['summary']

    # displaying the summary
    st.text_area(label="Summary result", value=summary, height=200)
    st.button(label="Clear summary", on_click=clear_summary)

    # displaying the prompt built by get_prompt() for the current text and summary type
    st.text_area(label="Prompt",value=get_prompt(), height=400)
    # NOTE(review): this button has no on_click callback and its return value is unused,
    # and the prompt text area above is not stored anywhere — confirm the intended behaviour.
    st.button(label="Summarize with updated prompt")

except Exception as e:
    # Show the full traceback in the page instead of crashing the script
    st.error(traceback.format_exc())
66 | 


--------------------------------------------------------------------------------
/code/pages/11_Utils - Conversation_Data_Extraction.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import os
 3 | import traceback
 4 | from utilities.helper import LLMHelper
 5 | 
 6 | def clear_summary():
 7 |     st.session_state['summary'] = ""
 8 | 
 9 | def get_custom_prompt():
10 |     customtext = st.session_state['customtext']
11 |     customprompt = "{}".format(customtext)
12 |     return customprompt
13 | 
def customcompletion():
    """Request a completion for the custom prompt and store it in ``st.session_state['conv_result']``."""
    completion = llm_helper.get_completion(get_custom_prompt())
    # Same encode/decode round-trip as before, written as two steps
    encoded = completion.encode()
    st.session_state['conv_result'] = encoded.decode()
17 | 
# Page body: Streamlit re-executes this script on every interaction, so the
# widgets below are created in the order they appear on the page. Any failure
# is rendered on the page as a traceback instead of crashing the app.
try:
    # Set page layout to wide screen and menu item
    menu_items = {
    'Get help': None,
    'Report a bug': None,
    'About': '''
     ## Embeddings App
     Embedding testing application.
    '''
    }
    st.set_page_config(layout="wide", menu_items=menu_items)

    # Module-level helper; customcompletion() reads it when the button is clicked.
    llm_helper = LLMHelper()

    st.markdown("## Conversation data extraction")

    # Default prompt shown in the editable text area: a sample conversation
    # followed by the extraction tasks and the requested JSON output format.
    conversation_prompt = """   User: Hi there, I’m off between August 25 and September 11. I saved up 4000 for a nice trip. If I flew out from San Francisco, what are your suggestions for where I can go?
        Agent: For that budget you could travel to cities in the US, Mexico, Brazil, Italy or Japan. Any preferences?
        User: Excellent, I’ve always wanted to see Japan. What kind of hotel can I expect?
        Agent: Great, let me check what I have. First, can I just confirm with you that this is a trip for one adult?
        User: Yes it is
        Agent: Great, thank you, In that case I can offer you 15 days at HOTEL Sugoi, a 3 star hotel close to a Palace. You would be staying there between August 25th and September 7th. They offer free wifi and have an excellent guest rating of 8.49/10. The entire package costs 2024.25USD. Should I book this for you?
        User: That sounds really good actually. Lets say I have a date I wanted to bring…would Japan be out of my price range then?
        Agent: Yes, unfortunately the packages I have for two in Japan do not fit in your budget. However I can offer you a 13 day beach getaway at the 3 star Rose Sierra Hotel in Santo Domingo. Would something like that interest you?
        User: How are the guest ratings for that place?
        Agent: 7.06/10, so guests seem to be quite satisfied with the place.
        User: TRUE. You know what, I’m not sure that I’m ready to ask her to travel with me yet anyway. Just book me for Sugoi
        Agent:I can do that for you! 
        User:Thanks!
        Agent: Can I help you with some other booking today?
        User:No, thanks!


        Execute these tasks:
        -	Summarize the conversation, key: summary
        -      Customer budget none if not detected, key: budget
        -      Departure city, key: departure
        -      Destination city, key: destination
        -      Selected country, key: country
        -      Which hotel the customer choose?, key: hotel
        -	Did the agent remind the customer about the evaluation survey? , key:evaluation true or false as bool
        -	Did the customer mention a product competitor?, key: competitor true or false as bool
        -	Did the customer ask for a discount?, key:discount true or false as bool
        - Agent asked for additional customer needs. key: additional_requests
        - Was the customer happy with the resolution? key: satisfied

        Answer in JSON machine-readable format, using the keys from above.
        Format the ouput as JSON object called "results". Pretty print the JSON and make sure that is properly closed at the end."""

    # displaying a box for a custom prompt; its current content is copied into
    # the session state so get_custom_prompt() can read it from the callback
    st.session_state['customtext'] = st.text_area(label="Prompt",value=conversation_prompt, height=400)
    st.button(label="Execute tasks", on_click=customcompletion)
    # displaying the completion stored by customcompletion(), or an empty box
    # when no completion has been stored in this session yet
    result = ""
    if 'conv_result' in st.session_state:
        result = st.session_state['conv_result']
    st.text_area(label="OpenAI result", value=result, height=200)

except Exception as e:
    # Show the full traceback in the page.
    st.error(traceback.format_exc())
78 | 


--------------------------------------------------------------------------------
/code/pages/12_Utils - Prompt Exploration.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import os
 3 | import traceback
 4 | from utilities.helper import LLMHelper
 5 | 
 6 | def get_prompt():
 7 |     return f"{st.session_state['doc_text']}\n{st.session_state['input_prompt']}"
 8 |    
 9 | def customcompletion():
10 |     response = llm_helper.get_completion(get_prompt())
11 |     st.session_state['prompt_result']= response.encode().decode()
12 | 
def process_all(data):
    """Run the current prompt against every selected document and export the results.

    Previously stored prompt results matching 'prompt*' are deleted first. Each
    row of ``data`` whose ``filename`` is among the selected documents is then
    completed with the 'input_prompt' session value, and the reply is stored in
    the vector store. Finally all stored prompt results are written, as CSV
    text, to the 'data_processed' session value.

    Args:
        data: table of documents with ``filename``, ``content`` and ``key``
            columns (a pandas DataFrame, judging by the calls made on it).
    """
    llm_helper.vector_store.delete_prompt_results('prompt*')
    wanted_files = st.session_state['selected_docs']
    selected_rows = data[data.filename.isin(wanted_files)]
    for record in selected_rows.to_dict('records'):
        full_prompt = "{}\n{}\n\n".format(record['content'], st.session_state['input_prompt'])
        completion = llm_helper.get_completion(full_prompt)
        llm_helper.vector_store.add_prompt_result(
            record['key'],
            completion.encode().decode(),
            record['filename'],
            st.session_state['input_prompt'],
        )
    all_results = llm_helper.vector_store.get_prompt_results()
    st.session_state['data_processed'] = all_results.to_csv(index=False)
21 | 
# Page body: Streamlit re-runs this script on every interaction; any failure is
# shown on the page as a traceback.
try:
    # Wide layout plus a customised "About" menu entry.
    menu_items = {
        'Get help': None,
        'Report a bug': None,
        'About': "\n\t ## Embeddings App\n\t Embedding testing application.\n\t",
    }
    st.set_page_config(layout="wide", menu_items=menu_items)

    # First run of the session: there are no processed results to download yet.
    if 'data_processed' not in st.session_state:
        st.session_state['data_processed'] = None

    # Module-level helper; the button callbacks read it when they fire.
    llm_helper = LLMHelper()

    # Fetch the stored documents (up to 1000) through the helper.
    data = llm_helper.get_all_documents(k=1000)

    if len(data) != 0:
        st.dataframe(data, use_container_width=True)

        # Free-form document / prompt boxes and the single-shot completion button.
        st.text_area(label="Document", height=400, key="doc_text")
        st.text_area(label="Prompt", height=100, key="input_prompt")
        st.button(label="Execute tasks", on_click=customcompletion)
        # The result box only appears once a completion has been stored.
        if 'prompt_result' in st.session_state:
            st.text_area(label="Result", value=st.session_state['prompt_result'], height=400)

        # Batch section: the first of the four columns is left empty.
        _, select_col, run_col, download_col = st.columns([1,1,1,2])
        with select_col:
            st.multiselect("Select documents", sorted(set(data.filename.tolist())), key="selected_docs")
        with run_col:
            st.text("-")
            st.button("Execute task on docs", on_click=process_all, args=(data,))
        with download_col:
            st.text("-")
            processed_csv = st.session_state['data_processed']
            nothing_to_download = processed_csv is None
            st.download_button(
                label="Download results",
                data="" if nothing_to_download else processed_csv,
                file_name="results.csv",
                mime="text/csv",
                disabled=nothing_to_download,
            )
    else:
        st.warning("No embeddings found. Go to the 'Add Document' tab to insert your docs.")

except Exception:
    st.error(traceback.format_exc())
70 | 


--------------------------------------------------------------------------------
/code/requirements.txt:
--------------------------------------------------------------------------------
 1 | azure-functions
 2 | 
 3 | streamlit==1.20.0
 4 | openai==0.27.2
 5 | matplotlib==3.6.3
 6 | plotly==5.12.0
 7 | scipy==1.10.0
 8 | scikit-learn==1.2.0
 9 | transformers==4.25.1
10 | redis==4.5.4
11 | python-dotenv==1.0.0
12 | azure-ai-formrecognizer==3.2.0
13 | azure-storage-blob==12.14.1
14 | requests==2.28.2
15 | tiktoken==0.2.0
16 | azure-storage-queue==12.5.0
17 | langchain==0.0.136
18 | beautifulsoup4==4.12.0
19 | streamlit-chat==0.0.2.2
20 | fake-useragent==1.1.3
21 | chardet==5.1.0
22 | pgvector==0.2.4
23 | psycopg2-binary==2.9.9
24 | --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
25 | azure-search-documents==11.4.0a20230509004


--------------------------------------------------------------------------------
/code/utilities/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-open-ai-embeddings-qna/96ce23acbe613a8031a5f7e5b02cc87d17a512c4/code/utilities/__init__.py


--------------------------------------------------------------------------------
/code/utilities/azureblobstorage.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from datetime import datetime, timedelta
 3 | from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient, generate_blob_sas, generate_container_sas, ContentSettings
 4 | from dotenv import load_dotenv
 5 | 
 6 | class AzureBlobStorageClient:
 7 |     def __init__(self, account_name: str = None, account_key: str = None, container_name: str = None):
 8 | 
 9 |         load_dotenv()
10 | 
11 |         self.azure_cloud : str = os.getenv('AZURE_CLOUD', 'AzureCloud')
12 |         self.blob_endpoint_suffix : str = 'core.chinacloudapi.cn' if self.azure_cloud == 'AzureChinaCloud' else 'core.windows.net'
13 |         self.account_name : str = account_name if account_name else os.getenv('BLOB_ACCOUNT_NAME')
14 |         self.account_key : str = account_key if account_key else os.getenv('BLOB_ACCOUNT_KEY')
15 |         self.connect_str : str = f"DefaultEndpointsProtocol=https;AccountName={self.account_name};AccountKey={self.account_key};EndpointSuffix={self.blob_endpoint_suffix}"
16 |         self.container_name : str = container_name if container_name else os.getenv('BLOB_CONTAINER_NAME')
17 |         self.blob_service_client : BlobServiceClient = BlobServiceClient.from_connection_string(self.connect_str)
18 | 
19 |     def delete_file(self, file_name):
20 |         blob_client = self.blob_service_client.get_blob_client(container=self.container_name, blob=file_name)
21 |         blob_client.delete_blob()
22 | 
23 |     def upload_file(self, bytes_data, file_name, content_type='application/pdf'):
24 |         # Create a blob client using the local file name as the name for the blob
25 |         blob_client = self.blob_service_client.get_blob_client(container=self.container_name, blob=file_name)
26 |         # Upload the created file
27 |         blob_client.upload_blob(bytes_data, overwrite=True, content_settings=ContentSettings(content_type=content_type))
28 |         # Generate a SAS URL to the blob and return it
29 |         return blob_client.url + '?' + generate_blob_sas(self.account_name, self.container_name, file_name,account_key=self.account_key,  permission="r", expiry=datetime.utcnow() + timedelta(hours=3))
30 | 
31 |     def get_all_files(self):
32 |         # Get all files in the container from Azure Blob Storage
33 |         container_client = self.blob_service_client.get_container_client(self.container_name)
34 |         blob_list = container_client.list_blobs(include='metadata')
35 |         # sas = generate_blob_sas(account_name, container_name, blob.name,account_key=account_key,  permission="r", expiry=datetime.utcnow() + timedelta(hours=3))
36 |         sas = generate_container_sas(self.account_name, self.container_name,account_key=self.account_key,  permission="r", expiry=datetime.utcnow() + timedelta(hours=3))
37 |         files = []
38 |         converted_files = {}
39 |         for blob in blob_list:
40 |             if not blob.name.startswith('converted/'):
41 |                 files.append({
42 |                     "filename" : blob.name,
43 |                     "converted": blob.metadata.get('converted', 'false') == 'true' if blob.metadata else False,
44 |                     "embeddings_added": blob.metadata.get('embeddings_added', 'false') == 'true' if blob.metadata else False,
45 |                     "fullpath": f"https://{self.account_name}.blob.{self.blob_endpoint_suffix}/{self.container_name}/{blob.name}?{sas}",
46 |                     "converted_filename": blob.metadata.get('converted_filename', '') if blob.metadata else '',
47 |                     "converted_path": ""
48 |                     })
49 |             else:
50 |                 converted_files[blob.name] = f"https://{self.account_name}.blob.{self.blob_endpoint_suffix}/{self.container_name}/{blob.name}?{sas}"
51 | 
52 |         for file in files:
53 |             converted_filename = file.pop('converted_filename', '')
54 |             if converted_filename in converted_files:
55 |                 file['converted'] = True
56 |                 file['converted_path'] = converted_files[converted_filename]
57 |         
58 |         return files
59 | 
60 |     def upsert_blob_metadata(self, file_name, metadata):
61 |         blob_client = BlobServiceClient.from_connection_string(self.connect_str).get_blob_client(container=self.container_name, blob=file_name)
62 |         # Read metadata from the blob
63 |         blob_metadata = blob_client.get_blob_properties().metadata
64 |         # Update metadata
65 |         blob_metadata.update(metadata)
66 |         # Add metadata to the blob
67 |         blob_client.set_blob_metadata(metadata= blob_metadata)
68 | 
69 |     def get_container_sas(self):
70 |         # Generate a SAS URL to the container and return it
71 |         return "?" + generate_container_sas(account_name= self.account_name, container_name= self.container_name,account_key=self.account_key,  permission="r", expiry=datetime.utcnow() + timedelta(hours=1))
72 | 
73 |     def get_blob_sas(self, file_name):
74 |         # Generate a SAS URL to the blob and return it
75 |         return f"https://{self.account_name}.blob.{self.blob_endpoint_suffix}/{self.container_name}/{file_name}" + "?" + generate_blob_sas(account_name= self.account_name, container_name=self.container_name, blob_name= file_name, account_key= self.account_key, permission='r', expiry=datetime.utcnow() + timedelta(hours=1))
76 | 


--------------------------------------------------------------------------------
/code/utilities/azuresearch.py:
--------------------------------------------------------------------------------
  1 | """Wrapper around Azure Cognitive Search."""
  2 | from __future__ import annotations
  3 | 
  4 | import json
  5 | import logging
  6 | import uuid
  7 | from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Type
  8 | from pydantic import BaseModel, root_validator
  9 | import os
 10 | 
 11 | import numpy as np
 12 | from azure.core.exceptions import ResourceNotFoundError
 13 | from azure.core.credentials import AzureKeyCredential
 14 | from azure.search.documents import SearchClient
 15 | from azure.search.documents.indexes import SearchIndexClient
 16 | from azure.search.documents.models import Vector
 17 | from azure.search.documents.indexes.models import (
 18 |     SearchIndex,
 19 |     SearchField,
 20 |     SearchFieldDataType,
 21 |     SimpleField,
 22 |     SearchableField,
 23 |     SearchIndex,
 24 |     SemanticConfiguration,
 25 |     PrioritizedFields,
 26 |     SemanticField,
 27 |     SearchField,
 28 |     SemanticSettings,
 29 |     VectorSearch,
 30 |     VectorSearchAlgorithmConfiguration,
 31 | )
 32 | 
 33 | from langchain.docstore.document import Document
 34 | from langchain.embeddings.base import Embeddings
 35 | from langchain.schema import BaseRetriever
 36 | from langchain.utils import get_from_dict_or_env
 37 | from langchain.vectorstores.base import VectorStore
 38 | 
 39 | logger = logging.getLogger()
 40 | 
 41 | AZURESEARCH_DIMENSIONS = int(os.environ.get("AZURESEARCH_DIMENSIONS", 1536)) # Default to OpenAI's ada-002 embedding model vector size
 42 | 
 43 | # Allow overriding field names for Azure Search
 44 | FIELDS_ID = os.environ.get("AZURESEARCH_FIELDS_ID", "id")
 45 | FIELDS_TITLE = os.environ.get("AZURESEARCH_FIELDS_TITLE", "title")
 46 | FIELDS_CONTENT = os.environ.get("AZURESEARCH_FIELDS_CONTENT", "content")
 47 | FIELDS_CONTENT_VECTOR = os.environ.get(
 48 |     "AZURESEARCH_FIELDS_CONTENT_VECTOR", "content_vector")
 49 | FIELDS_TAG = os.environ.get("AZURESEARCH_FIELDS_TAG", "tag")
 50 | FIELDS_METADATA = os.environ.get("AZURESEARCH_FIELDS_TAG", "metadata")
 51 | 
 52 | MAX_UPLOAD_BATCH_SIZE = 1000
 53 | MAX_DELETE_BATCH_SIZE = 1000
 54 | 
 55 | def get_search_client(endpoint: str, key: str, index_name: str, semantic_configuration_name:str = None) -> SearchClient:
 56 |     if key is None:
 57 |         credential = DefaultAzureCredential()
 58 |     else:
 59 |         credential = AzureKeyCredential(key)
 60 |     index_client: SearchIndexClient = SearchIndexClient(
 61 |         endpoint=endpoint, credential=credential)
 62 |     try:
 63 |         index_client.get_index(name=index_name)
 64 |     except ResourceNotFoundError as ex:
 65 |         # Fields configuration
 66 |         fields = [
 67 |             SimpleField(name=FIELDS_ID, type=SearchFieldDataType.String,
 68 |                         key=True, filterable=True),
 69 |             SearchableField(name=FIELDS_TITLE, type=SearchFieldDataType.String,
 70 |                             searchable=True, retrievable=True),
 71 |             SearchableField(name=FIELDS_CONTENT, type=SearchFieldDataType.String,
 72 |                             searchable=True, retrievable=True),
 73 |             SearchField(name=FIELDS_CONTENT_VECTOR, type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
 74 |                         searchable=True, dimensions=AZURESEARCH_DIMENSIONS, vector_search_configuration="default"),
 75 |             SearchableField(name=FIELDS_TAG, type=SearchFieldDataType.String,
 76 |                             filterable=True, searchable=True, retrievable=True),
 77 |             SearchableField(name=FIELDS_METADATA, type=SearchFieldDataType.String,
 78 |                             searchable=True, retrievable=True)
 79 |         ]
 80 |         # Vector search configuration
 81 |         vector_search = VectorSearch(
 82 |             algorithm_configurations=[
 83 |                 VectorSearchAlgorithmConfiguration(
 84 |                     name="default",
 85 |                     kind="hnsw",
 86 |                     hnsw_parameters={
 87 |                         "m": 4,
 88 |                         "efConstruction": 400,
 89 |                         "efSearch": 500,
 90 |                         "metric": "cosine"
 91 |                     }
 92 |                 )
 93 |             ]
 94 |         )
 95 |         # Create the semantic settings with the configuration
 96 |         semantic_settings = None if semantic_configuration_name is None else SemanticSettings(
 97 |             configurations=[SemanticConfiguration(
 98 |                 name=semantic_configuration_name,
 99 |                 prioritized_fields=PrioritizedFields(
100 |                     title_field=SemanticField(field_name=FIELDS_TITLE),
101 |                     prioritized_keywords_fields=[
102 |                         SemanticField(field_name=FIELDS_TAG)],
103 |                     prioritized_content_fields=[
104 |                         SemanticField(field_name=FIELDS_CONTENT)]
105 |                 )
106 |             )
107 |             ]
108 |         )
109 |         # Create the search index with the semantic settings and vector search
110 |         index = SearchIndex(name=index_name, fields=fields,
111 |                             vector_search=vector_search, semantic_settings=semantic_settings)
112 |         index_client.create_index(index)
113 |     # Create the search client
114 |     return SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(key))
115 | 
116 | 
class AzureSearch(VectorStore):
    """LangChain ``VectorStore`` backed by an Azure Cognitive Search index.

    Texts are uploaded together with their embedding vector and can be queried
    with a vector search, a hybrid (text + vector) search or a semantic hybrid
    search.
    """

    def __init__(
        self,
        azure_cognitive_search_name: str,
        azure_cognitive_search_key: str,
        index_name: str,
        embedding_function: Callable,
        semantic_configuration_name: str = None,
        semantic_query_language: str = "en-us",
        **kwargs: Any,
    ):
        """Initialize with necessary components.

        Args:
            azure_cognitive_search_name: passed to ``get_search_client`` and
                ``SearchIndexClient`` as the service endpoint.
            azure_cognitive_search_key: service key, or ``None`` to use
                ``DefaultAzureCredential``.
            index_name: index to use; ``get_search_client`` creates it when missing.
            embedding_function: callable that turns a text into its embedding vector.
            semantic_configuration_name: semantic configuration used by the
                semantic hybrid search.
            semantic_query_language: query language sent with semantic queries.

        Raises:
            ValueError: the ``azure-search-documents`` package cannot be imported.
        """
        try:
            from azure.search.documents import SearchClient  # noqa: F401
        except ImportError:
            raise ValueError(
                "Could not import azure-search-documents python package. "
                "Please install it with `pip install --index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/ azure-search-documents==11.4.0a20230509004`"
            )
        # Initialize base class
        self.embedding_function = embedding_function
        self.azure_cognitive_search_name = azure_cognitive_search_name
        self.azure_cognitive_search_key = azure_cognitive_search_key
        self.index_name = index_name
        self.semantic_configuration_name = semantic_configuration_name
        self.semantic_query_language = semantic_query_language
        self.client = get_search_client(
            self.azure_cognitive_search_name, self.azure_cognitive_search_key, self.index_name, self.semantic_configuration_name)

    @staticmethod
    def _title_from_metadata(metadata: dict) -> Any:
        """Return the document title: the explicit title if present, else one derived from ``source``.

        A source such as ``[name](url)`` yields ``name``; a source without
        square brackets is used as-is instead of raising ``IndexError``.
        """
        if FIELDS_TITLE in metadata:
            return metadata[FIELDS_TITLE]
        source = str(metadata.get("source") or "")
        if '[' in source:
            return source.split('[')[1].split(']')[0]
        return source

    def _upload_batch(self, documents: List[dict]) -> None:
        """Upload one batch of documents and raise if any of them was rejected."""
        response = self.client.upload_documents(documents=documents)
        if not all([r.succeeded for r in response]):
            raise Exception(response)

    def _query_vector(self, query: str, k: int) -> Vector:
        """Embed ``query`` and wrap it as a vector query on the content-vector field."""
        embedding = np.array(self.embedding_function(query), dtype=np.float32).tolist()
        return Vector(value=embedding, k=k, fields=FIELDS_CONTENT_VECTOR)

    @staticmethod
    def _to_scored_documents(results) -> List[Tuple[Document, float]]:
        """Convert search results to ``(Document, 1 - @search.score)`` tuples."""
        return [
            (
                Document(
                    page_content=result[FIELDS_CONTENT],
                    metadata=json.loads(result[FIELDS_METADATA])
                ),
                1 - float(result['@search.score']),
            )
            for result in results
        ]

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add texts data to an existing index.

        Args:
            texts: texts to embed and upload.
            metadatas: optional metadata dicts, one per text.
            **kwargs: ``keys`` may hold one document key per text; ``:`` is
                replaced by ``_`` in each key. Without it, random UUIDs are used.

        Returns:
            The keys of the uploaded documents, in input order.

        Raises:
            Exception: at least one document of a batch was not uploaded; the
                service response is attached.
        """
        keys = kwargs.get("keys")
        keys = [key.replace(':', '_') for key in keys] if keys else None
        ids = []
        data = []
        for i, text in enumerate(texts):
            # Use provided key otherwise use default key
            key = keys[i] if keys else str(uuid.uuid4())
            metadata = metadatas[i] if metadatas else {}
            data.append({
                "@search.action": "upload",
                FIELDS_ID: key,
                FIELDS_TITLE : self._title_from_metadata(metadata),
                FIELDS_TAG: metadata.get(FIELDS_TAG, ""),
                FIELDS_CONTENT: text,
                FIELDS_CONTENT_VECTOR: np.array(
                    self.embedding_function(text), dtype=np.float32
                ).tolist(),
                FIELDS_METADATA: json.dumps(metadata)
            })
            ids.append(key)
            # Upload data in batches
            if len(data) == MAX_UPLOAD_BATCH_SIZE:
                self._upload_batch(data)
                data = []
        # Upload the remainder; skipped when nothing is left, so no empty
        # request is sent for empty input or an exact multiple of the batch size.
        if data:
            self._upload_batch(data)
        return ids

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """
        Returns the most similar indexed documents to the query text.

        Args:
            query (str): The query text for which to find similar documents.
            k (int): The number of documents to return. Default is 4.
            **kwargs: ``filters`` is forwarded as the search filter expression.

        Returns:
            List[Document]: A list of documents that are most similar to the query text.
        """
        docs_and_scores = self.similarity_search_with_score(
            query, k=k, filters=kwargs.get("filters", None))
        return [doc for doc, _ in docs_and_scores]

    def similarity_search_with_score(
        self, query: str, k: int = 4, filters: str = None
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filters: optional filter expression passed to the search call.

        Returns:
            List of Documents most similar to the query, each paired with
            ``1 - @search.score``.
        """
        results = self.client.search(
            search_text="",
            vector=self._query_vector(query, k),
            select=[f"{FIELDS_TITLE},{FIELDS_CONTENT},{FIELDS_METADATA}"],
            filter=filters
        )
        return self._to_scored_documents(results)

    def hybrid_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """
        Returns the documents that best match the query in a hybrid (text + vector) search.

        Args:
            query (str): The query text for which to find similar documents.
            k (int): The number of documents to return. Default is 4.
            **kwargs: ``filters`` is forwarded as the search filter expression.

        Returns:
            List[Document]: A list of documents that are most similar to the query text.
        """
        docs_and_scores = self.hybrid_search_with_score(
            query, k=k, filters=kwargs.get("filters", None))
        return [doc for doc, _ in docs_and_scores]

    def hybrid_search_with_score(
        self, query: str, k: int = 4, filters: str = None
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query with an hybrid query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filters: optional filter expression passed to the search call.

        Returns:
            List of Documents most similar to the query, each paired with
            ``1 - @search.score``.
        """
        results = self.client.search(
            search_text=query,
            vector=self._query_vector(query, k),
            select=[f"{FIELDS_TITLE},{FIELDS_CONTENT},{FIELDS_METADATA}"],
            filter=filters,
            top=k
        )
        return self._to_scored_documents(results)

    def semantic_hybrid_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """
        Returns the documents that best match the query in a semantic hybrid search.

        Args:
            query (str): The query text for which to find similar documents.
            k (int): The number of documents to return. Default is 4.
            **kwargs: ``filters`` is forwarded as the search filter expression.

        Returns:
            List[Document]: A list of documents that are most similar to the query text.
        """
        docs_and_scores = self.semantic_hybrid_search_with_score(
            query, k=k, filters=kwargs.get('filters', None))
        return [doc for doc, _ in docs_and_scores]

    def semantic_hybrid_search_with_score(
        self, query: str, k: int = 4, filters: str = None
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query with a semantic hybrid query.

        Each returned document's metadata is extended with ``captions`` (text
        and highlights of the first caption, or ``{}``) and ``answers`` (the
        semantic answer whose key equals the metadata ``key``, or ``''``).

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filters: optional filter expression passed to the search call.

        Returns:
            List of Documents most similar to the query, each paired with
            ``1 - @search.score``.
        """
        results = self.client.search(
            search_text=query,
            vector=self._query_vector(query, k),
            select=[f"{FIELDS_TITLE},{FIELDS_CONTENT},{FIELDS_METADATA}"],
            filter=filters,
            query_type="semantic",
            query_language=self.semantic_query_language,
            semantic_configuration_name=self.semantic_configuration_name,
            query_caption="extractive",
            query_answer="extractive",
            top=k
        )
        # Get Semantic Answers; the service may return none at all.
        semantic_answers = results.get_answers() or []
        semantic_answers_dict = {
            semantic_answer.key: {
                "text": semantic_answer.text,
                "highlights": semantic_answer.highlights
            }
            for semantic_answer in semantic_answers
        }
        # Convert results to Document objects. The configurable field names are
        # used so that overridden names match the `select` above.
        docs = []
        for result in results:
            metadata = json.loads(result[FIELDS_METADATA])
            answer = semantic_answers_dict.get(metadata.get('key'), '')
            captions = result.get('@search.captions')
            metadata['captions'] = {
                'text': captions[0].text,
                'highlights': captions[0].highlights
            } if captions else {}
            metadata['answers'] = answer
            docs.append(
                (
                    Document(page_content=result[FIELDS_CONTENT], metadata=metadata),
                    1 - float(result['@search.score']),
                )
            )
        return docs

    @classmethod
    def from_texts(
        cls: Type[AzureSearch],
        texts: List[str],
        embedding: Embeddings,
        azure_cognitive_search_name: str,
        azure_cognitive_search_key: str,
        metadatas: Optional[List[dict]] = None,
        index_name: Optional[str] = None,
        **kwargs: Any,
    ) -> AzureSearch:
        """Create a store and upload ``texts`` to it.

        A random index name is generated when ``index_name`` is not given;
        ``embedding.embed_query`` is used as the embedding function and
        ``kwargs`` are forwarded to ``add_texts``.
        """
        # Name of the search index if not given
        if not index_name:
            index_name = uuid.uuid4().hex
        # Creating a new Azure Search instance
        azure_search = cls(azure_cognitive_search_name,
                           azure_cognitive_search_key, index_name, embedding.embed_query)
        azure_search.add_texts(texts, metadatas, **kwargs)
        return azure_search

    def index_exists(self):
        """Fetch this store's index from the service.

        Returns:
            The object returned by ``SearchIndexClient.get_index``. A missing
            index is not translated into ``False``; whatever ``get_index``
            raises propagates to the caller.

        Raises:
            ImportError: no key is configured and ``azure-identity`` is not installed.
        """
        if self.azure_cognitive_search_key is None:
            # Imported lazily: azure-identity is only needed for key-less access.
            try:
                from azure.identity import DefaultAzureCredential
            except ImportError as ex:
                raise ImportError(
                    "No Azure Cognitive Search key was provided and the azure-identity "
                    "package is not installed. Provide a key or run `pip install azure-identity`."
                ) from ex
            credential = DefaultAzureCredential()
        else:
            credential = AzureKeyCredential(self.azure_cognitive_search_key)
        index_client: SearchIndexClient = SearchIndexClient(
            endpoint=self.azure_cognitive_search_name, credential=credential)
        return index_client.get_index(name=self.index_name)


    def delete_keys(self, keys: List[str]):
        """Delete the documents with the given keys.

        ``:`` is replaced by ``_`` in each key, mirroring ``add_texts``. Keys
        are sent in batches of at most ``MAX_DELETE_BATCH_SIZE`` documents.

        Returns:
            The indexing results of all batches; an empty list when ``keys`` is
            empty or ``None``.
        """
        if not keys:
            return []
        keys = [key.replace(':', '_') for key in keys]
        results = []
        for start in range(0, len(keys), MAX_DELETE_BATCH_SIZE):
            documents = [
                {
                    "@search.action": "delete",
                    FIELDS_ID: key
                }
                for key in keys[start:start + MAX_DELETE_BATCH_SIZE]
            ]
            results.extend(self.client.delete_documents(documents=documents))
        return results
404 | 
405 | 
406 | 
class AzureSearchVectorStoreRetriever(BaseRetriever, BaseModel):
    """Retriever that answers queries through an ``AzureSearch`` vector store."""

    # Store that executes the searches.
    vectorstore: AzureSearch
    # One of "similarity", "hybrid" or "semantic_hybrid".
    search_type: str = "similarity"
    # Number of documents requested per query.
    k: int = 4
    # Not read by any method of this class.
    score_threshold: float = 0.4

    class Config:
        """Configuration for this pydantic object."""
        arbitrary_types_allowed = True

    # Registered as a pydantic validator so that an unsupported search type is
    # rejected when the retriever is built, not only at query time.
    @root_validator()
    def validate_search_type(cls, values: Dict) -> Dict:
        """Validate search type."""
        if "search_type" in values:
            search_type = values["search_type"]
            if search_type not in ("similarity", "hybrid", "semantic_hybrid"):
                raise ValueError(f"search_type of {search_type} not allowed.")
        return values

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Return the ``k`` documents found for ``query`` with the configured search type.

        Raises:
            ValueError: ``search_type`` is not one of the supported values.
        """
        if self.search_type == "similarity":
            docs = self.vectorstore.similarity_search(query, k=self.k)
        elif self.search_type == "hybrid":
            docs = self.vectorstore.hybrid_search(query, k=self.k)
        elif self.search_type == "semantic_hybrid":
            docs = self.vectorstore.semantic_hybrid_search(query, k=self.k)
        else:
            raise ValueError(f"search_type of {self.search_type} not allowed.")
        return docs

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Async retrieval is not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError(
            "AzureSearchVectorStoreRetriever does not support async")


--------------------------------------------------------------------------------
/code/utilities/customprompt.py:
--------------------------------------------------------------------------------
 1 | # flake8: noqa
 2 | from langchain.prompts import PromptTemplate
 3 | 
# Prompt text for question answering over retrieved sources.
# Placeholders: {summaries} (the source texts placed at the top of the prompt)
# and {question} (the user's question). The instructions ask the model to cite
# sources as [[filename]] and to emit follow-up questions as <<question>>.
template = """{summaries}

Please reply to the question using only the information present in the text above.
If you can't find it, reply politely that the information is not in the knowledge base.
Detect the language of the question and answer in the same language. 
If asked for enumerations list all of them and do not invent any.
Each source has a name followed by a colon and the actual information, always include the source name for each fact you use in the response. Always use double square brackets to reference the filename source, e.g. [[info1.pdf.txt]]. Don't combine sources, list each source separately, e.g. [[info1.pdf]][[info2.txt]].
After answering the question generate three very brief follow-up questions that the user would likely ask next.
Only use double angle brackets to reference the questions, e.g. <<Are there exclusions for prescriptions?>>.
Only generate questions and do not generate any text before or after the questions, such as 'Follow-up Questions:'.
Try not to repeat questions that have already been asked.

Question: {question}
Answer:"""
18 | 
# Question-answering prompt built from `template`; expects `summaries` and `question`.
PROMPT = PromptTemplate(template=template, input_variables=["summaries", "question"])

# Per-document prompt: renders one document as its content followed by its source.
EXAMPLE_PROMPT = PromptTemplate(
    template="Content: {page_content}\nSource: {source}",
    input_variables=["page_content", "source"],
)
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/code/utilities/formrecognizer.py:
--------------------------------------------------------------------------------
 1 | from azure.core.credentials import AzureKeyCredential
 2 | from azure.ai.formrecognizer import DocumentAnalysisClient
 3 | import os
 4 | from dotenv import load_dotenv
 5 | 
 6 | class AzureFormRecognizerClient:
 7 |     def __init__(self, form_recognizer_endpoint: str = None, form_recognizer_key: str = None):
 8 | 
 9 |         load_dotenv()
10 | 
11 |         self.pages_per_embeddings = int(os.getenv('PAGES_PER_EMBEDDINGS', 2))
12 |         self.section_to_exclude = ['footnote', 'pageHeader', 'pageFooter', 'pageNumber']
13 | 
14 |         self.form_recognizer_endpoint : str = form_recognizer_endpoint if form_recognizer_endpoint else os.getenv('FORM_RECOGNIZER_ENDPOINT')
15 |         self.form_recognizer_key : str = form_recognizer_key if form_recognizer_key else os.getenv('FORM_RECOGNIZER_KEY')
16 | 
17 |     def analyze_read(self, formUrl):
18 | 
19 |         document_analysis_client = DocumentAnalysisClient(
20 |             endpoint=self.form_recognizer_endpoint, credential=AzureKeyCredential(self.form_recognizer_key)
21 |         )
22 |         
23 |         poller = document_analysis_client.begin_analyze_document_from_url(
24 |                 "prebuilt-layout", formUrl)
25 |         layout = poller.result()
26 | 
27 |         results = []
28 |         page_result = ''
29 |         for p in layout.paragraphs:
30 |             page_number = p.bounding_regions[0].page_number
31 |             output_file_id = int((page_number - 1 ) / self.pages_per_embeddings)
32 | 
33 |             if len(results) < output_file_id + 1:
34 |                 results.append('')
35 | 
36 |             if p.role not in self.section_to_exclude:
37 |                 results[output_file_id] += f"{p.content}\n"
38 | 
39 |         for t in layout.tables:
40 |             page_number = t.bounding_regions[0].page_number
41 |             output_file_id = int((page_number - 1 ) / self.pages_per_embeddings)
42 |             
43 |             if len(results) < output_file_id + 1:
44 |                 results.append('')
45 |             previous_cell_row=0
46 |             rowcontent='| '
47 |             tablecontent = ''
48 |             for c in t.cells:
49 |                 if c.row_index == previous_cell_row:
50 |                     rowcontent +=  c.content + " | "
51 |                 else:
52 |                     tablecontent += rowcontent + "\n"
53 |                     rowcontent='|'
54 |                     rowcontent += c.content + " | "
55 |                     previous_cell_row += 1
56 |             results[output_file_id] += f"{tablecontent}|"
57 |         return results
58 | 


--------------------------------------------------------------------------------
/code/utilities/pgvector.py:
--------------------------------------------------------------------------------
  1 | import enum
  2 | import logging
  3 | import uuid
  4 | from typing import Any, Dict, Iterable, List, Optional, Tuple
  5 | 
  6 | import sqlalchemy
  7 | from sqlalchemy import delete
  8 | from pgvector.sqlalchemy import Vector
  9 | from sqlalchemy.dialects.postgresql import JSON, UUID
 10 | from sqlalchemy.orm import Mapped, Session, declarative_base, relationship
 11 | 
 12 | from langchain.docstore.document import Document
 13 | from langchain.embeddings.base import Embeddings
 14 | from langchain.utils import get_from_dict_or_env
 15 | from langchain.vectorstores.base import VectorStore
 16 | 
# Declarative base shared by every ORM model in this module.
Base = declarative_base()  # type: Any


# Dimension of the `embedding` Vector column declared on EmbeddingStore.
ADA_TOKEN_COUNT = 1536
# Collection name used when the caller does not supply one.
_LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain"
 22 | 
 23 | 
class BaseModel(Base):
    """Abstract base for the ORM models below; contributes a UUID primary key."""

    # Marked abstract so this class itself is not mapped to a table.
    __abstract__ = True
    # Primary key; a fresh uuid4 is used as the default value.
    uuid = sqlalchemy.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
 27 | 
 28 | 
 29 | class CollectionStore(BaseModel):
 30 |     __tablename__ = "langchain_pg_collection"
 31 | 
 32 |     name = sqlalchemy.Column(sqlalchemy.String)
 33 |     cmetadata = sqlalchemy.Column(JSON)
 34 | 
 35 |     embeddings = relationship(
 36 |         "EmbeddingStore",
 37 |         back_populates="collection",
 38 |         passive_deletes=True,
 39 |     )
 40 | 
 41 |     @classmethod
 42 |     def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]:
 43 |         return session.query(cls).filter(cls.name == name).first()
 44 | 
 45 |     @classmethod
 46 |     def get_or_create(
 47 |         cls,
 48 |         session: Session,
 49 |         name: str,
 50 |         cmetadata: Optional[dict] = None,
 51 |     ) -> Tuple["CollectionStore", bool]:
 52 |         """
 53 |         Get or create a collection.
 54 |         Returns [Collection, bool] where the bool is True if the collection was created.
 55 |         """
 56 |         created = False
 57 |         collection = cls.get_by_name(session, name)
 58 |         if collection:
 59 |             return collection, created
 60 | 
 61 |         collection = cls(name=name, cmetadata=cmetadata)
 62 |         session.add(collection)
 63 |         session.commit()
 64 |         created = True
 65 |         return collection, created
 66 | 
 67 | 
class EmbeddingStore(BaseModel):
    """ORM model for one embedded document belonging to a collection."""

    __tablename__ = "langchain_pg_embedding"

    # Foreign key to the owning collection; declared with ON DELETE CASCADE.
    collection_id: Mapped[UUID] = sqlalchemy.Column(
        UUID(as_uuid=True),
        sqlalchemy.ForeignKey(
            f"{CollectionStore.__tablename__}.uuid",
            ondelete="CASCADE",
        ),
    )
    collection = relationship(CollectionStore, back_populates="embeddings")

    # Vector column with ADA_TOKEN_COUNT dimensions.
    embedding: Vector = sqlalchemy.Column(Vector(ADA_TOKEN_COUNT))
    # Text stored alongside the embedding.
    document = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    # JSON metadata stored alongside the embedding.
    cmetadata = sqlalchemy.Column(JSON, nullable=True)

    # custom_id: any user-defined id
    custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True)
 86 | 
 87 | 
class QueryResult:
    """Typing aid describing the attributes read from each similarity-query row."""

    # The matched EmbeddingStore row.
    EmbeddingStore: EmbeddingStore
    # The value of the query's "distance" column for that row.
    distance: float
 91 | 
 92 | 
class DistanceStrategy(str, enum.Enum):
    """Distance functions selectable for similarity queries.

    Each name is bound to a comparator method of the ``embedding`` column;
    ``PGVectorExtended`` calls the selected value with the query vector to
    build the distance expression.
    """

    # NOTE(review): the assigned values are bound methods, so they presumably
    # stay plain callables and do not become enum members -- confirm
    # before changing how these are defined.
    EUCLIDEAN = EmbeddingStore.embedding.l2_distance
    COSINE = EmbeddingStore.embedding.cosine_distance
    MAX_INNER_PRODUCT = EmbeddingStore.embedding.max_inner_product


# Strategy used when the caller does not pass one.
DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.EUCLIDEAN
100 | 
101 | 
102 | class PGVectorExtended(VectorStore):
103 |     """
104 |     VectorStore implementation using Postgres and pgvector.
105 |     - `connection_string` is a postgres connection string.
106 |     - `embedding_function` any embedding function implementing
107 |         `langchain.embeddings.base.Embeddings` interface.
108 |     - `collection_name` is the name of the collection to use. (default: langchain)
109 |         - NOTE: This is not the name of the table, but the name of the collection.
110 |             The tables will be created when initializing the store (if not exists)
111 |             So, make sure the user has the right permissions to create tables.
112 |     - `distance_strategy` is the distance strategy to use. (default: EUCLIDEAN)
113 |         - `EUCLIDEAN` is the euclidean distance.
114 |         - `COSINE` is the cosine distance.
115 |     - `pre_delete_collection` if True, will delete the collection if it exists.
116 |         (default: False)
117 |         - Useful for testing.
118 |     """
119 | 
120 |     def __init__(
121 |         self,
122 |         connection_string: str,
123 |         embedding_function: Embeddings,
124 |         collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
125 |         collection_metadata: Optional[dict] = None,
126 |         distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
127 |         pre_delete_collection: bool = False,
128 |         logger: Optional[logging.Logger]= None,
129 |         engine_args: Optional[dict[str, Any]] = None,
130 |     ) -> None:
131 |         self.connection_string = connection_string
132 |         self.embedding_function = embedding_function
133 |         self.collection_name = collection_name
134 |         self.collection_metadata = collection_metadata
135 |         self.distance_strategy = distance_strategy
136 |         self.pre_delete_collection = pre_delete_collection
137 |         self.logger = logger or logging.getLogger(__name__)
138 |         self.engine_args = engine_args or {}
139 |         self._engine = self.connect()
140 |         # self._conn = self.connect()
141 |         # self.__post_init__()
142 |         self.CollectionStore = CollectionStore
143 |         self.EmbeddingStore = EmbeddingStore
144 |       
145 |     def __post_init__(
146 |         self,
147 |     ) -> None:
148 |         self.create_vector_extension()
149 |         self.create_tables_if_not_exists()
150 |         self.create_collection()
151 | 
152 |     def connect(self) -> sqlalchemy.engine:
153 |         engine = sqlalchemy.create_engine(self.connection_string, **self.engine_args)
154 |         return engine
155 | 
    def create_vector_extension(self) -> None:
        """Ensure the ``vector`` extension exists in the database.

        Raises:
            Exception: wrapping any error raised while executing the statement.
        """
        try:
            with Session(self._engine) as session:
                # The advisory lock fixes an issue arising from concurrent
                # creation of the vector extension.
                # https://github.com/langchain-ai/langchain/issues/12933
                # For more information see:
                # https://www.postgresql.org/docs/16/explicit-locking.html#ADVISORY-LOCKS
                statement = sqlalchemy.text(
                    "BEGIN;"
                    "SELECT pg_advisory_xact_lock(1573678846307946496);"
                    "CREATE EXTENSION IF NOT EXISTS vector;"
                    "COMMIT;"
                )
                session.execute(statement)
                session.commit()
        except Exception as e:
            # Re-raised with context; the original error stays chained as the cause.
            raise Exception(f"Failed to create vector extension: {e}") from e
174 | 
175 |     def create_tables_if_not_exists(self) -> None:
176 |         with self._engine.begin():
177 |             Base.metadata.create_all(self._engine)
178 | 
179 |     def drop_tables(self) -> None:
180 |         with self._engine.begin():
181 |             Base.metadata.drop_all(self._engine)
182 | 
183 |     def create_collection(self) -> None:
184 |         if self.pre_delete_collection:
185 |             self.delete_collection()
186 |         with Session(self._engine) as session:
187 |             CollectionStore.get_or_create(
188 |                 session, self.collection_name, cmetadata=self.collection_metadata
189 |             )
190 | 
191 |     def delete_collection(self) -> None:
192 |         self.logger.debug("Trying to delete collection")
193 |         with Session(self._engine) as session:
194 |             collection = self.get_collection(session)
195 |             if not collection:
196 |                 self.logger.error("Collection not found")
197 |                 return
198 |             session.delete(collection)
199 |             session.commit()
200 | 
201 |     def get_collection(self, session: Session) -> Optional["CollectionStore"]:
202 |         return CollectionStore.get_by_name(session, self.collection_name)
203 | 
204 |     def add_texts(
205 |         self,
206 |         texts: Iterable[str],
207 |         metadatas: Optional[List[dict]] = None,
208 |         ids: Optional[List[str]] = None,
209 |         **kwargs: Any,
210 |     ) -> List[str]:
211 |         """Run more texts through the embeddings and add to the vectorstore.
212 | 
213 |         Args:
214 |             texts: Iterable of strings to add to the vectorstore.
215 |             metadatas: Optional list of metadatas associated with the texts.
216 |             kwargs: vectorstore specific parameters
217 | 
218 |         Returns:
219 |             List of ids from adding the texts into the vectorstore.
220 |         """
221 |         if ids is None:
222 |             ids = [str(uuid.uuid1()) for _ in texts]
223 | 
224 |         embeddings = self.embedding_function.embed_documents(list(texts))
225 | 
226 |         if not metadatas:
227 |             metadatas = [{} for _ in texts]
228 | 
229 |         with Session(self._engine) as session:
230 |             collection = self.get_collection(session)
231 |             if not collection:
232 |                 raise ValueError("Collection not found")
233 |             for text, metadata, embedding, id in zip(texts, metadatas, embeddings, ids):
234 |                 embedding_store = EmbeddingStore(
235 |                     embedding=embedding,
236 |                     document=text,
237 |                     cmetadata=metadata,
238 |                     custom_id=id,
239 |                 )
240 |                 collection.embeddings.append(embedding_store)
241 |                 session.add(embedding_store)
242 |             session.commit()
243 | 
244 |         return ids
245 | 
246 |     def similarity_search(
247 |         self,
248 |         query: str,
249 |         k: int = 4,
250 |         filter: Optional[dict] = None,
251 |         **kwargs: Any,
252 |     ) -> List[Document]:
253 |         """Run similarity search with PGVector with distance.
254 | 
255 |         Args:
256 |             query (str): Query text to search for.
257 |             k (int): Number of results to return. Defaults to 4.
258 |             filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
259 | 
260 |         Returns:
261 |             List of Documents most similar to the query.
262 |         """
263 |         embedding = self.embedding_function.embed_query(text=query)
264 |         return self.similarity_search_by_vector(
265 |             embedding=embedding,
266 |             k=k,
267 |             filter=filter,
268 |         )
269 | 
270 |     def similarity_search_with_score(
271 |         self,
272 |         query: str,
273 |         k: int = 4,
274 |         filter: Optional[dict] = None,
275 |     ) -> List[Tuple[Document, float]]:
276 |         """Return docs most similar to query.
277 | 
278 |         Args:
279 |             query: Text to look up documents similar to.
280 |             k: Number of Documents to return. Defaults to 4.
281 |             filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
282 | 
283 |         Returns:
284 |             List of Documents most similar to the query and score for each
285 |         """
286 |         embedding = self.embedding_function.embed_query(query)
287 |         docs = self.similarity_search_with_score_by_vector(
288 |             embedding=embedding, k=k, filter=filter
289 |         )
290 |         return docs
291 | 
292 |     def similarity_search_with_score_by_vector(
293 |         self,
294 |         embedding: List[float],
295 |         k: int = 4,
296 |         filter: Optional[dict] = None,
297 |     ) -> List[Tuple[Document, float]]:
298 |         with Session(self._engine) as session:
299 |             collection = self.get_collection(session)
300 |             if not collection:
301 |                 raise ValueError("Collection not found")
302 | 
303 |         filter_by = EmbeddingStore.collection_id == collection.uuid
304 | 
305 |         if filter is not None:
306 |             filter_clauses = []
307 |             for key, value in filter.items():
308 |                 filter_by_metadata = EmbeddingStore.cmetadata[key].astext == str(value)
309 |                 filter_clauses.append(filter_by_metadata)
310 | 
311 |             filter_by = sqlalchemy.and_(filter_by, *filter_clauses)
312 | 
313 |         results: List[QueryResult] = (
314 |             session.query(
315 |                 EmbeddingStore,
316 |                 self.distance_strategy(embedding).label("distance"),  # type: ignore
317 |             )
318 |             .filter(filter_by)
319 |             .order_by(sqlalchemy.asc("distance"))
320 |             .join(
321 |                 CollectionStore,
322 |                 EmbeddingStore.collection_id == CollectionStore.uuid,
323 |             )
324 |             .limit(k)
325 |             .all()
326 |         )
327 |         docs = [
328 |             (
329 |                 Document(
330 |                     page_content=result.EmbeddingStore.document,
331 |                     metadata=result.EmbeddingStore.cmetadata,
332 |                 ),
333 |                 result.distance if self.embedding_function is not None else None,
334 |             )
335 |             for result in results
336 |         ]
337 |         return docs
338 | 
339 |     def similarity_search_by_vector(
340 |         self,
341 |         embedding: List[float],
342 |         k: int = 4,
343 |         filter: Optional[dict] = None,
344 |         **kwargs: Any,
345 |     ) -> List[Document]:
346 |         """Return docs most similar to embedding vector.
347 | 
348 |         Args:
349 |             embedding: Embedding to look up documents similar to.
350 |             k: Number of Documents to return. Defaults to 4.
351 |             filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
352 | 
353 |         Returns:
354 |             List of Documents most similar to the query vector.
355 |         """
356 |         docs_and_scores = self.similarity_search_with_score_by_vector(
357 |             embedding=embedding, k=k, filter=filter
358 |         )
359 |         return [doc for doc, _ in docs_and_scores]
360 | 
361 |     @classmethod
362 |     def from_texts(
363 |         cls,
364 |         texts: List[str],
365 |         embedding: Embeddings,
366 |         metadatas: Optional[List[dict]] = None,
367 |         collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
368 |         distance_strategy: DistanceStrategy = DistanceStrategy.COSINE,
369 |         ids: Optional[List[str]] = None,
370 |         pre_delete_collection: bool = False,
371 |         **kwargs: Any,
372 |     ) -> "PGVectorExtended":
373 |         """
374 |         Return VectorStore initialized from texts and embeddings.
375 |         Postgres connection string is required
376 |         "Either pass it as a parameter
377 |         or set the PGVECTOR_CONNECTION_STRING environment variable.
378 |         """
379 | 
380 |         connection_string = cls.get_connection_string(kwargs)
381 | 
382 |         store = cls(
383 |             connection_string=connection_string,
384 |             collection_name=collection_name,
385 |             embedding_function=embedding,
386 |             distance_strategy=distance_strategy,
387 |             pre_delete_collection=pre_delete_collection,
388 |         )
389 | 
390 |         store.add_texts(texts=texts, metadatas=metadatas, ids=ids, **kwargs)
391 |         return store
392 | 
393 |     @classmethod
394 |     def get_connection_string(cls, kwargs: Dict[str, Any]) -> str:
395 |         connection_string: str = get_from_dict_or_env(
396 |             data=kwargs,
397 |             key="connection_string",
398 |             env_key="PGVECTOR_CONNECTION_STRING",
399 |         )
400 | 
401 |         if not connection_string:
402 |             raise ValueError(
403 |                 "Postgres connection string is required"
404 |                 "Either pass it as a parameter"
405 |                 "or set the PGVECTOR_CONNECTION_STRING environment variable."
406 |             )
407 | 
408 |         return connection_string
409 | 
410 |     @classmethod
411 |     def from_documents(
412 |         cls,
413 |         documents: List[Document],
414 |         embedding: Embeddings,
415 |         collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
416 |         distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
417 |         ids: Optional[List[str]] = None,
418 |         pre_delete_collection: bool = False,
419 |         **kwargs: Any,
420 |     ) -> "PGVectorExtended":
421 |         """
422 |         Return VectorStore initialized from documents and embeddings.
423 |         Postgres connection string is required
424 |         "Either pass it as a parameter
425 |         or set the PGVECTOR_CONNECTION_STRING environment variable.
426 |         """
427 | 
428 |         texts = [d.page_content for d in documents]
429 |         metadatas = [d.metadata for d in documents]
430 |         connection_string = cls.get_connection_string(kwargs)
431 | 
432 |         kwargs["connection_string"] = connection_string
433 | 
434 |         return cls.from_texts(
435 |             texts=texts,
436 |             pre_delete_collection=pre_delete_collection,
437 |             embedding=embedding,
438 |             distance_strategy=distance_strategy,
439 |             metadatas=metadatas,
440 |             ids=ids,
441 |             collection_name=collection_name,
442 |             **kwargs,
443 |         )
444 | 
445 |     @classmethod
446 |     def connection_string_from_db_params(
447 |         cls,
448 |         driver: str,
449 |         host: str,
450 |         port: int,
451 |         database: str,
452 |         user: str,
453 |         password: str,
454 |     ) -> str:
455 |         """Return connection string from database parameters."""
456 |         return f"postgresql+{driver}://{user}:{password}@{host}:{port}/{database}"
457 | 
458 |     def delete_keys(
459 |         self,
460 |         ids: Optional[List[str]] = None,
461 |         **kwargs: Any,
462 |     ) -> None:
463 |         """Delete vectors by ids or uuids.
464 | 
465 |         Args:
466 |             ids: List of ids to delete.
467 |         """
468 |         with Session(self._engine) as session:
469 |             if ids is not None:
470 |                 self.logger.debug(
471 |                     "Trying to delete vectors by ids (represented by the model "
472 |                     "using the custom ids field)"
473 |                 )
474 |                 stmt = delete(self.EmbeddingStore).where(
475 |                     self.EmbeddingStore.custom_id.in_(ids)
476 |                 )
477 |                 session.execute(stmt)
478 |             session.commit()


--------------------------------------------------------------------------------
/code/utilities/redis.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import uuid
  4 | from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Tuple
  5 | 
  6 | from langchain.vectorstores.redis import Redis
  7 | from langchain.docstore.document import Document
  8 | from langchain.embeddings.base import Embeddings
  9 | from langchain.vectorstores.base import VectorStore
 10 | 
 11 | import pandas as pd
 12 | from redis.commands.search.query import Query
 13 | from redis.commands.search.indexDefinition import IndexDefinition, IndexType
 14 | from redis.commands.search.field import VectorField, TagField, TextField
 15 | 
 16 | logger = logging.getLogger()
 17 | 
 18 | class RedisExtended(Redis):
 19 |     def __init__(
 20 |         self,
 21 |         redis_url: str,
 22 |         index_name: str,
 23 |         embedding_function: Callable,
 24 |         **kwargs: Any,
 25 |     ):
 26 |         super().__init__(redis_url, index_name, embedding_function)
 27 |         
 28 |         # Check if index exists
 29 |         try:
 30 |             self.client.ft("prompt-index").info()
 31 |         except: 
 32 |             # Create Redis Index
 33 |             self.create_prompt_index()
 34 | 
 35 |         try:
 36 |             self.client.ft(self.index_name).info()
 37 |         except:
 38 |             # Create Redis Index
 39 |             self.create_index()
 40 | 
 41 |     def check_existing_index(self, index_name: str = None):
 42 |         try:
 43 |             self.client.ft(index_name if index_name else self.index_name).info()
 44 |             return True
 45 |         except:
 46 |             return False
 47 | 
 48 |     def delete_keys(self, keys: List[str]) -> None:
 49 |         for key in keys:
 50 |             self.client.delete(key)
 51 |     
 52 |     def delete_keys_pattern(self, pattern: str) -> None:
 53 |         keys = self.client.keys(pattern)
 54 |         self.delete_keys(keys)
 55 | 
 56 |     def create_index(self, prefix = "doc", distance_metric:str="COSINE"):
 57 |         content = TextField(name="content")
 58 |         metadata = TextField(name="metadata")
 59 |         content_vector = VectorField("content_vector",
 60 |                     "HNSW", {
 61 |                         "TYPE": "FLOAT32",
 62 |                         "DIM": 1536,
 63 |                         "DISTANCE_METRIC": distance_metric,
 64 |                         "INITIAL_CAP": 1000,
 65 |                     })
 66 |         # Create index
 67 |         self.client.ft(self.index_name).create_index(
 68 |             fields = [content, metadata, content_vector],
 69 |             definition = IndexDefinition(prefix=[prefix], index_type=IndexType.HASH)
 70 |         )
 71 | 
 72 |     # Prompt management
 73 |     def create_prompt_index(self, index_name="prompt-index", prefix = "prompt"):
 74 |         result = TextField(name="result")
 75 |         filename = TextField(name="filename")
 76 |         prompt = TextField(name="prompt")
 77 |         # Create index
 78 |         self.client.ft(index_name).create_index(
 79 |             fields = [result, filename, prompt],
 80 |             definition = IndexDefinition(prefix=[prefix], index_type=IndexType.HASH)
 81 |         )
 82 | 
 83 |     def add_prompt_result(self, id, result, filename="", prompt=""):
 84 |         self.client.hset(
 85 |             f"prompt:{id}",
 86 |             mapping={
 87 |                 "result": result,
 88 |                 "filename": filename,
 89 |                 "prompt": prompt
 90 |             }
 91 |         )
 92 | 
 93 |     def get_prompt_results(self, prompt_index_name="prompt-index", number_of_results: int=3155):
 94 |         base_query = f'*'
 95 |         return_fields = ['id','result','filename','prompt']
 96 |         query = Query(base_query)\
 97 |             .paging(0, number_of_results)\
 98 |             .return_fields(*return_fields)\
 99 |             .dialect(2)
100 |         results = self.client.ft(prompt_index_name).search(query)
101 |         if results.docs:
102 |             return pd.DataFrame(list(map(lambda x: {'id' : x.id, 'filename': x.filename, 'prompt': x.prompt, 'result': x.result.replace('\n',' ').replace('\r',' '),}, results.docs))).sort_values(by='id')
103 |         else:
104 |             return pd.DataFrame()
105 | 
106 |     def delete_prompt_results(self, prefix="prompt*"):
107 |         self.delete_keys_pattern(pattern=prefix)
108 | 


--------------------------------------------------------------------------------
/code/utilities/translator.py:
--------------------------------------------------------------------------------
 1 | import os, requests, urllib
 2 | from dotenv import load_dotenv
 3 | 
 4 | default_languages = {"translation":{"af":{"name":"Afrikaans","nativeName":"Afrikaans","dir":"ltr"},"am":{"name":"Amharic","nativeName":"አማርኛ","dir":"ltr"},"ar":{"name":"Arabic","nativeName":"العربية","dir":"rtl"},"as":{"name":"Assamese","nativeName":"অসমীয়া","dir":"ltr"},"az":{"name":"Azerbaijani","nativeName":"Azərbaycan","dir":"ltr"},"ba":{"name":"Bashkir","nativeName":"Bashkir","dir":"ltr"},"bg":{"name":"Bulgarian","nativeName":"Български","dir":"ltr"},"bn":{"name":"Bangla","nativeName":"বাংলা","dir":"ltr"},"bo":{"name":"Tibetan","nativeName":"བོད་སྐད་","dir":"ltr"},"bs":{"name":"Bosnian","nativeName":"Bosnian","dir":"ltr"},"ca":{"name":"Catalan","nativeName":"Català","dir":"ltr"},"cs":{"name":"Czech","nativeName":"Čeština","dir":"ltr"},"cy":{"name":"Welsh","nativeName":"Cymraeg","dir":"ltr"},"da":{"name":"Danish","nativeName":"Dansk","dir":"ltr"},"de":{"name":"German","nativeName":"Deutsch","dir":"ltr"},"dsb":{"name":"Lower Sorbian","nativeName":"Dolnoserbšćina","dir":"ltr"},"dv":{"name":"Divehi","nativeName":"ދިވެހިބަސް","dir":"rtl"},"el":{"name":"Greek","nativeName":"Ελληνικά","dir":"ltr"},"en":{"name":"English","nativeName":"English","dir":"ltr"},"es":{"name":"Spanish","nativeName":"Español","dir":"ltr"},"et":{"name":"Estonian","nativeName":"Eesti","dir":"ltr"},"eu":{"name":"Basque","nativeName":"Euskara","dir":"ltr"},"fa":{"name":"Persian","nativeName":"فارسی","dir":"rtl"},"fi":{"name":"Finnish","nativeName":"Suomi","dir":"ltr"},"fil":{"name":"Filipino","nativeName":"Filipino","dir":"ltr"},"fj":{"name":"Fijian","nativeName":"Na Vosa Vakaviti","dir":"ltr"},"fo":{"name":"Faroese","nativeName":"Føroyskt","dir":"ltr"},"fr":{"name":"French","nativeName":"Français","dir":"ltr"},"fr-CA":{"name":"French (Canada)","nativeName":"Français 
(Canada)","dir":"ltr"},"ga":{"name":"Irish","nativeName":"Gaeilge","dir":"ltr"},"gl":{"name":"Galician","nativeName":"Galego","dir":"ltr"},"gom":{"name":"Konkani","nativeName":"Konkani","dir":"ltr"},"gu":{"name":"Gujarati","nativeName":"ગુજરાતી","dir":"ltr"},"ha":{"name":"Hausa","nativeName":"Hausa","dir":"ltr"},"he":{"name":"Hebrew","nativeName":"עברית","dir":"rtl"},"hi":{"name":"Hindi","nativeName":"हिन्दी","dir":"ltr"},"hr":{"name":"Croatian","nativeName":"Hrvatski","dir":"ltr"},"hsb":{"name":"Upper Sorbian","nativeName":"Hornjoserbšćina","dir":"ltr"},"ht":{"name":"Haitian Creole","nativeName":"Haitian Creole","dir":"ltr"},"hu":{"name":"Hungarian","nativeName":"Magyar","dir":"ltr"},"hy":{"name":"Armenian","nativeName":"Հայերեն","dir":"ltr"},"id":{"name":"Indonesian","nativeName":"Indonesia","dir":"ltr"},"ig":{"name":"Igbo","nativeName":"Ásụ̀sụ́ Ìgbò","dir":"ltr"},"ikt":{"name":"Inuinnaqtun","nativeName":"Inuinnaqtun","dir":"ltr"},"is":{"name":"Icelandic","nativeName":"Íslenska","dir":"ltr"},"it":{"name":"Italian","nativeName":"Italiano","dir":"ltr"},"iu":{"name":"Inuktitut","nativeName":"ᐃᓄᒃᑎᑐᑦ","dir":"ltr"},"iu-Latn":{"name":"Inuktitut (Latin)","nativeName":"Inuktitut (Latin)","dir":"ltr"},"ja":{"name":"Japanese","nativeName":"日本語","dir":"ltr"},"ka":{"name":"Georgian","nativeName":"ქართული","dir":"ltr"},"kk":{"name":"Kazakh","nativeName":"Қазақ Тілі","dir":"ltr"},"km":{"name":"Khmer","nativeName":"ខ្មែរ","dir":"ltr"},"kmr":{"name":"Kurdish (Northern)","nativeName":"Kurdî (Bakur)","dir":"ltr"},"kn":{"name":"Kannada","nativeName":"ಕನ್ನಡ","dir":"ltr"},"ko":{"name":"Korean","nativeName":"한국어","dir":"ltr"},"ku":{"name":"Kurdish (Central)","nativeName":"Kurdî 
(Navîn)","dir":"rtl"},"ky":{"name":"Kyrgyz","nativeName":"Кыргызча","dir":"ltr"},"ln":{"name":"Lingala","nativeName":"Lingála","dir":"ltr"},"lo":{"name":"Lao","nativeName":"ລາວ","dir":"ltr"},"lt":{"name":"Lithuanian","nativeName":"Lietuvių","dir":"ltr"},"lug":{"name":"Ganda","nativeName":"Ganda","dir":"ltr"},"lv":{"name":"Latvian","nativeName":"Latviešu","dir":"ltr"},"lzh":{"name":"Chinese (Literary)","nativeName":"中文 (文言文)","dir":"ltr"},"mai":{"name":"Maithili","nativeName":"Maithili","dir":"ltr"},"mg":{"name":"Malagasy","nativeName":"Malagasy","dir":"ltr"},"mi":{"name":"Māori","nativeName":"Te Reo Māori","dir":"ltr"},"mk":{"name":"Macedonian","nativeName":"Македонски","dir":"ltr"},"ml":{"name":"Malayalam","nativeName":"മലയാളം","dir":"ltr"},"mn-Cyrl":{"name":"Mongolian (Cyrillic)","nativeName":"Mongolian (Cyrillic)","dir":"ltr"},"mn-Mong":{"name":"Mongolian (Traditional)","nativeName":"ᠮᠣᠩᠭᠣᠯ ᠬᠡᠯᠡ","dir":"ltr"},"mr":{"name":"Marathi","nativeName":"मराठी","dir":"ltr"},"ms":{"name":"Malay","nativeName":"Melayu","dir":"ltr"},"mt":{"name":"Maltese","nativeName":"Malti","dir":"ltr"},"mww":{"name":"Hmong Daw","nativeName":"Hmong Daw","dir":"ltr"},"my":{"name":"Myanmar (Burmese)","nativeName":"မြန်မာ","dir":"ltr"},"nb":{"name":"Norwegian","nativeName":"Norsk Bokmål","dir":"ltr"},"ne":{"name":"Nepali","nativeName":"नेपाली","dir":"ltr"},"nl":{"name":"Dutch","nativeName":"Nederlands","dir":"ltr"},"nso":{"name":"Sesotho sa Leboa","nativeName":"Sesotho sa Leboa","dir":"ltr"},"nya":{"name":"Nyanja","nativeName":"Nyanja","dir":"ltr"},"or":{"name":"Odia","nativeName":"ଓଡ଼ିଆ","dir":"ltr"},"otq":{"name":"Querétaro Otomi","nativeName":"Hñähñu","dir":"ltr"},"pa":{"name":"Punjabi","nativeName":"ਪੰਜਾਬੀ","dir":"ltr"},"pl":{"name":"Polish","nativeName":"Polski","dir":"ltr"},"prs":{"name":"Dari","nativeName":"دری","dir":"rtl"},"ps":{"name":"Pashto","nativeName":"پښتو","dir":"rtl"},"pt":{"name":"Portuguese (Brazil)","nativeName":"Português 
(Brasil)","dir":"ltr"},"pt-PT":{"name":"Portuguese (Portugal)","nativeName":"Português (Portugal)","dir":"ltr"},"ro":{"name":"Romanian","nativeName":"Română","dir":"ltr"},"ru":{"name":"Russian","nativeName":"Русский","dir":"ltr"},"run":{"name":"Rundi","nativeName":"Rundi","dir":"ltr"},"rw":{"name":"Kinyarwanda","nativeName":"Kinyarwanda","dir":"ltr"},"sd":{"name":"Sindhi","nativeName":"سنڌي","dir":"ltr"},"si":{"name":"Sinhala","nativeName":"සිංහල","dir":"ltr"},"sk":{"name":"Slovak","nativeName":"Slovenčina","dir":"ltr"},"sl":{"name":"Slovenian","nativeName":"Slovenščina","dir":"ltr"},"sm":{"name":"Samoan","nativeName":"Gagana Sāmoa","dir":"ltr"},"sn":{"name":"Shona","nativeName":"chiShona","dir":"ltr"},"so":{"name":"Somali","nativeName":"Soomaali","dir":"ltr"},"sq":{"name":"Albanian","nativeName":"Shqip","dir":"ltr"},"sr-Cyrl":{"name":"Serbian (Cyrillic)","nativeName":"Српски (ћирилица)","dir":"ltr"},"sr-Latn":{"name":"Serbian (Latin)","nativeName":"Srpski (latinica)","dir":"ltr"},"st":{"name":"Sesotho","nativeName":"Sesotho","dir":"ltr"},"sv":{"name":"Swedish","nativeName":"Svenska","dir":"ltr"},"sw":{"name":"Swahili","nativeName":"Kiswahili","dir":"ltr"},"ta":{"name":"Tamil","nativeName":"தமிழ்","dir":"ltr"},"te":{"name":"Telugu","nativeName":"తెలుగు","dir":"ltr"},"th":{"name":"Thai","nativeName":"ไทย","dir":"ltr"},"ti":{"name":"Tigrinya","nativeName":"ትግር","dir":"ltr"},"tk":{"name":"Turkmen","nativeName":"Türkmen Dili","dir":"ltr"},"tlh-Latn":{"name":"Klingon (Latin)","nativeName":"Klingon (Latin)","dir":"ltr"},"tlh-Piqd":{"name":"Klingon (pIqaD)","nativeName":"Klingon (pIqaD)","dir":"ltr"},"tn":{"name":"Setswana","nativeName":"Setswana","dir":"ltr"},"to":{"name":"Tongan","nativeName":"Lea Fakatonga","dir":"ltr"},"tr":{"name":"Turkish","nativeName":"Türkçe","dir":"ltr"},"tt":{"name":"Tatar","nativeName":"Татар","dir":"ltr"},"ty":{"name":"Tahitian","nativeName":"Reo 
Tahiti","dir":"ltr"},"ug":{"name":"Uyghur","nativeName":"ئۇيغۇرچە","dir":"rtl"},"uk":{"name":"Ukrainian","nativeName":"Українська","dir":"ltr"},"ur":{"name":"Urdu","nativeName":"اردو","dir":"rtl"},"uz":{"name":"Uzbek (Latin)","nativeName":"Uzbek (Latin)","dir":"ltr"},"vi":{"name":"Vietnamese","nativeName":"Tiếng Việt","dir":"ltr"},"xh":{"name":"Xhosa","nativeName":"isiXhosa","dir":"ltr"},"yo":{"name":"Yoruba","nativeName":"Èdè Yorùbá","dir":"ltr"},"yua":{"name":"Yucatec Maya","nativeName":"Yucatec Maya","dir":"ltr"},"yue":{"name":"Cantonese (Traditional)","nativeName":"粵語 (繁體)","dir":"ltr"},"zh-Hans":{"name":"Chinese Simplified","nativeName":"中文 (简体)","dir":"ltr"},"zh-Hant":{"name":"Chinese Traditional","nativeName":"繁體中文 (繁體)","dir":"ltr"},"zu":{"name":"Zulu","nativeName":"Isi-Zulu","dir":"ltr"}}}
 5 | 
 6 | class AzureTranslatorClient:
 7 |     def __init__(self, translate_key=None, translate_region=None, translate_endpoint=None):
 8 | 
 9 |         load_dotenv()
10 | 
11 |         self.translate_key = translate_key if translate_key else os.getenv('TRANSLATE_KEY')
12 |         self.translate_region = translate_region if translate_region else os.getenv('TRANSLATE_REGION')
13 |         self.translate_endpoint = translate_endpoint if translate_endpoint else os.getenv('TRANSLATE_ENDPOINT')
14 |         self.api_version = "3.0"
15 | 
16 |         if os.getenv('VNET_DEPLOYMENT', 'false') == 'false':
17 |             self.detect_endpoint = urllib.parse.urljoin(self.translate_endpoint, f"/detect?api-version={self.api_version}")
18 |             self.translate_endpoint = urllib.parse.urljoin(self.translate_endpoint, f"/translate?api-version={self.api_version}")
19 |         else:
20 |             self.detect_endpoint = urllib.parse.urljoin(self.translate_endpoint, f"/translator/text/v3.0/detect?api-version={self.api_version}")
21 |             self.translate_endpoint = urllib.parse.urljoin(self.translate_endpoint, f"/translator/text/v3.0/translate?api-version={self.api_version}")
22 | 
23 | 
24 |     def translate(self, text, language='en'):
25 |         headers = {
26 |             'Ocp-Apim-Subscription-Key': self.translate_key,
27 |             'Ocp-Apim-Subscription-Region': self.translate_region,
28 |             'Content-type': 'application/json'
29 |         }
30 |         params = urllib.parse.urlencode({})
31 |         body = [{
32 |             'text': text
33 |         }]
34 |         request = requests.post(self.detect_endpoint, params=params, headers=headers, json=body)
35 |         response = request.json()
36 |         if (response[0]['language'] != language):
37 |             params = urllib.parse.urlencode({
38 |                 'from': response[0]['language'],
39 |                 'to': language
40 |             })
41 |             body = [{
42 |                 'text': text
43 |             }]
44 |             request = requests.post(self.translate_endpoint, params=params, headers=headers, json=body)
45 |             response = request.json()
46 |             return response[0]['translations'][0]['text']
47 |         else:
48 |             return text
49 |         
50 | 
51 |     def get_available_languages(self):
52 |         if os.getenv('VNET_DEPLOYMENT', 'false') == 'true':
53 |             available_languages = default_languages['translation']
54 |         else:
55 |             r = requests.get(f"https://api.cognitive.microsofttranslator.com/languages?api-version={self.api_version}&scope=translation")
56 |             available_languages = r.json()['translation']
57 |         languages = {}
58 |         for k,v  in available_languages.items():
59 |             languages[v['name']] =  k
60 |         return languages


--------------------------------------------------------------------------------
/demo/.dockerignore:
--------------------------------------------------------------------------------
1 | .env
2 | __pycache__


--------------------------------------------------------------------------------
/demo/Dockerfile:
--------------------------------------------------------------------------------
# Image for the Streamlit Q&A demo (demo.py), served on port 80.
FROM python:3.9.10-slim-buster
# Install the Tk packages, then drop the apt package lists in the same layer
# so they are not baked into the image.
RUN apt-get update && apt-get install python-tk python3-tk tk-dev -y \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer is reused when only code changes.
COPY ./requirements.txt /usr/local/src/myscripts/requirements.txt
WORKDIR /usr/local/src/myscripts
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY . /usr/local/src/myscripts
EXPOSE 80
# NOTE(review): XSRF protection is switched off here - confirm the hosting setup really requires it.
CMD ["streamlit", "run", "demo.py", "--server.port", "80", "--server.enableXsrfProtection", "false"]


--------------------------------------------------------------------------------
/demo/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-open-ai-embeddings-qna/96ce23acbe613a8031a5f7e5b02cc87d17a512c4/demo/architecture.png


--------------------------------------------------------------------------------
/demo/demo.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | from helper import get_semantic_answer
 3 | 
# Page header for the demo: hides Streamlit's main menu, renders the logo,
# title and introduction, then creates the three tabs filled in below.
# st.set_page_config(layout="wide")
# CSS snippet that hides the element with id "MainMenu".
hide_menu_style = """<style>#MainMenu {visibility: hidden;}</style>"""
# unsafe_allow_html is needed so the <style> tag is emitted as HTML.
st.markdown(hide_menu_style, unsafe_allow_html=True)

st.image('./microsoft.png', width=200)
st.title('Azure OpenAI Service Q&A Demo')
st.caption('Sponsored by Microsoft\'s Global Black Belt team for AI in EMEA')
st.write('This demo shows how Azure OpenAI Service can be used to answer questions on unstructured data. It was trained on the 10K form dataset. Under the hood, we use OpenAI\'s GPT-3 models and embeddings to generate answers to the users\' questions.')

# tab1: interactive Q&A, tab2: sample questions, tab3: explanation of the pipeline.
tab1, tab2, tab3 = st.tabs(["Demo", "Sample questions", "How does this demo work?"])
14 | 
# "Demo" tab: take a question, fetch an answer via get_semantic_answer and
# show both the answer and the prompt that produced it.
with tab1:
    st.write('Try asking a question like:\n\nWhat is Azure? Give me a long answer!')
    question = st.text_input("Question:")

    # The text input is an empty string until the user submits something.
    if question != '':
        answer, prompt = get_semantic_answer(question)
        st.write(f"**Question:** {question}")
        st.write(f"**Answer:** {answer}")
        with st.expander("Click here to see the prompt we've used to generate the answer", expanded=False):
            # Backslash-escape dollar signs before rendering the prompt as markdown.
            # '\\$' is the explicit spelling of the backslash + '$' pair; the former
            # '\$' relied on an invalid escape sequence that newer Pythons warn about.
            prompt = prompt.replace('$', '\\$')
            st.markdown(f":star: **Short explanation**\n1. The first part of the prompt is the retrieved documents that were likely to contain the answer\n1. The second part is the actual prompt to answer our question\n\n:star: **Prompt:**\n{prompt}")
# "Sample questions" tab: two columns of example questions plus usage hints.
with tab2:
    st.write('Try asking questions like:')
    col1, col2 = st.columns(2, gap="medium")
    with col1:
        # The markdown bullets start at column 0 inside the triple-quoted string.
        st.markdown("""
* What's Microsoft's mission? Give me a long answer!
* What licenses Microsoft offers?
* What's an Enterprise Agreement?
* Tell me more about Microsoft execs
* Who's Amy Hood? how long in role? and previously?""")

    with col2:
        # NOTE(review): the "Who's Amy Hood?" question is listed in both columns.
        st.markdown("""
* Who's Amy Hood? how long in role? and previously?
* What Microsoft mean with Intelligent Cloud?
* Where does Github sit in Microsoft portfolio?
* Which are Microsoft competitors in intelligent cloud business?
* What's microsoft commitment to inclusion?""")

    st.write("If you want a shorter answer, you can say \"Write a short answer\" or do the opposite and say \"Give me a long answer\".")
    st.write("You can also ask questions in other languages, e.g., try to ask a question in German or Spanish.")
47 | 
# "How does this demo work?" tab: static explanation of the components,
# the architecture image, and a step-by-step walkthrough.
with tab3:
   st.header("How does this demo work?")
   # Component overview (rendered as a markdown bullet list).
   st.markdown("""
               This demo leverages the following components to achieve a ChatGPT-like experience on unstructured documents:
               * **Azure OpenAI Service** to generate answers to questions
               * **Azure OpenAI Service Embeddings** to semantically extract the "meaning of a document"
               * **RediSearch** to store the embeddings and perform search queries
               * **Azure Form Recognizer** to extract the text from the documents
               """)
   st.image("./architecture.png", caption="Solution Architecture")
   # Step-by-step description; every item uses "1." as its list marker.
   st.markdown("""
               So, what is happening here? Let's break it down:
               1. Firstly, we parse the documents in our knowledge base and extract the text using Azure Form Recognizer. We do this since data might be in PDF format, but it also allows to create smaller text chunks. We do not want to work on documents that are 100's of pages long.
               1. Next, we use Azure OpenAI Service Embeddings to semantically extract the "meaning of a document". This converts the sections of each document into a vector (basically a long series of numbers, 1536 to be more precise), which represents the semantics of each document section. We store this vector in RediSearch.
               1. As the user asks a question, we again use Azure OpenAI Service Embeddings to semantically extract the "meaning of the question". We then use RediSearch to find the most similar documents to the question. In our case, we use the top 3 documents. These documents are likely to contain the answer to our question.
               1. Now that we have the matching documents, we use Azure OpenAI Service to generate an answer to our question. To do this, we use the top 3 documents as the context to generate the answer, given the original question of the user. You can see this prompt when you click on the "Click here to see the prompt we've used to generate the answer" link.
               1. Finally, we return the answer to the user. Done!
               """)


--------------------------------------------------------------------------------
/demo/helper.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import openai
 3 | from dotenv import load_dotenv
 4 | from openai.embeddings_utils import get_embedding
 5 | from tenacity import retry, wait_random_exponential, stop_after_attempt
 6 | import pandas as pd
 7 | import numpy as np
 8 | from redis.commands.search.query import Query
 9 | from redis import Redis
10 | 
# Module-level configuration: everything below runs once at import time and
# is read from environment variables.
load_dotenv()

# OpenAI client settings (Azure API type, fixed API version).
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base =  os.getenv("OPENAI_API_BASE")
openai.api_type = 'azure'
openai.api_version = '2022-12-01'
# OPENAI_ENGINES is a comma-separated list; only the first entry is used.
# NOTE: os.getenv returns None for an unset variable, so a missing
# OPENAI_ENGINES or QUESTION_PROMPT fails here with AttributeError at import.
completion_model = os.getenv("OPENAI_ENGINES").split(',')[0]
embedding_model = os.getenv("OPENAI_EMBEDDINGS_ENGINE_DOC")
# Turn literal backslash-n sequences from the env value into real newlines.
question_prompt = os.getenv("QUESTION_PROMPT").replace(r'\n', '\n')
# How many of the top matching texts are joined into the prompt (default 1).
number_of_embeddings_for_qna = int(os.getenv("NUMBER_OF_EMBEDDINGS_FOR_QNA", 1))

# Shared Redis connection; the port defaults to 6379 when REDIS_PORT is unset.
redis_conn = Redis(host=os.getenv('REDIS_ADDRESS'), port=int(os.environ.get('REDIS_PORT','6379')), password=os.getenv('REDIS_PASSWORD'))
# Name of the search index queried by find_matching_vectors_in_redis.
index_name = "embeddings-index"
# Not referenced by the functions in this module - presumably used elsewhere; TODO confirm.
prompt_index_name = "prompt-index"
25 | 
26 | 
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text) -> list[float]:
    """Return the embedding vector for ``text`` using the configured embedding engine.

    Newlines are replaced by spaces before the request. The call is retried
    with random exponential backoff (1-20s) for up to 6 attempts.
    """
    single_line = text.replace("\n", " ")
    result = openai.Embedding.create(input=single_line, engine=embedding_model)
    first_item = result["data"][0]
    return first_item["embedding"]
31 | 
32 | 
def find_matching_vectors_in_redis(np_vector: np.ndarray, return_fields: list = None, search_type: str = "KNN", number_of_results: int = 20, vector_field_name: str = "embeddings"):
    """Run a vector similarity search against the Redis index ``index_name``.

    Args:
        np_vector: Query embedding; it is cast to float32 and sent as raw bytes.
        return_fields: Optional field names forwarded to ``Query.return_fields``.
            ``None`` (the default) behaves like the former empty-list default.
        search_type: Search operator placed in the query (default ``"KNN"``).
        number_of_results: Number of neighbours requested and page size.
        vector_field_name: Name of the indexed vector field.

    Returns:
        A pandas DataFrame with the columns ``id``, ``text``, ``filename`` and
        ``vector_score``, one row per returned document, sorted by
        ``vector_score``. The columns are present even when nothing matched.
    """
    # Avoid a shared mutable default list across calls.
    if return_fields is None:
        return_fields = []

    base_query = f'*=>[{search_type} {number_of_results} @{vector_field_name} $vec_param AS vector_score]'
    query = (
        Query(base_query)
        .sort_by("vector_score")
        .paging(0, number_of_results)
        .return_fields(*return_fields)
        .dialect(2)
    )
    params_dict = {"vec_param": np_vector.astype(dtype=np.float32).tobytes()}
    results = redis_conn.ft(index_name).search(query, params_dict)

    rows = [
        {'id': doc.id, 'text': doc.text, 'filename': doc.filename, 'vector_score': doc.vector_score}
        for doc in results.docs
    ]
    # Fixing the columns keeps the frame's shape stable for an empty result.
    return pd.DataFrame(rows, columns=['id', 'text', 'filename', 'vector_score'])
43 | 
44 | 
def search_semantic_redis(search_query, pprint=True):
    """Embed ``search_query`` and return the matching documents from Redis.

    Args:
        search_query: Free-text query to embed and search for.
        pprint: When true, print the first 200 characters of each matched text.

    Returns:
        The DataFrame from ``find_matching_vectors_in_redis`` after
        ``reset_index()``.
    """
    embedding = get_embedding(search_query)
    res = find_matching_vectors_in_redis(np.array(embedding))
    # Iterating a DataFrame directly yields its column labels, so walk the
    # 'text' column explicitly (it is absent when nothing matched).
    if pprint and 'text' in res:
        for text in res['text']:
            print(text[:200])
            print()
    return res.reset_index()
53 | 
54 | 
def get_semantic_answer(question):
    """Answer ``question`` from the best-matching stored texts.

    Looks up matching documents, joins the top
    ``number_of_embeddings_for_qna`` texts as context, appends the question
    prompt and sends the result to the completion model.

    Returns:
        ``(answer, prompt)``; when no documents matched, ``(None, message)``.
    """
    matches = search_semantic_redis(question, pprint=False)
    if len(matches) == 0:
        return None, "No vectors matched, try a different question."

    # Context first, then the question prompt, separated by a blank line.
    context = "\n".join(matches['text'][0:number_of_embeddings_for_qna])
    filled_prompt = question_prompt.replace("_QUESTION_", question)
    prompt = f"{context}\n\n{filled_prompt}"

    completion_args = dict(
        engine=completion_model,
        prompt=prompt,
        temperature=0.7,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    completion = openai.Completion.create(**completion_args)

    # Echo the prompt and the answer to stdout.
    print(prompt)
    answer = completion['choices'][0]['text'].strip()
    print(f"{answer}\n\n\n")
    return answer, prompt


--------------------------------------------------------------------------------
/demo/microsoft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-open-ai-embeddings-qna/96ce23acbe613a8031a5f7e5b02cc87d17a512c4/demo/microsoft.png


--------------------------------------------------------------------------------
/demo/requirements.txt:
--------------------------------------------------------------------------------
 1 | streamlit==1.18.1
 2 | openai==0.26.5
 3 | redis==4.4.2
 4 | python-dotenv==0.21.0
 5 | numpy
 6 | pandas
 7 | matplotlib==3.6.3
 8 | plotly==5.12.0
 9 | scipy==1.10.0
10 | scikit-learn==1.2.0


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
# Local stack: web UI, Redis Stack server and batch processor.
# All three services read their configuration from the same .env file.
version: "3.9"
services:
  # Web app image, reachable on host port 8080 (container port 80).
  web:
    image: fruocco/oai-embeddings
    ports:
      - "8080:80"
    env_file:
      - .env
    # Start only once the Redis service ("api") reports healthy.
    depends_on:
      api:
        condition: service_healthy
  # Despite its name, this service is the Redis Stack server.
  api:
    image: redis/redis-stack-server:latest
    ports:
      - "6379:6379"
    env_file:
      - .env
    # Healthy when `redis-cli ping` succeeds; probed every 2s, up to 5 retries,
    # after a 5s start period.
    healthcheck:
      test: ["CMD", "redis-cli", "-h", "localhost", "-p", "6379", "ping"]
      interval: 2s
      timeout: 1m30s
      retries: 5
      start_period: 5s
  # Batch image, reachable on host port 8081 (container port 80).
  batch:
    image: fruocco/oai-batch:latest
    ports: 
      - "8081:80"
    env_file:
      - .env
    # Same startup gate as the web service.
    depends_on:
      api:
        condition: service_healthy


--------------------------------------------------------------------------------
/docs/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-open-ai-embeddings-qna/96ce23acbe613a8031a5f7e5b02cc87d17a512c4/docs/architecture.png


--------------------------------------------------------------------------------
/docs/architecture_acre.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-open-ai-embeddings-qna/96ce23acbe613a8031a5f7e5b02cc87d17a512c4/docs/architecture_acre.png


--------------------------------------------------------------------------------
/docs/architecture_acs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-open-ai-embeddings-qna/96ce23acbe613a8031a5f7e5b02cc87d17a512c4/docs/architecture_acs.png


--------------------------------------------------------------------------------
/docs/architecture_pg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-open-ai-embeddings-qna/96ce23acbe613a8031a5f7e5b02cc87d17a512c4/docs/architecture_pg.png


--------------------------------------------------------------------------------
/docs/architecture_redis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Azure-Samples/azure-open-ai-embeddings-qna/96ce23acbe613a8031a5f7e5b02cc87d17a512c4/docs/architecture_redis.png


--------------------------------------------------------------------------------
/infrastructure/deployment.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
  3 |   "contentVersion": "1.0.0.0",
  4 |   "parameters": {
  5 |     "ResourcePrefix": {
  6 |       "type": "string",
  7 |       "metadata": {
  8 |         "description": "Provide a 2-13 character prefix for all resources."
  9 |       }
 10 |     },
 11 |     "ContainerName": {
 12 |       "type": "string",
 13 |       "defaultValue": "redis",
 14 |       "metadata": {
 15 |         "description": "Name of the container"
 16 |       }
 17 |     },
 18 |     "DNSNameLabel": {
 19 |       "type": "string",
 20 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-redis')]",
 21 |       "metadata": {
 22 |         "description": "DNS Name Label for the Public IP address"
 23 |       }
 24 |     },
 25 |     "RedisPassword": {
 26 |       "type": "securestring",
 27 |       "defaultValue": "redis",
 28 |       "metadata": {
 29 |         "description": "Redis Password"
 30 |       }
 31 |     },
 32 |     "HostingPlanName": {
 33 |       "type": "string",
 34 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-plan')]",
 35 |       "metadata": {
 36 |         "description": "Name of App Service plan"
 37 |       }
 38 |     },
 39 |     "HostingPlanSku": {
 40 |       "type": "string",
 41 |       "defaultValue": "B3",
 42 |       "allowedValues": [
 43 |         "F1",
 44 |         "D1",
 45 |         "B1",
 46 |         "B2",
 47 |         "B3",
 48 |         "S1",
 49 |         "S2",
 50 |         "S3",
 51 |         "P1",
 52 |         "P2",
 53 |         "P3",
 54 |         "P4"
 55 |       ],
 56 |       "metadata": {
 57 |         "description": "The pricing tier for the App Service plan"
 58 |       }
 59 |     },
 60 |     "StorageAccountName": {
 61 |       "type": "string",
 62 |       "defaultValue": "[concat(parameters('ResourcePrefix'), 'str')]",
 63 |       "metadata": {
 64 |         "description": "Name of Storage Account"
 65 |       }
 66 |     },
 67 |     "WebsiteName": {
 68 |       "type": "string",
 69 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-site')]",
 70 |       "metadata": {
 71 |         "description": "Name of Web App"
 72 |       }
 73 |     },
 74 |     "FunctionName": {
 75 |       "type": "string",
 76 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-batchfunc')]",
 77 |       "metadata": {
 78 |         "description": "Name of Function App for Batch document processing"
 79 |       }
 80 |     },
 81 |     "ApplicationInsightsName": {
 82 |       "type": "string",
 83 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-appinsights')]",
 84 |       "metadata": {
 85 |         "description": "Name of Application Insights"
 86 |       }
 87 |     },
 88 |     "OpenAIName": {
 89 |       "type": "string",
 90 |       "metadata": {
 91 |         "description": "Name of OpenAI Resource"
 92 |       }
 93 |     },
 94 |     "OpenAIKey": {
 95 |       "type": "securestring",
 96 |       "defaultValue": "sk-",
 97 |       "metadata": {
 98 |         "description": "OpenAI API Key"
 99 |       }
100 |     },
101 |     "OpenAIEngine": {
102 |       "type": "string",
103 |       "defaultValue": "text-davinci-003",
104 |       "metadata": {
105 |         "description": "OpenAI Engine"
106 |       }
107 |     },
108 |     "OpenAIDeploymentType": {
109 |       "type": "string",
110 |       "defaultValue": "Text",
111 |       "metadata": {
112 |         "description": "OpenAI Deployment Type. Text for an Instructions based deployment (text-davinci-003). Chat for a Chat based deployment (gpt-35-turbo or gpt-4-32k or gpt-4)."
113 |       }
114 |     },
115 |     "OpenAIEmbeddingsEngineDoc": {
116 |       "type": "string",
117 |       "defaultValue": "text-embedding-ada-002",
118 |       "metadata": {
119 |         "description": "OpenAI Embeddings Engine for Documents"
120 |       }
121 |     },
122 |     "OpenAIEmbeddingsEngineQuery": {
123 |       "type": "string",
124 |       "defaultValue": "text-embedding-ada-002",
125 |       "metadata": {
126 |         "description": "OpenAI Embeddings Engine for Queries"
127 |       }
128 |     },
129 |     "OpenAITemperature": {
130 |       "type": "string",
131 |       "defaultValue": "0.7",
132 |       "metadata": {
133 |         "description": "OpenAI Temperature"
134 |       }
135 |     },
136 |     "OpenAIMaxTokens": {
137 |       "type": "string",
138 |       "defaultValue": "-1",
139 |       "metadata": {
140 |         "description": "OpenAI Max Tokens"
141 |       }
142 |     },
143 |     "FormRecognizerEndpoint": {
144 |       "type": "string",
145 |       "metadata": {
146 |         "description": "Form Recognizer Endpoint"
147 |       }
148 |     },
149 |     "FormRecognizerKey": {
150 |       "type": "securestring",
151 |       "metadata": {
152 |         "description": "Form Recognizer Key"
153 |       }
154 |     },
155 |     "TranslateEndpoint": {
156 |       "type": "string",
157 |       "metadata": {
158 |         "description": "Translator Endpoint"
159 |       }
160 |     },
161 |     "TranslateKey": {
162 |       "type": "securestring",
163 |       "metadata": {
164 |         "description": "Translator Key"
165 |       }
166 |     },
167 |     "TranslateRegion": {
168 |       "type": "string",
169 |       "metadata": {
170 |         "description": "Translator Region"
171 |       }
172 |     },
173 |     "newGuid": {
174 |       "type": "string",
175 |       "defaultValue": "[newGuid()]"
176 |     }
177 |   },
178 |   "variables": {
179 |     "ContainerImageName": "redis/redis-stack-server:latest",
180 |     "WebAppImageName": "DOCKER|fruocco/oai-embeddings",
181 |     "BlobContainerName": "documents",
182 |     "FileShareName": "redisdata",
183 |     "QueueName": "doc-processing",
184 |     "ClientKey": "[concat(uniqueString(guid(resourceGroup().id, deployment().name)), parameters('newGuid'), 'Tg2%')]"
185 |   },
186 |   "resources": [
187 |     {
188 |       "apiVersion": "2020-06-01",
189 |       "name": "[parameters('HostingPlanName')]",
190 |       "type": "Microsoft.Web/serverfarms",
191 |       "location": "[resourceGroup().location]",
192 |       "sku": {
193 |         "name": "[parameters('HostingPlanSku')]"
194 |       },
195 |       "properties": {
196 |         "name": "[parameters('HostingPlanName')]",
197 |         "reserved": true
198 |       },
199 |       "kind": "linux"
200 |     },
201 |     {
202 |       "apiVersion": "2020-06-01",
203 |       "name": "[parameters('WebsiteName')]",
204 |       "type": "Microsoft.Web/sites",
205 |       "location": "[resourceGroup().location]",
206 |       "dependsOn": [
207 |         "[resourceId('Microsoft.Web/serverfarms', parameters('HostingPlanName'))]"
208 |       ],
209 |       "properties": {
210 |         "serverFarmId": "[parameters('HostingPlanName')]",
211 |         "siteConfig": {
212 |           "linuxFxVersion": "[variables('WebAppImageName')]"
213 |         }
214 |       }
215 |     },
216 |     {
217 |       "name": "[parameters('StorageAccountName')]",
218 |       "type": "Microsoft.Storage/storageAccounts",
219 |       "apiVersion": "2021-08-01",
220 |       "location": "[resourceGroup().location]",
221 |       "kind": "StorageV2",
222 |       "sku": {
223 |         "name": "Standard_GRS"
224 |       }
225 |     },
226 |     {
227 |       "type": "Microsoft.Storage/storageAccounts/blobServices/containers",
228 |       "apiVersion": "2021-08-01",
229 |       "name": "[concat(parameters('StorageAccountName'), '/default/', variables('BlobContainerName'))]",
230 |       "dependsOn": [
231 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
232 |       ],
233 |       "properties": {
234 |         "publicAccess": "None"
235 |       }
236 |     },
237 |     {
238 |       "type": "Microsoft.Storage/storageAccounts/fileServices/shares",
239 |       "apiVersion": "2021-08-01",
240 |       "name": "[concat(parameters('StorageAccountName'), '/default/', variables('FileShareName'))]",
241 |       "dependsOn": [
242 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
243 |       ],
244 |       "properties": {
245 | 
246 |         "protocolSettings": {
247 |           "smb": {}
248 |         },
249 |         "cors": {
250 |           "corsRules": []
251 |         },
252 |         "shareDeleteRetentionPolicy": {
253 |           "enabled": true,
254 |           "days": 7
255 |         }
256 |       }
257 |     },
258 |     {
259 |       "type": "Microsoft.Storage/storageAccounts/queueServices",
260 |       "apiVersion": "2022-09-01",
261 |       "name": "[concat(parameters('StorageAccountName'), '/default')]",
262 |       "dependsOn": [
263 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
264 |       ],
265 |       "properties": {
266 |         "cors": {
267 |           "corsRules": []
268 |         }
269 |       }
270 |     },
271 |     {
272 |       "type": "Microsoft.Storage/storageAccounts/queueServices/queues",
273 |       "apiVersion": "2022-09-01",
274 |       "name": "[concat(parameters('StorageAccountName'), '/default/doc-processing')]",
275 |       "dependsOn": [
276 |         "[resourceId('Microsoft.Storage/storageAccounts/queueServices', parameters('StorageAccountName'), 'default')]",
277 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
278 |       ],
279 |       "properties": {
280 |         "metadata": {}
281 |       }
282 |     },
283 |     {
284 |       "type": "Microsoft.Storage/storageAccounts/queueServices/queues",
285 |       "apiVersion": "2022-09-01",
286 |       "name": "[concat(parameters('StorageAccountName'), '/default/doc-processing-poison')]",
287 |       "dependsOn": [
288 |         "[resourceId('Microsoft.Storage/storageAccounts/queueServices', parameters('StorageAccountName'), 'default')]",
289 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
290 |       ],
291 |       "properties": {
292 |         "metadata": {}
293 |       }
294 |     },
295 |     {
296 |       "type": "Microsoft.Insights/components",
297 |       "apiVersion": "2020-02-02",
298 |       "name": "[parameters('ApplicationInsightsName')]",
299 |       "location": "[resourceGroup().location]",
300 |       "tags": {
301 |         "[concat('hidden-link:', resourceId('Microsoft.Web/sites', parameters('ApplicationInsightsName')))]": "Resource"
302 |       },
303 |       "properties": {
304 |         "Application_Type": "web"
305 |       },
306 |       "kind": "web"
307 |     },
308 |     {
309 |       "apiVersion": "2018-11-01",
310 |       "name": "[parameters('FunctionName')]",
311 |       "type": "Microsoft.Web/sites",
312 |       "kind": "functionapp,linux",
313 |       "location": "[resourceGroup().location]",
314 |       "tags": {},
315 |       "dependsOn": [
316 |         "[concat('Microsoft.Web/serverfarms/', parameters('HostingPlanName'))]",
317 |         "[concat('Microsoft.Storage/storageAccounts/', parameters('StorageAccountName'))]",
318 |         "[concat('Microsoft.Insights/components/', parameters('ApplicationInsightsName'))]"
319 |       ],
320 |       "properties": {
321 |         "name": "[parameters('FunctionName')]",
322 |         "siteConfig": {
323 |           "appSettings": [
324 |             {
325 |               "name": "FUNCTIONS_EXTENSION_VERSION",
326 |               "value": "~4"
327 |             },
328 |             {
329 |               "name": "WEBSITES_ENABLE_APP_SERVICE_STORAGE",
330 |               "value": "false"
331 |             },
332 |             {
333 |               "name": "APPINSIGHTS_INSTRUMENTATIONKEY",
334 |               "value": "[reference(resourceId('Microsoft.Insights/components', parameters('ApplicationInsightsName')), '2015-05-01').InstrumentationKey]"
335 |             },
336 |             {
337 |               "name": "AzureWebJobsStorage",
338 |               "value": "[concat('DefaultEndpointsProtocol=https;AccountName=',parameters('StorageAccountName'),';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), '2019-06-01').keys[0].value,';EndpointSuffix=','core.windows.net')]"
339 |             },
340 |             {
341 |               "name": "OPENAI_ENGINE",
342 |               "value": "[parameters('OpenAIEngine')]"
343 |             },
344 |             {
345 |               "name": "OPENAI_DEPLOYMENT_TYPE",
346 |               "value": "[parameters('OpenAIDeploymentType')]"
347 |             },            
348 |             {
349 |               "name": "OPENAI_EMBEDDINGS_ENGINE_DOC",
350 |               "value": "[parameters('OpenAIEmbeddingsEngineDoc')]"
351 |             },           
352 |             {
353 |               "name": "OPENAI_EMBEDDINGS_ENGINE_QUERY",
354 |               "value": "[parameters('OpenAIEmbeddingsEngineQuery')]"
355 |             },
356 |             {
357 |               "name": "OPENAI_API_BASE",
358 |               "value": "[concat('https://', parameters('OpenAIName'), '.openai.azure.com/')]"
359 |             },
360 |             {
361 |               "name": "OPENAI_API_KEY",
362 |               "value": "[parameters('OpenAIKey')]"
363 |             },
364 |             {
365 |               "name": "OPENAI_TEMPERATURE",
366 |               "value": "[parameters('OpenAITemperature')]"
367 |             },
368 |             {
369 |               "name": "OPENAI_MAX_TOKENS",
370 |               "value": "[parameters('OpenAIMaxTokens')]"
371 |             },
372 |             {
373 |               "name": "BLOB_ACCOUNT_NAME",
374 |               "value": "[parameters('StorageAccountName')]"
375 |             },
376 |             {
377 |               "name": "BLOB_ACCOUNT_KEY",
378 |               "value": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), '2019-06-01').keys[0].value]"
379 |             },
380 |             {
381 |               "name": "BLOB_CONTAINER_NAME",
382 |               "value": "[variables('BlobContainerName')]"
383 |             },
384 |             {
385 |               "name": "FORM_RECOGNIZER_ENDPOINT",
386 |               "value": "[parameters('FormRecognizerEndpoint')]"
387 |             },
388 |             {
389 |               "name": "FORM_RECOGNIZER_KEY",
390 |               "value": "[parameters('FormRecognizerKey')]"
391 |             },
392 |             {
393 |               "name": "REDIS_ADDRESS",
394 |               "value": "[concat(parameters('DNSNameLabel'),'.' , resourceGroup().location ,'.azurecontainer.io')]"
395 |             },
396 |             {
397 |               "name": "REDIS_PASSWORD",
398 |               "value": "[parameters('RedisPassword')]"
399 |             },
400 |             {
401 |               "name": "TRANSLATE_ENDPOINT",
402 |               "value": "[parameters('TranslateEndpoint')]"
403 |             },
404 |             {
405 |               "name": "TRANSLATE_KEY",
406 |               "value": "[parameters('TranslateKey')]"
407 |             },
408 |             {
409 |               "name": "TRANSLATE_REGION",
410 |               "value": "[parameters('TranslateRegion')]"
411 |             },
412 |             {
413 |               "name": "QUEUE_NAME",
414 |               "value": "[variables('QueueName')]"
415 |             }
416 |           ],
417 |           "cors": {
418 |             "allowedOrigins": [
419 |               "https://portal.azure.com"
420 |             ]
421 |           },
422 |           "use32BitWorkerProcess": false,
423 |           "linuxFxVersion": "DOCKER|fruocco/oai-batch:latest",
424 |           "appCommandLine": "",
425 |           "alwaysOn": true
426 |         },
427 |         "serverFarmId": "[resourceId('Microsoft.Web/serverfarms', parameters('HostingPlanName'))]",
428 |         "clientAffinityEnabled": false,
429 |         "virtualNetworkSubnetId": null,
430 |         "httpsOnly": true
431 |       }
432 |     },
433 |     {
434 |       "name": "[parameters('ContainerName')]",
435 |       "type": "Microsoft.ContainerInstance/containerGroups",
436 |       "apiVersion": "2021-10-01",
437 |       "location": "[resourceGroup().location]",
438 |       "properties": {
439 |         "containers": [
440 |           {
441 |             "name": "[parameters('ContainerName')]",
442 |             "properties": {
443 |               "image": "[variables('ContainerImageName')]",
444 |               "resources": {
445 |                 "requests": {
446 |                   "cpu": 1,
447 |                   "memoryInGb": 1.5
448 |                 }
449 |               },
450 |               "environmentVariables": [
451 |                 {
452 |                   "name": "REDIS_ARGS",
453 |                   "value": "[concat('--requirepass ', parameters('RedisPassword'))]"
454 |                 }
455 |               ],
456 |               "ports": [
457 |                 {
458 |                   "protocol": "TCP",
459 |                   "port": 6379
460 |                 }
461 |               ],
462 |               "volumeMounts": [
463 |                 {
464 |                   "name": "[variables('FileShareName')]",
465 |                   "mountPath": "/data"
466 |                 }
467 |               ]
468 |             }
469 |           }
470 |         ],
471 |         "osType": "Linux",
472 |         "ipAddress": {
473 |           "type": "Public",
474 |           "ports": [
475 |             {
476 |               "protocol": "TCP",
477 |               "port": 6379
478 |             }
479 |           ],
480 |           "dnsNameLabel": "[parameters('DNSNameLabel')]"
481 |         },
482 |         "initContainers": [
483 |         ],
484 |         "volumes": [
485 |           {
486 |             "name": "[variables('FileShareName')]",
487 |             "azureFile": {
488 |               "shareName": "[variables('FileShareName')]",
489 |               "storageAccountName": "[parameters('StorageAccountName')]",
490 |               "storageAccountKey": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName')), '2021-08-01').keys[0].value]"
491 |             }
492 |           }
493 |         ]
494 |       }
495 |     },
496 |     {
497 |       "type": "Microsoft.Web/sites/host/functionKeys",
498 |       "apiVersion": "2018-11-01",
499 |       "name": "[concat(parameters('FunctionName'), '/default/clientKey')]",
500 |       "dependsOn": [
501 |         "[resourceId('Microsoft.Web/sites', parameters('FunctionName'))]",
502 |         "WaitFunctionDeploymentSection"
503 |       ],
504 |       "properties": {
505 |         "name": "ClientKey",
506 |         "value": "[variables('ClientKey')]"
507 |       }
508 |     },
509 |     {
510 |       "type": "Microsoft.Web/sites/config",
511 |       "apiVersion": "2021-03-01",
512 |       "name": "[format('{0}/{1}', parameters('WebsiteName'), 'appsettings')]",
513 |       "kind": "string",
514 |       "dependsOn": [
515 |         "[resourceId('Microsoft.Web/sites', parameters('WebsiteName'))]",
516 |         "[concat('Microsoft.Insights/components/', parameters('ApplicationInsightsName'))]"
517 |       ],
518 |       "properties": {
519 |         "APPINSIGHTS_INSTRUMENTATIONKEY": "[reference(resourceId('Microsoft.Insights/components', parameters('ApplicationInsightsName')), '2015-05-01').InstrumentationKey]",
520 |         "OPENAI_ENGINE": "[parameters('OpenAIEngine')]",
521 |         "OPENAI_DEPLOYMENT_TYPE": "[parameters('OpenAIDeploymentType')]",
522 |         "OPENAI_EMBEDDINGS_ENGINE_DOC": "[parameters('OpenAIEmbeddingsEngineDoc')]",
523 |         "OPENAI_EMBEDDINGS_ENGINE_QUERY": "[parameters('OpenAIEmbeddingsEngineQuery')]",
524 |         "REDIS_ADDRESS": "[concat(parameters('DNSNameLabel'),'.' , resourceGroup().location ,'.azurecontainer.io')]",
525 |         "REDIS_PASSWORD": "[parameters('RedisPassword')]",
526 |         "OPENAI_API_BASE": "[concat('https://', parameters('OpenAIName'),'.openai.azure.com/')]",
527 |         "OPENAI_API_KEY": "[parameters('OpenAIKey')]",
528 |         "OPENAI_TEMPERATURE": "[parameters('OpenAITemperature')]",
529 |         "OPENAI_MAX_TOKENS": "[parameters('OpenAIMaxTokens')]",
530 |         "BLOB_ACCOUNT_NAME": "[parameters('StorageAccountName')]",
531 |         "BLOB_ACCOUNT_KEY": "[listkeys(resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName')), '2015-05-01-preview').key1]",
532 |         "BLOB_CONTAINER_NAME": "[variables('BlobContainerName')]",
533 |         "FORM_RECOGNIZER_ENDPOINT": "[parameters('FormRecognizerEndpoint')]",
534 |         "FORM_RECOGNIZER_KEY": "[parameters('FormRecognizerKey')]",
535 |         "TRANSLATE_ENDPOINT": "[parameters('TranslateEndpoint')]",
536 |         "TRANSLATE_KEY": "[parameters('TranslateKey')]",
537 |         "TRANSLATE_REGION": "[parameters('TranslateRegion')]",
538 |         "CONVERT_ADD_EMBEDDINGS_URL": "[concat('https://', parameters('FunctionName') , '.azurewebsites.net/api/BatchStartProcessing?code=', variables('ClientKey'))]"
539 |       }
540 |     },
541 |     {
542 |       "type": "Microsoft.Resources/deploymentScripts",
543 |       "apiVersion": "2020-10-01",
544 |       "kind": "AzurePowerShell",
545 |       "name": "WaitFunctionDeploymentSection",
546 |       "location": "[resourceGroup().location]",
547 |       "dependsOn": [
548 |         "[resourceId('Microsoft.Web/sites', parameters('FunctionName'))]"
549 |       ],
550 |       "properties": {
551 |         "azPowerShellVersion": "3.0",
552 |         "scriptContent": "start-sleep -Seconds 300",
553 |         "cleanupPreference": "Always",
554 |         "retentionInterval": "PT1H"
555 |       }
556 |     }
557 |   ]
558 | }
559 | 


--------------------------------------------------------------------------------
/infrastructure/deploymentACRE.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
  3 |   "contentVersion": "1.0.0.0",
  4 |   "parameters": {
  5 |     "ResourcePrefix": {
  6 |       "type": "string",
  7 |       "metadata": {
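  8 |         "description": "Provide a 2-13 character prefix for all resources. Use only lowercase letters and numbers, because the prefix is also used to build the storage account name."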
  9 |       }
 10 |     },
 11 |     "RedisName": {
 12 |       "type": "string",
 13 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-redis')]",
 14 |       "metadata": {
 15 |         "description": "Name of the Azure Cache for Redis Enterprise cluster"
 16 |       }
 17 |     },
 18 | 	"RedisPort": {
 19 |       "type": "int",
 20 |       "defaultValue": 10000,
 21 |       "metadata": {
 22 |         "description": "Redis Port"
 23 |       }
 24 |     },
 25 |     "HostingPlanName": {
 26 |       "type": "string",
 27 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-plan')]",
 28 |       "metadata": {
 29 |         "description": "Name of App Service plan"
 30 |       }
 31 |     },
 32 |     "HostingPlanSku": {
 33 |       "type": "string",
 34 |       "defaultValue": "B3",
 35 |       "allowedValues": [
 36 |         "F1",
 37 |         "D1",
 38 |         "B1",
 39 |         "B2",
 40 |         "B3",
 41 |         "S1",
 42 |         "S2",
 43 |         "S3",
 44 |         "P1",
 45 |         "P2",
 46 |         "P3",
 47 |         "P4"
 48 |       ],
 49 |       "metadata": {
 50 |         "description": "The pricing tier for the App Service plan"
 51 |       }
 52 |     },
 53 |     "StorageAccountName": {
 54 |       "type": "string",
 55 |       "defaultValue": "[concat(parameters('ResourcePrefix'), 'str')]",
 56 |       "metadata": {
 57 |         "description": "Name of Storage Account"
 58 |       }
 59 |     },
 60 |     "WebsiteName": {
 61 |       "type": "string",
 62 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-site')]",
 63 |       "metadata": {
 64 |         "description": "Name of Web App"
 65 |       }
 66 |     },
 67 |     "FunctionName": {
 68 |       "type": "string",
 69 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-batchfunc')]",
 70 |       "metadata": {
 71 |         "description": "Name of Function App for Batch document processing"
 72 |       }
 73 |     },
 74 |     "ApplicationInsightsName": {
 75 |       "type": "string",
 76 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-appinsights')]",
 77 |       "metadata": {
 78 |         "description": "Name of Application Insights"
 79 |       }
 80 |     },
 81 |     "OpenAIEndpoint": {
 82 |       "type": "string",
 83 |       "metadata": {
 84 |         "description": "Endpoint of OpenAI Resource"
 85 |       }
 86 |     },
 87 |     "OpenAIKey": {
 88 |       "type": "securestring",
 89 |       "metadata": {
 90 |         "description": "OpenAI API Key"
 91 |       }
 92 |     },
 93 |     "OpenAIEngine": {
 94 |       "type": "string",
 95 |       "defaultValue": "text-davinci-003",
 96 |       "metadata": {
 97 |         "description": "OpenAI Engine"
 98 |       }
 99 |     },
100 |     "OpenAIDeploymentType": {
101 |       "type": "string",
102 |       "defaultValue": "Text",
103 |       "metadata": {
104 |         "description": "OpenAI Deployment Type. Use Text for an instruction-based deployment (text-davinci-003) or Chat for a chat-based deployment (gpt-35-turbo, gpt-4 or gpt-4-32k)."
105 |       }
106 |     },
107 |     "OpenAIEmbeddingsEngineDoc": {
108 |       "type": "string",
109 |       "defaultValue": "text-embedding-ada-002",
110 |       "metadata": {
111 |         "description": "OpenAI Embeddings Engine for Documents"
112 |       }
113 |     },
114 |     "OpenAIEmbeddingsEngineQuery": {
115 |       "type": "string",
116 |       "defaultValue": "text-embedding-ada-002",
117 |       "metadata": {
118 |         "description": "OpenAI Embeddings Engine for Queries"
119 |       }
120 |     },
121 |     "OpenAITemperature": {
122 |       "type": "string",
123 |       "defaultValue": "0.7",
124 |       "metadata": {
125 |         "description": "OpenAI Temperature"
126 |       }
127 |     },
128 |     "OpenAIMaxTokens": {
129 |       "type": "string",
130 |       "defaultValue": "-1",
131 |       "metadata": {
132 |         "description": "OpenAI Max Tokens"
133 |       }
134 |     },
135 |     "FormRecognizerEndpoint": {
136 |       "type": "string",
137 |       "metadata": {
138 |         "description": "Form Recognizer Endpoint"
139 |       }
140 |     },
141 |     "FormRecognizerKey": {
142 |       "type": "securestring",
143 |       "metadata": {
144 |         "description": "Form Recognizer Key"
145 |       }
146 |     },
147 |     "TranslateEndpoint": {
148 |       "type": "string",
149 |       "metadata": {
150 |         "description": "Translator Endpoint"
151 |       }
152 |     },
153 |     "TranslateKey": {
154 |       "type": "securestring",
155 |       "metadata": {
156 |         "description": "Translator Key"
157 |       }
158 |     },
159 |     "TranslateRegion": {
160 |       "type": "string",
161 |       "metadata": {
162 |         "description": "Translator Region"
163 |       }
164 |     },
165 |     "newGuid": {
166 |       "type": "string",
167 |       "defaultValue": "[newGuid()]"
168 |     }
169 |   },
170 |   "variables": {
171 |     "WebAppImageName": "DOCKER|fruocco/oai-embeddings",
172 |     "BlobContainerName": "documents",
173 |     "FileShareName": "redisdata",
174 |     "QueueName": "doc-processing",
175 |     "ClientKey": "[concat(uniqueString(guid(resourceGroup().id, deployment().name)), parameters('newGuid'), 'Tg2%')]"
176 |   },
177 |   "resources": [
178 |     {
179 |       "apiVersion": "2020-06-01",
180 |       "name": "[parameters('HostingPlanName')]",
181 |       "type": "Microsoft.Web/serverfarms",
182 |       "location": "[resourceGroup().location]",
183 |       "sku": {
184 |         "name": "[parameters('HostingPlanSku')]"
185 |       },
186 |       "properties": {
187 |         "name": "[parameters('HostingPlanName')]",
188 |         "reserved": true
189 |       },
190 |       "kind": "linux"
191 |     },
192 |     {
193 |       "apiVersion": "2020-06-01",
194 |       "name": "[parameters('WebsiteName')]",
195 |       "type": "Microsoft.Web/sites",
196 |       "location": "[resourceGroup().location]",
197 |       "dependsOn": [
198 |         "[resourceId('Microsoft.Web/serverfarms', parameters('HostingPlanName'))]"
199 |       ],
200 |       "properties": {
201 |         "serverFarmId": "[parameters('HostingPlanName')]",
202 |         "siteConfig": {
203 |           "linuxFxVersion": "[variables('WebAppImageName')]"
204 |         }
205 |       }
206 |     },
207 |     {
208 |             "type": "Microsoft.Cache/redisEnterprise",
209 |             "apiVersion": "2023-03-01-preview",
210 |             "name": "[parameters('RedisName')]",
211 |             "location": "[resourceGroup().location]",
212 |             "sku": {
213 |                 "name": "Enterprise_E10",
214 |                 "capacity": 2
215 |             },
216 |             "zones": [
217 |                 "1",
218 |                 "2",
219 |                 "3"
220 |             ],
221 |             "identity": {
222 |                 "type": "None"
223 |             },
224 |             "properties": {
225 |                 "minimumTlsVersion": "1.2"
226 |             }
227 |     },
228 |     {
229 |             "type": "Microsoft.Cache/redisEnterprise/databases",
230 |             "apiVersion": "2023-03-01-preview",
231 |             "name": "[concat(parameters('RedisName'), '/default')]",
232 |             "dependsOn": [
233 |                 "[resourceId('Microsoft.Cache/redisEnterprise', parameters('RedisName'))]"
234 |             ],
235 |             "properties": {
236 |                 "clientProtocol": "Plaintext",
237 |                 "port": 10000,
238 |                 "clusteringPolicy": "EnterpriseCluster",
239 |                 "evictionPolicy": "NoEviction",
240 |                 "modules": [
241 |                     {
242 |                         "name": "RedisJSON"
243 |                     },
244 |                     {
245 |                         "name": "RediSearch"
246 |                     }
247 |                 ],
248 |                 "persistence": {
249 |                     "aofEnabled": false,
250 |                     "rdbEnabled": false
251 |                 }
252 |             }
253 |     },
254 |     {
255 |       "name": "[parameters('StorageAccountName')]",
256 |       "type": "Microsoft.Storage/storageAccounts",
257 |       "apiVersion": "2021-08-01",
258 |       "location": "[resourceGroup().location]",
259 |       "kind": "StorageV2",
260 |       "sku": {
261 |         "name": "Standard_GRS"
262 |       }
263 |     },
264 |     {
265 |       "type": "Microsoft.Storage/storageAccounts/blobServices/containers",
266 |       "apiVersion": "2021-08-01",
267 |       "name": "[concat(parameters('StorageAccountName'), '/default/', variables('BlobContainerName'))]",
268 |       "dependsOn": [
269 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
270 |       ],
271 |       "properties": {
272 |         "publicAccess": "None"
273 |       }
274 |     },
275 |     {
276 |       "type": "Microsoft.Storage/storageAccounts/fileServices/shares",
277 |       "apiVersion": "2021-08-01",
278 |       "name": "[concat(parameters('StorageAccountName'), '/default/', variables('FileShareName'))]",
279 |       "dependsOn": [
280 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
281 |       ],
282 |       "properties": {
283 | 
284 |         "protocolSettings": {
285 |           "smb": {}
286 |         },
287 |         "cors": {
288 |           "corsRules": []
289 |         },
290 |         "shareDeleteRetentionPolicy": {
291 |           "enabled": true,
292 |           "days": 7
293 |         }
294 |       }
295 |     },
296 |     {
297 |       "type": "Microsoft.Storage/storageAccounts/queueServices",
298 |       "apiVersion": "2022-09-01",
299 |       "name": "[concat(parameters('StorageAccountName'), '/default')]",
300 |       "dependsOn": [
301 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
302 |       ],
303 |       "properties": {
304 |         "cors": {
305 |           "corsRules": []
306 |         }
307 |       }
308 |     },
309 |     {
310 |       "type": "Microsoft.Storage/storageAccounts/queueServices/queues",
311 |       "apiVersion": "2022-09-01",
312 |       "name": "[concat(parameters('StorageAccountName'), '/default/doc-processing')]",
313 |       "dependsOn": [
314 |         "[resourceId('Microsoft.Storage/storageAccounts/queueServices', parameters('StorageAccountName'), 'default')]",
315 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
316 |       ],
317 |       "properties": {
318 |         "metadata": {}
319 |       }
320 |     },
321 |     {
322 |       "type": "Microsoft.Storage/storageAccounts/queueServices/queues",
323 |       "apiVersion": "2022-09-01",
324 |       "name": "[concat(parameters('StorageAccountName'), '/default/doc-processing-poison')]",
325 |       "dependsOn": [
326 |         "[resourceId('Microsoft.Storage/storageAccounts/queueServices', parameters('StorageAccountName'), 'default')]",
327 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
328 |       ],
329 |       "properties": {
330 |         "metadata": {}
331 |       }
332 |     },
333 |     {
334 |       "type": "Microsoft.Insights/components",
335 |       "apiVersion": "2020-02-02",
336 |       "name": "[parameters('ApplicationInsightsName')]",
337 |       "location": "[resourceGroup().location]",
338 |       "tags": {
339 |         "[concat('hidden-link:', resourceId('Microsoft.Web/sites', parameters('ApplicationInsightsName')))]": "Resource"
340 |       },
341 |       "properties": {
342 |         "Application_Type": "web"
343 |       },
344 |       "kind": "web"
345 |     },
346 |     {
347 |       "apiVersion": "2018-11-01",
348 |       "name": "[parameters('FunctionName')]",
349 |       "type": "Microsoft.Web/sites",
350 |       "kind": "functionapp,linux",
351 |       "location": "[resourceGroup().location]",
352 |       "tags": {},
353 |       "dependsOn": [
354 |         "[concat('Microsoft.Web/serverfarms/', parameters('HostingPlanName'))]",
355 |         "[concat('Microsoft.Storage/storageAccounts/', parameters('StorageAccountName'))]",
356 |         "[concat('Microsoft.Insights/components/', parameters('ApplicationInsightsName'))]",
357 | 		"[resourceId('Microsoft.Cache/redisEnterprise/databases', parameters('RedisName'), 'default')]"
358 |       ],
359 |       "properties": {
360 |         "name": "[parameters('FunctionName')]",
361 |         "siteConfig": {
362 |           "appSettings": [
363 |             {
364 |               "name": "FUNCTIONS_EXTENSION_VERSION",
365 |               "value": "~4"
366 |             },
367 |             {
368 |               "name": "WEBSITES_ENABLE_APP_SERVICE_STORAGE",
369 |               "value": "false"
370 |             },
371 |             {
372 |               "name": "APPINSIGHTS_INSTRUMENTATIONKEY",
373 |               "value": "[reference(resourceId('Microsoft.Insights/components', parameters('ApplicationInsightsName')), '2015-05-01').InstrumentationKey]"
374 |             },
375 |             {
376 |               "name": "AzureWebJobsStorage",
377 |               "value": "[concat('DefaultEndpointsProtocol=https;AccountName=',parameters('StorageAccountName'),';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), '2019-06-01').keys[0].value,';EndpointSuffix=','core.windows.net')]"
378 |             },
379 |             {
380 |               "name": "OPENAI_ENGINE",
381 |               "value": "[parameters('OpenAIEngine')]"
382 |             },
383 |             {
384 |               "name": "OPENAI_DEPLOYMENT_TYPE",
385 |               "value": "[parameters('OpenAIDeploymentType')]"
386 |             },            
387 |             {
388 |               "name": "OPENAI_EMBEDDINGS_ENGINE_DOC",
389 |               "value": "[parameters('OpenAIEmbeddingsEngineDoc')]"
390 |             },           
391 |             {
392 |               "name": "OPENAI_EMBEDDINGS_ENGINE_QUERY",
393 |               "value": "[parameters('OpenAIEmbeddingsEngineQuery')]"
394 |             },
395 |             {
396 |               "name": "OPENAI_API_BASE",
397 |               "value": "[parameters('OpenAIEndpoint')]"
398 |             },
399 |             {
400 |               "name": "OPENAI_API_KEY",
401 |               "value": "[parameters('OpenAIKey')]"
402 |             },
403 |             {
404 |               "name": "OPENAI_TEMPERATURE",
405 |               "value": "[parameters('OpenAITemperature')]"
406 |             },
407 |             {
408 |               "name": "OPENAI_MAX_TOKENS",
409 |               "value": "[parameters('OpenAIMaxTokens')]"
410 |             },
411 |             {
412 |               "name": "BLOB_ACCOUNT_NAME",
413 |               "value": "[parameters('StorageAccountName')]"
414 |             },
415 |             {
416 |               "name": "BLOB_ACCOUNT_KEY",
417 |               "value": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), '2019-06-01').keys[0].value]"
418 |             },
419 |             {
420 |               "name": "BLOB_CONTAINER_NAME",
421 |               "value": "[variables('BlobContainerName')]"
422 |             },
423 |             {
424 |               "name": "FORM_RECOGNIZER_ENDPOINT",
425 |               "value": "[parameters('FormRecognizerEndpoint')]"
426 |             },
427 |             {
428 |               "name": "FORM_RECOGNIZER_KEY",
429 |               "value": "[parameters('FormRecognizerKey')]"
430 |             },
431 |             {
432 |               "name": "REDIS_ADDRESS",
433 |               "value": "[reference(resourceId('Microsoft.Cache/redisEnterprise', parameters('RedisName')), '2023-03-01-preview').hostName]"
434 |             },
435 |             {
436 |               "name": "REDIS_PASSWORD",
437 |               "value": "[listKeys(resourceId('Microsoft.Cache/redisEnterprise/databases', parameters('RedisName'), 'default'), '2023-03-01-preview').primaryKey]"
438 |             }, 
439 | 			{
440 |               "name": "REDIS_PORT",
441 |               "value": 10000
442 |             },
443 |             {
444 |               "name": "TRANSLATE_ENDPOINT",
445 |               "value": "[parameters('TranslateEndpoint')]"
446 |             },
447 |             {
448 |               "name": "TRANSLATE_KEY",
449 |               "value": "[parameters('TranslateKey')]"
450 |             },
451 |             {
452 |               "name": "TRANSLATE_REGION",
453 |               "value": "[parameters('TranslateRegion')]"
454 |             },
455 |             {
456 |               "name": "QUEUE_NAME",
457 |               "value": "[variables('QueueName')]"
458 |             }
459 |           ],
460 |           "cors": {
461 |             "allowedOrigins": [
462 |               "https://portal.azure.com"
463 |             ]
464 |           },
465 |           "use32BitWorkerProcess": false,
466 |           "linuxFxVersion": "DOCKER|fruocco/oai-batch:latest",
467 |           "appCommandLine": "",
468 |           "alwaysOn": true
469 |         },
470 |         "serverFarmId": "[resourceId('Microsoft.Web/serverfarms', parameters('HostingPlanName'))]",
471 |         "clientAffinityEnabled": false,
472 |         "virtualNetworkSubnetId": null,
473 |         "httpsOnly": true
474 |       }
475 |     },
476 |     {
477 |       "type": "Microsoft.Web/sites/host/functionKeys",
478 |       "apiVersion": "2018-11-01",
479 |       "name": "[concat(parameters('FunctionName'), '/default/clientKey')]",
480 |       "dependsOn": [
481 |         "[resourceId('Microsoft.Web/sites', parameters('FunctionName'))]",
482 |         "WaitFunctionDeploymentSection"
483 |       ],
484 |       "properties": {
485 |         "name": "ClientKey",
486 |         "value": "[variables('ClientKey')]"
487 |       }
488 |     },
489 |     {
490 |       "type": "Microsoft.Web/sites/config",
491 |       "apiVersion": "2021-03-01",
492 |       "name": "[format('{0}/{1}', parameters('WebsiteName'), 'appsettings')]",
493 |       "kind": "string",
494 |       "dependsOn": [
495 |         "[resourceId('Microsoft.Web/sites', parameters('WebsiteName'))]",
496 |         "[concat('Microsoft.Insights/components/', parameters('ApplicationInsightsName'))]",
497 | 		"[resourceId('Microsoft.Cache/redisEnterprise/databases', parameters('RedisName'), 'default')]"
498 |       ],
499 |       "properties": {
500 |         "APPINSIGHTS_INSTRUMENTATIONKEY": "[reference(resourceId('Microsoft.Insights/components', parameters('ApplicationInsightsName')), '2015-05-01').InstrumentationKey]",
501 |         "OPENAI_ENGINE": "[parameters('OpenAIEngine')]",
502 |         "OPENAI_DEPLOYMENT_TYPE": "[parameters('OpenAIDeploymentType')]",
503 |         "OPENAI_EMBEDDINGS_ENGINE_DOC": "[parameters('OpenAIEmbeddingsEngineDoc')]",
504 |         "OPENAI_EMBEDDINGS_ENGINE_QUERY": "[parameters('OpenAIEmbeddingsEngineQuery')]",
505 |         "REDIS_ADDRESS": "[reference(resourceId('Microsoft.Cache/redisEnterprise', parameters('RedisName')), '2023-03-01-preview').hostName]",
506 |         "REDIS_PASSWORD": "[listKeys(resourceId('Microsoft.Cache/redisEnterprise/databases', parameters('RedisName'), 'default'), '2023-03-01-preview').primaryKey]",
507 | 		    "REDIS_PORT": 10000,
508 |         "OPENAI_API_BASE": "[parameters('OpenAIEndpoint')]",
509 |         "OPENAI_API_KEY": "[parameters('OpenAIKey')]",
510 |         "OPENAI_TEMPERATURE": "[parameters('OpenAITemperature')]",
511 |         "OPENAI_MAX_TOKENS": "[parameters('OpenAIMaxTokens')]",
512 |         "BLOB_ACCOUNT_NAME": "[parameters('StorageAccountName')]",
513 |         "BLOB_ACCOUNT_KEY": "[listkeys(resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName')), '2015-05-01-preview').key1]",
514 |         "BLOB_CONTAINER_NAME": "[variables('BlobContainerName')]",
515 |         "FORM_RECOGNIZER_ENDPOINT": "[parameters('FormRecognizerEndpoint')]",
516 |         "FORM_RECOGNIZER_KEY": "[parameters('FormRecognizerKey')]",
517 |         "TRANSLATE_ENDPOINT": "[parameters('TranslateEndpoint')]",
518 |         "TRANSLATE_KEY": "[parameters('TranslateKey')]",
519 |         "TRANSLATE_REGION": "[parameters('TranslateRegion')]",
520 |         "CONVERT_ADD_EMBEDDINGS_URL": "[concat('https://', parameters('FunctionName') , '.azurewebsites.net/api/BatchStartProcessing?code=', variables('ClientKey'))]"
521 |       }
522 |     },
523 |     {
524 |       "type": "Microsoft.Resources/deploymentScripts",
525 |       "apiVersion": "2020-10-01",
526 |       "kind": "AzurePowerShell",
527 |       "name": "WaitFunctionDeploymentSection",
528 |       "location": "[resourceGroup().location]",
529 |       "dependsOn": [
530 |         "[resourceId('Microsoft.Web/sites', parameters('FunctionName'))]"
531 |       ],
532 |       "properties": {
533 |         "azPowerShellVersion": "3.0",
534 |         "scriptContent": "start-sleep -Seconds 300",
535 |         "cleanupPreference": "Always",
536 |         "retentionInterval": "PT1H"
537 |       }
538 |     }
539 |   ]
540 | }
541 | 


--------------------------------------------------------------------------------
/infrastructure/deployment_ACS.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
  3 |   "contentVersion": "1.0.0.0",
  4 |   "parameters": {
  5 |     "ResourcePrefix": {
  6 |       "type": "string",
  7 |       "metadata": {
  8 |         "description": "Provide a 2-13 character prefix for all resources."
  9 |       }
 10 |     },
 11 |     "AzureCognitiveSearch": {
 12 |       "type": "string",
 13 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-search')]",
 14 |       "metadata": {
 15 |         "description": "Azure Cognitive Search Resource"
 16 |       }
 17 |     },
 18 |     "AzureCognitiveSearchSku": {
 19 |       "type": "string",
 20 |       "defaultValue": "standard",
 21 |       "allowedValues": [
 22 |         "free",
 23 |         "basic",
 24 |         "standard",
 25 |         "standard2",
 26 |         "standard3"
 27 |       ],
 28 |       "metadata": {
 29 |         "description": "The SKU of the search service you want to create. E.g. free or standard"
 30 |       }
 31 |     },
 32 |     "HostingPlanName": {
 33 |       "type": "string",
 34 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-plan')]",
 35 |       "metadata": {
 36 |         "description": "Name of App Service plan"
 37 |       }
 38 |     },
 39 |     "HostingPlanSku": {
 40 |       "type": "string",
 41 |       "defaultValue": "B3",
 42 |       "allowedValues": [
 43 |         "F1",
 44 |         "D1",
 45 |         "B1",
 46 |         "B2",
 47 |         "B3",
 48 |         "S1",
 49 |         "S2",
 50 |         "S3",
 51 |         "P1",
 52 |         "P2",
 53 |         "P3",
 54 |         "P4"
 55 |       ],
 56 |       "metadata": {
 57 |         "description": "The pricing tier for the App Service plan"
 58 |       }
 59 |     },
 60 |     "StorageAccountName": {
 61 |       "type": "string",
 62 |       "defaultValue": "[concat(parameters('ResourcePrefix'), 'str')]",
 63 |       "metadata": {
 64 |         "description": "Name of Storage Account"
 65 |       }
 66 |     },
 67 |     "WebsiteName": {
 68 |       "type": "string",
 69 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-site')]",
 70 |       "metadata": {
 71 |         "description": "Name of Web App"
 72 |       }
 73 |     },
 74 |     "FunctionName": {
 75 |       "type": "string",
 76 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-batchfunc')]",
 77 |       "metadata": {
 78 |         "description": "Name of Function App for Batch document processing"
 79 |       }
 80 |     },
 81 |     "ApplicationInsightsName": {
 82 |       "type": "string",
 83 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-appinsights')]",
 84 |       "metadata": {
 85 |         "description": "Name of Application Insights"
 86 |       }
 87 |     },
 88 |     "FormRecognizerName": {
 89 |       "type": "string",
 90 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-formrecog')]",
 91 |       "metadata": {
 92 |         "description": "Azure Form Recognizer Name"
 93 |       }
 94 |     },
 95 |     "TranslatorName": {
 96 |       "type": "string",
 97 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-translator')]",
 98 |       "metadata": {
 99 |         "description": "Azure Translator Name"
100 |       }
101 |     },
102 |     "OpenAIName": {
103 |       "type": "string",
104 |       "metadata": {
105 |         "description": "Name of OpenAI Resource"
106 |       }
107 |     },
108 |     "OpenAIKey": {
109 |       "type": "securestring",
110 |       "defaultValue": "",
111 |       "metadata": {
112 |         "description": "OpenAI API Key"
113 |       }
114 |     },
115 |     "OpenAIEngine": {
116 |       "type": "string",
117 |       "defaultValue": "text-davinci-003",
118 |       "metadata": {
119 |         "description": "OpenAI Engine"
120 |       }
121 |     },
122 |     "OpenAIDeploymentType": {
123 |       "type": "string",
124 |       "defaultValue": "Text",
125 |       "metadata": {
126 |         "description": "OpenAI Deployment Type. Use Text for an instruction-based deployment (text-davinci-003), or Chat for a chat-based deployment (gpt-35-turbo, gpt-4-32k, or gpt-4)."
127 |       }
128 |     },
129 |     "OpenAIEmbeddingsEngineDoc": {
130 |       "type": "string",
131 |       "defaultValue": "text-embedding-ada-002",
132 |       "metadata": {
133 |         "description": "OpenAI Embeddings Engine for Documents"
134 |       }
135 |     },
136 |     "OpenAIEmbeddingsEngineQuery": {
137 |       "type": "string",
138 |       "defaultValue": "text-embedding-ada-002",
139 |       "metadata": {
140 |         "description": "OpenAI Embeddings Engine for Queries"
141 |       }
142 |     },
143 |     "newGuid": {
144 |       "type": "string",
145 |       "defaultValue": "[newGuid()]"
146 |     }
147 |   },
148 |   "variables": {
149 |     "WebAppImageName": "DOCKER|fruocco/oai-embeddings",
150 |     "BlobContainerName": "documents",
151 |     "QueueName": "doc-processing",
152 |     "ClientKey": "[concat(uniqueString(guid(resourceGroup().id, deployment().name)), parameters('newGuid'), 'Tg2%')]"
153 |   },
154 |   "resources": [
155 |     {
156 |       "apiVersion": "2015-08-19",
157 |       "name": "[parameters('AzureCognitiveSearch')]",
158 |       "type": "Microsoft.Search/searchServices",
159 |       "location": "[resourceGroup().location]",
160 |       "sku": {
161 |         "name": "[parameters('AzureCognitiveSearchSku')]"
162 |       },
163 |       "properties": {
164 |         "replicaCount": 1,
165 |         "partitionCount": 1
166 |       }
167 |     },
168 |     {
169 |       "type": "Microsoft.CognitiveServices/accounts",
170 |       "apiVersion": "2022-12-01",
171 |       "name": "[parameters('FormRecognizerName')]",
172 |       "location": "[resourceGroup().location]",
173 |       "sku": {
174 |         "name": "S0"
175 |       },
176 |       "kind": "FormRecognizer",
177 |       "identity": {
178 |         "type": "None"
179 |       },
180 |       "properties": {
181 |         "networkAcls": {
182 |           "defaultAction": "Allow",
183 |           "virtualNetworkRules": [],
184 |           "ipRules": []
185 |         },
186 |         "publicNetworkAccess": "Enabled"
187 |       }
188 |     },
189 |     {
190 |       "type": "Microsoft.CognitiveServices/accounts",
191 |       "apiVersion": "2022-12-01",
192 |       "name": "[parameters('TranslatorName')]",
193 |       "location": "[resourceGroup().location]",
194 |       "sku": {
195 |         "name": "S1"
196 |       },
197 |       "kind": "TextTranslation",
198 |       "identity": {
199 |         "type": "None"
200 |       },
201 |       "properties": {
202 |         "networkAcls": {
203 |           "defaultAction": "Allow",
204 |           "virtualNetworkRules": [],
205 |           "ipRules": []
206 |         },
207 |         "publicNetworkAccess": "Enabled"
208 |       }
209 |     },
210 |     {
211 |       "apiVersion": "2020-06-01",
212 |       "name": "[parameters('HostingPlanName')]",
213 |       "type": "Microsoft.Web/serverfarms",
214 |       "location": "[resourceGroup().location]",
215 |       "sku": {
216 |         "name": "[parameters('HostingPlanSku')]"
217 |       },
218 |       "properties": {
219 |         "name": "[parameters('HostingPlanName')]",
220 |         "reserved": true
221 |       },
222 |       "kind": "linux"
223 |     },
224 |     {
225 |       "apiVersion": "2020-06-01",
226 |       "name": "[parameters('WebsiteName')]",
227 |       "type": "Microsoft.Web/sites",
228 |       "location": "[resourceGroup().location]",
229 |       "dependsOn": [
230 |         "[resourceId('Microsoft.Web/serverfarms', parameters('HostingPlanName'))]"
231 |       ],
232 |       "properties": {
233 |         "serverFarmId": "[parameters('HostingPlanName')]",
234 |         "siteConfig": {
235 |           "linuxFxVersion": "[variables('WebAppImageName')]"
236 |         }
237 |       }
238 |     },
239 |     {
240 |       "name": "[parameters('StorageAccountName')]",
241 |       "type": "Microsoft.Storage/storageAccounts",
242 |       "apiVersion": "2021-08-01",
243 |       "location": "[resourceGroup().location]",
244 |       "kind": "StorageV2",
245 |       "sku": {
246 |         "name": "Standard_GRS"
247 |       }
248 |     },
249 |     {
250 |       "type": "Microsoft.Storage/storageAccounts/blobServices/containers",
251 |       "apiVersion": "2021-08-01",
252 |       "name": "[concat(parameters('StorageAccountName'), '/default/', variables('BlobContainerName'))]",
253 |       "dependsOn": [
254 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
255 |       ],
256 |       "properties": {
257 |         "publicAccess": "None"
258 |       }
259 |     },
260 |     {
261 |       "type": "Microsoft.Storage/storageAccounts/queueServices",
262 |       "apiVersion": "2022-09-01",
263 |       "name": "[concat(parameters('StorageAccountName'), '/default')]",
264 |       "dependsOn": [
265 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
266 |       ],
267 |       "properties": {
268 |         "cors": {
269 |           "corsRules": []
270 |         }
271 |       }
272 |     },
273 |     {
274 |       "type": "Microsoft.Storage/storageAccounts/queueServices/queues",
275 |       "apiVersion": "2022-09-01",
276 |       "name": "[concat(parameters('StorageAccountName'), '/default/doc-processing')]",
277 |       "dependsOn": [
278 |         "[resourceId('Microsoft.Storage/storageAccounts/queueServices', parameters('StorageAccountName'), 'default')]",
279 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
280 |       ],
281 |       "properties": {
282 |         "metadata": {}
283 |       }
284 |     },
285 |     {
286 |       "type": "Microsoft.Storage/storageAccounts/queueServices/queues",
287 |       "apiVersion": "2022-09-01",
288 |       "name": "[concat(parameters('StorageAccountName'), '/default/doc-processing-poison')]",
289 |       "dependsOn": [
290 |         "[resourceId('Microsoft.Storage/storageAccounts/queueServices', parameters('StorageAccountName'), 'default')]",
291 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
292 |       ],
293 |       "properties": {
294 |         "metadata": {}
295 |       }
296 |     },
297 |     {
298 |       "type": "Microsoft.Insights/components",
299 |       "apiVersion": "2020-02-02",
300 |       "name": "[parameters('ApplicationInsightsName')]",
301 |       "location": "[resourceGroup().location]",
302 |       "tags": {
303 |         "[concat('hidden-link:', resourceId('Microsoft.Web/sites', parameters('ApplicationInsightsName')))]": "Resource"
304 |       },
305 |       "properties": {
306 |         "Application_Type": "web"
307 |       },
308 |       "kind": "web"
309 |     },
310 |     {
311 |       "apiVersion": "2018-11-01",
312 |       "name": "[parameters('FunctionName')]",
313 |       "type": "Microsoft.Web/sites",
314 |       "kind": "functionapp,linux",
315 |       "location": "[resourceGroup().location]",
316 |       "tags": {},
317 |       "dependsOn": [
318 |         "[concat('Microsoft.Web/serverfarms/', parameters('HostingPlanName'))]",
319 |         "[concat('Microsoft.Storage/storageAccounts/', parameters('StorageAccountName'))]",
320 |         "[concat('Microsoft.Insights/components/', parameters('ApplicationInsightsName'))]"
321 |       ],
322 |       "properties": {
323 |         "name": "[parameters('FunctionName')]",
324 |         "siteConfig": {
325 |           "appSettings": [
326 |             {
327 |               "name": "FUNCTIONS_EXTENSION_VERSION",
328 |               "value": "~4"
329 |             },
330 |             {
331 |               "name": "WEBSITES_ENABLE_APP_SERVICE_STORAGE",
332 |               "value": "false"
333 |             },
334 |             {
335 |               "name": "APPINSIGHTS_INSTRUMENTATIONKEY",
336 |               "value": "[reference(resourceId('Microsoft.Insights/components', parameters('ApplicationInsightsName')), '2015-05-01').InstrumentationKey]"
337 |             },
338 |             {
339 |               "name": "AzureWebJobsStorage",
340 |               "value": "[concat('DefaultEndpointsProtocol=https;AccountName=',parameters('StorageAccountName'),';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), '2019-06-01').keys[0].value,';EndpointSuffix=','core.windows.net')]"
341 |             },
342 |             {
343 |               "name": "OPENAI_ENGINE",
344 |               "value": "[parameters('OpenAIEngine')]"
345 |             },
346 |             {
347 |               "name": "OPENAI_DEPLOYMENT_TYPE",
348 |               "value": "[parameters('OpenAIDeploymentType')]"
349 |             },            
350 |             {
351 |               "name": "OPENAI_EMBEDDINGS_ENGINE_DOC",
352 |               "value": "[parameters('OpenAIEmbeddingsEngineDoc')]"
353 |             },           
354 |             {
355 |               "name": "OPENAI_EMBEDDINGS_ENGINE_QUERY",
356 |               "value": "[parameters('OpenAIEmbeddingsEngineQuery')]"
357 |             },
358 |             {
359 |               "name": "OPENAI_API_BASE",
360 |               "value": "[concat('https://', parameters('OpenAIName'), '.openai.azure.com/')]"
361 |             },
362 |             {
363 |               "name": "OPENAI_API_KEY",
364 |               "value": "[parameters('OpenAIKey')]"
365 |             },
366 |             {
367 |               "name": "BLOB_ACCOUNT_NAME",
368 |               "value": "[parameters('StorageAccountName')]"
369 |             },
370 |             {
371 |               "name": "BLOB_ACCOUNT_KEY",
372 |               "value": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), '2019-06-01').keys[0].value]"
373 |             },
374 |             {
375 |               "name": "BLOB_CONTAINER_NAME",
376 |               "value": "[variables('BlobContainerName')]"
377 |             },
378 |             {
379 |               "name": "FORM_RECOGNIZER_ENDPOINT",
380 |               "value": "[concat('https://',resourceGroup().location,'.api.cognitive.microsoft.com/')]"
381 |             },
382 |             {
383 |               "name": "FORM_RECOGNIZER_KEY",
384 |               "value": "[listKeys(concat('Microsoft.CognitiveServices/accounts/', parameters('FormRecognizerName')), '2023-05-01').key1]"
385 |             },
386 |             {
387 |               "name": "VECTOR_STORE_TYPE",
388 |               "value": "AzureSearch"
389 |             },
390 |             {
391 |               "name": "AZURE_SEARCH_SERVICE_NAME",
392 |               "value": "[concat('https://',parameters('AzureCognitiveSearch'),'.search.windows.net')]"
393 |             },
394 |             {
395 |               "name": "AZURE_SEARCH_ADMIN_KEY",
396 |               "value": "[listAdminKeys(concat('Microsoft.Search/searchServices/', parameters('AzureCognitiveSearch')), '2021-04-01-preview').primaryKey]"
397 |             },
398 |             {
399 |               "name": "TRANSLATE_ENDPOINT",
400 |               "value": "https://api.cognitive.microsofttranslator.com/"
401 |             },
402 |             {
403 |               "name": "TRANSLATE_KEY",
404 |               "value": "[listKeys(concat('Microsoft.CognitiveServices/accounts/', parameters('TranslatorName')), '2023-05-01').key1]"
405 |             },
406 |             {
407 |               "name": "TRANSLATE_REGION",
408 |               "value": "[resourceGroup().location]"
409 |             },
410 |             {
411 |               "name": "QUEUE_NAME",
412 |               "value": "[variables('QueueName')]"
413 |             }
414 |           ],
415 |           "cors": {
416 |             "allowedOrigins": [ "https://portal.azure.com" ]
417 |           },
418 |           "use32BitWorkerProcess": false,
419 |           "linuxFxVersion": "DOCKER|fruocco/oai-batch:latest",
420 |           "appCommandLine": "",
421 |           "alwaysOn": true
422 |         },
423 |         "serverFarmId": "[resourceId('Microsoft.Web/serverfarms', parameters('HostingPlanName'))]",
424 |         "clientAffinityEnabled": false,
425 |         "virtualNetworkSubnetId": null,
426 |         "httpsOnly": true
427 |       }
428 |     },
429 |     {
430 |       "type": "Microsoft.Web/sites/host/functionKeys",
431 |       "apiVersion": "2018-11-01",
432 |       "name": "[concat(parameters('FunctionName'), '/default/clientKey')]",
433 |       "dependsOn": [
434 |         "[resourceId('Microsoft.Web/sites', parameters('FunctionName'))]",
435 |         "WaitFunctionDeploymentSection"
436 |       ],
437 |       "properties": {
438 |         "name": "ClientKey",
439 |         "value": "[variables('ClientKey')]"
440 |       }
441 |     },
442 |     {
443 |       "type": "Microsoft.Web/sites/config",
444 |       "apiVersion": "2021-03-01",
445 |       "name": "[format('{0}/{1}', parameters('WebsiteName'), 'appsettings')]",
446 |       "kind": "string",
447 |       "dependsOn": [
448 |         "[resourceId('Microsoft.Web/sites', parameters('WebsiteName'))]",
449 |         "[concat('Microsoft.Insights/components/', parameters('ApplicationInsightsName'))]"
450 |       ],
451 |       "properties": {
452 |         "APPINSIGHTS_INSTRUMENTATIONKEY": "[reference(resourceId('Microsoft.Insights/components', parameters('ApplicationInsightsName')), '2015-05-01').InstrumentationKey]",
453 |         "OPENAI_ENGINE": "[parameters('OpenAIEngine')]",
454 |         "OPENAI_DEPLOYMENT_TYPE": "[parameters('OpenAIDeploymentType')]",
455 |         "OPENAI_EMBEDDINGS_ENGINE_DOC": "[parameters('OpenAIEmbeddingsEngineDoc')]",
456 |         "OPENAI_EMBEDDINGS_ENGINE_QUERY": "[parameters('OpenAIEmbeddingsEngineQuery')]",
457 |         "VECTOR_STORE_TYPE": "AzureSearch",
458 |         "AZURE_SEARCH_SERVICE_NAME": "[concat('https://',parameters('AzureCognitiveSearch'),'.search.windows.net')]",
459 |         "AZURE_SEARCH_ADMIN_KEY": "[listAdminKeys(concat('Microsoft.Search/searchServices/', parameters('AzureCognitiveSearch')), '2021-04-01-preview').primaryKey]",
460 |         "OPENAI_API_BASE": "[concat('https://', parameters('OpenAIName'),'.openai.azure.com/')]",
461 |         "OPENAI_API_KEY": "[parameters('OpenAIKey')]",
462 |         "BLOB_ACCOUNT_NAME": "[parameters('StorageAccountName')]",
463 |         "BLOB_ACCOUNT_KEY": "[listkeys(resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName')), '2015-05-01-preview').key1]",
464 |         "BLOB_CONTAINER_NAME": "[variables('BlobContainerName')]",
465 |         "FORM_RECOGNIZER_ENDPOINT": "[concat('https://',resourceGroup().location,'.api.cognitive.microsoft.com/')]",
466 |         "FORM_RECOGNIZER_KEY": "[listKeys(concat('Microsoft.CognitiveServices/accounts/', parameters('FormRecognizerName')), '2023-05-01').key1]",
467 |         "TRANSLATE_ENDPOINT": "https://api.cognitive.microsofttranslator.com/",
468 |         "TRANSLATE_KEY": "[listKeys(concat('Microsoft.CognitiveServices/accounts/', parameters('TranslatorName')), '2023-05-01').key1]",
469 |         "TRANSLATE_REGION": "[resourceGroup().location]",
470 |         "CONVERT_ADD_EMBEDDINGS_URL": "[concat('https://', parameters('FunctionName') , '.azurewebsites.net/api/BatchStartProcessing?code=', variables('ClientKey'))]"
471 |       }
472 |     },
473 |     {
474 |       "type": "Microsoft.Resources/deploymentScripts",
475 |       "apiVersion": "2020-10-01",
476 |       "kind": "AzurePowerShell",
477 |       "name": "WaitFunctionDeploymentSection",
478 |       "location": "[resourceGroup().location]",
479 |       "dependsOn": [
480 |         "[resourceId('Microsoft.Web/sites', parameters('FunctionName'))]"
481 |       ],
482 |       "properties": {
483 |         "azPowerShellVersion": "3.0",
484 |         "scriptContent": "start-sleep -Seconds 300",
485 |         "cleanupPreference": "Always",
486 |         "retentionInterval": "PT1H"
487 |       }
488 |     }
489 |   ]
490 | }
491 | 


--------------------------------------------------------------------------------
/infrastructure/deployment_azcn.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
  3 |   "contentVersion": "1.0.0.0",
  4 |   "parameters": {
  5 |     "ResourcePrefix": {
  6 |       "type": "string",
  7 |       "metadata": {
  8 |         "description": "Provide a 2-13 character prefix for all resources. Be sure to use lowercase."
  9 |       }
 10 |     },
 11 |     "ContainerName": {
 12 |       "type": "string",
 13 |       "defaultValue": "redis",
 14 |       "metadata": {
 15 |         "description": "Name of the container"
 16 |       }
 17 |     },
 18 |     "DNSNameLabel": {
 19 |       "type": "string",
 20 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-redis')]",
 21 |       "metadata": {
 22 |         "description": "DNS Name Label for the Public IP address"
 23 |       }
 24 |     },
 25 |     "RedisPassword": {
 26 |       "type": "securestring",
 27 |       "defaultValue": "redis",
 28 |       "metadata": {
 29 |         "description": "Redis Password"
 30 |       }
 31 |     },
 32 |     "HostingPlanName": {
 33 |       "type": "string",
 34 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-plan')]",
 35 |       "metadata": {
 36 |         "description": "Name of App Service plan"
 37 |       }
 38 |     },
 39 |     "HostingPlanSku": {
 40 |       "type": "string",
 41 |       "defaultValue": "P3V2",
 42 |       "allowedValues": [
 43 |         "B3",
 44 |         "S2",
 45 |         "S3",
 46 |         "P2V2",
 47 |         "P3V2"
 48 |       ],
 49 |       "metadata": {
 50 |         "description": "The pricing tier for the App Service plan"
 51 |       }
 52 |     },
 53 |     "StorageAccountName": {
 54 |       "type": "string",
 55 |       "defaultValue": "[concat(parameters('ResourcePrefix'), 'str')]",
 56 |       "metadata": {
 57 |         "description": "Name of Storage Account"
 58 |       }
 59 |     },
 60 |     "WebsiteName": {
 61 |       "type": "string",
 62 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-site')]",
 63 |       "metadata": {
 64 |         "description": "Name of Web App"
 65 |       }
 66 |     },
 67 |     "FunctionName": {
 68 |       "type": "string",
 69 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-batchfunc')]",
 70 |       "metadata": {
 71 |         "description": "Name of Function App for Batch document processing"
 72 |       }
 73 |     },
 74 |     "ApplicationInsightsName": {
 75 |       "type": "string",
 76 |       "defaultValue": "[concat(parameters('ResourcePrefix'), '-appinsights')]",
 77 |       "metadata": {
 78 |         "description": "Name of Application Insights"
 79 |       }
 80 |     },
 81 |     "OpenAIName": {
 82 |       "type": "string",
 83 |       "metadata": {
 84 |         "description": "Name of OpenAI Resource"
 85 |       }
 86 |     },
 87 |     "OpenAIKey": {
 88 |       "type": "securestring",
 89 |       "metadata": {
 90 |         "description": "OpenAI API Key"
 91 |       }
 92 |     },
 93 |     "OpenAIEngine": {
 94 |       "type": "string",
 95 |       "metadata": {
 96 |         "description": "OpenAI Engine"
 97 |       }
 98 |     },
 99 |     "OpenAIDeploymentType": {
100 |       "type": "string",
101 |       "allowedValues": [
102 |         "Chat",
103 |         "Text"
104 |       ],
105 |       "metadata": {
106 |         "description": "OpenAI Deployment Type. Use Text for an instruction-based deployment (text-davinci-003), or Chat for a chat-based deployment (gpt-35-turbo, gpt-4-32k, or gpt-4)."
107 |       }
108 |     },
109 |     "OpenAIEmbeddingsEngineDoc": {
110 |       "type": "string",
111 |       "defaultValue": "text-embedding-ada-002",
112 |       "metadata": {
113 |         "description": "OpenAI Embeddings Engine for Documents"
114 |       }
115 |     },
116 |     "OpenAIEmbeddingsEngineQuery": {
117 |       "type": "string",
118 |       "defaultValue": "text-embedding-ada-002",
119 |       "metadata": {
120 |         "description": "OpenAI Embeddings Engine for Queries"
121 |       }
122 |     },
123 |     "OpenAITemperature": {
124 |       "type": "string",
125 |       "defaultValue": "0.1",
126 |       "metadata": {
127 |         "description": "OpenAI Temperature"
128 |       }
129 |     },
130 |     "OpenAIMaxTokens": {
131 |       "type": "string",
132 |       "defaultValue": "-1",
133 |       "metadata": {
134 |         "description": "OpenAI Max Tokens"
135 |       }
136 |     },
137 |     "DocumentIntelligenceEndpoint": {
138 |       "type": "string",
139 |       "metadata": {
140 |         "description": "Document Intelligence Endpoint"
141 |       }
142 |     },
143 |     "DocumentIntelligenceKey": {
144 |       "type": "securestring",
145 |       "metadata": {
146 |         "description": "Document Intelligence Key"
147 |       }
148 |     },
149 |     "TranslateEndpoint": {
150 |       "type": "string",
151 |       "defaultValue": "https://api.translator.azure.cn/",
152 |       "metadata": {
153 |         "description": "Translator Endpoint"
154 |       }
155 |     },
156 |     "TranslateKey": {
157 |       "type": "securestring",
158 |       "metadata": {
159 |         "description": "Translator Key"
160 |       }
161 |     },
162 |     "TranslateRegion": {
163 |       "type": "string",
164 |       "allowedValues": [
165 |         "chinaeast2",
166 |         "chinanorth",
167 |         "chinanorth2",
168 |         "chinanorth3"
169 |       ],
170 |       "metadata": {
171 |         "description": "Translator Region"
172 |       }
173 |     },
174 |     "newGuid": {
175 |       "type": "string",
176 |       "defaultValue": "[newGuid()]"
177 |     }
178 |   },
179 |   "variables": {
180 |     "BlobContainerName": "documents",
181 |     "FileShareName": "redisdata",
182 |     "QueueName": "doc-processing",
183 |     "ClientKey": "[concat(uniqueString(guid(resourceGroup().id, deployment().name)), parameters('newGuid'), 'Tg2%')]"
184 |   },
185 |   "resources": [
186 |     {
187 |       "apiVersion": "2020-06-01",
188 |       "name": "[parameters('HostingPlanName')]",
189 |       "type": "Microsoft.Web/serverfarms",
190 |       "location": "[resourceGroup().location]",
191 |       "sku": {
192 |         "name": "[parameters('HostingPlanSku')]"
193 |       },
194 |       "properties": {
195 |         "name": "[parameters('HostingPlanName')]",
196 |         "reserved": true
197 |       },
198 |       "kind": "linux"
199 |     },
200 |     {
201 |       "apiVersion": "2020-06-01",
202 |       "name": "[parameters('WebsiteName')]",
203 |       "type": "Microsoft.Web/sites",
204 |       "location": "[resourceGroup().location]",
205 |       "dependsOn": [
206 |         "[resourceId('Microsoft.Web/serverfarms', parameters('HostingPlanName'))]"
207 |       ],
208 |       "properties": {
209 |         "serverFarmId": "[parameters('HostingPlanName')]",
210 |         "siteConfig": {
211 |           "linuxFxVersion": "DOCKER|cyberflying/aoai-web:latest"
212 |         }
213 |       }
214 |     },
215 |     {
216 |       "name": "[parameters('StorageAccountName')]",
217 |       "type": "Microsoft.Storage/storageAccounts",
218 |       "apiVersion": "2021-08-01",
219 |       "location": "[resourceGroup().location]",
220 |       "kind": "StorageV2",
221 |       "sku": {
222 |         "name": "Standard_GRS"
223 |       }
224 |     },
225 |     {
226 |       "type": "Microsoft.Storage/storageAccounts/blobServices/containers",
227 |       "apiVersion": "2021-08-01",
228 |       "name": "[concat(parameters('StorageAccountName'), '/default/', variables('BlobContainerName'))]",
229 |       "dependsOn": [
230 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
231 |       ],
232 |       "properties": {
233 |         "publicAccess": "None"
234 |       }
235 |     },
236 |     {
237 |       "type": "Microsoft.Storage/storageAccounts/fileServices/shares",
238 |       "apiVersion": "2021-08-01",
239 |       "name": "[concat(parameters('StorageAccountName'), '/default/', variables('FileShareName'))]",
240 |       "dependsOn": [
241 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
242 |       ],
243 |       "properties": {
244 | 
245 |         "protocolSettings": {
246 |           "smb": {}
247 |         },
248 |         "cors": {
249 |           "corsRules": []
250 |         },
251 |         "shareDeleteRetentionPolicy": {
252 |           "enabled": true,
253 |           "days": 7
254 |         }
255 |       }
256 |     },
257 |     {
258 |       "type": "Microsoft.Storage/storageAccounts/queueServices",
259 |       "apiVersion": "2022-09-01",
260 |       "name": "[concat(parameters('StorageAccountName'), '/default')]",
261 |       "dependsOn": [
262 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
263 |       ],
264 |       "properties": {
265 |         "cors": {
266 |           "corsRules": []
267 |         }
268 |       }
269 |     },
270 |     {
271 |       "type": "Microsoft.Storage/storageAccounts/queueServices/queues",
272 |       "apiVersion": "2022-09-01",
273 |       "name": "[concat(parameters('StorageAccountName'), '/default/doc-processing')]",
274 |       "dependsOn": [
275 |         "[resourceId('Microsoft.Storage/storageAccounts/queueServices', parameters('StorageAccountName'), 'default')]",
276 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
277 |       ],
278 |       "properties": {
279 |         "metadata": {}
280 |       }
281 |     },
282 |     {
283 |       "type": "Microsoft.Storage/storageAccounts/queueServices/queues",
284 |       "apiVersion": "2022-09-01",
285 |       "name": "[concat(parameters('StorageAccountName'), '/default/doc-processing-poison')]",
286 |       "dependsOn": [
287 |         "[resourceId('Microsoft.Storage/storageAccounts/queueServices', parameters('StorageAccountName'), 'default')]",
288 |         "[resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName'))]"
289 |       ],
290 |       "properties": {
291 |         "metadata": {}
292 |       }
293 |     },
294 |     {
295 |       "type": "Microsoft.Insights/components",
296 |       "apiVersion": "2020-02-02",
297 |       "name": "[parameters('ApplicationInsightsName')]",
298 |       "location": "[resourceGroup().location]",
299 |       "tags": {
300 |         "[concat('hidden-link:', resourceId('Microsoft.Web/sites', parameters('ApplicationInsightsName')))]": "Resource"
301 |       },
302 |       "properties": {
303 |         "Application_Type": "web"
304 |       },
305 |       "kind": "web"
306 |     },
307 |     {
308 |       "apiVersion": "2018-11-01",
309 |       "name": "[parameters('FunctionName')]",
310 |       "type": "Microsoft.Web/sites",
311 |       "kind": "functionapp,linux",
312 |       "location": "[resourceGroup().location]",
313 |       "tags": {},
314 |       "dependsOn": [
315 |         "[concat('Microsoft.Web/serverfarms/', parameters('HostingPlanName'))]",
316 |         "[concat('Microsoft.Storage/storageAccounts/', parameters('StorageAccountName'))]",
317 |         "[concat('Microsoft.Insights/components/', parameters('ApplicationInsightsName'))]"
318 |       ],
319 |       "properties": {
320 |         "name": "[parameters('FunctionName')]",
321 |         "siteConfig": {
322 |           "appSettings": [
323 |             {
324 |               "name": "FUNCTIONS_EXTENSION_VERSION",
325 |               "value": "~4"
326 |             },
327 |             {
328 |               "name": "WEBSITES_ENABLE_APP_SERVICE_STORAGE",
329 |               "value": "false"
330 |             },
331 |             {
332 |               "name": "APPINSIGHTS_INSTRUMENTATIONKEY",
333 |               "value": "[reference(resourceId('Microsoft.Insights/components', parameters('ApplicationInsightsName')), '2015-05-01').InstrumentationKey]"
334 |             },
335 |             {
336 |               "name": "AzureWebJobsStorage",
337 |               "value": "[concat('DefaultEndpointsProtocol=https;AccountName=',parameters('StorageAccountName'),';AccountKey=',listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), '2019-06-01').keys[0].value,';EndpointSuffix=','core.chinacloudapi.cn')]"
338 |             },
339 |             {
340 |               "name": "OPENAI_ENGINE",
341 |               "value": "[parameters('OpenAIEngine')]"
342 |             },
343 |             {
344 |               "name": "OPENAI_DEPLOYMENT_TYPE",
345 |               "value": "[parameters('OpenAIDeploymentType')]"
346 |             },
347 |             {
348 |               "name": "OPENAI_EMBEDDINGS_ENGINE_DOC",
349 |               "value": "[parameters('OpenAIEmbeddingsEngineDoc')]"
350 |             },
351 |             {
352 |               "name": "OPENAI_EMBEDDINGS_ENGINE_QUERY",
353 |               "value": "[parameters('OpenAIEmbeddingsEngineQuery')]"
354 |             },
355 |             {
356 |               "name": "OPENAI_API_BASE",
357 |               "value": "[concat('https://', parameters('OpenAIName'), '.openai.azure.com/')]"
358 |             },
359 |             {
360 |               "name": "OPENAI_API_KEY",
361 |               "value": "[parameters('OpenAIKey')]"
362 |             },
363 |             {
364 |               "name": "OPENAI_TEMPERATURE",
365 |               "value": "[parameters('OpenAITemperature')]"
366 |             },
367 |             {
368 |               "name": "OPENAI_MAX_TOKENS",
369 |               "value": "[parameters('OpenAIMaxTokens')]"
370 |             },
371 |             {
372 |               "name": "BLOB_ACCOUNT_NAME",
373 |               "value": "[parameters('StorageAccountName')]"
374 |             },
375 |             {
376 |               "name": "BLOB_ACCOUNT_KEY",
377 |               "value": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('storageAccountName')), '2019-06-01').keys[0].value]"
378 |             },
379 |             {
380 |               "name": "BLOB_CONTAINER_NAME",
381 |               "value": "[variables('BlobContainerName')]"
382 |             },
383 |             {
384 |               "name": "AZURE_CLOUD",
385 |               "value": "AzureChinaCloud"
386 |             },
387 |             {
388 |               "name": "FORM_RECOGNIZER_ENDPOINT",
389 |               "value": "[parameters('DocumentIntelligenceEndpoint')]"
390 |             },
391 |             {
392 |               "name": "FORM_RECOGNIZER_KEY",
393 |               "value": "[parameters('DocumentIntelligenceKey')]"
394 |             },
395 |             {
396 |               "name": "REDIS_ADDRESS",
397 |               "value": "[concat(parameters('DNSNameLabel'),'.' , resourceGroup().location ,'.azurecontainer.console.azure.cn')]"
398 |             },
399 |             {
400 |               "name": "REDIS_PASSWORD",
401 |               "value": "[parameters('RedisPassword')]"
402 |             },
403 |             {
404 |               "name": "TRANSLATE_ENDPOINT",
405 |               "value": "[parameters('TranslateEndpoint')]"
406 |             },
407 |             {
408 |               "name": "TRANSLATE_KEY",
409 |               "value": "[parameters('TranslateKey')]"
410 |             },
411 |             {
412 |               "name": "TRANSLATE_REGION",
413 |               "value": "[parameters('TranslateRegion')]"
414 |             },
415 |             {
416 |               "name": "QUEUE_NAME",
417 |               "value": "[variables('QueueName')]"
418 |             }
419 |           ],
420 |           "cors": {
421 |             "allowedOrigins": [
422 |               "https://portal.azure.com"
423 |             ]
424 |           },
425 |           "use32BitWorkerProcess": false,
426 |           "linuxFxVersion": "DOCKER|cyberflying/aoai-batch:latest",
427 |           "appCommandLine": "",
428 |           "alwaysOn": true
429 |         },
430 |         "serverFarmId": "[resourceId('Microsoft.Web/serverfarms', parameters('HostingPlanName'))]",
431 |         "clientAffinityEnabled": false,
432 |         "virtualNetworkSubnetId": null,
433 |         "httpsOnly": true
434 |       }
435 |     },
436 |     {
437 |       "name": "[parameters('ContainerName')]",
438 |       "type": "Microsoft.ContainerInstance/containerGroups",
439 |       "apiVersion": "2021-10-01",
440 |       "location": "[resourceGroup().location]",
441 |       "properties": {
442 |         "containers": [
443 |           {
444 |             "name": "[parameters('ContainerName')]",
445 |             "properties": {
446 |               "image": "dockerhub.azk8s.cn/redis/redis-stack-server:latest",
447 |               "resources": {
448 |                 "requests": {
449 |                   "cpu": 4,
450 |                   "memoryInGb": 16
451 |                 }
452 |               },
453 |               "environmentVariables": [
454 |                 {
455 |                   "name": "REDIS_ARGS",
456 |                   "value": "[concat('--requirepass ', parameters('RedisPassword'))]"
457 |                 }
458 |               ],
459 |               "ports": [
460 |                 {
461 |                   "protocol": "TCP",
462 |                   "port": 6379
463 |                 }
464 |               ],
465 |               "volumeMounts": [
466 |                 {
467 |                   "name": "[variables('FileShareName')]",
468 |                   "mountPath": "/data"
469 |                 }
470 |               ]
471 |             }
472 |           }
473 |         ],
474 |         "osType": "Linux",
475 |         "ipAddress": {
476 |           "type": "Public",
477 |           "ports": [
478 |             {
479 |               "protocol": "TCP",
480 |               "port": 6379
481 |             }
482 |           ],
483 |           "dnsNameLabel": "[parameters('DNSNameLabel')]"
484 |         },
485 |         "initContainers": [
486 |         ],
487 |         "volumes": [
488 |           {
489 |             "name": "[variables('FileShareName')]",
490 |             "azureFile": {
491 |               "shareName": "[variables('FileShareName')]",
492 |               "storageAccountName": "[parameters('StorageAccountName')]",
493 |               "storageAccountKey": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName')), '2021-08-01').keys[0].value]"
494 |             }
495 |           }
496 |         ]
497 |       }
498 |     },
499 |     {
500 |       "type": "Microsoft.Web/sites/host/functionKeys",
501 |       "apiVersion": "2018-11-01",
502 |       "name": "[concat(parameters('FunctionName'), '/default/clientKey')]",
503 |       "dependsOn": [
504 |         "[resourceId('Microsoft.Web/sites', parameters('FunctionName'))]",
505 |         "WaitFunctionDeploymentSection"
506 |       ],
507 |       "properties": {
508 |         "name": "ClientKey",
509 |         "value": "[variables('ClientKey')]"
510 |       }
511 |     },
512 |     {
513 |       "type": "Microsoft.Web/sites/config",
514 |       "apiVersion": "2021-03-01",
515 |       "name": "[format('{0}/{1}', parameters('WebsiteName'), 'appsettings')]",
516 |       "kind": "string",
517 |       "dependsOn": [
518 |         "[resourceId('Microsoft.Web/sites', parameters('WebsiteName'))]",
519 |         "[concat('Microsoft.Insights/components/', parameters('ApplicationInsightsName'))]"
520 |       ],
521 |       "properties": {
522 |         "APPINSIGHTS_INSTRUMENTATIONKEY": "[reference(resourceId('Microsoft.Insights/components', parameters('ApplicationInsightsName')), '2015-05-01').InstrumentationKey]",
523 |         "OPENAI_ENGINE": "[parameters('OpenAIEngine')]",
524 |         "OPENAI_DEPLOYMENT_TYPE": "[parameters('OpenAIDeploymentType')]",
525 |         "OPENAI_EMBEDDINGS_ENGINE_DOC": "[parameters('OpenAIEmbeddingsEngineDoc')]",
526 |         "OPENAI_EMBEDDINGS_ENGINE_QUERY": "[parameters('OpenAIEmbeddingsEngineQuery')]",
527 |         "AZURE_CLOUD": "AzureChinaCloud",
528 |         "REDIS_ADDRESS": "[concat(parameters('DNSNameLabel'),'.' , resourceGroup().location ,'.azurecontainer.console.azure.cn')]",
529 |         "REDIS_PASSWORD": "[parameters('RedisPassword')]",
530 |         "OPENAI_API_BASE": "[concat('https://', parameters('OpenAIName'),'.openai.azure.com/')]",
531 |         "OPENAI_API_KEY": "[parameters('OpenAIKey')]",
532 |         "OPENAI_TEMPERATURE": "[parameters('OpenAITemperature')]",
533 |         "OPENAI_MAX_TOKENS": "[parameters('OpenAIMaxTokens')]",
534 |         "BLOB_ACCOUNT_NAME": "[parameters('StorageAccountName')]",
535 |         "BLOB_ACCOUNT_KEY": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', parameters('StorageAccountName')), '2019-06-01').keys[0].value]",
536 |         "BLOB_CONTAINER_NAME": "[variables('BlobContainerName')]",
537 |         "FORM_RECOGNIZER_ENDPOINT": "[parameters('DocumentIntelligenceEndpoint')]",
538 |         "FORM_RECOGNIZER_KEY": "[parameters('DocumentIntelligenceKey')]",
539 |         "TRANSLATE_ENDPOINT": "[parameters('TranslateEndpoint')]",
540 |         "TRANSLATE_KEY": "[parameters('TranslateKey')]",
541 |         "TRANSLATE_REGION": "[parameters('TranslateRegion')]",
542 |         "CONVERT_ADD_EMBEDDINGS_URL": "[concat('https://', parameters('FunctionName') , '.chinacloudsites.cn/api/BatchStartProcessing?code=', variables('ClientKey'))]"
543 |       }
544 |     },
545 |     {
546 |       "type": "Microsoft.Resources/deploymentScripts",
547 |       "apiVersion": "2020-10-01",
548 |       "kind": "AzurePowerShell",
549 |       "name": "WaitFunctionDeploymentSection",
550 |       "location": "[resourceGroup().location]",
551 |       "dependsOn": [
552 |         "[resourceId('Microsoft.Web/sites', parameters('FunctionName'))]"
553 |       ],
554 |       "properties": {
555 |         "azPowerShellVersion": "3.0",
556 |         "scriptContent": "start-sleep -Seconds 100",
557 |         "cleanupPreference": "Always",
558 |         "retentionInterval": "PT1H"
559 |       }
560 |     }
561 |   ]
562 | }
563 | 


--------------------------------------------------------------------------------