├── .deployment ├── .env.template ├── .funcignore ├── .gitignore ├── .vscode ├── extensions.json ├── launch.json ├── settings.json └── tasks.json ├── AzCogSearchDocCrackingFunc ├── __init__.py ├── function.json └── sample.dat ├── AzureOpenAIandPVAbot.pdf ├── BotQnAHTTPFunc ├── __init__.py ├── function.json └── sample.dat ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── ServiceBusQueueNewDocument ├── __init__.py ├── function.json └── sample.dat ├── WISHLIST.md ├── app.py ├── experiment.ipynb ├── host.json ├── images ├── agent-arch.jpg ├── azure.jpg ├── chatbot.jpg ├── copyfuncurl.jpg ├── copyoutputs.jpg ├── custom_skill.jpg ├── depl-outputs.jpg ├── first_req.jpg ├── firstquery.jpg ├── funcdeploy.jpg ├── km-openai v2.jpg ├── km-openai.png ├── local_settings.jpg ├── midjourney.png ├── oai_deployments.jpg ├── openaichoice.jpg ├── openaifuncapp.jpg ├── postman.jpg ├── prompt_choice.png ├── redis.jpg ├── redischoice.jpg ├── run_ingest.jpg ├── search_params.jpg ├── sec_req.jpg ├── sem_search.jpg ├── stream-client.jpg ├── subs_conv.jpg └── suffix.jpg ├── kb_docs_samples ├── Dubai Brochure.pdf ├── Las Vegas Brochure.pdf ├── London Brochure.pdf ├── Margies Travel Company Info.pdf ├── New York Brochure.pdf ├── San Francisco Brochure.pdf └── olympics_sections_text.csv ├── local.settings.json ├── redis.yml ├── requirements.txt ├── static ├── index.html ├── index_old.html ├── script.js └── styles.css ├── template.json └── utils ├── bot_helpers.py ├── cogsearch_helpers.py ├── cogvecsearch_helpers ├── cogsearch_vecstore.py └── cs_json.py ├── cosmos_helpers.py ├── cv_helpers.py ├── env_vars.py ├── fr_helpers.py ├── helpers.py ├── http_helpers.py ├── kb_doc.py ├── km_agents.py ├── langchain_helpers ├── mod_agent.py ├── mod_ccr_prompt.py ├── mod_react_prompt.py ├── mod_wiki_prompt.py ├── oai_fc_agent.py ├── oldschoolsearch.py ├── simple_prompt.py └── streaming_handler.py ├── language.py ├── openai_helpers.py ├── redis_helpers.py ├── storage.py ├── summarization.py └── web_crawler.py /.deployment: -------------------------------------------------------------------------------- 1 | [config] 2 | SCM_DO_BUILD_DURING_DEPLOYMENT=true 3 | WEBSITE_WEBDEPLOY_USE_SCM=true -------------------------------------------------------------------------------- /.env.template: -------------------------------------------------------------------------------- 1 | ########################################## 2 | ###### Please fill in the below 3 sections 3 | ########################################## 4 | 5 | ### Configuration 6 | USE_COG_VECSEARCH = 0 # set this to 1 to enable vector search in cognitive search 7 | PROCESS_IMAGES = 0 # set this to 1 to enable image processing 8 | DATABASE_MODE = 0 # set this to 1 to enable backup mode with Cosmos 9 | USE_REDIS_CACHE = 1 # set this to 1 to enable caching sessions and intermediate results with Redis 10 | 11 | 12 | #### Cognitive Search 13 | COG_SEARCH_ENDPOINT="" 14 | COG_SEARCH_ADMIN_KEY="" 15 | COG_SEARCH_CUSTOM_FUNC="" 16 | KB_INDEX_NAME = "km-openai" 17 | KB_INDEXER_NAME = "km-openai-indexer" 18 | KB_DATA_SOURCE_NAME = "km-openai-docs" 19 | KB_SKILLSET_NAME = "km-openai-skills" 20 | KB_SEM_INDEX_NAME = "km-openai-sem" 21 | COG_VEC_SEARCH_API_VERSION = "2023-07-01-Preview" 22 | COG_VECSEARCH_VECTOR_INDEX = "vec-index" 23 | 24 | 25 | #### Cognitive Services 26 | COG_SERV_ENDPOINT="" 27 | COG_SERV_KEY="" 28 | FR_CONTAINER=kmoaiforms 29 | CV_API_VERSION="2023-02-01-preview" 30 | 31 | #### Knowledge Base - Blob Storage 32 | 
KB_BLOB_CONN_STR=""
33 | KB_BLOB_CONTAINER=kmoaidemo
34 | OUTPUT_BLOB_CONTAINER=kmoaiprocessed
35 | 
36 | 
37 | #### OPENAI
38 | OPENAI_RESOURCE_ENDPOINT=""
39 | OPENAI_API_KEY=""
40 | 
41 | 
42 | ############################################
43 | ###### No need to fill in the below sections
44 | ###### unless you're planning to develop
45 | ###### features with the below systems
46 | ############################################
47 | 
48 | 
49 | #### Bing Search
50 | USE_BING = "no"
51 | BING_SUBSCRIPTION_KEY = ""
52 | BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
53 | LIST_OF_COMMA_SEPARATED_URLS = ""
54 | 
55 | 
56 | #### Service Bus
57 | SERVICEBUS_CONN_STR = ""
58 | 
59 | 
60 | #### Redis E10
61 | REDIS_ADDR=""
62 | REDIS_PORT=10000
63 | REDIS_PASSWORD=""
64 | REDIS_INDEX_NAME='acs_emb_index'
65 | VECTOR_FIELD_IN_REDIS='item_vector'
66 | NUMBER_PRODUCTS_INDEX=1000
67 | 
68 | 
69 | 
70 | #### Cognitive Services - Translator
71 | TRANSLATION_ENDPOINT="https://api.cognitive.microsofttranslator.com"
72 | TRANSLATION_API_KEY=""
73 | TRANSLATION_LOCATION=westeurope
74 | 
75 | 
76 | 
77 | #### Cosmos
78 | COSMOS_URI=""
79 | COSMOS_KEY=""
80 | CATEGORYID="KM_OAI_CATEGORY"
81 | EMBCATEGORYID="KM_OAI_EMB_CATEGORY"
82 | COSMOS_DB_NAME="KM_OAI_DB"
83 | 
84 | 
85 | # Computer Vision
86 | CV_ENDPOINT = ""
87 | CV_API_KEY = ""
88 | 
89 | 
90 | #### OPENAI
91 | MAX_QUERY_TOKENS = 500
92 | MAX_OUTPUT_TOKENS = 500
93 | MAX_HISTORY_TOKENS = 1000
94 | CONVERSATION_TTL_SECS = 7200
95 | MAX_SEARCH_TOKENS = 2000
96 | PRE_CONTEXT = 500
97 | 
98 | OVERLAP_TEXT=80
99 | 
100 | ADA_002_EMBED_NUM_DIMS = 1536
101 | ADA_002_MODEL_MAX_TOKENS = 4095
102 | ADA_002_EMBEDDING_MODEL = "text-embedding-ada-002"
103 | ADA_EMBEDDING_ENCODING = "cl100k_base"
104 | 
105 | DAVINCI_003_EMBED_NUM_DIMS = 12288
106 | DAVINCI_003_MODEL_MAX_TOKENS = 4000
107 | DAVINCI_003_EMB_MAX_TOKENS = 2047
108 | DAVINCI_003_COMPLETIONS_MODEL = "text-davinci-003"
109 | DAVINCI_003_EMBEDDING_MODEL = "text-search-davinci-doc-001"
110 | DAVINCI_003_QUERY_EMB_MODEL = "text-search-davinci-query-001"
111 | DAVINCI_EMBEDDING_ENCODING = "p50k_base"
112 | 
113 | GPT35_TURBO_COMPLETIONS_MODEL = 'gpt-35-turbo'
114 | GPT35_TURBO_COMPLETIONS_MAX_TOKENS = 8193
115 | GPT35_TURBO_COMPLETIONS_ENCODING = "cl100k_base"
116 | 
117 | GPT4_MODEL = "gpt-4"
118 | GPT4_32K_MODEL = "gpt-4-32k"
119 | 
120 | CHOSEN_EMB_MODEL = "text-embedding-ada-002"
121 | SMALL_EMB_TOKEN_NUM = 0
122 | MEDIUM_EMB_TOKEN_NUM = 800
123 | LARGE_EMB_TOKEN_NUM = 0
124 | X_LARGE_EMB_TOKEN_NUM = 0
125 | NUM_TOP_MATCHES = 2
126 | 
127 | 
128 | CHOSEN_QUERY_EMB_MODEL = "text-embedding-ada-002"
129 | CHOSEN_COMP_MODEL = "gpt-35-turbo"
130 | 
131 | RESTRICTIVE_PROMPT = "no"
132 | 
133 | 
134 | 
-------------------------------------------------------------------------------- /.funcignore: --------------------------------------------------------------------------------
1 | .git*
2 | .vscode
3 | __azurite_db*__.json
4 | __blobstorage__
5 | __queuestorage__
6 | local.settings.json
7 | test
8 | .venv
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 
| var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | 124 | # Azure Functions artifacts 125 | bin 126 | obj 127 | appsettings.json 128 | 129 | 130 | # Azurite artifacts 131 | __blobstorage__ 132 | __queuestorage__ 133 | __azurite_db*__.json 134 | .python_packages 135 | .env (mine) 136 | 137 | dump/ 138 | test.ipynb 139 | load_embeddings.ipynb 140 | local.settings (dev).json 141 | local.settings (template).json 142 | local.settings (mine).json 143 | template_test.json 144 | template_test copy.json 145 | 146 | .env copy 147 | template.json 148 | README copy.md 149 | template (backup).json 150 | template (backup1).json 151 | embs.pkl 152 | jsonl.pkl 153 | data.json 154 | full_data.pkl 155 | full_data.json 156 | new_emb_arr.pkl 157 | demo_06_transaction_data_understanding (1).ipynb 158 | data.jsonl 159 | baseline.pkl 160 | baseline.json 161 | 162 | backup/ 163 | WebSiteCrawler.ipynb 164 | agent copy.ipynb 165 | .env copy 2 166 | demo.ipynb 167 | evaluate.ipynb 168 | eval_results.csv 169 | all_predictions.pkl 170 | test_openai.ipynb 171 | eval_results_cont.csv 172 | eval.csv 173 | agent_name.csv 174 | pres.ipynb 175 | hukoomi/ 176 | hukoomi_new/ 177 | .env 178 | qna.ipynb 179 | notebooks/ 180 | demo copy.ipynb 181 | bing.ipynb 182 | app/ 183 | qna copy.ipynb 184 | summarization.ipynb 185 | summarization copy.ipynb 186 | utils/langchain_helpers/summ_react_prompt.py 187 | cogvecsearch.ipynb 188 | qna debug.ipynb 189 | cv.ipynb 190 | .env1 191 | qna_demo.ipynb 192 | qna_lufthansa.ipynb 193 | .env2 194 | qna - ai knowledge exchange.ipynb 195 | kb_docs_samples/summaries_map_reduce.csv 196 | 
kb_docs_samples/summaries_refine.csv 197 | kb_docs_samples/summaries_map_reduce.pkl 198 | kb_docs_samples/summaries_refine.pkl 199 | demo_06_transaction_data_understanding.ipynb 200 | qna - ai knowledge exchange.zip 201 | London Brochure.json 202 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-azuretools.vscode-azurefunctions", 4 | "ms-python.python" 5 | ] 6 | } -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach to Python Functions", 6 | "type": "python", 7 | "request": "attach", 8 | "port": 9091, 9 | "preLaunchTask": "func: host start" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "azureFunctions.deploySubpath": ".", 3 | "azureFunctions.scmDoBuildDuringDeployment": true, 4 | "azureFunctions.pythonVenv": ".venv", 5 | "azureFunctions.projectLanguage": "Python", 6 | "azureFunctions.projectRuntime": "~4", 7 | "debug.internalConsoleOptions": "neverOpen", 8 | "appService.zipIgnorePattern": [ 9 | "__pycache__{,/**}", 10 | "*.py[cod]", 11 | "*$py.class", 12 | ".Python{,/**}", 13 | "build{,/**}", 14 | "develop-eggs{,/**}", 15 | "dist{,/**}", 16 | "backup{,/**}", 17 | "AzCogSearchDocCrackingFunc{,/**}", 18 | "BotQnAHTTPFunc{,/**}", 19 | "ServiceBusQueueNewDocument{,/**}", 20 | "kb_docs_samples{,/**}", 21 | "notebooks{,/**}", 22 | "dump{,/**}", 23 | ".env copy{,/**}", 24 | ".env copy 2{,/**}", 25 | "downloads{,/**}", 26 | ".git{,/**}", 27 | "eggs{,/**}", 28 | ".eggs{,/**}", 29 | "lib{,/**}", 30 | "lib64{,/**}", 31 | "parts{,/**}", 32 | "sdist{,/**}", 33 | "var{,/**}", 34 | "wheels{,/**}", 35 | "share/python-wheels{,/**}", 36 | "*.egg-info{,/**}", 37 | ".installed.cfg", 38 | "*.egg", 39 | "MANIFEST", 40 | ".env{,/**}", 41 | ".venv{,/**}", 42 | "env{,/**}", 43 | "venv{,/**}", 44 | "ENV{,/**}", 45 | "env.bak{,/**}", 46 | "venv.bak{,/**}", 47 | ".vscode{,/**}" 48 | ], 49 | "appService.defaultWebAppToDeploy": "/subscriptions/2a7eed04-714e-4ba9-96ba-47355c32a8d6/resourceGroups/km-demo/providers/Microsoft.Web/sites/kmaoiwebappdemo0001", 50 | "appService.deploySubpath": "." 
51 | }
-------------------------------------------------------------------------------- /.vscode/tasks.json: --------------------------------------------------------------------------------
1 | {
2 |     "version": "2.0.0",
3 |     "tasks": [
4 |         {
5 |             "type": "func",
6 |             "label": "func: host start",
7 |             "command": "host start",
8 |             "problemMatcher": "$func-python-watch",
9 |             "isBackground": true,
10 |             "dependsOn": "pip install (functions)"
11 |         },
12 |         {
13 |             "label": "pip install (functions)",
14 |             "type": "shell",
15 |             "osx": {
16 |                 "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
17 |             },
18 |             "windows": {
19 |                 "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
20 |             },
21 |             "linux": {
22 |                 "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
23 |             },
24 |             "problemMatcher": []
25 |         }
26 |     ]
27 | }
-------------------------------------------------------------------------------- /AzCogSearchDocCrackingFunc/__init__.py: --------------------------------------------------------------------------------
1 | import logging
2 | import json
3 | import re
4 | import azure.functions as func
5 | import os
6 | from azure.cosmos import CosmosClient, PartitionKey
7 | from azure.storage.blob import ContainerClient
8 | import urllib
9 | import uuid
10 | import copy
11 | 
12 | from utils import cosmos_helpers
13 | from utils import storage
14 | from utils import cv_helpers
15 | 
16 | from utils.env_vars import *
17 | 
18 | 
19 | def remove_urls(text):
20 |     text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
21 |     return text
22 | 
23 | re_strs = [
24 |     "customXml\/[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*",
25 |     "ppt\/[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*",
26 |     "\.MsftOfcThm_[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*[\r\n\t\f\v ]\{[\r\n\t\f\v ].*[\r\n\t\f\v ]\}",
27 |     "SlidePowerPoint",
28 |     "PresentationPowerPoint",
29 |     '[a-zA-Z0-9]*\.(?:gif|emf)'
30 | ]
31 | 
32 | 
33 | 
34 | def analyze_doc(data_dict):
35 |     doc_id = data_dict.get('id', 'Unknown ID')   # defined up front so the exception handlers below can always reference it
36 |     ret_dict = {}
37 |     ret_dict['status'] = ''
38 |     db_status = ''
39 |     data_dict['text'] = remove_urls(data_dict['content'].replace("\n\n\n", "\n").replace("....", "."))
40 |     data_dict['contentType'] = 'text'
41 |     data_dict['container'] = storage.get_container_name(data_dict['doc_url'])
42 | 
43 |     try:
44 |         if isinstance(data_dict['timestamp'], list):
45 |             data_dict['timestamp'] = ' '.join(data_dict['timestamp'])
46 |     except:
47 |         data_dict['timestamp'] = "1/1/1970 00:00:00 AM"
48 | 
49 | 
50 | 
51 |     for re_str in re_strs:
52 |         matches = re.findall(re_str, data_dict['text'], re.DOTALL)
53 |         for m in matches: data_dict['text'] = data_dict['text'].replace(m, '')
54 | 
55 | 
56 |     try:
57 |         if PROCESS_IMAGES == 1:
58 | 
59 |             url = data_dict['doc_url']
60 | 
61 |             fn = storage.get_filename(url)
62 |             extension = os.path.splitext(fn)[1]
63 | 
64 |             if extension in ['.jpg', '.jpeg', '.png']:
65 |                 sas_url = storage.create_sas(url)
66 |                 cvr = cv_helpers.CV()
67 | 
68 |                 res = cvr.analyze_image(img_url=sas_url)
69 | 
70 |                 data_dict['text'] = res['text'] + data_dict['text']
71 |                 data_dict['cv_image_vector'] = cvr.get_img_embedding(sas_url)
72 |                 data_dict['cv_text_vector'] = cvr.get_text_embedding(res['text'])
73 |                 data_dict['contentType'] = 'image'
74 | 
75 |     except Exception as e:
76 |         logging.error(f"Exception: Image {doc_id} created an exception.\n{e}")
77 |         print(f"Exception: Image {doc_id} created an exception.\n{e}")
78 |         ret_dict['status'] = f"Exception: Image {doc_id} created an exception.\n{e}"
79 | 
80 | 
81 |     try:
82 |         if DATABASE_MODE == 1:
83 |             db_status = cosmos_helpers.cosmos_store_contents(data_dict)
84 |             logging.info(db_status)
85 |             print(db_status)
86 |     except Exception as e:
87 |         doc_id = data_dict.get('id', 'Unknown ID')
88 |         logging.error(f"Exception: Document {doc_id} created an exception.\n{e}")
89 |         ret_dict['status'] = ret_dict['status'] + '\n' + f"Exception: Document {doc_id} created an exception.\n{e}"
90 | 
91 |     try:
92 |         ret_dict = storage.save_json_document(data_dict, OUTPUT_BLOB_CONTAINER)
93 |         logging.info(ret_dict['status'])
94 |     except Exception as e:
95 |         doc_id = data_dict.get('id', 'Unknown ID')
96 |         logging.error(f"Exception: Document {doc_id} created an exception.\n{e}")
97 |         ret_dict['status'] = ret_dict['status'] + '\n' + f"Exception: Document {doc_id} created an exception.\n{e}"
98 | 
99 |     return ret_dict
100 | 
101 | 
102 | 
103 | 
104 | ## Perform an operation on a record
105 | def transform_value(value):
106 |     try:
107 |         recordId = value['recordId']
108 |     except KeyError as error:
109 |         logging.info(error)
110 |         return None
111 | 
112 |     # Validate the inputs
113 |     try:
114 |         assert ('data' in value), "'data' field is required."
115 |         data = value['data']
116 |         logging.info(f"Data received: {data}")
117 |         assert ('content' in data), "'content' field is required in 'data' object."
118 |         assert ('id' in data), "'id' field is required in 'data' object."
119 | 
120 |     except AssertionError as error:
121 |         logging.info(error)
122 |         return (
123 |             {
124 |             "recordId": recordId,
125 |             "errors": [ { "message": "Error:" + error.args[0] } ]
126 |             })
127 | 
128 |     try:
129 |         ret_dict = analyze_doc(value['data'])
130 | 
131 |         # Here you could do something more interesting with the inputs
132 | 
133 |     except Exception as error:
134 |         logging.info(error)
135 |         return (
136 |             {
137 |             "recordId": recordId,
138 |             "errors": [ { "message": "Could not complete operation for record." } ]
139 |             })
140 | 
141 |     return ({
142 |         "recordId": recordId,
143 |         "data": ret_dict
144 |     })
145 | 
146 | 
147 | 
148 | 
149 | 
150 | def compose_response(json_data):
151 |     values = json.loads(json_data)['values']
152 | 
153 |     # Prepare the Output before the loop
154 |     results = {}
155 |     results["values"] = []
156 | 
157 |     for value in values:
158 |         output_record = transform_value(value)
159 |         if output_record is not None:
160 |             results["values"].append(output_record)
161 | 
162 |     return json.dumps(results, ensure_ascii=False)
163 | 
164 | 
165 | 
166 | 
167 | def main(req: func.HttpRequest) -> func.HttpResponse:
168 |     logging.info('Python HTTP trigger function processed a request.')
169 | 
170 |     try:
171 |         body = json.dumps(req.get_json())
172 |     except ValueError:
173 |         return func.HttpResponse(
174 |              "Invalid body",
175 |              status_code=400
176 |         )
177 | 
178 |     if body:
179 |         result = compose_response(body)
180 |         return func.HttpResponse(result, mimetype="application/json")
181 |     else:
182 |         return func.HttpResponse(
183 |              "Invalid body",
184 |              status_code=400
185 |         )
186 | 
187 | 
188 | 
-------------------------------------------------------------------------------- /AzCogSearchDocCrackingFunc/function.json: --------------------------------------------------------------------------------
1 | {
2 |   "scriptFile": "__init__.py",
3 |   "bindings": [
4 |     {
5 |       "authLevel": "function",
6 |       "type": "httpTrigger",
7 |       "direction": "in",
8 |       "name": "req",
9 |       "methods": [
10 |         "get",
11 |         "post"
12 |       ]
13 |     },
14 |     {
15 |       "type": "http",
16 |       "direction": "out",
17 |       "name": "$return"
18 |     }
19 |   ]
20 | }
-------------------------------------------------------------------------------- /AzCogSearchDocCrackingFunc/sample.dat: --------------------------------------------------------------------------------
1 | {
2 |     "name": "Azure"
3 | }
-------------------------------------------------------------------------------- /AzureOpenAIandPVAbot.pdf: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/AzureOpenAIandPVAbot.pdf
-------------------------------------------------------------------------------- /BotQnAHTTPFunc/__init__.py: --------------------------------------------------------------------------------
1 | import logging
2 | import azure.functions as func
3 | import os
4 | 
5 | from utils import bot_helpers
6 | 
7 | from utils.env_vars import *
8 | 
9 | 
10 | def get_param(req, param_name):
11 |     param = req.params.get(param_name)
12 | 
13 |     if not param:
14 |         try:
15 |             req_body = req.get_json()
16 |         except ValueError:
17 |             pass
18 |         else:
19 |             param = req_body.get(param_name)
20 | 
21 |     return param
22 | 
23 | 
24 | 
25 | def check_param(param):
26 |     if param == 'true':
27 |         param = True
28 |     else:
29 |         param = False
30 | 
31 |     return param
32 | 
33 | 
34 | 
35 | def main(req: func.HttpRequest) -> func.HttpResponse:
36 |     logging.info('Python HTTP trigger function processed a request.')
37 | 
38 |     query = get_param(req, 'query')
39 |     session_id = get_param(req, 'session_id')
40 |     filter_param = get_param(req, 'filter')
41 |     search_method = get_param(req, 'search_method')
42 | 
43 |     enable_unified_search = get_param(req, 'enable_unified_search')
44 |     enable_redis_search = get_param(req, 'enable_redis_search')
45 |     enable_cognitive_search = get_param(req, 'enable_cognitive_search')
46 |     evaluate_step = get_param(req, 'evaluate_step')
47 |     check_adequacy = get_param(req, 'check_adequacy')
48 |     check_intent = get_param(req, 'check_intent')
49 |     use_calendar = get_param(req, 'use_calendar')
50 |     use_calculator = get_param(req, 'use_calculator')
51 |     use_bing = get_param(req, 'use_bing')
52 | 
53 | 
54 |     params_dict = {
55 |         'enable_unified_search': check_param(enable_unified_search),
56 |         'enable_redis_search': check_param(enable_redis_search),
57 |         'enable_cognitive_search': check_param(enable_cognitive_search),
58 |         'evaluate_step': check_param(evaluate_step),
59 |         'check_adequacy': check_param(check_adequacy),
60 |         'check_intent': check_param(check_intent),
61 |         'use_calendar': check_param(use_calendar),
62 |         'use_calculator': check_param(use_calculator),
63 |         'use_bing': check_param(use_bing)
64 |     }
65 | 
66 |     if filter_param is None: filter_param = '*'
67 | 
68 |     if query:
69 |         answer_str = bot_helpers.openai_interrogate_text(query, session_id=session_id, filter_param=filter_param, agent_name=search_method, params_dict=params_dict)
70 |         return func.HttpResponse(answer_str)
71 |     else:
72 |         return func.HttpResponse(
73 |              "This HTTP triggered function executed successfully. Pass a query in the query string or in the request body to get an answer.",
74 |              status_code=200
75 |         )
76 | 
-------------------------------------------------------------------------------- /BotQnAHTTPFunc/function.json: --------------------------------------------------------------------------------
1 | {
2 |   "scriptFile": "__init__.py",
3 |   "bindings": [
4 |     {
5 |       "authLevel": "function",
6 |       "type": "httpTrigger",
7 |       "direction": "in",
8 |       "name": "req",
9 |       "methods": [
10 |         "get",
11 |         "post"
12 |       ]
13 |     },
14 |     {
15 |       "type": "http",
16 |       "direction": "out",
17 |       "name": "$return"
18 |     }
19 |   ]
20 | }
-------------------------------------------------------------------------------- /BotQnAHTTPFunc/sample.dat: --------------------------------------------------------------------------------
1 | {
2 |     "name": "Azure"
3 | }
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: --------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 | 
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 | 
5 | Resources:
6 | 
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 
-------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | This project welcomes contributions and suggestions. Most contributions require you to
4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to,
5 | and actually do, grant us the rights to use your contribution. For details, visit
6 | https://cla.microsoft.com.
7 | 
8 | When you submit a pull request, a CLA-bot will automatically determine whether you need
9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
11 | 
12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 samelhousseini 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | For help and questions about using this project, please contact the repo owner via GitHub. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 14 | -------------------------------------------------------------------------------- /ServiceBusQueueNewDocument/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import azure.functions as func 4 | import smart_open 5 | import os, uuid 6 | from azure.storage.blob import BlobServiceClient, BlobClient 7 | 8 | from utils import helpers 9 | from utils import cosmos_helpers 10 | from utils import cogsearch_helpers 11 | from utils.kb_doc import KB_Doc 12 | from utils.cogvecsearch_helpers import cogsearch_vecstore 13 | 14 | from utils.env_vars import * 15 | 16 | 17 | def main(msg: func.ServiceBusMessage): 18 | 19 | msg_dict = json.loads(msg.get_body().decode('utf-8')) 20 | 21 | logging.info('Python ServiceBus queue trigger processed message: %s', msg_dict) 22 | logging.info("Event Type:%s", msg_dict['eventType']) 23 | 24 | transport_params = { 25 | 'client': BlobServiceClient.from_connection_string(KB_BLOB_CONN_STR), 26 | } 27 | 28 | json_filename = os.path.basename(msg_dict['subject']) 29 | 30 | with smart_open.open(f"azure://{OUTPUT_BLOB_CONTAINER}/{json_filename}", transport_params=transport_params) as fin: 31 | data = json.load(fin) 32 | 33 | full_kbd_doc = KB_Doc() 34 | full_kbd_doc.load(data) 35 | 36 | emb_documents = [] 37 | 38 | emb_documents += helpers.generate_embeddings(full_kbd_doc, CHOSEN_EMB_MODEL, SMALL_EMB_TOKEN_NUM, text_suffix = 'S') 39 | 40 | if MEDIUM_EMB_TOKEN_NUM != 0: 41 | emb_documents += helpers.generate_embeddings(full_kbd_doc, CHOSEN_EMB_MODEL, MEDIUM_EMB_TOKEN_NUM, text_suffix = 'M', previous_max_tokens=SMALL_EMB_TOKEN_NUM) 42 | 43 | if LARGE_EMB_TOKEN_NUM != 0: 44 | emb_documents += helpers.generate_embeddings(full_kbd_doc, CHOSEN_EMB_MODEL, LARGE_EMB_TOKEN_NUM, text_suffix = 'L', previous_max_tokens=MEDIUM_EMB_TOKEN_NUM) 45 | 46 | if 
X_LARGE_EMB_TOKEN_NUM != 0:
47 |         emb_documents += helpers.generate_embeddings(full_kbd_doc, CHOSEN_EMB_MODEL, X_LARGE_EMB_TOKEN_NUM, text_suffix = 'XL', previous_max_tokens=LARGE_EMB_TOKEN_NUM)
48 | 
49 |     logging.info(f"Generated {len(emb_documents)} emb chunks from doc {json_filename}")
50 | 
51 |     if (REDIS_ADDR is not None) and (REDIS_ADDR != '') and (USE_REDIS_CACHE == 1):
52 |         loaded = helpers.load_embedding_docs_in_redis(emb_documents, document_name = json_filename)
53 |         logging.info(f"Loaded into Redis {loaded} emb chunks from doc {json_filename}")
54 |         print(f"Loaded into Redis {loaded} emb chunks from doc {json_filename}")
55 | 
56 |     if USE_COG_VECSEARCH == 1:
57 |         vs = cogsearch_vecstore.CogSearchVecStore()
58 |         vs.create_index()
59 |         docs_dict = vs.upload_documents(emb_documents)
60 |     else:
61 |         cogsearch_helpers.index_semantic_sections(emb_documents)
62 | 
63 |     if DATABASE_MODE == 1:
64 |         cosmos_helpers.cosmos_backup_embeddings(emb_documents)
65 | 
66 | 
-------------------------------------------------------------------------------- /ServiceBusQueueNewDocument/function.json: --------------------------------------------------------------------------------
1 | {
2 |   "scriptFile": "__init__.py",
3 |   "bindings": [
4 |     {
5 |       "name": "msg",
6 |       "type": "serviceBusTrigger",
7 |       "direction": "in",
8 |       "queueName": "km-oai-processing-queue",
9 |       "connection": "SERVICEBUS_CONN_STR"
10 |     }
11 |   ]
12 | }
-------------------------------------------------------------------------------- /ServiceBusQueueNewDocument/sample.dat: --------------------------------------------------------------------------------
1 | Service Bus Message
-------------------------------------------------------------------------------- /WISHLIST.md: --------------------------------------------------------------------------------
1 | # Wishlist
2 | 
3 | The following features are on the wishlist for implementation:
4 | 
5 | 
6 | ### Completed
7 | 1. Code: Adding support for ChatGPT
8 | 1. Code: Passing TopK matches parameters in the REST API call to the BotQnAHTTPFunc
9 | 1. ARM: Implementing Dedicated Plan with B1 for the Function App, instead of the Premium Plan
10 | 1. Code: Adding Form Recognizer either as a new data source to the "kmoaiprocessed" container, or as a new Custom Skill
11 | 1. Code: Storing contents, embeddings, and queries in Cosmos. It's important to know which are the most asked queries.
12 | 1. ARM: Adding Application Insights to the ARM template
13 | 1. Code: Adding a custom skill that processes csv files
14 | 1. GUI for triggering Cognitive Search and Form Recognizer document ingestion - streamlit
15 | 
16 | ### TBD
17 | 
18 | 1. GUI for triggering Cognitive Search and Form Recognizer document ingestion - 
19 | 1. Backend: Python control part in Flask - Samer
20 | 1. Frontend: React control part for the UI (generate with GPT4) - Yacine
21 | 1. Chat client UI - demo - Yacine (1st priority)
22 | 1. Streaming capability with Flask-SocketIO - Andrey (1st priority)
23 | 1. (maximizing Cosmos use) - Translation Problem (checksum checking in Cosmos) - re-generate translations for all the chunks (cost) - get from Cosmos - Andrey (2nd priority)
24 | 1. Streamlit vs Flask - Andrey (3rd priority)
25 | 1. GPT4 Agent -- Samer
26 | 
27 | ### Future
28 | 1. Code: Adding support for fine-tuned models.
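
For reference, below is a minimal sketch of how a client could call the BotQnAHTTPFunc endpoint defined earlier. The host name, function key and sample query are placeholders rather than values taken from this repo; the parameter names mirror what get_param() reads, and check_param() only treats the literal string 'true' as true.

# Hypothetical client for BotQnAHTTPFunc; the host, key and query below are placeholders.
import requests

FUNC_URL = "https://<your-function-app>.azurewebsites.net/api/BotQnAHTTPFunc"

params = {
    "code": "<function-key>",            # authLevel is "function", so a function key is required
    "query": "What attractions can I visit in London?",
    "session_id": "demo-session-001",
    "search_method": "ccr",              # agent name; 'os', 'ccr' and 'zs' are the values used in static/script.js
    "enable_cognitive_search": "true",   # boolean flags must arrive as the literal string 'true'
}

response = requests.get(FUNC_URL, params=params)
response.raise_for_status()
print(response.text)
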
-------------------------------------------------------------------------------- /app.py: --------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from flask import Flask, redirect, url_for, request, jsonify
4 | from flask_socketio import SocketIO
5 | from flask_socketio import send, emit
6 | import urllib
7 | 
8 | 
9 | from utils import bot_helpers
10 | from utils import langchain_helpers
11 | from utils import km_agents
12 | from utils import redis_helpers
13 | from utils import language
14 | 
15 | 
16 | global_params_dict = {
17 |     'enable_unified_search': False,
18 |     'enable_redis_search': False,
19 |     'enable_cognitive_search': True,
20 |     'evaluate_step': False,
21 |     'check_adequacy': False,
22 |     'check_intent': False
23 | }
24 | 
25 | # redis_conn = redis_helpers.get_new_conn()
26 | 
27 | from utils.env_vars import *
28 | 
29 | app = Flask(__name__)
30 | socketio = SocketIO(app, cors_allowed_origins='*')
31 | app.config['SECRET_KEY'] = 'secret!'
32 | 
33 | redis_conn = redis_helpers.get_new_conn()
34 | 
35 | ##############################################################
36 | ##############################################################
37 | # IMPORTANT
38 | # To run this web server, use the following command:
39 | # flask --app app.py --debug run
40 | # To be able to run this, activate the venv first using the
41 | # following command on Windows:
42 | # .\.venv\Scripts\activate
43 | # Then install the required packages using the following command:
44 | # pip install -r requirements.txt
45 | ##############################################################
46 | ##############################################################
47 | 
48 | 
49 | 
50 | agents_sid = {}
51 | 
52 | 
53 | @app.route("/", defaults={"path": "index.html"})
54 | @app.route("/<path:path>")
55 | def static_file(path):
56 |     print("path", path)
57 |     return app.send_static_file(path)
58 | 
59 | 
60 | @socketio.on('connect')
61 | def on_connect():
62 |     print(f"connected {request.sid}")
63 | 
64 | 
65 | @socketio.on('config')
66 | def on_config(agent_type):
67 |     print(f"config {request.sid} - {agent_type}")
68 |     connection = {'socketio': socketio, 'connection_id':request.sid}
69 |     agent = km_agents.KMOAI_Agent(agent_name = agent_type, params_dict=global_params_dict, stream = True, connection=connection)
70 |     agents_sid[request.sid] = agent
71 | 
72 | 
73 | @socketio.on('disconnect')
74 | def on_disconnect():
75 |     try:
76 |         del agents_sid[request.sid]
77 |     except Exception as e:
78 |         print(f"Client not found: {e}")
79 | 
80 | 
81 | 
82 | 
83 | @socketio.on('message')
84 | def handle_message(q):
85 | 
86 |     print(f'received message: {q} from {request.sid}')
87 |     emit('new_message', "Query: " + q + '\n')
88 | 
89 |     lang = language.detect_content_language(q)
90 |     if lang != 'en': q = language.translate(q, lang, 'en')
91 | 
92 |     print(f'language detected: {lang}')
93 | 
94 |     answer, sources, likely_sources, s_id = agents_sid[request.sid].run(q, request.sid, redis_conn=redis_conn)
95 |     sources_str = ''
96 | 
97 |     if lang != 'en': answer = language.translate(answer, 'en', lang)
98 | 
99 |     answer = answer.replace('\n', '<br>')
100 | 
101 |     send(answer)
102 |     if len(sources) > 0:
103 |         for s in set(sources):
104 |             try:
105 |                 linkname = urllib.parse.unquote(os.path.basename(s.split('?')[0]))
106 |             except:
107 |                 linkname = 'Link'
108 |             sources_str += f'[{linkname}]'
109 |         send('Links:'+ sources_str)
110 | 
111 | 
112 | 
113 | ##### IMPORTANT
114 | ##### INCLUDE IN THE POST HEADER --> Content-Type: application/json
115 | ##### IMPORTANT
116 | @app.route('/kmoai_request', methods=['POST'])
117 | def kmoai_request():
118 |     data = request.get_json()
119 |     return process_kmoai_request(data)
120 | 
121 | 
122 | 
123 | def check_param(param):
124 |     if param == 'true':
125 |         param = True
126 |     else:
127 |         param = False
128 | 
129 |     return param
130 | 
131 | 
132 | def get_param(req, param_name):
133 |     param = req.get(param_name, None)
134 |     return param
135 | 
136 | 
137 | 
138 | ##### IMPORTANT
139 | ##### INCLUDE IN THE POST HEADER --> Content-Type: application/json
140 | ##### IMPORTANT
141 | def process_kmoai_request(req):
142 |     logging.info('Python HTTP trigger function processed a request.')
143 | 
144 |     query = get_param(req, 'query')
145 |     session_id = get_param(req, 'session_id')
146 |     filter_param = get_param(req, 'filter')
147 |     search_method = get_param(req, 'search_method')
148 | 
149 |     enable_unified_search = get_param(req, 'enable_unified_search')
150 |     enable_redis_search = get_param(req, 'enable_redis_search')
151 |     enable_cognitive_search = get_param(req, 'enable_cognitive_search')
152 |     evaluate_step = get_param(req, 'evaluate_step')
153 |     check_adequacy = get_param(req, 'check_adequacy')
154 |     check_intent = get_param(req, 'check_intent')
155 |     use_calendar = get_param(req, 'use_calendar')
156 |     use_bing = get_param(req, 'use_bing')
157 | 
158 |     params_dict = {
159 |         'enable_unified_search': check_param(enable_unified_search),
160 |         'enable_redis_search': check_param(enable_redis_search),
161 |         'enable_cognitive_search': check_param(enable_cognitive_search),
162 |         'evaluate_step': check_param(evaluate_step),
163 |         'check_adequacy': check_param(check_adequacy),
164 |         'check_intent': check_param(check_intent),
165 |         'use_calendar': check_param(use_calendar),
166 |         'use_bing': check_param(use_bing)
167 |     }
168 | 
169 |     if filter_param is None: filter_param = '*'
170 | 
171 |     return bot_helpers.openai_interrogate_text(query, session_id=session_id, filter_param=filter_param, agent_name=search_method, params_dict=params_dict)
172 | 
173 | 
174 | 
175 | if __name__ == '__main__':
176 |     print('socket io start')
177 |     # app.run() would block here and the SocketIO server would never start; socketio.run() serves the Flask app itself
178 |     socketio.run(app, allow_unsafe_werkzeug=True)
179 | 
-------------------------------------------------------------------------------- /host.json: --------------------------------------------------------------------------------
1 | {
2 |   "version": "2.0",
3 |   "functionTimeout": "-1",
4 |   "extensions": {
5 |     "serviceBus": {
6 |       "prefetchCount": 1,
7 |       "messageHandlerOptions": {
8 |         "maxConcurrentCalls": 1
9 |       }
10 |     }
11 |   },
12 |   "logging": {
13 |     "applicationInsights": {
14 |       "samplingSettings": {
15 |         "isEnabled": true,
16 |         "excludedTypes": "Request"
17 |       }
18 |     }
19 |   },
20 |   "extensionBundle": {
21 |     "id": "Microsoft.Azure.Functions.ExtensionBundle",
22 |     "version": "[3.3.*, 4.0.0)"
23 |   },
24 |   "concurrency": {
25 |     "dynamicConcurrencyEnabled": true,
26 |     "snapshotPersistenceEnabled": true
27 |   }
28 | }
-------------------------------------------------------------------------------- /images/agent-arch.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/agent-arch.jpg -------------------------------------------------------------------------------- /images/azure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/azure.jpg -------------------------------------------------------------------------------- /images/chatbot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/chatbot.jpg -------------------------------------------------------------------------------- /images/copyfuncurl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/copyfuncurl.jpg -------------------------------------------------------------------------------- /images/copyoutputs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/copyoutputs.jpg -------------------------------------------------------------------------------- /images/custom_skill.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/custom_skill.jpg -------------------------------------------------------------------------------- /images/depl-outputs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/depl-outputs.jpg -------------------------------------------------------------------------------- /images/first_req.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/first_req.jpg -------------------------------------------------------------------------------- /images/firstquery.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/firstquery.jpg -------------------------------------------------------------------------------- /images/funcdeploy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/funcdeploy.jpg -------------------------------------------------------------------------------- /images/km-openai v2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/km-openai v2.jpg 
-------------------------------------------------------------------------------- /images/km-openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/km-openai.png -------------------------------------------------------------------------------- /images/local_settings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/local_settings.jpg -------------------------------------------------------------------------------- /images/midjourney.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/midjourney.png -------------------------------------------------------------------------------- /images/oai_deployments.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/oai_deployments.jpg -------------------------------------------------------------------------------- /images/openaichoice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/openaichoice.jpg -------------------------------------------------------------------------------- /images/openaifuncapp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/openaifuncapp.jpg -------------------------------------------------------------------------------- /images/postman.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/postman.jpg -------------------------------------------------------------------------------- /images/prompt_choice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/prompt_choice.png -------------------------------------------------------------------------------- /images/redis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/redis.jpg -------------------------------------------------------------------------------- /images/redischoice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/redischoice.jpg -------------------------------------------------------------------------------- /images/run_ingest.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/run_ingest.jpg -------------------------------------------------------------------------------- /images/search_params.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/search_params.jpg -------------------------------------------------------------------------------- /images/sec_req.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/sec_req.jpg -------------------------------------------------------------------------------- /images/sem_search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/sem_search.jpg -------------------------------------------------------------------------------- /images/stream-client.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/stream-client.jpg -------------------------------------------------------------------------------- /images/subs_conv.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/subs_conv.jpg -------------------------------------------------------------------------------- /images/suffix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/suffix.jpg -------------------------------------------------------------------------------- /kb_docs_samples/Dubai Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/Dubai Brochure.pdf -------------------------------------------------------------------------------- /kb_docs_samples/Las Vegas Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/Las Vegas Brochure.pdf -------------------------------------------------------------------------------- /kb_docs_samples/London Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/London Brochure.pdf -------------------------------------------------------------------------------- /kb_docs_samples/Margies Travel Company Info.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/Margies Travel Company Info.pdf -------------------------------------------------------------------------------- /kb_docs_samples/New York Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/New York Brochure.pdf -------------------------------------------------------------------------------- /kb_docs_samples/San Francisco Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/San Francisco Brochure.pdf -------------------------------------------------------------------------------- /local.settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "IsEncrypted": false, 3 | "Values": { 4 | "AzureWebJobsStorage": "", 5 | "FUNCTIONS_WORKER_RUNTIME": "python", 6 | "WEBSITE_MAX_DYNAMIC_APPLICATION_SCALE_OUT":"1", 7 | "FUNCTIONS_EXTENSION_VERSION":"~4", 8 | 9 | "COG_SEARCH_ENDPOINT": "", 10 | "COG_SEARCH_ADMIN_KEY": "", 11 | "KB_INDEX_NAME":"km-openai", 12 | "KB_INDEXER_NAME":"km-openai-indexer", 13 | "KB_DATA_SOURCE_NAME":"km-openai-docs", 14 | "KB_SKILLSET_NAME":"km-openai-skills", 15 | 16 | "COG_SERV_ENDPOINT": "", 17 | "COG_SERV_KEY": "", 18 | 19 | "COSMOS_URI": "", 20 | "COSMOS_KEY": "", 21 | "COSMOS_DB_NAME": "KM_OAI_DB", 22 | "CATEGORYID": "KM_OAI_CATEGORY", 23 | 24 | "DATABASE_MODE": "0", 25 | 26 | 27 | "SERVICEBUS_CONN_STR":"", 28 | 29 | "TRANSLATION_ENDPOINT": "https://api.cognitive.microsofttranslator.com", 30 | "TRANSLATION_API_KEY": "", 31 | "TRANSLATION_LOCATION": "westeurope", 32 | 33 | "KB_BLOB_CONN_STR":"", 34 | "KB_BLOB_CONTAINER":"kmoaidemo", 35 | "OUTPUT_BLOB_CONTAINER":"kmoaiprocessed", 36 | 37 | "REDIS_ADDR": "", 38 | "REDIS_PORT": "10000", 39 | "REDIS_PASSWORD":"", 40 | "REDIS_INDEX_NAME":"acs_emb_index", 41 | "VECTOR_FIELD_IN_REDIS":"item_vector", 42 | "NUMBER_PRODUCTS_INDEX":"1000", 43 | 44 | "OPENAI_RESOURCE_ENDPOINT": "", 45 | "OPENAI_API_KEY": "", 46 | 47 | "ADA_002_EMBED_NUM_DIMS":"1536", 48 | "ADA_002_MODEL_MAX_TOKENS":"2047", 49 | "ADA_002_EMBEDDING_MODEL":"text-embedding-ada-002", 50 | "ADA_EMBEDDING_ENCODING":"cl100k_base", 51 | 52 | "DAVINCI_003_EMBED_NUM_DIMS":"12288", 53 | "DAVINCI_003_MODEL_MAX_TOKENS":"4097", 54 | "DAVINCI_003_EMB_MAX_TOKENS":"2047", 55 | "DAVINCI_003_COMPLETIONS_MODEL":"text-davinci-003", 56 | "DAVINCI_003_EMBEDDING_MODEL":"text-search-davinci-doc-001", 57 | "DAVINCI_003_QUERY_EMB_MODEL":"text-search-davinci-query-001", 58 | "DAVINCI_EMBEDDING_ENCODING":"p50k_base", 59 | 60 | 61 | "OVERLAP_TEXT":"80", 62 | "SMALL_EMB_TOKEN_NUM":"2047", 63 | "MEDIUM_EMB_TOKEN_NUM":"0", 64 | "LARGE_EMB_TOKEN_NUM":"0", 65 | "CHOSEN_EMB_MODEL":"text-embedding-ada-002", 66 | "CHOSEN_QUERY_EMB_MODEL":"text-embedding-ada-002", 67 | "CHOSEN_COMP_MODEL":"text-davinci-003", 68 | "NUM_TOP_MATCHES":"5", 69 | "MAX_OUTPUT_TOKENS":"500", 70 | "MAX_QUERY_TOKENS":"500" 71 | 72 | } 73 | } -------------------------------------------------------------------------------- /redis.yml: -------------------------------------------------------------------------------- 1 | apiVersion: '2019-12-01' 2 | location: westeurope 3 | name: 
redis-with-file-share-0001 4 | properties: 5 | containers: 6 | - name: redisearch 7 | properties: 8 | environmentVariables: 9 | - name: "REDIS_ARGS" 10 | value: "--save 60 1" 11 | image: redis/redis-stack-server:latest 12 | ports: 13 | - port: 6379 14 | resources: 15 | requests: 16 | cpu: 1.0 17 | memoryInGB: 1.5 18 | osType: Linux 19 | restartPolicy: Always 20 | ipAddress: 21 | type: Public 22 | ports: 23 | - port: 6379 24 | dnsNameLabel: aci-redis-cp1 25 | tags: {} 26 | type: Microsoft.ContainerInstance/containerGroups -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # DO NOT include azure-functions-worker in this file 2 | # The Python Worker is managed by Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-functions 6 | azure-core 7 | azure-cosmos 8 | openai==0.28.1 9 | numpy 10 | requests 11 | pandas 12 | azure-storage-blob 13 | azure-identity 14 | smart_open 15 | tenacity 16 | redis 17 | tiktoken 18 | azure-storage-file-share 19 | python-dotenv 20 | azure-search-documents>=11.4.0b3 21 | azure-ai-formrecognizer 22 | beautifulsoup4 23 | lxml 24 | azure-ai-textanalytics 25 | langchain==0.0.139 26 | flask 27 | flask-socketio 28 | -------------------------------------------------------------------------------- /static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | AI Chat App 7 | 8 | 9 | 10 | 11 | 12 | 13 | 32 | 33 |
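<!-- Illustrative aside (not part of the original file): the markup of this page
     did not survive extraction. A minimal skeleton consistent with the element
     ids, classes, and handlers referenced in static/script.js and
     static/styles.css would be:

     <div id="mySidebar" class="sidebar">
       <a href="javascript:void(0)" class="closebtn" onclick="closeNav()">&times;</a>
       <h3>Search method</h3>
       <li><label><input type="radio" name="search-method" value="ccr" checked> ccr</label></li>
       <li><label><input type="radio" name="search-method" value="zs"> zs</label></li>
       <li><label><input type="radio" name="search-method" value="os"> os</label></li>
       <button class="savebtn" onclick="saveSelection()">Save</button>
     </div>

     <div id="main">
       <button class="openbtn" onclick="openNav()">&#9776; Settings</button>
       <div id="chat-container"></div>
       <div id="input-container">
         <input type="text" id="input-message" placeholder="Type a message">
         <button id="send-button">Send</button>
       </div>
     </div>
-->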
34 | 35 |
36 | 37 |
38 | 39 |
40 | 41 |
42 | 43 | 44 |
45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /static/index_old.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Chat App

4 |
5 | 6 | 7 |
8 |
9 | 10 | 11 | 50 | -------------------------------------------------------------------------------- /static/script.js: -------------------------------------------------------------------------------- 1 | var socket = io(); 2 | let selectedOption = 'ccr'; 3 | 4 | 5 | function saveSelection() { 6 | const radioButtons = document.getElementsByName('search-method'); 7 | 8 | for (let i = 0; i < radioButtons.length; i++) { 9 | if (radioButtons[i].checked) { 10 | selectedOption = radioButtons[i].value; 11 | break; 12 | } 13 | } 14 | 15 | if (selectedOption == 'os') { 16 | socket.emit('config', 'os'); 17 | } 18 | else if (selectedOption == 'ccr') { 19 | socket.emit('config', 'ccr'); 20 | } 21 | else if (selectedOption == 'zs') { 22 | socket.emit('config', 'zs'); 23 | } 24 | else { 25 | socket.emit('config', 'os'); 26 | } 27 | 28 | console.log("New Config: " + selectedOption) 29 | 30 | closeNav(); 31 | } 32 | 33 | 34 | 35 | 36 | /* Set the width of the sidebar to 250px and the left margin of the page content to 250px */ 37 | function openNav() { 38 | document.getElementById("mySidebar").style.width = "450px"; 39 | document.getElementById("main").style.marginLeft = "450px"; 40 | } 41 | 42 | /* Set the width of the sidebar to 0 and the left margin of the page content to 0 */ 43 | function closeNav() { 44 | document.getElementById("mySidebar").style.width = "0"; 45 | document.getElementById("main").style.marginLeft = "0"; 46 | } 47 | 48 | 49 | document.getElementById("send-button").addEventListener("click", function () { 50 | let message = document.getElementById("input-message").value.trim(); 51 | 52 | if (message) { 53 | // Append human message 54 | let humanBubble = document.createElement("div"); 55 | humanBubble.classList.add("chat-bubble", "human"); 56 | humanBubble.innerText = message; 57 | document.getElementById("chat-container").appendChild(humanBubble); 58 | 59 | socket.emit('message', message); 60 | 61 | // Clear input field 62 | document.getElementById("input-message").value = ""; 63 | } 64 | }); 65 | 66 | socket.on('message', (message) => { 67 | console.log(message) 68 | document.getElementById("chat-container").lastChild.innerHTML = document.getElementById("chat-container").lastChild.innerHTML + '
' + message + '

'; 69 | document.getElementById("chat-container").scrollTop = document.getElementById("chat-container").scrollHeight; 70 | }); 71 | 72 | socket.on('new_message', (message) => { 73 | console.log("Created new response bubble " + message) 74 | let aiBubble = document.createElement("div"); 75 | aiBubble.classList.add("chat-bubble", "ai"); 76 | aiBubble.innerText = '' 77 | document.getElementById("chat-container").appendChild(aiBubble); 78 | document.getElementById("chat-container").scrollTop = document.getElementById("chat-container").scrollHeight; 79 | }); 80 | 81 | socket.on('token', (message) => { 82 | console.log(message) 83 | document.getElementById("chat-container").lastChild.innerHTML = document.getElementById("chat-container").lastChild.innerHTML + message; 84 | document.getElementById("chat-container").scrollTop = document.getElementById("chat-container").scrollHeight; 85 | }); 86 | 87 | socket.on('connect', function() { 88 | console.log('Im connected! ' + selectedOption); 89 | socket.emit('config', selectedOption); 90 | }); 91 | 92 | // Send message with Enter key 93 | document.getElementById("input-message").addEventListener("keydown", function (event) { 94 | if (event.key === "Enter") { 95 | event.preventDefault(); 96 | document.getElementById("send-button").click(); 97 | } 98 | }); -------------------------------------------------------------------------------- /static/styles.css: -------------------------------------------------------------------------------- 1 | /* The sidebar menu */ 2 | .sidebar { 3 | height: 100%; /* 100% Full-height */ 4 | width: 0; /* 0 width - change this with JavaScript */ 5 | position: fixed; /* Stay in place */ 6 | z-index: 1; /* Stay on top */ 7 | top: 0; 8 | left: 0; 9 | background-color: #111; /* Black*/ 10 | overflow-x: hidden; /* Disable horizontal scroll */ 11 | padding-top: 60px; /* Place content 60px from the top */ 12 | transition: 0.5s; /* 0.5 second transition effect to slide in the sidebar */ 13 | } 14 | 15 | /* The sidebar links */ 16 | 17 | .sidebar h3 { 18 | padding: 8px 8px 8px 32px; 19 | text-decoration: none; 20 | font-size: 18px; 21 | color: #818181; 22 | display: block; 23 | transition: 0.3s; 24 | } 25 | 26 | 27 | .sidebar li { 28 | padding: 8px 8px 8px 32px; 29 | text-decoration: none; 30 | font-size: 16px; 31 | color: #818181; 32 | display: block; 33 | transition: 0.3s; 34 | } 35 | 36 | /* When you mouse over the navigation links, change their color */ 37 | .sidebar a:hover { 38 | color: #f1f1f1; 39 | } 40 | 41 | /* Position and style the close button (top right corner) */ 42 | .sidebar .closebtn { 43 | position: absolute; 44 | top: 0; 45 | right: 25px; 46 | font-size: 36px; 47 | margin-left: 50px; 48 | } 49 | 50 | .sidebar .savebtn { 51 | font-size: 15px; 52 | cursor: pointer; 53 | background-color: #8b8b8b; 54 | color: white; 55 | padding: 10px 15px; 56 | border: none; 57 | margin-left: 70px; 58 | } 59 | 60 | 61 | /* The button used to open the sidebar */ 62 | .openbtn { 63 | font-size: 15px; 64 | cursor: pointer; 65 | background-color: #111; 66 | color: white; 67 | padding: 10px 15px; 68 | border: none; 69 | } 70 | 71 | .openbtn:hover { 72 | background-color: #444; 73 | } 74 | 75 | /* Style page content - use this if you want to push the page content to the right when you open the side navigation */ 76 | #main { 77 | transition: margin-left .5s; /* If you want a transition effect */ 78 | padding: 20px; 79 | } 80 | 81 | /* On smaller screens, where height is less than 450px, change the style of the sidenav (less padding and a 
smaller font size) */ 82 | @media screen and (max-height: 450px) { 83 | .sidebar {padding-top: 15px;} 84 | .sidebar a {font-size: 18px;} 85 | } 86 | 87 | 88 | body { 89 | font-family: Arial, sans-serif; 90 | background-color: #f5f5f5; 91 | margin: 0; 92 | padding: 0; 93 | display: flex; 94 | flex-direction: column; 95 | height: 100vh; 96 | } 97 | 98 | #chat-container { 99 | flex-grow: 1; 100 | overflow-y: auto; 101 | padding: 20px; 102 | background-color: #fff; 103 | border-radius: 5px; 104 | box-shadow: 0 3px 10px rgba(0, 0, 0, 0.1); 105 | } 106 | 107 | .chat-bubble { 108 | padding: 10px 20px; 109 | background-color: #f1f1f1; 110 | border-radius: 20px; 111 | margin-bottom: 10px; 112 | max-width: 80%; 113 | width: 100%; 114 | display: inline-block; 115 | clear: both; 116 | } 117 | 118 | .human { 119 | background-color: #e0f7fa; 120 | float: left; 121 | } 122 | 123 | .ai { 124 | background-color: #ffeb3b; 125 | float: right; 126 | } 127 | 128 | #input-container { 129 | display: flex; 130 | justify-content: center; 131 | margin-top: 20px; 132 | padding: 10px; 133 | background-color: #ffffff; 134 | border-top: 1px solid #ccc; 135 | } 136 | 137 | #input-message { 138 | width: 100%; 139 | padding: 10px; 140 | border-radius: 5px; 141 | border: 1px solid #ccc; 142 | outline: none; 143 | } 144 | 145 | #send-button { 146 | background-color: #4caf50; 147 | color: #fff; 148 | border: none; 149 | padding: 10px 20px; 150 | margin-left: 10px; 151 | border-radius: 5px; 152 | cursor: pointer; 153 | font-size: 16px; 154 | } -------------------------------------------------------------------------------- /utils/bot_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import tiktoken 5 | import json 6 | import logging 7 | 8 | 9 | from utils import language 10 | from utils import storage 11 | from utils import redis_helpers 12 | from utils import openai_helpers 13 | from utils import cosmos_helpers 14 | from utils import km_agents 15 | 16 | from utils.env_vars import * 17 | 18 | 19 | redis_conn = redis_helpers.get_new_conn() 20 | 21 | 22 | 23 | 24 | def openai_interrogate_text(query, session_id=None, filter_param=None, agent_name=None, params_dict={}): 25 | 26 | lang = language.detect_content_language(query) 27 | if lang != 'en': query = language.translate(query, lang, 'en') 28 | 29 | if (agent_name is None) or (agent_name not in ['zs', 'ccr', 'os']): 30 | agent_name = 'zs' 31 | 32 | agent = km_agents.KMOAI_Agent(agent_name = agent_name, params_dict=params_dict, verbose = False) 33 | 34 | 35 | final_answer, sources, likely_sources, session_id = agent.run(query, prompt_id=session_id, filter_param=filter_param, redis_conn=redis_conn) 36 | 37 | if lang != 'en': 38 | final_answer = language.translate(final_answer, 'en', lang) 39 | 40 | sources_str = ', '.join(sources) 41 | 42 | ret_dict = { 43 | "link": sources_str, 44 | "likely_links": likely_sources, 45 | "answer": final_answer, 46 | "context": '', 47 | "session_id": session_id 48 | } 49 | 50 | return json.dumps(ret_dict, indent=4) 51 | -------------------------------------------------------------------------------- /utils/cogvecsearch_helpers/cogsearch_vecstore.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import os 3 | import logging 4 | import json 5 | import copy 6 | 7 | 8 | from utils import helpers 9 | from utils import http_helpers 10 | import utils.cogvecsearch_helpers.cs_json 11 | from utils 
import openai_helpers 12 | 13 | from utils.env_vars import * 14 | from utils import kb_doc 15 | from utils import cv_helpers 16 | 17 | import re 18 | 19 | 20 | class CogSearchVecStore: 21 | 22 | def __init__(self, api_key = COG_SEARCH_ADMIN_KEY, 23 | search_service_name = COG_SEARCH_ENDPOINT, 24 | index_name = COG_VECSEARCH_VECTOR_INDEX, 25 | api_version = COG_VEC_SEARCH_API_VERSION, 26 | load_addtl_fields = True): 27 | 28 | 29 | self.http_req = http_helpers.CogSearchHttpRequest(api_key, search_service_name, index_name, api_version) 30 | self.index_name = index_name 31 | self.all_fields = ['id', 'text', 'text_en', 'categoryId'] 32 | self.search_types = ['vector', 'hybrid', 'semantic_hybrid'] 33 | 34 | self.addtl_fields = [] 35 | 36 | if load_addtl_fields: 37 | self.addtl_fields += list(kb_doc.KB_Doc().get_fields() - ['text', 'text_en', VECTOR_FIELD_IN_REDIS, 'id', 'cv_image_vector', 'cv_text_vector']) 38 | self.all_fields += self.addtl_fields 39 | 40 | 41 | 42 | def create_index(self): 43 | 44 | index_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.create_index_json) 45 | index_dict['name'] = self.index_name 46 | 47 | for f in self.addtl_fields: 48 | field_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.field_json) 49 | field_dict['name'] = f 50 | index_dict['fields'].append(field_dict) 51 | 52 | self.http_req.put(body = index_dict) 53 | 54 | 55 | def get_index(self): 56 | return self.http_req.get() 57 | 58 | 59 | def delete_index(self): 60 | return self.http_req.delete() 61 | 62 | 63 | def upload_documents(self, documents): 64 | 65 | docs_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.upload_docs_json) 66 | 67 | for doc in documents: 68 | doc_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.upload_doc_json) 69 | 70 | for k in self.all_fields: 71 | doc_dict[k] = doc.get(k, '') 72 | 73 | doc_dict['id'] = doc['id'] if doc.get('id', None) else str(uuid.uuid4()) 74 | doc_dict[VECTOR_FIELD_IN_REDIS] = doc.get(VECTOR_FIELD_IN_REDIS, [0]*1024) 75 | doc_dict['cv_image_vector'] = doc.get('cv_image_vector',[0]*1024) 76 | doc_dict['cv_text_vector'] = doc.get('cv_text_vector', [0]*1024) 77 | doc_dict["@search.action"] = "upload" 78 | docs_dict['value'].append(doc_dict) 79 | 80 | self.http_req.post(op ='index', body = docs_dict) 81 | 82 | return docs_dict 83 | 84 | 85 | def delete_documents(self, op='index', ids = []): 86 | docs_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.upload_docs_json) 87 | 88 | for i in ids: 89 | doc_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.upload_doc_json) 90 | doc_dict['id'] = i 91 | doc_dict[VECTOR_FIELD_IN_REDIS] = [0] * openai_helpers.get_model_dims(CHOSEN_EMB_MODEL) 92 | doc_dict["@search.action"] = "delete" 93 | docs_dict['value'].append(doc_dict) 94 | 95 | self.http_req.post(op ='index', body = docs_dict) 96 | 97 | 98 | 99 | def get_search_json(self, query, search_type = 'vector'): 100 | if search_type == 'vector': 101 | query_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.search_dict_vector) 102 | elif search_type == 'hybrid': 103 | query_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.search_dict_hybrid) 104 | query_dict['search'] = query 105 | elif search_type == 'semantic_hybrid': 106 | query_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.search_dict_semantic_hybrid) 107 | query_dict['search'] = query 108 | return query_dict 109 | 110 | 111 | def get_vector_fields(self, query, query_dict, vector_name = None): 112 | if (vector_name is None) or (vector_name == VECTOR_FIELD_IN_REDIS): 113 
| completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 114 | embedding_enc = openai_helpers.get_encoder(CHOSEN_EMB_MODEL) 115 | query_dict['vector']['fields'] = VECTOR_FIELD_IN_REDIS 116 | query = embedding_enc.decode(embedding_enc.encode(query)[:MAX_QUERY_TOKENS]) 117 | query_dict['vector']['value'] = openai_helpers.get_openai_embedding(query, CHOSEN_EMB_MODEL) 118 | elif vector_name == 'cv_text_vector': 119 | cvr = cv_helpers.CV() 120 | query_dict['vector']['fields'] = vector_name 121 | query_dict['vector']['value'] = cvr.get_text_embedding(query) 122 | elif vector_name == 'cv_image_vector': 123 | cvr = cv_helpers.CV() 124 | query_dict['vector']['fields'] = vector_name 125 | query_dict['vector']['value'] = cvr.get_img_embedding(query) 126 | else: 127 | raise Exception(f'Invalid Vector Name {vector_name}') 128 | 129 | return query_dict 130 | 131 | 132 | 133 | def search(self, query, search_type = 'vector', vector_name = None, select=None, filter=None, verbose=False): 134 | 135 | if search_type not in self.search_types: 136 | raise Exception(f"search_type must be one of {self.search_types}") 137 | 138 | regex = r"(https?:\/\/[^\/\s]+(?:\/[^\/\s]+)*\/[^?\/\s]+(?:\.jpg|\.jpeg|\.png)(?:\?[^\s'\"]+)?)" 139 | match = re.search(regex, query) 140 | 141 | if match: 142 | sas_url = match.group(1) 143 | cvr = cv_helpers.CV() 144 | res = cvr.analyze_image(img_url=sas_url) 145 | query = query.replace(sas_url, '') + '\n' + res['text'] 146 | 147 | query_dict = self.get_search_json(query, search_type) 148 | query_dict = self.get_vector_fields(query, query_dict, vector_name) 149 | query_dict['vector']['k'] = NUM_TOP_MATCHES 150 | query_dict['filter'] = filter 151 | query_dict['select'] = ', '.join(self.all_fields) if select is None else select 152 | 153 | results = self.http_req.post(op ='search', body = query_dict) 154 | results = results['value'] 155 | if verbose: [print(r['@search.score']) for r in results] 156 | 157 | 158 | if match: 159 | sas_url = match.group(1) 160 | query_dict = self.get_vector_fields(sas_url, query_dict, 'cv_image_vector') 161 | img_results = self.http_req.post(op ='search', body = query_dict) 162 | results = [img_results['value'], results] 163 | 164 | max_items = max([len(r) for r in results]) 165 | 166 | final_context = [] 167 | context_dict = {} 168 | 169 | for i in range(max_items): 170 | for j in range(len(results)): 171 | if i < len(results[j]): 172 | if results[j][i]['id'] not in context_dict: 173 | context_dict[results[j][i]['id']] = 1 174 | final_context.append(results[j][i]) 175 | 176 | results = final_context 177 | 178 | context = helpers.process_search_results(results) 179 | 180 | if match: 181 | return ['Analysis of the image in the question: ' + query + '\n\n'] + context 182 | else: 183 | return context 184 | 185 | 186 | 187 | def search_similar_images(self, query, select=None, filter=None, verbose=False): 188 | 189 | search_type = 'vector' 190 | vector_name = 'cv_image_vector' 191 | 192 | if search_type not in self.search_types: 193 | raise Exception(f"search_type must be one of {self.search_types}") 194 | 195 | regex = r"(https?:\/\/[^\/\s]+(?:\/[^\/\s]+)*\/[^?\/\s]+(?:\.jpg|\.jpeg|\.png)(?:\?[^\s'\"]+)?)" 196 | match = re.search(regex, query) 197 | 198 | if match: 199 | url = match.group(1) 200 | query_dict = self.get_search_json(url, search_type) 201 | query_dict = self.get_vector_fields(url, query_dict, vector_name) 202 | query_dict['vector']['k'] = NUM_TOP_MATCHES 203 | query_dict['filter'] = filter 204 | query_dict['select'] = ', 
'.join(self.all_fields) if select is None else select 205 | 206 | results = self.http_req.post(op ='search', body = query_dict) 207 | results = results['value'] 208 | if verbose: [print(r['@search.score']) for r in results] 209 | 210 | context = helpers.process_search_results(results) 211 | 212 | return context 213 | 214 | else: 215 | return ["Sorry, no similar images have been found"] -------------------------------------------------------------------------------- /utils/cogvecsearch_helpers/cs_json.py: -------------------------------------------------------------------------------- 1 | 2 | from utils.env_vars import * 3 | 4 | 5 | field_json = { 6 | "name": "", 7 | "type": "Edm.String", 8 | "searchable": True, 9 | "filterable": True, 10 | "retrievable": True, 11 | "sortable": True, 12 | "facetable": True, 13 | "key": False, 14 | "indexAnalyzer": None, 15 | "searchAnalyzer": None, 16 | "analyzer": None, 17 | "normalizer": None, 18 | "dimensions": None, 19 | "vectorSearchConfiguration": None, 20 | "synonymMaps": [] 21 | } 22 | 23 | 24 | vector_json = { 25 | "name": "vector", 26 | "type": "Collection(Edm.Single)", 27 | "searchable": True, 28 | "filterable": False, 29 | "retrievable": True, 30 | "sortable": False, 31 | "facetable": False, 32 | "key": False, 33 | "indexAnalyzer": None, 34 | "searchAnalyzer": None, 35 | "analyzer": None, 36 | "normalizer": None, 37 | "dimensions": 1536, 38 | "vectorSearchConfiguration": "vector-config", 39 | "synonymMaps": [] 40 | } 41 | 42 | 43 | create_index_json = { 44 | "@odata.context": "https://cogvecseearch.search.windows.net/$metadata#indexes/$entity", 45 | "@odata.etag": "\"0x8DB40C97F04622D\"", 46 | "name": "vec-index", 47 | "defaultScoringProfile": None, 48 | "fields": [ 49 | { 50 | "name": "id", 51 | "type": "Edm.String", 52 | "searchable": True, 53 | "filterable": True, 54 | "retrievable": True, 55 | "sortable": True, 56 | "facetable": True, 57 | "key": True, 58 | "indexAnalyzer": None, 59 | "searchAnalyzer": None, 60 | "analyzer": None, 61 | "normalizer": None, 62 | "dimensions": None, 63 | "vectorSearchConfiguration": None, 64 | "synonymMaps": [] 65 | }, 66 | { 67 | "name": "text", 68 | "type": "Edm.String", 69 | "searchable": True, 70 | "filterable": True, 71 | "retrievable": True, 72 | "sortable": True, 73 | "facetable": True, 74 | "key": False, 75 | "indexAnalyzer": None, 76 | "searchAnalyzer": None, 77 | "analyzer": None, 78 | "normalizer": None, 79 | "dimensions": None, 80 | "vectorSearchConfiguration": None, 81 | "synonymMaps": [] 82 | }, 83 | { 84 | "name": "text_en", 85 | "type": "Edm.String", 86 | "searchable": True, 87 | "filterable": True, 88 | "retrievable": True, 89 | "sortable": True, 90 | "facetable": True, 91 | "key": False, 92 | "indexAnalyzer": None, 93 | "searchAnalyzer": None, 94 | "analyzer": None, 95 | "normalizer": None, 96 | "dimensions": None, 97 | "vectorSearchConfiguration": None, 98 | "synonymMaps": [] 99 | }, 100 | { 101 | "name": "categoryId", 102 | "type": "Edm.String", 103 | "searchable": True, 104 | "filterable": True, 105 | "retrievable": True, 106 | "sortable": True, 107 | "facetable": True, 108 | "key": False, 109 | "indexAnalyzer": None, 110 | "searchAnalyzer": None, 111 | "analyzer": None, 112 | "normalizer": None, 113 | "dimensions": None, 114 | "vectorSearchConfiguration": None, 115 | "synonymMaps": [] 116 | }, 117 | { 118 | "name": VECTOR_FIELD_IN_REDIS, 119 | "type": "Collection(Edm.Single)", 120 | "searchable": True, 121 | "filterable": False, 122 | "retrievable": True, 123 | "sortable": False, 124 | 
"facetable": False, 125 | "key": False, 126 | "indexAnalyzer": None, 127 | "searchAnalyzer": None, 128 | "analyzer": None, 129 | "normalizer": None, 130 | "dimensions": 1536, 131 | "vectorSearchConfiguration": "vector-config", 132 | "synonymMaps": [] 133 | }, 134 | { 135 | "name": 'cv_image_vector', 136 | "type": "Collection(Edm.Single)", 137 | "searchable": True, 138 | "filterable": False, 139 | "retrievable": True, 140 | "sortable": False, 141 | "facetable": False, 142 | "key": False, 143 | "indexAnalyzer": None, 144 | "searchAnalyzer": None, 145 | "analyzer": None, 146 | "normalizer": None, 147 | "dimensions": 1024, 148 | "vectorSearchConfiguration": "vector-config", 149 | "synonymMaps": [] 150 | }, 151 | { 152 | "name": 'cv_text_vector', 153 | "type": "Collection(Edm.Single)", 154 | "searchable": True, 155 | "filterable": False, 156 | "retrievable": True, 157 | "sortable": False, 158 | "facetable": False, 159 | "key": False, 160 | "indexAnalyzer": None, 161 | "searchAnalyzer": None, 162 | "analyzer": None, 163 | "normalizer": None, 164 | "dimensions": 1024, 165 | "vectorSearchConfiguration": "vector-config", 166 | "synonymMaps": [] 167 | } 168 | ], 169 | "scoringProfiles": [], 170 | "corsOptions": { 171 | "allowedOrigins": [ 172 | "*" 173 | ], 174 | "maxAgeInSeconds": 60 175 | }, 176 | "suggesters": [], 177 | "analyzers": [], 178 | "normalizers": [], 179 | "tokenizers": [], 180 | "tokenFilters": [], 181 | "charFilters": [], 182 | "encryptionKey": None, 183 | "similarity": { 184 | "@odata.type": "#Microsoft.Azure.Search.BM25Similarity", 185 | "k1": None, 186 | "b": None 187 | }, 188 | "semantic": { 189 | "defaultConfiguration": None, 190 | "configurations": [ 191 | { 192 | "name": "semantic-config", 193 | "prioritizedFields": { 194 | "prioritizedContentFields": [ 195 | { 196 | "fieldName": "text_en" 197 | } 198 | ], 199 | "prioritizedKeywordsFields": [ 200 | { 201 | "fieldName": "categoryId" 202 | } 203 | ] 204 | } 205 | } 206 | ] 207 | }, 208 | "vectorSearch": { 209 | "algorithmConfigurations": [ 210 | { 211 | "name": "vector-config", 212 | "kind": "hnsw", 213 | "hnswParameters": { 214 | "m": 10, 215 | "efConstruction": 400, 216 | "metric": "cosine" 217 | } 218 | } 219 | ] 220 | } 221 | } 222 | 223 | 224 | upload_doc_json = { 225 | "id": "", 226 | "text": "", 227 | "text_en": "", 228 | "categoryId": "", 229 | VECTOR_FIELD_IN_REDIS: [], 230 | "@search.action": "upload" 231 | } 232 | 233 | upload_docs_json = { 234 | "value": [ 235 | ] 236 | } 237 | 238 | 239 | 240 | search_dict_vector = { 241 | "vector": { 242 | "value": [], 243 | "fields": VECTOR_FIELD_IN_REDIS, 244 | "k": NUM_TOP_MATCHES 245 | }, 246 | "select": "*", 247 | "filter": None 248 | } 249 | 250 | 251 | search_dict_hybrid = { 252 | "vector": { 253 | "value": [], 254 | "fields": VECTOR_FIELD_IN_REDIS, 255 | "k": 10 256 | }, 257 | "search": "", 258 | "select": "*", 259 | "top": f"{NUM_TOP_MATCHES}", 260 | "filter": None 261 | } 262 | 263 | 264 | search_dict_semantic_hybrid= { 265 | "vector": { 266 | "value": [], 267 | "fields": VECTOR_FIELD_IN_REDIS, 268 | "k": NUM_TOP_MATCHES, 269 | }, 270 | "search": "", 271 | "select":"*", 272 | "queryType": "semantic", 273 | "semanticConfiguration": "semantic-config", 274 | "queryLanguage": "en-us", 275 | "captions": "extractive", 276 | "answers": "extractive", 277 | "top": f"{NUM_TOP_MATCHES*3}", 278 | "filter": None 279 | } 280 | 281 | -------------------------------------------------------------------------------- /utils/cosmos_helpers.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import re 4 | import azure.functions as func 5 | import os 6 | from azure.cosmos import CosmosClient, PartitionKey 7 | import urllib 8 | import numpy as np 9 | import uuid 10 | import copy 11 | from datetime import datetime, timedelta 12 | 13 | from utils import redis_helpers 14 | from utils.env_vars import * 15 | 16 | 17 | try: 18 | 19 | if DATABASE_MODE == 1: 20 | client = CosmosClient(url=COSMOS_URI, credential=COSMOS_KEY) 21 | partitionKeyPath = PartitionKey(path="/categoryId") 22 | database = client.create_database_if_not_exists(id=COSMOS_DB_NAME) 23 | 24 | def init_container(): 25 | 26 | indexing_policy={ "includedPaths":[{ "path":"/*"}], "excludedPaths":[{ "path":"/\"_etag\"/?"},{ "path":f"/{VECTOR_FIELD_IN_REDIS}/?"}]} 27 | 28 | try: 29 | container = database.create_container_if_not_exists(id="documents", partition_key=partitionKeyPath,indexing_policy=indexing_policy) 30 | except: 31 | try: 32 | container = database.create_container_if_not_exists(id="documents", partition_key=partitionKeyPath,indexing_policy=indexing_policy) 33 | 34 | except Exception as e: 35 | logging.error(f"Encountered error {e} while creating the container") 36 | print(f"Encountered error {e} while creating the container") 37 | 38 | return container 39 | 40 | container = init_container() 41 | 42 | except: 43 | print("Failed to initialize Cosmos DB container") 44 | logging.error("Failed to initialize Cosmos DB container") 45 | 46 | 47 | 48 | def cosmos_restore_embeddings(): 49 | QUERY = "SELECT * FROM documents p WHERE p.categoryId = @categoryId" 50 | params = [dict(name="@categoryId", value=EMBCATEGORYID)] 51 | 52 | embeddings = container.query_items(query=QUERY, parameters=params, enable_cross_partition_query=False) 53 | 54 | redis_conn = redis_helpers.get_new_conn() 55 | counter = 0 56 | 57 | try: 58 | for e in embeddings: 59 | counter += redis_helpers.redis_upsert_embedding(redis_conn, e) 60 | 61 | except Exception as e: 62 | print("No Documents found") 63 | 64 | logging.info(f"Loaded {counter} embeddings from Cosmos into Redis") 65 | print(f"Loaded {counter} embeddings from Cosmos into Redis") 66 | 67 | 68 | 69 | def cosmos_backup_embeddings(emb_documents): 70 | 71 | ret_dict = {} 72 | 73 | try: 74 | for e in emb_documents: 75 | #e[VECTOR_FIELD_IN_REDIS] = np.array(e[VECTOR_FIELD_IN_REDIS]).astype(np.float32).tobytes() 76 | e['categoryId'] = EMBCATEGORYID 77 | container.upsert_item(e) 78 | 79 | ret_dict['status'] = f"Successfully loaded {len(emb_documents)} embedding documents into Cosmos" 80 | 81 | except Exception as e: 82 | logging.error(e) 83 | print(e) 84 | ret_dict['status'] = f"Failed loading {len(emb_documents)} embeddings into Cosmos: {e}" 85 | 86 | return ret_dict 87 | 88 | 89 | 90 | 91 | def cosmos_store_contents(data_dict): 92 | ret_dict = {} 93 | 94 | new_doc = copy.deepcopy(data_dict) 95 | 96 | new_doc['id'] = new_doc.get('id', str(uuid.uuid4())) 97 | new_doc['categoryId'] = CATEGORYID 98 | new_doc['timestamp'] = new_doc.get('timestamp', datetime.now().strftime("%m/%d/%Y, %H:%M:%S")), 99 | new_doc['doc_url'] = new_doc.get('doc_url', f'https://microsoft.com/{str(uuid.uuid4())}') 100 | 101 | if 'content' in new_doc.keys(): 102 | del new_doc['content'] 103 | 104 | try: 105 | container.upsert_item(new_doc) 106 | ret_dict['status'] = f"Document {new_doc['id']} was successfully inserted into Cosmos" 107 | except Exception as e: 108 | logging.error(e) 109 | print(e) 110 | 
ret_dict['status'] = f"Document {new_doc['id']} failed to be inserted into Cosmos: {e}" 111 | 112 | return ret_dict 113 | 114 | 115 | 116 | # def cosmos_download_contents(): 117 | # QUERY = "SELECT * FROM documents p WHERE p.categoryId = @categoryId" 118 | # params = [dict(name="@categoryId", value=CATEGORYID)] 119 | 120 | # contents = container.query_items(query=QUERY, parameters=params, enable_cross_partition_query=False, max_item_count=10) 121 | # counter = 0 122 | 123 | # try: 124 | # for c in contents: 125 | # #counter += redis_helpers.redis_upsert_embedding(redis_conn, e) 126 | # # print(c) 127 | # yield self._parse_entry(item_dict) 128 | 129 | # except Exception as e: 130 | # print("No Documents found") 131 | 132 | # logging.info(f"Loaded {counter} embeddings from Cosmos into Redis") 133 | # print(f"Loaded {counter} embeddings from Cosmos into Redis") -------------------------------------------------------------------------------- /utils/cv_helpers.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import os 3 | import logging 4 | import json 5 | import copy 6 | 7 | 8 | from utils import http_helpers 9 | from utils import openai_helpers 10 | 11 | from utils.env_vars import * 12 | 13 | 14 | 15 | 16 | class CV: 17 | 18 | def __init__(self, api_key = COG_SERV_KEY, 19 | cog_serv_name = COG_SERV_ENDPOINT, 20 | api_version = CV_API_VERSION): 21 | 22 | 23 | self.http_req = http_helpers.CVHttpRequest(api_key, cog_serv_name, api_version) 24 | 25 | 26 | 27 | def process_json(self, img_url, response): 28 | res = {} 29 | 30 | res['main_caption'] = response['captionResult']['text'] 31 | res['tags'] = [tag['name'] for tag in response['tagsResult']['values']] 32 | res['ocr'] = response['readResult']['content'] 33 | res['captions'] = [caption['text'] for caption in response['denseCaptionsResult']['values']] 34 | 35 | res['text'] = f"[{img_url}] This is an image. 
Main Caption: {res['main_caption']}\nOCR: {res['ocr']}\nDense Captions: {', '.join(res['captions'])}\nTags: {', '.join(res['tags'])}" 36 | 37 | return res 38 | 39 | 40 | 41 | def analyze_image(self, img_url = None, filename = None): 42 | 43 | if filename is not None: 44 | 45 | with open(filename, 'rb') as f: 46 | data = f.read() 47 | response = self.http_req.post(op='analyze', data=data) 48 | 49 | else: 50 | response = self.http_req.post(op='analyze', headers=self.http_req.json_headers, body={'url': img_url}) 51 | 52 | response = self.process_json(img_url, response) 53 | 54 | return response 55 | 56 | 57 | def get_img_embedding(self, img_url = None, filename = None): 58 | 59 | if filename is not None: 60 | with open(filename, 'rb') as f: 61 | data = f.read() 62 | 63 | response = self.http_req.post(op='img_embedding', data=data) 64 | else: 65 | 66 | response = self.http_req.post(op='img_embedding', headers=self.http_req.json_headers, body={'url': img_url}) 67 | 68 | try: 69 | return response['vector'] 70 | except: 71 | return None 72 | 73 | 74 | 75 | def get_text_embedding(self, text): 76 | response = self.http_req.post(op='text_embedding', headers=self.http_req.json_headers, body={'text': text}) 77 | 78 | try: 79 | return response['vector'] 80 | except: 81 | return None -------------------------------------------------------------------------------- /utils/env_vars.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ########################### 4 | ## Configuration Options ## 5 | ########################### 6 | 7 | CHOSEN_COMP_MODEL = os.environ.get("CHOSEN_COMP_MODEL", "gpt-35-turbo") 8 | CHOSEN_EMB_MODEL = os.environ.get("CHOSEN_EMB_MODEL", "text-embedding-ada-002") 9 | MAX_OUTPUT_TOKENS = int(os.environ.get("MAX_OUTPUT_TOKENS", "2000")) 10 | MAX_HISTORY_TOKENS = int(os.environ.get("MAX_HISTORY_TOKENS", "1000")) 11 | MAX_SEARCH_TOKENS = int(os.environ.get("MAX_SEARCH_TOKENS", "2500")) 12 | MAX_QUERY_TOKENS = int(os.environ.get("MAX_QUERY_TOKENS", "500")) 13 | PRE_CONTEXT = int(os.environ.get("PRE_CONTEXT", "500")) 14 | NUM_TOP_MATCHES = int(os.environ.get("NUM_TOP_MATCHES", "3")) 15 | 16 | OVERLAP_TEXT = int(os.environ.get("OVERLAP_TEXT", "150")) 17 | SMALL_EMB_TOKEN_NUM = int(os.environ.get("SMALL_EMB_TOKEN_NUM", "0")) 18 | MEDIUM_EMB_TOKEN_NUM = int(os.environ.get("MEDIUM_EMB_TOKEN_NUM", "800")) 19 | LARGE_EMB_TOKEN_NUM = int(os.environ.get("LARGE_EMB_TOKEN_NUM", "0")) 20 | X_LARGE_EMB_TOKEN_NUM = int(os.environ.get("X_LARGE_EMB_TOKEN_NUM", "0")) 21 | 22 | USE_BING = os.environ.get("USE_BING", "no") 23 | LIST_OF_COMMA_SEPARATED_URLS = os.environ.get("LIST_OF_COMMA_SEPARATED_URLS", "") 24 | 25 | CONVERSATION_TTL_SECS = int(os.environ.get("CONVERSATION_TTL_SECS", "172800")) 26 | 27 | USE_COG_VECSEARCH = int(os.environ.get("USE_COG_VECSEARCH", "1")) 28 | 29 | DATABASE_MODE = int(os.environ.get("DATABASE_MODE", "1")) 30 | 31 | USE_REDIS_CACHE = int(os.environ.get("USE_REDIS_CACHE", "0")) 32 | 33 | PROCESS_IMAGES = int(os.environ.get("PROCESS_IMAGES", "0")) 34 | 35 | 36 | 37 | 38 | ######################## 39 | ## Endpoints and Keys ## 40 | ######################## 41 | 42 | COG_SEARCH_ENDPOINT = os.environ.get("COG_SEARCH_ENDPOINT", "") 43 | COG_SEARCH_ADMIN_KEY = os.environ.get("COG_SEARCH_ADMIN_KEY", "") 44 | COG_SEARCH_CUSTOM_FUNC = os.environ.get("COG_SEARCH_CUSTOM_FUNC", "") 45 | 46 | COG_SERV_ENDPOINT = os.environ.get("COG_SERV_ENDPOINT", "") 47 | COG_SERV_KEY = os.environ.get("COG_SERV_KEY", "") 48 | 49 | OPENAI_RESOURCE_ENDPOINT 
= os.environ.get("OPENAI_RESOURCE_ENDPOINT", "") 50 | OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "") 51 | 52 | KB_BLOB_CONN_STR = os.environ.get("KB_BLOB_CONN_STR", "") 53 | 54 | COSMOS_URI = os.environ.get("COSMOS_URI", "") 55 | COSMOS_KEY = os.environ.get("COSMOS_KEY", "") 56 | 57 | SERVICEBUS_CONN_STR = os.environ.get("SERVICEBUS_CONN_STR", "") 58 | 59 | REDIS_ADDR = os.environ.get("REDIS_ADDR", "") 60 | REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "") 61 | REDIS_PORT = os.environ.get("REDIS_PORT", "10000") 62 | 63 | BING_SUBSCRIPTION_KEY = os.environ.get("BING_SUBSCRIPTION_KEY", "") 64 | BING_SEARCH_URL = os.environ.get("BING_SEARCH_URL", "https://api.bing.microsoft.com/v7.0/search") 65 | 66 | TRANSLATION_ENDPOINT = os.environ.get("TRANSLATION_ENDPOINT", "https://api.cognitive.microsofttranslator.com") 67 | TRANSLATION_API_KEY = os.environ.get("TRANSLATION_API_KEY", "ad8ac9b95ba94b79ba37d43cdc0c606c") 68 | TRANSLATION_LOCATION = os.environ.get("TRANSLATION_LOCATION", "westeurope") 69 | 70 | #if TRANSLATION_API_KEY == "": TRANSLATION_API_KEY = COG_SERV_KEY 71 | 72 | 73 | ################### 74 | ## OpenAI Params ## 75 | ################### 76 | 77 | import openai 78 | 79 | 80 | OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION", "2023-03-15-preview") 81 | openai.api_type = "azure" 82 | openai.api_key = OPENAI_API_KEY 83 | openai.api_base = OPENAI_RESOURCE_ENDPOINT 84 | openai.api_version = OPENAI_API_VERSION 85 | 86 | 87 | 88 | ############################# 89 | ## Cognitive Search Params ## 90 | ############################# 91 | 92 | KB_FIELDS_CONTENT = os.environ.get("KB_FIELDS_CONTENT", "content") 93 | KB_FIELDS_CATEGORY = os.environ.get("KB_FIELDS_CATEGORY", "category") 94 | KB_FIELDS_SOURCEFILE = os.environ.get("KB_FIELDS_SOURCEFILE", "sourcefile") 95 | KB_FIELDS_CONTAINER = os.environ.get("KB_FIELDS_CONTAINER", "container") 96 | KB_FIELDS_FILENAME = os.environ.get("KB_FIELDS_FILENAME", "filename") 97 | KB_SEM_INDEX_NAME = os.environ.get("KB_SEM_INDEX_NAME", "km-openai-sem") 98 | COG_VEC_SEARCH_API_VERSION = os.environ.get("COG_VEC_SEARCH_API_VERSION", "2023-07-01-Preview") 99 | COG_VECSEARCH_VECTOR_INDEX = os.environ.get("COG_VECSEARCH_VECTOR_INDEX", "vec-index") 100 | 101 | 102 | 103 | ############################ 104 | ## Defaults and Constants ## 105 | ############################ 106 | 107 | AzureWebJobsStorage = os.environ.get("AzureWebJobsStorage", KB_BLOB_CONN_STR) 108 | AzureWebJobsDashboard = os.environ.get("AzureWebJobsDashboard", KB_BLOB_CONN_STR) 109 | FUNCTIONS_EXTENSION_VERSION = os.environ.get("FUNCTIONS_EXTENSION_VERSION", "~4") 110 | FUNCTIONS_WORKER_RUNTIME = os.environ.get("FUNCTIONS_WORKER_RUNTIME", "python") 111 | WEBSITE_MAX_DYNAMIC_APPLICATION_SCALE_OUT = int(os.environ.get("WEBSITE_MAX_DYNAMIC_APPLICATION_SCALE_OUT", "1")) 112 | KB_INDEX_NAME = os.environ.get("KB_INDEX_NAME", "km-openai") 113 | KB_INDEXER_NAME = os.environ.get("KB_INDEXER_NAME", "km-openai-indexer") 114 | KB_DATA_SOURCE_NAME = os.environ.get("KB_DATA_SOURCE_NAME", "km-openai-docs") 115 | KB_SKILLSET_NAME = os.environ.get("KB_SKILLSET_NAME", "km-openai-skills") 116 | REDIS_INDEX_NAME = os.environ.get("REDIS_INDEX_NAME", "acs_emb_index") 117 | VECTOR_FIELD_IN_REDIS = os.environ.get("VECTOR_FIELD_IN_REDIS", "item_vector") 118 | NUMBER_PRODUCTS_INDEX = int(os.environ.get("NUMBER_PRODUCTS_INDEX", "1000")) 119 | CATEGORYID = os.environ.get("CATEGORYID", "KM_OAI_CATEGORY") 120 | EMBCATEGORYID = os.environ.get("EMBCATEGORYID", "KM_OAI_EMB_CATEGORY") 121 | COSMOS_DB_NAME = 
os.environ.get("COSMOS_DB_NAME", "KM_OAI_DB") 122 | KB_BLOB_CONTAINER = os.environ.get("KB_BLOB_CONTAINER", "kmoaidemo") 123 | OUTPUT_BLOB_CONTAINER = os.environ.get("OUTPUT_BLOB_CONTAINER", "kmoaiprocessed") 124 | CHOSEN_QUERY_EMB_MODEL = os.environ.get("CHOSEN_QUERY_EMB_MODEL", "text-embedding-ada-002") 125 | ADA_002_EMBED_NUM_DIMS = int(os.environ.get("ADA_002_EMBED_NUM_DIMS", "1536")) 126 | ADA_002_MODEL_MAX_TOKENS = int(os.environ.get("ADA_002_MODEL_MAX_TOKENS", "4095")) 127 | ADA_002_EMBEDDING_MODEL = os.environ.get("ADA_002_EMBEDDING_MODEL", "text-embedding-ada-002") 128 | ADA_EMBEDDING_ENCODING = os.environ.get("ADA_EMBEDDING_ENCODING", "cl100k_base") 129 | DAVINCI_003_EMBED_NUM_DIMS = int(os.environ.get("DAVINCI_003_EMBED_NUM_DIMS", "12288")) 130 | DAVINCI_003_MODEL_MAX_TOKENS = int(os.environ.get("DAVINCI_003_MODEL_MAX_TOKENS", "4000")) 131 | DAVINCI_003_EMB_MAX_TOKENS = int(os.environ.get("DAVINCI_003_EMB_MAX_TOKENS", "2047")) 132 | DAVINCI_003_COMPLETIONS_MODEL = os.environ.get("DAVINCI_003_COMPLETIONS_MODEL", "text-davinci-003") 133 | DAVINCI_003_EMBEDDING_MODEL = os.environ.get("DAVINCI_003_EMBEDDING_MODEL", "text-search-davinci-doc-001") 134 | DAVINCI_003_QUERY_EMB_MODEL = os.environ.get("DAVINCI_003_QUERY_EMB_MODEL", "text-search-davinci-query-001") 135 | DAVINCI_EMBEDDING_ENCODING = os.environ.get("DAVINCI_EMBEDDING_ENCODING", "p50k_base") 136 | GPT35_TURBO_COMPLETIONS_MODEL = os.environ.get("GPT35_TURBO_COMPLETIONS_MODEL", "gpt-35-turbo") 137 | GPT35_TURBO_COMPLETIONS_MAX_TOKENS = int(os.environ.get("GPT35_TURBO_COMPLETIONS_MAX_TOKENS", "8193")) 138 | GPT35_TURBO_COMPLETIONS_ENCODING = os.environ.get("GPT35_TURBO_COMPLETIONS_ENCODING", "cl100k_base") 139 | FR_CONTAINER = os.environ.get("FR_CONTAINER", "kmoaiforms") 140 | RESTRICTIVE_PROMPT = os.environ.get("RESTRICTIVE_PROMPT", "no") 141 | TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.3")) 142 | GPT4_COMPLETIONS_MODEL_MAX_TOKENS = int(os.environ.get("GPT4_COMPLETIONS_MODEL_MAX_TOKENS", "8192")) 143 | GPT4_32K_COMPLETIONS_MODEL_MAX_TOKENS = int(os.environ.get("GPT4_32K_COMPLETIONS_MODEL_MAX_TOKENS", "32768")) 144 | GPT35_TURBO_16K_COMPLETIONS_MAX_TOKENS = int(os.environ.get("GPT35_TURBO_16K_COMPLETIONS_MAX_TOKENS", "16384")) 145 | GPT4_MODEL = os.environ.get("GPT4_MODEL", "gpt-4") 146 | GPT4_32K_MODEL = os.environ.get("GPT4_32K_MODEL", "gpt-4-32k") 147 | CV_API_VERSION = os.environ.get("CV_API_VERSION", "2023-02-01-preview") 148 | 149 | 150 | -------------------------------------------------------------------------------- /utils/fr_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import logging 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from azure.core.credentials import AzureKeyCredential 9 | from azure.ai.formrecognizer import DocumentAnalysisClient 10 | 11 | 12 | from tenacity import ( 13 | retry, 14 | stop_after_attempt, 15 | wait_random_exponential, 16 | ) 17 | 18 | 19 | 20 | from utils import storage 21 | 22 | from utils.env_vars import * 23 | 24 | 25 | document_analysis_client = DocumentAnalysisClient(COG_SERV_ENDPOINT, AzureKeyCredential(COG_SERV_KEY)) 26 | 27 | 28 | 29 | def process_forms(in_container = FR_CONTAINER, out_container = OUTPUT_BLOB_CONTAINER): 30 | blob_list = storage.list_documents(in_container) 31 | 32 | for b in blob_list: 33 | url = storage.create_sas(b) 34 | result = fr_analyze_doc(url) 35 | 36 | new_json = { 37 | 'text': result, 38 | 'doc_url': b, 39 | 'container': 
in_container, 40 | 'filename': storage.get_filename(b), 41 | 'web_url': '' 42 | } 43 | 44 | storage.save_json_document(new_json, container = out_container ) 45 | 46 | 47 | 48 | 49 | 50 | def fr_analyze_doc(url): 51 | 52 | poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-document", url) 53 | result = poller.result() 54 | 55 | contents = '' 56 | 57 | for paragraph in result.paragraphs: 58 | contents += paragraph.content + '\n' 59 | 60 | 61 | for kv_pair in result.key_value_pairs: 62 | key = kv_pair.key.content if kv_pair.key else '' 63 | value = kv_pair.value.content if kv_pair.value else '' 64 | kv_pairs_str = f"{key} : {value}" 65 | contents += kv_pairs_str + '\n' 66 | 67 | for table_idx, table in enumerate(result.tables): 68 | row = 0 69 | row_str = '' 70 | row_str_arr = [] 71 | 72 | for cell in table.cells: 73 | if cell.row_index == row: 74 | row_str += ' | ' + str(cell.content) 75 | else: 76 | row_str_arr.append(row_str) 77 | row_str = '' 78 | row = cell.row_index 79 | row_str += ' | ' + str(cell.content) 80 | 81 | row_str_arr.append(row_str) 82 | contents += '\n'.join(row_str_arr) +'\n' 83 | 84 | return contents 85 | 86 | 87 | 88 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(10)) 89 | def fr_analyze_local_doc_with_dfs(path, verbose = True): 90 | 91 | with open(path, "rb") as f: 92 | poller = document_analysis_client.begin_analyze_document("prebuilt-document", document=f) 93 | 94 | result = poller.result() 95 | 96 | contents = '' 97 | kv_contents = '' 98 | t_contents = '' 99 | 100 | for kv_pair in result.key_value_pairs: 101 | key = kv_pair.key.content if kv_pair.key else '' 102 | value = kv_pair.value.content if kv_pair.value else '' 103 | kv_pairs_str = f"{key} : {value}" 104 | kv_contents += kv_pairs_str + '\n' 105 | 106 | for paragraph in result.paragraphs: 107 | contents += paragraph.content + '\n' 108 | 109 | 110 | for table_idx, table in enumerate(result.tables): 111 | row = 0 112 | row_str = '' 113 | row_str_arr = [] 114 | 115 | for cell in table.cells: 116 | if cell.row_index == row: 117 | row_str += ' \t ' + str(cell.content) 118 | else: 119 | row_str_arr.append(row_str ) 120 | row_str = '' 121 | row = cell.row_index 122 | row_str += ' \t ' + str(cell.content) 123 | 124 | row_str_arr.append(row_str ) 125 | t_contents += '\n'.join(row_str_arr) +'\n\n' 126 | 127 | dfs = [] 128 | 129 | # for idx, table in enumerate(result.tables): 130 | 131 | # field_list = [c['content'] for c in table.to_dict()['cells'] if c['kind'] == 'columnHeader'] 132 | # print('\n', field_list) 133 | 134 | # table_dict = table.to_dict() 135 | # row_count = table_dict['row_count'] 136 | # col_count = table_dict['column_count'] 137 | 138 | # cells = [c for c in table_dict['cells'] if c['kind'] == 'content'] 139 | # rows = [] 140 | # max_cols = 0 141 | 142 | # for i in range(row_count - 1): 143 | # row = [c['content'] for c in cells if c['row_index'] == i + 1] 144 | # # print(row, i) 145 | # if len(row) > 0: rows.append(row) 146 | # if len(row) > max_cols: max_cols = len(row) 147 | 148 | # if len(field_list) < max_cols: field_list += [''] * (max_cols - len(field_list)) 149 | # df = pd.DataFrame(rows, columns=field_list) 150 | # if verbose: display(df) 151 | # dfs.append(df) 152 | 153 | 154 | 155 | return contents, kv_contents, dfs, t_contents -------------------------------------------------------------------------------- /utils/helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 
| import re 4 | import numpy as np 5 | import tiktoken 6 | import json 7 | import logging 8 | from azure.storage.blob import BlobServiceClient, BlobClient 9 | from azure.storage.blob import ContainerClient, __version__ 10 | from azure.storage.blob import generate_blob_sas, BlobSasPermissions 11 | import copy 12 | from langchain.llms import AzureOpenAI 13 | from langchain.chat_models import ChatOpenAI 14 | from langchain.callbacks.base import CallbackManager 15 | 16 | from utils import language 17 | from utils import storage 18 | from utils import redis_helpers 19 | from utils import openai_helpers 20 | from utils.kb_doc import KB_Doc 21 | from utils import cosmos_helpers 22 | from utils.langchain_helpers import mod_agent 23 | 24 | from utils.env_vars import * 25 | 26 | 27 | def generate_embeddings(full_kbd_doc, embedding_model, max_emb_tokens, previous_max_tokens = 0, text_suffix = '', gen_emb=True): 28 | 29 | emb_documents = [] 30 | 31 | json_object = full_kbd_doc.get_dict() 32 | 33 | logging.info(f"Starting to generate embeddings with {embedding_model} and {max_emb_tokens} tokens") 34 | print(f"Starting to generate embeddings with {embedding_model} and {max_emb_tokens} tokens") 35 | 36 | try: 37 | if isinstance(json_object['timestamp'], list): 38 | json_object['timestamp'] = json_object['timestamp'][0] 39 | elif isinstance(json_object['timestamp'], str): 40 | json_object['timestamp'] = json_object['timestamp'] 41 | else: 42 | json_object['timestamp'] = "1/1/1970 00:00:00 AM" 43 | except: 44 | json_object['timestamp'] = "1/1/1970 00:00:00 AM" 45 | 46 | 47 | 48 | #### FOR DEMO PURPOSES ONLY -- OF COURSE NOT SECURE 49 | access = 'public' 50 | 51 | if (json_object['filename'] is None) or (json_object['filename'] == '') or (json_object['filename'] == 'null'): 52 | filename = storage.get_filename(json_object['doc_url']) 53 | else: 54 | filename = json_object['filename'] 55 | 56 | if filename.startswith('PRIVATE_'): 57 | access = 'private' 58 | #### FOR DEMO PURPOSES ONLY -- OF COURSE NOT SECURE 59 | 60 | 61 | doc_id = json_object['id'] 62 | doc_text = json_object['text'] 63 | enc = openai_helpers.get_encoder(embedding_model) 64 | tokens = enc.encode(doc_text) 65 | lang = language.detect_content_language(doc_text[:500]) 66 | is_doc = json_object.get('doc_url', False) # doc_url empty for scraped webpages. web_url used instead. 
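# ---------------------------------------------------------------------------
# Illustrative aside (not part of the original file): a minimal sketch of what
# the overlapping chunker `chunked_words` (defined further down in this module)
# does to the `tokens` list computed above. Assuming max_emb_tokens=800 and
# OVERLAP_TEXT=80, the loop below calls it with chunk_length=720, so each
# window advances by 720 tokens but extends 80 tokens into the next one:
#
#     fake_tokens = list(range(1500))
#     chunks = list(chunked_words(fake_tokens, chunk_length=720, overlap=80))
#     assert chunks[0] == fake_tokens[0:800]      # tokens 0..799
#     assert chunks[1] == fake_tokens[720:1500]   # tokens 720..1499 (tail-capped)
#     assert chunks[2] == fake_tokens[1440:1500]  # short final slice
#
# The 80-token overlap keeps sentence fragments at chunk borders searchable
# from both neighboring embeddings.
# ---------------------------------------------------------------------------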
67 | if is_doc: 68 | json_object['doc_url'] = storage.create_sas(json_object.get('doc_url', "https://microsoft.com")) 69 | else: 70 | json_object['doc_url'] = '' 71 | # json_object['filename'] = filename 72 | json_object['access'] = access 73 | json_object['orig_lang'] = lang 74 | 75 | 76 | print("Comparing lengths", len(tokens) , previous_max_tokens-OVERLAP_TEXT) 77 | 78 | if (len(tokens) < previous_max_tokens-OVERLAP_TEXT) and (previous_max_tokens > 0): 79 | print("Skipping generating embeddings as it is optional for this text") 80 | return emb_documents 81 | 82 | 83 | suff = 0 84 | for chunk in chunked_words(tokens, chunk_length=max_emb_tokens-OVERLAP_TEXT): 85 | decoded_chunk = enc.decode(chunk) 86 | 87 | translated_chunk = decoded_chunk 88 | if lang != 'en': 89 | translated_chunk = language.translate(decoded_chunk, lang) 90 | 91 | if gen_emb: 92 | embedding = openai_helpers.get_openai_embedding(translated_chunk, embedding_model) 93 | else: 94 | embedding = '' 95 | 96 | dd = copy.deepcopy(json_object) 97 | dd['id'] = f"{doc_id}_{text_suffix}_{suff}" 98 | dd['text_en'] = translated_chunk 99 | if lang != 'en': dd['text'] = decoded_chunk 100 | else: dd['text'] = '' 101 | dd[VECTOR_FIELD_IN_REDIS] = embedding 102 | 103 | chunk_kbd_doc = KB_Doc() 104 | chunk_kbd_doc.load(dd) 105 | 106 | emb_documents.append(chunk_kbd_doc.get_dict()) 107 | suff += 1 108 | 109 | if suff % 10 == 0: 110 | print (f'Processed: {suff} embeddings for document {filename}') 111 | logging.info (f'Processed: {suff} embeddings for document {filename}') 112 | 113 | 114 | print(f"This doc generated {suff} chunks") 115 | logging.info(f"This doc generated {suff} chunks") 116 | 117 | return emb_documents 118 | 119 | 120 | 121 | def generate_embeddings_from_json_docs(json_folder, embedding_model, max_emb_tokens, text_suffix='M', limit = -1): 122 | 123 | emb_documents = [] 124 | 125 | counter = 0 126 | for item in os.listdir(json_folder): 127 | if (limit != -1 ) and (counter >= limit): break 128 | path = os.path.join(json_folder, item) 129 | 130 | with open(path, 'r') as openfile: 131 | json_object = json.load(openfile) 132 | 133 | doc_embs = generate_embeddings(json_object, embedding_model, max_emb_tokens = max_emb_tokens, text_suffix = text_suffix) 134 | emb_documents += doc_embs 135 | counter += 1 136 | 137 | print(f"Now processing {path}, generated {len(doc_embs)} chunks") 138 | 139 | return emb_documents 140 | 141 | 142 | 143 | def save_object_to_pkl(object, filename): 144 | with open(filename, 'wb') as pickle_out: 145 | pickle.dump(object, pickle_out) 146 | 147 | 148 | def load_object_from_pkl(filename): 149 | with open(filename, 'rb') as pickle_in: 150 | object = pickle.load(pickle_in) 151 | 152 | return object 153 | 154 | 155 | def load_embedding_docs_in_redis(emb_documents, emb_filename = '', document_name = ''): 156 | 157 | if (emb_documents is None) and (emb_filename != ''): 158 | emb_documents = load_embedding_docs_from_pkl(emb_filename) 159 | 160 | redis_conn = redis_helpers.get_new_conn() 161 | 162 | print(f"Loading {len(emb_documents)} embeddings into Redis") 163 | logging.info(f"Loading {len(emb_documents)} embeddings into Redis") 164 | 165 | counter = 0 166 | loaded = 0 167 | 168 | for e in emb_documents: 169 | loaded += redis_helpers.redis_upsert_embedding(redis_conn, e) 170 | 171 | counter +=1 172 | if counter % 200 == 0: 173 | print (f'Processed: {counter} of {len(emb_documents)} for document {document_name}') 174 | logging.info (f'Processed: {counter} of {len(emb_documents)} for document {document_name}') 
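# ---------------------------------------------------------------------------
# Illustrative aside (not part of the original file): `loaded` accumulates the
# return value of redis_helpers.redis_upsert_embedding, which presumably
# yields 1 on success and 0 on failure, so this function reports how many
# chunks actually landed in Redis. A minimal end-to-end usage sketch, assuming
# a populated KB_Doc named `full_kbd_doc`:
#
#     emb_docs = generate_embeddings(full_kbd_doc,
#                                    embedding_model=CHOSEN_EMB_MODEL,
#                                    max_emb_tokens=MEDIUM_EMB_TOKEN_NUM,
#                                    text_suffix='M')
#     loaded = load_embedding_docs_in_redis(emb_docs, document_name='demo-doc')
#     print(f"{loaded} of {len(emb_docs)} chunks upserted into Redis")
# ---------------------------------------------------------------------------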
175 | 176 | print (f'Processed: {counter} of {len(emb_documents)} for document {document_name}') 177 | 178 | return loaded 179 | 180 | 181 | def chunked_words(tokens, chunk_length, overlap=OVERLAP_TEXT): 182 | num_slices = len(tokens) // chunk_length + (len(tokens) % chunk_length > 0) 183 | chunks_iterator = (tokens[i*chunk_length:(i+1)*chunk_length + overlap] for i in range(num_slices)) 184 | yield from chunks_iterator 185 | 186 | 187 | 188 | 189 | def push_summarizations(doc_text, completion_model, max_output_tokens): 190 | 191 | for chunk in chunked_words(tokens, chunk_length=max_summ_tokens): 192 | print("Chunking summarization", len(chunk)) 193 | d['summary'].append(openai_summarize(enc.decode(chunk), completion_model, max_output_tokens)) 194 | 195 | summary = '\n'.join(d['summary']) 196 | logging.info(f"Summary {summary}") 197 | print(f"Summary {summary}") 198 | 199 | push_embeddings(summary, enc.encode(summary), lang, timestamp, doc_id, doc_url, text_suffix = 'summ') 200 | 201 | 202 | 203 | re_strs = [ 204 | "customXml\/[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*", 205 | "ppt\/[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*", 206 | "\.MsftOfcThm_[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*[\r\n\t\f\v ]\{[\r\n\t\f\v ].*[\r\n\t\f\v ]\}", 207 | "SlidePowerPoint", 208 | "PresentationPowerPoint", 209 | '[a-zA-Z0-9]*\.(?:gif|emf)' 210 | ] 211 | 212 | 213 | 214 | def redis_search(query: str, filter_param: str): 215 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): 216 | return ["Sorry, I couldn't find any information related to the question."] 217 | 218 | 219 | redis_conn = redis_helpers.get_new_conn() 220 | completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 221 | embedding_enc = openai_helpers.get_encoder(CHOSEN_EMB_MODEL) 222 | 223 | query = embedding_enc.decode(embedding_enc.encode(query)[:MAX_QUERY_TOKENS]) 224 | 225 | query_embedding = openai_helpers.get_openai_embedding(query, CHOSEN_EMB_MODEL) 226 | results = redis_helpers.redis_query_embedding_index(redis_conn, query_embedding, -1, topK=NUM_TOP_MATCHES, filter_param=filter_param) 227 | 228 | if len(results) == 0: 229 | logging.warning("No embeddings found in Redis, attempting to load embeddings from Cosmos") 230 | cosmos_helpers.cosmos_restore_embeddings() 231 | results = redis_helpers.redis_query_embedding_index(redis_conn, query_embedding, -1, topK=NUM_TOP_MATCHES, filter_param=filter_param) 232 | 233 | return process_search_results(results) 234 | 235 | 236 | def process_search_results(results): 237 | completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 238 | 239 | if len(results) == 0: 240 | return ["Sorry, I couldn't find any information related to the question."] 241 | 242 | context = [] 243 | 244 | for t in results: 245 | t['text_en'] = t['text_en'].replace('\r', ' ').replace('\n', ' ') 246 | 247 | try: 248 | if ('web_url' in t.keys()) and (t['web_url'] is not None) and (t['web_url'] != ''): 249 | context.append('\n\n' + f"[{t['web_url']}] " + t['text_en'] + '\n\n') 250 | else: 251 | context.append('\n\n' + f"[{t['container']}/{t['filename']}] " + t['text_en'] + '\n\n') 252 | except Exception as e: 253 | print("------------------- Exception in process_search_results: ", e) 254 | context.append('\n\n' + t['text_en'] + '\n\n') 255 | 256 | 257 | for i in range(len(context)): 258 | for re_str in re_strs: 259 | matches = re.findall(re_str, context[i], re.DOTALL) 260 | for m in matches: context[i] = context[i].replace(m, '') 261 | 262 | final_context = [] 263 | total_tokens = 0 264 | 265 | for i in range(len(context)): 266 | total_tokens += 
len(completion_enc.encode(context[i])) 267 | # print(total_tokens) 268 | if (total_tokens < MAX_SEARCH_TOKENS) and (len(final_context) < NUM_TOP_MATCHES): 269 | final_context.append(context[i]) 270 | else: 271 | break 272 | 273 | return final_context 274 | 275 | 276 | def redis_lookup(query: str, filter_param: str): 277 | redis_conn = redis_helpers.get_new_conn() 278 | completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 279 | 280 | embedding_enc = openai_helpers.get_encoder(CHOSEN_EMB_MODEL) 281 | query = embedding_enc.decode(embedding_enc.encode(query)[:MAX_QUERY_TOKENS]) 282 | 283 | query_embedding = openai_helpers.get_openai_embedding(query, CHOSEN_EMB_MODEL) 284 | results = redis_helpers.redis_query_embedding_index(redis_conn, query_embedding, -1, topK=1, filter_param=filter_param) 285 | 286 | if len(results) == 0: 287 | logging.warning("No embeddings found in Redis, attempting to load embeddings from Cosmos") 288 | cosmos_helpers.cosmos_restore_embeddings() 289 | results = redis_helpers.redis_query_embedding_index(redis_conn, query_embedding, -1, topK=NUM_TOP_MATCHES, filter_param=filter_param) 290 | 291 | context = ' \n'.join([f"[{t['container']}/{t['filename']}] " + t['text_en'].replace('\n', ' ') for t in results]) 292 | 293 | for re_str in re_strs: 294 | matches = re.findall(re_str, context, re.DOTALL) 295 | for m in matches: context = context.replace(m, '') 296 | 297 | context = completion_enc.decode(completion_enc.encode(context)[:MAX_SEARCH_TOKENS]) 298 | return context 299 | 300 | 301 | 302 | 303 | 304 | def get_llm(model = CHOSEN_COMP_MODEL, temperature=0.3, max_output_tokens=MAX_OUTPUT_TOKENS, stream=False, callbacks=[]): 305 | gen = openai_helpers.get_generation(model) 306 | 307 | if (gen == 3) : 308 | llm = AzureOpenAI(deployment_name=model, model_name=model, temperature=temperature, 309 | openai_api_key=openai.api_key, max_retries=30, 310 | request_timeout=120, streaming=stream, 311 | callback_manager=CallbackManager(callbacks), 312 | max_tokens=max_output_tokens, verbose = True) 313 | 314 | elif (gen == 4) or (gen == 3.5): 315 | llm = ChatOpenAI(model_name=model, model=model, engine=model, 316 | temperature=0.3, openai_api_key=openai.api_key, max_retries=30, streaming=stream, 317 | callback_manager=CallbackManager(callbacks), 318 | request_timeout=120, max_tokens=max_output_tokens, verbose = True) 319 | else: 320 | assert False, f"Generation unknown for model {model}" 321 | 322 | return llm -------------------------------------------------------------------------------- /utils/http_helpers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | from tenacity import ( 5 | retry, 6 | stop_after_attempt, 7 | wait_random_exponential, 8 | ) 9 | 10 | 11 | # """ 12 | # api_key = 'YOUR_API_KEY' 13 | # search_service_name = 'your-search-service-name' 14 | # index_name = 'your-index-name' 15 | # api_version = 'your-api-version' 16 | 17 | # request = HTTPRequest(api_key, search_service_name, index_name, api_version) 18 | # headers = {'Authorization': 'Bearer YOUR_ACCESS_TOKEN'} 19 | # body = {'key': 'value'} 20 | 21 | # response_put = request.put(headers=headers, body=body) 22 | # response_post = request.post(headers=headers, body=body) 23 | # response_get = request.get(headers=headers) 24 | # response_delete = request.delete(headers=headers) 25 | 26 | # print(response_put) 27 | # print(response_post) 28 | # print(response_get) 29 | # print(response_delete) 30 | # """ 31 | 32 | 33 | class 
33 | class HTTPError(Exception): 34 | def __init__(self, status_code, message): 35 | self.status_code = status_code 36 | self.message = message 37 | super().__init__(f"HTTP Error {status_code}: {message}") 38 | 39 | 40 | 41 | class HTTPRequest: 42 | def __init__(self, url = '', api_key = ''): 43 | self.url = url 44 | self.api_key = api_key 45 | self.default_headers = {'Content-Type': 'application/json', 'api-key': self.api_key} 46 | 47 | 48 | def initialize_for_cogsearch(self, api_key, search_service_name, index_name, api_version): 49 | self.api_key = api_key 50 | self.search_service_name = search_service_name 51 | self.index_name = index_name 52 | self.api_version = api_version 53 | self.url = f"{search_service_name}/indexes/{index_name}?api-version={api_version}" 54 | self.post_url = f"{search_service_name}/indexes/{index_name}/docs/index?api-version={api_version}" 55 | self.search_url = f"{search_service_name}/indexes/{index_name}/docs/search?api-version={self.api_version}" 56 | 57 | self.default_headers = {'Content-Type': 'application/json', 'api-key': self.api_key} 58 | 59 | 60 | def handle_response(self, response): 61 | try: 62 | response_data = json.loads(response.text) 63 | except json.JSONDecodeError: 64 | response_data = response.text 65 | 66 | if response.status_code >= 400: 67 | raise HTTPError(response.status_code, response_data) 68 | 69 | return response_data 70 | 71 | 72 | def get_url(self, op = None): 73 | return self.url 74 | 75 | 76 | @retry(wait=wait_random_exponential(min=1, max=4), stop=stop_after_attempt(4)) 77 | def put(self, op = None, headers=None, body=None): 78 | 79 | url = self.get_url(op) 80 | 81 | if headers is None: 82 | headers = self.default_headers 83 | else: 84 | headers = {**self.default_headers, **headers} 85 | 86 | if body is None: 87 | body = {} 88 | 89 | response = requests.put(url, json=body, headers=headers) 90 | return self.handle_response(response) 91 | 92 | 93 | @retry(wait=wait_random_exponential(min=1, max=4), stop=stop_after_attempt(4)) 94 | def post(self, op = None, headers=None, body=None, data=None): 95 | 96 | url = self.get_url(op) 97 | 98 | if headers is None: 99 | headers = self.default_headers 100 | else: 101 | headers = {**self.default_headers, **headers} 102 | 103 | # do not default body to {} here: doing so made the 'elif body is not None' branch below always true and left the bare-POST fallback unreachable 104 | 105 | 106 | if data is not None: 107 | response = requests.post(url, data=data, headers=headers) 108 | elif body is not None: 109 | response = requests.post(url, json=body, headers=headers) 110 | else: 111 | response = requests.post(url, headers=headers) 112 | 113 | return self.handle_response(response) 114 | 115 | 116 | @retry(wait=wait_random_exponential(min=1, max=4), stop=stop_after_attempt(2)) 117 | def get(self, op = None, headers=None, params=None): 118 | 119 | url = self.get_url(op) 120 | 121 | if headers is None: 122 | headers = self.default_headers 123 | else: 124 | headers = {**self.default_headers, **headers} 125 | 126 | if params is None: 127 | params = {} 128 | 129 | response = requests.get(url, headers=headers, params=params) 130 | return self.handle_response(response) 131 | 132 | 133 | @retry(wait=wait_random_exponential(min=1, max=4), stop=stop_after_attempt(4)) 134 | def delete(self, op = None, id = None, headers=None): 135 | 136 | url = self.get_url(op)  # NOTE: the id parameter is currently unused; the request targets whatever URL get_url(op) resolves to 137 | 138 | if headers is None: 139 | headers = self.default_headers 140 | else: 141 | headers = {**self.default_headers, **headers} 142 | 143 | response = requests.delete(url, headers=headers) 144 | return self.handle_response(response) 145 | 146 | 147 | 148 | 149 | 150 | class 
CogSearchHttpRequest(HTTPRequest): 151 | 152 | def __init__(self, api_key, search_service_name, index_name, api_version): 153 | self.api_key = api_key 154 | self.search_service_name = search_service_name 155 | self.index_name = index_name 156 | self.api_version = api_version 157 | self.url = f"{search_service_name}/indexes/{index_name}?api-version={api_version}" 158 | self.post_url = f"{search_service_name}/indexes/{index_name}/docs/index?api-version={api_version}" 159 | self.search_url = f"{search_service_name}/indexes/{index_name}/docs/search?api-version={self.api_version}" 160 | 161 | self.default_headers = {'Content-Type': 'application/json', 'api-key': self.api_key} 162 | 163 | 164 | def get_url(self, op = None): 165 | if op == 'index': 166 | url = self.post_url 167 | elif op == 'search': 168 | url = self.search_url 169 | else: 170 | url = self.url 171 | 172 | return url 173 | 174 | 175 | 176 | class CVHttpRequest(HTTPRequest): 177 | 178 | def __init__(self, api_key, cog_serv_name, api_version, 179 | options = ['tags', 'objects', 'caption', 'read', 'smartCrops', 'denseCaptions', 'people']): 180 | 181 | self.api_key = api_key 182 | 183 | if cog_serv_name.endswith('/'): 184 | cog_serv_name = cog_serv_name[:-1] 185 | 186 | self.cog_serv_name = cog_serv_name 187 | self.api_version = api_version 188 | 189 | options = ','.join(options).replace(' ', '') if isinstance(options, list) else options 190 | self.url = f"{cog_serv_name}/computervision/imageanalysis:analyze?api-version={api_version}&modelVersion=latest&features={options}" 191 | self.imgvec_url = f"{cog_serv_name}/computervision/retrieval:vectorizeImage?api-version={api_version}&modelVersion=latest" 192 | self.txtvec_url = f"{cog_serv_name}/computervision/retrieval:vectorizeText?api-version={api_version}&modelVersion=latest" 193 | 194 | self.default_headers = {'Content-type': 'application/octet-stream','Ocp-Apim-Subscription-Key': self.api_key} 195 | self.json_headers = {'Content-type': 'application/json','Ocp-Apim-Subscription-Key': self.api_key} 196 | 197 | 198 | def get_url(self, op = None): 199 | if op == 'analyze': 200 | url = self.url 201 | elif op == 'img_embedding': 202 | url = self.imgvec_url 203 | elif op == 'text_embedding': 204 | url = self.txtvec_url 205 | else: 206 | url = self.url 207 | 208 | return url 209 | 210 | 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /utils/kb_doc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from datetime import datetime, timedelta 4 | 5 | from utils.env_vars import * 6 | 7 | class KB_Doc(): 8 | 9 | def __init__(self): 10 | 11 | self.id = '' 12 | self.text_en = '' 13 | self.text = '' 14 | self.doc_url = '' 15 | self.timestamp = datetime.now().strftime("%m/%d/%Y, %H:%M:%S") 16 | self.item_vector = [] 17 | self.orig_lang = 'en' 18 | self.access = 'public' 19 | self.client = KB_INDEX_NAME 20 | self.container = KB_BLOB_CONTAINER 21 | self.filename = '' 22 | self.web_url = '' 23 | self.contentType = '' 24 | 25 | 26 | if PROCESS_IMAGES == 1: 27 | self.cv_image_vector = [0.] * 1024 28 | self.cv_text_vector = [0.] 
* 1024 29 | 30 | 31 | def load(self, data): 32 | for k in data: 33 | setattr(self, k, data[k]) 34 | 35 | 36 | def get_fields(self): 37 | return self.__dict__.keys() 38 | 39 | 40 | def get_dict(self): 41 | return self.__dict__ -------------------------------------------------------------------------------- /utils/langchain_helpers/mod_ccr_prompt.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Assistant is constantly learning and improving, and its capabilities are constantly evolving. It is able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. Additionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide explanations and descriptions on a wide range of topics. 4 | 5 | 6 | PREFIX = """Assistant is a large language model trained by OpenAI and is super factual and detail-oriented. The assistant must look for answers within the provided tools responses, and if the answer is not in the tools responses or the context, then the assistant must answer by "Sorry, I do not know the answer". 7 | 8 | Assistant is designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. As a language model, Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. 9 | 10 | Overall, Assistant is a powerful system that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist. 11 | 12 | Final answers should be concise and to the point. If presented with lots of information, the assistant should try to summarize and give a concise answer. 13 | 14 | Observations have sources, the assistant MUST include the source name in the final answer. If there are multiple sources, the assistant MUST cite each one in their own square brackets. For example, the assistant must use \"[folder3/info343][http://website]\" and not \"[folder3/info343,http://website]\". The source name can either be in the format of "folder/file" or it can be an internet URL like "https://website". 15 | 16 | THE ASSISTANT MUST STRICTLY USE THE COLLECTED EVIDENCE FROM THE USER INPUT OR TOOLS' RESPONSES, THE ASSISTANT MUST NOT ANSWER FROM MEMORY AND MUST NOT MAKE UP ANSWERS. 17 | 18 | """ 19 | 20 | 21 | FORMAT_INSTRUCTIONS = """RESPONSE FORMAT INSTRUCTIONS 22 | ---------------------------- 23 | 24 | When responding to me, please output a response in one of two formats: 25 | 26 | **Option 1:** 27 | Use this if you want the human to use a tool. 28 | Markdown code snippet formatted in the following schema. The assistant must strictly follow the following format, with no additional or other text in the response outside the json block: 29 | 30 | ```json 31 | {{{{ 32 | "action": "string" \\ The action to take. Must be one of {tool_names} 33 | "action_input": "string" \\ The input to the action 34 | }}}} 35 | ``` 36 | 37 | **Option 2:** 38 | Use this if you want to respond directly to the human. Markdown code snippet formatted in the following schema. 
The assistant must strictly follow the following format, with no additional or other text in the response outside the json block: 39 | 40 | ```json 41 | {{{{ 42 | "action": "Final Answer", 43 | "action_input": "[source name 1][source name 2] string" \\ You should put what you want to return to use here 44 | }}}} 45 | ```""" 46 | 47 | SUFFIX = """TOOLS 48 | ------ 49 | Assistant must ask the user to use tools to look up information that may be helpful in answering the users original question. The tools the human can use are: 50 | 51 | {{tools}} 52 | 53 | {format_instructions} 54 | 55 | USER'S INPUT 56 | -------------------- 57 | Here is the user's input (remember to respond with a markdown code snippet of a json blob with a single action, and NOTHING else): 58 | 59 | {{{{input}}}}""" 60 | 61 | 62 | 63 | TEMPLATE_TOOL_RESPONSE = """TOOL RESPONSE: 64 | --------------------- 65 | [source name 1][source name 2] {observation} 66 | 67 | USER'S INPUT 68 | -------------------- 69 | 70 | Okay, so what is the response to my last comment? If using information obtained from the tools you must mention it explicitly without mentioning the tool names - I have forgotten all TOOL RESPONSES! Remember to respond with a markdown code snippet of a json blob with a single action, and NOTHING else. The assistant MUST NOT MENTION THE TOOL NAME, the final answer to the original input question MUST BE detailed and specific but concise, human-friendly and easy to read. (do NOT use the tool names in the final answer, and do not use machine jargon). Make sure that you send the correct source as a reference, if the source is already included in the history, make sure to include it again in the Final Answer.""" 71 | -------------------------------------------------------------------------------- /utils/langchain_helpers/mod_react_prompt.py: -------------------------------------------------------------------------------- 1 | from langchain.prompts import PromptTemplate, BasePromptTemplate 2 | 3 | 4 | mod_react_prefix = """Answer the following questions as best you can. You have access to only the following tools:""" 5 | 6 | # If after using 2 tools and the assistant has a partial final answer, then the assistant must formulate a final answer, and then add to it "I'm not sure if this is the answer you are looking for, but here is what I found." and then the assistant MUST stop searching. 7 | # YOU MUST STRICTLY USE THE COLLECTED EVIDENCE FROM THE OBSERVATIONS, FROM THE INITIAL CONTEXT OR FROM PREVIOUS CONVERSATION, DO NOT ANSWER FROM MEMORY. 8 | 9 | 10 | mod_react_format_instructions = """The assistant can use ONLY the listed tools. The assistant MUST NOT make up tool names. 11 | 12 | After each time the assistant uses a tool, the assistant shall thoroughly inspect the tool results in the Observation and formulate a final answer if the results have enough information. If the assistant has a final answer, then the assistant MUST stop using the tools. 13 | If the assistant does not have the information needed to formulate an answer, the assistant MUST continue using the tools with different action inputs for a maximum total of 3 tool uses. If after using the first tool, the assistant has enough information needed to formulate an answer, the assistant MUST STOP using the tools and return a final answer to the user. If the assistant decides to continue using the tools, then the assistant MUST change the Action Input with every tool. 
If there are lots of facts or pieces of information, the assistant MUST try its best to summarize the information in the final answer, and must stop using the tools. 14 | 15 | The assistant MUST NOT use the tools more than 3 times. 16 | The assistant MUST NOT use the tools if the assistant has a final answer. 17 | The assistant MUST NOT use the same tool twice or more with the exact same input. 18 | 19 | Observations have sources, the assistant MUST include the source name in the final answer. If there are multiple sources, the assistant MUST cite each one in their own square brackets. For example, the assistant must use \"[folder3/info343][http://wikipedia.com]\" and not \"[folder3/info343,http://wikipedia.com]\". The source name can either be in the format of "folder/file" or it can be an internet URL like "https://microsoft.com". 20 | 21 | THE ASSISTANT MUST STRICTLY USE THE COLLECTED EVIDENCE FROM THE OBSERVATIONS, FROM THE USER'S INPUT, INITIAL CONTEXT OR FROM PREVIOUS CONVERSATION, THE ASSISTANT MUST NOT ANSWER FROM MEMORY. If the assistant is sure about a fact that is not explicitly stated in the knowledge base (such as knowing which country a city is located in), the assistant is permitted to use that fact from memory in the final answer but needs to state this explicitly in the Final Answer. However, the assistant MUST NOT make up facts. 22 | 23 | It is critically important that the assistant MUST NOT mention the tool names in the Final Answer. 24 | 25 | If the Conversation History or Initial Context are not related to the question, then the assistant MUST ignore them. ALWAYS remember that the assistant MUST synthesize a Final Answer out of all the information collected for the user's benefit. If there are several pieces of information in the final answer, the assistant can choose to answer in bullet point format. The assistant MUST be detailed and specific but concise when giving a final answer, with facts that are RELEVANT ONLY to the question. 26 | 27 | It is critically important that the assistant USE the following format STRICTLY, the assistant's answer MUST be in the below format. The assistant MUST either generate a thought with an action and action input, or a thought with a final answer: 28 | 29 | #FORMAT# 30 | Question: the input question you must answer 31 | Thought: you should always think about what to do. First, identify in the previous observations any facts or information that can help in answering the above Question, and make sure to explicitly output them in the current Thought. If the question needs multiple tools, break it down into multiple action inputs for multiple tools. Decide on the most relevant tool for the next step. 32 | Action: the action to take, should be one of [{tool_names}] 33 | Action Input: the input to the action 34 | Observation: [folder1/file1] the result of the action.\n[http://wikipedia.com] second result of the action\n[website.com] third result of the action 35 | ... (this Thought/Action/Action Input/Observation can repeat up to 3 times with different action inputs each time) 36 | Thought: After carefully analyzing the previous Observations, I now know the final answer. Formulate a final answer from all previous thoughts and observations, and write down an elaborate, detailed, and specific answer, which is directly relevant to the question. 37 | Final Answer: [folder1/file1][http://wikipedia.com][website.com] the final answer to the original input question that is human-friendly and easy to read. 
(do NOT use the tool names in the final answer, and do not use machine jargon) 38 | #FORMAT# 39 | 40 | Follow the above Format strictly, and make sure to follow the instructions in each step (Thought/Action/Action Input/Observation). DO NOT USE THE TOOL NAMES IN THE ANSWER. 41 | 42 | """ 43 | 44 | # Identify explicitly any information inside those observations that can help in answering the above question. 45 | 46 | 47 | mod_react_suffix = """Begin! 48 | Conversation History: {history} 49 | 50 | Question: {input} 51 | 52 | 53 | Thought:{agent_scratchpad}""" 54 | 55 | # Initial Context:{pre_context} 56 | 57 | 58 | 59 | 60 | mod_evaluate_instructions = """<|im_start|> 61 | The assistant is a super helpful assistant that plays the role of detective and has ultra high attention to details. The assistant must go through the below context paragraph by paragraph and try to find relevant information to the user's question. The current time and date will be provided for the assistant in the Context. The assistant can use the current date and time to derive the day and date for any time-related questions, such as this afternoon, this evening, today, tomorrow, this weekend or next week. 62 | <|im_end|> 63 | <|im_start|>user 64 | 65 | Instruction: Identify in the above facts or information that can help in answering the following question: "##{history}\nHuman: {question}##" and list them in bullet point format. Be elaborate, detailed and specific when identifying facts or information. Do NOT be concise so as not to miss critical information. 66 | YOU MUST STRICTLY USE THE CONTEXT TO IDENTIFY FACTS OR INFORMATION, DO NOT ANSWER FROM MEMORY. 67 | Facts have sources, you MUST include the source name in the EACH bullet point at the beginning before any text. If there are multiple sources, cite each one in their own square brackets. For example, use \"[folder3/info343][http://wikipedia.com]\" and not \"[folder3/info343,http://wikipedia.com]\". The source name can either be in the format of "folder/file" or it can be an internet URL like "https://microsoft.com". 68 | 69 | Context: 70 | - [https://www.timeanddate.com] {todays_time} 71 | 72 | {context} 73 | 74 | 75 | Use the following format: 76 | - [folder1/file1] the first fact or information (elaborate, detailed, and specific) 77 | - [http://website.com] the second fact or information (elaborate, detailed, and specific) 78 | - [http://wikipedia.com] the third fact or information (elaborate, detailed, and specific) 79 | - [folder3/file3] the fourth fact or information (elaborate, detailed, and specific) 80 | - [http://microsoft.com] the fifth fact or information (elaborate, detailed, and specific) 81 | - [folder4/file4] the sixth fact or information (elaborate, detailed, and specific) 82 | - [http://outlook.com] the seventh fact or information (elaborate, detailed, and specific) 83 | - [https://linkedin.com] the eighth fact or information (elaborate, detailed, and specific) 84 | - (and so on ...) 85 | 86 | 87 | 88 | Begin: 89 | <|im_end|> 90 | <|im_start|>assistant 91 | """ 92 | 93 | 94 | mod_extract_intent_instructions = """<|im_start|> 95 | The assistant is a super helpful assistant that plays the role of a search engine expert and has ultra high attention to details. The assistant must go through the below question and think about the most important keywords to extract. Please extract the intent of the below question and the keywords in as few words as possible. Imagine extracting the intent as keywords to be the input to a search engine. 
DO NOT ANSWER THE QUESTION, EXTRACT ONLY THE INTENT. 96 | 97 | <|im_end|> 98 | <|im_start|>user 99 | 100 | The following are examples, and must be strictly used as output format: 101 | 102 | Question: what hotels are recommended in Las Vegas? 103 | Intent: knowledge base 104 | Keywords: recommend hotels Las Vegas 105 | 106 | Question: Hi 107 | Intent: chit chat 108 | Keywords: chit chat 109 | 110 | Question: Don't you want to know about me? 111 | Intent: chit chat 112 | Keywords: chit chat 113 | 114 | Question: Do you eat? 115 | Intent: chit chat 116 | Keywords: chit chat 117 | 118 | Question: I'm curious about your family 119 | Intent: chit chat 120 | Keywords: chit chat 121 | 122 | Question: Surprise me 123 | Intent: chit chat 124 | Keywords: chit chat 125 | 126 | Question: Who is Barack Obama? 127 | Intent: knowledge base 128 | Keywords: Identify Barack Obama 129 | 130 | Question: what is mentioned about the Volcano hotel? 131 | Intent: knowledge base 132 | Keywords: Volcano hotel, mentioned 133 | 134 | Question: how much are the one day pass tickets for Ferrari world? 135 | Intent: knowledge base 136 | Keywords: Ferrari world, one day pass, price 137 | 138 | Question: where is the Eiffel Tower? 139 | Intent: knowledge base 140 | Keywords: locate Eiffel Tower 141 | 142 | 143 | Use the below format strictly: 144 | 145 | Question: "{question}" 146 | 147 | <|im_end|> 148 | <|im_start|>assistant 149 | """ 150 | 151 | 152 | 153 | 154 | mod_chit_chat_instructions = """<|im_start|> 155 | The assistant is a super helpful assistant that plays the role of a chit chat buddy and is very talkative and friendly. The assistant must go through the below question and reply in a super friendly and talkative manner. The user wants to chit chat, so the assistant must indulge them. 156 | 157 | <|im_end|> 158 | <|im_start|>user 159 | 160 | 161 | Question: "{question}" 162 | 163 | <|im_end|> 164 | <|im_start|>assistant 165 | """ 166 | 167 | 168 | 169 | mod_qc_instructions = """<|im_start|> 170 | The assistant is a super helpful assistant that plays the role of a quality control engineer and has ultra high attention to details. The assistant must go through the below question and think whether the answer is an adequate response to the question. Inadequate answers appear to be incomplete as they mention that the assistant must try another action, or try a different tool. Inadequate answers also suggest the answer is not final and that the user must perform an extra action of checking pages in the reference source You MUST answer by "Yes" or "No" ONLY. No additional explanation is required. 171 | 172 | <|im_end|> 173 | <|im_start|>user 174 | 175 | The following are examples: 176 | 177 | Question: "what hotels are recommended in Las Vegas?" 178 | Answer: "The search results do not provide a clear answer to the question. I should try a different action input.\n\n 'Most luxurious hotels on the Las Vegas Strip'" 179 | Adequate: No 180 | 181 | Question: "what is mentioned about the Lost City hotel?" 182 | Answer: "The Lost City Hotel is a luxurious accommodation in Dubai with an onsite waterpark and aquarium." 183 | Adequate: Yes 184 | 185 | Question: "who is Barack Obama?" 186 | Answer: 'I need to be more specific with my input.\n\n "Barack Obama biography"' 187 | Adequate: No 188 | 189 | Question: "who is Barack Obama?" 190 | Answer: 'Unfortunately, none of the sources I searched provided any specific information about Barack Obama.' 
191 | Adequate: Yes 192 | 193 | Question: "how much are the one day pass tickets for Ferrari world?" 194 | Answer: "I'm sorry, I could not find the ticket prices for Ferrari World." 195 | Adequate: Yes 196 | 197 | Question: "what is the average salary in the USA?" 198 | Answer: 'Since Cognitive Search did not provide any relevant information, I should try a different tool.' 199 | Adequate: No 200 | 201 | Question: "who is Barack Obama?" 202 | Answer: 'The context paragraph does not provide any direct information about Barack Obama. However, a Cognitive Search for "Barack Obama presidency" may yield relevant information about his presidency.' 203 | Adequate: No 204 | 205 | Question: "who is Barack Obama?" 206 | Answer: 'I apologize, but I cannot find any relevant information about Barack Obama in the given context.' 207 | Adequate: Yes 208 | 209 | Question: "What is the best thing about Las Vegas?" 210 | Answer: 'I need to try a different tool.' 211 | Adequate: No 212 | 213 | Question: "what are the total annual leaves in days with full remuneration in Australia?" 214 | Answer: 'Since Redis Search did not provide any relevant information, I should try a different tool.' 215 | Adequate: No 216 | 217 | Question: "what hotels are recommended in Seattle?" 218 | Answer: "Our travel agency offers the following hotels in Seattle: The Cinnamon Hotel, The Creek Hotel, and The Bay Hotel." 219 | Adequate: Yes 220 | 221 | Question: "what hotels are recommended in Las Vegas?" 222 | Answer: "Margie’s Travel offers the following hotels in Las Vegas: The Volcano Hotel, The Fountain Hotel, The Canal Hotel. To book your trip to Las Vegas, visit www.margiestravel.com." 223 | Adequate: Yes 224 | 225 | Question: "what is mentioned about the Lost City hotel?" 226 | Answer: "The Lost City Hotel is a luxurious accommodation in Dubai, with an onsite waterpark and aquarium, offered by Margie's Travel. To book a trip to Dubai, visit www.margiestravel.com." 227 | Adequate: Yes 228 | 229 | Question: "what is mentioned about the Volcano hotel?" 230 | Answer: "The Volcano Hotel is a stylish casino hotel with live entertainment and an extensive pool area, located in the heart of The Strip. To book a trip to Las Vegas, visit www.margiestravel.com." 231 | Adequate: Yes 232 | 233 | Question: "what is the contact info of IPA?" 234 | Answer: "The contact information for IPA Qatar is provided on page 26 of the guide." 235 | Adequate: No 236 | 237 | 238 | 239 | Question: "{question}" 240 | Answer: "{answer}" 241 | Adequate: 242 | 243 | <|im_end|> 244 | <|im_start|>assistant 245 | """ 246 | -------------------------------------------------------------------------------- /utils/langchain_helpers/mod_wiki_prompt.py: -------------------------------------------------------------------------------- 1 | from langchain.prompts import PromptTemplate, BasePromptTemplate 2 | 3 | 4 | EXAMPLES = [ 5 | # """Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into? 6 | # Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area. 7 | # Action 1: Search[Colorado orogeny] 8 | # Observation 1: [info1.pdf] The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas. 9 | # Thought 2: It does not mention the eastern sector. So I need to look up eastern sector. 
10 | # Action 2: Lookup[eastern sector] 11 | # Observation 2: [info2.txt] (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny. 12 | # Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range. 13 | # Action 3: Search[High Plains] 14 | # Observation 3: [some_file.pdf] High Plains refers to one of two distinct land regions 15 | # Thought 4: I need to instead search High Plains (United States). 16 | # Action 4: Search[High Plains (United States)] 17 | # Observation 4: [filea.pdf] The High Plains are a subregion of the Great Plains. [another-ref.docx] From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130m). 18 | # Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft. 19 | # Action 5: Finish[1,800 to 7,000 ft [filea.pdf]]\n\n""", 20 | # """Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who? 21 | # Thought 1: The question simplifies to "The Simpsons" character Milhouse is named after who. I only need to search Milhouse and find who it is named after. 22 | # Action 1: Search[Milhouse] 23 | # Observation 1: [info7.pdf] Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening. 24 | # Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after". 25 | # Action 2: Lookup[named after] 26 | # Observation 2: [historyref2.txt] (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. 27 | # Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon. 28 | # Action 3: Finish[(historyref2.txt) Richard Nixon\n\n]""", 29 | """Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture? 30 | Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups. 31 | Action 1: Search[Adam Clayton Powell] 32 | Observation 1: [info9.pdf] Could not find [Adam Clayton Powell]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office 33 | Building', 'Isabel Washington Powell', 'Adam Powell', vAdam Clayton Powell (film)', 'Giancarlo Esposito']. 34 | Thought 2: To find the documentary, I can search Adam Clayton Powell (film). 35 | Action 2: Search[Adam Clayton Powell (film)] 36 | Observation 2: [data123.txt] Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg. The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience. 37 | Thought 3: Adam Clayton Powell (film) is a documentary about an African-American 38 | politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture. 39 | Action 3: Finish[[data123.txt] The Saimaa Gesture]""", 40 | """Question: What profession does Nicholas Ray and Elia Kazan have in common? 41 | Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common. 
42 | Action 1: Search[Nicholas Ray] 43 | Observation 1: [files-987.png] Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 - June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause. 44 | Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions. 45 | Action 2: Search[Elia Kazan] 46 | Observation 2: [files-654.txt] Elia Kazan was an American film and theatre director, producer, screenwriter and actor. 47 | Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor. 48 | Action 3: Finish[[files-987.png](files-654.txt] director, screenwriter, actor]""", 49 | """Question: Which magazine was started first Arthur's Magazine or First for Women? 50 | Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first. 51 | Action 1: Search[Arthur's Magazine] 52 | Observation 1: [magazines-1850.pdf] Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century. 53 | Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next. 54 | Action 2: Search[First for Women] 55 | Observation 2: [magazines-1900.pdf] First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. 56 | Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first. 57 | Action 3: Finish[[magazines-1850.pdf)(magazines-1900.pdf) Arthur's Magazine]""", 58 | """Question: Were Pavel Urysohn and Leonid Levin known for the same type of work? 59 | Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same. 60 | Action 1: Search[Pavel Urysohn] 61 | Observation 1: [info4444.pdf] Pavel Samuilovich Urysohn (February 3, 1898 - August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory. 62 | Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work. 63 | Action 2: Search[Leonid Levin] 64 | Observation 2: [datapoints_aaa.txt] Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. 65 | Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. 66 | Action 3: Finish[[info4444.pdf] yes ]""", 67 | ] 68 | 69 | 70 | SUFFIX = """ 71 | 72 | Initial Context:{pre_context} 73 | 74 | Current Conversation: {history} 75 | 76 | Question: {input} 77 | 78 | <|im_end|> 79 | <|im_start|>assistant 80 | 81 | Begin: 82 | 83 | {agent_scratchpad} 84 | """ 85 | 86 | 87 | 88 | PREFIX = """<|im_start|>system 89 | The following is a friendly conversation between a human and an AI assistant. The AI assistant is talkative and provides lots of specific details from its context. You are an intelligent assistant helping our employees with their knowledge base questions. Answer questions as shown in the following examples, by splitting the question into individual search or lookup actions to find facts until you can answer the question. 90 | Observations are prefixed by their source name in square brackets, source names MUST be included with the actions in the answers. 
91 | All questions must be answered from the results of search or lookup actions, only facts resulting from those can be used in an answer. 92 | Answer questions as truthfully as possible, and ONLY answer the questions using the information from observations, do not speculate or answer from your own knowledge. 93 | If the question is not clear or further clarifications are needed, the AI assistant MUST use the search or lookup actions to get the context and information. The AI assistant MUST use one of the tools AT LEAST ONCE. 94 | At each Observation, the assistant shall ponder carefully whether it has the final answer or not. If the assistant does, then the assistant can stop searching and provide the final answer. If the assistant does not, then the assistant must continue searching until all search sources are exhausted. 95 | Do NOT answer based on your knowledge of Wikipedia. 96 | For example, if the question is \"What color is the sky?\" and one of the information sources says \"info123: the sky is blue whenever it's not cloudy\", then answer with \"The sky is blue [info123]\" 97 | It's important to strictly follow the format where the name of the source is in brackets at the end of the sentence, and only up to the prefix before the colon [\":\"]. 98 | If there are multiple sources, cite each one in their own square brackets. For example, use \"[info343][ref-76]\" and not \"[info343,ref-76]\". 99 | Never quote tool names as sources. 100 | 101 | <|im_end|> 102 | <|im_start|>user 103 | 104 | """ 105 | 106 | 107 | 108 | mod_wiki_prompt = PromptTemplate.from_examples( 109 | EXAMPLES, SUFFIX, ["input", "agent_scratchpad", "history", "pre_context"], '\n', PREFIX 110 | ) 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | # PREFIX = """<|im_start|>system 124 | # The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. Observations are prefixed by their reference name in square brackets, reference names MUST be included with the actions in the answers. 125 | # Answer the following questions as best you can. You have access to the following tools: 126 | # Search: useful for when you need to answer questions from the local knowledge store, and to get more context 127 | # Lookup: useful for when you need to lookup terms from the local knowledge store, and to get more context 128 | # The assistant MUST use the following search sources to deduce an answer: Search or Lookup. All questions MUST be answered from the results from these search sources, only facts resulting from those sources can be used in an answer. If any piece of information is missing, the assistant must be persistent in finding it from search sources until all tools are exhausted. 129 | # If a source provides enough evidence for an answer, then the assistant can deduce a final answer without trying other sources. 130 | # If the question is not related to the previous conversation, then the assistant must use Search or Lookup to answer the question and ignore the previous conversation. 131 | # If this question is too broad and unclear, or if the assistant needs more context to understand the question, then the assistant can use Search or Lookup to find a more specific question to use with the tools at its disposal. 132 | # At each Observation, the assistant shall ponder carefully whether it has the final answer or not. If the assistant does, then the assistant can stop searching and provide the final answer. 
If the assistant does not, then the assistant must continue searching until all search sources are exhausted. 133 | # If the answer can be fully answered from the previous conversation, then the answer MUST be elaborated and repeated again and sent back to the user as a Final Asnwer. 134 | 135 | # """ 136 | 137 | # PREFIX = """<|im_start|>system 138 | # The following is a friendly conversation between a human and an AI assistant. The AI assistant is talkative and provides lots of specific details from its context. 139 | # Answer questions as shown in the following examples, by splitting the question into individual search or lookup actions to find facts until you can answer the question. 140 | # Observations are prefixed by their source name in square brackets, source names MUST be included with the actions in the answers. 141 | # All questions must be answered from the results from search or look up actions, only facts resulting from those can be used in an answer. 142 | # Answer questions as truthfully as possible, and ONLY answer the questions using the information from observations, do not speculate or answer based on the assistant's own knowledge. 143 | # If the question is not clear or further clarifications are needed, the AI assistant MUST use the search or lookup actions to get the context and information. The AI assistant MUST use one of the tools AT LEAST ONCE. Do NOT answer based on Wikipedia. 144 | # Answer the question using the provided Observations only, and if the answer is not contained within the Observations, say "Sorry, the query did not find a good match. Please rephrase your question": 145 | # """ 146 | 147 | # PREFIX = \ 148 | # """You are an intelligent assistant helping our employees with their knowledge base questions. 149 | # Answer the question using only the data provided in the information sources below. 150 | # Each source has a name followed by colon and the actual data, quote the source name for each piece of data you use in the response. 151 | # For example, if the question is \"What color is the sky?\" and one of the information sources says \"info123: the sky is blue whenever it's not cloudy\", then answer with \"The sky is blue (info123)\" 152 | # It's important to strictly follow the format where the name of the source is in parenthesis at the end of the sentence, and only up to the prefix before the colon (\":\"). 153 | # If there are multiple sources, cite each one in their own square brackets. For example, use \"(info343)(ref-76)\" and not \"(info343,ref-76)\". 154 | # Never quote tool names as sources. 155 | # Answer questions as truthfully as possible, and ONLY answer the questions using the information from observations, do not speculate or answer based on the assistant's own knowledge. 156 | # If the question is not clear or further clarifications are needed, the AI assistant MUST use the search or lookup actions to get the context and information. The AI assistant MUST use one of the tools AT LEAST ONCE. 157 | # If you cannot answer using the sources below, say that you don't know. 
158 | # \n\nYou can access to the following tools:""" -------------------------------------------------------------------------------- /utils/langchain_helpers/oai_fc_agent.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import os 3 | import json 4 | import yaml 5 | import copy 6 | import numpy as np 7 | import itertools 8 | 9 | 10 | from utils.env_vars import * 11 | from utils import openai_helpers 12 | from utils import http_helpers 13 | from utils.cogsearch_helpers import * 14 | 15 | 16 | instruction_prompt = """You are an AI assistant specialized in answering user questions. You can call functions to obtain specific details based on user queries. 17 | Facts have sources, you MUST include the source name in the answer at the beginning before any text. If there are multiple sources, cite each one in their own square brackets. For example, use \"[folder3/info343][http://wikipedia.com]\" and not \"[folder3/info343,http://wikipedia.com]\". You must follow the following format strictly for the final answer: 18 | Answer: [folder1/file1][http://website][http://website2] the answer based on the facts or information. 19 | DO NOT MAKE UP ANY ANSWERS, ALL ANSWERS MUST BE BASED ON THE CONTEXT WHICH IS DELIMITED BY 3 "AT SYMBOL". The Assistant should not make up sources. ALL SOURCES MUST BE EXTRACTED FROM THE CONTEXT WHICH IS DELIMITED BY 3 "AT SYMBOL". 20 | 21 | The below are examples of final answers: 22 | 23 | Question: "what is mentioned about the Lost City hotel?" 24 | Answer: "The Lost City Hotel is a luxurious accommodation in Dubai with an onsite waterpark and aquarium. [website]" 25 | 26 | Question: "what hotels are recommended in Las Vegas?" 27 | Answer: "Margie's Travel offers the following hotels in Las Vegas: The Volcano Hotel, The Fountain Hotel, The Canal Hotel. To book your trip to Las Vegas, visit www.margiestravel.com. [folder/Las Vegas.pdf]" 28 | 29 | Question: "who is Barack Obama?" 30 | Answer: 'Barack Obama is the 44th President of the United States of America. [http://website]' 31 | 32 | Question: "who is Barack Obama?" 33 | Answer: 'Unfortunately, none of the sources I searched provided any specific information about Barack Obama. []' 34 | 35 | Question: "how much are the one day pass tickets for Ferrari world?" 36 | Answer: "I'm sorry, I could not find the ticket prices for Ferrari World. [] 37 | 38 | THE ASSISTANT MUST STRICTLY USE THE COLLECTED EVIDENCE FROM THE USER INPUT OR THE CONTEXT WHICH IS DELIMITED BY 3 "AT SYMBOL", THE ASSISTANT MUST NOT ANSWER FROM MEMORY AND MUST NOT MAKE UP ANSWERS. Assistant must make sure to send the correct source as a reference, if the source is already included in the history which is delimited by three dollar signs, make sure to include it again in the answer. The Assistant should not make up sources. ALL SOURCES MUST BE EXTRACTED FROM THE CONTEXT WHICH IS DELIMITED BY 3 "AT SYMBOL". 
39 | """ 40 | 41 | intent_messages= [ 42 | {"role": "system", "content":instruction_prompt}, 43 | ] 44 | 45 | 46 | 47 | intent_functions= [ 48 | { 49 | "name": "extract_search_terms", 50 | "type": "function", 51 | "description": "Search through knowledge base to find relevant documents that might help in answering the user query.", 52 | "parameters": { 53 | "type": "object", 54 | "properties": { 55 | "search_terms": { 56 | "type": "array", 57 | "items": { 58 | "type": "object", 59 | "properties": { 60 | "term": {"type": "string", "description": "Search terms that would be used in the search engine" }, 61 | "additional_context": {"type": "string", "description": "Additional context related to the term." }, 62 | }, 63 | "required": ["term", "additional_context"] 64 | } 65 | } 66 | }, 67 | "required": ["search_terms"] 68 | } 69 | } 70 | ] 71 | 72 | 73 | 74 | intent_body = """ 75 | Current Conversation: 76 | $$$ 77 | {history} 78 | $$$ 79 | 80 | Query: {query} 81 | 82 | """ 83 | 84 | 85 | body = """ 86 | Current Conversation: 87 | $$$ 88 | {history} 89 | $$$ 90 | 91 | Context: 92 | @@@ 93 | {context} 94 | @@@ 95 | 96 | Question: {query} 97 | 98 | Answer: 99 | """ 100 | 101 | 102 | 103 | class oai_fc_agent(): 104 | 105 | def __init__(self): 106 | self.context = {} 107 | self.context['history'] = "" 108 | 109 | 110 | 111 | def get_dict(self, response): 112 | dd = yaml.full_load(str(response['choices'][0]['message'])) 113 | 114 | if 'function_call' in dd: 115 | dd['function_call']['arguments'] = yaml.full_load(dd['function_call']['arguments']) 116 | 117 | return dd 118 | 119 | 120 | def update_history(self, input_text, answer): 121 | self.context["history"] += f"\nUser: {input_text}\nChatBot: {answer}\n" 122 | 123 | 124 | def chat(self, query, lc_agent, history): 125 | search_results = [] 126 | content = "" 127 | messages = copy.deepcopy(intent_messages) 128 | completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 129 | 130 | messages.append({"role": "user", "content":intent_body.format(history=history, query=query)}) 131 | print("messages", messages) 132 | 133 | response = openai_helpers.contact_openai(messages, completion_model = CHOSEN_COMP_MODEL, functions=intent_functions) 134 | 135 | dd = self.get_dict(response) 136 | 137 | 138 | if 'function_call' in dd: 139 | search_terms = dd['function_call']['arguments']['search_terms'] 140 | search_results = [] 141 | 142 | print("search_terms", search_terms) 143 | 144 | for s in search_terms: 145 | search_results.append(lc_agent.agent_cog_search(s['term'] + ' ' + s.get('additional_context', ''))) 146 | 147 | search_results = '\n'.join(search_results) 148 | 149 | empty_prompt_length = len(completion_enc.encode(instruction_prompt + body)) 150 | max_comp_model_tokens = openai_helpers.get_model_max_tokens(CHOSEN_COMP_MODEL) 151 | query = completion_enc.decode(completion_enc.encode(query)[:MAX_QUERY_TOKENS]) 152 | 153 | history = completion_enc.decode(completion_enc.encode(history)[:MAX_HISTORY_TOKENS]) 154 | query_length = len(completion_enc.encode(query)) 155 | history_length = len(completion_enc.encode(history)) 156 | 157 | functions_length = len(completion_enc.encode(str(intent_functions))) 158 | func_args_length = len(completion_enc.encode(str(dd['function_call']['arguments']))) 159 | 160 | max_context_len = max_comp_model_tokens - query_length - MAX_OUTPUT_TOKENS - empty_prompt_length - history_length - functions_length - func_args_length - 1 161 | print(max_context_len, max_comp_model_tokens, query_length, MAX_OUTPUT_TOKENS, 
empty_prompt_length, history_length, functions_length, func_args_length) 162 | 163 | print("max_context_len", max_context_len) 164 | search_results = completion_enc.decode(completion_enc.encode(search_results)[:max_context_len]) 165 | 166 | messages.append( # adding assistant response to messages 167 | { 168 | "role": dd["role"], 169 | "function_call": { 170 | "name": dd["function_call"]["name"], 171 | "arguments": str(dd['function_call']['arguments']) 172 | }, 173 | "content": None 174 | } 175 | ) 176 | messages.append( 177 | { 178 | "role": "function", 179 | "name": dd["function_call"]["name"], 180 | "content": str(search_results), 181 | } 182 | ) 183 | print("search_results", len(search_results), search_results) 184 | print('total tokens', len(completion_enc.encode(str(messages)))) 185 | answer = openai_helpers.contact_openai(messages, completion_model = CHOSEN_COMP_MODEL) 186 | 187 | else: 188 | answer = dd['content'] 189 | 190 | return answer 191 | 192 | 193 | 194 | def run(self, query, lc_agent = None, history = None): 195 | 196 | answer = self.chat(query, lc_agent, history) 197 | print(answer) 198 | return answer -------------------------------------------------------------------------------- /utils/langchain_helpers/oldschoolsearch.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import tiktoken 3 | import numpy as np 4 | import os 5 | import time 6 | import logging 7 | import re 8 | 9 | 10 | import utils.langchain_helpers.simple_prompt 11 | 12 | from utils import openai_helpers 13 | from utils import redis_helpers 14 | from utils import helpers 15 | 16 | 17 | 18 | from langchain.prompts.chat import ( 19 | ChatPromptTemplate, 20 | HumanMessagePromptTemplate, 21 | MessagesPlaceholder, 22 | SystemMessagePromptTemplate, 23 | ) 24 | 25 | 26 | from utils.env_vars import * 27 | 28 | 29 | system_message = "The assistant is a super helpful assistant that plays the role of a linguistic professor and has ultra high attention to details." 30 | 31 | instruction = """From the above Question and Current Conversation, output search keywords to use in a search engine to get an answer for the Question. If the Question is not related to the Current Conversation, then do not use the Current Conversation when generating the Search Keywords. 
32 | Search Keywords:""" 33 | 34 | body = """ 35 | Current Conversation: 36 | {history} 37 | 38 | Question: {question} 39 | """ 40 | 41 | context_prompt = """ 42 | <|im_start|> 43 | {system_message} 44 | <|im_end|> 45 | <|im_start|>user 46 | 47 | Current Conversation: 48 | {history} 49 | 50 | Question: {question} 51 | 52 | {instruction} 53 | <|im_end|> 54 | <|im_start|>assistant 55 | """ 56 | 57 | 58 | class OldSchoolSearch(): 59 | 60 | 61 | def search(self, query, history, pre_context, filter_param=None, enable_unified_search=False, 62 | lc_agent = None, enable_cognitive_search=False, evaluate_step=True, 63 | topK=NUM_TOP_MATCHES, stream = False, verbose = False): 64 | 65 | redis_conn = redis_helpers.get_new_conn() 66 | 67 | completion_model = CHOSEN_COMP_MODEL 68 | embedding_model = CHOSEN_EMB_MODEL 69 | completion_enc = openai_helpers.get_encoder(completion_model) 70 | embedding_enc = openai_helpers.get_encoder(embedding_model) 71 | 72 | if verbose: print("Old Query: ", query) 73 | gen = openai_helpers.get_generation(completion_model) 74 | 75 | if history != '': 76 | 77 | if (gen == 4) or (gen == 3.5): 78 | messages = [ 79 | SystemMessagePromptTemplate.from_template(system_message).format(), 80 | HumanMessagePromptTemplate.from_template(body).format(history=history, question=query), 81 | HumanMessagePromptTemplate.from_template(instruction).format(), 82 | ] 83 | messages = openai_helpers.convert_messages_to_roles(messages) 84 | query = openai_helpers.contact_openai(messages) 85 | else: 86 | prompt = context_prompt.format(system_message=system_message, 87 | history=history, 88 | question=query, 89 | instruction=instruction) 90 | query = openai_helpers.contact_openai(prompt) 91 | 92 | if (gen == 4) or (gen == 3.5): 93 | p = '' 94 | for m in utils.langchain_helpers.simple_prompt.get_simple_prompt('', '', '', ''): p += m['content'] 95 | empty_prompt_length = len(completion_enc.encode(p)) 96 | else: 97 | empty_prompt_length = len(completion_enc.encode(utils.langchain_helpers.simple_prompt.get_simple_prompt('', '', '', ''))) 98 | 99 | 100 | if verbose: print("New Query: ", query) 101 | 102 | max_comp_model_tokens = openai_helpers.get_model_max_tokens(completion_model) 103 | max_emb_model_tokens = openai_helpers.get_model_max_tokens(embedding_model) 104 | 105 | if lc_agent.enable_unified_search: 106 | context = lc_agent.unified_search(query) 107 | elif enable_cognitive_search: 108 | context = lc_agent.agent_cog_search(query) 109 | # elif lc_agent.use_bing: 110 | # context = lc_agent.agent_bing_search(query) 111 | else: 112 | context = lc_agent.agent_redis_search(query) 113 | 114 | query = completion_enc.decode(completion_enc.encode(query)[:MAX_QUERY_TOKENS]) 115 | history = completion_enc.decode(completion_enc.encode(history)[:MAX_HISTORY_TOKENS]) 116 | pre_context = completion_enc.decode(completion_enc.encode(pre_context)[:PRE_CONTEXT]) 117 | 118 | context_length = len(completion_enc.encode(context)) 119 | query_length = len(completion_enc.encode(query)) 120 | history_length = len(completion_enc.encode(history)) 121 | pre_context_length = len(completion_enc.encode(pre_context)) 122 | 123 | max_context_len = max_comp_model_tokens - query_length - MAX_OUTPUT_TOKENS - empty_prompt_length - history_length - pre_context_length - 1 124 | 125 | context = completion_enc.decode(completion_enc.encode(context)[:max_context_len]) 126 | 127 | prompt = utils.langchain_helpers.simple_prompt.get_simple_prompt(context, query, history, pre_context) 128 | 129 | if verbose: 130 | 
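 | # (debug aid) dump the fully assembled prompt between the sentinel lines below so prompt-construction and token-budget issues are easy to spot | 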
print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$") 131 | print(prompt) 132 | print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$") 133 | 134 | if verbose: print("OSS OAI Call") 135 | answer = openai_helpers.contact_openai(prompt, completion_model, MAX_OUTPUT_TOKENS, stream=stream, verbose=verbose) 136 | 137 | return answer -------------------------------------------------------------------------------- /utils/langchain_helpers/simple_prompt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from datetime import datetime 4 | from utils import openai_helpers 5 | 6 | from langchain.prompts.chat import ( 7 | ChatPromptTemplate, 8 | HumanMessagePromptTemplate, 9 | MessagesPlaceholder, 10 | SystemMessagePromptTemplate, 11 | ) 12 | 13 | 14 | from utils.env_vars import * 15 | 16 | 17 | 18 | ## Original Prompt - too strict for OpenAI 19 | ## Answer the question using the above Context only, and if the answer is not contained within the Context above, say "Sorry, the query did not find a good match. Please rephrase your question": 20 | 21 | end_of_prev_prompt_tags=""" 22 | <|im_end|> 23 | <|im_start|>user 24 | """ 25 | 26 | append_tags = """ 27 | <|im_end|> 28 | <|im_start|>assistant 29 | """ 30 | 31 | strict_prompt = "If the facts below do not answer the question, say you don't know." 32 | 33 | instruction_template = """The system is an AI assistant that helps people find information in the provided Context and Current Conversation below. Only answer questions based on the facts listed below. {strict} 34 | Facts have sources, you MUST include the source name in the answer at the beginning before any text. If there are multiple sources, cite each one in their own square brackets. For example, use \"[folder3/info343][http://wikipedia.com]\" and not \"[folder3/info343,http://wikipedia.com]\". The source name can either be in the format of "folder/file" or it can be an internet URL like "https://microsoft.com". You must follow the following format strictly for the final answer: 35 | Answer: [folder1/file1][http://website][http://website2] the answer based on the facts or information. 36 | The current time and date will be provided for the assistant in the Context. The assistant can use the current date and time to derive the day and date for any time-related questions, such as this afternoon, this evening, today, tomorrow, this weekend or next week. 37 | The assistant must first decide if the question is related to the Current Conversation. If it is, then the assistant must answer the question based on the Current Conversation and the Context. If the question is not related to the Current Conversation, then the assistant must answer the question based on the Context only. 38 | 39 | The below are examples of final answers: 40 | 41 | Question: "what is mentioned about the Lost City hotel?" 42 | Answer: "[website] The Lost City Hotel is a luxurious accommodation in Dubai with an onsite waterpark and aquarium." 43 | 44 | Question: "what hotels are recommended in Las Vegas?" 45 | Answer: "[folder/Las Vegas.pdf] Margie’s Travel offers the following hotels in Las Vegas: The Volcano Hotel, The Fountain Hotel, The Canal Hotel. To book your trip to Las Vegas, visit www.margiestravel.com." 46 | 47 | Question: "who is Barack Obama?" 48 | Answer: '[http://website] Barack Obama is the 44th President of the United States of America.' 49 | 50 | Question: "who is Barack Obama?" 
51 | Answer: '[] Unfortunately, none of the sources I searched provided any specific information about Barack Obama.' 52 | 53 | Question: "how much are the one day pass tickets for Ferrari world?" 54 | Answer: "[] I'm sorry, I could not find the ticket prices for Ferrari World." 55 | 56 | """ 57 | 58 | 59 | body = """ 60 | Initial Context: 61 | {pre_context} 62 | 63 | Current Conversation: 64 | {history} 65 | 66 | Context: 67 | [https://www.timeanddate.com] The current date and time are {todays_time}. 68 | 69 | {context} 70 | 71 | Question: {query} 72 | Answer: 73 | """ 74 | 75 | 76 | def get_simple_prompt(context, query, history, pre_context): 77 | 78 | # logging.info(f"{CHOSEN_COMP_MODEL}, {GPT35_TURBO_COMPLETIONS_MODEL}, {CHOSEN_COMP_MODEL == GPT35_TURBO_COMPLETIONS_MODEL}") 79 | todays_time = datetime.now().strftime('%A %B %d, %Y %H:%M:%S') 80 | 81 | instruction_strict = instruction_template.format(strict=strict_prompt) 82 | instruction_simple = instruction_template.format(strict="") 83 | 84 | if RESTRICTIVE_PROMPT == 'yes': 85 | instruction = instruction_strict 86 | else: 87 | instruction = instruction_simple 88 | 89 | gen = openai_helpers.get_generation(CHOSEN_COMP_MODEL) 90 | 91 | # if (CHOSEN_COMP_MODEL == GPT4_MODEL) or (CHOSEN_COMP_MODEL == GPT4_32K_MODEL): 92 | if (gen == 4) or (gen == 3.5): 93 | messages = [ 94 | SystemMessagePromptTemplate.from_template(instruction_template).format(strict=strict_prompt), 95 | HumanMessagePromptTemplate.from_template(body).format(history=history, 96 | query=query, 97 | pre_context=pre_context, 98 | context=context, 99 | todays_time=todays_time), 100 | ] 101 | prompt = openai_helpers.convert_messages_to_roles(messages) 102 | elif (CHOSEN_COMP_MODEL == GPT35_TURBO_COMPLETIONS_MODEL): 103 | 104 | prompt = f""" 105 | <|im_start|>system 106 | {instruction} 107 | 108 | 109 | <|im_end|> 110 | <|im_start|>user 111 | 112 | 113 | Initial Context: 114 | {pre_context} 115 | 116 | Current Conversation: 117 | {history} 118 | 119 | Context: 120 | [https://www.timeanddate.com] The current date and time are {datetime.now().strftime('%A %B %d, %Y %H:%M:%S')}. 121 | 122 | {context} 123 | 124 | Question: {query} 125 | Answer: 126 | <|im_end|> 127 | <|im_start|>assistant 128 | """ 129 | 130 | else: 131 | 132 | prompt =f"""{instruction} 133 | 134 | Initial Context: 135 | {pre_context} 136 | 137 | Current Conversation: 138 | {history} 139 | 140 | Context: 141 | [https://www.timeanddate.com] The current date and time are {datetime.now().strftime('%A %B %d, %Y %H:%M:%S')}. 142 | 143 | {context} 144 | 145 | 146 | Question: {query} 147 | Answer: 148 | 149 | """ 150 | 151 | # logging.info(f"Using as prompt instruction: {instruction}") 152 | # print(f"Using as prompt instruction: {instruction}") 153 | 154 | return prompt -------------------------------------------------------------------------------- /utils/langchain_helpers/streaming_handler.py: -------------------------------------------------------------------------------- 1 | """Callback Handler streams to stdout on new llm token.""" 2 | import sys 3 | from typing import Any, Dict, List, Union 4 | import re 5 | from langchain.callbacks.base import BaseCallbackHandler 6 | from langchain.schema import AgentAction, AgentFinish, LLMResult 7 | 8 | 9 | class StreamingSocketIOCallbackHandler(BaseCallbackHandler): 10 | """Callback handler for streaming. 
Only works with LLMs that support streaming.""" 11 | 12 | def __init__(self, socketio_obj, connection_id): 13 | self.socketio_obj = socketio_obj 14 | self.connection_id = connection_id 15 | super().__init__() 16 | 17 | def on_llm_start( 18 | self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any 19 | ) -> None: 20 | """Run when LLM starts running.""" 21 | self.buffer = '' 22 | self.partial_answer = '' 23 | self.num_partial_answer = 0 24 | 25 | def output_partial_answer(self): 26 | self.partial_answer = self.partial_answer.replace('":', '').replace('"', '').replace('}', '').replace('```', '').replace(':', '').replace('\\n', '
') 27 | self.socketio_obj.emit('token', self.partial_answer, to=self.connection_id) 28 | self.partial_answer = '' 29 | self.num_partial_answer = 0 30 | 31 | def process_new_token(self, token): 32 | self.partial_answer += token # 33 | self.num_partial_answer += 1 34 | 35 | source_matches = re.findall(r'\[(.*?)\]', self.partial_answer) 36 | for s in source_matches: 37 | self.partial_answer = self.partial_answer.replace('['+s+']', '') 38 | 39 | if ('[' in self.partial_answer) and (']' not in self.partial_answer): 40 | return 41 | else: 42 | if (self.num_partial_answer >= 5) and (not self.partial_answer.endswith('\\')): 43 | self.output_partial_answer() 44 | 45 | 46 | def on_llm_new_token(self, token: str, **kwargs: Any) -> None: 47 | """Run on new LLM token. Only available when streaming is enabled.""" 48 | self.buffer += token 49 | 50 | if '"action": "Final Answer"' in self.buffer: 51 | if '"action_input":' in self.buffer: 52 | self.process_new_token(token) 53 | 54 | if 'Final Answer:' in self.buffer: 55 | self.process_new_token(token) 56 | 57 | 58 | def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None: 59 | """Run when LLM ends running.""" 60 | self.output_partial_answer() 61 | 62 | def on_llm_error( 63 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 64 | ) -> None: 65 | """Run when LLM errors.""" 66 | 67 | def on_chain_start( 68 | self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any 69 | ) -> None: 70 | """Run when chain starts running.""" 71 | 72 | def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None: 73 | """Run when chain ends running.""" 74 | 75 | def on_chain_error( 76 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 77 | ) -> None: 78 | """Run when chain errors.""" 79 | 80 | def on_tool_start( 81 | self, serialized: Dict[str, Any], input_str: str, **kwargs: Any 82 | ) -> None: 83 | """Run when tool starts running.""" 84 | 85 | def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any: 86 | """Run on agent action.""" 87 | 88 | def on_tool_end(self, output: str, **kwargs: Any) -> None: 89 | """Run when tool ends running.""" 90 | 91 | def on_tool_error( 92 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 93 | ) -> None: 94 | """Run when tool errors.""" 95 | 96 | def on_text(self, text: str, **kwargs: Any) -> None: 97 | """Run on arbitrary text.""" 98 | 99 | def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None: 100 | """Run on agent end.""" 101 | 102 | 103 | 104 | 105 | class StreamingStdOutCallbackHandler(BaseCallbackHandler): 106 | """Callback handler for streaming. 
Only works with LLMs that support streaming.""" 107 | 108 | buffer: str = '' 109 | partial_answer: str = '' 110 | num_partial_answer: int = 0 111 | 112 | 113 | def on_llm_start( 114 | self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any 115 | ) -> None: 116 | """Run when LLM starts running.""" 117 | self.buffer = '' 118 | self.partial_answer = '' 119 | self.num_partial_answer = 0 120 | 121 | 122 | def output_partial_answer(self): 123 | self.partial_answer = self.partial_answer.replace('":', '').replace('"', '').replace('}', '').replace('```', '').replace(':', '') 124 | sys.stdout.write(self.partial_answer) 125 | sys.stdout.flush() 126 | self.partial_answer = '' 127 | self.num_partial_answer = 0 128 | 129 | def process_new_token(self, token): 130 | self.partial_answer += token # 131 | self.num_partial_answer += 1 132 | 133 | source_matches = re.findall(r'\[(.*?)\]', self.partial_answer) 134 | for s in source_matches: 135 | self.partial_answer = self.partial_answer.replace('['+s+']', '') 136 | 137 | if ('[' in self.partial_answer) and (']' not in self.partial_answer): 138 | return 139 | else: 140 | if (self.num_partial_answer >= 5) and (not self.partial_answer.endswith('\\')): 141 | self.output_partial_answer() 142 | 143 | 144 | def on_llm_new_token(self, token: str, **kwargs: Any) -> None: 145 | """Run on new LLM token. Only available when streaming is enabled.""" 146 | self.buffer += token 147 | 148 | if '"action": "Final Answer"' in self.buffer: 149 | if '"action_input":' in self.buffer: 150 | self.process_new_token(token) 151 | 152 | if 'Final Answer:' in self.buffer: 153 | self.process_new_token(token) 154 | 155 | 156 | def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None: 157 | """Run when LLM ends running.""" 158 | self.output_partial_answer() 159 | 160 | 161 | 162 | def on_llm_error( 163 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 164 | ) -> None: 165 | """Run when LLM errors.""" 166 | 167 | def on_chain_start( 168 | self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any 169 | ) -> None: 170 | """Run when chain starts running.""" 171 | 172 | def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None: 173 | """Run when chain ends running.""" 174 | 175 | def on_chain_error( 176 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 177 | ) -> None: 178 | """Run when chain errors.""" 179 | 180 | def on_tool_start( 181 | self, serialized: Dict[str, Any], input_str: str, **kwargs: Any 182 | ) -> None: 183 | """Run when tool starts running.""" 184 | 185 | def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any: 186 | """Run on agent action.""" 187 | pass 188 | 189 | def on_tool_end(self, output: str, **kwargs: Any) -> None: 190 | """Run when tool ends running.""" 191 | 192 | def on_tool_error( 193 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 194 | ) -> None: 195 | """Run when tool errors.""" 196 | 197 | def on_text(self, text: str, **kwargs: Any) -> None: 198 | """Run on arbitrary text.""" 199 | 200 | def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None: 201 | """Run on agent end.""" 202 | -------------------------------------------------------------------------------- /utils/language.py: -------------------------------------------------------------------------------- 1 | 2 | import requests 3 | import uuid 4 | import os 5 | import logging 6 | 7 | import typing 8 | from azure.core.credentials import AzureKeyCredential 9 | from azure.ai.textanalytics 
import TextAnalyticsClient 10 | 11 | from utils.env_vars import * 12 | 13 | def detect_content_language(content): 14 | path = '/detect' 15 | constructed_url = TRANSLATION_ENDPOINT + path 16 | 17 | params = { 18 | 'api-version': '3.0', 19 | } 20 | 21 | headers = { 22 | 'Ocp-Apim-Subscription-Key': TRANSLATION_API_KEY, 23 | 'Ocp-Apim-Subscription-Region': TRANSLATION_LOCATION, 24 | 'Content-type': 'application/json', 25 | 'X-ClientTraceId': str(uuid.uuid4()) 26 | } 27 | 28 | # You can pass more than one object in body. 29 | body = [{'text': content}] 30 | 31 | request = requests.post(constructed_url, params=params, headers=headers, json=body) 32 | response = request.json() 33 | 34 | try: 35 | lang = response[0]['language'] 36 | return lang 37 | except: 38 | return 'xx' 39 | 40 | 41 | 42 | 43 | def translate(text, from_lang, to_lang = 'en'): 44 | 45 | path = '/translate' 46 | constructed_url = TRANSLATION_ENDPOINT + path 47 | body = [{'text': text}] 48 | 49 | params = { 50 | 'api-version': '3.0', 51 | 'from': from_lang, 52 | 'to': [to_lang] 53 | } 54 | 55 | headers = { 56 | 'Ocp-Apim-Subscription-Key': TRANSLATION_API_KEY, 57 | 'Ocp-Apim-Subscription-Region': TRANSLATION_LOCATION, 58 | 'Content-type': 'application/json', 59 | 'X-ClientTraceId': str(uuid.uuid4()) 60 | } 61 | 62 | request = requests.post(constructed_url, params=params, headers=headers, json=body) 63 | response = request.json() 64 | 65 | try: 66 | # print(response) 67 | return response[0]['translations'][0]['text'] 68 | except Exception as e: 69 | print(e) 70 | return response 71 | 72 | 73 | 74 | def extract_entities(text): 75 | 76 | text_analytics_client = TextAnalyticsClient(endpoint=COG_SERV_ENDPOINT, credential=AzureKeyCredential(COG_SERV_KEY)) 77 | reviews = [text] 78 | 79 | result = text_analytics_client.recognize_entities(reviews) 80 | result = [review for review in result if not review.is_error] 81 | organization_to_reviews: typing.Dict[str, typing.List[str]] = {} 82 | 83 | entities = [] 84 | 85 | for idx, review in enumerate(result): 86 | for entity in review.entities: 87 | entities.append(entity.text) 88 | #print(entity.text) 89 | 90 | return entities -------------------------------------------------------------------------------- /utils/openai_helpers.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import tiktoken 3 | import numpy as np 4 | import os 5 | import time 6 | import logging 7 | 8 | from tenacity import ( 9 | retry, 10 | stop_after_attempt, 11 | wait_random_exponential, 12 | ) 13 | 14 | 15 | # from langchain.prompts.chat import ( 16 | # ChatPromptTemplate, 17 | # HumanMessagePromptTemplate, 18 | # MessagesPlaceholder, 19 | # SystemMessagePromptTemplate, 20 | # AIMessagePromptTemplate 21 | # ) 22 | 23 | 24 | # from langchain.schema import ( 25 | # AIMessage, 26 | # HumanMessage, 27 | # SystemMessage 28 | # ) 29 | 30 | from utils.env_vars import * 31 | 32 | 33 | import openai 34 | openai.api_version = OPENAI_API_VERSION 35 | 36 | 37 | 38 | system_start_prompt = "<|im_start|>system " 39 | user_start_prompt = "<|im_start|>user " 40 | assistant_start_prompt = "<|im_start|>assistant " 41 | end_prompt = "<|im_end|> " 42 | 43 | 44 | system_start_prompt=""" 45 | <|im_end|> 46 | <|im_start|>user 47 | """ 48 | 49 | append_tags = """ 50 | <|im_end|> 51 | <|im_start|>assistant 52 | """ 53 | 54 | 55 | 56 | def check_model_deployment(oai_model): 57 | try: 58 | model_exists = False 59 | result = openai.Deployment.list() 60 | for deployment in result.data: 61 | if 
(deployment["model"] == oai_model): 62 | model_exists = True 63 | #logging.info(f"Found deployment {deployment}") 64 | return deployment["id"] 65 | 66 | 67 | if not model_exists: 68 | openai.Deployment.create(model=oai_model, scale_settings={"scale_type":"standard"}) 69 | time.sleep(30) 70 | assert model_exists, f"Model {oai_model} is not deployed, deploying now" 71 | 72 | except Exception as e: 73 | 74 | print(e) 75 | counter = 0 76 | deployed = False 77 | 78 | while counter < 2: 79 | time.sleep(2) 80 | result = openai.Deployment.list() 81 | print(f"Found {len(result.data)} deployments") 82 | 83 | for deployment in result.data: 84 | logging.info(f"OpenAI Deployment Exception --- Found deployment {deployment}") 85 | if (deployment["status"] == "succeeded") and (deployment["model"] == oai_model): 86 | deployed = True 87 | print(f"The right model {deployment['model']} was found") 88 | return deployment["id"] 89 | 90 | if deployed: break 91 | 92 | counter += 1 93 | 94 | return "" 95 | 96 | 97 | 98 | # completion_deployment_id = check_model_deployment(CHOSEN_COMP_MODEL) 99 | # embedding_deployment_id = check_model_deployment(CHOSEN_EMB_MODEL) 100 | 101 | 102 | 103 | def experiment_prompt(context, query): 104 | 105 | prompt =f""" 106 | Context: {context} 107 | 108 | Question: {query} 109 | 110 | 111 | Answer the question using the above Context only, and if the answer is not contained within the Context above, say "Sorry, I don't know": 112 | """ 113 | 114 | 115 | 116 | def get_summ_prompt(text): 117 | 118 | prompt =f""" 119 | Summarize the following text. 120 | 121 | Text: 122 | ### 123 | {text} 124 | ### 125 | 126 | Summary: 127 | """ 128 | 129 | return prompt 130 | 131 | 132 | def get_generation(model): 133 | if model == "text-davinci-003": 134 | return 3 135 | elif model == "gpt-35-turbo": 136 | return 3.5 137 | elif model == "gpt-35-turbo-16k": 138 | return 3.5 139 | elif model == "gpt-4-32k": 140 | return 4 141 | elif model == "gpt-4": 142 | return 4 143 | else: 144 | assert False, f"Generation unknown for model {model}" 145 | 146 | 147 | 148 | def convert_messages_to_roles(messages): 149 | roles = [] 150 | for m in messages: 151 | if isinstance(m, HumanMessage): 152 | roles.append({'role':'user', 'content': m.content}) 153 | elif isinstance(m, AIMessage): 154 | roles.append({'role':'assistant', 'content': m.content}) 155 | elif isinstance(m, SystemMessage): 156 | roles.append({'role':'system', 'content': m.content}) 157 | elif isinstance(m, Messages): 158 | roles.append({'role':'user', 'content': m.content}) 159 | else: 160 | assert False, f"Unknown message type {type(m)}" 161 | 162 | return roles 163 | 164 | 165 | def get_model_max_tokens(model): 166 | if model == "text-search-davinci-doc-001": 167 | return DAVINCI_003_EMB_MAX_TOKENS 168 | elif model == "text-search-davinci-query-001": 169 | return DAVINCI_003_EMB_MAX_TOKENS 170 | elif model == "text-davinci-003": 171 | return DAVINCI_003_MODEL_MAX_TOKENS 172 | elif model == "text-embedding-ada-002": 173 | return ADA_002_MODEL_MAX_TOKENS 174 | elif model == "gpt-35-turbo": 175 | return GPT35_TURBO_COMPLETIONS_MAX_TOKENS 176 | elif model == "gpt-35-turbo-16k": 177 | return GPT35_TURBO_16K_COMPLETIONS_MAX_TOKENS 178 | elif model == "gpt-4-32k": 179 | return GPT4_32K_COMPLETIONS_MODEL_MAX_TOKENS 180 | elif model == "gpt-4": 181 | return GPT4_COMPLETIONS_MODEL_MAX_TOKENS 182 | else: 183 | return GPT35_TURBO_COMPLETIONS_MAX_TOKENS 184 | 185 | 186 | def get_encoding_name(model): 187 | if model == "text-search-davinci-doc-001": 188 | return 
"p50k_base" 189 | elif model == "text-embedding-ada-002": 190 | return "cl100k_base" 191 | elif model == "gpt-35-turbo": 192 | return "cl100k_base" 193 | elif model == "gpt-35-turbo-16k": 194 | return "cl100k_base" 195 | elif model == "gpt-4-32k": 196 | return "cl100k_base" 197 | elif model == "gpt-4": 198 | return "cl100k_base" 199 | elif model == "text-davinci-003": 200 | return "p50k_base" 201 | else: 202 | return "gpt2" 203 | 204 | 205 | def get_encoder(model): 206 | if model == "text-search-davinci-doc-001": 207 | return tiktoken.get_encoding("p50k_base") 208 | elif model == "text-embedding-ada-002": 209 | return tiktoken.get_encoding("cl100k_base") 210 | elif model == "gpt-35-turbo": 211 | return tiktoken.get_encoding("cl100k_base") 212 | elif model == "gpt-35-turbo-16k": 213 | return tiktoken.get_encoding("cl100k_base") 214 | elif model == "gpt-4-32k": 215 | return tiktoken.get_encoding("cl100k_base") 216 | elif model == "gpt-4": 217 | return tiktoken.get_encoding("cl100k_base") 218 | elif model == "text-davinci-003": 219 | return tiktoken.get_encoding("p50k_base") 220 | else: 221 | return tiktoken.get_encoding("gpt2") 222 | 223 | 224 | 225 | def get_model_dims(embedding_model): 226 | if embedding_model == "text-search-davinci-doc-001": 227 | return DAVINCI_003_EMBED_NUM_DIMS 228 | elif embedding_model == "text-embedding-ada-002": 229 | return ADA_002_EMBED_NUM_DIMS 230 | else: 231 | return ADA_002_EMBED_NUM_DIMS 232 | 233 | 234 | def get_token_length(text, model = CHOSEN_EMB_MODEL): 235 | enc = get_encoder(model) 236 | return len(enc.encode(text)) 237 | 238 | 239 | 240 | # @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(30)) 241 | def get_openai_embedding(query, embedding_model = CHOSEN_EMB_MODEL): 242 | return openai.Embedding.create(input=query, engine=embedding_model)['data'][0]['embedding'] 243 | 244 | 245 | 246 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(20)) 247 | def openai_summarize(text, completion_model, max_output_tokens = MAX_OUTPUT_TOKENS, lang='en'): 248 | prompt = get_summ_prompt(text) 249 | return contact_openai(prompt, completion_model, max_output_tokens) 250 | 251 | 252 | 253 | # @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(7)) 254 | def contact_openai(prompt, completion_model = CHOSEN_COMP_MODEL, max_output_tokens = MAX_OUTPUT_TOKENS, functions=None, stream = False, verbose = False): 255 | if verbose: print("\n########################### Calling OAI Completion API - start call") 256 | 257 | 258 | b = time.time() 259 | openai.api_version = "2023-07-01-preview" 260 | 261 | if not isinstance(prompt, list): 262 | prompt = [{'role':'user', 'content': prompt}] 263 | 264 | if functions is None: 265 | resp = openai.ChatCompletion.create( 266 | messages=prompt, 267 | temperature=TEMPERATURE, 268 | max_tokens=max_output_tokens, 269 | engine=completion_model, 270 | stream = stream 271 | ) 272 | else: 273 | resp = openai.ChatCompletion.create( 274 | messages=prompt, 275 | temperature=TEMPERATURE, 276 | max_tokens=max_output_tokens, 277 | engine=completion_model, 278 | functions=functions, 279 | function_call="auto", 280 | stream = stream 281 | ) 282 | a = time.time() 283 | if verbose: print(f"Using GPT-4 - Chat Completion - with stream {stream} - OpenAI response time: {a-b}") 284 | if stream: return resp 285 | else: 286 | if functions is None: 287 | return resp["choices"][0]["message"]['content'].strip(" \n") 288 | else: 289 | return resp 290 | 291 | 292 | 293 | 294 | 295 | 
-------------------------------------------------------------------------------- /utils/redis_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import redis 4 | from redis import Redis 5 | import logging 6 | import copy 7 | from redis.commands.search.field import VectorField 8 | from redis.commands.search.field import TextField 9 | from redis.commands.search.field import TagField 10 | from redis.commands.search.query import Query 11 | from redis.commands.search.result import Result 12 | 13 | 14 | ## https://redis-py.readthedocs.io/en/stable/commands.html 15 | ## https://redis.io/docs/stack/search/reference/query_syntax/ 16 | 17 | 18 | 19 | from utils.kb_doc import KB_Doc 20 | 21 | from tenacity import ( 22 | retry, 23 | stop_after_attempt, 24 | wait_random_exponential, 25 | ) 26 | 27 | 28 | from utils.env_vars import * 29 | 30 | 31 | def get_model_dims(embedding_model): 32 | if embedding_model == "text-search-davinci-doc-001": 33 | return DAVINCI_003_EMBED_NUM_DIMS 34 | elif embedding_model == "text-embedding-ada-002": 35 | return ADA_002_EMBED_NUM_DIMS 36 | else: 37 | return ADA_002_EMBED_NUM_DIMS 38 | 39 | 40 | def create_search_index (redis_new_conn, vector_field_name, number_of_vectors, vector_dimensions=512, distance_metric='L2'): 41 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 42 | 43 | M=40 44 | EF=200 45 | 46 | fields = [VectorField(vector_field_name, "HNSW", {"TYPE": "FLOAT32", "DIM": vector_dimensions, "DISTANCE_METRIC": distance_metric, "INITIAL_CAP": number_of_vectors, "M": M, "EF_CONSTRUCTION": EF})] + \ 47 | [TextField(f) for f in KB_Doc().get_fields() if f != VECTOR_FIELD_IN_REDIS] 48 | 49 | redis_new_conn.ft(REDIS_INDEX_NAME).create_index(fields) 50 | 51 | 52 | def flush_cached_values_only(): 53 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 54 | 55 | redis_conn = get_new_conn() 56 | ks = redis_conn.keys() 57 | print(f"Found {len(ks)} values that are cached in Redis") 58 | 59 | for k in ks: 60 | ttl = redis_conn.ttl(k) 61 | if ttl > 0: 62 | print(f"Key has {ttl} seconds to live, deleting...") 63 | redis_conn.expire(name=k, time=1) 64 | 65 | 66 | 67 | def redis_reset_index(redis_new_conn): 68 | #flush all data 69 | redis_new_conn.flushall() 70 | 71 | #create flat index & load vectors 72 | create_search_index(redis_new_conn,VECTOR_FIELD_IN_REDIS, NUMBER_PRODUCTS_INDEX, get_model_dims(CHOSEN_EMB_MODEL), 'COSINE') 73 | 74 | 75 | def test_redis(redis_new_conn): 76 | if (REDIS_ADDR is None) or (REDIS_ADDR == '') or (USE_REDIS_CACHE != 1): return None 77 | print("test redis") 78 | 79 | try: 80 | out = redis_new_conn.ft(REDIS_INDEX_NAME).info() 81 | print(f"Found Redis Index {REDIS_INDEX_NAME}") 82 | except Exception as e: 83 | # print(f"Redis Index {REDIS_INDEX_NAME} not found. Creating a new index.") 84 | logging.error(f"Redis Index {REDIS_INDEX_NAME} not found. 
Creating a new index.") 85 | redis_reset_index(redis_new_conn) 86 | 87 | 88 | def get_new_conn(): 89 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 90 | 91 | if REDIS_PASSWORD == '': 92 | redis_conn = Redis(host = REDIS_ADDR, port = REDIS_PORT) 93 | else: 94 | redis_conn = redis.StrictRedis(host=REDIS_ADDR, port=int(REDIS_PORT), password=REDIS_PASSWORD, ssl=True) 95 | 96 | #print('Connected to redis', redis_conn) 97 | test_redis(redis_conn) 98 | 99 | return redis_conn 100 | 101 | 102 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(4)) 103 | def redis_upsert_embedding(redis_conn, e_dict): 104 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 105 | 106 | try: 107 | #embeds = np.array(e[VECTOR_FIELD_IN_REDIS]).astype(np.float32).tobytes() 108 | #meta = {'text_en': e['text_en'], 'text':e['text'], 'doc_url': e['doc_url'], 'timestamp': e['timestamp'], VECTOR_FIELD_IN_REDIS:embeds} 109 | e = copy.deepcopy(e_dict) 110 | 111 | for k in e: 112 | if isinstance(e[k], list) and (len(e[k]) > 0): 113 | if isinstance(e[k][0], float): e[k] = np.array(e[k]).astype(np.float32).tobytes() 114 | if isinstance(e[k][0], str): e[k] = ', '.join(e[k]) 115 | 116 | # e[VECTOR_FIELD_IN_REDIS] = np.array(e[VECTOR_FIELD_IN_REDIS]).astype(np.float32).tobytes() 117 | 118 | for k in e: 119 | if isinstance(e[k], list): 120 | print(e[k]) 121 | 122 | p = redis_conn.pipeline(transaction=False) 123 | p.hset(e['id'], mapping=e) 124 | p.execute() 125 | return 1 126 | 127 | except Exception as e: 128 | print(f"Embedding Except: {e}") 129 | logging.error(f"Embedding Except: {e}") 130 | return 0 131 | 132 | 133 | 134 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(4)) 135 | def redis_query_embedding_index(redis_conn, query_emb, t_id, topK=5, filter_param=None): 136 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 137 | 138 | if (filter_param is None) or (filter_param == '*'): 139 | filter_param = '*' 140 | else: 141 | if not filter_param.startswith('@'): 142 | filter_param = '@' + filter_param 143 | 144 | filter_param = filter_param.replace('-', '\-') 145 | fields = list(KB_Doc().get_fields()) + ['vector_score'] 146 | query_vector = np.array(query_emb).astype(np.float32).tobytes() 147 | query_string = f'({filter_param})=>[KNN {topK} @{VECTOR_FIELD_IN_REDIS} $vec_param AS vector_score]' 148 | 149 | q = Query(query_string).sort_by('vector_score').paging(0,topK).return_fields(*fields).dialect(2) 150 | params_dict = {"vec_param": query_vector} 151 | results = redis_conn.ft(REDIS_INDEX_NAME).search(q, query_params = params_dict) 152 | 153 | return [{k: match.__dict__[k] for k in (set(list(match.__dict__.keys())) - set([VECTOR_FIELD_IN_REDIS]))} for match in results.docs if match.id != t_id] 154 | 155 | 156 | 157 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(4)) 158 | def redis_set(redis_conn, key, field, value, expiry = None, verbose = False, force=False): 159 | 160 | print("Entering REDIS SET", REDIS_ADDR, force) 161 | if (REDIS_ADDR is not None) and (REDIS_ADDR != '') and (force == True): 162 | pass 163 | else: 164 | if (REDIS_ADDR is None) or (REDIS_ADDR == '') or (USE_REDIS_CACHE != 1): return None 165 | 166 | print("Executing REDIS SET") 167 | key = key.replace('"', '') 168 | res = redis_conn.hset(key, field, value) 169 | 170 | if expiry is not None: 171 | redis_conn.expire(name=key, time=expiry) 172 | if verbose: print("\nSetting Redis Key: ", key, field, expiry) 173 | return res 174 | 175 | 176 | 177 | 
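# ---------------------------------------------------------------------------
# NOTE: a minimal sketch of the caching pattern that redis_set/redis_get
# support, not part of the original module. cached_answer is illustrative
# only; it assumes USE_REDIS_CACHE=1 and a reachable Redis at REDIS_ADDR.

def cached_answer(redis_conn, prompt, compute_fn):
    import hashlib
    # Key the cache on a hash of the prompt so identical requests are reused.
    key = hashlib.md5(prompt.encode('utf-8')).hexdigest()
    hit = redis_get(redis_conn, key, 'answer', expiry=CONVERSATION_TTL_SECS)
    if hit is not None:
        return hit.decode('utf-8') if isinstance(hit, bytes) else hit
    answer = compute_fn(prompt)
    redis_set(redis_conn, key, 'answer', answer, expiry=CONVERSATION_TTL_SECS)
    return answer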
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(4)) 178 | def redis_get(redis_conn, key, field, expiry = CONVERSATION_TTL_SECS, verbose = False, force=False): 179 | 180 | print("Entering REDIS GET", REDIS_ADDR, force) 181 | if (REDIS_ADDR is not None) and (REDIS_ADDR != '') and (force == True): 182 | pass 183 | else: 184 | if (REDIS_ADDR is None) or (REDIS_ADDR == '') or (USE_REDIS_CACHE != 1): return None 185 | print("Executing REDIS GET") 186 | 187 | key = key.replace('"', '') 188 | if verbose: print("\nGetting Redis Key: ", key, field) 189 | if redis_conn.ttl(key) > 0: redis_conn.expire(name=key, time=expiry) 190 | return redis_conn.hget(key, field) 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /utils/storage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import urllib 5 | from requests.utils import requote_uri 6 | from datetime import datetime, timedelta 7 | import logging 8 | import smart_open 9 | from azure.storage.blob import BlobServiceClient, BlobClient 10 | from azure.storage.blob import ContainerClient, __version__ 11 | from azure.storage.blob import generate_blob_sas, BlobSasPermissions 12 | import copy 13 | import uuid 14 | import json 15 | 16 | from utils.env_vars import * 17 | 18 | 19 | def get_kb_container_client(): 20 | blob_service_client = BlobServiceClient.from_connection_string(KB_BLOB_CONN_STR) 21 | return blob_service_client 22 | 23 | 24 | blob_service_client = get_kb_container_client() 25 | 26 | 27 | def get_container_name(url): 28 | return url.split('.blob.core.windows.net/')[1].split('/')[0] 29 | 30 | 31 | def create_sas_from_container_and_blob(container, blob_name): 32 | blob_client = blob_service_client.get_blob_client(container=container, blob=blob_name) 33 | 34 | token = generate_blob_sas( 35 | account_name=blob_client.account_name, 36 | account_key=blob_client.credential.account_key, 37 | container_name=container, 38 | blob_name=blob_name, 39 | permission=BlobSasPermissions(read=True), 40 | expiry=datetime.utcnow() + timedelta(hours=20*365*24), 41 | ) 42 | 43 | sas_url = blob_client.url + '?' 
+ token
44 | #print(f"Processing now '{blob_name}' with SAS URL {sas_url}")
45 | return sas_url
46 |
47 |
48 | def get_filename(blob_path):
49 | try:
50 | return urllib.parse.unquote(os.path.basename(blob_path.split('?')[0]))
51 | except Exception:
52 | return 'default_file_name_exception'
53 |
54 |
55 |
56 | def create_sas(blob_path):
57 | blob_name = get_filename(blob_path)
58 | container = get_container_name(blob_path)
59 | return create_sas_from_container_and_blob(container, blob_name)
60 |
61 |
62 |
63 |
64 | def save_json_document(data_dict, container = OUTPUT_BLOB_CONTAINER):
65 |
66 | ret_dict = {}
67 |
68 | new_doc = copy.deepcopy(data_dict)
69 |
70 | new_doc['id'] = new_doc.get('id', str(uuid.uuid4()))
71 | new_doc['categoryId'] = CATEGORYID
72 | new_doc['timestamp'] = new_doc.get('timestamp', datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
73 | new_doc['doc_url'] = new_doc.get('doc_url', f'https://microsoft.com/{str(uuid.uuid4())}')
74 |
75 | if 'content' in new_doc.keys():
76 | del new_doc['content']
77 |
78 | container_client = blob_service_client.get_container_client(container)
79 |
80 | try:
81 | container_properties = container_client.get_container_properties()
82 | except Exception as e:
83 | container_client.create_container()
84 |
85 | blob_name = urllib.parse.unquote(os.path.basename(new_doc['doc_url'].split('?')[0]))
86 | pre, ext = os.path.splitext(blob_name)
87 | blob_name = pre + '.json'
88 | blob_client = container_client.get_blob_client(blob=blob_name)
89 | blob_client.upload_blob(json.dumps(new_doc, indent=4), overwrite=True)
90 | ret_dict['status'] = f"Document {new_doc['id']} was successfully saved to the {container} container"
91 | logging.info(ret_dict['status'])
92 |
93 | return ret_dict
94 |
95 |
96 |
97 |
98 | def list_documents(container):
99 | container_client = blob_service_client.get_container_client(container)
100 | generator = container_client.list_blobs()
101 | blobs = []
102 | for blob in generator:
103 | blob_client = blob_service_client.get_blob_client(container=container, blob=blob.name)
104 | blobs.append(blob_client.url)
105 |
106 | return blobs
107 |
108 |
109 | def get_document_url(container, filename):
110 | url = f"https://{blob_service_client.account_name}.blob.core.windows.net/{container}/{filename}"
111 | return requote_uri(url)
112 |
113 |
114 | def get_document(container, filename):
115 |
116 | transport_params = {
117 | 'client': blob_service_client
118 | }
119 |
120 | with smart_open.open(f"azure://{container}/{filename}", transport_params=transport_params) as fin:
121 | data = fin.read()
122 |
123 | return data
124 |
125 |
126 | def download_document(url, as_text = True):
127 |
128 | blob_name = get_filename(url)  # derive the blob name and container from the URL before building the client
129 | container = get_container_name(url)
130 | blob_client = blob_service_client.get_blob_client(container=container, blob=blob_name)
131 | download_stream = blob_client.download_blob()
132 |
133 | if as_text:
134 | return download_stream.content_as_text()
135 | else:
136 | return download_stream.content_as_bytes()
137 |
138 |
139 | -------------------------------------------------------------------------------- /utils/summarization.py: --------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import numpy as np
4 | import pandas as pd
5 | import urllib
6 | from datetime import datetime, timedelta
7 | import logging
8 | import copy
9 | import uuid
10 | import json
11 | import openpyxl
12 | import time
13 |
14 | from langchain
import OpenAI, PromptTemplate, LLMChain 15 | from langchain.text_splitter import CharacterTextSplitter 16 | from langchain.chains.mapreduce import MapReduceChain 17 | from langchain.prompts import PromptTemplate 18 | from langchain.text_splitter import TokenTextSplitter, TextSplitter 19 | from langchain.chains.summarize import load_summarize_chain 20 | from langchain.docstore.document import Document 21 | from langchain.callbacks.base import CallbackManager 22 | 23 | from utils import openai_helpers 24 | from utils import helpers 25 | from utils import fr_helpers 26 | 27 | from utils.env_vars import * 28 | 29 | 30 | ## Use with Python 3.9+ ONLY 31 | # """ 32 | # from utils import km_agents 33 | # from utils import openai_helpers 34 | # from utils import fr_helpers 35 | # from utils import summarization 36 | # folder = './docs_to_summarize' 37 | # ref_summ_df = summarization.summarize_folder(folder, mode='refine', verbose=False) 38 | # mp_summ_df = summarization.summarize_folder(folder, mode='map_reduce', verbose=False) 39 | # """ 40 | 41 | 42 | 43 | mapreduce_prompt_template = """The maximum output is about 500 to 750 tokens, so make sure to take advantage of this to the maximum.\n 44 | Write an elaborate summary of 3 paragraphs of the following: 45 | 46 | 47 | {text} 48 | 49 | 50 | SUMMARY:""" 51 | 52 | 53 | refine_prompt_template = """Write an elaborate summary of 3 paragraphs of the following: 54 | 55 | {text} 56 | 57 | """ 58 | 59 | refine_template = ( 60 | "Your job is to produce a final summary of 3 paragraphs that is elaborate and rich in details.\n" 61 | "The maximum output is about 500 to 750 tokens, so make sure to take advantage of this to the maximum.\n" 62 | "We have provided an existing summary up to a certain point: {existing_answer}\n" 63 | "We have the opportunity to refine the existing summary." 64 | "(only if needed) with some more context below.\n" 65 | "------------\n" 66 | "{text}\n" 67 | "------------\n" 68 | "Given the new context, refine the original summary." 69 | "If the context isn't useful, return the original summary." 
70 | )
71 |
72 |
73 |
74 | def chunk_doc(all_text, mode='refine', model=CHOSEN_COMP_MODEL, max_output_tokens=MAX_OUTPUT_TOKENS, chunk_overlap=500):
75 |
76 | enc_name = openai_helpers.get_encoding_name(model)
77 | enc = openai_helpers.get_encoder(model)
78 |
79 | max_tokens = openai_helpers.get_model_max_tokens(model)
80 |
81 | if mode == 'refine':
82 | max_tokens = max_tokens - len(enc.encode(refine_prompt_template)) - len(enc.encode(refine_template)) - 2*max_output_tokens - chunk_overlap
83 | elif mode == 'map_reduce':
84 | max_tokens = max_tokens - len(enc.encode(mapreduce_prompt_template)) - max_output_tokens - chunk_overlap
85 | else:
86 | raise Exception('Invalid mode')
87 |
88 | text_splitter = TokenTextSplitter(encoding_name=enc_name, chunk_size = max_tokens, chunk_overlap=chunk_overlap)
89 |
90 | texts = text_splitter.split_text(all_text)
91 | docs = [Document(page_content=t) for t in texts]
92 |
93 | enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL)
94 |
95 | l_arr = []
96 | for d in texts:
97 | l_arr.append(str(len(enc.encode(d))))
98 |
99 | print("Chunks Generated", len(docs), ' | max_tokens', max_tokens, " | Chunk Lengths:", ', '.join(l_arr))
100 |
101 | return docs
102 |
103 |
104 | def clean_up_text(text):
105 | text = text.replace('....', '')
106 | return text
107 |
108 |
109 |
110 | def get_refined_summarization(docs, model=CHOSEN_COMP_MODEL, max_output_tokens=MAX_OUTPUT_TOKENS, stream=False, callbacks=[]):
111 |
112 | PROMPT = PromptTemplate(template=refine_prompt_template, input_variables=["text"])
113 | refine_prompt = PromptTemplate(input_variables=["existing_answer", "text"],template=refine_template)
114 |
115 | llm = helpers.get_llm(model, temperature=0, max_output_tokens=max_output_tokens, stream=stream, callbacks=callbacks)
116 |
117 | chain = load_summarize_chain(llm, chain_type="refine", question_prompt=PROMPT, refine_prompt=refine_prompt, return_intermediate_steps=True)
118 | summ = chain({"input_documents": docs}, return_only_outputs=True)
119 |
120 | return summ
121 |
122 |
123 | def get_mapreduced_summarization(docs, model=CHOSEN_COMP_MODEL, max_output_tokens=MAX_OUTPUT_TOKENS, stream=False, callbacks=[]):
124 |
125 | PROMPT = PromptTemplate(template=mapreduce_prompt_template, input_variables=["text"])
126 |
127 | llm = helpers.get_llm(model, temperature=0, max_output_tokens=max_output_tokens, stream=stream, callbacks=callbacks)
128 |
129 | chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=PROMPT, combine_prompt=PROMPT, return_intermediate_steps=True)
130 | summ = chain({"input_documents": docs}, return_only_outputs=True)
131 |
132 | return summ
133 |
134 |
135 |
136 |
137 | def read_document(path, verbose = False):
138 | if verbose: print(f"Reading {path}")
139 |
140 | all_text = ''
141 | ext = os.path.splitext(path)[1]
142 |
143 | if ext == '.xlsx':
144 | dataframe = openpyxl.load_workbook(path, data_only=True)
145 | sheets = [s for s in dataframe.sheetnames if 'HiddenCache' not in s]
146 | for sheet in sheets:
147 | print('sheet', sheet)
148 | all_text += pd.read_excel(path, sheet_name=sheet).to_string(na_rep='') + '\n\n\n\n'
149 | elif ext == '.csv':
150 | return None
151 | elif ext == '.pdf':
152 | contents, kv_contents, dfs, t_contents = fr_helpers.fr_analyze_local_doc_with_dfs(path, verbose = verbose)
153 | all_text = ' '.join([kv_contents , contents , t_contents])
154 | else:
155 | return None
156 |
157 | all_text = clean_up_text(all_text)
158 |
159 | return all_text
160 |
161 |
162 | def summarize_document(path,
mode='refine', verbose = False): 163 | 164 | print(f"##########################\nStarting Processing {path} ...") 165 | start = time.time() 166 | text = read_document(path, verbose=verbose) 167 | if text is None: return None 168 | 169 | summ = summarize_text(text, mode=mode, verbose=verbose) 170 | end = time.time() 171 | 172 | summary = { 173 | 'file': os.path.basename(path), 174 | 'intermediate_steps': summ['intermediate_steps'], 175 | 'summary': summ['output_text'], 176 | 'proc_time': end-start 177 | } 178 | 179 | print(f"Done Processing {path} in {end-start} seconds\n##########################\n") 180 | return summary 181 | 182 | 183 | def summarize_text(text, mode='refine', verbose = False): 184 | docs = chunk_doc(text, mode=mode) 185 | 186 | if mode == 'refine': 187 | summ = get_refined_summarization(docs) 188 | elif mode == 'map_reduce': 189 | summ = get_mapreduced_summarization(docs) 190 | else: 191 | raise Exception("Invalid mode") 192 | 193 | return summ 194 | 195 | 196 | 197 | def summarize_folder(folder, mode='refine', save_to_csv=True, save_to_pkl=True, verbose = False): 198 | files = os.listdir(folder) 199 | print(f"Files in folder {len(files)}") 200 | pkl_file = os.path.join(folder, f'summaries_{mode}.pkl') 201 | csv_file = os.path.join(folder, f'summaries_{mode}.csv') 202 | 203 | if os.path.exists(csv_file): 204 | summ_df = pd.read_csv(csv_file) 205 | else: 206 | summ_df = pd.DataFrame(columns=['file', 'intermediate_steps', 'summary', 'proc_time']) 207 | 208 | processed_files = list(summ_df['file']) 209 | print(f"List of already processed files {processed_files}") 210 | 211 | for f in files: 212 | path = os.path.join(folder, f) 213 | if f in processed_files: continue 214 | 215 | summary = summarize_document(path, mode=mode, verbose=verbose) 216 | if summary is None: continue 217 | summ_df = pd.concat([summ_df, pd.DataFrame([summary])], ignore_index=True) 218 | 219 | if save_to_csv: summ_df.to_csv(csv_file) 220 | if save_to_pkl: summ_df.to_pickle(pkl_file) 221 | 222 | return summ_df -------------------------------------------------------------------------------- /utils/web_crawler.py: -------------------------------------------------------------------------------- 1 | import logging, json, re, os, requests, uuid,ssl 2 | import azure.functions as func 3 | from azure.storage.blob import ContainerClient 4 | from azure.storage.blob import BlobServiceClient 5 | from bs4 import BeautifulSoup 6 | from collections import deque 7 | from html.parser import HTMLParser 8 | from urllib.parse import urlparse 9 | from urllib.request import urlopen 10 | import urllib.request 11 | import urllib 12 | 13 | import pandas as pd 14 | import numpy as np 15 | from datetime import datetime 16 | import time 17 | from utils import language 18 | 19 | 20 | HTTP_URL_PATTERN = r'^http[s]*://.+' 21 | 22 | CONTEXT = ssl._create_unverified_context() 23 | 24 | 25 | # Create a class to parse the HTML and get the hyperlinks 26 | class HyperlinkParser(HTMLParser): 27 | def __init__(self): 28 | super().__init__() 29 | # Create a list to store the hyperlinks 30 | self.hyperlinks = [] 31 | 32 | # Override the HTMLParser's handle_starttag method to get the hyperlinks 33 | def handle_starttag(self, tag, attrs): 34 | attrs = dict(attrs) 35 | 36 | # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks 37 | if tag == "a" and "href" in attrs: 38 | self.hyperlinks.append(attrs["href"]) 39 | # Function to get the hyperlinks from a URL 40 | def get_hyperlinks(url): 41 | 
42 | # Try to open the URL and read the HTML
43 | try:
44 | # Open the URL and read the HTML
45 | with urllib.request.urlopen(url,context=CONTEXT) as response:
46 |
47 | # If the response is not HTML, return an empty list
48 | if not response.info().get('Content-Type').startswith("text/html"):
49 | return []
50 |
51 | # Decode the HTML
52 | html = response.read().decode('utf-8')
53 | except Exception as e:
54 | print(e)
55 | return []
56 |
57 | # Create the HTML Parser and then Parse the HTML to get hyperlinks
58 | parser = HyperlinkParser()
59 | parser.feed(html)
60 |
61 | return parser.hyperlinks
62 | # Function to get the hyperlinks from a URL that are within the same domain
63 | def get_domain_hyperlinks(local_domain, url):
64 | clean_links = []
65 | for link in set(get_hyperlinks(url)):
66 | clean_link = None
67 |
68 | # If the link is a URL, check if it is within the same domain
69 | if re.search(HTTP_URL_PATTERN, link):
70 | # Parse the URL and check if the domain is the same
71 | url_obj = urlparse(link)
72 | if url_obj.netloc == local_domain:
73 | clean_link = link
74 |
75 | # If the link is not a URL, check if it is a relative link
76 | else:
77 | if link.startswith("/"):
78 | link = link[1:]
79 | elif link.startswith("#") or link.startswith("mailto:"):
80 | continue
81 | clean_link = "https://" + local_domain + "/" + link
82 |
83 | if clean_link is not None:
84 | if clean_link.endswith("/"):
85 | clean_link = clean_link[:-1]
86 | clean_links.append(clean_link)
87 |
88 | # Return the list of hyperlinks that are within the same domain
89 | return list(set(clean_links))
90 | def remove_newlines(text):
91 | text = text.replace('\n', ' ')
92 | text = text.replace('\\n', ' ')
93 | text = text.replace('  ', ' ')  # collapse runs of whitespace, repeatedly
94 | text = text.replace('  ', ' ')
95 | text = text.replace('  ', ' ')
96 | text = text.replace('  ', ' ')
97 | text = text.replace('  ', ' ')
98 | text = text.replace('  ', ' ')
99 | return text
100 | def remove_urls(text):
101 | text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
102 | return text
103 |
104 | def crawl(url, KB_BLOB_CONN_STR, KB_BLOB_CONTAINER, OUTPUT_BLOB_CONTAINER):
105 | # Parse the URL and get the domain
106 | local_domain = urlparse(url).netloc
107 |
108 | # Create a queue to store the URLs to crawl
109 | queue = deque([url])
110 |
111 | # Create a set to store the URLs that have already been seen (no duplicates)
112 | seen = set()
113 |
114 | # While the queue is not empty, continue crawling
115 | while queue:
116 | # Get the next URL from the queue
117 | url = queue.pop()
118 | print(url) # for debugging and to see the progress
119 | if url in seen:
120 | print('already processed')
121 | else:
122 | seen.add(url)
123 | if url.endswith(".pdf"):
124 | try:
125 | dest_blob_name = os.path.basename(urlparse(url).path)
126 | source_url = url
127 | container_client = ContainerClient.from_connection_string(KB_BLOB_CONN_STR, KB_BLOB_CONTAINER)
128 | blob_client = container_client.get_blob_client(dest_blob_name)
129 | blob_client.upload_blob(b'',overwrite=True)
130 | blob_client.stage_block_from_url(block_id=1, source_url=source_url)
131 | blob_client.commit_block_list(['1'])
132 | except Exception as e:
133 | print("Could not upload this PDF file")
134 | print(e)
135 |
136 | else:
137 | try:
138 | soup = BeautifulSoup(urlopen(url,context=CONTEXT), "html.parser")
139 | text = soup.get_text()
140 | doc_id=str(uuid.uuid3(uuid.NAMESPACE_DNS, text))
141 | timestamp = str(datetime.now())
142 | doc_text =
remove_urls(remove_newlines(text))
143 | lang = language.detect_content_language(doc_text[:500])
144 | new_doc = {
145 | "id": doc_id,
146 | "categoryId": os.getenv('CATEGORYID', 'KM_OAI_CATEGORY'),  # category id comes from the environment (see .env.template)
147 | "timestamp": timestamp,
148 | "web_url": url,
149 | "text": doc_text,
150 | "source_language": lang
151 |
152 | }
153 | try:
154 | container = ContainerClient.from_connection_string(KB_BLOB_CONN_STR, OUTPUT_BLOB_CONTAINER)
155 | try:
156 | container_properties = container.get_container_properties()
157 | except Exception as e:
158 | container.create_container()
159 |
160 |
161 | filename=local_domain+'_'+doc_id
162 | blob_name = filename + '.json'
163 | blob_client = container.get_blob_client(blob=blob_name)
164 | blob_client.upload_blob(json.dumps(new_doc, indent=4, ensure_ascii = False), overwrite=True)
165 | logging.info(f"Document {doc_id} was successfully saved to the {OUTPUT_BLOB_CONTAINER} container")
166 |
167 | except Exception as e:
168 | logging.error(f"Exception: Document {doc_id} created an exception.\n{e}")
169 |
170 | except Exception as e:
171 | print(e)
172 | # Get the hyperlinks from the URL and add them to the queue if not already seen.
173 | for link in get_domain_hyperlinks(local_domain, url):
174 | if link not in seen:
175 | queue.append(link)
--------------------------------------------------------------------------------
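# ---------------------------------------------------------------------------
# NOTE: a minimal, hypothetical driver for utils/web_crawler.py, not part of
# the original repository. The start URL is a placeholder; the connection
# string and container names must match your deployment (KB_BLOB_CONN_STR,
# KB_BLOB_CONTAINER and OUTPUT_BLOB_CONTAINER in .env.template).
#
#     import os
#     from utils.web_crawler import crawl
#
#     crawl("https://www.example.com",
#           os.environ["KB_BLOB_CONN_STR"],
#           os.environ.get("KB_BLOB_CONTAINER", "kmoaidemo"),
#           os.environ.get("OUTPUT_BLOB_CONTAINER", "kmoaiprocessed"))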