├── .deployment ├── .env.template ├── .funcignore ├── .gitignore ├── .vscode ├── extensions.json ├── launch.json ├── settings.json └── tasks.json ├── AzCogSearchDocCrackingFunc ├── __init__.py ├── function.json └── sample.dat ├── AzureOpenAIandPVAbot.pdf ├── BotQnAHTTPFunc ├── __init__.py ├── function.json └── sample.dat ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── ServiceBusQueueNewDocument ├── __init__.py ├── function.json └── sample.dat ├── WISHLIST.md ├── app.py ├── experiment.ipynb ├── host.json ├── images ├── agent-arch.jpg ├── azure.jpg ├── chatbot.jpg ├── copyfuncurl.jpg ├── copyoutputs.jpg ├── custom_skill.jpg ├── depl-outputs.jpg ├── first_req.jpg ├── firstquery.jpg ├── funcdeploy.jpg ├── km-openai v2.jpg ├── km-openai.png ├── local_settings.jpg ├── midjourney.png ├── oai_deployments.jpg ├── openaichoice.jpg ├── openaifuncapp.jpg ├── postman.jpg ├── prompt_choice.png ├── redis.jpg ├── redischoice.jpg ├── run_ingest.jpg ├── search_params.jpg ├── sec_req.jpg ├── sem_search.jpg ├── stream-client.jpg ├── subs_conv.jpg └── suffix.jpg ├── kb_docs_samples ├── Dubai Brochure.pdf ├── Las Vegas Brochure.pdf ├── London Brochure.pdf ├── Margies Travel Company Info.pdf ├── New York Brochure.pdf ├── San Francisco Brochure.pdf └── olympics_sections_text.csv ├── local.settings.json ├── redis.yml ├── requirements.txt ├── static ├── index.html ├── index_old.html ├── script.js └── styles.css ├── template.json └── utils ├── bot_helpers.py ├── cogsearch_helpers.py ├── cogvecsearch_helpers ├── cogsearch_vecstore.py └── cs_json.py ├── cosmos_helpers.py ├── cv_helpers.py ├── env_vars.py ├── fr_helpers.py ├── helpers.py ├── http_helpers.py ├── kb_doc.py ├── km_agents.py ├── langchain_helpers ├── mod_agent.py ├── mod_ccr_prompt.py ├── mod_react_prompt.py ├── mod_wiki_prompt.py ├── oai_fc_agent.py ├── oldschoolsearch.py ├── simple_prompt.py └── streaming_handler.py ├── language.py ├── openai_helpers.py ├── redis_helpers.py ├── storage.py ├── summarization.py └── web_crawler.py /.deployment: -------------------------------------------------------------------------------- 1 | [config] 2 | SCM_DO_BUILD_DURING_DEPLOYMENT=true 3 | WEBSITE_WEBDEPLOY_USE_SCM=true -------------------------------------------------------------------------------- /.env.template: -------------------------------------------------------------------------------- 1 | ########################################## 2 | ###### Please fill in the below 3 sections 3 | ########################################## 4 | 5 | ### Configuration 6 | USE_COG_VECSEARCH = 0 # set this to 1 to enable vector search in cognitive search 7 | PROCESS_IMAGES = 0 # set this to 1 to enable image processing 8 | DATABASE_MODE = 0 # set this to 1 to enable backup mode with Cosmos 9 | USE_REDIS_CACHE = 1 # set this to 1 to enable caching sessions and intermediate results with Redis 10 | 11 | 12 | #### Cognitive Search 13 | COG_SEARCH_ENDPOINT="" 14 | COG_SEARCH_ADMIN_KEY="" 15 | COG_SEARCH_CUSTOM_FUNC="" 16 | KB_INDEX_NAME = "km-openai" 17 | KB_INDEXER_NAME = "km-openai-indexer" 18 | KB_DATA_SOURCE_NAME = "km-openai-docs" 19 | KB_SKILLSET_NAME = "km-openai-skills" 20 | KB_SEM_INDEX_NAME = "km-openai-sem" 21 | COG_VEC_SEARCH_API_VERSION = "2023-07-01-Preview" 22 | COG_VECSEARCH_VECTOR_INDEX = "vec-index" 23 | 24 | 25 | #### Cognitive Services 26 | COG_SERV_ENDPOINT="" 27 | COG_SERV_KEY="" 28 | FR_CONTAINER=kmoaiforms 29 | CV_API_VERSION="2023-02-01-preview" 30 | 31 | #### Knowledge Base - Blob Storage 32 | 
KB_BLOB_CONN_STR=""
33 | KB_BLOB_CONTAINER=kmoaidemo
34 | OUTPUT_BLOB_CONTAINER=kmoaiprocessed
35 | 
36 | 
37 | #### OPENAI
38 | OPENAI_RESOURCE_ENDPOINT=""
39 | OPENAI_API_KEY=""
40 | 
41 | 
42 | ############################################
43 | ###### No need to fill in the below sections
44 | ###### unless you're planning to develop
45 | ###### features with the below systems
46 | ############################################
47 | 
48 | 
49 | #### Bing Search
50 | USE_BING = "no"
51 | BING_SUBSCRIPTION_KEY = ""
52 | BING_SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
53 | LIST_OF_COMMA_SEPARATED_URLS = ""
54 | 
55 | 
56 | #### Service Bus
57 | SERVICEBUS_CONN_STR = ""
58 | 
59 | 
60 | #### Redis E10
61 | REDIS_ADDR=""
62 | REDIS_PORT=10000
63 | REDIS_PASSWORD=""
64 | REDIS_INDEX_NAME='acs_emb_index'
65 | VECTOR_FIELD_IN_REDIS='item_vector'
66 | NUMBER_PRODUCTS_INDEX=1000
67 | 
68 | 
69 | 
70 | #### Cognitive Services - Translator
71 | TRANSLATION_ENDPOINT="https://api.cognitive.microsofttranslator.com"
72 | TRANSLATION_API_KEY=""
73 | TRANSLATION_LOCATION=westeurope
74 | 
75 | 
76 | 
77 | #### Cosmos
78 | COSMOS_URI=""
79 | COSMOS_KEY=""
80 | CATEGORYID="KM_OAI_CATEGORY"
81 | EMBCATEGORYID="KM_OAI_EMB_CATEGORY"
82 | COSMOS_DB_NAME="KM_OAI_DB"
83 | 
84 | 
85 | # Computer Vision
86 | CV_ENDPOINT = ""
87 | CV_API_KEY = ""
88 | 
89 | 
90 | #### OPENAI
91 | MAX_QUERY_TOKENS = 500
92 | MAX_OUTPUT_TOKENS = 500
93 | MAX_HISTORY_TOKENS = 1000
94 | CONVERSATION_TTL_SECS = 7200
95 | MAX_SEARCH_TOKENS = 2000
96 | PRE_CONTEXT = 500
97 | 
98 | OVERLAP_TEXT=80
99 | 
100 | ADA_002_EMBED_NUM_DIMS = 1536
101 | ADA_002_MODEL_MAX_TOKENS = 4095
102 | ADA_002_EMBEDDING_MODEL = "text-embedding-ada-002"
103 | ADA_EMBEDDING_ENCODING = "cl100k_base"
104 | 
105 | DAVINCI_003_EMBED_NUM_DIMS = 12288
106 | DAVINCI_003_MODEL_MAX_TOKENS = 4000
107 | DAVINCI_003_EMB_MAX_TOKENS = 2047
108 | DAVINCI_003_COMPLETIONS_MODEL = "text-davinci-003"
109 | DAVINCI_003_EMBEDDING_MODEL = "text-search-davinci-doc-001"
110 | DAVINCI_003_QUERY_EMB_MODEL = "text-search-davinci-query-001"
111 | DAVINCI_EMBEDDING_ENCODING = "p50k_base"
112 | 
113 | GPT35_TURBO_COMPLETIONS_MODEL = 'gpt-35-turbo'
114 | GPT35_TURBO_COMPLETIONS_MAX_TOKENS = 8193
115 | GPT35_TURBO_COMPLETIONS_ENCODING = "cl100k_base"
116 | 
117 | GPT4_MODEL = "gpt-4"
118 | GPT4_32K_MODEL = "gpt-4-32k"
119 | 
120 | CHOSEN_EMB_MODEL = "text-embedding-ada-002"
121 | SMALL_EMB_TOKEN_NUM = 0
122 | MEDIUM_EMB_TOKEN_NUM = 800
123 | LARGE_EMB_TOKEN_NUM = 0
124 | X_LARGE_EMB_TOKEN_NUM = 0
125 | NUM_TOP_MATCHES = 2
126 | 
127 | 
128 | CHOSEN_QUERY_EMB_MODEL = "text-embedding-ada-002"
129 | CHOSEN_COMP_MODEL = "gpt-35-turbo"
130 | 
131 | RESTRICTIVE_PROMPT = "no"
132 | 
133 | 
134 | 
-------------------------------------------------------------------------------- /.funcignore: --------------------------------------------------------------------------------
1 | .git*
2 | .vscode
3 | __azurite_db*__.json
4 | __blobstorage__
5 | __queuestorage__
6 | local.settings.json
7 | test
8 | .venv
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 
| var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # pipenv 86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 89 | # install all needed dependencies. 90 | #Pipfile.lock 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # Environments 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | 124 | # Azure Functions artifacts 125 | bin 126 | obj 127 | appsettings.json 128 | 129 | 130 | # Azurite artifacts 131 | __blobstorage__ 132 | __queuestorage__ 133 | __azurite_db*__.json 134 | .python_packages 135 | .env (mine) 136 | 137 | dump/ 138 | test.ipynb 139 | load_embeddings.ipynb 140 | local.settings (dev).json 141 | local.settings (template).json 142 | local.settings (mine).json 143 | template_test.json 144 | template_test copy.json 145 | 146 | .env copy 147 | template.json 148 | README copy.md 149 | template (backup).json 150 | template (backup1).json 151 | embs.pkl 152 | jsonl.pkl 153 | data.json 154 | full_data.pkl 155 | full_data.json 156 | new_emb_arr.pkl 157 | demo_06_transaction_data_understanding (1).ipynb 158 | data.jsonl 159 | baseline.pkl 160 | baseline.json 161 | 162 | backup/ 163 | WebSiteCrawler.ipynb 164 | agent copy.ipynb 165 | .env copy 2 166 | demo.ipynb 167 | evaluate.ipynb 168 | eval_results.csv 169 | all_predictions.pkl 170 | test_openai.ipynb 171 | eval_results_cont.csv 172 | eval.csv 173 | agent_name.csv 174 | pres.ipynb 175 | hukoomi/ 176 | hukoomi_new/ 177 | .env 178 | qna.ipynb 179 | notebooks/ 180 | demo copy.ipynb 181 | bing.ipynb 182 | app/ 183 | qna copy.ipynb 184 | summarization.ipynb 185 | summarization copy.ipynb 186 | utils/langchain_helpers/summ_react_prompt.py 187 | cogvecsearch.ipynb 188 | qna debug.ipynb 189 | cv.ipynb 190 | .env1 191 | qna_demo.ipynb 192 | qna_lufthansa.ipynb 193 | .env2 194 | qna - ai knowledge exchange.ipynb 195 | kb_docs_samples/summaries_map_reduce.csv 196 | 
kb_docs_samples/summaries_refine.csv 197 | kb_docs_samples/summaries_map_reduce.pkl 198 | kb_docs_samples/summaries_refine.pkl 199 | demo_06_transaction_data_understanding.ipynb 200 | qna - ai knowledge exchange.zip 201 | London Brochure.json 202 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-azuretools.vscode-azurefunctions", 4 | "ms-python.python" 5 | ] 6 | } -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Attach to Python Functions", 6 | "type": "python", 7 | "request": "attach", 8 | "port": 9091, 9 | "preLaunchTask": "func: host start" 10 | } 11 | ] 12 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "azureFunctions.deploySubpath": ".", 3 | "azureFunctions.scmDoBuildDuringDeployment": true, 4 | "azureFunctions.pythonVenv": ".venv", 5 | "azureFunctions.projectLanguage": "Python", 6 | "azureFunctions.projectRuntime": "~4", 7 | "debug.internalConsoleOptions": "neverOpen", 8 | "appService.zipIgnorePattern": [ 9 | "__pycache__{,/**}", 10 | "*.py[cod]", 11 | "*$py.class", 12 | ".Python{,/**}", 13 | "build{,/**}", 14 | "develop-eggs{,/**}", 15 | "dist{,/**}", 16 | "backup{,/**}", 17 | "AzCogSearchDocCrackingFunc{,/**}", 18 | "BotQnAHTTPFunc{,/**}", 19 | "ServiceBusQueueNewDocument{,/**}", 20 | "kb_docs_samples{,/**}", 21 | "notebooks{,/**}", 22 | "dump{,/**}", 23 | ".env copy{,/**}", 24 | ".env copy 2{,/**}", 25 | "downloads{,/**}", 26 | ".git{,/**}", 27 | "eggs{,/**}", 28 | ".eggs{,/**}", 29 | "lib{,/**}", 30 | "lib64{,/**}", 31 | "parts{,/**}", 32 | "sdist{,/**}", 33 | "var{,/**}", 34 | "wheels{,/**}", 35 | "share/python-wheels{,/**}", 36 | "*.egg-info{,/**}", 37 | ".installed.cfg", 38 | "*.egg", 39 | "MANIFEST", 40 | ".env{,/**}", 41 | ".venv{,/**}", 42 | "env{,/**}", 43 | "venv{,/**}", 44 | "ENV{,/**}", 45 | "env.bak{,/**}", 46 | "venv.bak{,/**}", 47 | ".vscode{,/**}" 48 | ], 49 | "appService.defaultWebAppToDeploy": "/subscriptions/2a7eed04-714e-4ba9-96ba-47355c32a8d6/resourceGroups/km-demo/providers/Microsoft.Web/sites/kmaoiwebappdemo0001", 50 | "appService.deploySubpath": "." 
51 | }
-------------------------------------------------------------------------------- /.vscode/tasks.json: --------------------------------------------------------------------------------
1 | {
2 |     "version": "2.0.0",
3 |     "tasks": [
4 |         {
5 |             "type": "func",
6 |             "label": "func: host start",
7 |             "command": "host start",
8 |             "problemMatcher": "$func-python-watch",
9 |             "isBackground": true,
10 |             "dependsOn": "pip install (functions)"
11 |         },
12 |         {
13 |             "label": "pip install (functions)",
14 |             "type": "shell",
15 |             "osx": {
16 |                 "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
17 |             },
18 |             "windows": {
19 |                 "command": "${config:azureFunctions.pythonVenv}\\Scripts\\python -m pip install -r requirements.txt"
20 |             },
21 |             "linux": {
22 |                 "command": "${config:azureFunctions.pythonVenv}/bin/python -m pip install -r requirements.txt"
23 |             },
24 |             "problemMatcher": []
25 |         }
26 |     ]
27 | }
-------------------------------------------------------------------------------- /AzCogSearchDocCrackingFunc/__init__.py: --------------------------------------------------------------------------------
1 | import logging
2 | import json
3 | import re
4 | import azure.functions as func
5 | import os
6 | from azure.cosmos import CosmosClient, PartitionKey
7 | from azure.storage.blob import ContainerClient
8 | import urllib
9 | import uuid
10 | import copy
11 | 
12 | from utils import cosmos_helpers
13 | from utils import storage
14 | from utils import cv_helpers
15 | 
16 | from utils.env_vars import *
17 | 
18 | 
19 | def remove_urls(text):
20 |     text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
21 |     return text
22 | 
23 | re_strs = [
24 |     "customXml\/[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*",
25 |     "ppt\/[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*",
26 |     "\.MsftOfcThm_[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*[\r\n\t\f\v ]\{[\r\n\t\f\v ].*[\r\n\t\f\v ]\}",
27 |     "SlidePowerPoint",
28 |     "PresentationPowerPoint",
29 |     '[a-zA-Z0-9]*\.(?:gif|emf)'
30 | ]
31 | 
32 | 
33 | 
34 | def analyze_doc(data_dict):
35 |     doc_id = data_dict.get('id', 'Unknown ID')   # defined up front so the exception handlers below can always reference it
36 |     ret_dict = {}
37 |     ret_dict['status'] = ''
38 |     db_status = ''
39 |     data_dict['text'] = remove_urls(data_dict['content'].replace("\n\n\n", "\n").replace("....", "."))
40 |     data_dict['contentType'] = 'text'
41 |     data_dict['container'] = storage.get_container_name(data_dict['doc_url'])
42 | 
43 |     try:
44 |         if isinstance(data_dict['timestamp'], list):
45 |             data_dict['timestamp'] = ' '.join(data_dict['timestamp'])
46 |     except:
47 |         data_dict['timestamp'] = "1/1/1970 00:00:00 AM"
48 | 
49 | 
50 | 
51 |     for re_str in re_strs:
52 |         matches = re.findall(re_str, data_dict['text'], re.DOTALL)
53 |         for m in matches: data_dict['text'] = data_dict['text'].replace(m, '')
54 | 
55 | 
56 |     try:
57 |         if PROCESS_IMAGES == 1:
58 | 
59 |             url = data_dict['doc_url']
60 | 
61 |             fn = storage.get_filename(url)
62 |             extension = os.path.splitext(fn)[1]
63 | 
64 |             if extension in ['.jpg', '.jpeg', '.png']:
65 |                 sas_url = storage.create_sas(url)
66 |                 cvr = cv_helpers.CV()
67 | 
68 |                 res = cvr.analyze_image(img_url=sas_url)
69 | 
70 |                 data_dict['text'] = res['text'] + data_dict['text']
71 |                 data_dict['cv_image_vector'] = cvr.get_img_embedding(sas_url)
72 |                 data_dict['cv_text_vector'] = cvr.get_text_embedding(res['text'])
73 |                 data_dict['contentType'] = 'image'
74 | 
75 |     except Exception as e:
76 |         logging.error(f"Exception: Image {doc_id} created an exception.\n{e}")
77 |         print(f"Exception: Image {doc_id} created an exception.\n{e}")
78 |         ret_dict['status'] = f"Exception: Image {doc_id} created an exception.\n{e}"
79 | 
80 | 
81 |     try:
82 |         if DATABASE_MODE == 1:
83 |             db_status = cosmos_helpers.cosmos_store_contents(data_dict)
84 |             logging.info(db_status)
85 |             print(db_status)
86 |     except Exception as e:
87 |         doc_id = data_dict.get('id', 'Unknown ID')
88 |         logging.error(f"Exception: Document {doc_id} created an exception.\n{e}")
89 |         ret_dict['status'] = ret_dict['status'] + '\n' + f"Exception: Document {doc_id} created an exception.\n{e}"
90 | 
91 |     try:
92 |         ret_dict = storage.save_json_document(data_dict, OUTPUT_BLOB_CONTAINER)
93 |         logging.info(ret_dict['status'])
94 |     except Exception as e:
95 |         doc_id = data_dict.get('id', 'Unknown ID')
96 |         logging.error(f"Exception: Document {doc_id} created an exception.\n{e}")
97 |         ret_dict['status'] = ret_dict['status'] + '\n' + f"Exception: Document {doc_id} created an exception.\n{e}"
98 | 
99 |     return ret_dict
100 | 
101 | 
102 | 
103 | 
104 | ## Perform an operation on a record
105 | def transform_value(value):
106 |     try:
107 |         recordId = value['recordId']
108 |     except KeyError as error:
109 |         logging.info(error)
110 |         return None
111 | 
112 |     # Validate the inputs
113 |     try:
114 |         assert ('data' in value), "'data' field is required."
115 |         data = value['data']
116 |         logging.info(f"Data received: {data}")
117 |         assert ('content' in data), "'content' field is required in 'data' object."
118 |         assert ('id' in data), "'id' field is required in 'data' object."
119 | 
120 |     except AssertionError as error:
121 |         logging.info(error)
122 |         return (
123 |             {
124 |             "recordId": recordId,
125 |             "errors": [ { "message": "Error:" + error.args[0] } ]
126 |             })
127 | 
128 |     try:
129 |         ret_dict = analyze_doc(value['data'])
130 | 
131 |         # Here you could do something more interesting with the inputs
132 | 
133 |     except Exception as error:
134 |         logging.info(error)
135 |         return (
136 |             {
137 |             "recordId": recordId,
138 |             "errors": [ { "message": "Could not complete operation for record." } ]
139 |             })
140 | 
141 |     return ({
142 |         "recordId": recordId,
143 |         "data": ret_dict
144 |     })
145 | 
146 | 
147 | 
148 | 
149 | 
150 | def compose_response(json_data):
151 |     values = json.loads(json_data)['values']
152 | 
153 |     # Prepare the Output before the loop
154 |     results = {}
155 |     results["values"] = []
156 | 
157 |     for value in values:
158 |         output_record = transform_value(value)
159 |         if output_record is not None:
160 |             results["values"].append(output_record)
161 | 
162 |     return json.dumps(results, ensure_ascii=False)
163 | 
164 | 
165 | 
166 | 
167 | def main(req: func.HttpRequest) -> func.HttpResponse:
168 |     logging.info('Python HTTP trigger function processed a request.')
169 | 
170 |     try:
171 |         body = json.dumps(req.get_json())
172 |     except ValueError:
173 |         return func.HttpResponse(
174 |              "Invalid body",
175 |              status_code=400
176 |         )
177 | 
178 |     if body:
179 |         result = compose_response(body)
180 |         return func.HttpResponse(result, mimetype="application/json")
181 |     else:
182 |         return func.HttpResponse(
183 |              "Invalid body",
184 |              status_code=400
185 |         )
186 | 
187 | 
188 | 
-------------------------------------------------------------------------------- /AzCogSearchDocCrackingFunc/function.json: --------------------------------------------------------------------------------
1 | {
2 |   "scriptFile": "__init__.py",
3 |   "bindings": [
4 |     {
5 |       "authLevel": "function",
6 |       "type": "httpTrigger",
7 |       "direction": "in",
8 |       "name": "req",
9 |       "methods": [
10 |         "get",
11 |         "post"
12 |       ]
13 |     },
14 |     {
15 |       "type": "http",
16 |       "direction": "out",
17 |       "name": "$return"
18 |     }
19 |   ]
20 | }
-------------------------------------------------------------------------------- /AzCogSearchDocCrackingFunc/sample.dat: --------------------------------------------------------------------------------
1 | {
2 |     "name": "Azure"
3 | }
-------------------------------------------------------------------------------- /AzureOpenAIandPVAbot.pdf: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/AzureOpenAIandPVAbot.pdf
-------------------------------------------------------------------------------- /BotQnAHTTPFunc/__init__.py: --------------------------------------------------------------------------------
1 | import logging
2 | import azure.functions as func
3 | import os
4 | 
5 | from utils import bot_helpers
6 | 
7 | from utils.env_vars import *
8 | 
9 | 
10 | def get_param(req, param_name):
11 |     param = req.params.get(param_name)
12 | 
13 |     if not param:
14 |         try:
15 |             req_body = req.get_json()
16 |         except ValueError:
17 |             pass
18 |         else:
19 |             param = req_body.get(param_name)
20 | 
21 |     return param
22 | 
23 | 
24 | 
25 | def check_param(param):
26 |     if param == 'true':
27 |         param = True
28 |     else:
29 |         param = False
30 | 
31 |     return param
32 | 
33 | 
34 | 
35 | def main(req: func.HttpRequest) -> func.HttpResponse:
36 |     logging.info('Python HTTP trigger function processed a request.')
37 | 
38 |     query = get_param(req, 'query')
39 |     session_id = get_param(req, 'session_id')
40 |     filter_param = get_param(req, 'filter')
41 |     search_method = get_param(req, 'search_method')
42 | 
43 |     enable_unified_search = get_param(req, 'enable_unified_search')
44 |     enable_redis_search = get_param(req, 'enable_redis_search')
45 |     enable_cognitive_search = get_param(req, 'enable_cognitive_search')
46 |     evaluate_step = get_param(req, 'evaluate_step')
47 |     check_adequacy = get_param(req, 'check_adequacy')
48 |     check_intent = get_param(req, 'check_intent')
49 |     use_calendar = get_param(req, 'use_calendar')
50 |     use_calculator = get_param(req, 'use_calculator')
51 |     use_bing = get_param(req, 'use_bing')
52 | 
53 | 
54 |     params_dict = {
55 |         'enable_unified_search': check_param(enable_unified_search),
56 |         'enable_redis_search': check_param(enable_redis_search),
57 |         'enable_cognitive_search': check_param(enable_cognitive_search),
58 |         'evaluate_step': check_param(evaluate_step),
59 |         'check_adequacy': check_param(check_adequacy),
60 |         'check_intent': check_param(check_intent),
61 |         'use_calendar': check_param(use_calendar),
62 |         'use_calculator': check_param(use_calculator),
63 |         'use_bing': check_param(use_bing)
64 |     }
65 | 
66 |     if filter_param is None: filter_param = '*'
67 | 
68 |     if query:
69 |         answer_str = bot_helpers.openai_interrogate_text(query, session_id=session_id, filter_param=filter_param, agent_name=search_method, params_dict=params_dict)
70 |         return func.HttpResponse(answer_str)
71 |     else:
72 |         return func.HttpResponse(
73 |              "This HTTP triggered function executed successfully. Pass a query in the query string or in the request body to get an answer.",
74 |              status_code=200
75 |         )
76 | 
-------------------------------------------------------------------------------- /BotQnAHTTPFunc/function.json: --------------------------------------------------------------------------------
1 | {
2 |   "scriptFile": "__init__.py",
3 |   "bindings": [
4 |     {
5 |       "authLevel": "function",
6 |       "type": "httpTrigger",
7 |       "direction": "in",
8 |       "name": "req",
9 |       "methods": [
10 |         "get",
11 |         "post"
12 |       ]
13 |     },
14 |     {
15 |       "type": "http",
16 |       "direction": "out",
17 |       "name": "$return"
18 |     }
19 |   ]
20 | }
-------------------------------------------------------------------------------- /BotQnAHTTPFunc/sample.dat: --------------------------------------------------------------------------------
1 | {
2 |     "name": "Azure"
3 | }
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: --------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 | 
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 | 
5 | Resources:
6 | 
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 
-------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | This project welcomes contributions and suggestions. Most contributions require you to
4 | agree to a Contributor License Agreement (CLA) declaring that you have the right to,
5 | and actually do, grant us the rights to use your contribution. For details, visit
6 | https://cla.microsoft.com.
7 | 
8 | When you submit a pull request, a CLA-bot will automatically determine whether you need
9 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
10 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
11 | 
12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
13 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 14 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 samelhousseini 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | For help and questions about using this project, please contact the repo owner via GitHub. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 14 | -------------------------------------------------------------------------------- /ServiceBusQueueNewDocument/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import azure.functions as func 4 | import smart_open 5 | import os, uuid 6 | from azure.storage.blob import BlobServiceClient, BlobClient 7 | 8 | from utils import helpers 9 | from utils import cosmos_helpers 10 | from utils import cogsearch_helpers 11 | from utils.kb_doc import KB_Doc 12 | from utils.cogvecsearch_helpers import cogsearch_vecstore 13 | 14 | from utils.env_vars import * 15 | 16 | 17 | def main(msg: func.ServiceBusMessage): 18 | 19 | msg_dict = json.loads(msg.get_body().decode('utf-8')) 20 | 21 | logging.info('Python ServiceBus queue trigger processed message: %s', msg_dict) 22 | logging.info("Event Type:%s", msg_dict['eventType']) 23 | 24 | transport_params = { 25 | 'client': BlobServiceClient.from_connection_string(KB_BLOB_CONN_STR), 26 | } 27 | 28 | json_filename = os.path.basename(msg_dict['subject']) 29 | 30 | with smart_open.open(f"azure://{OUTPUT_BLOB_CONTAINER}/{json_filename}", transport_params=transport_params) as fin: 31 | data = json.load(fin) 32 | 33 | full_kbd_doc = KB_Doc() 34 | full_kbd_doc.load(data) 35 | 36 | emb_documents = [] 37 | 38 | emb_documents += helpers.generate_embeddings(full_kbd_doc, CHOSEN_EMB_MODEL, SMALL_EMB_TOKEN_NUM, text_suffix = 'S') 39 | 40 | if MEDIUM_EMB_TOKEN_NUM != 0: 41 | emb_documents += helpers.generate_embeddings(full_kbd_doc, CHOSEN_EMB_MODEL, MEDIUM_EMB_TOKEN_NUM, text_suffix = 'M', previous_max_tokens=SMALL_EMB_TOKEN_NUM) 42 | 43 | if LARGE_EMB_TOKEN_NUM != 0: 44 | emb_documents += helpers.generate_embeddings(full_kbd_doc, CHOSEN_EMB_MODEL, LARGE_EMB_TOKEN_NUM, text_suffix = 'L', previous_max_tokens=MEDIUM_EMB_TOKEN_NUM) 45 | 46 | if 
X_LARGE_EMB_TOKEN_NUM != 0:
47 |         emb_documents += helpers.generate_embeddings(full_kbd_doc, CHOSEN_EMB_MODEL, X_LARGE_EMB_TOKEN_NUM, text_suffix = 'XL', previous_max_tokens=LARGE_EMB_TOKEN_NUM)
48 | 
49 |     logging.info(f"Generated {len(emb_documents)} emb chunks from doc {json_filename}")
50 | 
51 |     if (REDIS_ADDR is not None) and (REDIS_ADDR != '') and (USE_REDIS_CACHE == 1):
52 |         loaded = helpers.load_embedding_docs_in_redis(emb_documents, document_name = json_filename)
53 |         logging.info(f"Loaded into Redis {loaded} emb chunks from doc {json_filename}")
54 |         print(f"Loaded into Redis {loaded} emb chunks from doc {json_filename}")
55 | 
56 |     if USE_COG_VECSEARCH == 1:
57 |         vs = cogsearch_vecstore.CogSearchVecStore()
58 |         vs.create_index()
59 |         docs_dict = vs.upload_documents(emb_documents)
60 |     else:
61 |         cogsearch_helpers.index_semantic_sections(emb_documents)
62 | 
63 |     if DATABASE_MODE == 1:
64 |         cosmos_helpers.cosmos_backup_embeddings(emb_documents)
65 | 
66 | 
-------------------------------------------------------------------------------- /ServiceBusQueueNewDocument/function.json: --------------------------------------------------------------------------------
1 | {
2 |   "scriptFile": "__init__.py",
3 |   "bindings": [
4 |     {
5 |       "name": "msg",
6 |       "type": "serviceBusTrigger",
7 |       "direction": "in",
8 |       "queueName": "km-oai-processing-queue",
9 |       "connection": "SERVICEBUS_CONN_STR"
10 |     }
11 |   ]
12 | }
-------------------------------------------------------------------------------- /ServiceBusQueueNewDocument/sample.dat: --------------------------------------------------------------------------------
1 | Service Bus Message
-------------------------------------------------------------------------------- /WISHLIST.md: --------------------------------------------------------------------------------
1 | # Wishlist
2 | 
3 | The following features are on the wishlist for implementation:
4 | 
5 | 
6 | ### Completed
7 | 1. Code: Adding support for ChatGPT
8 | 1. Code: Passing TopK matches parameters in the REST API call to the BotQnAHTTPFunc
9 | 1. ARM: Implementing Dedicated Plan with B1 for the Function App, instead of the Premium Plan
10 | 1. Code: Adding Form Recognizer either as a new data source to the "kmoaiprocessed" container, or as a new Custom Skill
11 | 1. Code: Storing contents, embeddings, and queries in Cosmos. It's important to know which are the most asked queries.
12 | 1. ARM: Adding Application Insights to the ARM template
13 | 1. Code: Adding a custom skill that processes csv files
14 | 1. GUI for triggering Cognitive Search and Form Recognizer document ingestion - streamlit
15 | 
16 | ### TBD
17 | 
18 | 1. GUI for triggering Cognitive Search and Form Recognizer document ingestion - 
19 | 1. Backend: Python control part in Flask - Samer
20 | 1. Frontend: React control part for the UI (generate with GPT4) - Yacine
21 | 1. Chat client UI - demo - Yacine (1st priority)
22 | 1. Streaming capability with Flask-SocketIO - Andrey (1st priority)
23 | 1. (maximizing Cosmos use) - Translation Problem (checksum checking in Cosmos) - re-generate translations for all the chunks (cost) - get from Cosmos - Andrey (2nd priority)
24 | 1. Streamlit vs Flask - Andrey (3rd priority)
25 | 1. GPT4 Agent -- Samer
26 | 
27 | ### Future
28 | 1. Code: Adding support for fine-tuned models.
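
For reference, below is a minimal sketch of how a client could call the BotQnAHTTPFunc endpoint defined earlier. The host name, function key and sample query are placeholders rather than values taken from this repo; the parameter names mirror what get_param() reads, and check_param() only treats the literal string 'true' as true.

# Hypothetical client for BotQnAHTTPFunc; the host, key and query below are placeholders.
import requests

FUNC_URL = "https://<your-function-app>.azurewebsites.net/api/BotQnAHTTPFunc"

params = {
    "code": "<function-key>",            # authLevel is "function", so a function key is required
    "query": "What attractions can I visit in London?",
    "session_id": "demo-session-001",
    "search_method": "ccr",              # agent name; 'os', 'ccr' and 'zs' are the values used in static/script.js
    "enable_cognitive_search": "true",   # boolean flags must arrive as the literal string 'true'
}

response = requests.get(FUNC_URL, params=params)
response.raise_for_status()
print(response.text)
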
-------------------------------------------------------------------------------- /app.py: --------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from flask import Flask, redirect, url_for, request, jsonify
4 | from flask_socketio import SocketIO
5 | from flask_socketio import send, emit
6 | import urllib
7 | 
8 | 
9 | from utils import bot_helpers
10 | from utils import langchain_helpers
11 | from utils import km_agents
12 | from utils import redis_helpers
13 | from utils import language
14 | 
15 | 
16 | global_params_dict = {
17 |     'enable_unified_search': False,
18 |     'enable_redis_search': False,
19 |     'enable_cognitive_search': True,
20 |     'evaluate_step': False,
21 |     'check_adequacy': False,
22 |     'check_intent': False
23 | }
24 | 
25 | # redis_conn = redis_helpers.get_new_conn()
26 | 
27 | from utils.env_vars import *
28 | 
29 | app = Flask(__name__)
30 | socketio = SocketIO(app, cors_allowed_origins='*')
31 | app.config['SECRET_KEY'] = 'secret!'
32 | 
33 | redis_conn = redis_helpers.get_new_conn()
34 | 
35 | ##############################################################
36 | ##############################################################
37 | # IMPORTANT
38 | # To run this web server, use the following command:
39 | # flask --app app.py --debug run
40 | # To be able to run this, activate the venv first using the
41 | # following command on Windows:
42 | # .\.venv\Scripts\activate
43 | # Then install the required packages using the following command:
44 | # pip install -r requirements.txt
45 | ##############################################################
46 | ##############################################################
47 | 
48 | 
49 | 
50 | agents_sid = {}
51 | 
52 | 
53 | @app.route("/", defaults={"path": "index.html"})
54 | @app.route("/<path:path>")
55 | def static_file(path):
56 |     print("path", path)
57 |     return app.send_static_file(path)
58 | 
59 | 
60 | @socketio.on('connect')
61 | def on_connect():
62 |     print(f"connected {request.sid}")
63 | 
64 | 
65 | @socketio.on('config')
66 | def on_config(agent_type):
67 |     print(f"config {request.sid} - {agent_type}")
68 |     connection = {'socketio': socketio, 'connection_id':request.sid}
69 |     agent = km_agents.KMOAI_Agent(agent_name = agent_type, params_dict=global_params_dict, stream = True, connection=connection)
70 |     agents_sid[request.sid] = agent
71 | 
72 | 
73 | @socketio.on('disconnect')
74 | def on_disconnect():
75 |     try:
76 |         del agents_sid[request.sid]
77 |     except Exception as e:
78 |         print(f"Client not found: {e}")
79 | 
80 | 
81 | 
82 | 
83 | @socketio.on('message')
84 | def handle_message(q):
85 | 
86 |     print(f'received message: {q} from {request.sid}')
87 |     emit('new_message', "Query: " + q + '\n')
88 | 
89 |     lang = language.detect_content_language(q)
90 |     if lang != 'en': q = language.translate(q, lang, 'en')
91 | 
92 |     print(f'language detected: {lang}')
93 | 
94 |     answer, sources, likely_sources, s_id = agents_sid[request.sid].run(q, request.sid, redis_conn=redis_conn)
95 |     sources_str = ''
96 | 
97 |     if lang != 'en': answer = language.translate(answer, 'en', lang)
98 | 
99 |     answer = answer.replace('\n', '<br>')
100 | 
101 |     send(answer)
102 |     if len(sources) > 0:
103 |         for s in set(sources):
104 |             try:
105 |                 linkname = urllib.parse.unquote(os.path.basename(s.split('?')[0]))
106 |             except:
107 |                 linkname = 'Link'
108 |             sources_str += f'[{linkname}]'
109 |         send('Links:'+ sources_str)
110 | 
111 | 
112 | 
113 | ##### IMPORTANT
114 | ##### INCLUDE IN THE POST HEADER --> Content-Type: application/json
115 | ##### IMPORTANT
116 | @app.route('/kmoai_request', methods=['POST'])
117 | def kmoai_request():
118 |     data = request.get_json()
119 |     return process_kmoai_request(data)
120 | 
121 | 
122 | 
123 | def check_param(param):
124 |     if param == 'true':
125 |         param = True
126 |     else:
127 |         param = False
128 | 
129 |     return param
130 | 
131 | 
132 | def get_param(req, param_name):
133 |     param = req.get(param_name, None)
134 |     return param
135 | 
136 | 
137 | 
138 | ##### IMPORTANT
139 | ##### INCLUDE IN THE POST HEADER --> Content-Type: application/json
140 | ##### IMPORTANT
141 | def process_kmoai_request(req):
142 |     logging.info('Python HTTP trigger function processed a request.')
143 | 
144 |     query = get_param(req, 'query')
145 |     session_id = get_param(req, 'session_id')
146 |     filter_param = get_param(req, 'filter')
147 |     search_method = get_param(req, 'search_method')
148 | 
149 |     enable_unified_search = get_param(req, 'enable_unified_search')
150 |     enable_redis_search = get_param(req, 'enable_redis_search')
151 |     enable_cognitive_search = get_param(req, 'enable_cognitive_search')
152 |     evaluate_step = get_param(req, 'evaluate_step')
153 |     check_adequacy = get_param(req, 'check_adequacy')
154 |     check_intent = get_param(req, 'check_intent')
155 |     use_calendar = get_param(req, 'use_calendar')
156 |     use_bing = get_param(req, 'use_bing')
157 | 
158 |     params_dict = {
159 |         'enable_unified_search': check_param(enable_unified_search),
160 |         'enable_redis_search': check_param(enable_redis_search),
161 |         'enable_cognitive_search': check_param(enable_cognitive_search),
162 |         'evaluate_step': check_param(evaluate_step),
163 |         'check_adequacy': check_param(check_adequacy),
164 |         'check_intent': check_param(check_intent),
165 |         'use_calendar': check_param(use_calendar),
166 |         'use_bing': check_param(use_bing)
167 |     }
168 | 
169 |     if filter_param is None: filter_param = '*'
170 | 
171 |     return bot_helpers.openai_interrogate_text(query, session_id=session_id, filter_param=filter_param, agent_name=search_method, params_dict=params_dict)
172 | 
173 | 
174 | 
175 | if __name__ == '__main__':
176 |     print('socket io start')
177 |     # app.run() would block here and the SocketIO server would never start; socketio.run() serves the Flask app itself
178 |     socketio.run(app, allow_unsafe_werkzeug=True)
179 | 
-------------------------------------------------------------------------------- /host.json: --------------------------------------------------------------------------------
1 | {
2 |   "version": "2.0",
3 |   "functionTimeout": "-1",
4 |   "extensions": {
5 |     "serviceBus": {
6 |       "prefetchCount": 1,
7 |       "messageHandlerOptions": {
8 |         "maxConcurrentCalls": 1
9 |       }
10 |     }
11 |   },
12 |   "logging": {
13 |     "applicationInsights": {
14 |       "samplingSettings": {
15 |         "isEnabled": true,
16 |         "excludedTypes": "Request"
17 |       }
18 |     }
19 |   },
20 |   "extensionBundle": {
21 |     "id": "Microsoft.Azure.Functions.ExtensionBundle",
22 |     "version": "[3.3.*, 4.0.0)"
23 |   },
24 |   "concurrency": {
25 |     "dynamicConcurrencyEnabled": true,
26 |     "snapshotPersistenceEnabled": true
27 |   }
28 | }
-------------------------------------------------------------------------------- /images/agent-arch.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/agent-arch.jpg -------------------------------------------------------------------------------- /images/azure.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/azure.jpg -------------------------------------------------------------------------------- /images/chatbot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/chatbot.jpg -------------------------------------------------------------------------------- /images/copyfuncurl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/copyfuncurl.jpg -------------------------------------------------------------------------------- /images/copyoutputs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/copyoutputs.jpg -------------------------------------------------------------------------------- /images/custom_skill.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/custom_skill.jpg -------------------------------------------------------------------------------- /images/depl-outputs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/depl-outputs.jpg -------------------------------------------------------------------------------- /images/first_req.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/first_req.jpg -------------------------------------------------------------------------------- /images/firstquery.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/firstquery.jpg -------------------------------------------------------------------------------- /images/funcdeploy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/funcdeploy.jpg -------------------------------------------------------------------------------- /images/km-openai v2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/km-openai v2.jpg 
-------------------------------------------------------------------------------- /images/km-openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/km-openai.png -------------------------------------------------------------------------------- /images/local_settings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/local_settings.jpg -------------------------------------------------------------------------------- /images/midjourney.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/midjourney.png -------------------------------------------------------------------------------- /images/oai_deployments.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/oai_deployments.jpg -------------------------------------------------------------------------------- /images/openaichoice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/openaichoice.jpg -------------------------------------------------------------------------------- /images/openaifuncapp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/openaifuncapp.jpg -------------------------------------------------------------------------------- /images/postman.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/postman.jpg -------------------------------------------------------------------------------- /images/prompt_choice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/prompt_choice.png -------------------------------------------------------------------------------- /images/redis.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/redis.jpg -------------------------------------------------------------------------------- /images/redischoice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/redischoice.jpg -------------------------------------------------------------------------------- /images/run_ingest.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/run_ingest.jpg -------------------------------------------------------------------------------- /images/search_params.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/search_params.jpg -------------------------------------------------------------------------------- /images/sec_req.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/sec_req.jpg -------------------------------------------------------------------------------- /images/sem_search.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/sem_search.jpg -------------------------------------------------------------------------------- /images/stream-client.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/stream-client.jpg -------------------------------------------------------------------------------- /images/subs_conv.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/subs_conv.jpg -------------------------------------------------------------------------------- /images/suffix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/images/suffix.jpg -------------------------------------------------------------------------------- /kb_docs_samples/Dubai Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/Dubai Brochure.pdf -------------------------------------------------------------------------------- /kb_docs_samples/Las Vegas Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/Las Vegas Brochure.pdf -------------------------------------------------------------------------------- /kb_docs_samples/London Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/London Brochure.pdf -------------------------------------------------------------------------------- /kb_docs_samples/Margies Travel Company Info.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/Margies Travel Company Info.pdf -------------------------------------------------------------------------------- /kb_docs_samples/New York Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/New York Brochure.pdf -------------------------------------------------------------------------------- /kb_docs_samples/San Francisco Brochure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MSUSAzureAccelerators/Knowledge-Mining-with-OpenAI/9b247b52c8af0f9ecf2fefc8e71bd1377b23ac6c/kb_docs_samples/San Francisco Brochure.pdf -------------------------------------------------------------------------------- /local.settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "IsEncrypted": false, 3 | "Values": { 4 | "AzureWebJobsStorage": "", 5 | "FUNCTIONS_WORKER_RUNTIME": "python", 6 | "WEBSITE_MAX_DYNAMIC_APPLICATION_SCALE_OUT":"1", 7 | "FUNCTIONS_EXTENSION_VERSION":"~4", 8 | 9 | "COG_SEARCH_ENDPOINT": "", 10 | "COG_SEARCH_ADMIN_KEY": "", 11 | "KB_INDEX_NAME":"km-openai", 12 | "KB_INDEXER_NAME":"km-openai-indexer", 13 | "KB_DATA_SOURCE_NAME":"km-openai-docs", 14 | "KB_SKILLSET_NAME":"km-openai-skills", 15 | 16 | "COG_SERV_ENDPOINT": "", 17 | "COG_SERV_KEY": "", 18 | 19 | "COSMOS_URI": "", 20 | "COSMOS_KEY": "", 21 | "COSMOS_DB_NAME": "KM_OAI_DB", 22 | "CATEGORYID": "KM_OAI_CATEGORY", 23 | 24 | "DATABASE_MODE": "0", 25 | 26 | 27 | "SERVICEBUS_CONN_STR":"", 28 | 29 | "TRANSLATION_ENDPOINT": "https://api.cognitive.microsofttranslator.com", 30 | "TRANSLATION_API_KEY": "", 31 | "TRANSLATION_LOCATION": "westeurope", 32 | 33 | "KB_BLOB_CONN_STR":"", 34 | "KB_BLOB_CONTAINER":"kmoaidemo", 35 | "OUTPUT_BLOB_CONTAINER":"kmoaiprocessed", 36 | 37 | "REDIS_ADDR": "", 38 | "REDIS_PORT": "10000", 39 | "REDIS_PASSWORD":"", 40 | "REDIS_INDEX_NAME":"acs_emb_index", 41 | "VECTOR_FIELD_IN_REDIS":"item_vector", 42 | "NUMBER_PRODUCTS_INDEX":"1000", 43 | 44 | "OPENAI_RESOURCE_ENDPOINT": "", 45 | "OPENAI_API_KEY": "", 46 | 47 | "ADA_002_EMBED_NUM_DIMS":"1536", 48 | "ADA_002_MODEL_MAX_TOKENS":"2047", 49 | "ADA_002_EMBEDDING_MODEL":"text-embedding-ada-002", 50 | "ADA_EMBEDDING_ENCODING":"cl100k_base", 51 | 52 | "DAVINCI_003_EMBED_NUM_DIMS":"12288", 53 | "DAVINCI_003_MODEL_MAX_TOKENS":"4097", 54 | "DAVINCI_003_EMB_MAX_TOKENS":"2047", 55 | "DAVINCI_003_COMPLETIONS_MODEL":"text-davinci-003", 56 | "DAVINCI_003_EMBEDDING_MODEL":"text-search-davinci-doc-001", 57 | "DAVINCI_003_QUERY_EMB_MODEL":"text-search-davinci-query-001", 58 | "DAVINCI_EMBEDDING_ENCODING":"p50k_base", 59 | 60 | 61 | "OVERLAP_TEXT":"80", 62 | "SMALL_EMB_TOKEN_NUM":"2047", 63 | "MEDIUM_EMB_TOKEN_NUM":"0", 64 | "LARGE_EMB_TOKEN_NUM":"0", 65 | "CHOSEN_EMB_MODEL":"text-embedding-ada-002", 66 | "CHOSEN_QUERY_EMB_MODEL":"text-embedding-ada-002", 67 | "CHOSEN_COMP_MODEL":"text-davinci-003", 68 | "NUM_TOP_MATCHES":"5", 69 | "MAX_OUTPUT_TOKENS":"500", 70 | "MAX_QUERY_TOKENS":"500" 71 | 72 | } 73 | } -------------------------------------------------------------------------------- /redis.yml: -------------------------------------------------------------------------------- 1 | apiVersion: '2019-12-01' 2 | location: westeurope 3 | name: 
redis-with-file-share-0001 4 | properties: 5 | containers: 6 | - name: redisearch 7 | properties: 8 | environmentVariables: 9 | - name: "REDIS_ARGS" 10 | value: "--save 60 1" 11 | image: redis/redis-stack-server:latest 12 | ports: 13 | - port: 6379 14 | resources: 15 | requests: 16 | cpu: 1.0 17 | memoryInGB: 1.5 18 | osType: Linux 19 | restartPolicy: Always 20 | ipAddress: 21 | type: Public 22 | ports: 23 | - port: 6379 24 | dnsNameLabel: aci-redis-cp1 25 | tags: {} 26 | type: Microsoft.ContainerInstance/containerGroups -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # DO NOT include azure-functions-worker in this file 2 | # The Python Worker is managed by Azure Functions platform 3 | # Manually managing azure-functions-worker may cause unexpected issues 4 | 5 | azure-functions 6 | azure-core 7 | azure-cosmos 8 | openai==0.28.1 9 | numpy 10 | requests 11 | pandas 12 | azure-storage-blob 13 | azure-identity 14 | smart_open 15 | tenacity 16 | redis 17 | tiktoken 18 | azure-storage-file-share 19 | python-dotenv 20 | azure-search-documents>=11.4.0b3 21 | azure-ai-formrecognizer 22 | beautifulsoup4 23 | lxml 24 | azure-ai-textanalytics 25 | langchain==0.0.139 26 | flask 27 | flask-socketio 28 | -------------------------------------------------------------------------------- /static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | AI Chat App 7 | 8 | 9 | 10 | 11 | 12 | 13 | 32 | 33 |
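<!-- Illustrative aside (not part of the original file): the markup of this page
     did not survive extraction. A minimal skeleton consistent with the element
     ids, classes, and handlers referenced in static/script.js and
     static/styles.css would be:

     <div id="mySidebar" class="sidebar">
       <a href="javascript:void(0)" class="closebtn" onclick="closeNav()">&times;</a>
       <h3>Search method</h3>
       <li><label><input type="radio" name="search-method" value="ccr" checked> ccr</label></li>
       <li><label><input type="radio" name="search-method" value="zs"> zs</label></li>
       <li><label><input type="radio" name="search-method" value="os"> os</label></li>
       <button class="savebtn" onclick="saveSelection()">Save</button>
     </div>

     <div id="main">
       <button class="openbtn" onclick="openNav()">&#9776; Settings</button>
       <div id="chat-container"></div>
       <div id="input-container">
         <input type="text" id="input-message" placeholder="Type a message">
         <button id="send-button">Send</button>
       </div>
     </div>
-->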
34 | 35 |
36 | 37 |
38 | 39 |
40 | 41 |
42 | 43 | 44 |
45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /static/index_old.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Chat App

4 |
5 | 6 | 7 |
8 |
9 | 10 | 11 | 50 | -------------------------------------------------------------------------------- /static/script.js: -------------------------------------------------------------------------------- 1 | var socket = io(); 2 | let selectedOption = 'ccr'; 3 | 4 | 5 | function saveSelection() { 6 | const radioButtons = document.getElementsByName('search-method'); 7 | 8 | for (let i = 0; i < radioButtons.length; i++) { 9 | if (radioButtons[i].checked) { 10 | selectedOption = radioButtons[i].value; 11 | break; 12 | } 13 | } 14 | 15 | if (selectedOption == 'os') { 16 | socket.emit('config', 'os'); 17 | } 18 | else if (selectedOption == 'ccr') { 19 | socket.emit('config', 'ccr'); 20 | } 21 | else if (selectedOption == 'zs') { 22 | socket.emit('config', 'zs'); 23 | } 24 | else { 25 | socket.emit('config', 'os'); 26 | } 27 | 28 | console.log("New Config: " + selectedOption) 29 | 30 | closeNav(); 31 | } 32 | 33 | 34 | 35 | 36 | /* Set the width of the sidebar to 250px and the left margin of the page content to 250px */ 37 | function openNav() { 38 | document.getElementById("mySidebar").style.width = "450px"; 39 | document.getElementById("main").style.marginLeft = "450px"; 40 | } 41 | 42 | /* Set the width of the sidebar to 0 and the left margin of the page content to 0 */ 43 | function closeNav() { 44 | document.getElementById("mySidebar").style.width = "0"; 45 | document.getElementById("main").style.marginLeft = "0"; 46 | } 47 | 48 | 49 | document.getElementById("send-button").addEventListener("click", function () { 50 | let message = document.getElementById("input-message").value.trim(); 51 | 52 | if (message) { 53 | // Append human message 54 | let humanBubble = document.createElement("div"); 55 | humanBubble.classList.add("chat-bubble", "human"); 56 | humanBubble.innerText = message; 57 | document.getElementById("chat-container").appendChild(humanBubble); 58 | 59 | socket.emit('message', message); 60 | 61 | // Clear input field 62 | document.getElementById("input-message").value = ""; 63 | } 64 | }); 65 | 66 | socket.on('message', (message) => { 67 | console.log(message) 68 | document.getElementById("chat-container").lastChild.innerHTML = document.getElementById("chat-container").lastChild.innerHTML + '
' + message + '

'; 69 | document.getElementById("chat-container").scrollTop = document.getElementById("chat-container").scrollHeight; 70 | }); 71 | 72 | socket.on('new_message', (message) => { 73 | console.log("Created new response bubble " + message) 74 | let aiBubble = document.createElement("div"); 75 | aiBubble.classList.add("chat-bubble", "ai"); 76 | aiBubble.innerText = '' 77 | document.getElementById("chat-container").appendChild(aiBubble); 78 | document.getElementById("chat-container").scrollTop = document.getElementById("chat-container").scrollHeight; 79 | }); 80 | 81 | socket.on('token', (message) => { 82 | console.log(message) 83 | document.getElementById("chat-container").lastChild.innerHTML = document.getElementById("chat-container").lastChild.innerHTML + message; 84 | document.getElementById("chat-container").scrollTop = document.getElementById("chat-container").scrollHeight; 85 | }); 86 | 87 | socket.on('connect', function() { 88 | console.log('Im connected! ' + selectedOption); 89 | socket.emit('config', selectedOption); 90 | }); 91 | 92 | // Send message with Enter key 93 | document.getElementById("input-message").addEventListener("keydown", function (event) { 94 | if (event.key === "Enter") { 95 | event.preventDefault(); 96 | document.getElementById("send-button").click(); 97 | } 98 | }); -------------------------------------------------------------------------------- /static/styles.css: -------------------------------------------------------------------------------- 1 | /* The sidebar menu */ 2 | .sidebar { 3 | height: 100%; /* 100% Full-height */ 4 | width: 0; /* 0 width - change this with JavaScript */ 5 | position: fixed; /* Stay in place */ 6 | z-index: 1; /* Stay on top */ 7 | top: 0; 8 | left: 0; 9 | background-color: #111; /* Black*/ 10 | overflow-x: hidden; /* Disable horizontal scroll */ 11 | padding-top: 60px; /* Place content 60px from the top */ 12 | transition: 0.5s; /* 0.5 second transition effect to slide in the sidebar */ 13 | } 14 | 15 | /* The sidebar links */ 16 | 17 | .sidebar h3 { 18 | padding: 8px 8px 8px 32px; 19 | text-decoration: none; 20 | font-size: 18px; 21 | color: #818181; 22 | display: block; 23 | transition: 0.3s; 24 | } 25 | 26 | 27 | .sidebar li { 28 | padding: 8px 8px 8px 32px; 29 | text-decoration: none; 30 | font-size: 16px; 31 | color: #818181; 32 | display: block; 33 | transition: 0.3s; 34 | } 35 | 36 | /* When you mouse over the navigation links, change their color */ 37 | .sidebar a:hover { 38 | color: #f1f1f1; 39 | } 40 | 41 | /* Position and style the close button (top right corner) */ 42 | .sidebar .closebtn { 43 | position: absolute; 44 | top: 0; 45 | right: 25px; 46 | font-size: 36px; 47 | margin-left: 50px; 48 | } 49 | 50 | .sidebar .savebtn { 51 | font-size: 15px; 52 | cursor: pointer; 53 | background-color: #8b8b8b; 54 | color: white; 55 | padding: 10px 15px; 56 | border: none; 57 | margin-left: 70px; 58 | } 59 | 60 | 61 | /* The button used to open the sidebar */ 62 | .openbtn { 63 | font-size: 15px; 64 | cursor: pointer; 65 | background-color: #111; 66 | color: white; 67 | padding: 10px 15px; 68 | border: none; 69 | } 70 | 71 | .openbtn:hover { 72 | background-color: #444; 73 | } 74 | 75 | /* Style page content - use this if you want to push the page content to the right when you open the side navigation */ 76 | #main { 77 | transition: margin-left .5s; /* If you want a transition effect */ 78 | padding: 20px; 79 | } 80 | 81 | /* On smaller screens, where height is less than 450px, change the style of the sidenav (less padding and a 
smaller font size) */ 82 | @media screen and (max-height: 450px) { 83 | .sidebar {padding-top: 15px;} 84 | .sidebar a {font-size: 18px;} 85 | } 86 | 87 | 88 | body { 89 | font-family: Arial, sans-serif; 90 | background-color: #f5f5f5; 91 | margin: 0; 92 | padding: 0; 93 | display: flex; 94 | flex-direction: column; 95 | height: 100vh; 96 | } 97 | 98 | #chat-container { 99 | flex-grow: 1; 100 | overflow-y: auto; 101 | padding: 20px; 102 | background-color: #fff; 103 | border-radius: 5px; 104 | box-shadow: 0 3px 10px rgba(0, 0, 0, 0.1); 105 | } 106 | 107 | .chat-bubble { 108 | padding: 10px 20px; 109 | background-color: #f1f1f1; 110 | border-radius: 20px; 111 | margin-bottom: 10px; 112 | max-width: 80%; 113 | width: 100%; 114 | display: inline-block; 115 | clear: both; 116 | } 117 | 118 | .human { 119 | background-color: #e0f7fa; 120 | float: left; 121 | } 122 | 123 | .ai { 124 | background-color: #ffeb3b; 125 | float: right; 126 | } 127 | 128 | #input-container { 129 | display: flex; 130 | justify-content: center; 131 | margin-top: 20px; 132 | padding: 10px; 133 | background-color: #ffffff; 134 | border-top: 1px solid #ccc; 135 | } 136 | 137 | #input-message { 138 | width: 100%; 139 | padding: 10px; 140 | border-radius: 5px; 141 | border: 1px solid #ccc; 142 | outline: none; 143 | } 144 | 145 | #send-button { 146 | background-color: #4caf50; 147 | color: #fff; 148 | border: none; 149 | padding: 10px 20px; 150 | margin-left: 10px; 151 | border-radius: 5px; 152 | cursor: pointer; 153 | font-size: 16px; 154 | } -------------------------------------------------------------------------------- /utils/bot_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import tiktoken 5 | import json 6 | import logging 7 | 8 | 9 | from utils import language 10 | from utils import storage 11 | from utils import redis_helpers 12 | from utils import openai_helpers 13 | from utils import cosmos_helpers 14 | from utils import km_agents 15 | 16 | from utils.env_vars import * 17 | 18 | 19 | redis_conn = redis_helpers.get_new_conn() 20 | 21 | 22 | 23 | 24 | def openai_interrogate_text(query, session_id=None, filter_param=None, agent_name=None, params_dict={}): 25 | 26 | lang = language.detect_content_language(query) 27 | if lang != 'en': query = language.translate(query, lang, 'en') 28 | 29 | if (agent_name is None) or (agent_name not in ['zs', 'ccr', 'os']): 30 | agent_name = 'zs' 31 | 32 | agent = km_agents.KMOAI_Agent(agent_name = agent_name, params_dict=params_dict, verbose = False) 33 | 34 | 35 | final_answer, sources, likely_sources, session_id = agent.run(query, prompt_id=session_id, filter_param=filter_param, redis_conn=redis_conn) 36 | 37 | if lang != 'en': 38 | final_answer = language.translate(final_answer, 'en', lang) 39 | 40 | sources_str = ', '.join(sources) 41 | 42 | ret_dict = { 43 | "link": sources_str, 44 | "likely_links": likely_sources, 45 | "answer": final_answer, 46 | "context": '', 47 | "session_id": session_id 48 | } 49 | 50 | return json.dumps(ret_dict, indent=4) 51 | -------------------------------------------------------------------------------- /utils/cogvecsearch_helpers/cogsearch_vecstore.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import os 3 | import logging 4 | import json 5 | import copy 6 | 7 | 8 | from utils import helpers 9 | from utils import http_helpers 10 | import utils.cogvecsearch_helpers.cs_json 11 | from utils 
import openai_helpers 12 | 13 | from utils.env_vars import * 14 | from utils import kb_doc 15 | from utils import cv_helpers 16 | 17 | import re 18 | 19 | 20 | class CogSearchVecStore: 21 | 22 | def __init__(self, api_key = COG_SEARCH_ADMIN_KEY, 23 | search_service_name = COG_SEARCH_ENDPOINT, 24 | index_name = COG_VECSEARCH_VECTOR_INDEX, 25 | api_version = COG_VEC_SEARCH_API_VERSION, 26 | load_addtl_fields = True): 27 | 28 | 29 | self.http_req = http_helpers.CogSearchHttpRequest(api_key, search_service_name, index_name, api_version) 30 | self.index_name = index_name 31 | self.all_fields = ['id', 'text', 'text_en', 'categoryId'] 32 | self.search_types = ['vector', 'hybrid', 'semantic_hybrid'] 33 | 34 | self.addtl_fields = [] 35 | 36 | if load_addtl_fields: 37 | self.addtl_fields += list(kb_doc.KB_Doc().get_fields() - ['text', 'text_en', VECTOR_FIELD_IN_REDIS, 'id', 'cv_image_vector', 'cv_text_vector']) 38 | self.all_fields += self.addtl_fields 39 | 40 | 41 | 42 | def create_index(self): 43 | 44 | index_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.create_index_json) 45 | index_dict['name'] = self.index_name 46 | 47 | for f in self.addtl_fields: 48 | field_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.field_json) 49 | field_dict['name'] = f 50 | index_dict['fields'].append(field_dict) 51 | 52 | self.http_req.put(body = index_dict) 53 | 54 | 55 | def get_index(self): 56 | return self.http_req.get() 57 | 58 | 59 | def delete_index(self): 60 | return self.http_req.delete() 61 | 62 | 63 | def upload_documents(self, documents): 64 | 65 | docs_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.upload_docs_json) 66 | 67 | for doc in documents: 68 | doc_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.upload_doc_json) 69 | 70 | for k in self.all_fields: 71 | doc_dict[k] = doc.get(k, '') 72 | 73 | doc_dict['id'] = doc['id'] if doc.get('id', None) else str(uuid.uuid4()) 74 | doc_dict[VECTOR_FIELD_IN_REDIS] = doc.get(VECTOR_FIELD_IN_REDIS, [0]*1024) 75 | doc_dict['cv_image_vector'] = doc.get('cv_image_vector',[0]*1024) 76 | doc_dict['cv_text_vector'] = doc.get('cv_text_vector', [0]*1024) 77 | doc_dict["@search.action"] = "upload" 78 | docs_dict['value'].append(doc_dict) 79 | 80 | self.http_req.post(op ='index', body = docs_dict) 81 | 82 | return docs_dict 83 | 84 | 85 | def delete_documents(self, op='index', ids = []): 86 | docs_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.upload_docs_json) 87 | 88 | for i in ids: 89 | doc_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.upload_doc_json) 90 | doc_dict['id'] = i 91 | doc_dict[VECTOR_FIELD_IN_REDIS] = [0] * openai_helpers.get_model_dims(CHOSEN_EMB_MODEL) 92 | doc_dict["@search.action"] = "delete" 93 | docs_dict['value'].append(doc_dict) 94 | 95 | self.http_req.post(op ='index', body = docs_dict) 96 | 97 | 98 | 99 | def get_search_json(self, query, search_type = 'vector'): 100 | if search_type == 'vector': 101 | query_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.search_dict_vector) 102 | elif search_type == 'hybrid': 103 | query_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.search_dict_hybrid) 104 | query_dict['search'] = query 105 | elif search_type == 'semantic_hybrid': 106 | query_dict = copy.deepcopy(utils.cogvecsearch_helpers.cs_json.search_dict_semantic_hybrid) 107 | query_dict['search'] = query 108 | return query_dict 109 | 110 | 111 | def get_vector_fields(self, query, query_dict, vector_name = None): 112 | if (vector_name is None) or (vector_name == VECTOR_FIELD_IN_REDIS): 113 
| completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 114 | embedding_enc = openai_helpers.get_encoder(CHOSEN_EMB_MODEL) 115 | query_dict['vector']['fields'] = VECTOR_FIELD_IN_REDIS 116 | query = embedding_enc.decode(embedding_enc.encode(query)[:MAX_QUERY_TOKENS]) 117 | query_dict['vector']['value'] = openai_helpers.get_openai_embedding(query, CHOSEN_EMB_MODEL) 118 | elif vector_name == 'cv_text_vector': 119 | cvr = cv_helpers.CV() 120 | query_dict['vector']['fields'] = vector_name 121 | query_dict['vector']['value'] = cvr.get_text_embedding(query) 122 | elif vector_name == 'cv_image_vector': 123 | cvr = cv_helpers.CV() 124 | query_dict['vector']['fields'] = vector_name 125 | query_dict['vector']['value'] = cvr.get_img_embedding(query) 126 | else: 127 | raise Exception(f'Invalid Vector Name {vector_name}') 128 | 129 | return query_dict 130 | 131 | 132 | 133 | def search(self, query, search_type = 'vector', vector_name = None, select=None, filter=None, verbose=False): 134 | 135 | if search_type not in self.search_types: 136 | raise Exception(f"search_type must be one of {self.search_types}") 137 | 138 | regex = r"(https?:\/\/[^\/\s]+(?:\/[^\/\s]+)*\/[^?\/\s]+(?:\.jpg|\.jpeg|\.png)(?:\?[^\s'\"]+)?)" 139 | match = re.search(regex, query) 140 | 141 | if match: 142 | sas_url = match.group(1) 143 | cvr = cv_helpers.CV() 144 | res = cvr.analyze_image(img_url=sas_url) 145 | query = query.replace(sas_url, '') + '\n' + res['text'] 146 | 147 | query_dict = self.get_search_json(query, search_type) 148 | query_dict = self.get_vector_fields(query, query_dict, vector_name) 149 | query_dict['vector']['k'] = NUM_TOP_MATCHES 150 | query_dict['filter'] = filter 151 | query_dict['select'] = ', '.join(self.all_fields) if select is None else select 152 | 153 | results = self.http_req.post(op ='search', body = query_dict) 154 | results = results['value'] 155 | if verbose: [print(r['@search.score']) for r in results] 156 | 157 | 158 | if match: 159 | sas_url = match.group(1) 160 | query_dict = self.get_vector_fields(sas_url, query_dict, 'cv_image_vector') 161 | img_results = self.http_req.post(op ='search', body = query_dict) 162 | results = [img_results['value'], results] 163 | 164 | max_items = max([len(r) for r in results]) 165 | 166 | final_context = [] 167 | context_dict = {} 168 | 169 | for i in range(max_items): 170 | for j in range(len(results)): 171 | if i < len(results[j]): 172 | if results[j][i]['id'] not in context_dict: 173 | context_dict[results[j][i]['id']] = 1 174 | final_context.append(results[j][i]) 175 | 176 | results = final_context 177 | 178 | context = helpers.process_search_results(results) 179 | 180 | if match: 181 | return ['Analysis of the image in the question: ' + query + '\n\n'] + context 182 | else: 183 | return context 184 | 185 | 186 | 187 | def search_similar_images(self, query, select=None, filter=None, verbose=False): 188 | 189 | search_type = 'vector' 190 | vector_name = 'cv_image_vector' 191 | 192 | if search_type not in self.search_types: 193 | raise Exception(f"search_type must be one of {self.search_types}") 194 | 195 | regex = r"(https?:\/\/[^\/\s]+(?:\/[^\/\s]+)*\/[^?\/\s]+(?:\.jpg|\.jpeg|\.png)(?:\?[^\s'\"]+)?)" 196 | match = re.search(regex, query) 197 | 198 | if match: 199 | url = match.group(1) 200 | query_dict = self.get_search_json(url, search_type) 201 | query_dict = self.get_vector_fields(url, query_dict, vector_name) 202 | query_dict['vector']['k'] = NUM_TOP_MATCHES 203 | query_dict['filter'] = filter 204 | query_dict['select'] = ', 
'.join(self.all_fields) if select is None else select 205 | 206 | results = self.http_req.post(op ='search', body = query_dict) 207 | results = results['value'] 208 | if verbose: [print(r['@search.score']) for r in results] 209 | 210 | context = helpers.process_search_results(results) 211 | 212 | return context 213 | 214 | else: 215 | return ["Sorry, no similar images have been found"] -------------------------------------------------------------------------------- /utils/cogvecsearch_helpers/cs_json.py: -------------------------------------------------------------------------------- 1 | 2 | from utils.env_vars import * 3 | 4 | 5 | field_json = { 6 | "name": "", 7 | "type": "Edm.String", 8 | "searchable": True, 9 | "filterable": True, 10 | "retrievable": True, 11 | "sortable": True, 12 | "facetable": True, 13 | "key": False, 14 | "indexAnalyzer": None, 15 | "searchAnalyzer": None, 16 | "analyzer": None, 17 | "normalizer": None, 18 | "dimensions": None, 19 | "vectorSearchConfiguration": None, 20 | "synonymMaps": [] 21 | } 22 | 23 | 24 | vector_json = { 25 | "name": "vector", 26 | "type": "Collection(Edm.Single)", 27 | "searchable": True, 28 | "filterable": False, 29 | "retrievable": True, 30 | "sortable": False, 31 | "facetable": False, 32 | "key": False, 33 | "indexAnalyzer": None, 34 | "searchAnalyzer": None, 35 | "analyzer": None, 36 | "normalizer": None, 37 | "dimensions": 1536, 38 | "vectorSearchConfiguration": "vector-config", 39 | "synonymMaps": [] 40 | } 41 | 42 | 43 | create_index_json = { 44 | "@odata.context": "https://cogvecseearch.search.windows.net/$metadata#indexes/$entity", 45 | "@odata.etag": "\"0x8DB40C97F04622D\"", 46 | "name": "vec-index", 47 | "defaultScoringProfile": None, 48 | "fields": [ 49 | { 50 | "name": "id", 51 | "type": "Edm.String", 52 | "searchable": True, 53 | "filterable": True, 54 | "retrievable": True, 55 | "sortable": True, 56 | "facetable": True, 57 | "key": True, 58 | "indexAnalyzer": None, 59 | "searchAnalyzer": None, 60 | "analyzer": None, 61 | "normalizer": None, 62 | "dimensions": None, 63 | "vectorSearchConfiguration": None, 64 | "synonymMaps": [] 65 | }, 66 | { 67 | "name": "text", 68 | "type": "Edm.String", 69 | "searchable": True, 70 | "filterable": True, 71 | "retrievable": True, 72 | "sortable": True, 73 | "facetable": True, 74 | "key": False, 75 | "indexAnalyzer": None, 76 | "searchAnalyzer": None, 77 | "analyzer": None, 78 | "normalizer": None, 79 | "dimensions": None, 80 | "vectorSearchConfiguration": None, 81 | "synonymMaps": [] 82 | }, 83 | { 84 | "name": "text_en", 85 | "type": "Edm.String", 86 | "searchable": True, 87 | "filterable": True, 88 | "retrievable": True, 89 | "sortable": True, 90 | "facetable": True, 91 | "key": False, 92 | "indexAnalyzer": None, 93 | "searchAnalyzer": None, 94 | "analyzer": None, 95 | "normalizer": None, 96 | "dimensions": None, 97 | "vectorSearchConfiguration": None, 98 | "synonymMaps": [] 99 | }, 100 | { 101 | "name": "categoryId", 102 | "type": "Edm.String", 103 | "searchable": True, 104 | "filterable": True, 105 | "retrievable": True, 106 | "sortable": True, 107 | "facetable": True, 108 | "key": False, 109 | "indexAnalyzer": None, 110 | "searchAnalyzer": None, 111 | "analyzer": None, 112 | "normalizer": None, 113 | "dimensions": None, 114 | "vectorSearchConfiguration": None, 115 | "synonymMaps": [] 116 | }, 117 | { 118 | "name": VECTOR_FIELD_IN_REDIS, 119 | "type": "Collection(Edm.Single)", 120 | "searchable": True, 121 | "filterable": False, 122 | "retrievable": True, 123 | "sortable": False, 124 | 
"facetable": False, 125 | "key": False, 126 | "indexAnalyzer": None, 127 | "searchAnalyzer": None, 128 | "analyzer": None, 129 | "normalizer": None, 130 | "dimensions": 1536, 131 | "vectorSearchConfiguration": "vector-config", 132 | "synonymMaps": [] 133 | }, 134 | { 135 | "name": 'cv_image_vector', 136 | "type": "Collection(Edm.Single)", 137 | "searchable": True, 138 | "filterable": False, 139 | "retrievable": True, 140 | "sortable": False, 141 | "facetable": False, 142 | "key": False, 143 | "indexAnalyzer": None, 144 | "searchAnalyzer": None, 145 | "analyzer": None, 146 | "normalizer": None, 147 | "dimensions": 1024, 148 | "vectorSearchConfiguration": "vector-config", 149 | "synonymMaps": [] 150 | }, 151 | { 152 | "name": 'cv_text_vector', 153 | "type": "Collection(Edm.Single)", 154 | "searchable": True, 155 | "filterable": False, 156 | "retrievable": True, 157 | "sortable": False, 158 | "facetable": False, 159 | "key": False, 160 | "indexAnalyzer": None, 161 | "searchAnalyzer": None, 162 | "analyzer": None, 163 | "normalizer": None, 164 | "dimensions": 1024, 165 | "vectorSearchConfiguration": "vector-config", 166 | "synonymMaps": [] 167 | } 168 | ], 169 | "scoringProfiles": [], 170 | "corsOptions": { 171 | "allowedOrigins": [ 172 | "*" 173 | ], 174 | "maxAgeInSeconds": 60 175 | }, 176 | "suggesters": [], 177 | "analyzers": [], 178 | "normalizers": [], 179 | "tokenizers": [], 180 | "tokenFilters": [], 181 | "charFilters": [], 182 | "encryptionKey": None, 183 | "similarity": { 184 | "@odata.type": "#Microsoft.Azure.Search.BM25Similarity", 185 | "k1": None, 186 | "b": None 187 | }, 188 | "semantic": { 189 | "defaultConfiguration": None, 190 | "configurations": [ 191 | { 192 | "name": "semantic-config", 193 | "prioritizedFields": { 194 | "prioritizedContentFields": [ 195 | { 196 | "fieldName": "text_en" 197 | } 198 | ], 199 | "prioritizedKeywordsFields": [ 200 | { 201 | "fieldName": "categoryId" 202 | } 203 | ] 204 | } 205 | } 206 | ] 207 | }, 208 | "vectorSearch": { 209 | "algorithmConfigurations": [ 210 | { 211 | "name": "vector-config", 212 | "kind": "hnsw", 213 | "hnswParameters": { 214 | "m": 10, 215 | "efConstruction": 400, 216 | "metric": "cosine" 217 | } 218 | } 219 | ] 220 | } 221 | } 222 | 223 | 224 | upload_doc_json = { 225 | "id": "", 226 | "text": "", 227 | "text_en": "", 228 | "categoryId": "", 229 | VECTOR_FIELD_IN_REDIS: [], 230 | "@search.action": "upload" 231 | } 232 | 233 | upload_docs_json = { 234 | "value": [ 235 | ] 236 | } 237 | 238 | 239 | 240 | search_dict_vector = { 241 | "vector": { 242 | "value": [], 243 | "fields": VECTOR_FIELD_IN_REDIS, 244 | "k": NUM_TOP_MATCHES 245 | }, 246 | "select": "*", 247 | "filter": None 248 | } 249 | 250 | 251 | search_dict_hybrid = { 252 | "vector": { 253 | "value": [], 254 | "fields": VECTOR_FIELD_IN_REDIS, 255 | "k": 10 256 | }, 257 | "search": "", 258 | "select": "*", 259 | "top": f"{NUM_TOP_MATCHES}", 260 | "filter": None 261 | } 262 | 263 | 264 | search_dict_semantic_hybrid= { 265 | "vector": { 266 | "value": [], 267 | "fields": VECTOR_FIELD_IN_REDIS, 268 | "k": NUM_TOP_MATCHES, 269 | }, 270 | "search": "", 271 | "select":"*", 272 | "queryType": "semantic", 273 | "semanticConfiguration": "semantic-config", 274 | "queryLanguage": "en-us", 275 | "captions": "extractive", 276 | "answers": "extractive", 277 | "top": f"{NUM_TOP_MATCHES*3}", 278 | "filter": None 279 | } 280 | 281 | -------------------------------------------------------------------------------- /utils/cosmos_helpers.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | import re 4 | import azure.functions as func 5 | import os 6 | from azure.cosmos import CosmosClient, PartitionKey 7 | import urllib 8 | import numpy as np 9 | import uuid 10 | import copy 11 | from datetime import datetime, timedelta 12 | 13 | from utils import redis_helpers 14 | from utils.env_vars import * 15 | 16 | 17 | try: 18 | 19 | if DATABASE_MODE == 1: 20 | client = CosmosClient(url=COSMOS_URI, credential=COSMOS_KEY) 21 | partitionKeyPath = PartitionKey(path="/categoryId") 22 | database = client.create_database_if_not_exists(id=COSMOS_DB_NAME) 23 | 24 | def init_container(): 25 | 26 | indexing_policy={ "includedPaths":[{ "path":"/*"}], "excludedPaths":[{ "path":"/\"_etag\"/?"},{ "path":f"/{VECTOR_FIELD_IN_REDIS}/?"}]} 27 | 28 | try: 29 | container = database.create_container_if_not_exists(id="documents", partition_key=partitionKeyPath,indexing_policy=indexing_policy) 30 | except: 31 | try: 32 | container = database.create_container_if_not_exists(id="documents", partition_key=partitionKeyPath,indexing_policy=indexing_policy) 33 | 34 | except Exception as e: 35 | logging.error(f"Encountered error {e} while creating the container") 36 | print(f"Encountered error {e} while creating the container") 37 | 38 | return container 39 | 40 | container = init_container() 41 | 42 | except: 43 | print("Failed to initialize Cosmos DB container") 44 | logging.error("Failed to initialize Cosmos DB container") 45 | 46 | 47 | 48 | def cosmos_restore_embeddings(): 49 | QUERY = "SELECT * FROM documents p WHERE p.categoryId = @categoryId" 50 | params = [dict(name="@categoryId", value=EMBCATEGORYID)] 51 | 52 | embeddings = container.query_items(query=QUERY, parameters=params, enable_cross_partition_query=False) 53 | 54 | redis_conn = redis_helpers.get_new_conn() 55 | counter = 0 56 | 57 | try: 58 | for e in embeddings: 59 | counter += redis_helpers.redis_upsert_embedding(redis_conn, e) 60 | 61 | except Exception as e: 62 | print("No Documents found") 63 | 64 | logging.info(f"Loaded {counter} embeddings from Cosmos into Redis") 65 | print(f"Loaded {counter} embeddings from Cosmos into Redis") 66 | 67 | 68 | 69 | def cosmos_backup_embeddings(emb_documents): 70 | 71 | ret_dict = {} 72 | 73 | try: 74 | for e in emb_documents: 75 | #e[VECTOR_FIELD_IN_REDIS] = np.array(e[VECTOR_FIELD_IN_REDIS]).astype(np.float32).tobytes() 76 | e['categoryId'] = EMBCATEGORYID 77 | container.upsert_item(e) 78 | 79 | ret_dict['status'] = f"Successfully loaded {len(emb_documents)} embedding documents into Cosmos" 80 | 81 | except Exception as e: 82 | logging.error(e) 83 | print(e) 84 | ret_dict['status'] = f"Failed loading {len(emb_documents)} embeddings into Cosmos: {e}" 85 | 86 | return ret_dict 87 | 88 | 89 | 90 | 91 | def cosmos_store_contents(data_dict): 92 | ret_dict = {} 93 | 94 | new_doc = copy.deepcopy(data_dict) 95 | 96 | new_doc['id'] = new_doc.get('id', str(uuid.uuid4())) 97 | new_doc['categoryId'] = CATEGORYID 98 | new_doc['timestamp'] = new_doc.get('timestamp', datetime.now().strftime("%m/%d/%Y, %H:%M:%S")), 99 | new_doc['doc_url'] = new_doc.get('doc_url', f'https://microsoft.com/{str(uuid.uuid4())}') 100 | 101 | if 'content' in new_doc.keys(): 102 | del new_doc['content'] 103 | 104 | try: 105 | container.upsert_item(new_doc) 106 | ret_dict['status'] = f"Document {new_doc['id']} was successfully inserted into Cosmos" 107 | except Exception as e: 108 | logging.error(e) 109 | print(e) 110 | 
ret_dict['status'] = f"Document {new_doc['id']} failed to be inserted into Cosmos: {e}" 111 | 112 | return ret_dict 113 | 114 | 115 | 116 | # def cosmos_download_contents(): 117 | # QUERY = "SELECT * FROM documents p WHERE p.categoryId = @categoryId" 118 | # params = [dict(name="@categoryId", value=CATEGORYID)] 119 | 120 | # contents = container.query_items(query=QUERY, parameters=params, enable_cross_partition_query=False, max_item_count=10) 121 | # counter = 0 122 | 123 | # try: 124 | # for c in contents: 125 | # #counter += redis_helpers.redis_upsert_embedding(redis_conn, e) 126 | # # print(c) 127 | # yield self._parse_entry(item_dict) 128 | 129 | # except Exception as e: 130 | # print("No Documents found") 131 | 132 | # logging.info(f"Loaded {counter} embeddings from Cosmos into Redis") 133 | # print(f"Loaded {counter} embeddings from Cosmos into Redis") -------------------------------------------------------------------------------- /utils/cv_helpers.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import os 3 | import logging 4 | import json 5 | import copy 6 | 7 | 8 | from utils import http_helpers 9 | from utils import openai_helpers 10 | 11 | from utils.env_vars import * 12 | 13 | 14 | 15 | 16 | class CV: 17 | 18 | def __init__(self, api_key = COG_SERV_KEY, 19 | cog_serv_name = COG_SERV_ENDPOINT, 20 | api_version = CV_API_VERSION): 21 | 22 | 23 | self.http_req = http_helpers.CVHttpRequest(api_key, cog_serv_name, api_version) 24 | 25 | 26 | 27 | def process_json(self, img_url, response): 28 | res = {} 29 | 30 | res['main_caption'] = response['captionResult']['text'] 31 | res['tags'] = [tag['name'] for tag in response['tagsResult']['values']] 32 | res['ocr'] = response['readResult']['content'] 33 | res['captions'] = [caption['text'] for caption in response['denseCaptionsResult']['values']] 34 | 35 | res['text'] = f"[{img_url}] This is an image. 
Main Caption: {res['main_caption']}\nOCR: {res['ocr']}\nDense Captions: {', '.join(res['captions'])}\nTags: {', '.join(res['tags'])}" 36 | 37 | return res 38 | 39 | 40 | 41 | def analyze_image(self, img_url = None, filename = None): 42 | 43 | if filename is not None: 44 | 45 | with open(filename, 'rb') as f: 46 | data = f.read() 47 | response = self.http_req.post(op='analyze', data=data) 48 | 49 | else: 50 | response = self.http_req.post(op='analyze', headers=self.http_req.json_headers, body={'url': img_url}) 51 | 52 | response = self.process_json(img_url, response) 53 | 54 | return response 55 | 56 | 57 | def get_img_embedding(self, img_url = None, filename = None): 58 | 59 | if filename is not None: 60 | with open(filename, 'rb') as f: 61 | data = f.read() 62 | 63 | response = self.http_req.post(op='img_embedding', data=data) 64 | else: 65 | 66 | response = self.http_req.post(op='img_embedding', headers=self.http_req.json_headers, body={'url': img_url}) 67 | 68 | try: 69 | return response['vector'] 70 | except: 71 | return None 72 | 73 | 74 | 75 | def get_text_embedding(self, text): 76 | response = self.http_req.post(op='text_embedding', headers=self.http_req.json_headers, body={'text': text}) 77 | 78 | try: 79 | return response['vector'] 80 | except: 81 | return None -------------------------------------------------------------------------------- /utils/env_vars.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ########################### 4 | ## Configuration Options ## 5 | ########################### 6 | 7 | CHOSEN_COMP_MODEL = os.environ.get("CHOSEN_COMP_MODEL", "gpt-35-turbo") 8 | CHOSEN_EMB_MODEL = os.environ.get("CHOSEN_EMB_MODEL", "text-embedding-ada-002") 9 | MAX_OUTPUT_TOKENS = int(os.environ.get("MAX_OUTPUT_TOKENS", "2000")) 10 | MAX_HISTORY_TOKENS = int(os.environ.get("MAX_HISTORY_TOKENS", "1000")) 11 | MAX_SEARCH_TOKENS = int(os.environ.get("MAX_SEARCH_TOKENS", "2500")) 12 | MAX_QUERY_TOKENS = int(os.environ.get("MAX_QUERY_TOKENS", "500")) 13 | PRE_CONTEXT = int(os.environ.get("PRE_CONTEXT", "500")) 14 | NUM_TOP_MATCHES = int(os.environ.get("NUM_TOP_MATCHES", "3")) 15 | 16 | OVERLAP_TEXT = int(os.environ.get("OVERLAP_TEXT", "150")) 17 | SMALL_EMB_TOKEN_NUM = int(os.environ.get("SMALL_EMB_TOKEN_NUM", "0")) 18 | MEDIUM_EMB_TOKEN_NUM = int(os.environ.get("MEDIUM_EMB_TOKEN_NUM", "800")) 19 | LARGE_EMB_TOKEN_NUM = int(os.environ.get("LARGE_EMB_TOKEN_NUM", "0")) 20 | X_LARGE_EMB_TOKEN_NUM = int(os.environ.get("X_LARGE_EMB_TOKEN_NUM", "0")) 21 | 22 | USE_BING = os.environ.get("USE_BING", "no") 23 | LIST_OF_COMMA_SEPARATED_URLS = os.environ.get("LIST_OF_COMMA_SEPARATED_URLS", "") 24 | 25 | CONVERSATION_TTL_SECS = int(os.environ.get("CONVERSATION_TTL_SECS", "172800")) 26 | 27 | USE_COG_VECSEARCH = int(os.environ.get("USE_COG_VECSEARCH", "1")) 28 | 29 | DATABASE_MODE = int(os.environ.get("DATABASE_MODE", "1")) 30 | 31 | USE_REDIS_CACHE = int(os.environ.get("USE_REDIS_CACHE", "0")) 32 | 33 | PROCESS_IMAGES = int(os.environ.get("PROCESS_IMAGES", "0")) 34 | 35 | 36 | 37 | 38 | ######################## 39 | ## Endpoints and Keys ## 40 | ######################## 41 | 42 | COG_SEARCH_ENDPOINT = os.environ.get("COG_SEARCH_ENDPOINT", "") 43 | COG_SEARCH_ADMIN_KEY = os.environ.get("COG_SEARCH_ADMIN_KEY", "") 44 | COG_SEARCH_CUSTOM_FUNC = os.environ.get("COG_SEARCH_CUSTOM_FUNC", "") 45 | 46 | COG_SERV_ENDPOINT = os.environ.get("COG_SERV_ENDPOINT", "") 47 | COG_SERV_KEY = os.environ.get("COG_SERV_KEY", "") 48 | 49 | OPENAI_RESOURCE_ENDPOINT 
= os.environ.get("OPENAI_RESOURCE_ENDPOINT", "") 50 | OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "") 51 | 52 | KB_BLOB_CONN_STR = os.environ.get("KB_BLOB_CONN_STR", "") 53 | 54 | COSMOS_URI = os.environ.get("COSMOS_URI", "") 55 | COSMOS_KEY = os.environ.get("COSMOS_KEY", "") 56 | 57 | SERVICEBUS_CONN_STR = os.environ.get("SERVICEBUS_CONN_STR", "") 58 | 59 | REDIS_ADDR = os.environ.get("REDIS_ADDR", "") 60 | REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "") 61 | REDIS_PORT = os.environ.get("REDIS_PORT", "10000") 62 | 63 | BING_SUBSCRIPTION_KEY = os.environ.get("BING_SUBSCRIPTION_KEY", "") 64 | BING_SEARCH_URL = os.environ.get("BING_SEARCH_URL", "https://api.bing.microsoft.com/v7.0/search") 65 | 66 | TRANSLATION_ENDPOINT = os.environ.get("TRANSLATION_ENDPOINT", "https://api.cognitive.microsofttranslator.com") 67 | TRANSLATION_API_KEY = os.environ.get("TRANSLATION_API_KEY", "ad8ac9b95ba94b79ba37d43cdc0c606c") 68 | TRANSLATION_LOCATION = os.environ.get("TRANSLATION_LOCATION", "westeurope") 69 | 70 | #if TRANSLATION_API_KEY == "": TRANSLATION_API_KEY = COG_SERV_KEY 71 | 72 | 73 | ################### 74 | ## OpenAI Params ## 75 | ################### 76 | 77 | import openai 78 | 79 | 80 | OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION", "2023-03-15-preview") 81 | openai.api_type = "azure" 82 | openai.api_key = OPENAI_API_KEY 83 | openai.api_base = OPENAI_RESOURCE_ENDPOINT 84 | openai.api_version = OPENAI_API_VERSION 85 | 86 | 87 | 88 | ############################# 89 | ## Cognitive Search Params ## 90 | ############################# 91 | 92 | KB_FIELDS_CONTENT = os.environ.get("KB_FIELDS_CONTENT", "content") 93 | KB_FIELDS_CATEGORY = os.environ.get("KB_FIELDS_CATEGORY", "category") 94 | KB_FIELDS_SOURCEFILE = os.environ.get("KB_FIELDS_SOURCEFILE", "sourcefile") 95 | KB_FIELDS_CONTAINER = os.environ.get("KB_FIELDS_CONTAINER", "container") 96 | KB_FIELDS_FILENAME = os.environ.get("KB_FIELDS_FILENAME", "filename") 97 | KB_SEM_INDEX_NAME = os.environ.get("KB_SEM_INDEX_NAME", "km-openai-sem") 98 | COG_VEC_SEARCH_API_VERSION = os.environ.get("COG_VEC_SEARCH_API_VERSION", "2023-07-01-Preview") 99 | COG_VECSEARCH_VECTOR_INDEX = os.environ.get("COG_VECSEARCH_VECTOR_INDEX", "vec-index") 100 | 101 | 102 | 103 | ############################ 104 | ## Defaults and Constants ## 105 | ############################ 106 | 107 | AzureWebJobsStorage = os.environ.get("AzureWebJobsStorage", KB_BLOB_CONN_STR) 108 | AzureWebJobsDashboard = os.environ.get("AzureWebJobsDashboard", KB_BLOB_CONN_STR) 109 | FUNCTIONS_EXTENSION_VERSION = os.environ.get("FUNCTIONS_EXTENSION_VERSION", "~4") 110 | FUNCTIONS_WORKER_RUNTIME = os.environ.get("FUNCTIONS_WORKER_RUNTIME", "python") 111 | WEBSITE_MAX_DYNAMIC_APPLICATION_SCALE_OUT = int(os.environ.get("WEBSITE_MAX_DYNAMIC_APPLICATION_SCALE_OUT", "1")) 112 | KB_INDEX_NAME = os.environ.get("KB_INDEX_NAME", "km-openai") 113 | KB_INDEXER_NAME = os.environ.get("KB_INDEXER_NAME", "km-openai-indexer") 114 | KB_DATA_SOURCE_NAME = os.environ.get("KB_DATA_SOURCE_NAME", "km-openai-docs") 115 | KB_SKILLSET_NAME = os.environ.get("KB_SKILLSET_NAME", "km-openai-skills") 116 | REDIS_INDEX_NAME = os.environ.get("REDIS_INDEX_NAME", "acs_emb_index") 117 | VECTOR_FIELD_IN_REDIS = os.environ.get("VECTOR_FIELD_IN_REDIS", "item_vector") 118 | NUMBER_PRODUCTS_INDEX = int(os.environ.get("NUMBER_PRODUCTS_INDEX", "1000")) 119 | CATEGORYID = os.environ.get("CATEGORYID", "KM_OAI_CATEGORY") 120 | EMBCATEGORYID = os.environ.get("EMBCATEGORYID", "KM_OAI_EMB_CATEGORY") 121 | COSMOS_DB_NAME = 
os.environ.get("COSMOS_DB_NAME", "KM_OAI_DB") 122 | KB_BLOB_CONTAINER = os.environ.get("KB_BLOB_CONTAINER", "kmoaidemo") 123 | OUTPUT_BLOB_CONTAINER = os.environ.get("OUTPUT_BLOB_CONTAINER", "kmoaiprocessed") 124 | CHOSEN_QUERY_EMB_MODEL = os.environ.get("CHOSEN_QUERY_EMB_MODEL", "text-embedding-ada-002") 125 | ADA_002_EMBED_NUM_DIMS = int(os.environ.get("ADA_002_EMBED_NUM_DIMS", "1536")) 126 | ADA_002_MODEL_MAX_TOKENS = int(os.environ.get("ADA_002_MODEL_MAX_TOKENS", "4095")) 127 | ADA_002_EMBEDDING_MODEL = os.environ.get("ADA_002_EMBEDDING_MODEL", "text-embedding-ada-002") 128 | ADA_EMBEDDING_ENCODING = os.environ.get("ADA_EMBEDDING_ENCODING", "cl100k_base") 129 | DAVINCI_003_EMBED_NUM_DIMS = int(os.environ.get("DAVINCI_003_EMBED_NUM_DIMS", "12288")) 130 | DAVINCI_003_MODEL_MAX_TOKENS = int(os.environ.get("DAVINCI_003_MODEL_MAX_TOKENS", "4000")) 131 | DAVINCI_003_EMB_MAX_TOKENS = int(os.environ.get("DAVINCI_003_EMB_MAX_TOKENS", "2047")) 132 | DAVINCI_003_COMPLETIONS_MODEL = os.environ.get("DAVINCI_003_COMPLETIONS_MODEL", "text-davinci-003") 133 | DAVINCI_003_EMBEDDING_MODEL = os.environ.get("DAVINCI_003_EMBEDDING_MODEL", "text-search-davinci-doc-001") 134 | DAVINCI_003_QUERY_EMB_MODEL = os.environ.get("DAVINCI_003_QUERY_EMB_MODEL", "text-search-davinci-query-001") 135 | DAVINCI_EMBEDDING_ENCODING = os.environ.get("DAVINCI_EMBEDDING_ENCODING", "p50k_base") 136 | GPT35_TURBO_COMPLETIONS_MODEL = os.environ.get("GPT35_TURBO_COMPLETIONS_MODEL", "gpt-35-turbo") 137 | GPT35_TURBO_COMPLETIONS_MAX_TOKENS = int(os.environ.get("GPT35_TURBO_COMPLETIONS_MAX_TOKENS", "8193")) 138 | GPT35_TURBO_COMPLETIONS_ENCODING = os.environ.get("GPT35_TURBO_COMPLETIONS_ENCODING", "cl100k_base") 139 | FR_CONTAINER = os.environ.get("FR_CONTAINER", "kmoaiforms") 140 | RESTRICTIVE_PROMPT = os.environ.get("RESTRICTIVE_PROMPT", "no") 141 | TEMPERATURE = float(os.environ.get("TEMPERATURE", "0.3")) 142 | GPT4_COMPLETIONS_MODEL_MAX_TOKENS = int(os.environ.get("GPT4_COMPLETIONS_MODEL_MAX_TOKENS", "8192")) 143 | GPT4_32K_COMPLETIONS_MODEL_MAX_TOKENS = int(os.environ.get("GPT4_32K_COMPLETIONS_MODEL_MAX_TOKENS", "32768")) 144 | GPT35_TURBO_16K_COMPLETIONS_MAX_TOKENS = int(os.environ.get("GPT35_TURBO_16K_COMPLETIONS_MAX_TOKENS", "16384")) 145 | GPT4_MODEL = os.environ.get("GPT4_MODEL", "gpt-4") 146 | GPT4_32K_MODEL = os.environ.get("GPT4_32K_MODEL", "gpt-4-32k") 147 | CV_API_VERSION = os.environ.get("CV_API_VERSION", "2023-02-01-preview") 148 | 149 | 150 | -------------------------------------------------------------------------------- /utils/fr_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import logging 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from azure.core.credentials import AzureKeyCredential 9 | from azure.ai.formrecognizer import DocumentAnalysisClient 10 | 11 | 12 | from tenacity import ( 13 | retry, 14 | stop_after_attempt, 15 | wait_random_exponential, 16 | ) 17 | 18 | 19 | 20 | from utils import storage 21 | 22 | from utils.env_vars import * 23 | 24 | 25 | document_analysis_client = DocumentAnalysisClient(COG_SERV_ENDPOINT, AzureKeyCredential(COG_SERV_KEY)) 26 | 27 | 28 | 29 | def process_forms(in_container = FR_CONTAINER, out_container = OUTPUT_BLOB_CONTAINER): 30 | blob_list = storage.list_documents(in_container) 31 | 32 | for b in blob_list: 33 | url = storage.create_sas(b) 34 | result = fr_analyze_doc(url) 35 | 36 | new_json = { 37 | 'text': result, 38 | 'doc_url': b, 39 | 'container': 
in_container, 40 | 'filename': storage.get_filename(b), 41 | 'web_url': '' 42 | } 43 | 44 | storage.save_json_document(new_json, container = out_container ) 45 | 46 | 47 | 48 | 49 | 50 | def fr_analyze_doc(url): 51 | 52 | poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-document", url) 53 | result = poller.result() 54 | 55 | contents = '' 56 | 57 | for paragraph in result.paragraphs: 58 | contents += paragraph.content + '\n' 59 | 60 | 61 | for kv_pair in result.key_value_pairs: 62 | key = kv_pair.key.content if kv_pair.key else '' 63 | value = kv_pair.value.content if kv_pair.value else '' 64 | kv_pairs_str = f"{key} : {value}" 65 | contents += kv_pairs_str + '\n' 66 | 67 | for table_idx, table in enumerate(result.tables): 68 | row = 0 69 | row_str = '' 70 | row_str_arr = [] 71 | 72 | for cell in table.cells: 73 | if cell.row_index == row: 74 | row_str += ' | ' + str(cell.content) 75 | else: 76 | row_str_arr.append(row_str) 77 | row_str = '' 78 | row = cell.row_index 79 | row_str += ' | ' + str(cell.content) 80 | 81 | row_str_arr.append(row_str) 82 | contents += '\n'.join(row_str_arr) +'\n' 83 | 84 | return contents 85 | 86 | 87 | 88 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(10)) 89 | def fr_analyze_local_doc_with_dfs(path, verbose = True): 90 | 91 | with open(path, "rb") as f: 92 | poller = document_analysis_client.begin_analyze_document("prebuilt-document", document=f) 93 | 94 | result = poller.result() 95 | 96 | contents = '' 97 | kv_contents = '' 98 | t_contents = '' 99 | 100 | for kv_pair in result.key_value_pairs: 101 | key = kv_pair.key.content if kv_pair.key else '' 102 | value = kv_pair.value.content if kv_pair.value else '' 103 | kv_pairs_str = f"{key} : {value}" 104 | kv_contents += kv_pairs_str + '\n' 105 | 106 | for paragraph in result.paragraphs: 107 | contents += paragraph.content + '\n' 108 | 109 | 110 | for table_idx, table in enumerate(result.tables): 111 | row = 0 112 | row_str = '' 113 | row_str_arr = [] 114 | 115 | for cell in table.cells: 116 | if cell.row_index == row: 117 | row_str += ' \t ' + str(cell.content) 118 | else: 119 | row_str_arr.append(row_str ) 120 | row_str = '' 121 | row = cell.row_index 122 | row_str += ' \t ' + str(cell.content) 123 | 124 | row_str_arr.append(row_str ) 125 | t_contents += '\n'.join(row_str_arr) +'\n\n' 126 | 127 | dfs = [] 128 | 129 | # for idx, table in enumerate(result.tables): 130 | 131 | # field_list = [c['content'] for c in table.to_dict()['cells'] if c['kind'] == 'columnHeader'] 132 | # print('\n', field_list) 133 | 134 | # table_dict = table.to_dict() 135 | # row_count = table_dict['row_count'] 136 | # col_count = table_dict['column_count'] 137 | 138 | # cells = [c for c in table_dict['cells'] if c['kind'] == 'content'] 139 | # rows = [] 140 | # max_cols = 0 141 | 142 | # for i in range(row_count - 1): 143 | # row = [c['content'] for c in cells if c['row_index'] == i + 1] 144 | # # print(row, i) 145 | # if len(row) > 0: rows.append(row) 146 | # if len(row) > max_cols: max_cols = len(row) 147 | 148 | # if len(field_list) < max_cols: field_list += [''] * (max_cols - len(field_list)) 149 | # df = pd.DataFrame(rows, columns=field_list) 150 | # if verbose: display(df) 151 | # dfs.append(df) 152 | 153 | 154 | 155 | return contents, kv_contents, dfs, t_contents -------------------------------------------------------------------------------- /utils/helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 
| import re 4 | import numpy as np 5 | import tiktoken 6 | import json 7 | import logging 8 | from azure.storage.blob import BlobServiceClient, BlobClient 9 | from azure.storage.blob import ContainerClient, __version__ 10 | from azure.storage.blob import generate_blob_sas, BlobSasPermissions 11 | import copy 12 | from langchain.llms import AzureOpenAI 13 | from langchain.chat_models import ChatOpenAI 14 | from langchain.callbacks.base import CallbackManager 15 | 16 | from utils import language 17 | from utils import storage 18 | from utils import redis_helpers 19 | from utils import openai_helpers 20 | from utils.kb_doc import KB_Doc 21 | from utils import cosmos_helpers 22 | from utils.langchain_helpers import mod_agent 23 | 24 | from utils.env_vars import * 25 | 26 | 27 | def generate_embeddings(full_kbd_doc, embedding_model, max_emb_tokens, previous_max_tokens = 0, text_suffix = '', gen_emb=True): 28 | 29 | emb_documents = [] 30 | 31 | json_object = full_kbd_doc.get_dict() 32 | 33 | logging.info(f"Starting to generate embeddings with {embedding_model} and {max_emb_tokens} tokens") 34 | print(f"Starting to generate embeddings with {embedding_model} and {max_emb_tokens} tokens") 35 | 36 | try: 37 | if isinstance(json_object['timestamp'], list): 38 | json_object['timestamp'] = json_object['timestamp'][0] 39 | elif isinstance(json_object['timestamp'], str): 40 | json_object['timestamp'] = json_object['timestamp'] 41 | else: 42 | json_object['timestamp'] = "1/1/1970 00:00:00 AM" 43 | except: 44 | json_object['timestamp'] = "1/1/1970 00:00:00 AM" 45 | 46 | 47 | 48 | #### FOR DEMO PURPOSES ONLY -- OF COURSE NOT SECURE 49 | access = 'public' 50 | 51 | if (json_object['filename'] is None) or (json_object['filename'] == '') or (json_object['filename'] == 'null'): 52 | filename = storage.get_filename(json_object['doc_url']) 53 | else: 54 | filename = json_object['filename'] 55 | 56 | if filename.startswith('PRIVATE_'): 57 | access = 'private' 58 | #### FOR DEMO PURPOSES ONLY -- OF COURSE NOT SECURE 59 | 60 | 61 | doc_id = json_object['id'] 62 | doc_text = json_object['text'] 63 | enc = openai_helpers.get_encoder(embedding_model) 64 | tokens = enc.encode(doc_text) 65 | lang = language.detect_content_language(doc_text[:500]) 66 | is_doc = json_object.get('doc_url', False) # doc_url empty for scraped webpages. web_url used instead. 
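# ---------------------------------------------------------------------------
# Illustrative aside (not part of the original file): a minimal sketch of what
# the overlapping chunker `chunked_words` (defined further down in this module)
# does to the `tokens` list computed above. Assuming max_emb_tokens=800 and
# OVERLAP_TEXT=80, the loop below calls it with chunk_length=720, so each
# window advances by 720 tokens but extends 80 tokens into the next one:
#
#     fake_tokens = list(range(1500))
#     chunks = list(chunked_words(fake_tokens, chunk_length=720, overlap=80))
#     assert chunks[0] == fake_tokens[0:800]      # tokens 0..799
#     assert chunks[1] == fake_tokens[720:1500]   # tokens 720..1499 (tail-capped)
#     assert chunks[2] == fake_tokens[1440:1500]  # short final slice
#
# The 80-token overlap keeps sentence fragments at chunk borders searchable
# from both neighboring embeddings.
# ---------------------------------------------------------------------------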
67 | if is_doc: 68 | json_object['doc_url'] = storage.create_sas(json_object.get('doc_url', "https://microsoft.com")) 69 | else: 70 | json_object['doc_url'] = '' 71 | # json_object['filename'] = filename 72 | json_object['access'] = access 73 | json_object['orig_lang'] = lang 74 | 75 | 76 | print("Comparing lengths", len(tokens) , previous_max_tokens-OVERLAP_TEXT) 77 | 78 | if (len(tokens) < previous_max_tokens-OVERLAP_TEXT) and (previous_max_tokens > 0): 79 | print("Skipping generating embeddings as it is optional for this text") 80 | return emb_documents 81 | 82 | 83 | suff = 0 84 | for chunk in chunked_words(tokens, chunk_length=max_emb_tokens-OVERLAP_TEXT): 85 | decoded_chunk = enc.decode(chunk) 86 | 87 | translated_chunk = decoded_chunk 88 | if lang != 'en': 89 | translated_chunk = language.translate(decoded_chunk, lang) 90 | 91 | if gen_emb: 92 | embedding = openai_helpers.get_openai_embedding(translated_chunk, embedding_model) 93 | else: 94 | embedding = '' 95 | 96 | dd = copy.deepcopy(json_object) 97 | dd['id'] = f"{doc_id}_{text_suffix}_{suff}" 98 | dd['text_en'] = translated_chunk 99 | if lang != 'en': dd['text'] = decoded_chunk 100 | else: dd['text'] = '' 101 | dd[VECTOR_FIELD_IN_REDIS] = embedding 102 | 103 | chunk_kbd_doc = KB_Doc() 104 | chunk_kbd_doc.load(dd) 105 | 106 | emb_documents.append(chunk_kbd_doc.get_dict()) 107 | suff += 1 108 | 109 | if suff % 10 == 0: 110 | print (f'Processed: {suff} embeddings for document {filename}') 111 | logging.info (f'Processed: {suff} embeddings for document {filename}') 112 | 113 | 114 | print(f"This doc generated {suff} chunks") 115 | logging.info(f"This doc generated {suff} chunks") 116 | 117 | return emb_documents 118 | 119 | 120 | 121 | def generate_embeddings_from_json_docs(json_folder, embedding_model, max_emb_tokens, text_suffix='M', limit = -1): 122 | 123 | emb_documents = [] 124 | 125 | counter = 0 126 | for item in os.listdir(json_folder): 127 | if (limit != -1 ) and (counter >= limit): break 128 | path = os.path.join(json_folder, item) 129 | 130 | with open(path, 'r') as openfile: 131 | json_object = json.load(openfile) 132 | 133 | doc_embs = generate_embeddings(json_object, embedding_model, max_emb_tokens = max_emb_tokens, text_suffix = text_suffix) 134 | emb_documents += doc_embs 135 | counter += 1 136 | 137 | print(f"Now processing {path}, generated {len(doc_embs)} chunks") 138 | 139 | return emb_documents 140 | 141 | 142 | 143 | def save_object_to_pkl(object, filename): 144 | with open(filename, 'wb') as pickle_out: 145 | pickle.dump(object, pickle_out) 146 | 147 | 148 | def load_object_from_pkl(filename): 149 | with open(filename, 'rb') as pickle_in: 150 | object = pickle.load(pickle_in) 151 | 152 | return object 153 | 154 | 155 | def load_embedding_docs_in_redis(emb_documents, emb_filename = '', document_name = ''): 156 | 157 | if (emb_documents is None) and (emb_filename != ''): 158 | emb_documents = load_embedding_docs_from_pkl(emb_filename) 159 | 160 | redis_conn = redis_helpers.get_new_conn() 161 | 162 | print(f"Loading {len(emb_documents)} embeddings into Redis") 163 | logging.info(f"Loading {len(emb_documents)} embeddings into Redis") 164 | 165 | counter = 0 166 | loaded = 0 167 | 168 | for e in emb_documents: 169 | loaded += redis_helpers.redis_upsert_embedding(redis_conn, e) 170 | 171 | counter +=1 172 | if counter % 200 == 0: 173 | print (f'Processed: {counter} of {len(emb_documents)} for document {document_name}') 174 | logging.info (f'Processed: {counter} of {len(emb_documents)} for document {document_name}') 
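# ---------------------------------------------------------------------------
# Illustrative aside (not part of the original file): `loaded` accumulates the
# return value of redis_helpers.redis_upsert_embedding, which presumably
# yields 1 on success and 0 on failure, so this function reports how many
# chunks actually landed in Redis. A minimal end-to-end usage sketch, assuming
# a populated KB_Doc named `full_kbd_doc`:
#
#     emb_docs = generate_embeddings(full_kbd_doc,
#                                    embedding_model=CHOSEN_EMB_MODEL,
#                                    max_emb_tokens=MEDIUM_EMB_TOKEN_NUM,
#                                    text_suffix='M')
#     loaded = load_embedding_docs_in_redis(emb_docs, document_name='demo-doc')
#     print(f"{loaded} of {len(emb_docs)} chunks upserted into Redis")
# ---------------------------------------------------------------------------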
175 | 176 | print (f'Processed: {counter} of {len(emb_documents)} for document {document_name}') 177 | 178 | return loaded 179 | 180 | 181 | def chunked_words(tokens, chunk_length, overlap=OVERLAP_TEXT): 182 | num_slices = len(tokens) // chunk_length + (len(tokens) % chunk_length > 0) 183 | chunks_iterator = (tokens[i*chunk_length:(i+1)*chunk_length + overlap] for i in range(num_slices)) 184 | yield from chunks_iterator 185 | 186 | 187 | 188 | 189 | def push_summarizations(doc_text, completion_model, max_output_tokens): 190 | 191 | for chunk in chunked_words(tokens, chunk_length=max_summ_tokens): 192 | print("Chunking summarization", len(chunk)) 193 | d['summary'].append(openai_summarize(enc.decode(chunk), completion_model, max_output_tokens)) 194 | 195 | summary = '\n'.join(d['summary']) 196 | logging.info(f"Summary {summary}") 197 | print(f"Summary {summary}") 198 | 199 | push_embeddings(summary, enc.encode(summary), lang, timestamp, doc_id, doc_url, text_suffix = 'summ') 200 | 201 | 202 | 203 | re_strs = [ 204 | "customXml\/[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*", 205 | "ppt\/[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*", 206 | "\.MsftOfcThm_[-a-zA-Z0-9+&@#\/%=~_|$?!:,.]*[\r\n\t\f\v ]\{[\r\n\t\f\v ].*[\r\n\t\f\v ]\}", 207 | "SlidePowerPoint", 208 | "PresentationPowerPoint", 209 | '[a-zA-Z0-9]*\.(?:gif|emf)' 210 | ] 211 | 212 | 213 | 214 | def redis_search(query: str, filter_param: str): 215 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): 216 | return ["Sorry, I couldn't find any information related to the question."] 217 | 218 | 219 | redis_conn = redis_helpers.get_new_conn() 220 | completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 221 | embedding_enc = openai_helpers.get_encoder(CHOSEN_EMB_MODEL) 222 | 223 | query = embedding_enc.decode(embedding_enc.encode(query)[:MAX_QUERY_TOKENS]) 224 | 225 | query_embedding = openai_helpers.get_openai_embedding(query, CHOSEN_EMB_MODEL) 226 | results = redis_helpers.redis_query_embedding_index(redis_conn, query_embedding, -1, topK=NUM_TOP_MATCHES, filter_param=filter_param) 227 | 228 | if len(results) == 0: 229 | logging.warning("No embeddings found in Redis, attempting to load embeddings from Cosmos") 230 | cosmos_helpers.cosmos_restore_embeddings() 231 | results = redis_helpers.redis_query_embedding_index(redis_conn, query_embedding, -1, topK=NUM_TOP_MATCHES, filter_param=filter_param) 232 | 233 | return process_search_results(results) 234 | 235 | 236 | def process_search_results(results): 237 | completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 238 | 239 | if len(results) == 0: 240 | return ["Sorry, I couldn't find any information related to the question."] 241 | 242 | context = [] 243 | 244 | for t in results: 245 | t['text_en'] = t['text_en'].replace('\r', ' ').replace('\n', ' ') 246 | 247 | try: 248 | if ('web_url' in t.keys()) and (t['web_url'] is not None) and (t['web_url'] != ''): 249 | context.append('\n\n' + f"[{t['web_url']}] " + t['text_en'] + '\n\n') 250 | else: 251 | context.append('\n\n' + f"[{t['container']}/{t['filename']}] " + t['text_en'] + '\n\n') 252 | except Exception as e: 253 | print("------------------- Exception in process_search_results: ", e) 254 | context.append('\n\n' + t['text_en'] + '\n\n') 255 | 256 | 257 | for i in range(len(context)): 258 | for re_str in re_strs: 259 | matches = re.findall(re_str, context[i], re.DOTALL) 260 | for m in matches: context[i] = context[i].replace(m, '') 261 | 262 | final_context = [] 263 | total_tokens = 0 264 | 265 | for i in range(len(context)): 266 | total_tokens += 
len(completion_enc.encode(context[i])) 267 | # print(total_tokens) 268 | if (total_tokens < MAX_SEARCH_TOKENS) and (len(final_context) < NUM_TOP_MATCHES): 269 | final_context.append(context[i]) 270 | else: 271 | break 272 | 273 | return final_context 274 | 275 | 276 | def redis_lookup(query: str, filter_param: str): 277 | redis_conn = redis_helpers.get_new_conn() 278 | completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 279 | 280 | embedding_enc = openai_helpers.get_encoder(CHOSEN_EMB_MODEL) 281 | query = embedding_enc.decode(embedding_enc.encode(query)[:MAX_QUERY_TOKENS]) 282 | 283 | query_embedding = openai_helpers.get_openai_embedding(query, CHOSEN_EMB_MODEL) 284 | results = redis_helpers.redis_query_embedding_index(redis_conn, query_embedding, -1, topK=1, filter_param=filter_param) 285 | 286 | if len(results) == 0: 287 | logging.warning("No embeddings found in Redis, attempting to load embeddings from Cosmos") 288 | cosmos_helpers.cosmos_restore_embeddings() 289 | results = redis_helpers.redis_query_embedding_index(redis_conn, query_embedding, -1, topK=NUM_TOP_MATCHES, filter_param=filter_param) 290 | 291 | context = ' \n'.join([f"[{t['container']}/{t['filename']}] " + t['text_en'].replace('\n', ' ') for t in results]) 292 | 293 | for re_str in re_strs: 294 | matches = re.findall(re_str, context, re.DOTALL) 295 | for m in matches: context = context.replace(m, '') 296 | 297 | context = completion_enc.decode(completion_enc.encode(context)[:MAX_SEARCH_TOKENS]) 298 | return context 299 | 300 | 301 | 302 | 303 | 304 | def get_llm(model = CHOSEN_COMP_MODEL, temperature=0.3, max_output_tokens=MAX_OUTPUT_TOKENS, stream=False, callbacks=[]): 305 | gen = openai_helpers.get_generation(model) 306 | 307 | if (gen == 3) : 308 | llm = AzureOpenAI(deployment_name=model, model_name=model, temperature=temperature, 309 | openai_api_key=openai.api_key, max_retries=30, 310 | request_timeout=120, streaming=stream, 311 | callback_manager=CallbackManager(callbacks), 312 | max_tokens=max_output_tokens, verbose = True) 313 | 314 | elif (gen == 4) or (gen == 3.5): 315 | llm = ChatOpenAI(model_name=model, model=model, engine=model, 316 | temperature=0.3, openai_api_key=openai.api_key, max_retries=30, streaming=stream, 317 | callback_manager=CallbackManager(callbacks), 318 | request_timeout=120, max_tokens=max_output_tokens, verbose = True) 319 | else: 320 | assert False, f"Generation unknown for model {model}" 321 | 322 | return llm -------------------------------------------------------------------------------- /utils/http_helpers.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | from tenacity import ( 5 | retry, 6 | stop_after_attempt, 7 | wait_random_exponential, 8 | ) 9 | 10 | 11 | # """ 12 | # api_key = 'YOUR_API_KEY' 13 | # search_service_name = 'your-search-service-name' 14 | # index_name = 'your-index-name' 15 | # api_version = 'your-api-version' 16 | 17 | # request = HTTPRequest(api_key, search_service_name, index_name, api_version) 18 | # headers = {'Authorization': 'Bearer YOUR_ACCESS_TOKEN'} 19 | # body = {'key': 'value'} 20 | 21 | # response_put = request.put(headers=headers, body=body) 22 | # response_post = request.post(headers=headers, body=body) 23 | # response_get = request.get(headers=headers) 24 | # response_delete = request.delete(headers=headers) 25 | 26 | # print(response_put) 27 | # print(response_post) 28 | # print(response_get) 29 | # print(response_delete) 30 | # """ 31 | 32 | 33 | class 
33 | class HTTPError(Exception): 34 | def __init__(self, status_code, message): 35 | self.status_code = status_code 36 | self.message = message 37 | super().__init__(f"HTTP Error {status_code}: {message}") 38 | 39 | 40 | 41 | class HTTPRequest: 42 | def __init__(self, url = '', api_key = ''): 43 | self.url = url 44 | self.api_key = api_key 45 | self.default_headers = {'Content-Type': 'application/json', 'api-key': self.api_key} 46 | 47 | 48 | def initialize_for_cogsearch(self, api_key, search_service_name, index_name, api_version): 49 | self.api_key = api_key 50 | self.search_service_name = search_service_name 51 | self.index_name = index_name 52 | self.api_version = api_version 53 | self.url = f"{search_service_name}/indexes/{index_name}?api-version={api_version}" 54 | self.post_url = f"{search_service_name}/indexes/{index_name}/docs/index?api-version={api_version}" 55 | self.search_url = f"{search_service_name}/indexes/{index_name}/docs/search?api-version={self.api_version}" 56 | 57 | self.default_headers = {'Content-Type': 'application/json', 'api-key': self.api_key} 58 | 59 | 60 | def handle_response(self, response): 61 | try: 62 | response_data = json.loads(response.text) 63 | except json.JSONDecodeError: 64 | response_data = response.text 65 | 66 | if response.status_code >= 400: 67 | raise HTTPError(response.status_code, response_data) 68 | 69 | return response_data 70 | 71 | 72 | def get_url(self, op = None): 73 | return self.url 74 | 75 | 76 | @retry(wait=wait_random_exponential(min=1, max=4), stop=stop_after_attempt(4)) 77 | def put(self, op = None, headers=None, body=None): 78 | 79 | url = self.get_url(op) 80 | 81 | if headers is None: 82 | headers = self.default_headers 83 | else: 84 | headers = {**self.default_headers, **headers} 85 | 86 | if body is None: 87 | body = {} 88 | 89 | response = requests.put(url, json=body, headers=headers) 90 | return self.handle_response(response) 91 | 92 | 93 | @retry(wait=wait_random_exponential(min=1, max=4), stop=stop_after_attempt(4)) 94 | def post(self, op = None, headers=None, body=None, data=None): 95 | 96 | url = self.get_url(op) 97 | 98 | if headers is None: 99 | headers = self.default_headers 100 | else: 101 | headers = {**self.default_headers, **headers} 102 | 103 | # do not default body to {} here: doing so made the 'elif body is not None' branch below always true and left the bare-POST fallback unreachable 104 | 105 | 106 | if data is not None: 107 | response = requests.post(url, data=data, headers=headers) 108 | elif body is not None: 109 | response = requests.post(url, json=body, headers=headers) 110 | else: 111 | response = requests.post(url, headers=headers) 112 | 113 | return self.handle_response(response) 114 | 115 | 116 | @retry(wait=wait_random_exponential(min=1, max=4), stop=stop_after_attempt(2)) 117 | def get(self, op = None, headers=None, params=None): 118 | 119 | url = self.get_url(op) 120 | 121 | if headers is None: 122 | headers = self.default_headers 123 | else: 124 | headers = {**self.default_headers, **headers} 125 | 126 | if params is None: 127 | params = {} 128 | 129 | response = requests.get(url, headers=headers, params=params) 130 | return self.handle_response(response) 131 | 132 | 133 | @retry(wait=wait_random_exponential(min=1, max=4), stop=stop_after_attempt(4)) 134 | def delete(self, op = None, id = None, headers=None): 135 | 136 | url = self.get_url(op)  # NOTE: the id parameter is currently unused; the request targets whatever URL get_url(op) resolves to 137 | 138 | if headers is None: 139 | headers = self.default_headers 140 | else: 141 | headers = {**self.default_headers, **headers} 142 | 143 | response = requests.delete(url, headers=headers) 144 | return self.handle_response(response) 145 | 146 | 147 | 148 | 149 | 150 | class 
CogSearchHttpRequest(HTTPRequest): 151 | 152 | def __init__(self, api_key, search_service_name, index_name, api_version): 153 | self.api_key = api_key 154 | self.search_service_name = search_service_name 155 | self.index_name = index_name 156 | self.api_version = api_version 157 | self.url = f"{search_service_name}/indexes/{index_name}?api-version={api_version}" 158 | self.post_url = f"{search_service_name}/indexes/{index_name}/docs/index?api-version={api_version}" 159 | self.search_url = f"{search_service_name}/indexes/{index_name}/docs/search?api-version={self.api_version}" 160 | 161 | self.default_headers = {'Content-Type': 'application/json', 'api-key': self.api_key} 162 | 163 | 164 | def get_url(self, op = None): 165 | if op == 'index': 166 | url = self.post_url 167 | elif op == 'search': 168 | url = self.search_url 169 | else: 170 | url = self.url 171 | 172 | return url 173 | 174 | 175 | 176 | class CVHttpRequest(HTTPRequest): 177 | 178 | def __init__(self, api_key, cog_serv_name, api_version, 179 | options = ['tags', 'objects', 'caption', 'read', 'smartCrops', 'denseCaptions', 'people']): 180 | 181 | self.api_key = api_key 182 | 183 | if cog_serv_name.endswith('/'): 184 | cog_serv_name = cog_serv_name[:-1] 185 | 186 | self.cog_serv_name = cog_serv_name 187 | self.api_version = api_version 188 | 189 | options = ','.join(options).replace(' ', '') if isinstance(options, list) else options 190 | self.url = f"{cog_serv_name}/computervision/imageanalysis:analyze?api-version={api_version}&modelVersion=latest&features={options}" 191 | self.imgvec_url = f"{cog_serv_name}/computervision/retrieval:vectorizeImage?api-version={api_version}&modelVersion=latest" 192 | self.txtvec_url = f"{cog_serv_name}/computervision/retrieval:vectorizeText?api-version={api_version}&modelVersion=latest" 193 | 194 | self.default_headers = {'Content-type': 'application/octet-stream','Ocp-Apim-Subscription-Key': self.api_key} 195 | self.json_headers = {'Content-type': 'application/json','Ocp-Apim-Subscription-Key': self.api_key} 196 | 197 | 198 | def get_url(self, op = None): 199 | if op == 'analyze': 200 | url = self.url 201 | elif op == 'img_embedding': 202 | url = self.imgvec_url 203 | elif op == 'text_embedding': 204 | url = self.txtvec_url 205 | else: 206 | url = self.url 207 | 208 | return url 209 | 210 | 211 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /utils/kb_doc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from datetime import datetime, timedelta 4 | 5 | from utils.env_vars import * 6 | 7 | class KB_Doc(): 8 | 9 | def __init__(self): 10 | 11 | self.id = '' 12 | self.text_en = '' 13 | self.text = '' 14 | self.doc_url = '' 15 | self.timestamp = datetime.now().strftime("%m/%d/%Y, %H:%M:%S") 16 | self.item_vector = [] 17 | self.orig_lang = 'en' 18 | self.access = 'public' 19 | self.client = KB_INDEX_NAME 20 | self.container = KB_BLOB_CONTAINER 21 | self.filename = '' 22 | self.web_url = '' 23 | self.contentType = '' 24 | 25 | 26 | if PROCESS_IMAGES == 1: 27 | self.cv_image_vector = [0.] * 1024 28 | self.cv_text_vector = [0.] 
* 1024 29 | 30 | 31 | def load(self, data): 32 | for k in data: 33 | setattr(self, k, data[k]) 34 | 35 | 36 | def get_fields(self): 37 | return self.__dict__.keys() 38 | 39 | 40 | def get_dict(self): 41 | return self.__dict__ -------------------------------------------------------------------------------- /utils/langchain_helpers/mod_ccr_prompt.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Assistant is constantly learning and improving, and its capabilities are constantly evolving. It is able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. Additionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide explanations and descriptions on a wide range of topics. 4 | 5 | 6 | PREFIX = """Assistant is a large language model trained by OpenAI and is super factual and detail-oriented. The assistant must look for answers within the provided tools responses, and if the answer is not in the tools responses or the context, then the assistant must answer by "Sorry, I do not know the answer". 7 | 8 | Assistant is designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. As a language model, Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. 9 | 10 | Overall, Assistant is a powerful system that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist. 11 | 12 | Final answers should be concise and to the point. If presented with lots of information, the assistant should try to summarize and give a concise answer. 13 | 14 | Observations have sources, the assistant MUST include the source name in the final answer. If there are multiple sources, the assistant MUST cite each one in their own square brackets. For example, the assistant must use \"[folder3/info343][http://website]\" and not \"[folder3/info343,http://website]\". The source name can either be in the format of "folder/file" or it can be an internet URL like "https://website". 15 | 16 | THE ASSISTANT MUST STRICTLY USE THE COLLECTED EVIDENCE FROM THE USER INPUT OR TOOLS' RESPONSES, THE ASSISTANT MUST NOT ANSWER FROM MEMORY AND MUST NOT MAKE UP ANSWERS. 17 | 18 | """ 19 | 20 | 21 | FORMAT_INSTRUCTIONS = """RESPONSE FORMAT INSTRUCTIONS 22 | ---------------------------- 23 | 24 | When responding to me, please output a response in one of two formats: 25 | 26 | **Option 1:** 27 | Use this if you want the human to use a tool. 28 | Markdown code snippet formatted in the following schema. The assistant must strictly follow the following format, with no additional or other text in the response outside the json block: 29 | 30 | ```json 31 | {{{{ 32 | "action": "string" \\ The action to take. Must be one of {tool_names} 33 | "action_input": "string" \\ The input to the action 34 | }}}} 35 | ``` 36 | 37 | **Option 2:** 38 | Use this if you want to respond directly to the human. Markdown code snippet formatted in the following schema. 
The assistant must strictly follow the following format, with no additional or other text in the response outside the json block: 39 | 40 | ```json 41 | {{{{ 42 | "action": "Final Answer", 43 | "action_input": "[source name 1][source name 2] string" \\ You should put what you want to return to use here 44 | }}}} 45 | ```""" 46 | 47 | SUFFIX = """TOOLS 48 | ------ 49 | Assistant must ask the user to use tools to look up information that may be helpful in answering the users original question. The tools the human can use are: 50 | 51 | {{tools}} 52 | 53 | {format_instructions} 54 | 55 | USER'S INPUT 56 | -------------------- 57 | Here is the user's input (remember to respond with a markdown code snippet of a json blob with a single action, and NOTHING else): 58 | 59 | {{{{input}}}}""" 60 | 61 | 62 | 63 | TEMPLATE_TOOL_RESPONSE = """TOOL RESPONSE: 64 | --------------------- 65 | [source name 1][source name 2] {observation} 66 | 67 | USER'S INPUT 68 | -------------------- 69 | 70 | Okay, so what is the response to my last comment? If using information obtained from the tools you must mention it explicitly without mentioning the tool names - I have forgotten all TOOL RESPONSES! Remember to respond with a markdown code snippet of a json blob with a single action, and NOTHING else. The assistant MUST NOT MENTION THE TOOL NAME, the final answer to the original input question MUST BE detailed and specific but concise, human-friendly and easy to read. (do NOT use the tool names in the final answer, and do not use machine jargon). Make sure that you send the correct source as a reference, if the source is already included in the history, make sure to include it again in the Final Answer.""" 71 | -------------------------------------------------------------------------------- /utils/langchain_helpers/mod_react_prompt.py: -------------------------------------------------------------------------------- 1 | from langchain.prompts import PromptTemplate, BasePromptTemplate 2 | 3 | 4 | mod_react_prefix = """Answer the following questions as best you can. You have access to only the following tools:""" 5 | 6 | # If after using 2 tools and the assistant has a partial final answer, then the assistant must formulate a final answer, and then add to it "I'm not sure if this is the answer you are looking for, but here is what I found." and then the assistant MUST stop searching. 7 | # YOU MUST STRICTLY USE THE COLLECTED EVIDENCE FROM THE OBSERVATIONS, FROM THE INITIAL CONTEXT OR FROM PREVIOUS CONVERSATION, DO NOT ANSWER FROM MEMORY. 8 | 9 | 10 | mod_react_format_instructions = """The assistant can use ONLY the listed tools. The assistant MUST NOT make up tool names. 11 | 12 | After each time the assistant uses a tool, the assistant shall thoroughly inspect the tool results in the Observation and formulate a final answer if the results have enough information. If the assistant has a final answer, then the assistant MUST stop using the tools. 13 | If the assistant does not have the information needed to formulate an answer, the assistant MUST continue using the tools with different action inputs for a maximum total of 3 tool uses. If after using the first tool, the assistant has enough information needed to formulate an answer, the assistant MUST STOP using the tools and return a final answer to the user. If the assistant decides to continue using the tools, then the assistant MUST change the Action Input with every tool. 
If there are lots of facts or pieces of information, the assistant MUST try its best to summarize the information in the final answer, and must stop using the tools. 14 | 15 | The assistant MUST NOT use the tools more than 3 times. 16 | The assistant MUST NOT use the tools if the assistant has a final answer. 17 | The assistant MUST NOT use the same tool twice or more with the exact same input. 18 | 19 | Observations have sources, the assistant MUST include the source name in the final answer. If there are multiple sources, the assistant MUST cite each one in their own square brackets. For example, the assistant must use \"[folder3/info343][http://wikipedia.com]\" and not \"[folder3/info343,http://wikipedia.com]\". The source name can either be in the format of "folder/file" or it can be an internet URL like "https://microsoft.com". 20 | 21 | THE ASSISTANT MUST STRICTLY USE THE COLLECTED EVIDENCE FROM THE OBSERVATIONS, FROM THE USER'S INPUT, INITIAL CONTEXT OR FROM PREVIOUS CONVERSATION, THE ASSISTANT MUST NOT ANSWER FROM MEMORY. If the assistant is sure about a fact that is not explicitly stated in the knowledge base (such as knowing which country a city is located in), the assistant is permitted to use that fact from memory in the final answer but needs to state this explicitly in the Final Answer. However, the assistant MUST NOT make up facts. 22 | 23 | It is critically important that the assistant MUST NOT mention the tool names in the Final Answer. 24 | 25 | If the Conversation History or Initial Context are not related to the question, then the assistant MUST ignore them. ALWAYS remember that the assistant MUST synthesize a Final Answer out of all the information collected for the user's benefit. If there are several pieces of information in the final answer, the assistant can choose to answer in bullet point format. The assistant MUST be detailed and specific but concise when giving a final answer, with facts that are RELEVANT ONLY to the question. 26 | 27 | It is critically important that the assistant USE the following format STRICTLY, the assistant's answer MUST be in the below format. The assistant MUST either generate a thought with an action and action input, or a thought with a final answer: 28 | 29 | #FORMAT# 30 | Question: the input question you must answer 31 | Thought: you should always think about what to do. First, identify in the previous observations any facts or information that can help in answering the above Question, and make sure to explicitly output them in the current Thought. If the question needs multiple tools, break it down into multiple action inputs for multiple tools. Decide on the most relevant tool for the next step. 32 | Action: the action to take, should be one of [{tool_names}] 33 | Action Input: the input to the action 34 | Observation: [folder1/file1] the result of the action.\n[http://wikipedia.com] second result of the action\n[website.com] third result of the action 35 | ... (this Thought/Action/Action Input/Observation can repeat up to 3 times with different action inputs each time) 36 | Thought: After carefully analyzing the previous Observations, I now know the final answer. Formulate a final answer from all previous thoughts and observations, and write down an elaborate, detailed, and specific answer, which is directly relevant to the question. 37 | Final Answer: [folder1/file1][http://wikipedia.com][website.com] the final answer to the original input question that is human-friendly and easy to read. 
(do NOT use the tool names in the final answer, and do not use machine jargon) 38 | #FORMAT# 39 | 40 | Follow the above Format strictly, and make sure to follow the instructions in each step (Thought/Action/Action Input/Observation). DO NOT USE THE TOOL NAMES IN THE ANSWER. 41 | 42 | """ 43 | 44 | # Identify explicitly any information inside those observations that can help in answering the above question. 45 | 46 | 47 | mod_react_suffix = """Begin! 48 | Conversation History: {history} 49 | 50 | Question: {input} 51 | 52 | 53 | Thought:{agent_scratchpad}""" 54 | 55 | # Initial Context:{pre_context} 56 | 57 | 58 | 59 | 60 | mod_evaluate_instructions = """<|im_start|> 61 | The assistant is a super helpful assistant that plays the role of detective and has ultra high attention to details. The assistant must go through the below context paragraph by paragraph and try to find relevant information to the user's question. The current time and date will be provided for the assistant in the Context. The assistant can use the current date and time to derive the day and date for any time-related questions, such as this afternoon, this evening, today, tomorrow, this weekend or next week. 62 | <|im_end|> 63 | <|im_start|>user 64 | 65 | Instruction: Identify in the above facts or information that can help in answering the following question: "##{history}\nHuman: {question}##" and list them in bullet point format. Be elaborate, detailed and specific when identifying facts or information. Do NOT be concise so as not to miss critical information. 66 | YOU MUST STRICTLY USE THE CONTEXT TO IDENTIFY FACTS OR INFORMATION, DO NOT ANSWER FROM MEMORY. 67 | Facts have sources, you MUST include the source name in the EACH bullet point at the beginning before any text. If there are multiple sources, cite each one in their own square brackets. For example, use \"[folder3/info343][http://wikipedia.com]\" and not \"[folder3/info343,http://wikipedia.com]\". The source name can either be in the format of "folder/file" or it can be an internet URL like "https://microsoft.com". 68 | 69 | Context: 70 | - [https://www.timeanddate.com] {todays_time} 71 | 72 | {context} 73 | 74 | 75 | Use the following format: 76 | - [folder1/file1] the first fact or information (elaborate, detailed, and specific) 77 | - [http://website.com] the second fact or information (elaborate, detailed, and specific) 78 | - [http://wikipedia.com] the third fact or information (elaborate, detailed, and specific) 79 | - [folder3/file3] the fourth fact or information (elaborate, detailed, and specific) 80 | - [http://microsoft.com] the fifth fact or information (elaborate, detailed, and specific) 81 | - [folder4/file4] the sixth fact or information (elaborate, detailed, and specific) 82 | - [http://outlook.com] the seventh fact or information (elaborate, detailed, and specific) 83 | - [https://linkedin.com] the eighth fact or information (elaborate, detailed, and specific) 84 | - (and so on ...) 85 | 86 | 87 | 88 | Begin: 89 | <|im_end|> 90 | <|im_start|>assistant 91 | """ 92 | 93 | 94 | mod_extract_intent_instructions = """<|im_start|> 95 | The assistant is a super helpful assistant that plays the role of a search engine expert and has ultra high attention to details. The assistant must go through the below question and think about the most important keywords to extract. Please extract the intent of the below question and the keywords in as few words as possible. Imagine extracting the intent as keywords to be the input to a search engine. 
DO NOT ANSWER THE QUESTION, EXTRACT ONLY THE INTENT. 96 | 97 | <|im_end|> 98 | <|im_start|>user 99 | 100 | The following are examples, and must be strictly used as output format: 101 | 102 | Question: what hotels are recommended in Las Vegas? 103 | Intent: knowledge base 104 | Keywords: recommend hotels Las Vegas 105 | 106 | Question: Hi 107 | Intent: chit chat 108 | Keywords: chit chat 109 | 110 | Question: Don't you want to know about me? 111 | Intent: chit chat 112 | Keywords: chit chat 113 | 114 | Question: Do you eat? 115 | Intent: chit chat 116 | Keywords: chit chat 117 | 118 | Question: I'm curious about your family 119 | Intent: chit chat 120 | Keywords: chit chat 121 | 122 | Question: Surprise me 123 | Intent: chit chat 124 | Keywords: chit chat 125 | 126 | Question: Who is Barack Obama? 127 | Intent: knowledge base 128 | Keywords: Identify Barack Obama 129 | 130 | Question: what is mentioned about the Volcano hotel? 131 | Intent: knowledge base 132 | Keywords: Volcano hotel, mentioned 133 | 134 | Question: how much are the one day pass tickets for Ferrari world? 135 | Intent: knowledge base 136 | Keywords: Ferrari world, one day pass, price 137 | 138 | Question: where is the Eiffel Tower? 139 | Intent: knowledge base 140 | Keywords: locate Eiffel Tower 141 | 142 | 143 | Use the below format strictly: 144 | 145 | Question: "{question}" 146 | 147 | <|im_end|> 148 | <|im_start|>assistant 149 | """ 150 | 151 | 152 | 153 | 154 | mod_chit_chat_instructions = """<|im_start|> 155 | The assistant is a super helpful assistant that plays the role of a chit chat buddy and is very talkative and friendly. The assistant must go through the below question and reply in a super friendly and talkative manner. The user wants to chit chat, so the assistant must indulge them. 156 | 157 | <|im_end|> 158 | <|im_start|>user 159 | 160 | 161 | Question: "{question}" 162 | 163 | <|im_end|> 164 | <|im_start|>assistant 165 | """ 166 | 167 | 168 | 169 | mod_qc_instructions = """<|im_start|> 170 | The assistant is a super helpful assistant that plays the role of a quality control engineer and has ultra high attention to details. The assistant must go through the below question and think whether the answer is an adequate response to the question. Inadequate answers appear to be incomplete as they mention that the assistant must try another action, or try a different tool. Inadequate answers also suggest the answer is not final and that the user must perform an extra action of checking pages in the reference source You MUST answer by "Yes" or "No" ONLY. No additional explanation is required. 171 | 172 | <|im_end|> 173 | <|im_start|>user 174 | 175 | The following are examples: 176 | 177 | Question: "what hotels are recommended in Las Vegas?" 178 | Answer: "The search results do not provide a clear answer to the question. I should try a different action input.\n\n 'Most luxurious hotels on the Las Vegas Strip'" 179 | Adequate: No 180 | 181 | Question: "what is mentioned about the Lost City hotel?" 182 | Answer: "The Lost City Hotel is a luxurious accommodation in Dubai with an onsite waterpark and aquarium." 183 | Adequate: Yes 184 | 185 | Question: "who is Barack Obama?" 186 | Answer: 'I need to be more specific with my input.\n\n "Barack Obama biography"' 187 | Adequate: No 188 | 189 | Question: "who is Barack Obama?" 190 | Answer: 'Unfortunately, none of the sources I searched provided any specific information about Barack Obama.' 
191 | Adequate: Yes 192 | 193 | Question: "how much are the one day pass tickets for Ferrari world?" 194 | Answer: "I'm sorry, I could not find the ticket prices for Ferrari World." 195 | Adequate: Yes 196 | 197 | Question: "what is the average salary in the USA?" 198 | Answer: 'Since Cognitive Search did not provide any relevant information, I should try a different tool.' 199 | Adequate: No 200 | 201 | Question: "who is Barack Obama?" 202 | Answer: 'The context paragraph does not provide any direct information about Barack Obama. However, a Cognitive Search for "Barack Obama presidency" may yield relevant information about his presidency.' 203 | Adequate: No 204 | 205 | Question: "who is Barack Obama?" 206 | Answer: 'I apologize, but I cannot find any relevant information about Barack Obama in the given context.' 207 | Adequate: Yes 208 | 209 | Question: "What is the best thing about Las Vegas?" 210 | Answer: 'I need to try a different tool.' 211 | Adequate: No 212 | 213 | Question: "what are the total annual leaves in days with full remuneration in Australia?" 214 | Answer: 'Since Redis Search did not provide any relevant information, I should try a different tool.' 215 | Adequate: No 216 | 217 | Question: "what hotels are recommended in Seattle?" 218 | Answer: "Our travel agency offers the following hotels in Seattle: The Cinnamon Hotel, The Creek Hotel, and The Bay Hotel." 219 | Adequate: Yes 220 | 221 | Question: "what hotels are recommended in Las Vegas?" 222 | Answer: "Margie’s Travel offers the following hotels in Las Vegas: The Volcano Hotel, The Fountain Hotel, The Canal Hotel. To book your trip to Las Vegas, visit www.margiestravel.com." 223 | Adequate: Yes 224 | 225 | Question: "what is mentioned about the Lost City hotel?" 226 | Answer: "The Lost City Hotel is a luxurious accommodation in Dubai, with an onsite waterpark and aquarium, offered by Margie's Travel. To book a trip to Dubai, visit www.margiestravel.com." 227 | Adequate: Yes 228 | 229 | Question: "what is mentioned about the Volcano hotel?" 230 | Answer: "The Volcano Hotel is a stylish casino hotel with live entertainment and an extensive pool area, located in the heart of The Strip. To book a trip to Las Vegas, visit www.margiestravel.com." 231 | Adequate: Yes 232 | 233 | Question: "what is the contact info of IPA?" 234 | Answer: "The contact information for IPA Qatar is provided on page 26 of the guide." 235 | Adequate: No 236 | 237 | 238 | 239 | Question: "{question}" 240 | Answer: "{answer}" 241 | Adequate: 242 | 243 | <|im_end|> 244 | <|im_start|>assistant 245 | """ 246 | -------------------------------------------------------------------------------- /utils/langchain_helpers/mod_wiki_prompt.py: -------------------------------------------------------------------------------- 1 | from langchain.prompts import PromptTemplate, BasePromptTemplate 2 | 3 | 4 | EXAMPLES = [ 5 | # """Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into? 6 | # Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area. 7 | # Action 1: Search[Colorado orogeny] 8 | # Observation 1: [info1.pdf] The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas. 9 | # Thought 2: It does not mention the eastern sector. So I need to look up eastern sector. 
10 | # Action 2: Lookup[eastern sector] 11 | # Observation 2: [info2.txt] (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny. 12 | # Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range. 13 | # Action 3: Search[High Plains] 14 | # Observation 3: [some_file.pdf] High Plains refers to one of two distinct land regions 15 | # Thought 4: I need to instead search High Plains (United States). 16 | # Action 4: Search[High Plains (United States)] 17 | # Observation 4: [filea.pdf] The High Plains are a subregion of the Great Plains. [another-ref.docx] From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130m). 18 | # Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft. 19 | # Action 5: Finish[1,800 to 7,000 ft [filea.pdf]]\n\n""", 20 | # """Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who? 21 | # Thought 1: The question simplifies to "The Simpsons" character Milhouse is named after who. I only need to search Milhouse and find who it is named after. 22 | # Action 1: Search[Milhouse] 23 | # Observation 1: [info7.pdf] Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening. 24 | # Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up "named after". 25 | # Action 2: Lookup[named after] 26 | # Observation 2: [historyref2.txt] (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. 27 | # Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon. 28 | # Action 3: Finish[(historyref2.txt) Richard Nixon\n\n]""", 29 | """Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture? 30 | Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups. 31 | Action 1: Search[Adam Clayton Powell] 32 | Observation 1: [info9.pdf] Could not find [Adam Clayton Powell]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. State Office 33 | Building', 'Isabel Washington Powell', 'Adam Powell', vAdam Clayton Powell (film)', 'Giancarlo Esposito']. 34 | Thought 2: To find the documentary, I can search Adam Clayton Powell (film). 35 | Action 2: Search[Adam Clayton Powell (film)] 36 | Observation 2: [data123.txt] Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg. The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience. 37 | Thought 3: Adam Clayton Powell (film) is a documentary about an African-American 38 | politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture. 39 | Action 3: Finish[[data123.txt] The Saimaa Gesture]""", 40 | """Question: What profession does Nicholas Ray and Elia Kazan have in common? 41 | Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common. 
42 | Action 1: Search[Nicholas Ray] 43 | Observation 1: [files-987.png] Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 - June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause. 44 | Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions. 45 | Action 2: Search[Elia Kazan] 46 | Observation 2: [files-654.txt] Elia Kazan was an American film and theatre director, producer, screenwriter and actor. 47 | Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor. 48 | Action 3: Finish[[files-987.png](files-654.txt] director, screenwriter, actor]""", 49 | """Question: Which magazine was started first Arthur's Magazine or First for Women? 50 | Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first. 51 | Action 1: Search[Arthur's Magazine] 52 | Observation 1: [magazines-1850.pdf] Arthur's Magazine (1844-1846) was an American literary periodical published in Philadelphia in the 19th century. 53 | Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next. 54 | Action 2: Search[First for Women] 55 | Observation 2: [magazines-1900.pdf] First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. 56 | Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first. 57 | Action 3: Finish[[magazines-1850.pdf)(magazines-1900.pdf) Arthur's Magazine]""", 58 | """Question: Were Pavel Urysohn and Leonid Levin known for the same type of work? 59 | Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same. 60 | Action 1: Search[Pavel Urysohn] 61 | Observation 1: [info4444.pdf] Pavel Samuilovich Urysohn (February 3, 1898 - August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory. 62 | Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work. 63 | Action 2: Search[Leonid Levin] 64 | Observation 2: [datapoints_aaa.txt] Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. 65 | Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. 66 | Action 3: Finish[[info4444.pdf] yes ]""", 67 | ] 68 | 69 | 70 | SUFFIX = """ 71 | 72 | Initial Context:{pre_context} 73 | 74 | Current Conversation: {history} 75 | 76 | Question: {input} 77 | 78 | <|im_end|> 79 | <|im_start|>assistant 80 | 81 | Begin: 82 | 83 | {agent_scratchpad} 84 | """ 85 | 86 | 87 | 88 | PREFIX = """<|im_start|>system 89 | The following is a friendly conversation between a human and an AI assistant. The AI assistant is talkative and provides lots of specific details from its context. You are an intelligent assistant helping our employees with their knowledge base questions. Answer questions as shown in the following examples, by splitting the question into individual search or lookup actions to find facts until you can answer the question. 90 | Observations are prefixed by their source name in square brackets, source names MUST be included with the actions in the answers. 
91 | All questions must be answered from the results of search or lookup actions, only facts resulting from those can be used in an answer. 92 | Answer questions as truthfully as possible, and ONLY answer the questions using the information from observations, do not speculate or answer from your own knowledge. 93 | If the question is not clear or further clarifications are needed, the AI assistant MUST use the search or lookup actions to get the context and information. The AI assistant MUST use one of the tools AT LEAST ONCE. 94 | At each Observation, the assistant shall ponder carefully whether it has the final answer or not. If the assistant does, then the assistant can stop searching and provide the final answer. If the assistant does not, then the assistant must continue searching until all search sources are exhausted. 95 | Do NOT answer based on your knowledge of Wikipedia. 96 | For example, if the question is \"What color is the sky?\" and one of the information sources says \"info123: the sky is blue whenever it's not cloudy\", then answer with \"The sky is blue [info123]\" 97 | It's important to strictly follow the format where the name of the source is in brackets at the end of the sentence, and only up to the prefix before the colon [\":\"]. 98 | If there are multiple sources, cite each one in their own square brackets. For example, use \"[info343][ref-76]\" and not \"[info343,ref-76]\". 99 | Never quote tool names as sources. 100 | 101 | <|im_end|> 102 | <|im_start|>user 103 | 104 | """ 105 | 106 | 107 | 108 | mod_wiki_prompt = PromptTemplate.from_examples( 109 | EXAMPLES, SUFFIX, ["input", "agent_scratchpad", "history", "pre_context"], '\n', PREFIX 110 | ) 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | # PREFIX = """<|im_start|>system 124 | # The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. Observations are prefixed by their reference name in square brackets, reference names MUST be included with the actions in the answers. 125 | # Answer the following questions as best you can. You have access to the following tools: 126 | # Search: useful for when you need to answer questions from the local knowledge store, and to get more context 127 | # Lookup: useful for when you need to lookup terms from the local knowledge store, and to get more context 128 | # The assistant MUST use the following search sources to deduce an answer: Search or Lookup. All questions MUST be answered from the results from these search sources, only facts resulting from those sources can be used in an answer. If any piece of information is missing, the assistant must be persistent in finding it from search sources until all tools are exhausted. 129 | # If a source provides enough evidence for an answer, then the assistant can deduce a final answer without trying other sources. 130 | # If the question is not related to the previous conversation, then the assistant must use Search or Lookup to answer the question and ignore the previous conversation. 131 | # If this question is too broad and unclear, or if the assistant needs more context to understand the question, then the assistant can use Search or Lookup to find a more specific question to use with the tools at its disposal. 132 | # At each Observation, the assistant shall ponder carefully whether it has the final answer or not. If the assistant does, then the assistant can stop searching and provide the final answer. 
If the assistant does not, then the assistant must continue searching until all search sources are exhausted. 133 | # If the answer can be fully answered from the previous conversation, then the answer MUST be elaborated and repeated again and sent back to the user as a Final Asnwer. 134 | 135 | # """ 136 | 137 | # PREFIX = """<|im_start|>system 138 | # The following is a friendly conversation between a human and an AI assistant. The AI assistant is talkative and provides lots of specific details from its context. 139 | # Answer questions as shown in the following examples, by splitting the question into individual search or lookup actions to find facts until you can answer the question. 140 | # Observations are prefixed by their source name in square brackets, source names MUST be included with the actions in the answers. 141 | # All questions must be answered from the results from search or look up actions, only facts resulting from those can be used in an answer. 142 | # Answer questions as truthfully as possible, and ONLY answer the questions using the information from observations, do not speculate or answer based on the assistant's own knowledge. 143 | # If the question is not clear or further clarifications are needed, the AI assistant MUST use the search or lookup actions to get the context and information. The AI assistant MUST use one of the tools AT LEAST ONCE. Do NOT answer based on Wikipedia. 144 | # Answer the question using the provided Observations only, and if the answer is not contained within the Observations, say "Sorry, the query did not find a good match. Please rephrase your question": 145 | # """ 146 | 147 | # PREFIX = \ 148 | # """You are an intelligent assistant helping our employees with their knowledge base questions. 149 | # Answer the question using only the data provided in the information sources below. 150 | # Each source has a name followed by colon and the actual data, quote the source name for each piece of data you use in the response. 151 | # For example, if the question is \"What color is the sky?\" and one of the information sources says \"info123: the sky is blue whenever it's not cloudy\", then answer with \"The sky is blue (info123)\" 152 | # It's important to strictly follow the format where the name of the source is in parenthesis at the end of the sentence, and only up to the prefix before the colon (\":\"). 153 | # If there are multiple sources, cite each one in their own square brackets. For example, use \"(info343)(ref-76)\" and not \"(info343,ref-76)\". 154 | # Never quote tool names as sources. 155 | # Answer questions as truthfully as possible, and ONLY answer the questions using the information from observations, do not speculate or answer based on the assistant's own knowledge. 156 | # If the question is not clear or further clarifications are needed, the AI assistant MUST use the search or lookup actions to get the context and information. The AI assistant MUST use one of the tools AT LEAST ONCE. 157 | # If you cannot answer using the sources below, say that you don't know. 
158 | # \n\nYou can access to the following tools:""" -------------------------------------------------------------------------------- /utils/langchain_helpers/oai_fc_agent.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import os 3 | import json 4 | import yaml 5 | import copy 6 | import numpy as np 7 | import itertools 8 | 9 | 10 | from utils.env_vars import * 11 | from utils import openai_helpers 12 | from utils import http_helpers 13 | from utils.cogsearch_helpers import * 14 | 15 | 16 | instruction_prompt = """You are an AI assistant specialized in answering user questions. You can call functions to obtain specific details based on user queries. 17 | Facts have sources, you MUST include the source name in the answer at the beginning before any text. If there are multiple sources, cite each one in their own square brackets. For example, use \"[folder3/info343][http://wikipedia.com]\" and not \"[folder3/info343,http://wikipedia.com]\". You must follow the following format strictly for the final answer: 18 | Answer: [folder1/file1][http://website][http://website2] the answer based on the facts or information. 19 | DO NOT MAKE UP ANY ANSWERS, ALL ANSWERS MUST BE BASED ON THE CONTEXT WHICH IS DELIMITED BY 3 "AT SYMBOL". The Assistant should not make up sources. ALL SOURCES MUST BE EXTRACTED FROM THE CONTEXT WHICH IS DELIMITED BY 3 "AT SYMBOL". 20 | 21 | The below are examples of final answers: 22 | 23 | Question: "what is mentioned about the Lost City hotel?" 24 | Answer: "The Lost City Hotel is a luxurious accommodation in Dubai with an onsite waterpark and aquarium. [website]" 25 | 26 | Question: "what hotels are recommended in Las Vegas?" 27 | Answer: "Margie's Travel offers the following hotels in Las Vegas: The Volcano Hotel, The Fountain Hotel, The Canal Hotel. To book your trip to Las Vegas, visit www.margiestravel.com. [folder/Las Vegas.pdf]" 28 | 29 | Question: "who is Barack Obama?" 30 | Answer: 'Barack Obama is the 44th President of the United States of America. [http://website]' 31 | 32 | Question: "who is Barack Obama?" 33 | Answer: 'Unfortunately, none of the sources I searched provided any specific information about Barack Obama. []' 34 | 35 | Question: "how much are the one day pass tickets for Ferrari world?" 36 | Answer: "I'm sorry, I could not find the ticket prices for Ferrari World. [] 37 | 38 | THE ASSISTANT MUST STRICTLY USE THE COLLECTED EVIDENCE FROM THE USER INPUT OR THE CONTEXT WHICH IS DELIMITED BY 3 "AT SYMBOL", THE ASSISTANT MUST NOT ANSWER FROM MEMORY AND MUST NOT MAKE UP ANSWERS. Assistant must make sure to send the correct source as a reference, if the source is already included in the history which is delimited by three dollar signs, make sure to include it again in the answer. The Assistant should not make up sources. ALL SOURCES MUST BE EXTRACTED FROM THE CONTEXT WHICH IS DELIMITED BY 3 "AT SYMBOL". 
39 | """ 40 | 41 | intent_messages= [ 42 | {"role": "system", "content":instruction_prompt}, 43 | ] 44 | 45 | 46 | 47 | intent_functions= [ 48 | { 49 | "name": "extract_search_terms", 50 | "type": "function", 51 | "description": "Search through knowledge base to find relevant documents that might help in answering the user query.", 52 | "parameters": { 53 | "type": "object", 54 | "properties": { 55 | "search_terms": { 56 | "type": "array", 57 | "items": { 58 | "type": "object", 59 | "properties": { 60 | "term": {"type": "string", "description": "Search terms that would be used in the search engine" }, 61 | "additional_context": {"type": "string", "description": "Additional context related to the term." }, 62 | }, 63 | "required": ["term", "additional_context"] 64 | } 65 | } 66 | }, 67 | "required": ["search_terms"] 68 | } 69 | } 70 | ] 71 | 72 | 73 | 74 | intent_body = """ 75 | Current Conversation: 76 | $$$ 77 | {history} 78 | $$$ 79 | 80 | Query: {query} 81 | 82 | """ 83 | 84 | 85 | body = """ 86 | Current Conversation: 87 | $$$ 88 | {history} 89 | $$$ 90 | 91 | Context: 92 | @@@ 93 | {context} 94 | @@@ 95 | 96 | Question: {query} 97 | 98 | Answer: 99 | """ 100 | 101 | 102 | 103 | class oai_fc_agent(): 104 | 105 | def __init__(self): 106 | self.context = {} 107 | self.context['history'] = "" 108 | 109 | 110 | 111 | def get_dict(self, response): 112 | dd = yaml.full_load(str(response['choices'][0]['message'])) 113 | 114 | if 'function_call' in dd: 115 | dd['function_call']['arguments'] = yaml.full_load(dd['function_call']['arguments']) 116 | 117 | return dd 118 | 119 | 120 | def update_history(self, input_text, answer): 121 | self.context["history"] += f"\nUser: {input_text}\nChatBot: {answer}\n" 122 | 123 | 124 | def chat(self, query, lc_agent, history): 125 | search_results = [] 126 | content = "" 127 | messages = copy.deepcopy(intent_messages) 128 | completion_enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL) 129 | 130 | messages.append({"role": "user", "content":intent_body.format(history=history, query=query)}) 131 | print("messages", messages) 132 | 133 | response = openai_helpers.contact_openai(messages, completion_model = CHOSEN_COMP_MODEL, functions=intent_functions) 134 | 135 | dd = self.get_dict(response) 136 | 137 | 138 | if 'function_call' in dd: 139 | search_terms = dd['function_call']['arguments']['search_terms'] 140 | search_results = [] 141 | 142 | print("search_terms", search_terms) 143 | 144 | for s in search_terms: 145 | search_results.append(lc_agent.agent_cog_search(s['term'] + ' ' + s.get('additional_context', ''))) 146 | 147 | search_results = '\n'.join(search_results) 148 | 149 | empty_prompt_length = len(completion_enc.encode(instruction_prompt + body)) 150 | max_comp_model_tokens = openai_helpers.get_model_max_tokens(CHOSEN_COMP_MODEL) 151 | query = completion_enc.decode(completion_enc.encode(query)[:MAX_QUERY_TOKENS]) 152 | 153 | history = completion_enc.decode(completion_enc.encode(history)[:MAX_HISTORY_TOKENS]) 154 | query_length = len(completion_enc.encode(query)) 155 | history_length = len(completion_enc.encode(history)) 156 | 157 | functions_length = len(completion_enc.encode(str(intent_functions))) 158 | func_args_length = len(completion_enc.encode(str(dd['function_call']['arguments']))) 159 | 160 | max_context_len = max_comp_model_tokens - query_length - MAX_OUTPUT_TOKENS - empty_prompt_length - history_length - functions_length - func_args_length - 1 161 | print(max_context_len, max_comp_model_tokens, query_length, MAX_OUTPUT_TOKENS, 
empty_prompt_length, history_length, functions_length, func_args_length) 162 | 163 | print("max_context_len", max_context_len) 164 | search_results = completion_enc.decode(completion_enc.encode(search_results)[:max_context_len]) 165 | 166 | messages.append( # adding assistant response to messages 167 | { 168 | "role": dd["role"], 169 | "function_call": { 170 | "name": dd["function_call"]["name"], 171 | "arguments": str(dd['function_call']['arguments']) 172 | }, 173 | "content": None 174 | } 175 | ) 176 | messages.append( 177 | { 178 | "role": "function", 179 | "name": dd["function_call"]["name"], 180 | "content": str(search_results), 181 | } 182 | ) 183 | print("search_results", len(search_results), search_results) 184 | print('total tokens', len(completion_enc.encode(str(messages)))) 185 | answer = openai_helpers.contact_openai(messages, completion_model = CHOSEN_COMP_MODEL) 186 | 187 | else: 188 | answer = dd['content'] 189 | 190 | return answer 191 | 192 | 193 | 194 | def run(self, query, lc_agent = None, history = None): 195 | 196 | answer = self.chat(query, lc_agent, history) 197 | print(answer) 198 | return answer -------------------------------------------------------------------------------- /utils/langchain_helpers/oldschoolsearch.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import tiktoken 3 | import numpy as np 4 | import os 5 | import time 6 | import logging 7 | import re 8 | 9 | 10 | import utils.langchain_helpers.simple_prompt 11 | 12 | from utils import openai_helpers 13 | from utils import redis_helpers 14 | from utils import helpers 15 | 16 | 17 | 18 | from langchain.prompts.chat import ( 19 | ChatPromptTemplate, 20 | HumanMessagePromptTemplate, 21 | MessagesPlaceholder, 22 | SystemMessagePromptTemplate, 23 | ) 24 | 25 | 26 | from utils.env_vars import * 27 | 28 | 29 | system_message = "The assistant is a super helpful assistant that plays the role of a linguistic professor and has ultra high attention to details." 30 | 31 | instruction = """From the above Question and Current Conversation, output search keywords to use in a search engine to get an answer for the Question. If the Question is not related to the Current Conversation, then do not use the Current Conversation when generating the Search Keywords. 
32 | Search Keywords:""" 33 | 34 | body = """ 35 | Current Conversation: 36 | {history} 37 | 38 | Question: {question} 39 | """ 40 | 41 | context_prompt = """ 42 | <|im_start|> 43 | {system_message} 44 | <|im_end|> 45 | <|im_start|>user 46 | 47 | Current Conversation: 48 | {history} 49 | 50 | Question: {question} 51 | 52 | {instruction} 53 | <|im_end|> 54 | <|im_start|>assistant 55 | """ 56 | 57 | 58 | class OldSchoolSearch(): 59 | 60 | 61 | def search(self, query, history, pre_context, filter_param=None, enable_unified_search=False, 62 | lc_agent = None, enable_cognitive_search=False, evaluate_step=True, 63 | topK=NUM_TOP_MATCHES, stream = False, verbose = False): 64 | 65 | redis_conn = redis_helpers.get_new_conn() 66 | 67 | completion_model = CHOSEN_COMP_MODEL 68 | embedding_model = CHOSEN_EMB_MODEL 69 | completion_enc = openai_helpers.get_encoder(completion_model) 70 | embedding_enc = openai_helpers.get_encoder(embedding_model) 71 | 72 | if verbose: print("Old Query: ", query) 73 | gen = openai_helpers.get_generation(completion_model) 74 | 75 | if history != '': 76 | 77 | if (gen == 4) or (gen == 3.5): 78 | messages = [ 79 | SystemMessagePromptTemplate.from_template(system_message).format(), 80 | HumanMessagePromptTemplate.from_template(body).format(history=history, question=query), 81 | HumanMessagePromptTemplate.from_template(instruction).format(), 82 | ] 83 | messages = openai_helpers.convert_messages_to_roles(messages) 84 | query = openai_helpers.contact_openai(messages) 85 | else: 86 | prompt = context_prompt.format(system_message=system_message, 87 | history=history, 88 | question=query, 89 | instruction=instruction) 90 | query = openai_helpers.contact_openai(prompt) 91 | 92 | if (gen == 4) or (gen == 3.5): 93 | p = '' 94 | for m in utils.langchain_helpers.simple_prompt.get_simple_prompt('', '', '', ''): p += m['content'] 95 | empty_prompt_length = len(completion_enc.encode(p)) 96 | else: 97 | empty_prompt_length = len(completion_enc.encode(utils.langchain_helpers.simple_prompt.get_simple_prompt('', '', '', ''))) 98 | 99 | 100 | if verbose: print("New Query: ", query) 101 | 102 | max_comp_model_tokens = openai_helpers.get_model_max_tokens(completion_model) 103 | max_emb_model_tokens = openai_helpers.get_model_max_tokens(embedding_model) 104 | 105 | if lc_agent.enable_unified_search: 106 | context = lc_agent.unified_search(query) 107 | elif enable_cognitive_search: 108 | context = lc_agent.agent_cog_search(query) 109 | # elif lc_agent.use_bing: 110 | # context = lc_agent.agent_bing_search(query) 111 | else: 112 | context = lc_agent.agent_redis_search(query) 113 | 114 | query = completion_enc.decode(completion_enc.encode(query)[:MAX_QUERY_TOKENS]) 115 | history = completion_enc.decode(completion_enc.encode(history)[:MAX_HISTORY_TOKENS]) 116 | pre_context = completion_enc.decode(completion_enc.encode(pre_context)[:PRE_CONTEXT]) 117 | 118 | context_length = len(completion_enc.encode(context)) 119 | query_length = len(completion_enc.encode(query)) 120 | history_length = len(completion_enc.encode(history)) 121 | pre_context_length = len(completion_enc.encode(pre_context)) 122 | 123 | max_context_len = max_comp_model_tokens - query_length - MAX_OUTPUT_TOKENS - empty_prompt_length - history_length - pre_context_length - 1 124 | 125 | context = completion_enc.decode(completion_enc.encode(context)[:max_context_len]) 126 | 127 | prompt = utils.langchain_helpers.simple_prompt.get_simple_prompt(context, query, history, pre_context) 128 | 129 | if verbose: 130 | 
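 | # (debug aid) dump the fully assembled prompt between the sentinel lines below so prompt-construction and token-budget issues are easy to spot | 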
print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$") 131 | print(prompt) 132 | print("$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$") 133 | 134 | if verbose: print("OSS OAI Call") 135 | answer = openai_helpers.contact_openai(prompt, completion_model, MAX_OUTPUT_TOKENS, stream=stream, verbose=verbose) 136 | 137 | return answer -------------------------------------------------------------------------------- /utils/langchain_helpers/simple_prompt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from datetime import datetime 4 | from utils import openai_helpers 5 | 6 | from langchain.prompts.chat import ( 7 | ChatPromptTemplate, 8 | HumanMessagePromptTemplate, 9 | MessagesPlaceholder, 10 | SystemMessagePromptTemplate, 11 | ) 12 | 13 | 14 | from utils.env_vars import * 15 | 16 | 17 | 18 | ## Original Prompt - too strict for OpenAI 19 | ## Answer the question using the above Context only, and if the answer is not contained within the Context above, say "Sorry, the query did not find a good match. Please rephrase your question": 20 | 21 | end_of_prev_prompt_tags=""" 22 | <|im_end|> 23 | <|im_start|>user 24 | """ 25 | 26 | append_tags = """ 27 | <|im_end|> 28 | <|im_start|>assistant 29 | """ 30 | 31 | strict_prompt = "If the facts below do not answer the question, say you don't know." 32 | 33 | instruction_template = """The system is an AI assistant that helps people find information in the provided Context and Current Conversation below. Only answer questions based on the facts listed below. {strict} 34 | Facts have sources, you MUST include the source name in the answer at the beginning before any text. If there are multiple sources, cite each one in their own square brackets. For example, use \"[folder3/info343][http://wikipedia.com]\" and not \"[folder3/info343,http://wikipedia.com]\". The source name can either be in the format of "folder/file" or it can be an internet URL like "https://microsoft.com". You must follow the following format strictly for the final answer: 35 | Answer: [folder1/file1][http://website][http://website2] the answer based on the facts or information. 36 | The current time and date will be provided for the assistant in the Context. The assistant can use the current date and time to derive the day and date for any time-related questions, such as this afternoon, this evening, today, tomorrow, this weekend or next week. 37 | The assistant must first decide if the question is related to the Current Conversation. If it is, then the assistant must answer the question based on the Current Conversation and the Context. If the question is not related to the Current Conversation, then the assistant must answer the question based on the Context only. 38 | 39 | The below are examples of final answers: 40 | 41 | Question: "what is mentioned about the Lost City hotel?" 42 | Answer: "[website] The Lost City Hotel is a luxurious accommodation in Dubai with an onsite waterpark and aquarium." 43 | 44 | Question: "what hotels are recommended in Las Vegas?" 45 | Answer: "[folder/Las Vegas.pdf] Margie’s Travel offers the following hotels in Las Vegas: The Volcano Hotel, The Fountain Hotel, The Canal Hotel. To book your trip to Las Vegas, visit www.margiestravel.com." 46 | 47 | Question: "who is Barack Obama?" 48 | Answer: '[http://website] Barack Obama is the 44th President of the United States of America.' 49 | 50 | Question: "who is Barack Obama?" 
51 | Answer: '[] Unfortunately, none of the sources I searched provided any specific information about Barack Obama.' 52 | 53 | Question: "how much are the one day pass tickets for Ferrari world?" 54 | Answer: "[] I'm sorry, I could not find the ticket prices for Ferrari World." 55 | 56 | """ 57 | 58 | 59 | body = """ 60 | Initial Context: 61 | {pre_context} 62 | 63 | Current Conversation: 64 | {history} 65 | 66 | Context: 67 | [https://www.timeanddate.com] The current date and time are {todays_time}. 68 | 69 | {context} 70 | 71 | Question: {query} 72 | Answer: 73 | """ 74 | 75 | 76 | def get_simple_prompt(context, query, history, pre_context): 77 | 78 | # logging.info(f"{CHOSEN_COMP_MODEL}, {GPT35_TURBO_COMPLETIONS_MODEL}, {CHOSEN_COMP_MODEL == GPT35_TURBO_COMPLETIONS_MODEL}") 79 | todays_time = datetime.now().strftime('%A %B %d, %Y %H:%M:%S') 80 | 81 | instruction_strict = instruction_template.format(strict=strict_prompt) 82 | instruction_simple = instruction_template.format(strict="") 83 | 84 | if RESTRICTIVE_PROMPT == 'yes': 85 | instruction = instruction_strict 86 | else: 87 | instruction = instruction_simple 88 | 89 | gen = openai_helpers.get_generation(CHOSEN_COMP_MODEL) 90 | 91 | # if (CHOSEN_COMP_MODEL == GPT4_MODEL) or (CHOSEN_COMP_MODEL == GPT4_32K_MODEL): 92 | if (gen == 4) or (gen == 3.5): 93 | messages = [ 94 | SystemMessagePromptTemplate.from_template(instruction_template).format(strict=strict_prompt), 95 | HumanMessagePromptTemplate.from_template(body).format(history=history, 96 | query=query, 97 | pre_context=pre_context, 98 | context=context, 99 | todays_time=todays_time), 100 | ] 101 | prompt = openai_helpers.convert_messages_to_roles(messages) 102 | elif (CHOSEN_COMP_MODEL == GPT35_TURBO_COMPLETIONS_MODEL): 103 | 104 | prompt = f""" 105 | <|im_start|>system 106 | {instruction} 107 | 108 | 109 | <|im_end|> 110 | <|im_start|>user 111 | 112 | 113 | Initial Context: 114 | {pre_context} 115 | 116 | Current Conversation: 117 | {history} 118 | 119 | Context: 120 | [https://www.timeanddate.com] The current date and time are {datetime.now().strftime('%A %B %d, %Y %H:%M:%S')}. 121 | 122 | {context} 123 | 124 | Question: {query} 125 | Answer: 126 | <|im_end|> 127 | <|im_start|>assistant 128 | """ 129 | 130 | else: 131 | 132 | prompt =f"""{instruction} 133 | 134 | Initial Context: 135 | {pre_context} 136 | 137 | Current Conversation: 138 | {history} 139 | 140 | Context: 141 | [https://www.timeanddate.com] The current date and time are {datetime.now().strftime('%A %B %d, %Y %H:%M:%S')}. 142 | 143 | {context} 144 | 145 | 146 | Question: {query} 147 | Answer: 148 | 149 | """ 150 | 151 | # logging.info(f"Using as prompt instruction: {instruction}") 152 | # print(f"Using as prompt instruction: {instruction}") 153 | 154 | return prompt -------------------------------------------------------------------------------- /utils/langchain_helpers/streaming_handler.py: -------------------------------------------------------------------------------- 1 | """Callback Handler streams to stdout on new llm token.""" 2 | import sys 3 | from typing import Any, Dict, List, Union 4 | import re 5 | from langchain.callbacks.base import BaseCallbackHandler 6 | from langchain.schema import AgentAction, AgentFinish, LLMResult 7 | 8 | 9 | class StreamingSocketIOCallbackHandler(BaseCallbackHandler): 10 | """Callback handler for streaming. 
Only works with LLMs that support streaming.""" 11 | 12 | def __init__(self, socketio_obj, connection_id): 13 | self.socketio_obj = socketio_obj 14 | self.connection_id = connection_id 15 | super().__init__() 16 | 17 | def on_llm_start( 18 | self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any 19 | ) -> None: 20 | """Run when LLM starts running.""" 21 | self.buffer = '' 22 | self.partial_answer = '' 23 | self.num_partial_answer = 0 24 | 25 | def output_partial_answer(self): 26 | self.partial_answer = self.partial_answer.replace('":', '').replace('"', '').replace('}', '').replace('```', '').replace(':', '').replace('\\n', '
') 27 | self.socketio_obj.emit('token', self.partial_answer, to=self.connection_id) 28 | self.partial_answer = '' 29 | self.num_partial_answer = 0 30 | 31 | def process_new_token(self, token): 32 | self.partial_answer += token # 33 | self.num_partial_answer += 1 34 | 35 | source_matches = re.findall(r'\[(.*?)\]', self.partial_answer) 36 | for s in source_matches: 37 | self.partial_answer = self.partial_answer.replace('['+s+']', '') 38 | 39 | if ('[' in self.partial_answer) and (']' not in self.partial_answer): 40 | return 41 | else: 42 | if (self.num_partial_answer >= 5) and (not self.partial_answer.endswith('\\')): 43 | self.output_partial_answer() 44 | 45 | 46 | def on_llm_new_token(self, token: str, **kwargs: Any) -> None: 47 | """Run on new LLM token. Only available when streaming is enabled.""" 48 | self.buffer += token 49 | 50 | if '"action": "Final Answer"' in self.buffer: 51 | if '"action_input":' in self.buffer: 52 | self.process_new_token(token) 53 | 54 | if 'Final Answer:' in self.buffer: 55 | self.process_new_token(token) 56 | 57 | 58 | def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None: 59 | """Run when LLM ends running.""" 60 | self.output_partial_answer() 61 | 62 | def on_llm_error( 63 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 64 | ) -> None: 65 | """Run when LLM errors.""" 66 | 67 | def on_chain_start( 68 | self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any 69 | ) -> None: 70 | """Run when chain starts running.""" 71 | 72 | def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None: 73 | """Run when chain ends running.""" 74 | 75 | def on_chain_error( 76 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 77 | ) -> None: 78 | """Run when chain errors.""" 79 | 80 | def on_tool_start( 81 | self, serialized: Dict[str, Any], input_str: str, **kwargs: Any 82 | ) -> None: 83 | """Run when tool starts running.""" 84 | 85 | def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any: 86 | """Run on agent action.""" 87 | 88 | def on_tool_end(self, output: str, **kwargs: Any) -> None: 89 | """Run when tool ends running.""" 90 | 91 | def on_tool_error( 92 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 93 | ) -> None: 94 | """Run when tool errors.""" 95 | 96 | def on_text(self, text: str, **kwargs: Any) -> None: 97 | """Run on arbitrary text.""" 98 | 99 | def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None: 100 | """Run on agent end.""" 101 | 102 | 103 | 104 | 105 | class StreamingStdOutCallbackHandler(BaseCallbackHandler): 106 | """Callback handler for streaming. 
Only works with LLMs that support streaming.""" 107 | 108 | buffer: str = '' 109 | partial_answer: str = '' 110 | num_partial_answer: int = 0 111 | 112 | 113 | def on_llm_start( 114 | self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any 115 | ) -> None: 116 | """Run when LLM starts running.""" 117 | self.buffer = '' 118 | self.partial_answer = '' 119 | self.num_partial_answer = 0 120 | 121 | 122 | def output_partial_answer(self): 123 | self.partial_answer = self.partial_answer.replace('":', '').replace('"', '').replace('}', '').replace('```', '').replace(':', '') 124 | sys.stdout.write(self.partial_answer) 125 | sys.stdout.flush() 126 | self.partial_answer = '' 127 | self.num_partial_answer = 0 128 | 129 | def process_new_token(self, token): 130 | self.partial_answer += token # 131 | self.num_partial_answer += 1 132 | 133 | source_matches = re.findall(r'\[(.*?)\]', self.partial_answer) 134 | for s in source_matches: 135 | self.partial_answer = self.partial_answer.replace('['+s+']', '') 136 | 137 | if ('[' in self.partial_answer) and (']' not in self.partial_answer): 138 | return 139 | else: 140 | if (self.num_partial_answer >= 5) and (not self.partial_answer.endswith('\\')): 141 | self.output_partial_answer() 142 | 143 | 144 | def on_llm_new_token(self, token: str, **kwargs: Any) -> None: 145 | """Run on new LLM token. Only available when streaming is enabled.""" 146 | self.buffer += token 147 | 148 | if '"action": "Final Answer"' in self.buffer: 149 | if '"action_input":' in self.buffer: 150 | self.process_new_token(token) 151 | 152 | if 'Final Answer:' in self.buffer: 153 | self.process_new_token(token) 154 | 155 | 156 | def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None: 157 | """Run when LLM ends running.""" 158 | self.output_partial_answer() 159 | 160 | 161 | 162 | def on_llm_error( 163 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 164 | ) -> None: 165 | """Run when LLM errors.""" 166 | 167 | def on_chain_start( 168 | self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any 169 | ) -> None: 170 | """Run when chain starts running.""" 171 | 172 | def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None: 173 | """Run when chain ends running.""" 174 | 175 | def on_chain_error( 176 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 177 | ) -> None: 178 | """Run when chain errors.""" 179 | 180 | def on_tool_start( 181 | self, serialized: Dict[str, Any], input_str: str, **kwargs: Any 182 | ) -> None: 183 | """Run when tool starts running.""" 184 | 185 | def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any: 186 | """Run on agent action.""" 187 | pass 188 | 189 | def on_tool_end(self, output: str, **kwargs: Any) -> None: 190 | """Run when tool ends running.""" 191 | 192 | def on_tool_error( 193 | self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any 194 | ) -> None: 195 | """Run when tool errors.""" 196 | 197 | def on_text(self, text: str, **kwargs: Any) -> None: 198 | """Run on arbitrary text.""" 199 | 200 | def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None: 201 | """Run on agent end.""" 202 | -------------------------------------------------------------------------------- /utils/language.py: -------------------------------------------------------------------------------- 1 | 2 | import requests 3 | import uuid 4 | import os 5 | import logging 6 | 7 | import typing 8 | from azure.core.credentials import AzureKeyCredential 9 | from azure.ai.textanalytics 
import TextAnalyticsClient 10 | 11 | from utils.env_vars import * 12 | 13 | def detect_content_language(content): 14 | path = '/detect' 15 | constructed_url = TRANSLATION_ENDPOINT + path 16 | 17 | params = { 18 | 'api-version': '3.0', 19 | } 20 | 21 | headers = { 22 | 'Ocp-Apim-Subscription-Key': TRANSLATION_API_KEY, 23 | 'Ocp-Apim-Subscription-Region': TRANSLATION_LOCATION, 24 | 'Content-type': 'application/json', 25 | 'X-ClientTraceId': str(uuid.uuid4()) 26 | } 27 | 28 | # You can pass more than one object in body. 29 | body = [{'text': content}] 30 | 31 | request = requests.post(constructed_url, params=params, headers=headers, json=body) 32 | response = request.json() 33 | 34 | try: 35 | lang = response[0]['language'] 36 | return lang 37 | except: 38 | return 'xx' 39 | 40 | 41 | 42 | 43 | def translate(text, from_lang, to_lang = 'en'): 44 | 45 | path = '/translate' 46 | constructed_url = TRANSLATION_ENDPOINT + path 47 | body = [{'text': text}] 48 | 49 | params = { 50 | 'api-version': '3.0', 51 | 'from': from_lang, 52 | 'to': [to_lang] 53 | } 54 | 55 | headers = { 56 | 'Ocp-Apim-Subscription-Key': TRANSLATION_API_KEY, 57 | 'Ocp-Apim-Subscription-Region': TRANSLATION_LOCATION, 58 | 'Content-type': 'application/json', 59 | 'X-ClientTraceId': str(uuid.uuid4()) 60 | } 61 | 62 | request = requests.post(constructed_url, params=params, headers=headers, json=body) 63 | response = request.json() 64 | 65 | try: 66 | # print(response) 67 | return response[0]['translations'][0]['text'] 68 | except Exception as e: 69 | print(e) 70 | return response 71 | 72 | 73 | 74 | def extract_entities(text): 75 | 76 | text_analytics_client = TextAnalyticsClient(endpoint=COG_SERV_ENDPOINT, credential=AzureKeyCredential(COG_SERV_KEY)) 77 | reviews = [text] 78 | 79 | result = text_analytics_client.recognize_entities(reviews) 80 | result = [review for review in result if not review.is_error] 81 | organization_to_reviews: typing.Dict[str, typing.List[str]] = {} 82 | 83 | entities = [] 84 | 85 | for idx, review in enumerate(result): 86 | for entity in review.entities: 87 | entities.append(entity.text) 88 | #print(entity.text) 89 | 90 | return entities -------------------------------------------------------------------------------- /utils/openai_helpers.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import tiktoken 3 | import numpy as np 4 | import os 5 | import time 6 | import logging 7 | 8 | from tenacity import ( 9 | retry, 10 | stop_after_attempt, 11 | wait_random_exponential, 12 | ) 13 | 14 | 15 | # from langchain.prompts.chat import ( 16 | # ChatPromptTemplate, 17 | # HumanMessagePromptTemplate, 18 | # MessagesPlaceholder, 19 | # SystemMessagePromptTemplate, 20 | # AIMessagePromptTemplate 21 | # ) 22 | 23 | 24 | # from langchain.schema import ( 25 | # AIMessage, 26 | # HumanMessage, 27 | # SystemMessage 28 | # ) 29 | 30 | from utils.env_vars import * 31 | 32 | 33 | import openai 34 | openai.api_version = OPENAI_API_VERSION 35 | 36 | 37 | 38 | system_start_prompt = "<|im_start|>system " 39 | user_start_prompt = "<|im_start|>user " 40 | assistant_start_prompt = "<|im_start|>assistant " 41 | end_prompt = "<|im_end|> " 42 | 43 | 44 | system_start_prompt=""" 45 | <|im_end|> 46 | <|im_start|>user 47 | """ 48 | 49 | append_tags = """ 50 | <|im_end|> 51 | <|im_start|>assistant 52 | """ 53 | 54 | 55 | 56 | def check_model_deployment(oai_model): 57 | try: 58 | model_exists = False 59 | result = openai.Deployment.list() 60 | for deployment in result.data: 61 | if 
(deployment["model"] == oai_model): 62 | model_exists = True 63 | #logging.info(f"Found deployment {deployment}") 64 | return deployment["id"] 65 | 66 | 67 | if not model_exists: 68 | openai.Deployment.create(model=oai_model, scale_settings={"scale_type":"standard"}) 69 | time.sleep(30) 70 | assert model_exists, f"Model {oai_model} is not deployed, deploying now" 71 | 72 | except Exception as e: 73 | 74 | print(e) 75 | counter = 0 76 | deployed = False 77 | 78 | while counter < 2: 79 | time.sleep(2) 80 | result = openai.Deployment.list() 81 | print(f"Found {len(result.data)} deployments") 82 | 83 | for deployment in result.data: 84 | logging.info(f"OpenAI Deployment Exception --- Found deployment {deployment}") 85 | if (deployment["status"] == "succeeded") and (deployment["model"] == oai_model): 86 | deployed = True 87 | print(f"The right model {deployment['model']} was found") 88 | return deployment["id"] 89 | 90 | if deployed: break 91 | 92 | counter += 1 93 | 94 | return "" 95 | 96 | 97 | 98 | # completion_deployment_id = check_model_deployment(CHOSEN_COMP_MODEL) 99 | # embedding_deployment_id = check_model_deployment(CHOSEN_EMB_MODEL) 100 | 101 | 102 | 103 | def experiment_prompt(context, query): 104 | 105 | prompt =f""" 106 | Context: {context} 107 | 108 | Question: {query} 109 | 110 | 111 | Answer the question using the above Context only, and if the answer is not contained within the Context above, say "Sorry, I don't know": 112 | """ 113 | 114 | 115 | 116 | def get_summ_prompt(text): 117 | 118 | prompt =f""" 119 | Summarize the following text. 120 | 121 | Text: 122 | ### 123 | {text} 124 | ### 125 | 126 | Summary: 127 | """ 128 | 129 | return prompt 130 | 131 | 132 | def get_generation(model): 133 | if model == "text-davinci-003": 134 | return 3 135 | elif model == "gpt-35-turbo": 136 | return 3.5 137 | elif model == "gpt-35-turbo-16k": 138 | return 3.5 139 | elif model == "gpt-4-32k": 140 | return 4 141 | elif model == "gpt-4": 142 | return 4 143 | else: 144 | assert False, f"Generation unknown for model {model}" 145 | 146 | 147 | 148 | def convert_messages_to_roles(messages): 149 | roles = [] 150 | for m in messages: 151 | if isinstance(m, HumanMessage): 152 | roles.append({'role':'user', 'content': m.content}) 153 | elif isinstance(m, AIMessage): 154 | roles.append({'role':'assistant', 'content': m.content}) 155 | elif isinstance(m, SystemMessage): 156 | roles.append({'role':'system', 'content': m.content}) 157 | elif isinstance(m, Messages): 158 | roles.append({'role':'user', 'content': m.content}) 159 | else: 160 | assert False, f"Unknown message type {type(m)}" 161 | 162 | return roles 163 | 164 | 165 | def get_model_max_tokens(model): 166 | if model == "text-search-davinci-doc-001": 167 | return DAVINCI_003_EMB_MAX_TOKENS 168 | elif model == "text-search-davinci-query-001": 169 | return DAVINCI_003_EMB_MAX_TOKENS 170 | elif model == "text-davinci-003": 171 | return DAVINCI_003_MODEL_MAX_TOKENS 172 | elif model == "text-embedding-ada-002": 173 | return ADA_002_MODEL_MAX_TOKENS 174 | elif model == "gpt-35-turbo": 175 | return GPT35_TURBO_COMPLETIONS_MAX_TOKENS 176 | elif model == "gpt-35-turbo-16k": 177 | return GPT35_TURBO_16K_COMPLETIONS_MAX_TOKENS 178 | elif model == "gpt-4-32k": 179 | return GPT4_32K_COMPLETIONS_MODEL_MAX_TOKENS 180 | elif model == "gpt-4": 181 | return GPT4_COMPLETIONS_MODEL_MAX_TOKENS 182 | else: 183 | return GPT35_TURBO_COMPLETIONS_MAX_TOKENS 184 | 185 | 186 | def get_encoding_name(model): 187 | if model == "text-search-davinci-doc-001": 188 | return 
"p50k_base" 189 | elif model == "text-embedding-ada-002": 190 | return "cl100k_base" 191 | elif model == "gpt-35-turbo": 192 | return "cl100k_base" 193 | elif model == "gpt-35-turbo-16k": 194 | return "cl100k_base" 195 | elif model == "gpt-4-32k": 196 | return "cl100k_base" 197 | elif model == "gpt-4": 198 | return "cl100k_base" 199 | elif model == "text-davinci-003": 200 | return "p50k_base" 201 | else: 202 | return "gpt2" 203 | 204 | 205 | def get_encoder(model): 206 | if model == "text-search-davinci-doc-001": 207 | return tiktoken.get_encoding("p50k_base") 208 | elif model == "text-embedding-ada-002": 209 | return tiktoken.get_encoding("cl100k_base") 210 | elif model == "gpt-35-turbo": 211 | return tiktoken.get_encoding("cl100k_base") 212 | elif model == "gpt-35-turbo-16k": 213 | return tiktoken.get_encoding("cl100k_base") 214 | elif model == "gpt-4-32k": 215 | return tiktoken.get_encoding("cl100k_base") 216 | elif model == "gpt-4": 217 | return tiktoken.get_encoding("cl100k_base") 218 | elif model == "text-davinci-003": 219 | return tiktoken.get_encoding("p50k_base") 220 | else: 221 | return tiktoken.get_encoding("gpt2") 222 | 223 | 224 | 225 | def get_model_dims(embedding_model): 226 | if embedding_model == "text-search-davinci-doc-001": 227 | return DAVINCI_003_EMBED_NUM_DIMS 228 | elif embedding_model == "text-embedding-ada-002": 229 | return ADA_002_EMBED_NUM_DIMS 230 | else: 231 | return ADA_002_EMBED_NUM_DIMS 232 | 233 | 234 | def get_token_length(text, model = CHOSEN_EMB_MODEL): 235 | enc = get_encoder(model) 236 | return len(enc.encode(text)) 237 | 238 | 239 | 240 | # @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(30)) 241 | def get_openai_embedding(query, embedding_model = CHOSEN_EMB_MODEL): 242 | return openai.Embedding.create(input=query, engine=embedding_model)['data'][0]['embedding'] 243 | 244 | 245 | 246 | @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(20)) 247 | def openai_summarize(text, completion_model, max_output_tokens = MAX_OUTPUT_TOKENS, lang='en'): 248 | prompt = get_summ_prompt(text) 249 | return contact_openai(prompt, completion_model, max_output_tokens) 250 | 251 | 252 | 253 | # @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(7)) 254 | def contact_openai(prompt, completion_model = CHOSEN_COMP_MODEL, max_output_tokens = MAX_OUTPUT_TOKENS, functions=None, stream = False, verbose = False): 255 | if verbose: print("\n########################### Calling OAI Completion API - start call") 256 | 257 | 258 | b = time.time() 259 | openai.api_version = "2023-07-01-preview" 260 | 261 | if not isinstance(prompt, list): 262 | prompt = [{'role':'user', 'content': prompt}] 263 | 264 | if functions is None: 265 | resp = openai.ChatCompletion.create( 266 | messages=prompt, 267 | temperature=TEMPERATURE, 268 | max_tokens=max_output_tokens, 269 | engine=completion_model, 270 | stream = stream 271 | ) 272 | else: 273 | resp = openai.ChatCompletion.create( 274 | messages=prompt, 275 | temperature=TEMPERATURE, 276 | max_tokens=max_output_tokens, 277 | engine=completion_model, 278 | functions=functions, 279 | function_call="auto", 280 | stream = stream 281 | ) 282 | a = time.time() 283 | if verbose: print(f"Using GPT-4 - Chat Completion - with stream {stream} - OpenAI response time: {a-b}") 284 | if stream: return resp 285 | else: 286 | if functions is None: 287 | return resp["choices"][0]["message"]['content'].strip(" \n") 288 | else: 289 | return resp 290 | 291 | 292 | 293 | 294 | 295 | 
-------------------------------------------------------------------------------- /utils/redis_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import redis 4 | from redis import Redis 5 | import logging 6 | import copy 7 | from redis.commands.search.field import VectorField 8 | from redis.commands.search.field import TextField 9 | from redis.commands.search.field import TagField 10 | from redis.commands.search.query import Query 11 | from redis.commands.search.result import Result 12 | 13 | 14 | ## https://redis-py.readthedocs.io/en/stable/commands.html 15 | ## https://redis.io/docs/stack/search/reference/query_syntax/ 16 | 17 | 18 | 19 | from utils.kb_doc import KB_Doc 20 | 21 | from tenacity import ( 22 | retry, 23 | stop_after_attempt, 24 | wait_random_exponential, 25 | ) 26 | 27 | 28 | from utils.env_vars import * 29 | 30 | 31 | def get_model_dims(embedding_model): 32 | if embedding_model == "text-search-davinci-doc-001": 33 | return DAVINCI_003_EMBED_NUM_DIMS 34 | elif embedding_model == "text-embedding-ada-002": 35 | return ADA_002_EMBED_NUM_DIMS 36 | else: 37 | return ADA_002_EMBED_NUM_DIMS 38 | 39 | 40 | def create_search_index (redis_new_conn, vector_field_name, number_of_vectors, vector_dimensions=512, distance_metric='L2'): 41 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 42 | 43 | M=40 44 | EF=200 45 | 46 | fields = [VectorField(vector_field_name, "HNSW", {"TYPE": "FLOAT32", "DIM": vector_dimensions, "DISTANCE_METRIC": distance_metric, "INITIAL_CAP": number_of_vectors, "M": M, "EF_CONSTRUCTION": EF})] + \ 47 | [TextField(f) for f in KB_Doc().get_fields() if f != VECTOR_FIELD_IN_REDIS] 48 | 49 | redis_new_conn.ft(REDIS_INDEX_NAME).create_index(fields) 50 | 51 | 52 | def flush_cached_values_only(): 53 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 54 | 55 | redis_conn = get_new_conn() 56 | ks = redis_conn.keys() 57 | print(f"Found {len(ks)} values that are cached in Redis") 58 | 59 | for k in ks: 60 | ttl = redis_conn.ttl(k) 61 | if ttl > 0: 62 | print(f"Key has {ttl} seconds to live, deleting...") 63 | redis_conn.expire(name=k, time=1) 64 | 65 | 66 | 67 | def redis_reset_index(redis_new_conn): 68 | #flush all data 69 | redis_new_conn.flushall() 70 | 71 | #create flat index & load vectors 72 | create_search_index(redis_new_conn,VECTOR_FIELD_IN_REDIS, NUMBER_PRODUCTS_INDEX, get_model_dims(CHOSEN_EMB_MODEL), 'COSINE') 73 | 74 | 75 | def test_redis(redis_new_conn): 76 | if (REDIS_ADDR is None) or (REDIS_ADDR == '') or (USE_REDIS_CACHE != 1): return None 77 | print("test redis") 78 | 79 | try: 80 | out = redis_new_conn.ft(REDIS_INDEX_NAME).info() 81 | print(f"Found Redis Index {REDIS_INDEX_NAME}") 82 | except Exception as e: 83 | # print(f"Redis Index {REDIS_INDEX_NAME} not found. Creating a new index.") 84 | logging.error(f"Redis Index {REDIS_INDEX_NAME} not found. 
Creating a new index.") 85 | redis_reset_index(redis_new_conn) 86 | 87 | 88 | def get_new_conn(): 89 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 90 | 91 | if REDIS_PASSWORD == '': 92 | redis_conn = Redis(host = REDIS_ADDR, port = REDIS_PORT) 93 | else: 94 | redis_conn = redis.StrictRedis(host=REDIS_ADDR, port=int(REDIS_PORT), password=REDIS_PASSWORD, ssl=True) 95 | 96 | #print('Connected to redis', redis_conn) 97 | test_redis(redis_conn) 98 | 99 | return redis_conn 100 | 101 | 102 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(4)) 103 | def redis_upsert_embedding(redis_conn, e_dict): 104 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 105 | 106 | try: 107 | #embeds = np.array(e[VECTOR_FIELD_IN_REDIS]).astype(np.float32).tobytes() 108 | #meta = {'text_en': e['text_en'], 'text':e['text'], 'doc_url': e['doc_url'], 'timestamp': e['timestamp'], VECTOR_FIELD_IN_REDIS:embeds} 109 | e = copy.deepcopy(e_dict) 110 | 111 | for k in e: 112 | if isinstance(e[k], list) and (len(e[k]) > 0): 113 | if isinstance(e[k][0], float): e[k] = np.array(e[k]).astype(np.float32).tobytes() 114 | if isinstance(e[k][0], str): e[k] = ', '.join(e[k]) 115 | 116 | # e[VECTOR_FIELD_IN_REDIS] = np.array(e[VECTOR_FIELD_IN_REDIS]).astype(np.float32).tobytes() 117 | 118 | for k in e: 119 | if isinstance(e[k], list): 120 | print(e[k]) 121 | 122 | p = redis_conn.pipeline(transaction=False) 123 | p.hset(e['id'], mapping=e) 124 | p.execute() 125 | return 1 126 | 127 | except Exception as e: 128 | print(f"Embedding Except: {e}") 129 | logging.error(f"Embedding Except: {e}") 130 | return 0 131 | 132 | 133 | 134 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(4)) 135 | def redis_query_embedding_index(redis_conn, query_emb, t_id, topK=5, filter_param=None): 136 | if (REDIS_ADDR is None) or (REDIS_ADDR == ''): return None 137 | 138 | if (filter_param is None) or (filter_param == '*'): 139 | filter_param = '*' 140 | else: 141 | if not filter_param.startswith('@'): 142 | filter_param = '@' + filter_param 143 | 144 | filter_param = filter_param.replace('-', '\-') 145 | fields = list(KB_Doc().get_fields()) + ['vector_score'] 146 | query_vector = np.array(query_emb).astype(np.float32).tobytes() 147 | query_string = f'({filter_param})=>[KNN {topK} @{VECTOR_FIELD_IN_REDIS} $vec_param AS vector_score]' 148 | 149 | q = Query(query_string).sort_by('vector_score').paging(0,topK).return_fields(*fields).dialect(2) 150 | params_dict = {"vec_param": query_vector} 151 | results = redis_conn.ft(REDIS_INDEX_NAME).search(q, query_params = params_dict) 152 | 153 | return [{k: match.__dict__[k] for k in (set(list(match.__dict__.keys())) - set([VECTOR_FIELD_IN_REDIS]))} for match in results.docs if match.id != t_id] 154 | 155 | 156 | 157 | @retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(4)) 158 | def redis_set(redis_conn, key, field, value, expiry = None, verbose = False, force=False): 159 | 160 | print("Entering REDIS SET", REDIS_ADDR, force) 161 | if (REDIS_ADDR is not None) and (REDIS_ADDR != '') and (force == True): 162 | pass 163 | else: 164 | if (REDIS_ADDR is None) or (REDIS_ADDR == '') or (USE_REDIS_CACHE != 1): return None 165 | 166 | print("Executing REDIS SET") 167 | key = key.replace('"', '') 168 | res = redis_conn.hset(key, field, value) 169 | 170 | if expiry is not None: 171 | redis_conn.expire(name=key, time=expiry) 172 | if verbose: print("\nSetting Redis Key: ", key, field, expiry) 173 | return res 174 | 175 | 176 | 177 | 
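# ---------------------------------------------------------------------------
# NOTE: a minimal sketch of the caching pattern that redis_set/redis_get
# support, not part of the original module. cached_answer is illustrative
# only; it assumes USE_REDIS_CACHE=1 and a reachable Redis at REDIS_ADDR.

def cached_answer(redis_conn, prompt, compute_fn):
    import hashlib
    # Key the cache on a hash of the prompt so identical requests are reused.
    key = hashlib.md5(prompt.encode('utf-8')).hexdigest()
    hit = redis_get(redis_conn, key, 'answer', expiry=CONVERSATION_TTL_SECS)
    if hit is not None:
        return hit.decode('utf-8') if isinstance(hit, bytes) else hit
    answer = compute_fn(prompt)
    redis_set(redis_conn, key, 'answer', answer, expiry=CONVERSATION_TTL_SECS)
    return answer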
@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(4)) 178 | def redis_get(redis_conn, key, field, expiry = CONVERSATION_TTL_SECS, verbose = False, force=False): 179 | 180 | print("Entering REDIS GET", REDIS_ADDR, force) 181 | if (REDIS_ADDR is not None) and (REDIS_ADDR != '') and (force == True): 182 | pass 183 | else: 184 | if (REDIS_ADDR is None) or (REDIS_ADDR == '') or (USE_REDIS_CACHE != 1): return None 185 | print("Executing REDIS GET") 186 | 187 | key = key.replace('"', '') 188 | if verbose: print("\nGetting Redis Key: ", key, field) 189 | if redis_conn.ttl(key) > 0: redis_conn.expire(name=key, time=expiry) 190 | return redis_conn.hget(key, field) 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /utils/storage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import urllib 5 | from requests.utils import requote_uri 6 | from datetime import datetime, timedelta 7 | import logging 8 | import smart_open 9 | from azure.storage.blob import BlobServiceClient, BlobClient 10 | from azure.storage.blob import ContainerClient, __version__ 11 | from azure.storage.blob import generate_blob_sas, BlobSasPermissions 12 | import copy 13 | import uuid 14 | import json 15 | 16 | from utils.env_vars import * 17 | 18 | 19 | def get_kb_container_client(): 20 | blob_service_client = BlobServiceClient.from_connection_string(KB_BLOB_CONN_STR) 21 | return blob_service_client 22 | 23 | 24 | blob_service_client = get_kb_container_client() 25 | 26 | 27 | def get_container_name(url): 28 | return url.split('.blob.core.windows.net/')[1].split('/')[0] 29 | 30 | 31 | def create_sas_from_container_and_blob(container, blob_name): 32 | blob_client = blob_service_client.get_blob_client(container=container, blob=blob_name) 33 | 34 | token = generate_blob_sas( 35 | account_name=blob_client.account_name, 36 | account_key=blob_client.credential.account_key, 37 | container_name=container, 38 | blob_name=blob_name, 39 | permission=BlobSasPermissions(read=True), 40 | expiry=datetime.utcnow() + timedelta(hours=20*365*24), 41 | ) 42 | 43 | sas_url = blob_client.url + '?' 
+ token
44 | #print(f"Processing now '{blob_name}' with SAS URL {sas_url}")
45 | return sas_url
46 |
47 |
48 | def get_filename(blob_path):
49 | try:
50 | return urllib.parse.unquote(os.path.basename(blob_path.split('?')[0]))
51 | except Exception:
52 | return 'default_file_name_exception'
53 |
54 |
55 |
56 | def create_sas(blob_path):
57 | blob_name = get_filename(blob_path)
58 | container = get_container_name(blob_path)
59 | return create_sas_from_container_and_blob(container, blob_name)
60 |
61 |
62 |
63 |
64 | def save_json_document(data_dict, container = OUTPUT_BLOB_CONTAINER):
65 |
66 | ret_dict = {}
67 |
68 | new_doc = copy.deepcopy(data_dict)
69 |
70 | new_doc['id'] = new_doc.get('id', str(uuid.uuid4()))
71 | new_doc['categoryId'] = CATEGORYID
72 | new_doc['timestamp'] = new_doc.get('timestamp', datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
73 | new_doc['doc_url'] = new_doc.get('doc_url', f'https://microsoft.com/{str(uuid.uuid4())}')
74 |
75 | if 'content' in new_doc.keys():
76 | del new_doc['content']
77 |
78 | container_client = blob_service_client.get_container_client(container)
79 |
80 | try:
81 | container_properties = container_client.get_container_properties()
82 | except Exception as e:
83 | container_client.create_container()
84 |
85 | blob_name = urllib.parse.unquote(os.path.basename(new_doc['doc_url'].split('?')[0]))
86 | pre, ext = os.path.splitext(blob_name)
87 | blob_name = pre + '.json'
88 | blob_client = container_client.get_blob_client(blob=blob_name)
89 | blob_client.upload_blob(json.dumps(new_doc, indent=4), overwrite=True)
90 | ret_dict['status'] = f"Document {new_doc['id']} was successfully saved to the {container} container"
91 | logging.info(ret_dict['status'])
92 |
93 | return ret_dict
94 |
95 |
96 |
97 |
98 | def list_documents(container):
99 | container_client = blob_service_client.get_container_client(container)
100 | generator = container_client.list_blobs()
101 | blobs = []
102 | for blob in generator:
103 | blob_client = blob_service_client.get_blob_client(container=container, blob=blob.name)
104 | blobs.append(blob_client.url)
105 |
106 | return blobs
107 |
108 |
109 | def get_document_url(container, filename):
110 | url = f"https://{blob_service_client.account_name}.blob.core.windows.net/{container}/{filename}"
111 | return requote_uri(url)
112 |
113 |
114 | def get_document(container, filename):
115 |
116 | transport_params = {
117 | 'client': blob_service_client
118 | }
119 |
120 | with smart_open.open(f"azure://{container}/{filename}", transport_params=transport_params) as fin:
121 | data = fin.read()
122 |
123 | return data
124 |
125 |
126 | def download_document(url, as_text = True):
127 |
128 | blob_name = get_filename(url)  # derive the blob name and container from the URL before building the client
129 | container = get_container_name(url)
130 | blob_client = blob_service_client.get_blob_client(container=container, blob=blob_name)
131 | download_stream = blob_client.download_blob()
132 |
133 | if as_text:
134 | return download_stream.content_as_text()
135 | else:
136 | return download_stream.content_as_bytes()
137 |
138 |
139 | -------------------------------------------------------------------------------- /utils/summarization.py: --------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import numpy as np
4 | import pandas as pd
5 | import urllib
6 | from datetime import datetime, timedelta
7 | import logging
8 | import copy
9 | import uuid
10 | import json
11 | import openpyxl
12 | import time
13 |
14 | from langchain
import OpenAI, PromptTemplate, LLMChain 15 | from langchain.text_splitter import CharacterTextSplitter 16 | from langchain.chains.mapreduce import MapReduceChain 17 | from langchain.prompts import PromptTemplate 18 | from langchain.text_splitter import TokenTextSplitter, TextSplitter 19 | from langchain.chains.summarize import load_summarize_chain 20 | from langchain.docstore.document import Document 21 | from langchain.callbacks.base import CallbackManager 22 | 23 | from utils import openai_helpers 24 | from utils import helpers 25 | from utils import fr_helpers 26 | 27 | from utils.env_vars import * 28 | 29 | 30 | ## Use with Python 3.9+ ONLY 31 | # """ 32 | # from utils import km_agents 33 | # from utils import openai_helpers 34 | # from utils import fr_helpers 35 | # from utils import summarization 36 | # folder = './docs_to_summarize' 37 | # ref_summ_df = summarization.summarize_folder(folder, mode='refine', verbose=False) 38 | # mp_summ_df = summarization.summarize_folder(folder, mode='map_reduce', verbose=False) 39 | # """ 40 | 41 | 42 | 43 | mapreduce_prompt_template = """The maximum output is about 500 to 750 tokens, so make sure to take advantage of this to the maximum.\n 44 | Write an elaborate summary of 3 paragraphs of the following: 45 | 46 | 47 | {text} 48 | 49 | 50 | SUMMARY:""" 51 | 52 | 53 | refine_prompt_template = """Write an elaborate summary of 3 paragraphs of the following: 54 | 55 | {text} 56 | 57 | """ 58 | 59 | refine_template = ( 60 | "Your job is to produce a final summary of 3 paragraphs that is elaborate and rich in details.\n" 61 | "The maximum output is about 500 to 750 tokens, so make sure to take advantage of this to the maximum.\n" 62 | "We have provided an existing summary up to a certain point: {existing_answer}\n" 63 | "We have the opportunity to refine the existing summary." 64 | "(only if needed) with some more context below.\n" 65 | "------------\n" 66 | "{text}\n" 67 | "------------\n" 68 | "Given the new context, refine the original summary." 69 | "If the context isn't useful, return the original summary." 
70 | )
71 |
72 |
73 |
74 | def chunk_doc(all_text, mode='refine', model=CHOSEN_COMP_MODEL, max_output_tokens=MAX_OUTPUT_TOKENS, chunk_overlap=500):
75 |
76 | enc_name = openai_helpers.get_encoding_name(model)
77 | enc = openai_helpers.get_encoder(model)
78 |
79 | max_tokens = openai_helpers.get_model_max_tokens(model)
80 |
81 | if mode == 'refine':
82 | max_tokens = max_tokens - len(enc.encode(refine_prompt_template)) - len(enc.encode(refine_template)) - 2*max_output_tokens - chunk_overlap
83 | elif mode == 'map_reduce':
84 | max_tokens = max_tokens - len(enc.encode(mapreduce_prompt_template)) - max_output_tokens - chunk_overlap
85 | else:
86 | raise Exception('Invalid mode')
87 |
88 | text_splitter = TokenTextSplitter(encoding_name=enc_name, chunk_size = max_tokens, chunk_overlap=chunk_overlap)
89 |
90 | texts = text_splitter.split_text(all_text)
91 | docs = [Document(page_content=t) for t in texts]
92 |
93 | enc = openai_helpers.get_encoder(CHOSEN_COMP_MODEL)
94 |
95 | l_arr = []
96 | for d in texts:
97 | l_arr.append(str(len(enc.encode(d))))
98 |
99 | print("Chunks Generated", len(docs), ' | max_tokens', max_tokens, " | Chunk Lengths:", ', '.join(l_arr))
100 |
101 | return docs
102 |
103 |
104 | def clean_up_text(text):
105 | text = text.replace('....', '')
106 | return text
107 |
108 |
109 |
110 | def get_refined_summarization(docs, model=CHOSEN_COMP_MODEL, max_output_tokens=MAX_OUTPUT_TOKENS, stream=False, callbacks=[]):
111 |
112 | PROMPT = PromptTemplate(template=refine_prompt_template, input_variables=["text"])
113 | refine_prompt = PromptTemplate(input_variables=["existing_answer", "text"],template=refine_template)
114 |
115 | llm = helpers.get_llm(model, temperature=0, max_output_tokens=max_output_tokens, stream=stream, callbacks=callbacks)
116 |
117 | chain = load_summarize_chain(llm, chain_type="refine", question_prompt=PROMPT, refine_prompt=refine_prompt, return_intermediate_steps=True)
118 | summ = chain({"input_documents": docs}, return_only_outputs=True)
119 |
120 | return summ
121 |
122 |
123 | def get_mapreduced_summarization(docs, model=CHOSEN_COMP_MODEL, max_output_tokens=MAX_OUTPUT_TOKENS, stream=False, callbacks=[]):
124 |
125 | PROMPT = PromptTemplate(template=mapreduce_prompt_template, input_variables=["text"])
126 |
127 | llm = helpers.get_llm(model, temperature=0, max_output_tokens=max_output_tokens, stream=stream, callbacks=callbacks)
128 |
129 | chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=PROMPT, combine_prompt=PROMPT, return_intermediate_steps=True)
130 | summ = chain({"input_documents": docs}, return_only_outputs=True)
131 |
132 | return summ
133 |
134 |
135 |
136 |
137 | def read_document(path, verbose = False):
138 | if verbose: print(f"Reading {path}")
139 |
140 | all_text = ''
141 | ext = os.path.splitext(path)[1]
142 |
143 | if ext == '.xlsx':
144 | dataframe = openpyxl.load_workbook(path, data_only=True)
145 | sheets = [s for s in dataframe.sheetnames if 'HiddenCache' not in s]
146 | for sheet in sheets:
147 | print('sheet', sheet)
148 | all_text += pd.read_excel(path, sheet_name=sheet).to_string(na_rep='') + '\n\n\n\n'
149 | elif ext == '.csv':
150 | return None
151 | elif ext == '.pdf':
152 | contents, kv_contents, dfs, t_contents = fr_helpers.fr_analyze_local_doc_with_dfs(path, verbose = verbose)
153 | all_text = ' '.join([kv_contents , contents , t_contents])
154 | else:
155 | return None
156 |
157 | all_text = clean_up_text(all_text)
158 |
159 | return all_text
160 |
161 |
162 | def summarize_document(path,
mode='refine', verbose = False): 163 | 164 | print(f"##########################\nStarting Processing {path} ...") 165 | start = time.time() 166 | text = read_document(path, verbose=verbose) 167 | if text is None: return None 168 | 169 | summ = summarize_text(text, mode=mode, verbose=verbose) 170 | end = time.time() 171 | 172 | summary = { 173 | 'file': os.path.basename(path), 174 | 'intermediate_steps': summ['intermediate_steps'], 175 | 'summary': summ['output_text'], 176 | 'proc_time': end-start 177 | } 178 | 179 | print(f"Done Processing {path} in {end-start} seconds\n##########################\n") 180 | return summary 181 | 182 | 183 | def summarize_text(text, mode='refine', verbose = False): 184 | docs = chunk_doc(text, mode=mode) 185 | 186 | if mode == 'refine': 187 | summ = get_refined_summarization(docs) 188 | elif mode == 'map_reduce': 189 | summ = get_mapreduced_summarization(docs) 190 | else: 191 | raise Exception("Invalid mode") 192 | 193 | return summ 194 | 195 | 196 | 197 | def summarize_folder(folder, mode='refine', save_to_csv=True, save_to_pkl=True, verbose = False): 198 | files = os.listdir(folder) 199 | print(f"Files in folder {len(files)}") 200 | pkl_file = os.path.join(folder, f'summaries_{mode}.pkl') 201 | csv_file = os.path.join(folder, f'summaries_{mode}.csv') 202 | 203 | if os.path.exists(csv_file): 204 | summ_df = pd.read_csv(csv_file) 205 | else: 206 | summ_df = pd.DataFrame(columns=['file', 'intermediate_steps', 'summary', 'proc_time']) 207 | 208 | processed_files = list(summ_df['file']) 209 | print(f"List of already processed files {processed_files}") 210 | 211 | for f in files: 212 | path = os.path.join(folder, f) 213 | if f in processed_files: continue 214 | 215 | summary = summarize_document(path, mode=mode, verbose=verbose) 216 | if summary is None: continue 217 | summ_df = pd.concat([summ_df, pd.DataFrame([summary])], ignore_index=True) 218 | 219 | if save_to_csv: summ_df.to_csv(csv_file) 220 | if save_to_pkl: summ_df.to_pickle(pkl_file) 221 | 222 | return summ_df -------------------------------------------------------------------------------- /utils/web_crawler.py: -------------------------------------------------------------------------------- 1 | import logging, json, re, os, requests, uuid,ssl 2 | import azure.functions as func 3 | from azure.storage.blob import ContainerClient 4 | from azure.storage.blob import BlobServiceClient 5 | from bs4 import BeautifulSoup 6 | from collections import deque 7 | from html.parser import HTMLParser 8 | from urllib.parse import urlparse 9 | from urllib.request import urlopen 10 | import urllib.request 11 | import urllib 12 | 13 | import pandas as pd 14 | import numpy as np 15 | from datetime import datetime 16 | import time 17 | from utils import language 18 | 19 | 20 | HTTP_URL_PATTERN = r'^http[s]*://.+' 21 | 22 | CONTEXT = ssl._create_unverified_context() 23 | 24 | 25 | # Create a class to parse the HTML and get the hyperlinks 26 | class HyperlinkParser(HTMLParser): 27 | def __init__(self): 28 | super().__init__() 29 | # Create a list to store the hyperlinks 30 | self.hyperlinks = [] 31 | 32 | # Override the HTMLParser's handle_starttag method to get the hyperlinks 33 | def handle_starttag(self, tag, attrs): 34 | attrs = dict(attrs) 35 | 36 | # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks 37 | if tag == "a" and "href" in attrs: 38 | self.hyperlinks.append(attrs["href"]) 39 | # Function to get the hyperlinks from a URL 40 | def get_hyperlinks(url): 41 | 
42 | # Try to open the URL and read the HTML
43 | try:
44 | # Open the URL and read the HTML
45 | with urllib.request.urlopen(url,context=CONTEXT) as response:
46 |
47 | # If the response is not HTML, return an empty list
48 | if not response.info().get('Content-Type').startswith("text/html"):
49 | return []
50 |
51 | # Decode the HTML
52 | html = response.read().decode('utf-8')
53 | except Exception as e:
54 | print(e)
55 | return []
56 |
57 | # Create the HTML Parser and then Parse the HTML to get hyperlinks
58 | parser = HyperlinkParser()
59 | parser.feed(html)
60 |
61 | return parser.hyperlinks
62 | # Function to get the hyperlinks from a URL that are within the same domain
63 | def get_domain_hyperlinks(local_domain, url):
64 | clean_links = []
65 | for link in set(get_hyperlinks(url)):
66 | clean_link = None
67 |
68 | # If the link is a URL, check if it is within the same domain
69 | if re.search(HTTP_URL_PATTERN, link):
70 | # Parse the URL and check if the domain is the same
71 | url_obj = urlparse(link)
72 | if url_obj.netloc == local_domain:
73 | clean_link = link
74 |
75 | # If the link is not a URL, check if it is a relative link
76 | else:
77 | if link.startswith("/"):
78 | link = link[1:]
79 | elif link.startswith("#") or link.startswith("mailto:"):
80 | continue
81 | clean_link = "https://" + local_domain + "/" + link
82 |
83 | if clean_link is not None:
84 | if clean_link.endswith("/"):
85 | clean_link = clean_link[:-1]
86 | clean_links.append(clean_link)
87 |
88 | # Return the list of hyperlinks that are within the same domain
89 | return list(set(clean_links))
90 | def remove_newlines(text):
91 | text = text.replace('\n', ' ')
92 | text = text.replace('\\n', ' ')
93 | text = text.replace('  ', ' ')  # collapse runs of whitespace, repeatedly
94 | text = text.replace('  ', ' ')
95 | text = text.replace('  ', ' ')
96 | text = text.replace('  ', ' ')
97 | text = text.replace('  ', ' ')
98 | text = text.replace('  ', ' ')
99 | return text
100 | def remove_urls(text):
101 | text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
102 | return text
103 |
104 | def crawl(url, KB_BLOB_CONN_STR, KB_BLOB_CONTAINER, OUTPUT_BLOB_CONTAINER):
105 | # Parse the URL and get the domain
106 | local_domain = urlparse(url).netloc
107 |
108 | # Create a queue to store the URLs to crawl
109 | queue = deque([url])
110 |
111 | # Create a set to store the URLs that have already been seen (no duplicates)
112 | seen = set()
113 |
114 | # While the queue is not empty, continue crawling
115 | while queue:
116 | # Get the next URL from the queue
117 | url = queue.pop()
118 | print(url) # for debugging and to see the progress
119 | if url in seen:
120 | print('already processed')
121 | else:
122 | seen.add(url)
123 | if url.endswith(".pdf"):
124 | try:
125 | dest_blob_name = os.path.basename(urlparse(url).path)
126 | source_url = url
127 | container_client = ContainerClient.from_connection_string(KB_BLOB_CONN_STR, KB_BLOB_CONTAINER)
128 | blob_client = container_client.get_blob_client(dest_blob_name)
129 | blob_client.upload_blob(b'',overwrite=True)
130 | blob_client.stage_block_from_url(block_id=1, source_url=source_url)
131 | blob_client.commit_block_list(['1'])
132 | except Exception as e:
133 | print("Could not upload this PDF file")
134 | print(e)
135 |
136 | else:
137 | try:
138 | soup = BeautifulSoup(urlopen(url,context=CONTEXT), "html.parser")
139 | text = soup.get_text()
140 | doc_id=str(uuid.uuid3(uuid.NAMESPACE_DNS, text))
141 | timestamp = str(datetime.now())
142 | doc_text =
remove_urls(remove_newlines(text))
143 | lang = language.detect_content_language(doc_text[:500])
144 | new_doc = {
145 | "id": doc_id,
146 | "categoryId": os.getenv('CATEGORYID', 'KM_OAI_CATEGORY'),  # category id comes from the environment (see .env.template)
147 | "timestamp": timestamp,
148 | "web_url": url,
149 | "text": doc_text,
150 | "source_language": lang
151 |
152 | }
153 | try:
154 | container = ContainerClient.from_connection_string(KB_BLOB_CONN_STR, OUTPUT_BLOB_CONTAINER)
155 | try:
156 | container_properties = container.get_container_properties()
157 | except Exception as e:
158 | container.create_container()
159 |
160 |
161 | filename=local_domain+'_'+doc_id
162 | blob_name = filename + '.json'
163 | blob_client = container.get_blob_client(blob=blob_name)
164 | blob_client.upload_blob(json.dumps(new_doc, indent=4, ensure_ascii = False), overwrite=True)
165 | logging.info(f"Document {doc_id} was successfully saved to the {OUTPUT_BLOB_CONTAINER} container")
166 |
167 | except Exception as e:
168 | logging.error(f"Exception: Document {doc_id} created an exception.\n{e}")
169 |
170 | except Exception as e:
171 | print(e)
172 | # Get the hyperlinks from the URL and add them to the queue if not already seen.
173 | for link in get_domain_hyperlinks(local_domain, url):
174 | if link not in seen:
175 | queue.append(link)
--------------------------------------------------------------------------------
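# ---------------------------------------------------------------------------
# NOTE: a minimal, hypothetical driver for utils/web_crawler.py, not part of
# the original repository. The start URL is a placeholder; the connection
# string and container names must match your deployment (KB_BLOB_CONN_STR,
# KB_BLOB_CONTAINER and OUTPUT_BLOB_CONTAINER in .env.template).
#
#     import os
#     from utils.web_crawler import crawl
#
#     crawl("https://www.example.com",
#           os.environ["KB_BLOB_CONN_STR"],
#           os.environ.get("KB_BLOB_CONTAINER", "kmoaidemo"),
#           os.environ.get("OUTPUT_BLOB_CONTAINER", "kmoaiprocessed"))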