├── .env_template
├── .gitignore
├── LICENSE
├── app
│   └── app.py
├── docker
│   └── dockerfile
├── images
│   ├── arch.png
│   ├── logo_black.png
│   ├── logo_black_simple.png
│   └── logo_gray.png
├── misc
│   └── requirements.txt
├── modules
│   └── utilities.py
├── readme.md
└── sample_docs
    └── Easy_recipes_Boston_University.pdf

/.env_template:
--------------------------------------------------------------------------------
1 | OPENAI_API_TYPE=azure
2 | OPENAI_API_KEY=XXXXXX
3 | OPENAI_API_BASE=https://XXXXXX.openai.azure.com
4 | API_VERSION=2022-12-01
5 | REDIS_HOST=XXXXXX
6 | REDIS_ACCESS_KEY=XXXXXX
7 | REDIS_PORT=6379
8 | MAINTENANCE_MODE=no
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | ./scripts/*
3 | .env
4 | ./temp_uploads/*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 MaheshSQL
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
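The variables in `.env_template` are consumed at startup by `app/app.py` via `python-dotenv` (pinned in `misc/requirements.txt`). A minimal sketch of that flow, assuming you have copied the template to `.env` and filled in real values (illustrative only, not part of the repo):

```python
# Illustrative sketch: how the .env values above are loaded into the process.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory into the environment

# The app reads these keys via os.getenv(); a missing key returns None.
redis_host = os.getenv('REDIS_HOST')
redis_port = int(os.getenv('REDIS_PORT', '6379'))
maintenance = str(os.getenv('MAINTENANCE_MODE')).lower() == 'yes'
```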
/app/app.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('..')
3 | 
4 | import os
5 | import streamlit as st
6 | from datetime import datetime
7 | from modules.utilities import *
8 | import pathlib
9 | from uuid import uuid4
10 | 
11 | 
12 | # Will need commenting out when deploying the app
13 | load_dotenv()
14 | 
15 | #Set env variables
16 | setEnv()
17 | 
18 | # Exit if the app is set to MAINTENANCE_MODE (yes, no)
19 | if str(os.getenv('MAINTENANCE_MODE')).lower()=='yes':
20 |     st.write('App is currently offline for maintenance, please check back later.')
21 |     exit()
22 | 
23 | aoai_embedding_model = 'text-search-davinci-doc-001' #'text-search-ada-doc-001'
24 | aoai_embedding_model_version = '1'
25 | 
26 | aoai_text_model = 'text-davinci-003'
27 | aoai_text_model_version = '1'
28 | aoai_text_model_temperature = 0.2
29 | aoai_text_model_max_tokens = 500
30 | 
31 | aoai_embedding_model_deployment = aoai_embedding_models[aoai_embedding_model]["version"][aoai_embedding_model_version]["deployment_name"] #Azure OpenAI deployment name
32 | aoai_embedding_model_dim = aoai_embedding_models[aoai_embedding_model]["version"][aoai_embedding_model_version]["dim"]
33 | 
34 | aoai_text_model_deployment = aoai_embedding_models[aoai_text_model]["version"][aoai_text_model_version]["deployment_name"] #Azure OpenAI deployment name (the registry at the bottom of utilities.py also holds the text model)
35 | 
36 | score_threshold = 50 #Show answers with a score at or above this threshold
37 | prompt_min_length = 5
38 | ms_alias_min_length = 6
39 | prompt_text_area_max_chars = 300
40 | temp_dir = '../temp_uploads/' #Where uploaded files get staged until they are indexed; files are staged for a few seconds only, then deleted.
41 | app_version = '0.9.7' #Equal to the docker image version tag, shown in the sidebar.
42 | 
43 | #--------------------------------------------------------------------------
44 | # Get connection
45 | #--------------------------------------------------------------------------
46 | az_redis = getRedisConnection(host=os.getenv('REDIS_HOST'), access_key=os.getenv('REDIS_ACCESS_KEY'), port=os.getenv('REDIS_PORT'), ssl=False)
47 | # print(az_redis)
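The connection above is created once per script run. A quick way to sanity-check it before indexing anything is a `PING`; a minimal sketch (illustrative, not part of app.py; note that Azure Cache for Redis expects `ssl=True` on port 6380, while the Redis Stack container used by this demo listens without SSL):

```python
# Illustrative connection check, assuming the .env_template variables are set.
import os
import redis

r = redis.Redis(host=os.getenv('REDIS_HOST'),
                port=int(os.getenv('REDIS_PORT', '6379')),
                password=os.getenv('REDIS_ACCESS_KEY'),
                ssl=False)  # use ssl=True and port 6380 for Azure Cache for Redis
print(r.ping())  # True means the host is reachable and the password was accepted
```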
48 | 
49 | def getKeywordList(input_text):
50 |     input_text = input_text.replace('.',' ')
51 |     input_text = input_text.replace('-',' ')
52 |     input_text = input_text.replace('=',' ')
53 |     input_text = input_text.replace('?',' ')
54 |     input_text = input_text.replace('!',' ')
55 |     keyword_list = [word.lower() for word in input_text.split() if word.lower() not in ['?','a','an','and','or','do','of','if','not','for','are','was','were','is','can','have','has','there','their','the','how', 'why', 'when', 'what',"what's",'in', 'to', 'i', 'we', 'you']]
56 |     return keyword_list
57 | 
58 | def highlightKeywords(keyword_list, input_text):
59 |     highlighted = " ".join(f'<b>{t}</b>' if t.lower() in keyword_list else t for t in input_text.split(' ')) #Bold markup so matched keywords stand out when rendered with unsafe_allow_html
60 |     # print(f'highlighted:{highlighted}')
61 | 
62 |     return highlighted
63 | 
64 | def getResult(prompt, top_n, index_name):
65 | 
66 |     out = []
67 | 
68 |     # prompt = prompt + ' Respond with "Not found" if the answer is not present in the passage.'
69 | 
70 |     query_result,document_lc_list = queryRedis(az_redis_connection=az_redis, prompt=prompt,
71 |                                                aoai_embedding_model=aoai_embedding_model_deployment, index_name=index_name, top_n=top_n)
72 |     # print(f'query_result:{query_result}')
73 |     # print(f'document_lc_list:{document_lc_list}')
74 | 
75 |     # Check if any response received
76 |     if document_lc_list is not None:
77 | 
78 |         # Open AI lc qna
79 |         llm = AzureOpenAI(deployment_name=aoai_text_model_deployment,temperature=aoai_text_model_temperature, max_tokens=aoai_text_model_max_tokens)
80 | 
81 |         # lc
82 |         # chain = load_qa_with_sources_chain(llm, chain_type="stuff")
83 |         chain = load_qa_with_sources_chain(llm, chain_type="map_rerank", verbose=False, return_intermediate_steps=True)
84 |         chain_out = chain({"input_documents": document_lc_list, "question": prompt}, return_only_outputs=False)
85 |         # print(f'chain_out:{chain_out}')
86 | 
87 |         results = []
88 |         for i, item in enumerate(chain_out['intermediate_steps']):
89 |             # print(item['answer'], item['score']) #Uncomment to view the answer
90 |             results.append((int(item['score']),i,item['answer']))
91 | 
92 |         results.sort(reverse = True) #Sort desc based on score
93 |         # print(results)
94 |         # print(results[0][1]) #top first answer index
95 | 
96 |         # Top N answers
97 |         for i in range(min(top_n, len(results))): #Guard against fewer results than top_n
98 | 
99 |             # Check score threshold
100 |             if int(results[i][0]) >= score_threshold:
101 |                 out_item = None
102 |                 out_item = {
103 |                     "Answer": results[i][2],
104 |                     "Score": int(results[i][0]),
105 |                     "Content": chain_out['input_documents'][results[i][1]].page_content,
106 |                     "Source": chain_out['input_documents'][results[i][1]].metadata['source'],
107 |                     "Similarity": chain_out['input_documents'][results[i][1]].metadata['similarity'],
108 |                     "Page": int(chain_out['input_documents'][results[i][1]].metadata['page'])+1
109 |                 }
110 |                 out.append(out_item)
111 | 
112 | 
113 |     return out
114 | 
115 | #--------------------------------------------------------------------------
116 | 
117 | # Initialization of session vars
118 | if 'questions' not in st.session_state:
119 |     st.session_state['questions'] = []
120 | if 'answers' not in st.session_state:
121 |     st.session_state['answers'] = []
122 | 
123 | 
124 | st.set_page_config(page_title='Azure OpenAI Search Demo', layout='wide', page_icon='../images/logo_black_simple.png')
125 | 
126 | 
127 | 
128 | with st.container():
129 | 
130 |     def upload_button_click():
131 | 
132 |         if file_uploader is not None and len(textbox_msalias.strip()) >= ms_alias_min_length:
133 |             progress_bar = middle_column_12.progress(0,'')
134 | 
135 |             # st.write(str(os.listdir('../')))
136 |             if not os.path.exists(temp_dir):
137 |                 os.makedirs(temp_dir)
138 | 
139 |             # print(file_uploader.getvalue())
140 |             # local_file = pathlib.Path('./temp_uploads/'+str(uuid4())+'_'+file_uploader.name)
141 |             local_file = pathlib.Path(temp_dir + file_uploader.name)
142 |             local_file.write_bytes(file_uploader.getvalue()) #Write locally to crack open PDF/Word docs
143 | 
144 |             local_file_path = str(local_file)
145 |             # print(local_file_path)
146 | 
147 |             progress_bar.progress(20,'File acquired')
148 | 
149 |             progress_bar.progress(30,'Backend connected')
150 | 
151 |             # Create index if it does not exist
152 |             result = createRedisIndex(az_redis_connection=az_redis, index_name=textbox_msalias , prefix = textbox_msalias,
153 |                                       distance_metric='COSINE', DIM = aoai_embedding_model_dim, vec_type='HNSW')
154 |             print(f'Create index result:{result}')
155 | 
156 |             progress_bar.progress(40,'Processing')
157 | 
158 |             # Read document, cleanse content, get content and embeddings
159 |             document_page_content_list, \
160 |             document_page_embedding_list, \
161 |             document_page_no_list = getEmbeddingEntireDoc(documentPath=local_file_path,
162 |                                                           aoai_embedding_model=aoai_embedding_model_deployment,
163 |                                                           chunk_size=1)
164 |             print('Embeddings retrieved')
165 |             print(len(document_page_content_list), len(document_page_embedding_list), len(document_page_no_list))
166 |             # print(document_page_content_list)
167 |             # print(document_page_embedding_list, document_page_no_list)
168 | 
169 |             progress_bar.progress(80,'Almost done')
170 | 
171 |             # Add document pages
172 |             response = addDocumentToRedis(az_redis_connection=az_redis,
173 |                                           documentPath=local_file_path,
174 |                                           document_page_content_list=document_page_content_list,
175 |                                           document_page_embedding_list=document_page_embedding_list,
176 |                                           document_page_no_list=document_page_no_list,
177 |                                           prefix = textbox_msalias
178 |                                           )
179 |             print(f'addDocumentToRedis: {response}')
180 | 
181 |             progress_bar.progress(90,'Running cleanup')
182 | 
183 |             # Remove the local PDF after indexing has completed
184 |             if os.path.exists(local_file_path):
185 |                 os.remove(local_file_path)
186 | 
187 |             progress_bar.progress(100,'Completed')
188 | 
189 |         if len(textbox_msalias.strip()) < ms_alias_min_length:
190 |             left_column.warning('Please enter a valid alias')
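The commented-out staging line inside `upload_button_click` (line 140) hints at a collision-safe variant of the write on line 141. A sketch of that variant (illustrative only; `uuid4` is already imported at the top of the file):

```python
# Illustrative variant of the staging step: prefix each upload with a uuid4
# so two concurrent uploads of files with the same name cannot overwrite
# each other in temp_uploads/.
local_file = pathlib.Path(temp_dir + str(uuid4()) + '_' + file_uploader.name)
local_file.write_bytes(file_uploader.getvalue())
```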
191 | 
192 |     top_left_column, middle_left_column, right_left_column = st.columns([40,20,40])
193 |     top_left_column_1, top_left_column_2 = top_left_column.columns([25,75])
194 |     top_left_column_1.image(image='../images/logo_black.png', width=100)
195 |     # top_left_column_2.write('###')
196 |     top_left_column_2.subheader('Semantic Search Demo')
197 |     top_left_column_2.write('Unleash the power of your documents with data-driven inquiries')
198 | 
199 |     # st.write('---')
200 | 
201 |     with st.sidebar:
202 | 
203 |         st.markdown(':gear: Settings')
204 | 
205 |         textbox_msalias = st.text_input(label='Unique alias*', max_chars=10, key='textbox_msalias', type='password',
206 |                                         help='''Unique text value to store/query your docs under.
207 |                                         Use the same value when you revisit this app in future for a consistent experience.''')
208 |         selectbox_top_n = st.selectbox(label='Top N results*',options=(3,5,10), index = 2, key='selectbox_top_n')
209 | 
210 |         checkbox_score = st.checkbox(label='Score',key='checkbox_score', value=False, help='Value between 0 and 100 suggesting LLM confidence in answering the question with the retrieved passage of text.')
211 |         checkbox_similarity = st.checkbox(label='Similarity',key='checkbox_similarity', value=False, help='Similarity between the query and the retrieved passage of text.')
212 | 
213 |         checkbox_page_no = st.checkbox(label='Page No',key='checkbox_page_no', value=True, help='Document page number.')
214 |         checkbox_show_fileupload = st.checkbox(label='Upload file',key='checkbox_show_fileupload', value=False, help='Upload a file using the upload widget.')
215 | 
216 |         st.write('### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ###')
217 |         st.write("[Github Repo](https://github.com/MaheshSQL/openai-vector-search-demo)")
218 |         st.caption('Version: '+app_version)
219 |         st.write('Powered by Azure OpenAI', unsafe_allow_html=True)
220 | 
221 |     if checkbox_show_fileupload == True:
222 |         st.write('----')
223 | 
224 |         left_column_11, middle_column_12, right_column_13 = st.columns([36,8,56])
225 |         file_uploader = left_column_11.file_uploader(label='Upload file.',accept_multiple_files=False, key='file_uploader_1',type=['pdf', 'docx'],label_visibility='hidden')
226 |         middle_column_12.write('###')
227 |         middle_column_12.write('###')
228 |         # middle_column_b.write('###')
229 |         upload_button = middle_column_12.button(label='Upload', on_click=upload_button_click)
230 | 
231 |         #fffce7
232 |         right_column_13.write('''Disclaimer
233 |         \nThis public demo app is not intended for use with sensitive data.
234 |         We strongly advise against uploading any sensitive data to this application.
235 |         We cannot guarantee the security of any data uploaded to this application. By using this application, you acknowledge that you understand and accept this risk.
236 |         Please use publicly available data only.
237 |         \nFor use with sensitive documents, please clone the repository and run it in your own environment.'''
238 |         ,unsafe_allow_html=True)
239 |         st.write('----')
240 | 
241 | 
242 | with st.container():
243 | 
244 |     # left_column, middle_column, right_column = st.columns([46,8,46])
245 |     left_column, middle_column, right_column = st.columns([60,10,30])
246 | 
247 |     prompt = left_column.text_area(label='Enter your question:',max_chars=prompt_text_area_max_chars, key='text_area1', label_visibility ='hidden')
248 | 
249 |     def search_click():
250 | 
251 |         questions = st.session_state['questions']
252 |         answers = st.session_state['answers']
253 | 
254 |         if prompt is not None and len(prompt.strip()) >= prompt_min_length and len(textbox_msalias.strip()) >= ms_alias_min_length:
255 |             answer = []
256 | 
257 |             top_n = int(selectbox_top_n)
258 | 
259 |             try:
260 |                 answer = getResult(prompt, top_n, textbox_msalias)
261 |             except Exception as e:
262 |                 print(f'Exception in getResult(): {e}')
263 | 
264 |             #No results retrieved
265 |             if len(answer)==0:
266 |                 left_column.warning('No results found. Consider uploading document/s first if you are using this app for the first time with the unique alias you have specified. \n Check Upload file --> Browse file --> Click Upload to get started.')
267 | 
268 |             #Populate bottom pane with all N responses
269 |             for ans_details in answer:
270 | 
271 |                 keyword_list = getKeywordList(prompt)
272 | 
273 |                 left_column.write(f'Answer: {ans_details["Answer"]}',unsafe_allow_html=True)
274 | 
275 |                 if checkbox_score:
276 |                     left_column.write(f'Score: {ans_details["Score"]}',unsafe_allow_html=True)
277 | 
278 |                 # left_column.write(f'Content: {ans_details["Content"]}',unsafe_allow_html=True)
279 |                 left_column.write(f'Content: {highlightKeywords(keyword_list, ans_details["Content"])}',unsafe_allow_html=True)
280 | 
281 |                 left_column.write(f'Source: {os.path.basename(ans_details["Source"])}',unsafe_allow_html=True)
282 | 
283 |                 if checkbox_similarity:
284 |                     left_column.write(f'Similarity: {ans_details["Similarity"]}',unsafe_allow_html=True)
285 | 
286 |                 if checkbox_page_no:
287 |                     left_column.write(f'Page No: {ans_details["Page"]}',unsafe_allow_html=True)
288 | 
289 |                 left_column.write('----')
290 | 
291 |             if str(prompt).strip() != '' and len(answer) > 0:
292 |                 questions.append(prompt)
293 |                 answers.append(answer)
294 | 
295 |                 st.session_state['questions'] = questions
296 |                 st.session_state['answers'] = answers
297 | 
298 |         if len(textbox_msalias.strip()) < ms_alias_min_length:
299 |             left_column.warning('Please enter a valid alias')
300 | 
301 |         # print(f'questions:{questions}')
302 |         # print(f'answers:{answers}')
303 | 
304 |         if len(list(reversed(questions))) > 0:
305 |             right_column.write(f'Question History',unsafe_allow_html=True)
306 | 
307 | 
308 |         # Show in reversed order without modifying the lists set into the session
309 |         for i, item in enumerate(list(reversed(questions))):
310 | 
311 |             question_text = str(list(reversed(questions))[i])
312 |             # answer_text = str(list(reversed(answers))[i])
313 | 
314 |             # [{datetime.now().strftime("%d-%m-%Y %H:%M:%S")}]
315 |             # right_column.write('###')
316 | 
317 |             right_column.write(f'Question: {question_text}',unsafe_allow_html=True)
318 |             # right_column.write(f'Answer: {answer_text}',unsafe_allow_html=True)
319 |             # right_column.write('---')
320 |             # print(list(reversed(answers))[i])
321 |             for j, ans_details in enumerate(list(reversed(answers))[i]):
322 | 
323 |                 #Only show the top answer in history (right side pane)
324 |                 if j==0:
325 |                     right_column.write(f'Answer: {ans_details["Answer"]}',unsafe_allow_html=True)
326 |                     if checkbox_score:
327 |                         right_column.write(f'Score: {ans_details["Score"]}',unsafe_allow_html=True)
328 |                     right_column.write(f'Content: {ans_details["Content"]}',unsafe_allow_html=True)
329 |                     right_column.write(f'Source: {os.path.basename(ans_details["Source"])}',unsafe_allow_html=True)
330 |                     if checkbox_similarity:
331 |                         right_column.write(f'Similarity: {ans_details["Similarity"]}',unsafe_allow_html=True)
332 |                     if checkbox_page_no:
333 |                         right_column.write(f'Page No: {ans_details["Page"]}',unsafe_allow_html=True)
334 |             right_column.write('---')
335 | 
336 | 
337 |     def clear_click():
338 |         st.session_state['text_area1'] = ''
339 |         st.session_state['questions'] = []
340 |         st.session_state['answers'] = []
341 |         # st.session_state['checkbox_score'] = False
342 |         # st.session_state['checkbox_similarity'] = False
343 |         # st.session_state['checkbox_page_no'] = False
344 | 
345 | 
346 |     middle_column.write('###')
347 |     middle_column.write('###')
348 |     search_button= middle_column.button(label='Search', on_click= search_click)
349 |     clear_button = middle_column.button(label='Clear', on_click = clear_click)
--------------------------------------------------------------------------------
/docker/dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9.0
2 | EXPOSE 8501
3 | # RUN (not CMD) so the directories are created at build time; CMD only sets the container start command and only the last CMD takes effect.
4 | RUN mkdir -p /app
5 | RUN mkdir -p /modules
6 | RUN mkdir -p /temp_uploads
7 | RUN mkdir -p /images
8 | 
9 | WORKDIR .
10 | COPY ../misc/requirements.txt ./
11 | RUN pip3 install -r requirements.txt
12 | 
13 | COPY ../app/app.py app/
14 | COPY ../modules/utilities.py modules/
15 | COPY ../images/* images/
16 | 
17 | WORKDIR /app
18 | ENTRYPOINT ["streamlit", "run"]
19 | CMD ["app.py"]
--------------------------------------------------------------------------------
/images/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaheshSQL/openai-vector-search-demo/e5aacc5c1dc35f5311cbaf8a626b821134ef778b/images/arch.png
--------------------------------------------------------------------------------
/images/logo_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaheshSQL/openai-vector-search-demo/e5aacc5c1dc35f5311cbaf8a626b821134ef778b/images/logo_black.png
--------------------------------------------------------------------------------
/images/logo_black_simple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaheshSQL/openai-vector-search-demo/e5aacc5c1dc35f5311cbaf8a626b821134ef778b/images/logo_black_simple.png
--------------------------------------------------------------------------------
/images/logo_gray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaheshSQL/openai-vector-search-demo/e5aacc5c1dc35f5311cbaf8a626b821134ef778b/images/logo_gray.png
--------------------------------------------------------------------------------
/misc/requirements.txt:
--------------------------------------------------------------------------------
1 | openai==0.27.2
2 | tiktoken==0.3.3
3 | redis==4.5.4
4 | langchain==0.0.132
5 | pypdf==3.7.0
6 | python-dotenv==1.0.0
7 | pandas==1.5.3
8 | unstructured==0.5.11
9 | streamlit==1.21.0
--------------------------------------------------------------------------------
/modules/utilities.py:
--------------------------------------------------------------------------------
1 | #------------Imports---------------
2 | import os
3 | 
4 | from langchain.document_loaders import PyPDFLoader
5 | from langchain.document_loaders import UnstructuredWordDocumentLoader
6 | from langchain.embeddings.openai import OpenAIEmbeddings
7 | from langchain.schema import Document
8 | from langchain.llms.openai import AzureOpenAI
9 | from langchain.chains.qa_with_sources import load_qa_with_sources_chain
10 | 
11 | import openai
12 | 
13 | from dotenv import load_dotenv
14 | 
15 | import redis
16 | from redis.commands.search.field import VectorField, TagField, TextField, NumericField
17 | from redis.commands.search.indexDefinition import IndexDefinition, IndexType
18 | from redis.commands.search.query import Query
19 | import hashlib
20 | import numpy as np
21 | 
22 | import logging
23 | 
24 | 
25 | logging.basicConfig(level=logging.ERROR)
26 | 
27 | #------------Functions---------------
28 | 
29 | '''Read PDF documents and return the list of langchain documents
30 | '''
31 | def readPDF(source_url):
32 |     try:
33 |         document_pages_lc = None
34 |         document_pages_lc = PyPDFLoader(source_url).load()
35 | 
36 |         # for page in document_pages_lc:
37 | 
38 |         #     print(f'Source: {str(page.metadata["source"])}')
39 |         #     print(f'Page: {str(int(page.metadata["page"])+1)}')
40 |         #     print(page.page_content)
41 | 
42 |         return document_pages_lc
43 |     except Exception as e:
44 |         logging.error(f'Error readPDF(): {e}')
45 |         return None
46 | 
47 | '''Read MS Word documents and return the list of langchain documents
48 | '''
49 | def readMSWord(source_url):
50 |     try:
51 |         one_page_size = 300 #IMP: How many words per split page of the whole doc.
52 |         document_pages_lc = None
53 |         document_pages_lc = UnstructuredWordDocumentLoader(source_url).load() #Note: This loader does not return the same object as the PDF loader, e.g. doc pages are not recognised, so the custom logic below is built.
54 |         document_pages_lc_list = []
55 | 
56 |         # UnstructuredWordDocumentLoader returns the whole doc as a single page, so custom splitting needs to be implemented
57 |         for page in document_pages_lc:
58 | 
59 |             page_words = page.page_content.split(' ') #Split doc into words
60 | 
61 |             #Split document into pages of one_page_size words each
62 |             for i in range((len(page_words) // one_page_size)+1):
63 |                 # print(i)
64 | 
65 |                 # Note: Replaced below with a Document object as in the code below this section.
66 |                 # document_pages_lc_dict = {} #{"page_content":"",metadata={"source": "..doc", "page": 4}}
67 |                 # document_pages_lc_dict["page_content"] = ' '.join(page_words[i*one_page_size:(i+1)*one_page_size])
68 |                 # document_pages_lc_dict["metadata"] = {"source":page.metadata["source"], "page":i}
69 |                 # document_pages_lc_list.append(document_pages_lc_dict)
70 | 
71 |                 doc = Document(page_content=' '.join(page_words[i*one_page_size:(i+1)*one_page_size]),
72 |                                metadata={"source":page.metadata["source"], "page":i})
73 |                 document_pages_lc_list.append(doc)
74 | 
75 |         return document_pages_lc_list
76 |     except Exception as e:
77 |         logging.error(f'Error readMSWord(): {e}')
78 |         return None
79 | 
80 | '''
81 | Initialise environment variables
82 | '''
83 | def setEnv():
84 |     try:
85 |         openai.api_type = os.getenv('OPENAI_API_TYPE')
86 |         openai.api_base = os.getenv('OPENAI_API_BASE')
87 |         openai.api_version = os.getenv('API_VERSION')
88 |         openai.api_key = os.getenv("OPENAI_API_KEY")
89 | 
90 |         return True
91 |     except Exception as e:
92 |         logging.error(f'Error setEnv(): {e}')
93 |         return False
94 | 
95 | '''
96 | input_text: input text
97 | '''
98 | def encode(input_text):
99 |     return str(hashlib.sha1(f'{input_text}'.encode('utf-8')).hexdigest())
100 | 
101 | '''
102 | txt_data: input data
103 | aoai_embedding_model: Azure OpenAI deployment name
104 | chunk_size: Maximum number of texts to embed in each batch
105 | max_retries: Maximum number of retries to make when generating.
106 | '''
107 | def getEmbedding(txt_data, aoai_embedding_model, chunk_size=1, max_retries = 3):
108 |     try:
109 |         embeddings = OpenAIEmbeddings(model=aoai_embedding_model, chunk_size=chunk_size, max_retries=max_retries)
110 |         query_result = embeddings.embed_query(txt_data)
111 |         return query_result
112 |     except Exception as e:
113 |         logging.info(f'txt_data: {txt_data}')
114 |         logging.error(f'Error getEmbedding(): {e}')
115 |         return None
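A quick illustration of how `getEmbedding()` is typically exercised, and of the cosine relationship the Redis index later exploits (a sketch, assuming the Azure OpenAI env vars are set and that the davinci doc-search deployment registered at the bottom of this module exists):

```python
# Illustrative only: embed two strings and compare them with cosine similarity.
import numpy as np

deployment = 'text-search-davinci-doc-001-v1'  # from the registry below; substitute your own
vec_a = getEmbedding('How do I bake bread?', deployment)
vec_b = getEmbedding('Bread baking instructions', deployment)

a, b = np.array(vec_a), np.array(vec_b)
cosine_similarity = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
print(cosine_similarity)  # closer to 1.0 means more similar
```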
116 | 
117 | 
118 | '''
119 | documentPath: Path to document (pdf/word/etc.)
120 | '''
121 | def getDocumentExtension(documentPath):
122 |     try:
123 |         return os.path.basename(documentPath).split('.')[len(os.path.basename(documentPath).split('.'))-1]
124 |     except Exception as e:
125 |         logging.error(f'Error getDocumentExtension(): {e}')
126 |         return None
127 | 
128 | '''
129 | Removes new line characters, double spaces
130 | input_text: Piece of text
131 | '''
132 | def cleanseText(input_text):
133 |     try:
134 |         input_text_cleansed = None
135 |         input_text_cleansed = input_text.replace('\n',' ') #Remove new line characters
136 |         input_text_cleansed = input_text_cleansed.replace('  ',' ') #Remove double spaces
137 | 
138 |         return input_text_cleansed
139 |     except Exception as e:
140 |         logging.error(f'Error cleanseText(): {e}')
141 |         return None
142 | 
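Worth noting: a single `replace('  ', ' ')` pass only shrinks each run of spaces by one level rather than collapsing it completely. A regex handles arbitrary runs in one step; a possible tightening (illustrative, not in the repo):

```python
# Illustrative alternative to cleanseText(): collapse any run of whitespace
# (newlines included) to a single space in one pass.
import re

def cleanseTextRegex(input_text):
    return re.sub(r'\s+', ' ', input_text).strip()

print(cleanseTextRegex('line one\nline    two'))  # -> 'line one line two'
```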
143 | '''
144 | Generate embeddings for an entire doc
145 | documentPath: Path to the document
146 | '''
147 | def getEmbeddingEntireDoc(documentPath, aoai_embedding_model, chunk_size=1):
148 | 
149 |     try:
150 |         docType = None
151 |         document_pages_lc = None
152 |         document_page_embedding_list = []
153 |         document_page_content_list = []
154 |         document_page_no_list = []
155 | 
156 |         #Get document type
157 |         docType = getDocumentExtension(documentPath).lower()
158 | 
159 |         if docType == 'pdf':
160 |             document_pages_lc = readPDF(documentPath)
161 | 
162 |         # Custom Word doc processing, as there is no page metadata like the PDF loader provides,
163 |         # and the doc is not split into pages like a PDF is out of the box. Please review the readMSWord() method for more details.
164 |         elif docType == 'docx' or docType == 'doc':
165 |             document_pages_lc = readMSWord(documentPath)
166 | 
167 |         for document_page in document_pages_lc:
168 |             # print(document_page)
169 |             # print(document_page.page_content)
170 |             # print(document_page.metadata["source"])
171 |             # print(document_page.metadata["page"])
172 | 
173 |             source_doc_path = None
174 |             source_doc_page_no = None
175 |             source_doc_page_content = None
176 |             embedding_result = None
177 | 
178 |             # if docType == 'pdf':
179 |             #     source_doc_path = document_page.metadata["source"]
180 |             #     source_doc_page_no = int(document_page.metadata["page"])
181 |             #     source_doc_page_content = document_page.page_content
182 | 
183 |             # elif docType == 'docx' or docType == 'doc':
184 |             #     source_doc_path = document_page["metadata"]["source"]
185 |             #     source_doc_page_no = int(document_page["metadata"]["page"])
186 |             #     source_doc_page_content = document_page["page_content"]
187 | 
188 |             source_doc_path = document_page.metadata["source"]
189 |             source_doc_page_no = int(document_page.metadata["page"])
190 |             source_doc_page_content = document_page.page_content
191 | 
192 |             # print(source_doc_path)
193 |             # print(source_doc_page_no)
194 |             # print(source_doc_page_content)
195 | 
196 |             source_doc_page_content_cleansed = cleanseText(source_doc_page_content)
197 | 
198 |             if (source_doc_page_content_cleansed is not None) and (len(source_doc_page_content_cleansed)>0) and (source_doc_page_content_cleansed.strip() != ''):
199 | 
200 |                 embedding_result = getEmbedding(source_doc_page_content_cleansed, aoai_embedding_model, chunk_size=1, max_retries = 3)
201 |                 # print(embedding_result)
202 | 
203 |                 if embedding_result is not None:
204 |                     document_page_content_list.append(source_doc_page_content) #Retain formatting
205 |                     document_page_embedding_list.append(embedding_result)
206 |                     document_page_no_list.append(source_doc_page_no)
207 |                 else:
208 |                     print(f'Unable to embed text:{source_doc_page_content}, moving to next.')
209 | 
210 |         return document_page_content_list, document_page_embedding_list, document_page_no_list
211 |     except Exception as e:
212 |         logging.error(f'Error getEmbeddingEntireDoc(): {e}')
213 |         return None, None, None
214 | 
215 | '''
216 | host: Azure Redis Cache host (Azure Redis resource -> Properties -> Host name) or the URL of ACI if deployed as a container
217 | access_key: Azure Redis Cache access key, or the password if deployed as a container
218 | port: Azure Redis port (defaults to 6380)
219 | ssl: True/False
220 | '''
221 | def getRedisConnection(host, access_key, port=6380, ssl=True):
222 |     try:
223 |         az_redis = redis.Redis(host=host,
224 |                                port=port,
225 |                                password=access_key,
226 |                                ssl=ssl)
227 |         return az_redis
228 |     except Exception as e:
229 |         logging.error(f'Error getRedisConnection(): {e}')
230 |         return None
231 | 
232 | def checkRedisIndexExists(index_name, az_redis_connection, encrypt_index_name=False):
233 |     try:
234 |         if encrypt_index_name:
235 |             index_name = encode(index_name)
236 | 
237 |         az_redis_connection.ft(index_name).info()
238 |         return True
239 |     except:
240 |         return False
241 | 
242 | def dropRedisIndex(az_redis_connection, index_name='page_embeddings_index', encrypt_index_name=False):
243 |     try:
244 |         if encrypt_index_name:
245 |             index_name = encode(index_name)
246 | 
247 |         az_redis_connection.ft(index_name).dropindex(delete_documents=False)
248 |         return True
249 |     except Exception as e:
250 |         logging.error(f'Error dropRedisIndex(): {e}')
251 |         return False
252 | 
253 | '''
254 | az_redis_connection: Connection object to Azure Redis Cache, with Search enabled (Stack / Enterprise)
255 | index_name: Redis index name
256 | prefix: Key prefix
257 | distance_metric: Vector field distance metric
258 | '''
259 | def createRedisIndex(az_redis_connection, index_name='page_embeddings_index' , prefix = 'doc', distance_metric='COSINE', DIM = 1536, vec_type = 'HNSW', encrypt_index_name=False):
260 |     try:
261 |         response = None
262 | 
263 |         if encrypt_index_name:
264 |             index_name = encode(index_name)
265 | 
266 |         if checkRedisIndexExists(index_name, az_redis_connection)==False:
267 | 
268 |             #Define fields
269 |             page_content = TextField(name="page_content")
270 |             page_number = NumericField(name="page_number")
271 |             document_path = TextField(name="document_path")
272 |             page_content_vector = VectorField("page_content_vector",
273 |                                               vec_type, {
274 |                                                   "TYPE": "FLOAT32",
275 |                                                   "DIM": DIM,
276 |                                                   "DISTANCE_METRIC": distance_metric,
277 |                                                   "INITIAL_CAP": 1000
278 |                                               })
279 | 
280 | 
281 |             # create search index
282 |             response = az_redis_connection.ft(index_name).create_index(
283 |                 fields = [page_content,page_number,document_path,page_content_vector],
284 |                 definition = IndexDefinition(
285 |                     prefix=[f'{prefix}:'], #Square brackets important!
286 |                     index_type=IndexType.HASH)
287 |             )
288 |         else:
289 |             print('Index already exists.')
290 | 
291 |         return response
292 | 
293 |     except Exception as e:
294 |         logging.error(f'Error createRedisIndex(): {e}')
295 |         return None
296 | 
297 | def addRedisIndexRecord(az_redis_connection, id, page_content, page_content_vector, page_number, documentPath, prefix = 'doc'):
298 |     try:
299 | 
300 |         # Super important to include the dtype parameter. Otherwise the record gets added but is not seen by the index!
301 |         page_content_vector = np.array(page_content_vector, dtype=np.float32)
302 |         # print(f'page_content_vector.shape:{page_content_vector.shape}')
303 | 
304 |         az_redis_connection.hset(name=f'{prefix}:{str(id)}', mapping={"page_content": str(page_content),
305 |                                                                       "page_number":int(page_number),
306 |                                                                       "document_path": str(documentPath),
307 |                                                                       "page_content_vector": page_content_vector.tobytes()
308 |                                                                       }
309 |                                  )
310 | 
311 |         # az_redis_connection.hset(name=f'{prefix}:{str(id)}', items= ["page_content", str(page_content),
312 |         #                                                              "page_number", int(page_number),
313 |         #                                                              "document_path", str(documentPath),
314 |         #                                                              "page_content_vector", page_content_vector.tobytes()
315 |         #                                                              ])
316 | 
317 |         # pipe = az_redis_connection.pipeline(transaction=False)
318 |         # pipe.hset(name=f'{prefix}:{str(id)}', mapping={"page_content": str(page_content),
319 |         #                                                "page_number":int(page_number),
320 |         #                                                "page_content_vector": page_content_vector.tobytes()
321 |         #                                                }
322 |         #          )
323 |         # pipe.execute()
324 | 
325 |         return True
326 | 
327 |     except Exception as e:
328 |         logging.error(f'Error addRedisIndexRecord(): {e}')
329 |         return False
330 | 
331 | '''
332 | Iterate over the read document and add it to the index
333 | '''
334 | def addDocumentToRedis(az_redis_connection, documentPath, document_page_content_list, document_page_embedding_list, document_page_no_list, prefix, encrypt_prefix=False):
335 |     try:
336 | 
337 |         if encrypt_prefix:
338 |             prefix = encode(prefix)
339 | 
340 | 
341 |         # Iterate through pages
342 |         for i, embedding in enumerate(document_page_embedding_list):
343 | 
344 |             hash_key = hashlib.sha1(f'{documentPath}_{i}'.encode('utf-8')).hexdigest()
345 | 
346 |             addRedisIndexRecord(az_redis_connection = az_redis_connection,
347 |                                 id = hash_key,
348 |                                 page_content = document_page_content_list[i],
349 |                                 page_content_vector = document_page_embedding_list[i],
350 |                                 page_number = document_page_no_list[i],
351 |                                 prefix = prefix,
352 |                                 documentPath = documentPath
353 |                                 )
354 | 
355 | 
356 |         return True
357 |     except Exception as e:
358 |         logging.error(f'Error addDocumentToRedis(): {e}')
359 |         return False
360 | 
361 | '''
362 | az_redis_connection: Connection to Redis, with Search enabled (Stack / Enterprise)
363 | prompt: User query
364 | aoai_embedding_model: Azure OpenAI model for prompt embedding
365 | index_name: Redis index name
366 | top_n: Return top_n close matches
367 | '''
368 | def queryRedis(az_redis_connection, prompt, aoai_embedding_model, index_name, top_n, encrypt_index_name=False):
369 |     try:
370 | 
371 |         document_lc_list = []
372 | 
373 |         if encrypt_index_name:
374 |             index_name = encode(index_name)
375 | 
376 |         vec_prompt = getEmbedding(txt_data=prompt, aoai_embedding_model=aoai_embedding_model, chunk_size=1, max_retries = 3)
377 |         vec_prompt = np.array(vec_prompt, dtype=np.float32) #Super important to specify dtype, otherwise a vector shape mismatch error occurs.
378 | 
379 |         # base_query = f'*=>[KNN {str(top_n)} @page_content_vector $prompt_vector AS __page_content_vector_score]'
380 |         base_query = f'*=>[KNN {str(top_n)} @page_content_vector $prompt_vector AS __page_content_vector_score]'
381 |         query = (
382 |             Query(base_query)
383 |             .sort_by("__page_content_vector_score") #asc = False, relevance in desc order.
384 |             .paging(0,top_n)
385 |             .return_fields('__page_content_vector_score','page_content','page_number', 'document_path')
386 |             .dialect(2)
387 |         )
388 | 
389 |         query_result = az_redis_connection.ft(index_name).search(query, {"prompt_vector": vec_prompt.tobytes()})
390 |         # print(type(query_result))
391 | 
392 |         #Create lc documents, for use with lc
393 |         for item in query_result.docs:
394 |             document_lc = Document(page_content=item.page_content,metadata={"source":item.document_path, "page":item.page_number, "similarity":1-float(item.__page_content_vector_score)})
395 |             document_lc_list.append(document_lc)
396 | 
397 |         return query_result, document_lc_list
398 | 
399 |     except Exception as e:
400 |         logging.error(f'Error queryRedis(): {e}')
401 |         return None, None
402 | 
403 | #-----------------------------------
404 | # Functions end here.
405 | #-----------------------------------
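Taken together, the functions above form a small indexing-and-retrieval pipeline. A condensed end-to-end sketch (illustrative; it assumes the env vars from .env_template, a reachable Redis Stack instance, and the davinci deployments named in the registry below; 'myalias' is a hypothetical index name):

```python
# Illustrative pipeline (not part of the module): index one PDF, then query it.
import os
from dotenv import load_dotenv

load_dotenv()
setEnv()

conn = getRedisConnection(host=os.getenv('REDIS_HOST'),
                          access_key=os.getenv('REDIS_ACCESS_KEY'),
                          port=os.getenv('REDIS_PORT'), ssl=False)

# 1. Create the index (no-op if it exists). DIM must match the embedding model.
createRedisIndex(az_redis_connection=conn, index_name='myalias', prefix='myalias',
                 distance_metric='COSINE', DIM=12288, vec_type='HNSW')  # 12288 = davinci doc model

# 2. Embed every page of a document and store pages plus vectors as hashes.
pdf = '../sample_docs/Easy_recipes_Boston_University.pdf'
contents, vectors, page_nos = getEmbeddingEntireDoc(
    documentPath=pdf, aoai_embedding_model='text-search-davinci-doc-001-v1')
addDocumentToRedis(az_redis_connection=conn, documentPath=pdf,
                   document_page_content_list=contents,
                   document_page_embedding_list=vectors,
                   document_page_no_list=page_nos, prefix='myalias')

# 3. KNN query: embed the prompt, retrieve the closest pages as langchain Documents.
_, docs = queryRedis(az_redis_connection=conn, prompt='How long do the muffins bake?',
                     aoai_embedding_model='text-search-davinci-doc-001-v1',
                     index_name='myalias', top_n=3)
for d in docs:
    print(d.metadata['page'], d.metadata['similarity'])
```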
406 | 
407 | #For cmd background colour
408 | class bcolors:
409 |     HEADER = '\033[95m'
410 |     OKBLUE = '\033[94m'
411 |     OKCYAN = '\033[96m'
412 |     OKGREEN = '\033[92m'
413 |     WARNING = '\033[93m'
414 |     FAIL = '\033[91m'
415 |     ENDC = '\033[0m'
416 |     BOLD = '\033[1m'
417 |     UNDERLINE = '\033[4m'
418 | #-----------------------------------
419 | 
420 | aoai_embedding_models = {
421 | 
422 |     "text-search-ada-doc-001":{
423 |         "version":{
424 |             "1":{
425 |                 "deployment_name": "text-search-ada-doc-001-v1",
426 |                 "dim": 1024
427 |             }
428 |         }
429 |     },
430 | 
431 |     "text-search-babbage-doc-001":{
432 |         "version":{
433 |             "1":{
434 |                 "deployment_name": "text-search-babbage-doc-001-v1",
435 |                 "dim": 2048
436 |             }
437 |         }
438 |     },
439 | 
440 |     "text-search-curie-doc-001":{
441 |         "version":{
442 |             "1":{
443 |                 "deployment_name": "text-search-curie-doc-001-v1",
444 |                 "dim": 4096
445 |             }
446 |         }
447 |     },
448 | 
449 |     "text-search-davinci-doc-001":{
450 |         "version":{
451 |             "1":{
452 |                 "deployment_name": "text-search-davinci-doc-001-v1",
453 |                 "dim": 12288
454 |             }
455 |         }
456 |     },
457 | 
458 |     "text-embedding-ada-002":{
459 |         "version":{
460 |             "1":{
461 |                 "deployment_name": "text-embedding-ada-002-v1",
462 |                 "dim": 1536
463 |             }
464 |         }
465 |     },
466 | 
467 |     "text-davinci-003":{
468 |         "version":{
469 |             "1":{
470 |                 "deployment_name": "text-davinci-003-v1"
471 |             }
472 |         }
473 |     }
474 | 
475 | }
--------------------------------------------------------------------------------
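This registry maps each model name to per-version deployment names and embedding dimensions; app.py resolves both at startup (lines 31-34). Registering another model or version is just another entry (illustrative; the deployment name below is hypothetical and must match what you created in Azure OpenAI Studio):

```python
# Illustrative only: register a second version of an embedding model.
aoai_embedding_models["text-embedding-ada-002"]["version"]["2"] = {
    "deployment_name": "my-ada-002-deployment",  # hypothetical deployment name
    "dim": 1536,  # must match the model's embedding width and the index DIM
}

# app.py-style lookup:
dep = aoai_embedding_models["text-embedding-ada-002"]["version"]["2"]["deployment_name"]
```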
/readme.md:
--------------------------------------------------------------------------------
1 | # Azure OpenAI Semantic Search Demo | Document Upload
2 | 
3 | 
4 | 
5 | ## About
6 | 
7 | Version: 0.9.7
8 | 
9 | The ability to conduct semantic search on vector data is a powerful feature that allows you to find relevant content based on a specific natural language query. This demo is helpful for showcasing and comprehending the abstractive responses generated from your own data in PDF or Word format documents.
10 | 
11 | This solution has been created by drawing inspiration from existing enterprise ChatGPT and document Q&A vector search demos; however, it uses a simplified architecture pattern and offers the following features, most of which are unique to this implementation.
12 | 
13 | ## Key Features
14 | - _Simplified architecture_
15 | - _Built-in document cracking (.pdf, .docx)_
16 | - _Utilise text embeddings_
17 | - _Upload your own document and ask questions_
18 | 
19 | ## How to deploy?
20 | **Run locally from Visual Studio Code or command prompt**
21 | - Open VS Code terminal or command prompt.
22 | - Clone this repository and open in VS Code.
23 | - Create a new conda environment
24 |   - ```conda create -n openaidemo_py39 python=3.9 anaconda```
25 |   - ```conda activate openaidemo_py39```
26 |   - For more info go to [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-python.html)
27 | - Navigate to the _misc_ directory and run
28 |   - ```pip install -r requirements.txt```
29 | - Provision Azure OpenAI service
30 |   - Under deployments within Azure OpenAI Studio, deploy 2 models
31 |     - Model for text search, e.g. text-search-davinci-doc-001
32 |     - Model for text generation, e.g. text-davinci-003
33 |   - Model availability varies by the region in which the OpenAI service is provisioned in Azure.
34 |   - For more info go to [Create a resource and deploy a model using Azure OpenAI](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource)
35 | - Open the _modules/utilities.py_ file and scroll to the very bottom
36 |   - Update the value of the "deployment_name" tag for the respective model version.
37 |   - If you have used the 2 models mentioned in the steps above, no changes are required in the _app/app.py_ file.
38 |   - Otherwise, update the values of the aoai_embedding_model and aoai_text_model variables at the beginning of the app/app.py file.
39 | - Provision Redis Stack as an Azure Container Instance
40 |   - Use image = redis/redis-stack-server:latest
41 |   - On the _advanced tab_ make sure you add
42 |     - --requirepass