├── .env_template
├── .gitignore
├── LICENSE
├── app
│   └── app.py
├── docker
│   └── dockerfile
├── images
│   ├── arch.png
│   ├── logo_black.png
│   ├── logo_black_simple.png
│   └── logo_gray.png
├── misc
│   └── requirements.txt
├── modules
│   └── utilities.py
├── readme.md
└── sample_docs
    └── Easy_recipes_Boston_University.pdf

/.env_template:
--------------------------------------------------------------------------------
1 | OPENAI_API_TYPE=azure
2 | OPENAI_API_KEY=XXXXXX
3 | OPENAI_API_BASE=https://XXXXXX.openai.azure.com
4 | API_VERSION=2022-12-01
5 | REDIS_HOST=XXXXXX
6 | REDIS_ACCESS_KEY=XXXXXX
7 | REDIS_PORT=6379
8 | MAINTENANCE_MODE=no
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | ./scripts/*
3 | .env
4 | ./temp_uploads/*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 MaheshSQL
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
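The variables in `.env_template` are consumed at startup by `app/app.py` via `python-dotenv` (pinned in `misc/requirements.txt`). A minimal sketch of that flow, assuming you have copied the template to `.env` and filled in real values (illustrative only, not part of the repo):

```python
# Illustrative sketch: how the .env values above are loaded into the process.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory into the environment

# The app reads these keys via os.getenv(); a missing key returns None.
redis_host = os.getenv('REDIS_HOST')
redis_port = int(os.getenv('REDIS_PORT', '6379'))
maintenance = str(os.getenv('MAINTENANCE_MODE')).lower() == 'yes'
```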
/app/app.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('..')
3 | 
4 | import os
5 | import streamlit as st
6 | from datetime import datetime
7 | from modules.utilities import *
8 | import pathlib
9 | from uuid import uuid4
10 | 
11 | 
12 | # Will need commenting out when deploying the app
13 | load_dotenv()
14 | 
15 | #Set env variables
16 | setEnv()
17 | 
18 | # Exit if the app is set to MAINTENANCE_MODE (yes, no)
19 | if str(os.getenv('MAINTENANCE_MODE')).lower()=='yes':
20 |     st.write('App is currently offline for maintenance, please check back later.')
21 |     exit()
22 | 
23 | aoai_embedding_model = 'text-search-davinci-doc-001' #'text-search-ada-doc-001'
24 | aoai_embedding_model_version = '1'
25 | 
26 | aoai_text_model = 'text-davinci-003'
27 | aoai_text_model_version = '1'
28 | aoai_text_model_temperature = 0.2
29 | aoai_text_model_max_tokens = 500
30 | 
31 | aoai_embedding_model_deployment = aoai_embedding_models[aoai_embedding_model]["version"][aoai_embedding_model_version]["deployment_name"] #Azure OpenAI deployment name
32 | aoai_embedding_model_dim = aoai_embedding_models[aoai_embedding_model]["version"][aoai_embedding_model_version]["dim"]
33 | 
34 | aoai_text_model_deployment = aoai_embedding_models[aoai_text_model]["version"][aoai_text_model_version]["deployment_name"] #Azure OpenAI deployment name (the registry at the bottom of utilities.py also holds the text model)
35 | 
36 | score_threshold = 50 #Show answers with a score at or above this threshold
37 | prompt_min_length = 5
38 | ms_alias_min_length = 6
39 | prompt_text_area_max_chars = 300
40 | temp_dir = '../temp_uploads/' #Where uploaded files get staged until they are indexed; files are staged for a few seconds only, then deleted.
41 | app_version = '0.9.7' #Equal to the docker image version tag, shown in the sidebar.
42 | 
43 | #--------------------------------------------------------------------------
44 | # Get connection
45 | #--------------------------------------------------------------------------
46 | az_redis = getRedisConnection(host=os.getenv('REDIS_HOST'), access_key=os.getenv('REDIS_ACCESS_KEY'), port=os.getenv('REDIS_PORT'), ssl=False)
47 | # print(az_redis)
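The connection above is created once per script run. A quick way to sanity-check it before indexing anything is a `PING`; a minimal sketch (illustrative, not part of app.py; note that Azure Cache for Redis expects `ssl=True` on port 6380, while the Redis Stack container used by this demo listens without SSL):

```python
# Illustrative connection check, assuming the .env_template variables are set.
import os
import redis

r = redis.Redis(host=os.getenv('REDIS_HOST'),
                port=int(os.getenv('REDIS_PORT', '6379')),
                password=os.getenv('REDIS_ACCESS_KEY'),
                ssl=False)  # use ssl=True and port 6380 for Azure Cache for Redis
print(r.ping())  # True means the host is reachable and the password was accepted
```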
48 | 
49 | def getKeywordList(input_text):
50 |     input_text = input_text.replace('.',' ')
51 |     input_text = input_text.replace('-',' ')
52 |     input_text = input_text.replace('=',' ')
53 |     input_text = input_text.replace('?',' ')
54 |     input_text = input_text.replace('!',' ')
55 |     keyword_list = [word.lower() for word in input_text.split() if word.lower() not in ['?','a','an','and','or','do','of','if','not','for','are','was','were','is','can','have','has','there','their','the','how', 'why', 'when', 'what',"what's",'in', 'to', 'i', 'we', 'you']]
56 |     return keyword_list
57 | 
58 | def highlightKeywords(keyword_list, input_text):
59 |     highlighted = " ".join(f'<b>{t}</b>' if t.lower() in keyword_list else t for t in input_text.split(' ')) #Bold markup so matched keywords stand out when rendered with unsafe_allow_html
60 |     # print(f'highlighted:{highlighted}')
61 | 
62 |     return highlighted
63 | 
64 | def getResult(prompt, top_n, index_name):
65 | 
66 |     out = []
67 | 
68 |     # prompt = prompt + ' Respond with "Not found" if the answer is not present in the passage.'
69 | 
70 |     query_result,document_lc_list = queryRedis(az_redis_connection=az_redis, prompt=prompt,
71 |                                                aoai_embedding_model=aoai_embedding_model_deployment, index_name=index_name, top_n=top_n)
72 |     # print(f'query_result:{query_result}')
73 |     # print(f'document_lc_list:{document_lc_list}')
74 | 
75 |     # Check if any response received
76 |     if document_lc_list is not None:
77 | 
78 |         # Open AI lc qna
79 |         llm = AzureOpenAI(deployment_name=aoai_text_model_deployment,temperature=aoai_text_model_temperature, max_tokens=aoai_text_model_max_tokens)
80 | 
81 |         # lc
82 |         # chain = load_qa_with_sources_chain(llm, chain_type="stuff")
83 |         chain = load_qa_with_sources_chain(llm, chain_type="map_rerank", verbose=False, return_intermediate_steps=True)
84 |         chain_out = chain({"input_documents": document_lc_list, "question": prompt}, return_only_outputs=False)
85 |         # print(f'chain_out:{chain_out}')
86 | 
87 |         results = []
88 |         for i, item in enumerate(chain_out['intermediate_steps']):
89 |             # print(item['answer'], item['score']) #Uncomment to view the answer
90 |             results.append((int(item['score']),i,item['answer']))
91 | 
92 |         results.sort(reverse = True) #Sort desc based on score
93 |         # print(results)
94 |         # print(results[0][1]) #top first answer index
95 | 
96 |         # Top N answers
97 |         for i in range(min(top_n, len(results))): #Guard against fewer results than top_n
98 | 
99 |             # Check score threshold
100 |             if int(results[i][0]) >= score_threshold:
101 |                 out_item = None
102 |                 out_item = {
103 |                     "Answer": results[i][2],
104 |                     "Score": int(results[i][0]),
105 |                     "Content": chain_out['input_documents'][results[i][1]].page_content,
106 |                     "Source": chain_out['input_documents'][results[i][1]].metadata['source'],
107 |                     "Similarity": chain_out['input_documents'][results[i][1]].metadata['similarity'],
108 |                     "Page": int(chain_out['input_documents'][results[i][1]].metadata['page'])+1
109 |                 }
110 |                 out.append(out_item)
111 | 
112 | 
113 |     return out
114 | 
115 | #--------------------------------------------------------------------------
116 | 
117 | # Initialization of session vars
118 | if 'questions' not in st.session_state:
119 |     st.session_state['questions'] = []
120 | if 'answers' not in st.session_state:
121 |     st.session_state['answers'] = []
122 | 
123 | 
124 | st.set_page_config(page_title='Azure OpenAI Search Demo', layout='wide', page_icon='../images/logo_black_simple.png')
125 | 
126 | 
127 | 
128 | with st.container():
129 | 
130 |     def upload_button_click():
131 | 
132 |         if file_uploader is not None and len(textbox_msalias.strip()) >= ms_alias_min_length:
133 |             progress_bar = middle_column_12.progress(0,'')
134 | 
135 |             # st.write(str(os.listdir('../')))
136 |             if not os.path.exists(temp_dir):
137 |                 os.makedirs(temp_dir)
138 | 
139 |             # print(file_uploader.getvalue())
140 |             # local_file = pathlib.Path('./temp_uploads/'+str(uuid4())+'_'+file_uploader.name)
141 |             local_file = pathlib.Path(temp_dir + file_uploader.name)
142 |             local_file.write_bytes(file_uploader.getvalue()) #Write locally to crack open PDF/Word docs
143 | 
144 |             local_file_path = str(local_file)
145 |             # print(local_file_path)
146 | 
147 |             progress_bar.progress(20,'File acquired')
148 | 
149 |             progress_bar.progress(30,'Backend connected')
150 | 
151 |             # Create index if it does not exist
152 |             result = createRedisIndex(az_redis_connection=az_redis, index_name=textbox_msalias , prefix = textbox_msalias,
153 |                                       distance_metric='COSINE', DIM = aoai_embedding_model_dim, vec_type='HNSW')
154 |             print(f'Create index result:{result}')
155 | 
156 |             progress_bar.progress(40,'Processing')
157 | 
158 |             # Read document, cleanse content, get content and embeddings
159 |             document_page_content_list, \
160 |             document_page_embedding_list, \
161 |             document_page_no_list = getEmbeddingEntireDoc(documentPath=local_file_path,
162 |                                                           aoai_embedding_model=aoai_embedding_model_deployment,
163 |                                                           chunk_size=1)
164 |             print('Embeddings retrieved')
165 |             print(len(document_page_content_list), len(document_page_embedding_list), len(document_page_no_list))
166 |             # print(document_page_content_list)
167 |             # print(document_page_embedding_list, document_page_no_list)
168 | 
169 |             progress_bar.progress(80,'Almost done')
170 | 
171 |             # Add document pages
172 |             response = addDocumentToRedis(az_redis_connection=az_redis,
173 |                                           documentPath=local_file_path,
174 |                                           document_page_content_list=document_page_content_list,
175 |                                           document_page_embedding_list=document_page_embedding_list,
176 |                                           document_page_no_list=document_page_no_list,
177 |                                           prefix = textbox_msalias
178 |                                           )
179 |             print(f'addDocumentToRedis: {response}')
180 | 
181 |             progress_bar.progress(90,'Running cleanup')
182 | 
183 |             # Remove the local PDF after indexing has completed
184 |             if os.path.exists(local_file_path):
185 |                 os.remove(local_file_path)
186 | 
187 |             progress_bar.progress(100,'Completed')
188 | 
189 |         if len(textbox_msalias.strip()) < ms_alias_min_length:
190 |             left_column.warning('Please enter a valid alias')
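The commented-out staging line inside `upload_button_click` (line 140) hints at a collision-safe variant of the write on line 141. A sketch of that variant (illustrative only; `uuid4` is already imported at the top of the file):

```python
# Illustrative variant of the staging step: prefix each upload with a uuid4
# so two concurrent uploads of files with the same name cannot overwrite
# each other in temp_uploads/.
local_file = pathlib.Path(temp_dir + str(uuid4()) + '_' + file_uploader.name)
local_file.write_bytes(file_uploader.getvalue())
```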
191 | 
192 |     top_left_column, middle_left_column, right_left_column = st.columns([40,20,40])
193 |     top_left_column_1, top_left_column_2 = top_left_column.columns([25,75])
194 |     top_left_column_1.image(image='../images/logo_black.png', width=100)
195 |     # top_left_column_2.write('###')
196 |     top_left_column_2.subheader('Semantic Search Demo')
197 |     top_left_column_2.write('Unleash the power of your documents with data-driven inquiries')
198 | 
199 |     # st.write('---')
200 | 
201 |     with st.sidebar:
202 | 
203 |         st.markdown(':gear: Settings')
204 | 
205 |         textbox_msalias = st.text_input(label='Unique alias*', max_chars=10, key='textbox_msalias', type='password',
206 |                                         help='''Unique text value to store/query your docs under.
207 |                                         Use the same value when you revisit this app in future for a consistent experience.''')
208 |         selectbox_top_n = st.selectbox(label='Top N results*',options=(3,5,10), index = 2, key='selectbox_top_n')
209 | 
210 |         checkbox_score = st.checkbox(label='Score',key='checkbox_score', value=False, help='Value between 0 and 100 suggesting LLM confidence in answering the question with the retrieved passage of text.')
211 |         checkbox_similarity = st.checkbox(label='Similarity',key='checkbox_similarity', value=False, help='Similarity between the query and the retrieved passage of text.')
212 | 
213 |         checkbox_page_no = st.checkbox(label='Page No',key='checkbox_page_no', value=True, help='Document page number.')
214 |         checkbox_show_fileupload = st.checkbox(label='Upload file',key='checkbox_show_fileupload', value=False, help='Upload a file using the upload widget.')
215 | 
216 |         st.write('### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ### \n ###')
217 |         st.write("[Github Repo](https://github.com/MaheshSQL/openai-vector-search-demo)")
218 |         st.caption('Version: '+app_version)
219 |         st.write('Powered by Azure OpenAI', unsafe_allow_html=True)
220 | 
221 |     if checkbox_show_fileupload == True:
222 |         st.write('----')
223 | 
224 |         left_column_11, middle_column_12, right_column_13 = st.columns([36,8,56])
225 |         file_uploader = left_column_11.file_uploader(label='Upload file.',accept_multiple_files=False, key='file_uploader_1',type=['pdf', 'docx'],label_visibility='hidden')
226 |         middle_column_12.write('###')
227 |         middle_column_12.write('###')
228 |         # middle_column_b.write('###')
229 |         upload_button = middle_column_12.button(label='Upload', on_click=upload_button_click)
230 | 
231 |         #fffce7
232 |         right_column_13.write('''Disclaimer
233 |         \nThis public demo app is not intended for use with sensitive data.
234 |         We strongly advise against uploading any sensitive data to this application.
235 |         We cannot guarantee the security of any data uploaded to this application. By using this application, you acknowledge that you understand and accept this risk.
236 |         Please use publicly available data only.
237 |         \nFor use with sensitive documents, please clone the repository and run it in your own environment.'''
238 |         ,unsafe_allow_html=True)
239 |         st.write('----')
240 | 
241 | 
242 | with st.container():
243 | 
244 |     # left_column, middle_column, right_column = st.columns([46,8,46])
245 |     left_column, middle_column, right_column = st.columns([60,10,30])
246 | 
247 |     prompt = left_column.text_area(label='Enter your question:',max_chars=prompt_text_area_max_chars, key='text_area1', label_visibility ='hidden')
248 | 
249 |     def search_click():
250 | 
251 |         questions = st.session_state['questions']
252 |         answers = st.session_state['answers']
253 | 
254 |         if prompt is not None and len(prompt.strip()) >= prompt_min_length and len(textbox_msalias.strip()) >= ms_alias_min_length:
255 |             answer = []
256 | 
257 |             top_n = int(selectbox_top_n)
258 | 
259 |             try:
260 |                 answer = getResult(prompt, top_n, textbox_msalias)
261 |             except Exception as e:
262 |                 print(f'Exception in getResult(): {e}')
263 | 
264 |             #No results retrieved
265 |             if len(answer)==0:
266 |                 left_column.warning('No results found. Consider uploading document/s first if you are using this app for the first time with the unique alias you have specified. \n Check Upload file --> Browse file --> Click Upload to get started.')
267 | 
268 |             #Populate bottom pane with all N responses
269 |             for ans_details in answer:
270 | 
271 |                 keyword_list = getKeywordList(prompt)
272 | 
273 |                 left_column.write(f'Answer: {ans_details["Answer"]}',unsafe_allow_html=True)
274 | 
275 |                 if checkbox_score:
276 |                     left_column.write(f'Score: {ans_details["Score"]}',unsafe_allow_html=True)
277 | 
278 |                 # left_column.write(f'Content: {ans_details["Content"]}',unsafe_allow_html=True)
279 |                 left_column.write(f'Content: {highlightKeywords(keyword_list, ans_details["Content"])}',unsafe_allow_html=True)
280 | 
281 |                 left_column.write(f'Source: {os.path.basename(ans_details["Source"])}',unsafe_allow_html=True)
282 | 
283 |                 if checkbox_similarity:
284 |                     left_column.write(f'Similarity: {ans_details["Similarity"]}',unsafe_allow_html=True)
285 | 
286 |                 if checkbox_page_no:
287 |                     left_column.write(f'Page No: {ans_details["Page"]}',unsafe_allow_html=True)
288 | 
289 |                 left_column.write('----')
290 | 
291 |             if str(prompt).strip() != '' and len(answer) > 0:
292 |                 questions.append(prompt)
293 |                 answers.append(answer)
294 | 
295 |                 st.session_state['questions'] = questions
296 |                 st.session_state['answers'] = answers
297 | 
298 |         if len(textbox_msalias.strip()) < ms_alias_min_length:
299 |             left_column.warning('Please enter a valid alias')
300 | 
301 |         # print(f'questions:{questions}')
302 |         # print(f'answers:{answers}')
303 | 
304 |         if len(list(reversed(questions))) > 0:
305 |             right_column.write(f'Question History',unsafe_allow_html=True)
306 | 
307 | 
308 |         # Show in reversed order without modifying the lists set into the session
309 |         for i, item in enumerate(list(reversed(questions))):
310 | 
311 |             question_text = str(list(reversed(questions))[i])
312 |             # answer_text = str(list(reversed(answers))[i])
313 | 
314 |             # [{datetime.now().strftime("%d-%m-%Y %H:%M:%S")}]
315 |             # right_column.write('###')
316 | 
317 |             right_column.write(f'Question: {question_text}',unsafe_allow_html=True)
318 |             # right_column.write(f'Answer: {answer_text}',unsafe_allow_html=True)
319 |             # right_column.write('---')
320 |             # print(list(reversed(answers))[i])
321 |             for j, ans_details in enumerate(list(reversed(answers))[i]):
322 | 
323 |                 #Only show the top answer in history (right side pane)
324 |                 if j==0:
325 |                     right_column.write(f'Answer: {ans_details["Answer"]}',unsafe_allow_html=True)
326 |                     if checkbox_score:
327 |                         right_column.write(f'Score: {ans_details["Score"]}',unsafe_allow_html=True)
328 |                     right_column.write(f'Content: {ans_details["Content"]}',unsafe_allow_html=True)
329 |                     right_column.write(f'Source: {os.path.basename(ans_details["Source"])}',unsafe_allow_html=True)
330 |                     if checkbox_similarity:
331 |                         right_column.write(f'Similarity: {ans_details["Similarity"]}',unsafe_allow_html=True)
332 |                     if checkbox_page_no:
333 |                         right_column.write(f'Page No: {ans_details["Page"]}',unsafe_allow_html=True)
334 |             right_column.write('---')
335 | 
336 | 
337 |     def clear_click():
338 |         st.session_state['text_area1'] = ''
339 |         st.session_state['questions'] = []
340 |         st.session_state['answers'] = []
341 |         # st.session_state['checkbox_score'] = False
342 |         # st.session_state['checkbox_similarity'] = False
343 |         # st.session_state['checkbox_page_no'] = False
344 | 
345 | 
346 |     middle_column.write('###')
347 |     middle_column.write('###')
348 |     search_button= middle_column.button(label='Search', on_click= search_click)
349 |     clear_button = middle_column.button(label='Clear', on_click = clear_click)
--------------------------------------------------------------------------------
/docker/dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9.0
2 | EXPOSE 8501
3 | # RUN (not CMD) so the directories are created at build time; CMD only sets the container start command and only the last CMD takes effect.
4 | RUN mkdir -p /app
5 | RUN mkdir -p /modules
6 | RUN mkdir -p /temp_uploads
7 | RUN mkdir -p /images
8 | 
9 | WORKDIR .
10 | COPY ../misc/requirements.txt ./
11 | RUN pip3 install -r requirements.txt
12 | 
13 | COPY ../app/app.py app/
14 | COPY ../modules/utilities.py modules/
15 | COPY ../images/* images/
16 | 
17 | WORKDIR /app
18 | ENTRYPOINT ["streamlit", "run"]
19 | CMD ["app.py"]
--------------------------------------------------------------------------------
/images/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaheshSQL/openai-vector-search-demo/e5aacc5c1dc35f5311cbaf8a626b821134ef778b/images/arch.png
--------------------------------------------------------------------------------
/images/logo_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaheshSQL/openai-vector-search-demo/e5aacc5c1dc35f5311cbaf8a626b821134ef778b/images/logo_black.png
--------------------------------------------------------------------------------
/images/logo_black_simple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaheshSQL/openai-vector-search-demo/e5aacc5c1dc35f5311cbaf8a626b821134ef778b/images/logo_black_simple.png
--------------------------------------------------------------------------------
/images/logo_gray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MaheshSQL/openai-vector-search-demo/e5aacc5c1dc35f5311cbaf8a626b821134ef778b/images/logo_gray.png
--------------------------------------------------------------------------------
/misc/requirements.txt:
--------------------------------------------------------------------------------
1 | openai==0.27.2
2 | tiktoken==0.3.3
3 | redis==4.5.4
4 | langchain==0.0.132
5 | pypdf==3.7.0
6 | python-dotenv==1.0.0
7 | pandas==1.5.3
8 | unstructured==0.5.11
9 | streamlit==1.21.0
--------------------------------------------------------------------------------
/modules/utilities.py:
--------------------------------------------------------------------------------
1 | #------------Imports---------------
2 | import os
3 | 
4 | from langchain.document_loaders import PyPDFLoader
5 | from langchain.document_loaders import UnstructuredWordDocumentLoader
6 | from langchain.embeddings.openai import OpenAIEmbeddings
7 | from langchain.schema import Document
8 | from langchain.llms.openai import AzureOpenAI
9 | from langchain.chains.qa_with_sources import load_qa_with_sources_chain
10 | 
11 | import openai
12 | 
13 | from dotenv import load_dotenv
14 | 
15 | import redis
16 | from redis.commands.search.field import VectorField, TagField, TextField, NumericField
17 | from redis.commands.search.indexDefinition import IndexDefinition, IndexType
18 | from redis.commands.search.query import Query
19 | import hashlib
20 | import numpy as np
21 | 
22 | import logging
23 | 
24 | 
25 | logging.basicConfig(level=logging.ERROR)
26 | 
27 | #------------Functions---------------
28 | 
29 | '''Read PDF documents and return the list of langchain documents
30 | '''
31 | def readPDF(source_url):
32 |     try:
33 |         document_pages_lc = None
34 |         document_pages_lc = PyPDFLoader(source_url).load()
35 | 
36 |         # for page in document_pages_lc:
37 | 
38 |         #     print(f'Source: {str(page.metadata["source"])}')
39 |         #     print(f'Page: {str(int(page.metadata["page"])+1)}')
40 |         #     print(page.page_content)
41 | 
42 |         return document_pages_lc
43 |     except Exception as e:
44 |         logging.error(f'Error readPDF(): {e}')
45 |         return None
46 | 
47 | '''Read MS Word documents and return the list of langchain documents
48 | '''
49 | def readMSWord(source_url):
50 |     try:
51 |         one_page_size = 300 #IMP: How many words per split page of the whole doc.
52 |         document_pages_lc = None
53 |         document_pages_lc = UnstructuredWordDocumentLoader(source_url).load() #Note: This loader does not return the same object as the PDF loader, e.g. doc pages are not recognised, so the custom logic below is built.
54 |         document_pages_lc_list = []
55 | 
56 |         # UnstructuredWordDocumentLoader returns the whole doc as a single page, so custom splitting needs to be implemented
57 |         for page in document_pages_lc:
58 | 
59 |             page_words = page.page_content.split(' ') #Split doc into words
60 | 
61 |             #Split document into pages of one_page_size words each
62 |             for i in range((len(page_words) // one_page_size)+1):
63 |                 # print(i)
64 | 
65 |                 # Note: Replaced below with a Document object as in the code below this section.
66 |                 # document_pages_lc_dict = {} #{"page_content":"",metadata={"source": "..doc", "page": 4}}
67 |                 # document_pages_lc_dict["page_content"] = ' '.join(page_words[i*one_page_size:(i+1)*one_page_size])
68 |                 # document_pages_lc_dict["metadata"] = {"source":page.metadata["source"], "page":i}
69 |                 # document_pages_lc_list.append(document_pages_lc_dict)
70 | 
71 |                 doc = Document(page_content=' '.join(page_words[i*one_page_size:(i+1)*one_page_size]),
72 |                                metadata={"source":page.metadata["source"], "page":i})
73 |                 document_pages_lc_list.append(doc)
74 | 
75 |         return document_pages_lc_list
76 |     except Exception as e:
77 |         logging.error(f'Error readMSWord(): {e}')
78 |         return None
79 | 
80 | '''
81 | Initialise environment variables
82 | '''
83 | def setEnv():
84 |     try:
85 |         openai.api_type = os.getenv('OPENAI_API_TYPE')
86 |         openai.api_base = os.getenv('OPENAI_API_BASE')
87 |         openai.api_version = os.getenv('API_VERSION')
88 |         openai.api_key = os.getenv("OPENAI_API_KEY")
89 | 
90 |         return True
91 |     except Exception as e:
92 |         logging.error(f'Error setEnv(): {e}')
93 |         return False
94 | 
95 | '''
96 | input_text: input text
97 | '''
98 | def encode(input_text):
99 |     return str(hashlib.sha1(f'{input_text}'.encode('utf-8')).hexdigest())
100 | 
101 | '''
102 | txt_data: input data
103 | aoai_embedding_model: Azure OpenAI deployment name
104 | chunk_size: Maximum number of texts to embed in each batch
105 | max_retries: Maximum number of retries to make when generating.
106 | '''
107 | def getEmbedding(txt_data, aoai_embedding_model, chunk_size=1, max_retries = 3):
108 |     try:
109 |         embeddings = OpenAIEmbeddings(model=aoai_embedding_model, chunk_size=chunk_size, max_retries=max_retries)
110 |         query_result = embeddings.embed_query(txt_data)
111 |         return query_result
112 |     except Exception as e:
113 |         logging.info(f'txt_data: {txt_data}')
114 |         logging.error(f'Error getEmbedding(): {e}')
115 |         return None
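A quick illustration of how `getEmbedding()` is typically exercised, and of the cosine relationship the Redis index later exploits (a sketch, assuming the Azure OpenAI env vars are set and that the davinci doc-search deployment registered at the bottom of this module exists):

```python
# Illustrative only: embed two strings and compare them with cosine similarity.
import numpy as np

deployment = 'text-search-davinci-doc-001-v1'  # from the registry below; substitute your own
vec_a = getEmbedding('How do I bake bread?', deployment)
vec_b = getEmbedding('Bread baking instructions', deployment)

a, b = np.array(vec_a), np.array(vec_b)
cosine_similarity = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
print(cosine_similarity)  # closer to 1.0 means more similar
```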
116 | 
117 | 
118 | '''
119 | documentPath: Path to document (pdf/word/etc.)
120 | '''
121 | def getDocumentExtension(documentPath):
122 |     try:
123 |         return os.path.basename(documentPath).split('.')[len(os.path.basename(documentPath).split('.'))-1]
124 |     except Exception as e:
125 |         logging.error(f'Error getDocumentExtension(): {e}')
126 |         return None
127 | 
128 | '''
129 | Removes new line characters, double spaces
130 | input_text: Piece of text
131 | '''
132 | def cleanseText(input_text):
133 |     try:
134 |         input_text_cleansed = None
135 |         input_text_cleansed = input_text.replace('\n',' ') #Remove new line characters
136 |         input_text_cleansed = input_text_cleansed.replace('  ',' ') #Remove double spaces
137 | 
138 |         return input_text_cleansed
139 |     except Exception as e:
140 |         logging.error(f'Error cleanseText(): {e}')
141 |         return None
142 | 
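Worth noting: a single `replace('  ', ' ')` pass only shrinks each run of spaces by one level rather than collapsing it completely. A regex handles arbitrary runs in one step; a possible tightening (illustrative, not in the repo):

```python
# Illustrative alternative to cleanseText(): collapse any run of whitespace
# (newlines included) to a single space in one pass.
import re

def cleanseTextRegex(input_text):
    return re.sub(r'\s+', ' ', input_text).strip()

print(cleanseTextRegex('line one\nline    two'))  # -> 'line one line two'
```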
143 | '''
144 | Generate embeddings for an entire doc
145 | documentPath: Path to the document
146 | '''
147 | def getEmbeddingEntireDoc(documentPath, aoai_embedding_model, chunk_size=1):
148 | 
149 |     try:
150 |         docType = None
151 |         document_pages_lc = None
152 |         document_page_embedding_list = []
153 |         document_page_content_list = []
154 |         document_page_no_list = []
155 | 
156 |         #Get document type
157 |         docType = getDocumentExtension(documentPath).lower()
158 | 
159 |         if docType == 'pdf':
160 |             document_pages_lc = readPDF(documentPath)
161 | 
162 |         # Custom Word doc processing, as there is no page metadata like the PDF loader provides,
163 |         # and the doc is not split into pages like a PDF is out of the box. Please review the readMSWord() method for more details.
164 |         elif docType == 'docx' or docType == 'doc':
165 |             document_pages_lc = readMSWord(documentPath)
166 | 
167 |         for document_page in document_pages_lc:
168 |             # print(document_page)
169 |             # print(document_page.page_content)
170 |             # print(document_page.metadata["source"])
171 |             # print(document_page.metadata["page"])
172 | 
173 |             source_doc_path = None
174 |             source_doc_page_no = None
175 |             source_doc_page_content = None
176 |             embedding_result = None
177 | 
178 |             # if docType == 'pdf':
179 |             #     source_doc_path = document_page.metadata["source"]
180 |             #     source_doc_page_no = int(document_page.metadata["page"])
181 |             #     source_doc_page_content = document_page.page_content
182 | 
183 |             # elif docType == 'docx' or docType == 'doc':
184 |             #     source_doc_path = document_page["metadata"]["source"]
185 |             #     source_doc_page_no = int(document_page["metadata"]["page"])
186 |             #     source_doc_page_content = document_page["page_content"]
187 | 
188 |             source_doc_path = document_page.metadata["source"]
189 |             source_doc_page_no = int(document_page.metadata["page"])
190 |             source_doc_page_content = document_page.page_content
191 | 
192 |             # print(source_doc_path)
193 |             # print(source_doc_page_no)
194 |             # print(source_doc_page_content)
195 | 
196 |             source_doc_page_content_cleansed = cleanseText(source_doc_page_content)
197 | 
198 |             if (source_doc_page_content_cleansed is not None) and (len(source_doc_page_content_cleansed)>0) and (source_doc_page_content_cleansed.strip() != ''):
199 | 
200 |                 embedding_result = getEmbedding(source_doc_page_content_cleansed, aoai_embedding_model, chunk_size=1, max_retries = 3)
201 |                 # print(embedding_result)
202 | 
203 |                 if embedding_result is not None:
204 |                     document_page_content_list.append(source_doc_page_content) #Retain formatting
205 |                     document_page_embedding_list.append(embedding_result)
206 |                     document_page_no_list.append(source_doc_page_no)
207 |                 else:
208 |                     print(f'Unable to embed text:{source_doc_page_content}, moving to next.')
209 | 
210 |         return document_page_content_list, document_page_embedding_list, document_page_no_list
211 |     except Exception as e:
212 |         logging.error(f'Error getEmbeddingEntireDoc(): {e}')
213 |         return None, None, None
214 | 
215 | '''
216 | host: Azure Redis Cache host (Azure Redis resource -> Properties -> Host name) or the URL of ACI if deployed as a container
217 | access_key: Azure Redis Cache access key, or the password if deployed as a container
218 | port: Azure Redis port (defaults to 6380)
219 | ssl: True/False
220 | '''
221 | def getRedisConnection(host, access_key, port=6380, ssl=True):
222 |     try:
223 |         az_redis = redis.Redis(host=host,
224 |                                port=port,
225 |                                password=access_key,
226 |                                ssl=ssl)
227 |         return az_redis
228 |     except Exception as e:
229 |         logging.error(f'Error getRedisConnection(): {e}')
230 |         return None
231 | 
232 | def checkRedisIndexExists(index_name, az_redis_connection, encrypt_index_name=False):
233 |     try:
234 |         if encrypt_index_name:
235 |             index_name = encode(index_name)
236 | 
237 |         az_redis_connection.ft(index_name).info()
238 |         return True
239 |     except:
240 |         return False
241 | 
242 | def dropRedisIndex(az_redis_connection, index_name='page_embeddings_index', encrypt_index_name=False):
243 |     try:
244 |         if encrypt_index_name:
245 |             index_name = encode(index_name)
246 | 
247 |         az_redis_connection.ft(index_name).dropindex(delete_documents=False)
248 |         return True
249 |     except Exception as e:
250 |         logging.error(f'Error dropRedisIndex(): {e}')
251 |         return False
252 | 
253 | '''
254 | az_redis_connection: Connection object to Azure Redis Cache, with Search enabled (Stack / Enterprise)
255 | index_name: Redis index name
256 | prefix: Key prefix
257 | distance_metric: Vector field distance metric
258 | '''
259 | def createRedisIndex(az_redis_connection, index_name='page_embeddings_index' , prefix = 'doc', distance_metric='COSINE', DIM = 1536, vec_type = 'HNSW', encrypt_index_name=False):
260 |     try:
261 |         response = None
262 | 
263 |         if encrypt_index_name:
264 |             index_name = encode(index_name)
265 | 
266 |         if checkRedisIndexExists(index_name, az_redis_connection)==False:
267 | 
268 |             #Define fields
269 |             page_content = TextField(name="page_content")
270 |             page_number = NumericField(name="page_number")
271 |             document_path = TextField(name="document_path")
272 |             page_content_vector = VectorField("page_content_vector",
273 |                                               vec_type, {
274 |                                                   "TYPE": "FLOAT32",
275 |                                                   "DIM": DIM,
276 |                                                   "DISTANCE_METRIC": distance_metric,
277 |                                                   "INITIAL_CAP": 1000
278 |                                               })
279 | 
280 | 
281 |             # create search index
282 |             response = az_redis_connection.ft(index_name).create_index(
283 |                 fields = [page_content,page_number,document_path,page_content_vector],
284 |                 definition = IndexDefinition(
285 |                     prefix=[f'{prefix}:'], #Square brackets important!
286 |                     index_type=IndexType.HASH)
287 |             )
288 |         else:
289 |             print('Index already exists.')
290 | 
291 |         return response
292 | 
293 |     except Exception as e:
294 |         logging.error(f'Error createRedisIndex(): {e}')
295 |         return None
296 | 
297 | def addRedisIndexRecord(az_redis_connection, id, page_content, page_content_vector, page_number, documentPath, prefix = 'doc'):
298 |     try:
299 | 
300 |         # Super important to include the dtype parameter. Otherwise the record gets added but is not seen by the index!
301 |         page_content_vector = np.array(page_content_vector, dtype=np.float32)
302 |         # print(f'page_content_vector.shape:{page_content_vector.shape}')
303 | 
304 |         az_redis_connection.hset(name=f'{prefix}:{str(id)}', mapping={"page_content": str(page_content),
305 |                                                                       "page_number":int(page_number),
306 |                                                                       "document_path": str(documentPath),
307 |                                                                       "page_content_vector": page_content_vector.tobytes()
308 |                                                                       }
309 |                                  )
310 | 
311 |         # az_redis_connection.hset(name=f'{prefix}:{str(id)}', items= ["page_content", str(page_content),
312 |         #                                                              "page_number", int(page_number),
313 |         #                                                              "document_path", str(documentPath),
314 |         #                                                              "page_content_vector", page_content_vector.tobytes()
315 |         #                                                              ])
316 | 
317 |         # pipe = az_redis_connection.pipeline(transaction=False)
318 |         # pipe.hset(name=f'{prefix}:{str(id)}', mapping={"page_content": str(page_content),
319 |         #                                                "page_number":int(page_number),
320 |         #                                                "page_content_vector": page_content_vector.tobytes()
321 |         #                                                }
322 |         #          )
323 |         # pipe.execute()
324 | 
325 |         return True
326 | 
327 |     except Exception as e:
328 |         logging.error(f'Error addRedisIndexRecord(): {e}')
329 |         return False
330 | 
331 | '''
332 | Iterate over the read document and add it to the index
333 | '''
334 | def addDocumentToRedis(az_redis_connection, documentPath, document_page_content_list, document_page_embedding_list, document_page_no_list, prefix, encrypt_prefix=False):
335 |     try:
336 | 
337 |         if encrypt_prefix:
338 |             prefix = encode(prefix)
339 | 
340 | 
341 |         # Iterate through pages
342 |         for i, embedding in enumerate(document_page_embedding_list):
343 | 
344 |             hash_key = hashlib.sha1(f'{documentPath}_{i}'.encode('utf-8')).hexdigest()
345 | 
346 |             addRedisIndexRecord(az_redis_connection = az_redis_connection,
347 |                                 id = hash_key,
348 |                                 page_content = document_page_content_list[i],
349 |                                 page_content_vector = document_page_embedding_list[i],
350 |                                 page_number = document_page_no_list[i],
351 |                                 prefix = prefix,
352 |                                 documentPath = documentPath
353 |                                 )
354 | 
355 | 
356 |         return True
357 |     except Exception as e:
358 |         logging.error(f'Error addDocumentToRedis(): {e}')
359 |         return False
360 | 
361 | '''
362 | az_redis_connection: Connection to Redis, with Search enabled (Stack / Enterprise)
363 | prompt: User query
364 | aoai_embedding_model: Azure OpenAI model for prompt embedding
365 | index_name: Redis index name
366 | top_n: Return top_n close matches
367 | '''
368 | def queryRedis(az_redis_connection, prompt, aoai_embedding_model, index_name, top_n, encrypt_index_name=False):
369 |     try:
370 | 
371 |         document_lc_list = []
372 | 
373 |         if encrypt_index_name:
374 |             index_name = encode(index_name)
375 | 
376 |         vec_prompt = getEmbedding(txt_data=prompt, aoai_embedding_model=aoai_embedding_model, chunk_size=1, max_retries = 3)
377 |         vec_prompt = np.array(vec_prompt, dtype=np.float32) #Super important to specify dtype, otherwise a vector shape mismatch error occurs.
378 | 
379 |         # base_query = f'*=>[KNN {str(top_n)} @page_content_vector $prompt_vector AS __page_content_vector_score]'
380 |         base_query = f'*=>[KNN {str(top_n)} @page_content_vector $prompt_vector AS __page_content_vector_score]'
381 |         query = (
382 |             Query(base_query)
383 |             .sort_by("__page_content_vector_score") #asc = False, relevance in desc order.
384 |             .paging(0,top_n)
385 |             .return_fields('__page_content_vector_score','page_content','page_number', 'document_path')
386 |             .dialect(2)
387 |         )
388 | 
389 |         query_result = az_redis_connection.ft(index_name).search(query, {"prompt_vector": vec_prompt.tobytes()})
390 |         # print(type(query_result))
391 | 
392 |         #Create lc documents, for use with lc
393 |         for item in query_result.docs:
394 |             document_lc = Document(page_content=item.page_content,metadata={"source":item.document_path, "page":item.page_number, "similarity":1-float(item.__page_content_vector_score)})
395 |             document_lc_list.append(document_lc)
396 | 
397 |         return query_result, document_lc_list
398 | 
399 |     except Exception as e:
400 |         logging.error(f'Error queryRedis(): {e}')
401 |         return None, None
402 | 
403 | #-----------------------------------
404 | # Functions end here.
405 | #-----------------------------------
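Taken together, the functions above form a small indexing-and-retrieval pipeline. A condensed end-to-end sketch (illustrative; it assumes the env vars from .env_template, a reachable Redis Stack instance, and the davinci deployments named in the registry below; 'myalias' is a hypothetical index name):

```python
# Illustrative pipeline (not part of the module): index one PDF, then query it.
import os
from dotenv import load_dotenv

load_dotenv()
setEnv()

conn = getRedisConnection(host=os.getenv('REDIS_HOST'),
                          access_key=os.getenv('REDIS_ACCESS_KEY'),
                          port=os.getenv('REDIS_PORT'), ssl=False)

# 1. Create the index (no-op if it exists). DIM must match the embedding model.
createRedisIndex(az_redis_connection=conn, index_name='myalias', prefix='myalias',
                 distance_metric='COSINE', DIM=12288, vec_type='HNSW')  # 12288 = davinci doc model

# 2. Embed every page of a document and store pages plus vectors as hashes.
pdf = '../sample_docs/Easy_recipes_Boston_University.pdf'
contents, vectors, page_nos = getEmbeddingEntireDoc(
    documentPath=pdf, aoai_embedding_model='text-search-davinci-doc-001-v1')
addDocumentToRedis(az_redis_connection=conn, documentPath=pdf,
                   document_page_content_list=contents,
                   document_page_embedding_list=vectors,
                   document_page_no_list=page_nos, prefix='myalias')

# 3. KNN query: embed the prompt, retrieve the closest pages as langchain Documents.
_, docs = queryRedis(az_redis_connection=conn, prompt='How long do the muffins bake?',
                     aoai_embedding_model='text-search-davinci-doc-001-v1',
                     index_name='myalias', top_n=3)
for d in docs:
    print(d.metadata['page'], d.metadata['similarity'])
```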
406 | 
407 | #For cmd background colour
408 | class bcolors:
409 |     HEADER = '\033[95m'
410 |     OKBLUE = '\033[94m'
411 |     OKCYAN = '\033[96m'
412 |     OKGREEN = '\033[92m'
413 |     WARNING = '\033[93m'
414 |     FAIL = '\033[91m'
415 |     ENDC = '\033[0m'
416 |     BOLD = '\033[1m'
417 |     UNDERLINE = '\033[4m'
418 | #-----------------------------------
419 | 
420 | aoai_embedding_models = {
421 | 
422 |     "text-search-ada-doc-001":{
423 |         "version":{
424 |             "1":{
425 |                 "deployment_name": "text-search-ada-doc-001-v1",
426 |                 "dim": 1024
427 |             }
428 |         }
429 |     },
430 | 
431 |     "text-search-babbage-doc-001":{
432 |         "version":{
433 |             "1":{
434 |                 "deployment_name": "text-search-babbage-doc-001-v1",
435 |                 "dim": 2048
436 |             }
437 |         }
438 |     },
439 | 
440 |     "text-search-curie-doc-001":{
441 |         "version":{
442 |             "1":{
443 |                 "deployment_name": "text-search-curie-doc-001-v1",
444 |                 "dim": 4096
445 |             }
446 |         }
447 |     },
448 | 
449 |     "text-search-davinci-doc-001":{
450 |         "version":{
451 |             "1":{
452 |                 "deployment_name": "text-search-davinci-doc-001-v1",
453 |                 "dim": 12288
454 |             }
455 |         }
456 |     },
457 | 
458 |     "text-embedding-ada-002":{
459 |         "version":{
460 |             "1":{
461 |                 "deployment_name": "text-embedding-ada-002-v1",
462 |                 "dim": 1536
463 |             }
464 |         }
465 |     },
466 | 
467 |     "text-davinci-003":{
468 |         "version":{
469 |             "1":{
470 |                 "deployment_name": "text-davinci-003-v1"
471 |             }
472 |         }
473 |     }
474 | 
475 | }
--------------------------------------------------------------------------------
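This registry maps each model name to per-version deployment names and embedding dimensions; app.py resolves both at startup (lines 31-34). Registering another model or version is just another entry (illustrative; the deployment name below is hypothetical and must match what you created in Azure OpenAI Studio):

```python
# Illustrative only: register a second version of an embedding model.
aoai_embedding_models["text-embedding-ada-002"]["version"]["2"] = {
    "deployment_name": "my-ada-002-deployment",  # hypothetical deployment name
    "dim": 1536,  # must match the model's embedding width and the index DIM
}

# app.py-style lookup:
dep = aoai_embedding_models["text-embedding-ada-002"]["version"]["2"]["deployment_name"]
```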
/readme.md:
--------------------------------------------------------------------------------
1 | # Azure OpenAI Semantic Search Demo | Document Upload
2 | 
3 | 
4 | 
5 | ## About
6 | 
7 | Version: 0.9.7
8 | 
9 | The ability to conduct semantic search on vector data is a powerful feature that allows you to find relevant content based on a specific natural language query. This demo is helpful for showcasing and comprehending the abstractive responses generated from your own data in PDF or Word format documents.
10 | 
11 | This solution has been created by drawing inspiration from existing enterprise ChatGPT and document Q&A vector search demos; however, it uses a simplified architecture pattern and offers the following features, most of which are unique to this implementation.
12 | 
13 | ## Key Features
14 | - _Simplified architecture_
15 | - _Built-in document cracking (.pdf, .docx)_
16 | - _Utilise text embeddings_
17 | - _Upload your own document and ask questions_
18 | 
19 | ## How to deploy?
20 | **Run locally from Visual Studio Code or command prompt**
21 | - Open VS Code terminal or command prompt.
22 | - Clone this repository and open in VS Code.
23 | - Create a new conda environment
24 |   - ```conda create -n openaidemo_py39 python=3.9 anaconda```
25 |   - ```conda activate openaidemo_py39```
26 |   - For more info go to [Conda documentation](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-python.html)
27 | - Navigate to the _misc_ directory and run
28 |   - ```pip install -r requirements.txt```
29 | - Provision Azure OpenAI service
30 |   - Under deployments within Azure OpenAI Studio, deploy 2 models
31 |     - Model for text search, e.g. text-search-davinci-doc-001
32 |     - Model for text generation, e.g. text-davinci-003
33 |   - Model availability varies by the region in which the OpenAI service is provisioned in Azure.
34 |   - For more info go to [Create a resource and deploy a model using Azure OpenAI](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource)
35 | - Open the _modules/utilities.py_ file and scroll to the very bottom
36 |   - Update the value of the "deployment_name" tag for the respective model version.
37 |   - If you have used the 2 models mentioned in the steps above, no changes are required in the _app/app.py_ file.
38 |   - Otherwise, update the values of the aoai_embedding_model and aoai_text_model variables at the beginning of the app/app.py file.
39 | - Provision Redis Stack as an Azure Container Instance
40 |   - Use image = redis/redis-stack-server:latest
41 |   - On the _advanced tab_ make sure you add
42 |     - --requirepass