├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── environment.yaml ├── example.env ├── rag_skills ├── chatbotSkills.py ├── conversationalRetrievalwithLangchain.ipynb ├── demonstrateSkills.ipynb └── utils.py └── samples └── financial_transcripts ├── Dockerfile ├── README.md ├── chatBot.py ├── images ├── MSFT.jpg ├── Microsoft_logo.png ├── Microsoft_logo.svg ├── chatbot.jpg └── openai.png ├── llm_app.py ├── st_config.yaml ├── st_main.py ├── step0_data_preprocessor.ipynb ├── step1_chunk_and_extract.ipynb ├── step2_embed.ipynb └── step3_db_storing_vectorsearch.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | *.DS_Store 7 | 8 | vectorstore 9 | # Environment in subdirectories 10 | **/*.env 11 | !**/example.env 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | **/DATA/* 37 | ! **/DATA/README.md 38 | **/AnalyzedPDF/* 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | .pybuilder/ 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | # For a library or package, you might want to ignore these files since the code is 98 | # intended to run in multiple environments; otherwise, check them in: 99 | # .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # poetry 109 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 110 | # This is especially recommended for binary packages to ensure reproducibility, and is more 111 | # commonly ignored for libraries. 112 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 113 | #poetry.lock 114 | 115 | # pdm 116 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 117 | #pdm.lock 118 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 119 | # in version control. 
120 | # https://pdm.fming.dev/#use-with-ide 121 | .pdm.toml 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 171 | #.idea/ 172 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Retrieval Augmented Generation (RAG) patterns
2 | 
3 | Following the launch of ChatGPT, many companies have expressed a keen interest in developing search engines in the style of ChatGPT, tailored to their specific datasets. To address this challenge, Retrieval Augmented Generation (RAG) has emerged as a popular solution. RAG comprises a three-step process:
4 | 
5 | 1. First, pertinent information (referred to as the context) is retrieved from the database based on the human query.
6 | 2. Then, this context is enhanced and integrated with the human query.
7 | 3. Finally, the enriched context is presented to GPT-style models to generate a response.
8 | 
9 | We've observed that the RAG approach can take on various forms. For instance, certain problems necessitate a memory of past conversations, while in others, the database may offer an extensive context that surpasses the limits of LLM prompts. We've devised solutions for these challenges, which we refer to as skills. The objective of this repository is to offer a compendium of these skills, showcase how each can be applied independently, and also present some end-to-end examples demonstrating their utilization.
10 | 
11 | Please note that in addition to this repository, we maintain others that focus on different aspects of RAG.
12 | 
13 | 1. This repo illustrates the use of Azure Cognitive Search (ACS) as a vector store. For those interested in employing other databases such as Postgres, AzureSQL, or MongoDB, please refer to this [repository](https://github.com/microsoft/AzureDataRetrievalAugmentedGenerationSamples).
14 | 
15 | 2. For deployment, we employ [Streamlit](https://streamlit.io/). Alternatively, other options such as deploying through Azure Web App using Docker containers or creating a chatbot in Microsoft Teams can be explored in this [repository](https://github.com/microsoft/QnABot-for-FabricDocs.git).
16 | 
17 | ## Skills
18 | 
19 | This repo contains a collection of skills, available in `rag_skills/chatbotSkills.py`, along with code samples for the individual skills:
20 | 
21 | 1. Chatbot with memory functionality: `chatbotSkills.py` contains functions for a chatbot enabled with memory capabilities. Two distinct types of memory skills are available:
22 | 
23 | a. `qa_chain_ConversationBufferMemory`: This skill leverages the entire chat history, context, and human query for generating responses. It's recommended for shorter conversations.
24 | 
25 | b. `qa_chain_ConversationSummaryMemory`: This skill uses a condensed version of the chat history, context, and human query for generating responses. It's preferable for longer conversations.
26 | 
27 | 2. `user_query_based_context_summarization`: Summarizes or extracts the relevant information from the context based on the user query.
28 | 
29 | 3. `combine_docs`: This skill is useful when search retrieves multiple contexts from the database that cannot fit into a single language model call. It combines multiple contexts while retaining the information that is relevant to the user query. It also ensures that the total context token count remains below a certain threshold.
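Below is a minimal sketch of how these skills might be wired together once a retrieval step has produced some context chunks. It assumes an `llm.env` file populated as in `example.env` and an Azure OpenAI chat deployment of your own; the retrieved chunks and the question are placeholders rather than output from a real vector store.

```python
import openai
from dotenv import dotenv_values
from langchain.chat_models import AzureChatOpenAI
from rag_skills.chatbotSkills import qa_chain_ConversationSummaryMemory, combine_docs

# Load Azure OpenAI settings (copy example.env to llm.env and fill in your keys).
config = dotenv_values("llm.env")
openai.api_type = config["OPENAI_API_TYPE"]
openai.api_key = config["OPENAI_API_KEY"]
openai.api_base = config["OPENAI_API_BASE"]
openai.api_version = config["OPENAI_API_VERSION"]

llm = AzureChatOpenAI(
    deployment_name=config["OPENAI_DEPLOYMENT_COMPLETION"],
    openai_api_base=openai.api_base,
    openai_api_version=openai.api_version,
    openai_api_key=openai.api_key,
    openai_api_type=openai.api_type,
    temperature=0.0,
)

# Placeholder chunks standing in for the results of a vector search (step 1 of RAG).
retrieved_chunks = [
    "<chunk 1 returned by your vector store>",
    "<chunk 2 returned by your vector store>",
]
question = "What were the key revenue drivers this quarter?"

# combine_docs keeps the combined context under a token budget,
# summarizing it against the query when it is too long.
context = combine_docs(retrieved_chunks, llm, max_tokens=3000, user_query=question)

# Augment and generate (steps 2 and 3) with a chain that summarizes the chat history.
qa_chain = qa_chain_ConversationSummaryMemory(llm)
answer = qa_chain.run({"context": context, "human_input": question})
print(answer)
```

A fuller, retrieval-backed version of this flow is implemented in `samples/financial_transcripts/`.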
30 | 
31 | > NOTE: Skills are updated as code snippets at [rag_skills](https://github.com/microsoft/rag_skills) faster than they are incorporated into this repo.
32 | 
33 | ## End2End Sample for Different RAG patterns
34 | 
35 | This repository also includes one end-to-end sample centered around financial transcripts. Please note that we plan to incorporate additional samples showcasing various RAG patterns in the future.
36 | 
37 | 
38 | | Sample name                        | Description                             | Tech Stack                 |
39 | | ---------------------------------- | --------------------------------------- | -------------------------- |
40 | | Financial Earnings calls assistant | Summarization and Q&A on earnings calls | ACS, deployed on Streamlit |
41 | 
42 | 
43 | ## How to use?
44 | 
45 | 1. Lightweight examples of the various skills are provided in `rag_skills/demonstrateSkills.ipynb`.
46 | 2. End2End samples are in the `samples/` folder. Please follow the README inside that folder.
47 | 
48 | ## Contributing
49 | 
50 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
51 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
52 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
53 | 
54 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
55 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
56 | provided by the bot. You will only need to do this once across all repos using our CLA.
57 | 
58 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
59 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
60 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
61 | 
62 | ## Trademarks
63 | 
64 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
65 | trademarks or logos is subject to and must follow
66 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
67 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
68 | Any use of third-party trademarks or logos is subject to those third parties' policies.
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | ## Security
4 | 
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 | 
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 
22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: appliedai 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.8.11 7 | - pip 8 | - pip: 9 | - numpy==1.24.4 10 | - openai==0.27.8 11 | - python-dotenv==1.0.0 12 | - scikit-learn==1.3.0 13 | - mlflow==2.4.2 14 | - requests==2.31.0 15 | - pyyaml==6.0.1 16 | - psycopg2-binary==2.9.6 17 | - pandas==2.0.3 18 | - pgvector==0.2.0 19 | - langchain>=0.0.317 20 | - docx2txt==0.8 21 | - docx2pdf==0.1.8 22 | - streamlit==1.25.0 23 | - tiktoken==0.4.0 24 | - azure-ai-formrecognizer==3.3.0 25 | - azure-storage-blob==12.17.0 26 | - azure-search-documents==11.4.0b8 27 | - pydantic==1.10.11 -------------------------------------------------------------------------------- /example.env: -------------------------------------------------------------------------------- 1 | # azure open ai 2 | OPENAI_API_BASE="" 3 | OPENAI_API_TYPE="" 4 | OPENAI_API_KEY="" 5 | OPENAI_DEPLOYMENT_EMBEDDING="" 6 | OPENAI_MODEL_EMBEDDING="" 7 | OPENAI_DEPLOYMENT_COMPLETION="" 8 | OPENAI_MODEL_COMPLETION="" 9 | OPENAI_API_VERSION="" 10 | 11 | # azure form recognizer 12 | AZURE_FORM_RECOGNIZER_ENDPOINT = 13 | AZURE_FORM_RECOGNIZER_NAME = "" 14 | AZURE_FORM_RECOGNIZER_KEY = "" 15 | 16 | # azure cognitive search 17 | COGSEARCH_NAME = "" 18 | COGSEARCH_INDEX_NAME = "" 19 | COGSEARCH_API_KEY = "" 20 | -------------------------------------------------------------------------------- /rag_skills/chatbotSkills.py: -------------------------------------------------------------------------------- 1 | # Import required libraries from LangChain and set up OpenAI 2 | from langchain.llms import AzureOpenAI 3 | from langchain import PromptTemplate 4 | from langchain.chains import LLMChain, ConversationChain 5 | from langchain.memory import ConversationBufferMemory, ConversationSummaryMemory 6 | from langchain.prompts import PromptTemplate 7 | import openai 8 | import os 9 | from dotenv import dotenv_values 10 | from rag_skills.utils import count_tokens 11 | 12 | ############################################################## 13 | ###### QA chain with conversational buffer memory ############# 14 | ############################################################## 15 | def qa_chain_ConversationBufferMemory(llm, prefix_template=None, to_debug=False): 16 | # Write a preprompt with context and query as variables 17 | if prefix_template is None: 18 | prefix_template = """ 19 | You are a chatbot having a conversation with a human. 20 | Given the Context, Chat History, and a Human Query, 21 | create a final answer only using the Context. Don't hallucinate at all. 
""" 22 | 23 | template = prefix_template + """ 24 | Context: 25 | {context} 26 | 27 | Chat History: 28 | {chat_history} 29 | 30 | Human Query: {human_input} 31 | Chatbot:""" 32 | 33 | # Define a prompt template 34 | prompt = PromptTemplate( 35 | input_variables=["chat_history", "human_input", "context"], template=template 36 | ) 37 | 38 | # Define Memory 39 | memory = ConversationBufferMemory(memory_key="chat_history", input_key="human_input") 40 | 41 | # Define a chain 42 | qa_chain = LLMChain(llm=llm, prompt=prompt, memory=memory, verbose=to_debug) 43 | return qa_chain 44 | 45 | 46 | ############################################################## 47 | ###### QA chain with converational Summary memory ############# 48 | ############################################################## 49 | def qa_chain_ConversationSummaryMemory(llm, prefix_template=None, to_debug=False): 50 | # Write a preprompt with context and query as variables 51 | if prefix_template is None: 52 | prefix_template = """ 53 | You are a chatbot having a conversation with a human. 54 | Given the Context, Chat History, and a Human Query, 55 | create a final answer. Don't hallucinate at all. If you don't have an answer, say "I don't know". 56 | """ 57 | 58 | template = prefix_template + """ 59 | Context: 60 | {context} 61 | 62 | Chat History: 63 | {chat_history} 64 | 65 | Human Query: {human_input} 66 | Chatbot: 67 | """ 68 | 69 | # Define a prompt template 70 | prompt = PromptTemplate( 71 | input_variables=["chat_history", "human_input", "context"], 72 | template=template 73 | ) 74 | 75 | #Define Memory 76 | 77 | memory = ConversationSummaryMemory(llm=llm, memory_key="chat_history", input_key="human_input") 78 | memory.prompt.template = """ 79 | Progressively summarize the lines of conversation provided, adding onto the previous summary returning a new summary. 80 | 81 | Current summary: 82 | {summary} 83 | 84 | New lines of conversation: 85 | {new_lines} 86 | 87 | New summary: 88 | """ 89 | 90 | # Define a chain 91 | qa_chain = LLMChain(llm=llm, prompt=prompt, memory=memory, verbose=to_debug) 92 | return qa_chain 93 | 94 | ################################################################ 95 | ###### Summarize chain with user query and context ############# 96 | ################################################################ 97 | def user_query_based_context_summarization(llm, prefix_template=None, to_debug=False): 98 | # Write a preprompt with context and query as variables 99 | if prefix_template is None: 100 | prefix_template = """ 101 | Write a concise summary of the context so that it includes the details related to the human query. 
102 | """ 103 | 104 | template = prefix_template + """ 105 | Context: 106 | {context} 107 | 108 | Human Query: {human_input} 109 | Concise Summary: 110 | """ 111 | 112 | # Define a prompt template 113 | prompt = PromptTemplate( 114 | input_variables=["human_input", "context"], template=template 115 | ) 116 | 117 | # Define a chain 118 | query_based_summary_chain = LLMChain(llm=llm, prompt=prompt, verbose=to_debug) 119 | return query_based_summary_chain 120 | 121 | ################################################################ 122 | ###### Write a summary given multiple contexts ############# 123 | ################################################################ 124 | 125 | def combine_docs(context_list, llm, to_debug=False, max_tokens=16000, user_query=None, prefix_template=None): 126 | """Given a list of documents, combine them into a single document with a max token limit.""" 127 | 128 | ## When all the documents can be concatenated 129 | context_all = "" 130 | for i in context_list: 131 | context_all = context_all + i + "\n\n" 132 | 133 | if count_tokens(context_all) < max_tokens: 134 | return context_all 135 | 136 | ## When all the documents cannot be concatenated 137 | if user_query is None: 138 | user_query = "" 139 | 140 | query_based_summary_chain = user_query_based_context_summarization(llm, 141 | prefix_template=prefix_template, 142 | to_debug=to_debug 143 | ) 144 | 145 | context_all = "" 146 | for i in context_list: 147 | context_all = context_all + i + "\n\n" 148 | 149 | ## If the context_all is greater than max_tokens, then summarize the context_all again 150 | if count_tokens(context_all) > max_tokens: 151 | context_all = query_based_summary_chain.run({ 152 | 'context': context_all, 153 | 'human_input': user_query 154 | }) 155 | 156 | return context_all 157 | 158 | -------------------------------------------------------------------------------- /rag_skills/conversationalRetrievalwithLangchain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "In this notebook we will demonstrate how to use LangChain to perform [conversational retrieval](https://python.langchain.com/docs/use_cases/question_answering/chat_vector_db).\n", 8 | "\n", 9 | "In a conversational question and answering scenario, users often pose follow-up questions related to the same topic, with the context being crucial to understand their queries. To address such cases effectively, we use the ConversationalRetrievalChain. Behind the scenes, this chain takes the user's question and converts it into a standalone query by considering the conversation history. Subsequently, it uses this standalone question to query the search service for relevant information." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "! pip install azure-identity" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "In this notebook we are utilizing an existing search index we already set up with Azure Cognitive Search. You can follow this [link](https://python.langchain.com/docs/integrations/vectorstores/azuresearch) to create your own search index. In our search index, the content vector field is named \"contentVector\", so we set it as an environment variable below." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import os\n", 35 | "os.environ[\"AZURESEARCH_FIELDS_CONTENT_VECTOR\"] = \"contentVector\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from langchain.embeddings.openai import OpenAIEmbeddings\n", 45 | "from langchain.llms import AzureOpenAI\n", 46 | "from langchain.chains import ConversationalRetrievalChain\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "True" 58 | ] 59 | }, 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "from dotenv import load_dotenv\n", 67 | "load_dotenv(\"../llm.env\")" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from langchain.vectorstores.azuresearch import AzureSearch\n", 77 | "\n", 78 | "\n", 79 | "model = os.getenv(\"OPENAI_DEPLOYMENT_EMBEDDING\")\n", 80 | "embeddings = OpenAIEmbeddings(deployment=model)\n", 81 | "index_name = \"testqa\"\n", 82 | "vectore_store_name = os.getenv(\"COGSEARCH_NAME\")\n", 83 | "vector_store_address = f\"https://{vectore_store_name}.search.windows.net\"\n", 84 | "vector_store_password = os.getenv(\"COGSEARCH_API_KEY\")\n", 85 | "vector_store = AzureSearch(\n", 86 | " azure_search_endpoint=vector_store_address,\n", 87 | " azure_search_key=vector_store_password,\n", 88 | " index_name=index_name,\n", 89 | " embedding_function=embeddings.embed_query,\n", 90 | ")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "from langchain.memory import ConversationBufferMemory\n", 100 | "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n", 101 | "memory.output_key = \"answer\"" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "In this notebook we are focusing on the ConversationalRetrievalChain only, for demonstration purposes. If you use your own search index you might need to modify your queries so that it is relevant to the information in your index." 
109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "llm = AzureOpenAI(deployment_name=os.getenv(\"OPENAI_DEPLOYMENT_COMPLETION\"), model_name=os.getenv(\"OPENAI_MODEL_COMPLETION\"),temperature=0)\n", 118 | "qa = ConversationalRetrievalChain.from_llm(llm, vector_store.as_retriever(search_kwargs={\"k\": 1}), memory=memory, return_generated_question=True)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 7, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", 128 | "result = qa({\"question\": query})" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 8, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/plain": [ 139 | "' Edgar Mitchell called Earth a \"sparkling blue and white jewel.\"'" 140 | ] 141 | }, 142 | "execution_count": 8, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "result[\"answer\"]" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "query = \"When and how did NASA make its first observation about it from the space?\"\n", 158 | "result = qa({\"question\": query})" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "' NASA first observed Earth from space with the launch of Explorer 1 in 1960.'" 170 | ] 171 | }, 172 | "execution_count": 10, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "result[\"answer\"]" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "As you can see, if we had query the search service using the user's question directly, it is not clear wht the user is reffering to by \"its\" and \"it\". However, the generated standalone question makes it more clear, and enables for more efficient and robust queries." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 11, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "' When and how did NASA first observe Earth from space?'" 197 | ] 198 | }, 199 | "execution_count": 11, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "result[\"generated_question\"]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "When the user asks a question on a different topic, the generated question still reflects the user's question correctly." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 12, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "query = \"Why can't we see volcanic plumes with our eyes?\"\n", 222 | "result = qa({\"question\": query})" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 13, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | " Volcanic plumes are not visible to the naked eye because they are typically invisible in the electromagnetic spectrum. 
However, satellites can use infrared to distinguish the plumes from ice and clouds.\n", 235 | " Why are volcanic plumes not visible to the naked eye?\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "print(result[\"answer\"])\n", 241 | "print(result[\"generated_question\"])" 242 | ] 243 | } 244 | ], 245 | "metadata": { 246 | "kernelspec": { 247 | "display_name": "Python 3", 248 | "language": "python", 249 | "name": "python3" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 3 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython3", 261 | "version": "3.8.11" 262 | }, 263 | "orig_nbformat": 4 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /rag_skills/demonstrateSkills.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "a2469417", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import sys\n", 12 | "import openai\n", 13 | "from dotenv import dotenv_values\n", 14 | "from langchain.chat_models import AzureChatOpenAI\n", 15 | "sys.path.append(\"..\") ## add directory above\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "1af107d4", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "env_name = \"../llm.env\" # change to use your own .env file\n", 26 | "config = dotenv_values(env_name)\n", 27 | "\n", 28 | "#Azure OpenAI\n", 29 | "openai.api_type = config[\"OPENAI_API_TYPE\"] #\"azure\"\n", 30 | "openai.api_key = config['OPENAI_API_KEY']\n", 31 | "openai.api_base = config['OPENAI_API_BASE']\n", 32 | "openai.api_version = config['OPENAI_API_VERSION']" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "e9c886c1", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "engine = \"gpt-4-32k\"\n", 43 | "llm = AzureChatOpenAI(\n", 44 | " deployment_name=engine,\n", 45 | " openai_api_base=openai.api_base,\n", 46 | " openai_api_version=openai.api_version,\n", 47 | " openai_api_key=openai.api_key,\n", 48 | " openai_api_type=openai.api_type,\n", 49 | " temperature=0.0,\n", 50 | " verbose=True\n", 51 | ")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "id": "e0b0e6f8", 57 | "metadata": {}, 58 | "source": [ 59 | "## 1. Demonstrating conversation summary memory chain:\n", 60 | "This chain summarizes the previous user conversation and appends the summary to context for answering questions " 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "4886138d", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "from chatbotSkills import qa_chain_ConversationSummaryMemory\n", 71 | "\n", 72 | "# Make a Question Answer chain function and pass \n", 73 | "prefix_template = \"\"\"\n", 74 | " You are a chatbot having a conversation with a human.\n", 75 | " Given the Context, Chat History, and a Human Query, \n", 76 | " create a final answer. Don't hallucinate at all. 
If you don't have an answer, say \"I don't know\".\n", 77 | " \"\"\"\n", 78 | "\n", 79 | "qa_chain = qa_chain_ConversationSummaryMemory(llm, prefix_template=prefix_template, to_debug=False)\n", 80 | "\n", 81 | "## Question Answering\n", 82 | "\n", 83 | "#Question 1\n", 84 | "answer = qa_chain.run({\n", 85 | " 'context': \"USSA is a space agency in county Y. It is a government agency responsible for the exploration and development of space.\",\n", 86 | " 'human_input': \"What is USSA\" \n", 87 | "})\n", 88 | "\n", 89 | "print(\"Question 1: \")\n", 90 | "print(answer)\n", 91 | "\n", 92 | "# Question 2: \n", 93 | "answer = qa_chain.run({\n", 94 | " 'context': \"Zootopia is a 2016 American computer-animated buddy cop action comedy film produced by Walt Disney Animation Studios.\",\n", 95 | " 'human_input': \"Do you know about any space agency?\" \n", 96 | "}) \n", 97 | "\n", 98 | "print(\"Question 2: \")\n", 99 | "print(answer)\n", 100 | "print()\n", 101 | "print(\"\"\"Context in question 2 does not contain any specific information regarding the \n", 102 | " user question but still llm provides correct answer by using the memory of previous conversation\n", 103 | " \"\"\"\n", 104 | ")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "bb443051", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "print(qa_chain.memory) ## You can see the memory using this call" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "0af592da", 120 | "metadata": {}, 121 | "source": [ 122 | "## 2. Demonstrating conversation buffer memory chain:\n", 123 | "This chain summarizes the previous user conversation and appends the summary to context for answering questions \n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "69704b89", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "from chatbotSkills import qa_chain_ConversationBufferMemory\n", 134 | "\n", 135 | "# Make a Question Answer chain function and pass \n", 136 | "prefix_template = \"\"\"\n", 137 | " You are a chatbot having a conversation with a human. \n", 138 | " Given the Context, Chat History, and a Human Query, \n", 139 | " create a final answer. Don't hallucinate at all. If you don't have an answer, say \"I don't know\".\n", 140 | " \"\"\"\n", 141 | "\n", 142 | "qa_chain = qa_chain_ConversationBufferMemory(llm, prefix_template=prefix_template, to_debug=False)\n", 143 | "\n", 144 | "## Question Answering\n", 145 | "\n", 146 | "#Question 1\n", 147 | "answer = qa_chain.run({\n", 148 | " 'context': \"USSA is a space agency in county Y. 
It is a government agency responsible for the exploration and development of space.\",\n", 149 | " 'human_input': \"What is USSA\" \n", 150 | "})\n", 151 | "\n", 152 | "print(\"Question 1: \")\n", 153 | "print(answer)\n", 154 | "\n", 155 | "# Question 2: \n", 156 | "answer = qa_chain.run({\n", 157 | " 'context': \"Zootopia is a 2016 American computer-animated buddy cop action comedy film produced by Walt Disney Animation Studios.\",\n", 158 | " 'human_input': \"Do you know about any space agency?\" \n", 159 | "}) \n", 160 | "\n", 161 | "print(\"Question 2: \")\n", 162 | "print(answer)\n", 163 | "print()\n", 164 | "print(\"\"\"Context in question 2 does not contain any specific information regarding the \n", 165 | " user question but still llm provides correct answer by using the memory of previous conversation\n", 166 | " \"\"\"\n", 167 | ")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "bb3356e0", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "print(qa_chain.memory) ## You can see the memory using this call" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "8542530f", 183 | "metadata": {}, 184 | "source": [ 185 | "## 3. Demonstrating user query based context summarization chain:\n", 186 | "Sometimes context can be large and don't fit in a prompt window. So, this chain summarizes context given user query " 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "539bc2d2", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "from chatbotSkills import user_query_based_context_summarization\n", 197 | "\n", 198 | "# Template \n", 199 | "prefix_template = \"\"\"\n", 200 | " Write a concise summary of the context so that it includes the details related to the human query.\n", 201 | " \"\"\"\n", 202 | "\n", 203 | "context_summary_chain = user_query_based_context_summarization(llm, prefix_template=prefix_template, to_debug=False)\n", 204 | "\n", 205 | "context = \"\"\"USSA is a space agency in county Y. It is a government agency responsible\n", 206 | " for the exploration and development of space.\n", 207 | " Zootopia is a 2016 American computer-animated buddy cop action comedy\n", 208 | " film produced by Walt Disney Animation Studios.\n", 209 | " \"\"\"\n", 210 | "\n", 211 | "#Question 1\n", 212 | "answer = context_summary_chain.run({\n", 213 | " 'context': context,\n", 214 | " 'human_input': \"What is USSA?\" \n", 215 | "})\n", 216 | "\n", 217 | "\n", 218 | "print(\"Question 1: \")\n", 219 | "print(answer)\n", 220 | "print()\n", 221 | "\n", 222 | "print (\"\"\"This llm extracts only relevant information from the context. \"\"\")" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "34951a40", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "print(qa_chain.memory) ## You can see the memory using this call" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "fc2d397d", 238 | "metadata": {}, 239 | "source": [ 240 | "## 3. Demonstrating combine_docs:\n", 241 | "Sometimes contexts retrieved from the database can be large and doesn't fit in a prompt. So, this code will extract relevant information from the context given the user query." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "c6b9e2d7", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "from chatbotSkills import combine_docs, count_tokens\n", 252 | "\n", 253 | "context_1 = \"\"\"\n", 254 | " The United Space Exploration Administration (USSA) stands as county Y's premier governmental space agency, \n", 255 | " entrusted with the monumental task of spearheading the exploration, investigation, and advancement of the cosmic frontier.\n", 256 | " With a dedicated cadre of brilliant scientists, intrepid astronauts, \n", 257 | " and cutting-edge technology, USSA pioneers a path to unlock the mysteries of the universe and \n", 258 | " harness its potential for the betterment of humanity. \n", 259 | " \"\"\"\n", 260 | "context_2 = \"\"\"\n", 261 | " Through audacious missions and visionary initiatives, \n", 262 | " USSA propels the nation to new heights, ensuring that the celestial realm \n", 263 | " becomes a beacon of knowledge, opportunity, and inspiration for generations to come.\n", 264 | " \"\"\"\n", 265 | "\n", 266 | "context_list = [context_1, context_2]\n", 267 | "\n", 268 | "input_token_count = count_tokens(context_1+context_2, engine)\n", 269 | "prefix_template = \"\"\"\n", 270 | " Extract information from the context so that it includes the details related to the human query. \n", 271 | " \"\"\"\n", 272 | "user_query = \"What does USSA stand for?\" \n", 273 | "max_input_tokens = 100 ## For demonstration, we are assuming that max token for input context should not exceed 100\n", 274 | "\n", 275 | "output = combine_docs(context_list, llm, to_debug=False, max_tokens=max_input_tokens, \n", 276 | " user_query=user_query, prefix_template=prefix_template)\n", 277 | "\n", 278 | "output_token_count = count_tokens(output, engine)\n", 279 | "print(\"input_token_count: \", input_token_count)\n", 280 | "print(\"output_token_count: \", output_token_count)\n", 281 | "\n", 282 | "print(\"output: \", output)\n", 283 | "print()\n", 284 | "print(\"\"\"This demonstrates that the combine_docs function reduces the tokens for the input\"\"\")\n", 285 | "\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "48a003e8", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [] 295 | } 296 | ], 297 | "metadata": { 298 | "language_info": { 299 | "name": "python" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 5 304 | } 305 | -------------------------------------------------------------------------------- /rag_skills/utils.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | import pandas as pd 3 | 4 | ############################################################## 5 | ###### Tokens ############# 6 | ############################################################## 7 | def count_tokens(string: str, encoding_name: str="gpt-4-32k") -> int: 8 | """Returns the number of tokens in a text string.""" 9 | encoding = tiktoken.encoding_for_model(encoding_name) 10 | num_tokens = len(encoding.encode(string)) 11 | return num_tokens 12 | 13 | ############################################################## 14 | ###### Get Prompt Template from csv ############# 15 | ############################################################## 16 | 17 | def get_prompt_template(prompt_id, prompt_templates_name=None): 18 | """ 19 | Retrieve LLM prompt template using prompt_id from a csv file. 
20 |     """
21 |     if prompt_templates_name is None:
22 |         prompt_templates_name = config['PROMPT_TEMPLATE_FILE']
23 |     df = pd.read_csv(os.path.join(os.path.dirname(__file__), prompt_templates_name))
24 |     prompt = df[df['prompt_id'] == prompt_id]['prompt_template'].values[0]
25 |     return prompt
--------------------------------------------------------------------------------
/samples/financial_transcripts/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM mambaorg/micromamba:0.15.3
2 | USER root
3 | RUN apt-get update && DEBIAN_FRONTEND="noninteractive" apt-get install -y --no-install-recommends \
4 |     nginx \
5 |     ca-certificates \
6 |     apache2-utils \
7 |     certbot \
8 |     python3-certbot-nginx \
9 |     sudo \
10 |     cifs-utils \
11 |     && \
12 |     rm -rf /var/lib/apt/lists/*
13 | RUN apt-get update && apt-get -y install cron
14 | RUN mkdir /opt/chatbot
15 | RUN chmod -R 777 /opt/chatbot
16 | WORKDIR /opt/chatbot
17 | USER micromamba
18 | EXPOSE 8000
19 | COPY ../../rag_skills /opt/chatbot/rag_skills
20 | COPY ../../environment.yaml ./environment.yaml
21 | COPY ../../llm.env llm.env
22 | RUN micromamba install -y -n base -f environment.yaml && \
23 |     micromamba clean --all --yes
24 | COPY /samples/financial_transcripts/ /opt/chatbot/
25 | USER root
26 | RUN chmod -R 777 /opt/chatbot
27 | USER micromamba
28 | ENTRYPOINT ["streamlit", "run"]
29 | CMD ["st_main.py","--server.port","8000","--theme.base","dark"]
30 | 
31 | 
--------------------------------------------------------------------------------
/samples/financial_transcripts/README.md:
--------------------------------------------------------------------------------
1 | # Project
2 | 
3 | This README provides a comprehensive guide to implementing a question-answering system using the Retrieval Augmented Generation (RAG) pattern on Microsoft (MSFT) earnings call transcripts. The process leverages Azure Cognitive Search (ACS) for creating a vector database and Azure Form Recognizer for processing raw documents.
4 | 
5 | Before commencing the project, ensure that you add the necessary keys to `example.env` in the root directory and rename it to `llm.env`. Specifically, we require keys for [Azure AI Document Intelligence (previously Form Recognizer)](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-3.1.0), [Azure OpenAI](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal), and [Azure Cognitive Search services](https://learn.microsoft.com/en-us/azure/search/search-create-service-portal).
6 | 
7 | Be sure to set a new name for `COGSEARCH_INDEX_NAME` in `llm.env`, because the Jupyter notebooks below will create the new vector database under this name.
8 | 
9 | ## Large Language Model (LLM) application
10 | 
11 | In this project, we've employed the RAG approach to develop an LLM application. This involves retrieving context from the database using a human query, augmenting the context, and then prompting GPT-style models to generate an answer. However, we encountered two key challenges with the basic RAG implementation:
12 | 
13 | 1. Context Size: At times, the context retrieved from the database is excessively large and doesn't fit within the prompt for GPT-style models. To address this, we utilized an intermediate GPT call to summarize or extract the pertinent information from the retrieved context.
14 | 
15 | 2. User Queries Requiring Chat History: Some user queries necessitate access to previous chat history for accurate responses. To tackle this, we summarize the chat history and augment it with the human query and context.
16 | 
17 | To address these challenges, we implemented the architecture shown in the figure below. The `chatBot` class in `chatBot.py` implements the different functionalities shown in the figure.
18 | 
19 | Chatbot Architecture
20 | 
21 | ## How to Run
22 | 
23 | ### Creating a Vector Database on Azure Cognitive Search (ACS)
24 | 
25 | Creating a vector database is a four-step process outlined below:
26 | 
27 | 1. `step0_data_preprocessor.ipynb` accesses the Word docs in `DATA\` and converts them to PDF so they can be used by Azure Form Recognizer in the next step.
28 | 2. `step1_chunk_and_extract.ipynb` chunks and extracts the PDF files using Azure Form Recognizer and saves the results to .csv files.
29 | 3. `step2_embed.ipynb` reads the chunks, embeds them using Azure OpenAI, and saves the embeddings to .csv files.
30 | 4. `step3_db_storing_vectorsearch.ipynb` reads the data, inserts it into ACS, and shows examples of various search capabilities using ACS hybrid search on the data.
31 | 
32 | ### Running the LLM application
33 | To run the LLM application, execute the `llm_app.py` file.
34 | 
35 | #### Deployment with Streamlit
36 | 
37 | 1. Run locally
38 | 
39 | ```
40 | streamlit run st_main.py --server.port 8000
41 | ```
42 | 
43 | 2. Build the Docker image. Since the `rag_skills/chatbotSkills.py` and `environment.yaml` files are in the parent directory, the Dockerfile only works if you run the command from the root of the repository.
44 | 
45 | ```
46 | docker build -t bot:v1 -f samples/financial_transcripts/Dockerfile .
47 | docker run --rm -p 8000:8000 bot:v1
48 | ```
49 | 
50 | Then open a web browser and go to `localhost:8000`.
51 | 
52 | ## Contributing
53 | 
54 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
55 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
56 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
57 | 
58 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
59 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
60 | provided by the bot. You will only need to do this once across all repos using our CLA.
61 | 
62 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
63 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
64 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
65 | 
66 | ## Trademarks
67 | 
68 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
69 | trademarks or logos is subject to and must follow
70 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
71 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
72 | Any use of third-party trademarks or logos is subject to those third parties' policies.
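## Appendix: querying the chatbot from Python

For quick experimentation outside Streamlit, the sketch below shows one way the `chatBot` class used by `llm_app.py` might be driven directly from a Python session in this folder. It assumes `llm.env` is configured as described above and that the ACS index uses the same column names as `llm_app.py` (`Embedding` and `Chunk`); the sample question is only an illustration. Questions must mention the ticker, quarter, and year so that `queryParser` in `chatBot.py` can build the ACS filter.

```python
import openai
from dotenv import dotenv_values
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from langchain.chat_models import AzureChatOpenAI
from chatBot import chatBot

# Load keys and endpoints (copy example.env to llm.env and fill it in).
config = dotenv_values("llm.env")
openai.api_type = config["OPENAI_API_TYPE"]
openai.api_key = config["OPENAI_API_KEY"]
openai.api_base = config["OPENAI_API_BASE"]
openai.api_version = config["OPENAI_API_VERSION"]

# Client for the ACS index created in steps 0-3 above.
search_client = SearchClient(
    endpoint=f"https://{config['COGSEARCH_NAME']}.search.windows.net",
    index_name=config["COGSEARCH_INDEX_NAME"],
    credential=AzureKeyCredential(config["COGSEARCH_API_KEY"]),
)

llm = AzureChatOpenAI(
    deployment_name=config["OPENAI_DEPLOYMENT_COMPLETION"],
    openai_api_base=openai.api_base,
    openai_api_version=openai.api_version,
    openai_api_key=openai.api_key,
    openai_api_type=openai.api_type,
    temperature=0.0,
)

bot = chatBot(llm, search_client, vectorColName="Embedding", chunkColName="Chunk")

# Queries should name the ticker, quarter, and year, for example:
print(bot.run("What were the key growth areas for ticker MSFT in quarter 1 of year 23?"))
```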
73 | 74 | 75 | -------------------------------------------------------------------------------- /samples/financial_transcripts/chatBot.py: -------------------------------------------------------------------------------- 1 | # Import required libraries 2 | import os 3 | import re 4 | import openai 5 | import sys 6 | from dotenv import dotenv_values 7 | from azure.core.credentials import AzureKeyCredential 8 | from azure.search.documents.models import Vector 9 | sys.path.append("../..") ## add directory above 10 | from rag_skills.chatbotSkills import qa_chain_ConversationSummaryMemory, combine_docs 11 | 12 | # Get the absolute path to the .env file 13 | env_name = os.path.join(os.path.dirname(__file__), "llm.env") 14 | 15 | # Load environment variables from the .env file 16 | config = dotenv_values(env_name) 17 | 18 | if len(config) == 0: 19 | env_name = os.path.join(os.path.dirname(__file__), "../../llm.env") 20 | config = dotenv_values(env_name) 21 | 22 | if len(config) == 0: 23 | raise Exception("No environment variables loaded. Please check the *.env file.") 24 | 25 | #Azure OpenAI 26 | openai.api_type = config["OPENAI_API_TYPE"] #"azure" 27 | openai.api_key = config['OPENAI_API_KEY'] 28 | openai.api_base = config['OPENAI_API_BASE'] 29 | openai.api_version = config['OPENAI_API_VERSION'] 30 | 31 | ## Cog Search 32 | cogsearch_name = config["COGSEARCH_NAME"] 33 | index_name = config["COGSEARCH_INDEX_NAME"] 34 | key = config["COGSEARCH_API_KEY"] 35 | service_endpoint = "https://"+config["COGSEARCH_NAME"] + ".search.windows.net" 36 | 37 | credential = AzureKeyCredential(key) 38 | 39 | def createEmbeddings(text): 40 | response = openai.Embedding.create(input=text , engine=config["OPENAI_DEPLOYMENT_EMBEDDING"]) 41 | embeddings = response['data'][0]['embedding'] 42 | return embeddings 43 | 44 | ## Retrieves relevant content from Azure Cognitive Search (ACS) 45 | def acs_retriever(search_client, query=None, queryEmbedding = None, 46 | colName=None, colVal=None, searchtype=None, numChunks=5, vectorColName="Embedding"): 47 | # query: user query 48 | # colName: List of column name to search in ACS columns 49 | # colVal: List of column values to search in ACS 50 | # searchtype options: "filter", "vector", "hybrid", filter vector", "filter hybrid" 51 | # vectorColName: Name of vector embedding in ACS 52 | 53 | if query is not None: 54 | vector = Vector(value=queryEmbedding, k=numChunks, fields=vectorColName) 55 | 56 | if colName == None: ## No filters 57 | if searchtype == None or searchtype == "vector": #(default vector) 58 | results = search_client.search(search_text=None, vectors= [vector]) 59 | else: # hybrid 60 | results = search_client.search(search_text=query, vectors= [vector]) 61 | 62 | else: ## Filters 63 | filter_str = " and ".join(f"({key} eq '{value}')" for key, value in zip(colName, colVal)) 64 | filter_str = f"({filter_str})" 65 | 66 | if query == None: #Pure filter 67 | results = search_client.search(search_text = None, filter = filter_str) 68 | elif searchtype == None or searchtype == "filter vector" or searchtype == "vector": # (default filter vector) 69 | results = search_client.search(search_text = None, vectors = [vector], filter = filter_str) 70 | else: # filter hybrid 71 | results = search_client.search(search_text = query, vectors = [vector], filter = filter_str) 72 | 73 | output = [result for result in results] 74 | return output 75 | 76 | def queryParser(query): 77 | # Extract ticker using regular expression 78 | ticker_match = re.search(r'\bticker\s+(\w+)', query, 
re.IGNORECASE) 79 | ticker = ticker_match.group(1) if ticker_match else None 80 | 81 | # Extract year using regular expression 82 | year_match = re.search(r'\byear\s+(\d{2})', query, re.IGNORECASE) 83 | year = int(year_match.group(1)) if year_match else None 84 | 85 | # Extract quarter using regular expression 86 | quarter_match = re.search(r'\bquarter\s+(\d)', query, re.IGNORECASE) 87 | quarter = quarter_match.group(1) if quarter_match else None 88 | 89 | return ticker, str(year), quarter 90 | 91 | 92 | ###################################### 93 | ## Chatbot 94 | ###################################### 95 | 96 | class chatBot: 97 | def __init__( 98 | self, 99 | llm, 100 | acs_search_client, 101 | max_token_for_context=16000, 102 | template_qa_chain=None, 103 | template_context_summarization=None, 104 | numChunks=10, 105 | vectorColName="contentVector", 106 | chunkColName="Chunk", 107 | to_debug=False 108 | ): 109 | 110 | # ACS 111 | self.search_client = acs_search_client 112 | self.numChunks = numChunks 113 | self.vectorColName = vectorColName 114 | self.chunkColName = chunkColName 115 | 116 | # LLM chain 117 | self.llm = llm 118 | self.max_token_for_context = max_token_for_context 119 | 120 | if template_qa_chain: 121 | self.template_qa_chain = template_qa_chain 122 | else: 123 | self.template_qa_chain= """You are a chatbot having a conversation with a human. 124 | Given the Context, Chat History, and Human Query, 125 | answer without hallucinating. 126 | If you don't have the answer say 'I don't have the answer' 127 | """ 128 | 129 | if template_context_summarization: 130 | self.template_context_summarization = template_context_summarization 131 | else: 132 | self.template_context_summarization = """Summarize the context so it includes 133 | the details related to the human query. 134 | """ 135 | 136 | # Memory chain 137 | self.qa_chain = qa_chain_ConversationSummaryMemory( 138 | prefix_template=self.template_qa_chain, 139 | to_debug=to_debug, 140 | llm=self.llm 141 | ) 142 | 143 | # Transcripts specific 144 | self.ticker = None 145 | self.year = None 146 | self.quarter = None 147 | 148 | 149 | def run(self, human_query): 150 | 151 | queryEmbedding = createEmbeddings(human_query) 152 | ############## Parse query 153 | ticker, year, quarter = queryParser(human_query) 154 | 155 | ## if user query doesn't contain ticker, year, and quarter, use the previous one 156 | if ticker != None: 157 | self.ticker = ticker 158 | if year != str(None): 159 | self.year = year 160 | if quarter != None: 161 | self.quarter = quarter 162 | 163 | ## If ticker, year, and quarter are not found, return error message 164 | if self.ticker == None or self.year == None or self.quarter == None: 165 | print(self.ticker, self.year, self.quarter) 166 | return "Sorry, please provide the ticker , year , and quarter . Example - ticker MSFT, quarter 3, year 23." 
167 | 168 | ############### Retrieve from ACS 169 | output = acs_retriever( 170 | self.search_client, 171 | query=human_query, 172 | queryEmbedding=queryEmbedding, 173 | colName=['Ticker', 'Year', 'Quarter'], 174 | colVal=[self.ticker, self.year, self.quarter], 175 | searchtype=None, 176 | numChunks=self.numChunks, 177 | vectorColName=self.vectorColName 178 | ) 179 | 180 | #################### Combine Context 181 | context_list = [i[self.chunkColName] for i in output] 182 | 183 | context_all = combine_docs( 184 | context_list, 185 | to_debug=False, 186 | llm=self.llm, 187 | max_tokens=self.max_token_for_context, 188 | user_query=human_query, 189 | prefix_template=self.template_context_summarization 190 | ) 191 | 192 | ## Append Ticker, Year, Quarter to the context 193 | context_all = "\nTicker: " + self.ticker + "\nYear: " + self.year + "\nQuarter: " + self.quarter + "\n"+ context_all 194 | 195 | # Augment and Generate Answer 196 | # qa_chain below is a predefined chain with memory. It summarizes the chat history and augment to the context 197 | answer = self.qa_chain.run({'context': context_all,'human_input': human_query}) 198 | return answer 199 | 200 | 201 | def retrieveChatHistory(self): 202 | return self.qa_chain.memory.chat_memory 203 | 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /samples/financial_transcripts/images/MSFT.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/rag-e2e-sample/96459d2e40f2778030f36e8812e37deecf34335f/samples/financial_transcripts/images/MSFT.jpg -------------------------------------------------------------------------------- /samples/financial_transcripts/images/Microsoft_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/rag-e2e-sample/96459d2e40f2778030f36e8812e37deecf34335f/samples/financial_transcripts/images/Microsoft_logo.png -------------------------------------------------------------------------------- /samples/financial_transcripts/images/Microsoft_logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/financial_transcripts/images/chatbot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/rag-e2e-sample/96459d2e40f2778030f36e8812e37deecf34335f/samples/financial_transcripts/images/chatbot.jpg -------------------------------------------------------------------------------- /samples/financial_transcripts/images/openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/rag-e2e-sample/96459d2e40f2778030f36e8812e37deecf34335f/samples/financial_transcripts/images/openai.png -------------------------------------------------------------------------------- /samples/financial_transcripts/llm_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | from dotenv import dotenv_values 4 | from langchain.chat_models import AzureChatOpenAI 5 | from azure.core.credentials import AzureKeyCredential 6 | from azure.search.documents import SearchClient 7 | import time 8 | import sys 9 | from chatBot import chatBot 10 | sys.path.append("../..") ## add directory above 11 | from 
rag_skills.utils import count_tokens 12 | 13 | ### Cofigurations 14 | VERBOSE = True 15 | TEMPERATURE = 0.0 16 | TOP_P = 1.0 17 | NUM_CHUNKS = 10 18 | MAX_TOKEN_FOR_CONTEXT = 27000 19 | VECTOR_COL_NAME = "Embedding" ## Column name in ACS for vector embedding 20 | CHUNK_NAME = "Chunk" ## Column name in ACS for text data that contains the context 21 | 22 | TEMPLATE_QA_CHAIN = """You are a chatbot having a conversation with a human. 23 | Given the Context, Chat History, and Human Query, 24 | answer without hallucinating. If you don't have the answer say "I don't have the answer" """ 25 | 26 | TEMPLATE_CONTEXT_SUMMARIZATION = """ 27 | Summarize the context so it includes the details related to the human query. """ 28 | 29 | # Get the absolute path to the .env file 30 | env_name = os.path.join(os.path.dirname(__file__), "llm.env") 31 | 32 | # Load environment variables from the .env file 33 | config = dotenv_values(env_name) 34 | 35 | if len(config) == 0: 36 | env_name = os.path.join(os.path.dirname(__file__), "../../llm.env") 37 | config = dotenv_values(env_name) 38 | 39 | if len(config) == 0: 40 | raise Exception("No environment variables loaded. Please check the *.env file.") 41 | 42 | for key, value in config.items(): 43 | os.environ[key] = value 44 | 45 | # LOAD OpenAI configs 46 | openai.api_type = config["OPENAI_API_TYPE"] 47 | openai.api_key = config['OPENAI_API_KEY'] 48 | openai.api_base = config['OPENAI_API_BASE'] 49 | openai.api_version = config['OPENAI_API_VERSION'] 50 | print("ENV VARIABLES LOADED") 51 | 52 | # Model choice 53 | DEPLOYMENT_NAME = config['OPENAI_DEPLOYMENT_COMPLETION'] 54 | 55 | ## Azure cognitive search 56 | cogsearch_name = os.getenv("COGSEARCH_NAME") 57 | index_name = os.getenv("COGSEARCH_INDEX_NAME") 58 | cogsearch_api_key = os.getenv("COGSEARCH_API_KEY") 59 | service_endpoint = "https://" + config["COGSEARCH_NAME"] + ".search.windows.net" 60 | 61 | credential = AzureKeyCredential(cogsearch_api_key) 62 | search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential) 63 | 64 | optional_params = { 65 | 'top_p': TOP_P, 66 | } 67 | 68 | llm = AzureChatOpenAI( 69 | deployment_name=DEPLOYMENT_NAME, 70 | openai_api_base=openai.api_base, 71 | openai_api_version=openai.api_version, 72 | openai_api_key=openai.api_key, 73 | openai_api_type = openai.api_type, 74 | temperature=TEMPERATURE, 75 | model_kwargs=optional_params, 76 | verbose=VERBOSE, 77 | ) 78 | 79 | ## Chatbot class that implements the conversation agent 80 | 81 | cb = chatBot( 82 | llm, 83 | search_client, 84 | max_token_for_context=MAX_TOKEN_FOR_CONTEXT, 85 | template_qa_chain=TEMPLATE_QA_CHAIN, 86 | template_context_summarization=TEMPLATE_CONTEXT_SUMMARIZATION, 87 | numChunks=NUM_CHUNKS, 88 | vectorColName=VECTOR_COL_NAME, 89 | chunkColName=CHUNK_NAME, 90 | to_debug=VERBOSE 91 | ) 92 | 93 | def get_answer(msg): 94 | ans = cb.run(msg) 95 | 96 | return ans 97 | 98 | 99 | if __name__ == '__main__': 100 | question = """what are the top 3 themes in the earnings call transcripts from ticker MSFT for the quarter 1 in year 23? 
101 | """ 102 | 103 | start = time.time() 104 | ans = get_answer(question) 105 | end = time.time() 106 | print(ans) 107 | print("Time elapsed: {}".format(end-start)) 108 | 109 | result_num_tokens = count_tokens(ans) 110 | print("Response num tokens: {}".format(result_num_tokens)) 111 | 112 | ans = get_answer("Is it possible to get more details about cloud ?") 113 | print(ans) -------------------------------------------------------------------------------- /samples/financial_transcripts/st_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | streamlit: 3 | title: "MSFT - Financial earnings call assistant" 4 | tab_title: "LLM assistant" 5 | logo: "images/Microsoft_logo.png" 6 | page_icon: "images/Microsoft_logo.png" 7 | avatar: "images/openai.png" 8 | assistant_intro_message: "Hi there :wave:, I'm an AI assistant. I can look up information through a database of earnings call transcripts to answer your questions. Please always specify the ticker and quarter in your question for example: \"ticker MSFT, quarter 1, year 23. How did MSFT do compared to last quarter?\"" 9 | about: "A demo of an AI assistant. Powered by Azure OpenAI Large Language Models and search using native vector search capabilities on PostGres, authors: Azure Data/Applied AI team" 10 | azure: 11 | dummy: "test" 12 | -------------------------------------------------------------------------------- /samples/financial_transcripts/st_main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import yaml 3 | import os 4 | import openai 5 | 6 | from llm_app import get_answer 7 | 8 | # Read config yaml file 9 | with open('st_config.yaml', 'r') as file: 10 | config = yaml.safe_load(file) 11 | #print(config) 12 | title = config['streamlit']['title'] 13 | avatar = { 14 | 'user': None, 15 | 'assistant': config['streamlit']['avatar'] 16 | } 17 | 18 | # Set page config 19 | st.set_page_config( 20 | page_title=config['streamlit']['tab_title'], 21 | page_icon=config['streamlit']['page_icon'], 22 | ) 23 | 24 | # Set sidebar 25 | st.sidebar.image(config['streamlit']['logo'], width=50) 26 | st.sidebar.title("About") 27 | st.sidebar.info(config['streamlit']['about']) 28 | 29 | # Set logo 30 | #st.image(config['streamlit']['logo'], width=50) 31 | 32 | # Set page title 33 | st.title(title) 34 | 35 | # Initialize chat history 36 | if "messages" not in st.session_state: 37 | st.session_state.messages = [] 38 | st.session_state.messages.append({ 39 | "role": "assistant", 40 | "content": config['streamlit']['assistant_intro_message'] 41 | }) 42 | 43 | # Display chat messages from history on app rerun 44 | for message in st.session_state.messages: 45 | with st.chat_message(message["role"], avatar=avatar[message["role"]]): 46 | st.markdown(message["content"]) 47 | 48 | # React to user input 49 | if prompt := st.chat_input("Send a message"): 50 | # Add user message to chat history 51 | st.session_state.messages.append({"role": "user", "content": prompt}) 52 | # Display user message in chat message container 53 | with st.chat_message("user"): 54 | st.markdown(prompt) 55 | # Get bot response 56 | response = get_answer(prompt) 57 | with st.chat_message("assistant", avatar=config['streamlit']['avatar']): 58 | st.markdown(response) 59 | # Add assistant response to chat history 60 | st.session_state.messages.append({"role": "assistant", "content": response}) 61 | 62 | 63 | 64 | 
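# Note: st_main.py is a thin wrapper around llm_app.get_answer, so the pipeline can be
# smoke-tested without the UI. A minimal sketch (assuming llm.env is populated and the
# Azure Cognitive Search index built in the notebooks below already exists):
#
#   from llm_app import get_answer
#   print(get_answer("ticker MSFT, quarter 2, year 23. What were the Azure highlights?"))
#   # A follow-up may omit ticker/year/quarter; the chatbot reuses the previous values.
#   print(get_answer("How did operating expenses change?"))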
-------------------------------------------------------------------------------- /samples/financial_transcripts/step0_data_preprocessor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Step1: Download data from azure blob storage\n", 8 | "\n", 9 | "This code downloads the Msft financial transcripts from a blob storage. If you do not have access to blob, then download \"Microsoft Earning Call Transcripts\" for four quarters for year 2023 and put it in \"Data\" folder. Make sure to rename the file similar to \"MSFTTranscriptFY23Q4.docx\"\n", 10 | "\n", 11 | "Msft Earning Call Transcripts for 2023-Q4\n", 12 | "https://www.fool.com/earnings/call-transcripts/2023/07/25/microsoft-msft-q4-2023-earnings-call-transcript/\n", 13 | "\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from azure.storage.blob import BlobServiceClient\n", 23 | "import os\n", 24 | "from pathlib import Path\n", 25 | "\n", 26 | "# Name of the container in the Blob Storage\n", 27 | "container_name = \"public\"\n", 28 | "\n", 29 | "# Local directory path to save the downloaded files\n", 30 | "local_directory = Path(\"DATA/\")\n", 31 | "\n", 32 | "def download_files_from_blob_storage(container_name, local_directory):\n", 33 | " # Create a BlobServiceClient using the default credentials (public access)\n", 34 | " blob_service_client = BlobServiceClient.from_connection_string(\"DefaultEndpointsProtocol=https;AccountName=appliedaipublicdata;EndpointSuffix=core.windows.net\")\n", 35 | "\n", 36 | " # Get a reference to the container\n", 37 | " container_client = blob_service_client.get_container_client(container_name)\n", 38 | "\n", 39 | " # List all blobs in the container\n", 40 | " blob_list = container_client.list_blobs()\n", 41 | "\n", 42 | " for blob in blob_list:\n", 43 | " blob_name = blob.name\n", 44 | " print(blob_name)\n", 45 | " \n", 46 | " # Check if the blob has a .docx extension (Word document)\n", 47 | " if blob_name.lower().endswith(\".docx\"):\n", 48 | " blob_client = container_client.get_blob_client(blob_name)\n", 49 | " \n", 50 | " # Construct the local file path to save the blob\n", 51 | " local_file_path = os.path.join(local_directory, blob_name.split(\"/\")[-1]) # Use only the last part of the blob path\n", 52 | " \n", 53 | " # Download the blob to the local directory\n", 54 | " with open(local_file_path, \"wb\") as local_file:\n", 55 | " blob_data = blob_client.download_blob()\n", 56 | " local_file.write(blob_data.readall())\n", 57 | " \n", 58 | " print(f\"Downloaded: {blob_name}\")\n", 59 | "\n", 60 | "\n", 61 | "download_files_from_blob_storage(container_name, local_directory)\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Step 2: Convert .docx to .pdf format" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "from docx2pdf import convert\n", 78 | "import os\n", 79 | "\n", 80 | "directory = Path('DATA')\n", 81 | "docx_files = [filename for filename in os.listdir(directory) if filename.endswith('.docx')]\n", 82 | "name_len_docx = []\n", 83 | "name_len_pdf = []\n", 84 | "print(len(docx_files))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | 
"for filename in docx_files:\n", 94 | " \n", 95 | " docx_path = os.path.join(directory, filename)\n", 96 | " # if len(filename)>35:\n", 97 | " # filename = filename[:35]\n", 98 | " pdf_path = os.path.join(directory, f\"{os.path.splitext(filename)[0]}.pdf\")\n", 99 | "\n", 100 | " # Check if PDF already exists\n", 101 | " if os.path.exists(pdf_path):\n", 102 | " print(f\"Skipping conversion for {filename}. PDF already exists.\")\n", 103 | " continue\n", 104 | "\n", 105 | " name_len_docx.append(len(docx_path))\n", 106 | " print(filename, name_len_docx)\n", 107 | " name_len_pdf.append(len(pdf_path))\n", 108 | " print(name_len_pdf)\n", 109 | " try: \n", 110 | " convert(docx_path, pdf_path)\n", 111 | " except:\n", 112 | " print('Error in converting file, retrying...')\n", 113 | " try:\n", 114 | " convert(docx_path, pdf_path)\n", 115 | " except:\n", 116 | " Exception(\"Error in converting file\")\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "nbdev", 130 | "language": "python", 131 | "name": "nbdev" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.8.11" 144 | }, 145 | "orig_nbformat": 4 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } 150 | -------------------------------------------------------------------------------- /samples/financial_transcripts/step1_chunk_and_extract.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from dotenv import dotenv_values\n", 10 | "from pathlib import Path\n", 11 | "import os\n", 12 | "import pandas as pd\n", 13 | "from azure.core.credentials import AzureKeyCredential\n", 14 | "from azure.ai.formrecognizer import DocumentAnalysisClient\n", 15 | "\n", 16 | "# specify the name of the .env file name \n", 17 | "env_name = \"../../llm.env\" # change to your own .env file name\n", 18 | "config = dotenv_values(env_name)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Extract data and context using Azure Form Recognizer\n", 26 | "\n", 27 | "This code sample shows Prebuilt Document operations with the Azure Form Recognizer client library. \n", 28 | "The async versions of the samples require Python 3.6 or later.\n", 29 | "\n", 30 | "To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs\n", 31 | "https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quickstarts/try-v3-python-sdk\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "\"\"\"\n", 41 | "Remember to remove the key from your code when you're done, and never post it publicly. For production, use\n", 42 | "secure methods to store and access your credentials. 
For more information, see \n", 43 | "https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration\n", 44 | "\"\"\"\n", 45 | "\n", 46 | "endpoint = config[\"AZURE_FORM_RECOGNIZER_ENDPOINT\"]\n", 47 | "key = config[\"AZURE_FORM_RECOGNIZER_KEY\"]\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Read pdf files using Azure Form Recognizer and split into chunks \n", 55 | "Azure form recognizer reads pdf files and then we chunk the extracted text, and also save page number and line number for the extracted chunks " 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "import re\n", 65 | "\n", 66 | "################################################################################\n", 67 | "#################### Helper Functions ##########################################\n", 68 | "################################################################################\n", 69 | "\n", 70 | "# Read pdf files\n", 71 | "def analyze_pdf(doc_path): \n", 72 | " with open(doc_path, \"rb\") as f:\n", 73 | " poller = document_analysis_client.begin_analyze_document(\n", 74 | " \"prebuilt-document\", document=f\n", 75 | " )\n", 76 | " result = poller.result()\n", 77 | " \n", 78 | " return result\n", 79 | "\n", 80 | "# Extract stock symbol, year, and quarter from filename\n", 81 | "def extract_info_from_filename(filename):\n", 82 | " '''\n", 83 | " Input: filename (\"MSFTTranscriptFY23Q4\")\n", 84 | " Output: Extract stock symbol, year and quarter from filename\n", 85 | " '''\n", 86 | " pattern = r'([A-Z]+)TranscriptFY(\\d{2})Q(\\d)'\n", 87 | " match = re.search(pattern, filename)\n", 88 | " \n", 89 | " if match:\n", 90 | " symbol = match.group(1)\n", 91 | " fiscal_year = match.group(2)\n", 92 | " fiscal_quarter = match.group(3)\n", 93 | " return symbol, fiscal_year, fiscal_quarter\n", 94 | " else:\n", 95 | " return None\n", 96 | "\n", 97 | "# Extract line number and page number\n", 98 | "def create_line_page_tuples(result):\n", 99 | " '''\n", 100 | " Input: result of form recognizer analyze_pdf function\n", 101 | " Output: Create list of tuples of the form (line, page_num, line_num) \n", 102 | " This will keep reference of the line number and page number of each line in the document.\n", 103 | " '''\n", 104 | " line_page_tuples = []\n", 105 | "\n", 106 | " total_pages = len(result.pages)\n", 107 | " for page_num in range(total_pages):\n", 108 | " lines = result.pages[page_num].lines\n", 109 | " total_lines = len(lines)\n", 110 | "\n", 111 | " for line_num in range(total_lines):\n", 112 | " line = lines[line_num].content\n", 113 | " line_page_tuples.append((line, page_num + 1, line_num + 1))\n", 114 | "\n", 115 | " return line_page_tuples\n", 116 | "\n", 117 | "# Retrieve page number and chunks\n", 118 | "def chunk_with_page_number(line_page_tuples, chunk_length=10, chunk_overlap=2):\n", 119 | " '''\n", 120 | " Given the list of tuples of the form (line, page_num, line_num) and chunk length and overlap,\n", 121 | " it will create chunks of text with page number and line number of the first line in the chunk.\n", 122 | " chunk length: number of lines in each chunk\n", 123 | " chunk_overlap: number of overlapping lines between chunks\n", 124 | " '''\n", 125 | " pointer = 0 \n", 126 | " chunks = []\n", 127 | " total_lines = len(line_page_tuples)\n", 128 | " #for line, page_number, 
line_number in line_page_tuples:\n", 129 | " while pointer < total_lines:\n", 130 | " line_count = 0\n", 131 | " current_chunk = \"\"\n", 132 | " if not chunks: \n", 133 | " # for first chunk we can not use overlap\n", 134 | " pointer = 0\n", 135 | " else:\n", 136 | " pointer = pointer - chunk_overlap\n", 137 | " \n", 138 | " # take starting page number and line number \n", 139 | " page_number, line_number = line_page_tuples[pointer][1:] \n", 140 | " while line_count < chunk_length and pointer < total_lines:\n", 141 | " current_chunk = current_chunk + line_page_tuples[pointer][0]\n", 142 | " current_chunk = current_chunk + \" \"\n", 143 | " line_count += 1\n", 144 | " pointer += 1\n", 145 | " chunks.append((current_chunk, page_number, line_number))\n", 146 | " return chunks\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 4, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "writing the results of: \n", 159 | "MSFTTranscriptFY23Q1.pdf\n", 160 | "writing the results of: \n", 161 | "MSFTTranscriptFY23Q2.pdf\n", 162 | "writing the results of: \n", 163 | "MSFTTranscriptFY23Q3.pdf\n", 164 | "writing the results of: \n", 165 | "MSFTTranscriptFY23Q4.pdf\n" 166 | ] 167 | }, 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
[HTML rendering of the chunk DataFrame preview removed; the same rows appear in the text/plain output below.]
" 252 | ], 253 | "text/plain": [ 254 | " Id Ticker Year Quarter Chunk \\\n", 255 | "0 1 MSFT 23 1 Microsoft FY23 First Quarter Earnings Conferen... \n", 256 | "1 2 MSFT 23 1 On the Microsoft Investor Relations website, y... \n", 257 | "2 3 MSFT 23 1 GAAP. They are included as additional clarifyi... \n", 258 | "3 4 MSFT 23 1 same in constant currency, we will refer to th... \n", 259 | "4 5 MSFT 23 1 predictions, projections, or other statements ... \n", 260 | "\n", 261 | " PageNumber LineNumber \n", 262 | "0 1 1 \n", 263 | "1 1 9 \n", 264 | "2 1 17 \n", 265 | "3 2 6 \n", 266 | "4 2 14 " 267 | ] 268 | }, 269 | "execution_count": 4, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "# Define document analysis client\n", 276 | "document_analysis_client = DocumentAnalysisClient(\n", 277 | " endpoint=endpoint, credential=AzureKeyCredential(key)\n", 278 | " )\n", 279 | "\n", 280 | "doc_dir = Path(\"DATA/\")\n", 281 | "pdf_files = [filename for filename in os.listdir(doc_dir) if filename.endswith('.pdf')]\n", 282 | "\n", 283 | "dfs = []\n", 284 | "\n", 285 | "for file_name in pdf_files:\n", 286 | " \n", 287 | " values = extract_info_from_filename(file_name) # symbol, fiscal_year, fiscal_quarter\n", 288 | " file_path = os.path.join(doc_dir, f\"{os.path.splitext(file_name)[0]}.pdf\")\n", 289 | " \n", 290 | " # analyze the pdf using form recognizer\n", 291 | " result = analyze_pdf(file_path)\n", 292 | " \n", 293 | " # get the chunks in a tuple of the form (chunk, page_number, line_number)\n", 294 | " line_page_tuples = create_line_page_tuples(result)\n", 295 | " chunks = chunk_with_page_number(line_page_tuples=line_page_tuples, chunk_length=10, chunk_overlap=2)\n", 296 | " \n", 297 | " # Write results to dataframe \n", 298 | " df_chunks = pd.DataFrame(chunks, columns = ['Chunk', 'PageNumber', 'LineNumber']) \n", 299 | "\n", 300 | " df_chunks[\"Ticker\"], df_chunks[\"Year\"], df_chunks[\"Quarter\"] = \"NULL\", \"NULL\", \"NULL\"\n", 301 | " if values:\n", 302 | " symbol, fiscal_year, fiscal_quarter = values\n", 303 | " df_chunks[\"Ticker\"], df_chunks[\"Year\"], df_chunks[\"Quarter\"] = symbol, fiscal_year, fiscal_quarter\n", 304 | " \n", 305 | " # Reorder dataframe column name\n", 306 | " new_column_order = ['Ticker', 'Year', 'Quarter', 'Chunk', 'PageNumber', 'LineNumber']\n", 307 | " df_chunks = df_chunks[new_column_order]\n", 308 | " \n", 309 | " # Add all datframe to list\n", 310 | " dfs.append(df_chunks)\n", 311 | "\n", 312 | " # Saving results to csv files\n", 313 | " if not os.path.exists(\"AnalyzedPDF/\"):\n", 314 | " os.makedirs(\"AnalyzedPDF/\")\n", 315 | "\n", 316 | " print('writing the results of: \\n' + file_name) \n", 317 | " if not os.path.exists(f\"AnalyzedPDF/Chunks_{file_name[0:-4]}.csv\"):\n", 318 | " df_chunks.to_csv(f\"AnalyzedPDF/Chunks_{file_name[0:-4]}.csv\", index=False)\n", 319 | " else:\n", 320 | " print(f'File: chunks_{file_name}.csv already exists, skipping...')\n", 321 | " \n", 322 | "## Combine all the files\n", 323 | "df = pd.concat(dfs, ignore_index=True)\n", 324 | "df = df.reset_index(drop=True)\n", 325 | "df.insert(0, 'Id', [i for i in range(1, df.shape[0]+1)]) # Add 'Id' column\n", 326 | "\n", 327 | "## Save to csv\n", 328 | "df.to_csv(\"AnalyzedPDF/Chunks.csv\", index=False)\n", 329 | "\n", 330 | "df.head()\n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [] 339 | } 340 | ], 341 | "metadata": { 342 | "kernelspec": { 343 
| "display_name": "nanogpt", 344 | "language": "python", 345 | "name": "nanogpt" 346 | }, 347 | "language_info": { 348 | "codemirror_mode": { 349 | "name": "ipython", 350 | "version": 3 351 | }, 352 | "file_extension": ".py", 353 | "mimetype": "text/x-python", 354 | "name": "python", 355 | "nbconvert_exporter": "python", 356 | "pygments_lexer": "ipython3", 357 | "version": "3.8.16" 358 | } 359 | }, 360 | "nbformat": 4, 361 | "nbformat_minor": 2 362 | } 363 | -------------------------------------------------------------------------------- /samples/financial_transcripts/step2_embed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chunk Embedding using Azure OpenAI " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Load environment variables and keys " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from dotenv import dotenv_values\n", 24 | "\n", 25 | "# specify the name of the .env file name \n", 26 | "env_name = \"../../llm.env\" # change to your own .env file name\n", 27 | "config = dotenv_values(env_name)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "#### Load the chunks and create embedding\n", 35 | "In this section, we will load the data into a pandas dataframe, use select columns, and create vector embedding using azure open ai. " 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import openai\n", 45 | "import pandas as pd\n", 46 | "import pandas as pd\n", 47 | "import numpy as np\n", 48 | "import time\n", 49 | "\n", 50 | "openai.api_type = config[\"OPENAI_API_TYPE\"] \n", 51 | "openai.api_key = config[\"OPENAI_API_KEY\"]\n", 52 | "openai.api_base = config[\"OPENAI_API_BASE\"] \n", 53 | "openai.api_version = config[\"OPENAI_API_VERSION\"] \n", 54 | "\n", 55 | "def createEmbeddings(text):\n", 56 | " response = openai.Embedding.create(input=text , engine=config[\"OPENAI_DEPLOYMENT_EMBEDDING\"])\n", 57 | " embeddings = response['data'][0]['embedding']\n", 58 | " return embeddings\n", 59 | "\n", 60 | "# Read data into a DataFrame\n", 61 | "df = pd.read_csv('AnalyzedPDF/Chunks.csv')\n", 62 | "\n", 63 | "# Create a new column called 'embedding' in the DataFrame\n", 64 | "df['Embedding'] = np.empty((len(df),), dtype=object)\n", 65 | "\n", 66 | "# Iterate over each row in the DataFrame and assign the concatenation and embeddings\n", 67 | "for index, row in df.iterrows():\n", 68 | " text = row['Chunk']\n", 69 | " \n", 70 | " # Concatenate the desired columns\n", 71 | " concat_text = f\"{text}\"\n", 72 | " \n", 73 | " # Create embeddings using the provided function\n", 74 | " embeddings = createEmbeddings(concat_text)\n", 75 | " #print(embeddings)\n", 76 | " \n", 77 | " # Assign the embeddings to the 'embedding' column\n", 78 | " df.at[index, 'Embedding'] = embeddings\n", 79 | " time.sleep(0.1)\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "We will rename the column names and add a new column as primary index." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/html": [ 97 | "
[HTML rendering of the embeddings DataFrame preview (442 rows × 8 columns) removed; the same preview appears in the text/plain output below.]
" 251 | ], 252 | "text/plain": [ 253 | " Id Ticker Year Quarter \\\n", 254 | "0 1 MSFT 23 1 \n", 255 | "1 2 MSFT 23 1 \n", 256 | "2 3 MSFT 23 1 \n", 257 | "3 4 MSFT 23 1 \n", 258 | "4 5 MSFT 23 1 \n", 259 | ".. ... ... ... ... \n", 260 | "437 438 MSFT 23 4 \n", 261 | "438 439 MSFT 23 4 \n", 262 | "439 440 MSFT 23 4 \n", 263 | "440 441 MSFT 23 4 \n", 264 | "441 442 MSFT 23 4 \n", 265 | "\n", 266 | " Chunk PageNumber \\\n", 267 | "0 Microsoft FY23 First Quarter Earnings Conferen... 1 \n", 268 | "1 On the Microsoft Investor Relations website, y... 1 \n", 269 | "2 GAAP. They are included as additional clarifyi... 1 \n", 270 | "3 same in constant currency, we will refer to th... 2 \n", 271 | "4 predictions, projections, or other statements ... 2 \n", 272 | ".. ... ... \n", 273 | "437 Can you just talk about where customers are ri... 44 \n", 274 | "438 complement, I'll call it, your databases, beca... 45 \n", 275 | "439 with a very disruptive business model. I mean,... 45 \n", 276 | "440 architecture lays out, our business model arou... 46 \n", 277 | "441 speaking with all of you soon. SATYA NADELLA: ... 46 \n", 278 | "\n", 279 | " LineNumber Embedding \n", 280 | "0 1 [-0.022691458463668823, -0.028929660096764565,... \n", 281 | "1 9 [-0.022940216585993767, -0.008343684487044811,... \n", 282 | "2 17 [-0.01130777969956398, -0.0038822712376713753,... \n", 283 | "3 6 [-0.017685849219560623, -0.02943631075322628, ... \n", 284 | "4 14 [-0.00915693398565054, -0.019673412665724754, ... \n", 285 | ".. ... ... \n", 286 | "437 19 [-0.004939808044582605, 0.000936132506467402, ... \n", 287 | "438 7 [-0.0132768414914608, 0.004370962269604206, -0... \n", 288 | "439 15 [-0.013180367648601532, -0.007650672923773527,... \n", 289 | "440 2 [0.003990992438048124, -0.0018922516610473394,... \n", 290 | "441 10 [-0.013442852534353733, -0.01743759959936142, ... \n", 291 | "\n", 292 | "[442 rows x 8 columns]" 293 | ] 294 | }, 295 | "execution_count": 3, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "# Print the DataFrame with 'Id' as the first column after index\n", 302 | "df.head(1000)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Use the following code to save the embeddings and processed data for future use or skip the previous part of the code and and load the processed data to save into the db. " 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 4, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "# save CSV for future use. 
\n", 319 | "df.to_csv('AnalyzedPDF/ChunksEmbedding.csv', index=False)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [] 328 | } 329 | ], 330 | "metadata": { 331 | "kernelspec": { 332 | "display_name": "nanogpt", 333 | "language": "python", 334 | "name": "nanogpt" 335 | }, 336 | "language_info": { 337 | "codemirror_mode": { 338 | "name": "ipython", 339 | "version": 3 340 | }, 341 | "file_extension": ".py", 342 | "mimetype": "text/x-python", 343 | "name": "python", 344 | "nbconvert_exporter": "python", 345 | "pygments_lexer": "ipython3", 346 | "version": "3.8.16" 347 | } 348 | }, 349 | "nbformat": 4, 350 | "nbformat_minor": 2 351 | } 352 | -------------------------------------------------------------------------------- /samples/financial_transcripts/step3_db_storing_vectorsearch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Store chunks into Vector Database using Azure Cognitive Search (ACS)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 28, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import re\n", 18 | "import pandas as pd\n", 19 | "import json \n", 20 | "import openai \n", 21 | "from dotenv import load_dotenv\n", 22 | "from tenacity import retry, wait_random_exponential, stop_after_attempt \n", 23 | "from azure.core.credentials import AzureKeyCredential \n", 24 | "from azure.search.documents import SearchClient \n", 25 | "from azure.search.documents.indexes import SearchIndexClient \n", 26 | "from azure.search.documents.models import Vector \n", 27 | "from azure.search.documents.indexes.models import ( \n", 28 | " SearchIndex, \n", 29 | " SearchField, \n", 30 | " SearchFieldDataType, \n", 31 | " SimpleField, \n", 32 | " SearchableField, \n", 33 | " SearchIndex, \n", 34 | " SemanticConfiguration, \n", 35 | " PrioritizedFields, \n", 36 | " SemanticField, \n", 37 | " SearchField, \n", 38 | " SemanticSettings, \n", 39 | " VectorSearch, \n", 40 | " HnswVectorSearchAlgorithmConfiguration\n", 41 | ")\n", 42 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 43 | "from ast import literal_eval" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Load environment variables and keys " 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 17, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from dotenv import dotenv_values\n", 60 | "\n", 61 | "# specify the name of the .env file name \n", 62 | "env_name = \"../../llm.env\" # change to your own .env file name\n", 63 | "config = dotenv_values(env_name)\n", 64 | "\n", 65 | "# Azure OpenAI\n", 66 | "openai.api_type = config[\"OPENAI_API_TYPE\"] #\"azure\"\n", 67 | "openai.api_key = config['OPENAI_API_KEY']\n", 68 | "openai.api_base = config['OPENAI_API_BASE']\n", 69 | "openai.api_version = config['OPENAI_API_VERSION']\n", 70 | "\n", 71 | "## Cog Search\n", 72 | "cogsearch_name = config[\"COGSEARCH_NAME\"]\n", 73 | "index_name = config[\"COGSEARCH_INDEX_NAME\"]\n", 74 | "key = config[\"COGSEARCH_API_KEY\"]\n", 75 | "service_endpoint = \"https://\"+config[\"COGSEARCH_NAME\"] + \".search.windows.net\"\n", 76 | "\n", 77 | "credential = AzureKeyCredential(key)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 34, 83 | "metadata": {}, 84 | 
"outputs": [], 85 | "source": [ 86 | "def createEmbeddings(text):\n", 87 | " response = openai.Embedding.create(input=text , engine=config[\"OPENAI_DEPLOYMENT_EMBEDDING\"])\n", 88 | " embeddings = response['data'][0]['embedding']\n", 89 | " return embeddings" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Store the embeddings in Azure Cognitive Search Vector Store\n", 97 | "\n", 98 | "[AzureCogSearch](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) provides a simple interface to create a vector database, store and retrieve data using vector search. You can read more about [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main) more about Vector Search.\n", 99 | "\n", 100 | "There are two steps to store data in AzureCogSearch vector database:\n", 101 | "- First, we create the index (or schema) of the vector database\n", 102 | "- Second, we add the chunked documents and their embeddings to the vector datastore" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 18, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "df_chunks_embedding = pd.read_csv('AnalyzedPDF/ChunksEmbedding.csv')" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 19, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
[HTML rendering of the DataFrame preview removed; the same rows appear in the text/plain output below.]
" 187 | ], 188 | "text/plain": [ 189 | " Id Ticker Year Quarter \\\n", 190 | "0 1 MSFT 23 1 \n", 191 | "1 2 MSFT 23 1 \n", 192 | "2 3 MSFT 23 1 \n", 193 | "\n", 194 | " Chunk PageNumber LineNumber \\\n", 195 | "0 Microsoft FY23 First Quarter Earnings Conferen... 1 1 \n", 196 | "1 On the Microsoft Investor Relations website, y... 1 9 \n", 197 | "2 GAAP. They are included as additional clarifyi... 1 17 \n", 198 | "\n", 199 | " Embedding \n", 200 | "0 [-0.022691456601023674, -0.028929658234119415,... \n", 201 | "1 [-0.022940216585993767, -0.008343684487044811,... \n", 202 | "2 [-0.01130777969956398, -0.0038822712376713753,... " 203 | ] 204 | }, 205 | "execution_count": 19, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "df_chunks_embedding.head(3)\n", 212 | "#columns should look like the following with order preserved\n", 213 | "#Id, Chunk, PageNumber, LineNumber, DocId, Embedding" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 20, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | " rag_prop_j_3 created\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "\n", 231 | "# Create a search index\n", 232 | "index_client = SearchIndexClient(\n", 233 | " endpoint=service_endpoint, credential=credential)\n", 234 | "fields = [\n", 235 | " SimpleField(name=\"Id\", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),\n", 236 | " SearchableField(name=\"Ticker\", type=SearchFieldDataType.String, filterable=True),\n", 237 | " SearchableField(name=\"Year\", type=SearchFieldDataType.String, filterable=True),\n", 238 | " SearchableField(name=\"Quarter\", type=SearchFieldDataType.String, filterable=True),\n", 239 | " SearchableField(name=\"Chunk\", type=SearchFieldDataType.String, searchable=True),\n", 240 | " SearchableField(name=\"PageNumber\", type=SearchFieldDataType.String, filterable=True),\n", 241 | " SearchableField(name=\"LineNumber\", type=SearchFieldDataType.String, filterable=True),\n", 242 | " \n", 243 | " SearchField(name=\"Embedding\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", 244 | " searchable=True, vector_search_dimensions=1536, vector_search_configuration=\"my-vector-config\"),\n", 245 | "]\n", 246 | "\n", 247 | "vector_search = VectorSearch(\n", 248 | " algorithm_configurations=[\n", 249 | " HnswVectorSearchAlgorithmConfiguration(\n", 250 | " name=\"my-vector-config\",\n", 251 | " kind=\"hnsw\",\n", 252 | " parameters={\n", 253 | " \"m\": 4,\n", 254 | " \"efConstruction\": 400,\n", 255 | " \"efSearch\": 500,\n", 256 | " \"metric\": \"cosine\"\n", 257 | " }\n", 258 | " )\n", 259 | " ]\n", 260 | ")\n", 261 | "\n", 262 | "semantic_config = SemanticConfiguration(\n", 263 | " name=\"my-semantic-config\",\n", 264 | " prioritized_fields=PrioritizedFields(\n", 265 | " title_field=SemanticField(field_name=\"Ticker\"),\n", 266 | " prioritized_content_fields=[SemanticField(field_name=\"Chunk\")]\n", 267 | " )\n", 268 | ")\n", 269 | "\n", 270 | "# Create the semantic settings with the configuration\n", 271 | "semantic_settings = SemanticSettings(configurations=[semantic_config])\n", 272 | "\n", 273 | "# Create the search index with the semantic settings\n", 274 | "index = SearchIndex(name=index_name, fields=fields,\n", 275 | " vector_search=vector_search, semantic_settings=semantic_settings)\n", 276 | "result = index_client.create_or_update_index(index)\n", 277 | "print(f' 
{result.name} created')" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 31, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "Uploaded 442 payload\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "\n", 295 | "## Upload data to Index\n", 296 | "def batch_append_payload(df, search_client):\n", 297 | " \"\"\"append payload for batch insertion (note: max 1000 rows per insertion) of embeddings to Cognitive Search\"\"\"\n", 298 | " value_list = []\n", 299 | " for index, row in df.iterrows():\n", 300 | " value_list.append(\n", 301 | " {\n", 302 | " \"Id\": str(index),\n", 303 | " \"Ticker\": row[\"Ticker\"],\n", 304 | " \"Year\": str(row[\"Year\"]),\n", 305 | " \"Quarter\": str(row[\"Quarter\"]),\n", 306 | " \"Chunk\": row[\"Chunk\"],\n", 307 | " \"PageNumber\": str(row[\"PageNumber\"]),\n", 308 | " \"LineNumber\": str(row[\"LineNumber\"]),\n", 309 | " \"Embedding\": literal_eval(row['Embedding']),\n", 310 | " }\n", 311 | " )\n", 312 | " \n", 313 | "# print(len(value_list))\n", 314 | " \n", 315 | " if len(value_list) >= 1000:\n", 316 | " result = search_client.upload_documents(value_list)\n", 317 | " print(f\"Uploaded {len(value_list)} payload\")\n", 318 | " value_list = []\n", 319 | " result = search_client.upload_documents(value_list)\n", 320 | " print(f\"Uploaded {len(value_list)} payload\")\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | "# print('payload of size {}'.format(len(value_list)))\n", 325 | "\n", 326 | " return value_list\n", 327 | "\n", 328 | "\n", 329 | "search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)\n", 330 | "payload = batch_append_payload(df_chunks_embedding, search_client)\n", 331 | " \n", 332 | "# print(f\"Uploaded {len(payload)} payload\") \n" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "# Search Types 1: Pure Vector Search" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 51, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "MSFT\n", 352 | "2\n", 353 | "23\n", 354 | "Microsoft FY23 Second Quarter Earnings Conference Call Brett Iversen, Satya Nadella, Amy Hood Tuesday, January 24, 2023 BRETT IVERSEN: Good afternoon and thank you for joining us today. On the call with me are Satya Nadella, chairman and chief executive officer, Amy Hood, chief financial officer, Alice Jolla, chief accounting officer, and Keith Dolliver, deputy general counsel. 
On the Microsoft Investor Relations website, you can find our earnings press release and financial summary slide deck, which is intended to \n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "# Pure Vector Search\n", 360 | "query = \"Microsoft earnings call for year 2022 for Quarter 2\" \n", 361 | " \n", 362 | "search_client = SearchClient(service_endpoint, index_name, credential=credential)\n", 363 | "vector = Vector(value=createEmbeddings(query), k=2, fields=\"Embedding\")\n", 364 | " \n", 365 | "results = search_client.search( \n", 366 | " search_text=None, \n", 367 | " vectors=[vector],\n", 368 | "# select=[\"Ticker\", \"Quarter\", \"Year\"],\n", 369 | ")\n", 370 | "\n", 371 | "# results\n", 372 | " \n", 373 | "for result in results: \n", 374 | " print(result['Ticker'])\n", 375 | " print(result['Quarter'])\n", 376 | " print(result['Year'])\n", 377 | " print(result['Chunk'])\n", 378 | " break" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "# Search Types 2: Pure Filter" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 52, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | "Ticker: MSFT\n", 398 | "Quarter: 1\n", 399 | "Year: 23\n", 400 | "Microsoft FY23 First Quarter Earnings Conference Call Brett Iversen, Satya Nadella, Amy Hood Tuesday, October 25, 2022 BRETT IVERSEN: Good afternoon and thank you for joining us today. On the call with me are Satya Nadella, chairman and chief executive officer, Amy Hood, chief financial officer, Alice Jolla, chief accounting officer, and Keith Dolliver, deputy general counsel. On the Microsoft Investor Relations website, you can find our earnings press release and financial summary slide deck, which is intended to \n", 401 | "\n" 402 | ] 403 | } 404 | ], 405 | "source": [ 406 | "results = search_client.search( \n", 407 | " search_text=None, \n", 408 | " filter=\"(Ticker eq 'MSFT') and (Year eq '23') and (Quarter eq '1') \",\n", 409 | ") \n", 410 | "\n", 411 | "for result in results:\n", 412 | " print(f\"Ticker: {result['Ticker']}\")\n", 413 | " print(f\"Quarter: {result['Quarter']}\") \n", 414 | " print(f\"Year: {result['Year']}\") \n", 415 | " print(result['Chunk'])\n", 416 | " print()\n", 417 | " break\n" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "# Search Types 3: Vector Search with filters" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 53, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "Ticker: MSFT\n", 437 | "Quarter: 1\n", 438 | "Year: 23\n", 439 | "you're still seeing digitization. This is still the tailwind that helps customers solve problems. This is still the way to build growth and leverage in your business. And yet, you still want to optimize your workloads. You still want to run them the most efficiently so that you can then make room for new workload growth. We saw that across all segments. If there was one segment where I may have seen it a bit more, I would say, in the small or mid-sized segment of the market, that tends to be more through partner. We rely on partners to help customers do those same optimizations and prepare workloads. But it is that one point I know that people are focused on. 
\n", 440 | "\n" 441 | ] 442 | } 443 | ], 444 | "source": [ 445 | "# Pure Vector Search with Filter\n", 446 | "query = \"What are the KPIs?\" \n", 447 | " \n", 448 | "search_client = SearchClient(service_endpoint, index_name, credential=credential) \n", 449 | "vector = Vector(value=createEmbeddings(query), k=5, fields=\"Embedding\") \n", 450 | "\n", 451 | "results = search_client.search( \n", 452 | " search_text=None, \n", 453 | " vectors=[vector],\n", 454 | " filter=\"(Ticker eq 'MSFT') and (Year eq '23') and (Quarter eq '1') \",\n", 455 | "# select=[\"Ticker\", \"Quarter\", \"Year\"],\n", 456 | ") \n", 457 | " \n", 458 | "for result in results:\n", 459 | " print(f\"Ticker: {result['Ticker']}\")\n", 460 | " print(f\"Quarter: {result['Quarter']}\") \n", 461 | " print(f\"Year: {result['Year']}\") \n", 462 | " print(result['Chunk'])\n", 463 | " print()\n", 464 | "\n", 465 | " break" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "# Search Types 4: Hybrid Search with filters" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 54, 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "name": "stdout", 482 | "output_type": "stream", 483 | "text": [ 484 | "Ticker: MSFT\n", 485 | "Quarter: 1\n", 486 | "Year: 23\n", 487 | "AMY HOOD: Thanks, Keith, and I do appreciate you asking about that one point, because I do know it is a point of focus every quarter. And what I would say is there is some inherent volatility to that number. A point here or there, and you've heard me say it when we've been a point better, and you've heard me say it when we've been a point worse. And I want to focus mostly on what and how we see the number, which is that it is still a very large growth rate with growth across all segments and with growth across all geos. That was, to the question, generally in line with where we expected. 
And what we did see through the quarter is a real focus both by customers, but also by our sales and customer success teams on \n", 488 | "\n" 489 | ] 490 | } 491 | ], 492 | "source": [ 493 | "# Pure Vector Search with Filter\n", 494 | "query = \"What are the KPIs?\" \n", 495 | " \n", 496 | "search_client = SearchClient(service_endpoint, index_name, credential=credential) \n", 497 | "vector = Vector(value=createEmbeddings(query), k=5, fields=\"Embedding\") \n", 498 | "\n", 499 | "results = search_client.search( \n", 500 | " search_text=query, \n", 501 | " vectors=[vector],\n", 502 | " filter=\"(Ticker eq 'MSFT') and (Year eq '23') and (Quarter eq '1') \",\n", 503 | "# select=[\"Ticker\", \"Quarter\", \"Year\"],\n", 504 | " top = 3\n", 505 | ") \n", 506 | " \n", 507 | "for result in results:\n", 508 | " print(f\"Ticker: {result['Ticker']}\")\n", 509 | " print(f\"Quarter: {result['Quarter']}\") \n", 510 | " print(f\"Year: {result['Year']}\") \n", 511 | " print(result['Chunk'])\n", 512 | " print()\n", 513 | " break" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [] 522 | } 523 | ], 524 | "metadata": { 525 | "kernelspec": { 526 | "display_name": "nanogpt", 527 | "language": "python", 528 | "name": "nanogpt" 529 | }, 530 | "language_info": { 531 | "codemirror_mode": { 532 | "name": "ipython", 533 | "version": 3 534 | }, 535 | "file_extension": ".py", 536 | "mimetype": "text/x-python", 537 | "name": "python", 538 | "nbconvert_exporter": "python", 539 | "pygments_lexer": "ipython3", 540 | "version": "3.8.16" 541 | } 542 | }, 543 | "nbformat": 4, 544 | "nbformat_minor": 2 545 | } 546 | --------------------------------------------------------------------------------