├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── environment.yaml ├── example.env ├── rag_skills ├── chatbotSkills.py ├── conversationalRetrievalwithLangchain.ipynb ├── demonstrateSkills.ipynb └── utils.py └── samples └── financial_transcripts ├── Dockerfile ├── README.md ├── chatBot.py ├── images ├── MSFT.jpg ├── Microsoft_logo.png ├── Microsoft_logo.svg ├── chatbot.jpg └── openai.png ├── llm_app.py ├── st_config.yaml ├── st_main.py ├── step0_data_preprocessor.ipynb ├── step1_chunk_and_extract.ipynb ├── step2_embed.ipynb └── step3_db_storing_vectorsearch.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | *.DS_Store 7 | 8 | vectorstore 9 | # Environment in subdirectories 10 | **/*.env 11 | !**/example.env 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | **/DATA/* 37 | ! **/DATA/README.md 38 | **/AnalyzedPDF/* 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | .pybuilder/ 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | # For a library or package, you might want to ignore these files since the code is 98 | # intended to run in multiple environments; otherwise, check them in: 99 | # .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # poetry 109 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 110 | # This is especially recommended for binary packages to ensure reproducibility, and is more 111 | # commonly ignored for libraries. 112 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 113 | #poetry.lock 114 | 115 | # pdm 116 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 117 | #pdm.lock 118 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 119 | # in version control. 
120 | # https://pdm.fming.dev/#use-with-ide 121 | .pdm.toml 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 171 | #.idea/ 172 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Retrieval Augmented Generation (RAG) patterns
2 | 
3 | Following the launch of ChatGPT, many companies have expressed a keen interest in developing search engines in the style of ChatGPT, tailored to their specific datasets. To address this challenge, Retrieval Augmented Generation (RAG) has emerged as a popular solution. RAG comprises a three-step process:
4 | 
5 | 1. First, pertinent information (referred to as the context) is retrieved from the database based on the human query.
6 | 2. Then, this context is enhanced and integrated with the human query.
7 | 3. Finally, the enriched context is presented to GPT-style models to generate a response.
8 | 
9 | We've observed that the RAG approach can take on various forms. For instance, certain problems necessitate a memory of past conversations, while in others, the database may offer an extensive context that surpasses the limits of LLM prompts. We've devised solutions for these challenges, which we refer to as skills. The objective of this repository is to offer a compendium of these skills, showcase how each can be applied independently, and also present some end-to-end examples demonstrating their utilization.
10 | 
11 | Please note that in addition to this repository, we maintain others that focus on different aspects of RAG.
12 | 
13 | 1. This repo illustrates the use of Azure Cognitive Search (ACS) as a vector store. For those interested in employing other databases such as Postgres, AzureSQL, or MongoDB, please refer to this [repository](https://github.com/microsoft/AzureDataRetrievalAugmentedGenerationSamples).
14 | 
15 | 2. For deployment, we employ [Streamlit](https://streamlit.io/). Alternatively, other options such as deploying through Azure Web App using Docker containers or creating a chatbot in Microsoft Teams can be explored in this [repository](https://github.com/microsoft/QnABot-for-FabricDocs.git).
16 | 
17 | ## Skills
18 | 
19 | This repo contains a collection of skills, available in `rag_skills/chatbotSkills.py`, along with code samples for the individual skills:
20 | 
21 | 1. Chatbot with memory functionality: `chatbotSkills.py` contains functions for a chatbot enabled with memory capabilities. Two distinct types of memory skills are available:
22 | 
23 | a. `qa_chain_ConversationBufferMemory`: This skill leverages the entire chat history, context, and human query for generating responses. It's recommended for shorter conversations.
24 | 
25 | b. `qa_chain_ConversationSummaryMemory`: This skill uses a condensed version of the chat history, context, and human query for generating responses. It's preferable for longer conversations.
26 | 
27 | 2. `user_query_based_context_summarization`: Summarizes or extracts the relevant information from the context based on the user query.
28 | 
29 | 3. `combine_docs`: This skill is useful when search retrieves multiple contexts from the database that cannot fit into a single language model call. It combines multiple contexts while retaining the information that is relevant to the user query. It also ensures that the total context token count remains below a certain threshold.
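Below is a minimal sketch of how these skills might be wired together once a retrieval step has produced some context chunks. It assumes an `llm.env` file populated as in `example.env` and an Azure OpenAI chat deployment of your own; the retrieved chunks and the question are placeholders rather than output from a real vector store.

```python
import openai
from dotenv import dotenv_values
from langchain.chat_models import AzureChatOpenAI
from rag_skills.chatbotSkills import qa_chain_ConversationSummaryMemory, combine_docs

# Load Azure OpenAI settings (copy example.env to llm.env and fill in your keys).
config = dotenv_values("llm.env")
openai.api_type = config["OPENAI_API_TYPE"]
openai.api_key = config["OPENAI_API_KEY"]
openai.api_base = config["OPENAI_API_BASE"]
openai.api_version = config["OPENAI_API_VERSION"]

llm = AzureChatOpenAI(
    deployment_name=config["OPENAI_DEPLOYMENT_COMPLETION"],
    openai_api_base=openai.api_base,
    openai_api_version=openai.api_version,
    openai_api_key=openai.api_key,
    openai_api_type=openai.api_type,
    temperature=0.0,
)

# Placeholder chunks standing in for the results of a vector search (step 1 of RAG).
retrieved_chunks = [
    "<chunk 1 returned by your vector store>",
    "<chunk 2 returned by your vector store>",
]
question = "What were the key revenue drivers this quarter?"

# combine_docs keeps the combined context under a token budget,
# summarizing it against the query when it is too long.
context = combine_docs(retrieved_chunks, llm, max_tokens=3000, user_query=question)

# Augment and generate (steps 2 and 3) with a chain that summarizes the chat history.
qa_chain = qa_chain_ConversationSummaryMemory(llm)
answer = qa_chain.run({"context": context, "human_input": question})
print(answer)
```

A fuller, retrieval-backed version of this flow is implemented in `samples/financial_transcripts/`.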
30 | 
31 | > NOTE: Skills are updated as code snippets at [rag_skills](https://github.com/microsoft/rag_skills) faster than they are incorporated into this repo.
32 | 
33 | ## End2End Sample for Different RAG patterns
34 | 
35 | This repository also includes one end-to-end sample centered around financial transcripts. Please note that we plan to incorporate additional samples showcasing various RAG patterns in the future.
36 | 
37 | 
38 | | Sample name                        | Description                             | Tech Stack                 |
39 | | ---------------------------------- | --------------------------------------- | -------------------------- |
40 | | Financial Earnings calls assistant | Summarization and Q&A on earnings calls | ACS, deployed on Streamlit |
41 | 
42 | 
43 | ## How to use?
44 | 
45 | 1. Lightweight examples of the various skills are provided in `rag_skills/demonstrateSkills.ipynb`.
46 | 2. End2End samples are in the `samples/` folder. Please follow the README inside that folder.
47 | 
48 | ## Contributing
49 | 
50 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
51 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
52 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
53 | 
54 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
55 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
56 | provided by the bot. You will only need to do this once across all repos using our CLA.
57 | 
58 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
59 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
60 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
61 | 
62 | ## Trademarks
63 | 
64 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
65 | trademarks or logos is subject to and must follow
66 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
67 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
68 | Any use of third-party trademarks or logos is subject to those third parties' policies.
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | ## Security
4 | 
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 | 
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 
22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: appliedai 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.8.11 7 | - pip 8 | - pip: 9 | - numpy==1.24.4 10 | - openai==0.27.8 11 | - python-dotenv==1.0.0 12 | - scikit-learn==1.3.0 13 | - mlflow==2.4.2 14 | - requests==2.31.0 15 | - pyyaml==6.0.1 16 | - psycopg2-binary==2.9.6 17 | - pandas==2.0.3 18 | - pgvector==0.2.0 19 | - langchain>=0.0.317 20 | - docx2txt==0.8 21 | - docx2pdf==0.1.8 22 | - streamlit==1.25.0 23 | - tiktoken==0.4.0 24 | - azure-ai-formrecognizer==3.3.0 25 | - azure-storage-blob==12.17.0 26 | - azure-search-documents==11.4.0b8 27 | - pydantic==1.10.11 -------------------------------------------------------------------------------- /example.env: -------------------------------------------------------------------------------- 1 | # azure open ai 2 | OPENAI_API_BASE="" 3 | OPENAI_API_TYPE="" 4 | OPENAI_API_KEY="" 5 | OPENAI_DEPLOYMENT_EMBEDDING="" 6 | OPENAI_MODEL_EMBEDDING="" 7 | OPENAI_DEPLOYMENT_COMPLETION="" 8 | OPENAI_MODEL_COMPLETION="" 9 | OPENAI_API_VERSION="" 10 | 11 | # azure form recognizer 12 | AZURE_FORM_RECOGNIZER_ENDPOINT = 13 | AZURE_FORM_RECOGNIZER_NAME = "" 14 | AZURE_FORM_RECOGNIZER_KEY = "" 15 | 16 | # azure cognitive search 17 | COGSEARCH_NAME = "" 18 | COGSEARCH_INDEX_NAME = "" 19 | COGSEARCH_API_KEY = "" 20 | -------------------------------------------------------------------------------- /rag_skills/chatbotSkills.py: -------------------------------------------------------------------------------- 1 | # Import required libraries from LangChain and set up OpenAI 2 | from langchain.llms import AzureOpenAI 3 | from langchain import PromptTemplate 4 | from langchain.chains import LLMChain, ConversationChain 5 | from langchain.memory import ConversationBufferMemory, ConversationSummaryMemory 6 | from langchain.prompts import PromptTemplate 7 | import openai 8 | import os 9 | from dotenv import dotenv_values 10 | from rag_skills.utils import count_tokens 11 | 12 | ############################################################## 13 | ###### QA chain with conversational buffer memory ############# 14 | ############################################################## 15 | def qa_chain_ConversationBufferMemory(llm, prefix_template=None, to_debug=False): 16 | # Write a preprompt with context and query as variables 17 | if prefix_template is None: 18 | prefix_template = """ 19 | You are a chatbot having a conversation with a human. 20 | Given the Context, Chat History, and a Human Query, 21 | create a final answer only using the Context. Don't hallucinate at all. 
""" 22 | 23 | template = prefix_template + """ 24 | Context: 25 | {context} 26 | 27 | Chat History: 28 | {chat_history} 29 | 30 | Human Query: {human_input} 31 | Chatbot:""" 32 | 33 | # Define a prompt template 34 | prompt = PromptTemplate( 35 | input_variables=["chat_history", "human_input", "context"], template=template 36 | ) 37 | 38 | # Define Memory 39 | memory = ConversationBufferMemory(memory_key="chat_history", input_key="human_input") 40 | 41 | # Define a chain 42 | qa_chain = LLMChain(llm=llm, prompt=prompt, memory=memory, verbose=to_debug) 43 | return qa_chain 44 | 45 | 46 | ############################################################## 47 | ###### QA chain with converational Summary memory ############# 48 | ############################################################## 49 | def qa_chain_ConversationSummaryMemory(llm, prefix_template=None, to_debug=False): 50 | # Write a preprompt with context and query as variables 51 | if prefix_template is None: 52 | prefix_template = """ 53 | You are a chatbot having a conversation with a human. 54 | Given the Context, Chat History, and a Human Query, 55 | create a final answer. Don't hallucinate at all. If you don't have an answer, say "I don't know". 56 | """ 57 | 58 | template = prefix_template + """ 59 | Context: 60 | {context} 61 | 62 | Chat History: 63 | {chat_history} 64 | 65 | Human Query: {human_input} 66 | Chatbot: 67 | """ 68 | 69 | # Define a prompt template 70 | prompt = PromptTemplate( 71 | input_variables=["chat_history", "human_input", "context"], 72 | template=template 73 | ) 74 | 75 | #Define Memory 76 | 77 | memory = ConversationSummaryMemory(llm=llm, memory_key="chat_history", input_key="human_input") 78 | memory.prompt.template = """ 79 | Progressively summarize the lines of conversation provided, adding onto the previous summary returning a new summary. 80 | 81 | Current summary: 82 | {summary} 83 | 84 | New lines of conversation: 85 | {new_lines} 86 | 87 | New summary: 88 | """ 89 | 90 | # Define a chain 91 | qa_chain = LLMChain(llm=llm, prompt=prompt, memory=memory, verbose=to_debug) 92 | return qa_chain 93 | 94 | ################################################################ 95 | ###### Summarize chain with user query and context ############# 96 | ################################################################ 97 | def user_query_based_context_summarization(llm, prefix_template=None, to_debug=False): 98 | # Write a preprompt with context and query as variables 99 | if prefix_template is None: 100 | prefix_template = """ 101 | Write a concise summary of the context so that it includes the details related to the human query. 
102 | """ 103 | 104 | template = prefix_template + """ 105 | Context: 106 | {context} 107 | 108 | Human Query: {human_input} 109 | Concise Summary: 110 | """ 111 | 112 | # Define a prompt template 113 | prompt = PromptTemplate( 114 | input_variables=["human_input", "context"], template=template 115 | ) 116 | 117 | # Define a chain 118 | query_based_summary_chain = LLMChain(llm=llm, prompt=prompt, verbose=to_debug) 119 | return query_based_summary_chain 120 | 121 | ################################################################ 122 | ###### Write a summary given multiple contexts ############# 123 | ################################################################ 124 | 125 | def combine_docs(context_list, llm, to_debug=False, max_tokens=16000, user_query=None, prefix_template=None): 126 | """Given a list of documents, combine them into a single document with a max token limit.""" 127 | 128 | ## When all the documents can be concatenated 129 | context_all = "" 130 | for i in context_list: 131 | context_all = context_all + i + "\n\n" 132 | 133 | if count_tokens(context_all) < max_tokens: 134 | return context_all 135 | 136 | ## When all the documents cannot be concatenated 137 | if user_query is None: 138 | user_query = "" 139 | 140 | query_based_summary_chain = user_query_based_context_summarization(llm, 141 | prefix_template=prefix_template, 142 | to_debug=to_debug 143 | ) 144 | 145 | context_all = "" 146 | for i in context_list: 147 | context_all = context_all + i + "\n\n" 148 | 149 | ## If the context_all is greater than max_tokens, then summarize the context_all again 150 | if count_tokens(context_all) > max_tokens: 151 | context_all = query_based_summary_chain.run({ 152 | 'context': context_all, 153 | 'human_input': user_query 154 | }) 155 | 156 | return context_all 157 | 158 | -------------------------------------------------------------------------------- /rag_skills/conversationalRetrievalwithLangchain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "In this notebook we will demonstrate how to use LangChain to perform [conversational retrieval](https://python.langchain.com/docs/use_cases/question_answering/chat_vector_db).\n", 8 | "\n", 9 | "In a conversational question and answering scenario, users often pose follow-up questions related to the same topic, with the context being crucial to understand their queries. To address such cases effectively, we use the ConversationalRetrievalChain. Behind the scenes, this chain takes the user's question and converts it into a standalone query by considering the conversation history. Subsequently, it uses this standalone question to query the search service for relevant information." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "! pip install azure-identity" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "In this notebook we are utilizing an existing search index we already set up with Azure Cognitive Search. You can follow this [link](https://python.langchain.com/docs/integrations/vectorstores/azuresearch) to create your own search index. In our search index, the content vector field is named \"contentVector\", so we set it as an environment variable below." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import os\n", 35 | "os.environ[\"AZURESEARCH_FIELDS_CONTENT_VECTOR\"] = \"contentVector\"" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from langchain.embeddings.openai import OpenAIEmbeddings\n", 45 | "from langchain.llms import AzureOpenAI\n", 46 | "from langchain.chains import ConversationalRetrievalChain\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "True" 58 | ] 59 | }, 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "from dotenv import load_dotenv\n", 67 | "load_dotenv(\"../llm.env\")" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from langchain.vectorstores.azuresearch import AzureSearch\n", 77 | "\n", 78 | "\n", 79 | "model = os.getenv(\"OPENAI_DEPLOYMENT_EMBEDDING\")\n", 80 | "embeddings = OpenAIEmbeddings(deployment=model)\n", 81 | "index_name = \"testqa\"\n", 82 | "vectore_store_name = os.getenv(\"COGSEARCH_NAME\")\n", 83 | "vector_store_address = f\"https://{vectore_store_name}.search.windows.net\"\n", 84 | "vector_store_password = os.getenv(\"COGSEARCH_API_KEY\")\n", 85 | "vector_store = AzureSearch(\n", 86 | " azure_search_endpoint=vector_store_address,\n", 87 | " azure_search_key=vector_store_password,\n", 88 | " index_name=index_name,\n", 89 | " embedding_function=embeddings.embed_query,\n", 90 | ")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "from langchain.memory import ConversationBufferMemory\n", 100 | "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True)\n", 101 | "memory.output_key = \"answer\"" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "In this notebook we are focusing on the ConversationalRetrievalChain only, for demonstration purposes. If you use your own search index you might need to modify your queries so that it is relevant to the information in your index." 
109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "llm = AzureOpenAI(deployment_name=os.getenv(\"OPENAI_DEPLOYMENT_COMPLETION\"), model_name=os.getenv(\"OPENAI_MODEL_COMPLETION\"),temperature=0)\n", 118 | "qa = ConversationalRetrievalChain.from_llm(llm, vector_store.as_retriever(search_kwargs={\"k\": 1}), memory=memory, return_generated_question=True)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 7, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "query = \"What did the astronaut Edgar Mitchell call Earth?\"\n", 128 | "result = qa({\"question\": query})" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 8, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/plain": [ 139 | "' Edgar Mitchell called Earth a \"sparkling blue and white jewel.\"'" 140 | ] 141 | }, 142 | "execution_count": 8, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "result[\"answer\"]" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "query = \"When and how did NASA make its first observation about it from the space?\"\n", 158 | "result = qa({\"question\": query})" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "' NASA first observed Earth from space with the launch of Explorer 1 in 1960.'" 170 | ] 171 | }, 172 | "execution_count": 10, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "result[\"answer\"]" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "As you can see, if we had query the search service using the user's question directly, it is not clear wht the user is reffering to by \"its\" and \"it\". However, the generated standalone question makes it more clear, and enables for more efficient and robust queries." 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 11, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "' When and how did NASA first observe Earth from space?'" 197 | ] 198 | }, 199 | "execution_count": 11, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "result[\"generated_question\"]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "When the user asks a question on a different topic, the generated question still reflects the user's question correctly." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 12, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "query = \"Why can't we see volcanic plumes with our eyes?\"\n", 222 | "result = qa({\"question\": query})" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 13, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | " Volcanic plumes are not visible to the naked eye because they are typically invisible in the electromagnetic spectrum. 
However, satellites can use infrared to distinguish the plumes from ice and clouds.\n", 235 | " Why are volcanic plumes not visible to the naked eye?\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "print(result[\"answer\"])\n", 241 | "print(result[\"generated_question\"])" 242 | ] 243 | } 244 | ], 245 | "metadata": { 246 | "kernelspec": { 247 | "display_name": "Python 3", 248 | "language": "python", 249 | "name": "python3" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 3 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython3", 261 | "version": "3.8.11" 262 | }, 263 | "orig_nbformat": 4 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /rag_skills/demonstrateSkills.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "a2469417", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import sys\n", 12 | "import openai\n", 13 | "from dotenv import dotenv_values\n", 14 | "from langchain.chat_models import AzureChatOpenAI\n", 15 | "sys.path.append(\"..\") ## add directory above\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "1af107d4", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "env_name = \"../llm.env\" # change to use your own .env file\n", 26 | "config = dotenv_values(env_name)\n", 27 | "\n", 28 | "#Azure OpenAI\n", 29 | "openai.api_type = config[\"OPENAI_API_TYPE\"] #\"azure\"\n", 30 | "openai.api_key = config['OPENAI_API_KEY']\n", 31 | "openai.api_base = config['OPENAI_API_BASE']\n", 32 | "openai.api_version = config['OPENAI_API_VERSION']" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "e9c886c1", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "engine = \"gpt-4-32k\"\n", 43 | "llm = AzureChatOpenAI(\n", 44 | " deployment_name=engine,\n", 45 | " openai_api_base=openai.api_base,\n", 46 | " openai_api_version=openai.api_version,\n", 47 | " openai_api_key=openai.api_key,\n", 48 | " openai_api_type=openai.api_type,\n", 49 | " temperature=0.0,\n", 50 | " verbose=True\n", 51 | ")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "id": "e0b0e6f8", 57 | "metadata": {}, 58 | "source": [ 59 | "## 1. Demonstrating conversation summary memory chain:\n", 60 | "This chain summarizes the previous user conversation and appends the summary to context for answering questions " 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "4886138d", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "from chatbotSkills import qa_chain_ConversationSummaryMemory\n", 71 | "\n", 72 | "# Make a Question Answer chain function and pass \n", 73 | "prefix_template = \"\"\"\n", 74 | " You are a chatbot having a conversation with a human.\n", 75 | " Given the Context, Chat History, and a Human Query, \n", 76 | " create a final answer. Don't hallucinate at all. 
If you don't have an answer, say \"I don't know\".\n", 77 | " \"\"\"\n", 78 | "\n", 79 | "qa_chain = qa_chain_ConversationSummaryMemory(llm, prefix_template=prefix_template, to_debug=False)\n", 80 | "\n", 81 | "## Question Answering\n", 82 | "\n", 83 | "#Question 1\n", 84 | "answer = qa_chain.run({\n", 85 | " 'context': \"USSA is a space agency in county Y. It is a government agency responsible for the exploration and development of space.\",\n", 86 | " 'human_input': \"What is USSA\" \n", 87 | "})\n", 88 | "\n", 89 | "print(\"Question 1: \")\n", 90 | "print(answer)\n", 91 | "\n", 92 | "# Question 2: \n", 93 | "answer = qa_chain.run({\n", 94 | " 'context': \"Zootopia is a 2016 American computer-animated buddy cop action comedy film produced by Walt Disney Animation Studios.\",\n", 95 | " 'human_input': \"Do you know about any space agency?\" \n", 96 | "}) \n", 97 | "\n", 98 | "print(\"Question 2: \")\n", 99 | "print(answer)\n", 100 | "print()\n", 101 | "print(\"\"\"Context in question 2 does not contain any specific information regarding the \n", 102 | " user question but still llm provides correct answer by using the memory of previous conversation\n", 103 | " \"\"\"\n", 104 | ")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "id": "bb443051", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "print(qa_chain.memory) ## You can see the memory using this call" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "0af592da", 120 | "metadata": {}, 121 | "source": [ 122 | "## 2. Demonstrating conversation buffer memory chain:\n", 123 | "This chain summarizes the previous user conversation and appends the summary to context for answering questions \n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "69704b89", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "from chatbotSkills import qa_chain_ConversationBufferMemory\n", 134 | "\n", 135 | "# Make a Question Answer chain function and pass \n", 136 | "prefix_template = \"\"\"\n", 137 | " You are a chatbot having a conversation with a human. \n", 138 | " Given the Context, Chat History, and a Human Query, \n", 139 | " create a final answer. Don't hallucinate at all. If you don't have an answer, say \"I don't know\".\n", 140 | " \"\"\"\n", 141 | "\n", 142 | "qa_chain = qa_chain_ConversationBufferMemory(llm, prefix_template=prefix_template, to_debug=False)\n", 143 | "\n", 144 | "## Question Answering\n", 145 | "\n", 146 | "#Question 1\n", 147 | "answer = qa_chain.run({\n", 148 | " 'context': \"USSA is a space agency in county Y. 
It is a government agency responsible for the exploration and development of space.\",\n", 149 | " 'human_input': \"What is USSA\" \n", 150 | "})\n", 151 | "\n", 152 | "print(\"Question 1: \")\n", 153 | "print(answer)\n", 154 | "\n", 155 | "# Question 2: \n", 156 | "answer = qa_chain.run({\n", 157 | " 'context': \"Zootopia is a 2016 American computer-animated buddy cop action comedy film produced by Walt Disney Animation Studios.\",\n", 158 | " 'human_input': \"Do you know about any space agency?\" \n", 159 | "}) \n", 160 | "\n", 161 | "print(\"Question 2: \")\n", 162 | "print(answer)\n", 163 | "print()\n", 164 | "print(\"\"\"Context in question 2 does not contain any specific information regarding the \n", 165 | " user question but still llm provides correct answer by using the memory of previous conversation\n", 166 | " \"\"\"\n", 167 | ")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "bb3356e0", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "print(qa_chain.memory) ## You can see the memory using this call" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "id": "8542530f", 183 | "metadata": {}, 184 | "source": [ 185 | "## 3. Demonstrating user query based context summarization chain:\n", 186 | "Sometimes context can be large and don't fit in a prompt window. So, this chain summarizes context given user query " 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "539bc2d2", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "from chatbotSkills import user_query_based_context_summarization\n", 197 | "\n", 198 | "# Template \n", 199 | "prefix_template = \"\"\"\n", 200 | " Write a concise summary of the context so that it includes the details related to the human query.\n", 201 | " \"\"\"\n", 202 | "\n", 203 | "context_summary_chain = user_query_based_context_summarization(llm, prefix_template=prefix_template, to_debug=False)\n", 204 | "\n", 205 | "context = \"\"\"USSA is a space agency in county Y. It is a government agency responsible\n", 206 | " for the exploration and development of space.\n", 207 | " Zootopia is a 2016 American computer-animated buddy cop action comedy\n", 208 | " film produced by Walt Disney Animation Studios.\n", 209 | " \"\"\"\n", 210 | "\n", 211 | "#Question 1\n", 212 | "answer = context_summary_chain.run({\n", 213 | " 'context': context,\n", 214 | " 'human_input': \"What is USSA?\" \n", 215 | "})\n", 216 | "\n", 217 | "\n", 218 | "print(\"Question 1: \")\n", 219 | "print(answer)\n", 220 | "print()\n", 221 | "\n", 222 | "print (\"\"\"This llm extracts only relevant information from the context. \"\"\")" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "34951a40", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "print(qa_chain.memory) ## You can see the memory using this call" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "fc2d397d", 238 | "metadata": {}, 239 | "source": [ 240 | "## 3. Demonstrating combine_docs:\n", 241 | "Sometimes contexts retrieved from the database can be large and doesn't fit in a prompt. So, this code will extract relevant information from the context given the user query." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "c6b9e2d7", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "from chatbotSkills import combine_docs, count_tokens\n", 252 | "\n", 253 | "context_1 = \"\"\"\n", 254 | " The United Space Exploration Administration (USSA) stands as county Y's premier governmental space agency, \n", 255 | " entrusted with the monumental task of spearheading the exploration, investigation, and advancement of the cosmic frontier.\n", 256 | " With a dedicated cadre of brilliant scientists, intrepid astronauts, \n", 257 | " and cutting-edge technology, USSA pioneers a path to unlock the mysteries of the universe and \n", 258 | " harness its potential for the betterment of humanity. \n", 259 | " \"\"\"\n", 260 | "context_2 = \"\"\"\n", 261 | " Through audacious missions and visionary initiatives, \n", 262 | " USSA propels the nation to new heights, ensuring that the celestial realm \n", 263 | " becomes a beacon of knowledge, opportunity, and inspiration for generations to come.\n", 264 | " \"\"\"\n", 265 | "\n", 266 | "context_list = [context_1, context_2]\n", 267 | "\n", 268 | "input_token_count = count_tokens(context_1+context_2, engine)\n", 269 | "prefix_template = \"\"\"\n", 270 | " Extract information from the context so that it includes the details related to the human query. \n", 271 | " \"\"\"\n", 272 | "user_query = \"What does USSA stand for?\" \n", 273 | "max_input_tokens = 100 ## For demonstration, we are assuming that max token for input context should not exceed 100\n", 274 | "\n", 275 | "output = combine_docs(context_list, llm, to_debug=False, max_tokens=max_input_tokens, \n", 276 | " user_query=user_query, prefix_template=prefix_template)\n", 277 | "\n", 278 | "output_token_count = count_tokens(output, engine)\n", 279 | "print(\"input_token_count: \", input_token_count)\n", 280 | "print(\"output_token_count: \", output_token_count)\n", 281 | "\n", 282 | "print(\"output: \", output)\n", 283 | "print()\n", 284 | "print(\"\"\"This demonstrates that the combine_docs function reduces the tokens for the input\"\"\")\n", 285 | "\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "48a003e8", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [] 295 | } 296 | ], 297 | "metadata": { 298 | "language_info": { 299 | "name": "python" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 5 304 | } 305 | -------------------------------------------------------------------------------- /rag_skills/utils.py: -------------------------------------------------------------------------------- 1 | import tiktoken 2 | import pandas as pd 3 | 4 | ############################################################## 5 | ###### Tokens ############# 6 | ############################################################## 7 | def count_tokens(string: str, encoding_name: str="gpt-4-32k") -> int: 8 | """Returns the number of tokens in a text string.""" 9 | encoding = tiktoken.encoding_for_model(encoding_name) 10 | num_tokens = len(encoding.encode(string)) 11 | return num_tokens 12 | 13 | ############################################################## 14 | ###### Get Prompt Template from csv ############# 15 | ############################################################## 16 | 17 | def get_prompt_template(prompt_id, prompt_templates_name=None): 18 | """ 19 | Retrieve LLM prompt template using prompt_id from a csv file. 
20 |     """
21 |     if prompt_templates_name is None:
22 |         prompt_templates_name = config['PROMPT_TEMPLATE_FILE']
23 |     df = pd.read_csv(os.path.join(os.path.dirname(__file__), prompt_templates_name))
24 |     prompt = df[df['prompt_id'] == prompt_id]['prompt_template'].values[0]
25 |     return prompt
--------------------------------------------------------------------------------
/samples/financial_transcripts/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM mambaorg/micromamba:0.15.3
2 | USER root
3 | RUN apt-get update && DEBIAN_FRONTEND="noninteractive" apt-get install -y --no-install-recommends \
4 |     nginx \
5 |     ca-certificates \
6 |     apache2-utils \
7 |     certbot \
8 |     python3-certbot-nginx \
9 |     sudo \
10 |     cifs-utils \
11 |     && \
12 |     rm -rf /var/lib/apt/lists/*
13 | RUN apt-get update && apt-get -y install cron
14 | RUN mkdir /opt/chatbot
15 | RUN chmod -R 777 /opt/chatbot
16 | WORKDIR /opt/chatbot
17 | USER micromamba
18 | EXPOSE 8000
19 | COPY ../../rag_skills /opt/chatbot/rag_skills
20 | COPY ../../environment.yaml ./environment.yaml
21 | COPY ../../llm.env llm.env
22 | RUN micromamba install -y -n base -f environment.yaml && \
23 |     micromamba clean --all --yes
24 | COPY /samples/financial_transcripts/ /opt/chatbot/
25 | USER root
26 | RUN chmod -R 777 /opt/chatbot
27 | USER micromamba
28 | ENTRYPOINT ["streamlit", "run"]
29 | CMD ["st_main.py","--server.port","8000","--theme.base","dark"]
30 | 
31 | 
--------------------------------------------------------------------------------
/samples/financial_transcripts/README.md:
--------------------------------------------------------------------------------
1 | # Project
2 | 
3 | This README provides a comprehensive guide to implementing a question-answering system using the Retrieval Augmented Generation (RAG) pattern on Microsoft (MSFT) earnings call transcripts. The process leverages Azure Cognitive Search (ACS) for creating a vector database and Azure Form Recognizer for processing raw documents.
4 | 
5 | Before commencing the project, ensure that you add the necessary keys to `example.env` in the root directory and rename it to `llm.env`. Specifically, we require keys for [Azure AI Document Intelligence (previously Form Recognizer)](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/create-document-intelligence-resource?view=doc-intel-3.1.0), [Azure OpenAI](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal), and [Azure Cognitive Search services](https://learn.microsoft.com/en-us/azure/search/search-create-service-portal).
6 | 
7 | Be sure to set a new name for `COGSEARCH_INDEX_NAME` in `llm.env`, because the Jupyter notebooks below will create the new vector database under this name.
8 | 
9 | ## Large Language Model (LLM) application
10 | 
11 | In this project, we've employed the RAG approach to develop an LLM application. This involves retrieving context from the database using a human query, augmenting the context, and then prompting GPT-style models to generate an answer. However, we encountered two key challenges with the basic RAG implementation:
12 | 
13 | 1. Context Size: At times, the context retrieved from the database is excessively large and doesn't fit within the prompt for GPT-style models. To address this, we utilized an intermediate GPT call to summarize or extract the pertinent information from the retrieved context.
14 | 
15 | 2. User Queries Requiring Chat History: Some user queries necessitate access to previous chat history for accurate responses. To tackle this, we summarize the chat history and augment it with the human query and context.
16 | 
17 | To address these challenges, we implemented the architecture shown in the figure below. The `chatBot` class in `chatBot.py` implements the different functionalities shown in the figure.
18 | 
19 | Chatbot Architecture
20 | 
21 | ## How to Run
22 | 
23 | ### Creating a Vector Database on Azure Cognitive Search (ACS)
24 | 
25 | Creating a vector database is a four-step process outlined below:
26 | 
27 | 1. `step0_data_preprocessor.ipynb` accesses the Word docs in `DATA\` and converts them to PDF so they can be used by Azure Form Recognizer in the next step.
28 | 2. `step1_chunk_and_extract.ipynb` chunks and extracts the PDF files using Azure Form Recognizer and saves the results to .csv files.
29 | 3. `step2_embed.ipynb` reads the chunks, embeds them using Azure OpenAI, and saves the embeddings to .csv files.
30 | 4. `step3_db_storing_vectorsearch.ipynb` reads the data, inserts it into ACS, and shows examples of various search capabilities using ACS hybrid search on the data.
31 | 
32 | ### Running the LLM application
33 | To run the LLM application, execute the `llm_app.py` file.
34 | 
35 | #### Deployment with Streamlit
36 | 
37 | 1. Run locally
38 | 
39 | ```
40 | streamlit run st_main.py --server.port 8000
41 | ```
42 | 
43 | 2. Build the Docker image. Since the `rag_skills/chatbotSkills.py` and `environment.yaml` files are in the parent directory, the Dockerfile only works if you run the command from the root of the repository.
44 | 
45 | ```
46 | docker build -t bot:v1 -f samples/financial_transcripts/Dockerfile .
47 | docker run --rm -p 8000:8000 bot:v1
48 | ```
49 | 
50 | Then open a web browser and go to `localhost:8000`.
51 | 
52 | ## Contributing
53 | 
54 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
55 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
56 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
57 | 
58 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
59 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
60 | provided by the bot. You will only need to do this once across all repos using our CLA.
61 | 
62 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
63 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
64 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
65 | 
66 | ## Trademarks
67 | 
68 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
69 | trademarks or logos is subject to and must follow
70 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
71 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
72 | Any use of third-party trademarks or logos is subject to those third parties' policies.
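## Appendix: querying the chatbot from Python

For quick experimentation outside Streamlit, the sketch below shows one way the `chatBot` class used by `llm_app.py` might be driven directly from a Python session in this folder. It assumes `llm.env` is configured as described above and that the ACS index uses the same column names as `llm_app.py` (`Embedding` and `Chunk`); the sample question is only an illustration. Questions must mention the ticker, quarter, and year so that `queryParser` in `chatBot.py` can build the ACS filter.

```python
import openai
from dotenv import dotenv_values
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from langchain.chat_models import AzureChatOpenAI
from chatBot import chatBot

# Load keys and endpoints (copy example.env to llm.env and fill it in).
config = dotenv_values("llm.env")
openai.api_type = config["OPENAI_API_TYPE"]
openai.api_key = config["OPENAI_API_KEY"]
openai.api_base = config["OPENAI_API_BASE"]
openai.api_version = config["OPENAI_API_VERSION"]

# Client for the ACS index created in steps 0-3 above.
search_client = SearchClient(
    endpoint=f"https://{config['COGSEARCH_NAME']}.search.windows.net",
    index_name=config["COGSEARCH_INDEX_NAME"],
    credential=AzureKeyCredential(config["COGSEARCH_API_KEY"]),
)

llm = AzureChatOpenAI(
    deployment_name=config["OPENAI_DEPLOYMENT_COMPLETION"],
    openai_api_base=openai.api_base,
    openai_api_version=openai.api_version,
    openai_api_key=openai.api_key,
    openai_api_type=openai.api_type,
    temperature=0.0,
)

bot = chatBot(llm, search_client, vectorColName="Embedding", chunkColName="Chunk")

# Queries should name the ticker, quarter, and year, for example:
print(bot.run("What were the key growth areas for ticker MSFT in quarter 1 of year 23?"))
```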
73 | 74 | 75 | -------------------------------------------------------------------------------- /samples/financial_transcripts/chatBot.py: -------------------------------------------------------------------------------- 1 | # Import required libraries 2 | import os 3 | import re 4 | import openai 5 | import sys 6 | from dotenv import dotenv_values 7 | from azure.core.credentials import AzureKeyCredential 8 | from azure.search.documents.models import Vector 9 | sys.path.append("../..") ## add directory above 10 | from rag_skills.chatbotSkills import qa_chain_ConversationSummaryMemory, combine_docs 11 | 12 | # Get the absolute path to the .env file 13 | env_name = os.path.join(os.path.dirname(__file__), "llm.env") 14 | 15 | # Load environment variables from the .env file 16 | config = dotenv_values(env_name) 17 | 18 | if len(config) == 0: 19 | env_name = os.path.join(os.path.dirname(__file__), "../../llm.env") 20 | config = dotenv_values(env_name) 21 | 22 | if len(config) == 0: 23 | raise Exception("No environment variables loaded. Please check the *.env file.") 24 | 25 | #Azure OpenAI 26 | openai.api_type = config["OPENAI_API_TYPE"] #"azure" 27 | openai.api_key = config['OPENAI_API_KEY'] 28 | openai.api_base = config['OPENAI_API_BASE'] 29 | openai.api_version = config['OPENAI_API_VERSION'] 30 | 31 | ## Cog Search 32 | cogsearch_name = config["COGSEARCH_NAME"] 33 | index_name = config["COGSEARCH_INDEX_NAME"] 34 | key = config["COGSEARCH_API_KEY"] 35 | service_endpoint = "https://"+config["COGSEARCH_NAME"] + ".search.windows.net" 36 | 37 | credential = AzureKeyCredential(key) 38 | 39 | def createEmbeddings(text): 40 | response = openai.Embedding.create(input=text , engine=config["OPENAI_DEPLOYMENT_EMBEDDING"]) 41 | embeddings = response['data'][0]['embedding'] 42 | return embeddings 43 | 44 | ## Retrieves relevant content from Azure Cognitive Search (ACS) 45 | def acs_retriever(search_client, query=None, queryEmbedding = None, 46 | colName=None, colVal=None, searchtype=None, numChunks=5, vectorColName="Embedding"): 47 | # query: user query 48 | # colName: List of column name to search in ACS columns 49 | # colVal: List of column values to search in ACS 50 | # searchtype options: "filter", "vector", "hybrid", filter vector", "filter hybrid" 51 | # vectorColName: Name of vector embedding in ACS 52 | 53 | if query is not None: 54 | vector = Vector(value=queryEmbedding, k=numChunks, fields=vectorColName) 55 | 56 | if colName == None: ## No filters 57 | if searchtype == None or searchtype == "vector": #(default vector) 58 | results = search_client.search(search_text=None, vectors= [vector]) 59 | else: # hybrid 60 | results = search_client.search(search_text=query, vectors= [vector]) 61 | 62 | else: ## Filters 63 | filter_str = " and ".join(f"({key} eq '{value}')" for key, value in zip(colName, colVal)) 64 | filter_str = f"({filter_str})" 65 | 66 | if query == None: #Pure filter 67 | results = search_client.search(search_text = None, filter = filter_str) 68 | elif searchtype == None or searchtype == "filter vector" or searchtype == "vector": # (default filter vector) 69 | results = search_client.search(search_text = None, vectors = [vector], filter = filter_str) 70 | else: # filter hybrid 71 | results = search_client.search(search_text = query, vectors = [vector], filter = filter_str) 72 | 73 | output = [result for result in results] 74 | return output 75 | 76 | def queryParser(query): 77 | # Extract ticker using regular expression 78 | ticker_match = re.search(r'\bticker\s+(\w+)', query, 
re.IGNORECASE) 79 | ticker = ticker_match.group(1) if ticker_match else None 80 | 81 | # Extract year using regular expression 82 | year_match = re.search(r'\byear\s+(\d{2})', query, re.IGNORECASE) 83 | year = int(year_match.group(1)) if year_match else None 84 | 85 | # Extract quarter using regular expression 86 | quarter_match = re.search(r'\bquarter\s+(\d)', query, re.IGNORECASE) 87 | quarter = quarter_match.group(1) if quarter_match else None 88 | 89 | return ticker, str(year), quarter 90 | 91 | 92 | ###################################### 93 | ## Chatbot 94 | ###################################### 95 | 96 | class chatBot: 97 | def __init__( 98 | self, 99 | llm, 100 | acs_search_client, 101 | max_token_for_context=16000, 102 | template_qa_chain=None, 103 | template_context_summarization=None, 104 | numChunks=10, 105 | vectorColName="contentVector", 106 | chunkColName="Chunk", 107 | to_debug=False 108 | ): 109 | 110 | # ACS 111 | self.search_client = acs_search_client 112 | self.numChunks = numChunks 113 | self.vectorColName = vectorColName 114 | self.chunkColName = chunkColName 115 | 116 | # LLM chain 117 | self.llm = llm 118 | self.max_token_for_context = max_token_for_context 119 | 120 | if template_qa_chain: 121 | self.template_qa_chain = template_qa_chain 122 | else: 123 | self.template_qa_chain= """You are a chatbot having a conversation with a human. 124 | Given the Context, Chat History, and Human Query, 125 | answer without hallucinating. 126 | If you don't have the answer say 'I don't have the answer' 127 | """ 128 | 129 | if template_context_summarization: 130 | self.template_context_summarization = template_context_summarization 131 | else: 132 | self.template_context_summarization = """Summarize the context so it includes 133 | the details related to the human query. 134 | """ 135 | 136 | # Memory chain 137 | self.qa_chain = qa_chain_ConversationSummaryMemory( 138 | prefix_template=self.template_qa_chain, 139 | to_debug=to_debug, 140 | llm=self.llm 141 | ) 142 | 143 | # Transcripts specific 144 | self.ticker = None 145 | self.year = None 146 | self.quarter = None 147 | 148 | 149 | def run(self, human_query): 150 | 151 | queryEmbedding = createEmbeddings(human_query) 152 | ############## Parse query 153 | ticker, year, quarter = queryParser(human_query) 154 | 155 | ## if user query doesn't contain ticker, year, and quarter, use the previous one 156 | if ticker != None: 157 | self.ticker = ticker 158 | if year != str(None): 159 | self.year = year 160 | if quarter != None: 161 | self.quarter = quarter 162 | 163 | ## If ticker, year, and quarter are not found, return error message 164 | if self.ticker == None or self.year == None or self.quarter == None: 165 | print(self.ticker, self.year, self.quarter) 166 | return "Sorry, please provide the ticker , year , and quarter . Example - ticker MSFT, quarter 3, year 23." 
167 | 168 | ############### Retrieve from ACS 169 | output = acs_retriever( 170 | self.search_client, 171 | query=human_query, 172 | queryEmbedding=queryEmbedding, 173 | colName=['Ticker', 'Year', 'Quarter'], 174 | colVal=[self.ticker, self.year, self.quarter], 175 | searchtype=None, 176 | numChunks=self.numChunks, 177 | vectorColName=self.vectorColName 178 | ) 179 | 180 | #################### Combine Context 181 | context_list = [i[self.chunkColName] for i in output] 182 | 183 | context_all = combine_docs( 184 | context_list, 185 | to_debug=False, 186 | llm=self.llm, 187 | max_tokens=self.max_token_for_context, 188 | user_query=human_query, 189 | prefix_template=self.template_context_summarization 190 | ) 191 | 192 | ## Append Ticker, Year, Quarter to the context 193 | context_all = "\nTicker: " + self.ticker + "\nYear: " + self.year + "\nQuarter: " + self.quarter + "\n"+ context_all 194 | 195 | # Augment and Generate Answer 196 | # qa_chain below is a predefined chain with memory. It summarizes the chat history and augment to the context 197 | answer = self.qa_chain.run({'context': context_all,'human_input': human_query}) 198 | return answer 199 | 200 | 201 | def retrieveChatHistory(self): 202 | return self.qa_chain.memory.chat_memory 203 | 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /samples/financial_transcripts/images/MSFT.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/rag-e2e-sample/96459d2e40f2778030f36e8812e37deecf34335f/samples/financial_transcripts/images/MSFT.jpg -------------------------------------------------------------------------------- /samples/financial_transcripts/images/Microsoft_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/rag-e2e-sample/96459d2e40f2778030f36e8812e37deecf34335f/samples/financial_transcripts/images/Microsoft_logo.png -------------------------------------------------------------------------------- /samples/financial_transcripts/images/Microsoft_logo.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /samples/financial_transcripts/images/chatbot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/rag-e2e-sample/96459d2e40f2778030f36e8812e37deecf34335f/samples/financial_transcripts/images/chatbot.jpg -------------------------------------------------------------------------------- /samples/financial_transcripts/images/openai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/rag-e2e-sample/96459d2e40f2778030f36e8812e37deecf34335f/samples/financial_transcripts/images/openai.png -------------------------------------------------------------------------------- /samples/financial_transcripts/llm_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | from dotenv import dotenv_values 4 | from langchain.chat_models import AzureChatOpenAI 5 | from azure.core.credentials import AzureKeyCredential 6 | from azure.search.documents import SearchClient 7 | import time 8 | import sys 9 | from chatBot import chatBot 10 | sys.path.append("../..") ## add directory above 11 | from 
rag_skills.utils import count_tokens 12 | 13 | ### Cofigurations 14 | VERBOSE = True 15 | TEMPERATURE = 0.0 16 | TOP_P = 1.0 17 | NUM_CHUNKS = 10 18 | MAX_TOKEN_FOR_CONTEXT = 27000 19 | VECTOR_COL_NAME = "Embedding" ## Column name in ACS for vector embedding 20 | CHUNK_NAME = "Chunk" ## Column name in ACS for text data that contains the context 21 | 22 | TEMPLATE_QA_CHAIN = """You are a chatbot having a conversation with a human. 23 | Given the Context, Chat History, and Human Query, 24 | answer without hallucinating. If you don't have the answer say "I don't have the answer" """ 25 | 26 | TEMPLATE_CONTEXT_SUMMARIZATION = """ 27 | Summarize the context so it includes the details related to the human query. """ 28 | 29 | # Get the absolute path to the .env file 30 | env_name = os.path.join(os.path.dirname(__file__), "llm.env") 31 | 32 | # Load environment variables from the .env file 33 | config = dotenv_values(env_name) 34 | 35 | if len(config) == 0: 36 | env_name = os.path.join(os.path.dirname(__file__), "../../llm.env") 37 | config = dotenv_values(env_name) 38 | 39 | if len(config) == 0: 40 | raise Exception("No environment variables loaded. Please check the *.env file.") 41 | 42 | for key, value in config.items(): 43 | os.environ[key] = value 44 | 45 | # LOAD OpenAI configs 46 | openai.api_type = config["OPENAI_API_TYPE"] 47 | openai.api_key = config['OPENAI_API_KEY'] 48 | openai.api_base = config['OPENAI_API_BASE'] 49 | openai.api_version = config['OPENAI_API_VERSION'] 50 | print("ENV VARIABLES LOADED") 51 | 52 | # Model choice 53 | DEPLOYMENT_NAME = config['OPENAI_DEPLOYMENT_COMPLETION'] 54 | 55 | ## Azure cognitive search 56 | cogsearch_name = os.getenv("COGSEARCH_NAME") 57 | index_name = os.getenv("COGSEARCH_INDEX_NAME") 58 | cogsearch_api_key = os.getenv("COGSEARCH_API_KEY") 59 | service_endpoint = "https://" + config["COGSEARCH_NAME"] + ".search.windows.net" 60 | 61 | credential = AzureKeyCredential(cogsearch_api_key) 62 | search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential) 63 | 64 | optional_params = { 65 | 'top_p': TOP_P, 66 | } 67 | 68 | llm = AzureChatOpenAI( 69 | deployment_name=DEPLOYMENT_NAME, 70 | openai_api_base=openai.api_base, 71 | openai_api_version=openai.api_version, 72 | openai_api_key=openai.api_key, 73 | openai_api_type = openai.api_type, 74 | temperature=TEMPERATURE, 75 | model_kwargs=optional_params, 76 | verbose=VERBOSE, 77 | ) 78 | 79 | ## Chatbot class that implements the conversation agent 80 | 81 | cb = chatBot( 82 | llm, 83 | search_client, 84 | max_token_for_context=MAX_TOKEN_FOR_CONTEXT, 85 | template_qa_chain=TEMPLATE_QA_CHAIN, 86 | template_context_summarization=TEMPLATE_CONTEXT_SUMMARIZATION, 87 | numChunks=NUM_CHUNKS, 88 | vectorColName=VECTOR_COL_NAME, 89 | chunkColName=CHUNK_NAME, 90 | to_debug=VERBOSE 91 | ) 92 | 93 | def get_answer(msg): 94 | ans = cb.run(msg) 95 | 96 | return ans 97 | 98 | 99 | if __name__ == '__main__': 100 | question = """what are the top 3 themes in the earnings call transcripts from ticker MSFT for the quarter 1 in year 23? 
101 | """ 102 | 103 | start = time.time() 104 | ans = get_answer(question) 105 | end = time.time() 106 | print(ans) 107 | print("Time elapsed: {}".format(end-start)) 108 | 109 | result_num_tokens = count_tokens(ans) 110 | print("Response num tokens: {}".format(result_num_tokens)) 111 | 112 | ans = get_answer("Is it possible to get more details about cloud ?") 113 | print(ans) -------------------------------------------------------------------------------- /samples/financial_transcripts/st_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | streamlit: 3 | title: "MSFT - Financial earnings call assistant" 4 | tab_title: "LLM assistant" 5 | logo: "images/Microsoft_logo.png" 6 | page_icon: "images/Microsoft_logo.png" 7 | avatar: "images/openai.png" 8 | assistant_intro_message: "Hi there :wave:, I'm an AI assistant. I can look up information through a database of earnings call transcripts to answer your questions. Please always specify the ticker and quarter in your question for example: \"ticker MSFT, quarter 1, year 23. How did MSFT do compared to last quarter?\"" 9 | about: "A demo of an AI assistant. Powered by Azure OpenAI Large Language Models and search using native vector search capabilities on PostGres, authors: Azure Data/Applied AI team" 10 | azure: 11 | dummy: "test" 12 | -------------------------------------------------------------------------------- /samples/financial_transcripts/st_main.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import yaml 3 | import os 4 | import openai 5 | 6 | from llm_app import get_answer 7 | 8 | # Read config yaml file 9 | with open('st_config.yaml', 'r') as file: 10 | config = yaml.safe_load(file) 11 | #print(config) 12 | title = config['streamlit']['title'] 13 | avatar = { 14 | 'user': None, 15 | 'assistant': config['streamlit']['avatar'] 16 | } 17 | 18 | # Set page config 19 | st.set_page_config( 20 | page_title=config['streamlit']['tab_title'], 21 | page_icon=config['streamlit']['page_icon'], 22 | ) 23 | 24 | # Set sidebar 25 | st.sidebar.image(config['streamlit']['logo'], width=50) 26 | st.sidebar.title("About") 27 | st.sidebar.info(config['streamlit']['about']) 28 | 29 | # Set logo 30 | #st.image(config['streamlit']['logo'], width=50) 31 | 32 | # Set page title 33 | st.title(title) 34 | 35 | # Initialize chat history 36 | if "messages" not in st.session_state: 37 | st.session_state.messages = [] 38 | st.session_state.messages.append({ 39 | "role": "assistant", 40 | "content": config['streamlit']['assistant_intro_message'] 41 | }) 42 | 43 | # Display chat messages from history on app rerun 44 | for message in st.session_state.messages: 45 | with st.chat_message(message["role"], avatar=avatar[message["role"]]): 46 | st.markdown(message["content"]) 47 | 48 | # React to user input 49 | if prompt := st.chat_input("Send a message"): 50 | # Add user message to chat history 51 | st.session_state.messages.append({"role": "user", "content": prompt}) 52 | # Display user message in chat message container 53 | with st.chat_message("user"): 54 | st.markdown(prompt) 55 | # Get bot response 56 | response = get_answer(prompt) 57 | with st.chat_message("assistant", avatar=config['streamlit']['avatar']): 58 | st.markdown(response) 59 | # Add assistant response to chat history 60 | st.session_state.messages.append({"role": "assistant", "content": response}) 61 | 62 | 63 | 64 | 
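# Note: st_main.py is a thin wrapper around llm_app.get_answer, so the pipeline can be
# smoke-tested without the UI. A minimal sketch (assuming llm.env is populated and the
# Azure Cognitive Search index built in the notebooks below already exists):
#
#   from llm_app import get_answer
#   print(get_answer("ticker MSFT, quarter 2, year 23. What were the Azure highlights?"))
#   # A follow-up may omit ticker/year/quarter; the chatbot reuses the previous values.
#   print(get_answer("How did operating expenses change?"))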
-------------------------------------------------------------------------------- /samples/financial_transcripts/step0_data_preprocessor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Step1: Download data from azure blob storage\n", 8 | "\n", 9 | "This code downloads the Msft financial transcripts from a blob storage. If you do not have access to blob, then download \"Microsoft Earning Call Transcripts\" for four quarters for year 2023 and put it in \"Data\" folder. Make sure to rename the file similar to \"MSFTTranscriptFY23Q4.docx\"\n", 10 | "\n", 11 | "Msft Earning Call Transcripts for 2023-Q4\n", 12 | "https://www.fool.com/earnings/call-transcripts/2023/07/25/microsoft-msft-q4-2023-earnings-call-transcript/\n", 13 | "\n" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from azure.storage.blob import BlobServiceClient\n", 23 | "import os\n", 24 | "from pathlib import Path\n", 25 | "\n", 26 | "# Name of the container in the Blob Storage\n", 27 | "container_name = \"public\"\n", 28 | "\n", 29 | "# Local directory path to save the downloaded files\n", 30 | "local_directory = Path(\"DATA/\")\n", 31 | "\n", 32 | "def download_files_from_blob_storage(container_name, local_directory):\n", 33 | " # Create a BlobServiceClient using the default credentials (public access)\n", 34 | " blob_service_client = BlobServiceClient.from_connection_string(\"DefaultEndpointsProtocol=https;AccountName=appliedaipublicdata;EndpointSuffix=core.windows.net\")\n", 35 | "\n", 36 | " # Get a reference to the container\n", 37 | " container_client = blob_service_client.get_container_client(container_name)\n", 38 | "\n", 39 | " # List all blobs in the container\n", 40 | " blob_list = container_client.list_blobs()\n", 41 | "\n", 42 | " for blob in blob_list:\n", 43 | " blob_name = blob.name\n", 44 | " print(blob_name)\n", 45 | " \n", 46 | " # Check if the blob has a .docx extension (Word document)\n", 47 | " if blob_name.lower().endswith(\".docx\"):\n", 48 | " blob_client = container_client.get_blob_client(blob_name)\n", 49 | " \n", 50 | " # Construct the local file path to save the blob\n", 51 | " local_file_path = os.path.join(local_directory, blob_name.split(\"/\")[-1]) # Use only the last part of the blob path\n", 52 | " \n", 53 | " # Download the blob to the local directory\n", 54 | " with open(local_file_path, \"wb\") as local_file:\n", 55 | " blob_data = blob_client.download_blob()\n", 56 | " local_file.write(blob_data.readall())\n", 57 | " \n", 58 | " print(f\"Downloaded: {blob_name}\")\n", 59 | "\n", 60 | "\n", 61 | "download_files_from_blob_storage(container_name, local_directory)\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Step 2: Convert .docx to .pdf format" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "from docx2pdf import convert\n", 78 | "import os\n", 79 | "\n", 80 | "directory = Path('DATA')\n", 81 | "docx_files = [filename for filename in os.listdir(directory) if filename.endswith('.docx')]\n", 82 | "name_len_docx = []\n", 83 | "name_len_pdf = []\n", 84 | "print(len(docx_files))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | 
"for filename in docx_files:\n", 94 | " \n", 95 | " docx_path = os.path.join(directory, filename)\n", 96 | " # if len(filename)>35:\n", 97 | " # filename = filename[:35]\n", 98 | " pdf_path = os.path.join(directory, f\"{os.path.splitext(filename)[0]}.pdf\")\n", 99 | "\n", 100 | " # Check if PDF already exists\n", 101 | " if os.path.exists(pdf_path):\n", 102 | " print(f\"Skipping conversion for {filename}. PDF already exists.\")\n", 103 | " continue\n", 104 | "\n", 105 | " name_len_docx.append(len(docx_path))\n", 106 | " print(filename, name_len_docx)\n", 107 | " name_len_pdf.append(len(pdf_path))\n", 108 | " print(name_len_pdf)\n", 109 | " try: \n", 110 | " convert(docx_path, pdf_path)\n", 111 | " except:\n", 112 | " print('Error in converting file, retrying...')\n", 113 | " try:\n", 114 | " convert(docx_path, pdf_path)\n", 115 | " except:\n", 116 | " Exception(\"Error in converting file\")\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "nbdev", 130 | "language": "python", 131 | "name": "nbdev" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.8.11" 144 | }, 145 | "orig_nbformat": 4 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } 150 | -------------------------------------------------------------------------------- /samples/financial_transcripts/step1_chunk_and_extract.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from dotenv import dotenv_values\n", 10 | "from pathlib import Path\n", 11 | "import os\n", 12 | "import pandas as pd\n", 13 | "from azure.core.credentials import AzureKeyCredential\n", 14 | "from azure.ai.formrecognizer import DocumentAnalysisClient\n", 15 | "\n", 16 | "# specify the name of the .env file name \n", 17 | "env_name = \"../../llm.env\" # change to your own .env file name\n", 18 | "config = dotenv_values(env_name)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "# Extract data and context using Azure Form Recognizer\n", 26 | "\n", 27 | "This code sample shows Prebuilt Document operations with the Azure Form Recognizer client library. \n", 28 | "The async versions of the samples require Python 3.6 or later.\n", 29 | "\n", 30 | "To learn more, please visit the documentation - Quickstart: Form Recognizer Python client library SDKs\n", 31 | "https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quickstarts/try-v3-python-sdk\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "\"\"\"\n", 41 | "Remember to remove the key from your code when you're done, and never post it publicly. For production, use\n", 42 | "secure methods to store and access your credentials. 
For more information, see \n", 43 | "https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration\n", 44 | "\"\"\"\n", 45 | "\n", 46 | "endpoint = config[\"AZURE_FORM_RECOGNIZER_ENDPOINT\"]\n", 47 | "key = config[\"AZURE_FORM_RECOGNIZER_KEY\"]\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Read pdf files using Azure Form Recognizer and split into chunks \n", 55 | "Azure form recognizer reads pdf files and then we chunk the extracted text, and also save page number and line number for the extracted chunks " 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "import re\n", 65 | "\n", 66 | "################################################################################\n", 67 | "#################### Helper Functions ##########################################\n", 68 | "################################################################################\n", 69 | "\n", 70 | "# Read pdf files\n", 71 | "def analyze_pdf(doc_path): \n", 72 | " with open(doc_path, \"rb\") as f:\n", 73 | " poller = document_analysis_client.begin_analyze_document(\n", 74 | " \"prebuilt-document\", document=f\n", 75 | " )\n", 76 | " result = poller.result()\n", 77 | " \n", 78 | " return result\n", 79 | "\n", 80 | "# Extract stock symbol, year, and quarter from filename\n", 81 | "def extract_info_from_filename(filename):\n", 82 | " '''\n", 83 | " Input: filename (\"MSFTTranscriptFY23Q4\")\n", 84 | " Output: Extract stock symbol, year and quarter from filename\n", 85 | " '''\n", 86 | " pattern = r'([A-Z]+)TranscriptFY(\\d{2})Q(\\d)'\n", 87 | " match = re.search(pattern, filename)\n", 88 | " \n", 89 | " if match:\n", 90 | " symbol = match.group(1)\n", 91 | " fiscal_year = match.group(2)\n", 92 | " fiscal_quarter = match.group(3)\n", 93 | " return symbol, fiscal_year, fiscal_quarter\n", 94 | " else:\n", 95 | " return None\n", 96 | "\n", 97 | "# Extract line number and page number\n", 98 | "def create_line_page_tuples(result):\n", 99 | " '''\n", 100 | " Input: result of form recognizer analyze_pdf function\n", 101 | " Output: Create list of tuples of the form (line, page_num, line_num) \n", 102 | " This will keep reference of the line number and page number of each line in the document.\n", 103 | " '''\n", 104 | " line_page_tuples = []\n", 105 | "\n", 106 | " total_pages = len(result.pages)\n", 107 | " for page_num in range(total_pages):\n", 108 | " lines = result.pages[page_num].lines\n", 109 | " total_lines = len(lines)\n", 110 | "\n", 111 | " for line_num in range(total_lines):\n", 112 | " line = lines[line_num].content\n", 113 | " line_page_tuples.append((line, page_num + 1, line_num + 1))\n", 114 | "\n", 115 | " return line_page_tuples\n", 116 | "\n", 117 | "# Retrieve page number and chunks\n", 118 | "def chunk_with_page_number(line_page_tuples, chunk_length=10, chunk_overlap=2):\n", 119 | " '''\n", 120 | " Given the list of tuples of the form (line, page_num, line_num) and chunk length and overlap,\n", 121 | " it will create chunks of text with page number and line number of the first line in the chunk.\n", 122 | " chunk length: number of lines in each chunk\n", 123 | " chunk_overlap: number of overlapping lines between chunks\n", 124 | " '''\n", 125 | " pointer = 0 \n", 126 | " chunks = []\n", 127 | " total_lines = len(line_page_tuples)\n", 128 | " #for line, page_number, 
line_number in line_page_tuples:\n", 129 | " while pointer < total_lines:\n", 130 | " line_count = 0\n", 131 | " current_chunk = \"\"\n", 132 | " if not chunks: \n", 133 | " # for first chunk we can not use overlap\n", 134 | " pointer = 0\n", 135 | " else:\n", 136 | " pointer = pointer - chunk_overlap\n", 137 | " \n", 138 | " # take starting page number and line number \n", 139 | " page_number, line_number = line_page_tuples[pointer][1:] \n", 140 | " while line_count < chunk_length and pointer < total_lines:\n", 141 | " current_chunk = current_chunk + line_page_tuples[pointer][0]\n", 142 | " current_chunk = current_chunk + \" \"\n", 143 | " line_count += 1\n", 144 | " pointer += 1\n", 145 | " chunks.append((current_chunk, page_number, line_number))\n", 146 | " return chunks\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 4, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "writing the results of: \n", 159 | "MSFTTranscriptFY23Q1.pdf\n", 160 | "writing the results of: \n", 161 | "MSFTTranscriptFY23Q2.pdf\n", 162 | "writing the results of: \n", 163 | "MSFTTranscriptFY23Q3.pdf\n", 164 | "writing the results of: \n", 165 | "MSFTTranscriptFY23Q4.pdf\n" 166 | ] 167 | }, 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
[HTML rendering of the chunk DataFrame preview removed; the same rows appear in the text/plain output below.]
" 252 | ], 253 | "text/plain": [ 254 | " Id Ticker Year Quarter Chunk \\\n", 255 | "0 1 MSFT 23 1 Microsoft FY23 First Quarter Earnings Conferen... \n", 256 | "1 2 MSFT 23 1 On the Microsoft Investor Relations website, y... \n", 257 | "2 3 MSFT 23 1 GAAP. They are included as additional clarifyi... \n", 258 | "3 4 MSFT 23 1 same in constant currency, we will refer to th... \n", 259 | "4 5 MSFT 23 1 predictions, projections, or other statements ... \n", 260 | "\n", 261 | " PageNumber LineNumber \n", 262 | "0 1 1 \n", 263 | "1 1 9 \n", 264 | "2 1 17 \n", 265 | "3 2 6 \n", 266 | "4 2 14 " 267 | ] 268 | }, 269 | "execution_count": 4, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "# Define document analysis client\n", 276 | "document_analysis_client = DocumentAnalysisClient(\n", 277 | " endpoint=endpoint, credential=AzureKeyCredential(key)\n", 278 | " )\n", 279 | "\n", 280 | "doc_dir = Path(\"DATA/\")\n", 281 | "pdf_files = [filename for filename in os.listdir(doc_dir) if filename.endswith('.pdf')]\n", 282 | "\n", 283 | "dfs = []\n", 284 | "\n", 285 | "for file_name in pdf_files:\n", 286 | " \n", 287 | " values = extract_info_from_filename(file_name) # symbol, fiscal_year, fiscal_quarter\n", 288 | " file_path = os.path.join(doc_dir, f\"{os.path.splitext(file_name)[0]}.pdf\")\n", 289 | " \n", 290 | " # analyze the pdf using form recognizer\n", 291 | " result = analyze_pdf(file_path)\n", 292 | " \n", 293 | " # get the chunks in a tuple of the form (chunk, page_number, line_number)\n", 294 | " line_page_tuples = create_line_page_tuples(result)\n", 295 | " chunks = chunk_with_page_number(line_page_tuples=line_page_tuples, chunk_length=10, chunk_overlap=2)\n", 296 | " \n", 297 | " # Write results to dataframe \n", 298 | " df_chunks = pd.DataFrame(chunks, columns = ['Chunk', 'PageNumber', 'LineNumber']) \n", 299 | "\n", 300 | " df_chunks[\"Ticker\"], df_chunks[\"Year\"], df_chunks[\"Quarter\"] = \"NULL\", \"NULL\", \"NULL\"\n", 301 | " if values:\n", 302 | " symbol, fiscal_year, fiscal_quarter = values\n", 303 | " df_chunks[\"Ticker\"], df_chunks[\"Year\"], df_chunks[\"Quarter\"] = symbol, fiscal_year, fiscal_quarter\n", 304 | " \n", 305 | " # Reorder dataframe column name\n", 306 | " new_column_order = ['Ticker', 'Year', 'Quarter', 'Chunk', 'PageNumber', 'LineNumber']\n", 307 | " df_chunks = df_chunks[new_column_order]\n", 308 | " \n", 309 | " # Add all datframe to list\n", 310 | " dfs.append(df_chunks)\n", 311 | "\n", 312 | " # Saving results to csv files\n", 313 | " if not os.path.exists(\"AnalyzedPDF/\"):\n", 314 | " os.makedirs(\"AnalyzedPDF/\")\n", 315 | "\n", 316 | " print('writing the results of: \\n' + file_name) \n", 317 | " if not os.path.exists(f\"AnalyzedPDF/Chunks_{file_name[0:-4]}.csv\"):\n", 318 | " df_chunks.to_csv(f\"AnalyzedPDF/Chunks_{file_name[0:-4]}.csv\", index=False)\n", 319 | " else:\n", 320 | " print(f'File: chunks_{file_name}.csv already exists, skipping...')\n", 321 | " \n", 322 | "## Combine all the files\n", 323 | "df = pd.concat(dfs, ignore_index=True)\n", 324 | "df = df.reset_index(drop=True)\n", 325 | "df.insert(0, 'Id', [i for i in range(1, df.shape[0]+1)]) # Add 'Id' column\n", 326 | "\n", 327 | "## Save to csv\n", 328 | "df.to_csv(\"AnalyzedPDF/Chunks.csv\", index=False)\n", 329 | "\n", 330 | "df.head()\n" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [] 339 | } 340 | ], 341 | "metadata": { 342 | "kernelspec": { 343 
| "display_name": "nanogpt", 344 | "language": "python", 345 | "name": "nanogpt" 346 | }, 347 | "language_info": { 348 | "codemirror_mode": { 349 | "name": "ipython", 350 | "version": 3 351 | }, 352 | "file_extension": ".py", 353 | "mimetype": "text/x-python", 354 | "name": "python", 355 | "nbconvert_exporter": "python", 356 | "pygments_lexer": "ipython3", 357 | "version": "3.8.16" 358 | } 359 | }, 360 | "nbformat": 4, 361 | "nbformat_minor": 2 362 | } 363 | -------------------------------------------------------------------------------- /samples/financial_transcripts/step2_embed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chunk Embedding using Azure OpenAI " 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Load environment variables and keys " 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from dotenv import dotenv_values\n", 24 | "\n", 25 | "# specify the name of the .env file name \n", 26 | "env_name = \"../../llm.env\" # change to your own .env file name\n", 27 | "config = dotenv_values(env_name)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "#### Load the chunks and create embedding\n", 35 | "In this section, we will load the data into a pandas dataframe, use select columns, and create vector embedding using azure open ai. " 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import openai\n", 45 | "import pandas as pd\n", 46 | "import pandas as pd\n", 47 | "import numpy as np\n", 48 | "import time\n", 49 | "\n", 50 | "openai.api_type = config[\"OPENAI_API_TYPE\"] \n", 51 | "openai.api_key = config[\"OPENAI_API_KEY\"]\n", 52 | "openai.api_base = config[\"OPENAI_API_BASE\"] \n", 53 | "openai.api_version = config[\"OPENAI_API_VERSION\"] \n", 54 | "\n", 55 | "def createEmbeddings(text):\n", 56 | " response = openai.Embedding.create(input=text , engine=config[\"OPENAI_DEPLOYMENT_EMBEDDING\"])\n", 57 | " embeddings = response['data'][0]['embedding']\n", 58 | " return embeddings\n", 59 | "\n", 60 | "# Read data into a DataFrame\n", 61 | "df = pd.read_csv('AnalyzedPDF/Chunks.csv')\n", 62 | "\n", 63 | "# Create a new column called 'embedding' in the DataFrame\n", 64 | "df['Embedding'] = np.empty((len(df),), dtype=object)\n", 65 | "\n", 66 | "# Iterate over each row in the DataFrame and assign the concatenation and embeddings\n", 67 | "for index, row in df.iterrows():\n", 68 | " text = row['Chunk']\n", 69 | " \n", 70 | " # Concatenate the desired columns\n", 71 | " concat_text = f\"{text}\"\n", 72 | " \n", 73 | " # Create embeddings using the provided function\n", 74 | " embeddings = createEmbeddings(concat_text)\n", 75 | " #print(embeddings)\n", 76 | " \n", 77 | " # Assign the embeddings to the 'embedding' column\n", 78 | " df.at[index, 'Embedding'] = embeddings\n", 79 | " time.sleep(0.1)\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "We will rename the column names and add a new column as primary index." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/html": [ 97 | "
[HTML rendering of the embeddings DataFrame preview (442 rows × 8 columns) removed; the same preview appears in the text/plain output below.]
" 251 | ], 252 | "text/plain": [ 253 | " Id Ticker Year Quarter \\\n", 254 | "0 1 MSFT 23 1 \n", 255 | "1 2 MSFT 23 1 \n", 256 | "2 3 MSFT 23 1 \n", 257 | "3 4 MSFT 23 1 \n", 258 | "4 5 MSFT 23 1 \n", 259 | ".. ... ... ... ... \n", 260 | "437 438 MSFT 23 4 \n", 261 | "438 439 MSFT 23 4 \n", 262 | "439 440 MSFT 23 4 \n", 263 | "440 441 MSFT 23 4 \n", 264 | "441 442 MSFT 23 4 \n", 265 | "\n", 266 | " Chunk PageNumber \\\n", 267 | "0 Microsoft FY23 First Quarter Earnings Conferen... 1 \n", 268 | "1 On the Microsoft Investor Relations website, y... 1 \n", 269 | "2 GAAP. They are included as additional clarifyi... 1 \n", 270 | "3 same in constant currency, we will refer to th... 2 \n", 271 | "4 predictions, projections, or other statements ... 2 \n", 272 | ".. ... ... \n", 273 | "437 Can you just talk about where customers are ri... 44 \n", 274 | "438 complement, I'll call it, your databases, beca... 45 \n", 275 | "439 with a very disruptive business model. I mean,... 45 \n", 276 | "440 architecture lays out, our business model arou... 46 \n", 277 | "441 speaking with all of you soon. SATYA NADELLA: ... 46 \n", 278 | "\n", 279 | " LineNumber Embedding \n", 280 | "0 1 [-0.022691458463668823, -0.028929660096764565,... \n", 281 | "1 9 [-0.022940216585993767, -0.008343684487044811,... \n", 282 | "2 17 [-0.01130777969956398, -0.0038822712376713753,... \n", 283 | "3 6 [-0.017685849219560623, -0.02943631075322628, ... \n", 284 | "4 14 [-0.00915693398565054, -0.019673412665724754, ... \n", 285 | ".. ... ... \n", 286 | "437 19 [-0.004939808044582605, 0.000936132506467402, ... \n", 287 | "438 7 [-0.0132768414914608, 0.004370962269604206, -0... \n", 288 | "439 15 [-0.013180367648601532, -0.007650672923773527,... \n", 289 | "440 2 [0.003990992438048124, -0.0018922516610473394,... \n", 290 | "441 10 [-0.013442852534353733, -0.01743759959936142, ... \n", 291 | "\n", 292 | "[442 rows x 8 columns]" 293 | ] 294 | }, 295 | "execution_count": 3, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "# Print the DataFrame with 'Id' as the first column after index\n", 302 | "df.head(1000)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Use the following code to save the embeddings and processed data for future use or skip the previous part of the code and and load the processed data to save into the db. " 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 4, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "# save CSV for future use. 
\n", 319 | "df.to_csv('AnalyzedPDF/ChunksEmbedding.csv', index=False)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [] 328 | } 329 | ], 330 | "metadata": { 331 | "kernelspec": { 332 | "display_name": "nanogpt", 333 | "language": "python", 334 | "name": "nanogpt" 335 | }, 336 | "language_info": { 337 | "codemirror_mode": { 338 | "name": "ipython", 339 | "version": 3 340 | }, 341 | "file_extension": ".py", 342 | "mimetype": "text/x-python", 343 | "name": "python", 344 | "nbconvert_exporter": "python", 345 | "pygments_lexer": "ipython3", 346 | "version": "3.8.16" 347 | } 348 | }, 349 | "nbformat": 4, 350 | "nbformat_minor": 2 351 | } 352 | -------------------------------------------------------------------------------- /samples/financial_transcripts/step3_db_storing_vectorsearch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Store chunks into Vector Database using Azure Cognitive Search (ACS)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 28, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import re\n", 18 | "import pandas as pd\n", 19 | "import json \n", 20 | "import openai \n", 21 | "from dotenv import load_dotenv\n", 22 | "from tenacity import retry, wait_random_exponential, stop_after_attempt \n", 23 | "from azure.core.credentials import AzureKeyCredential \n", 24 | "from azure.search.documents import SearchClient \n", 25 | "from azure.search.documents.indexes import SearchIndexClient \n", 26 | "from azure.search.documents.models import Vector \n", 27 | "from azure.search.documents.indexes.models import ( \n", 28 | " SearchIndex, \n", 29 | " SearchField, \n", 30 | " SearchFieldDataType, \n", 31 | " SimpleField, \n", 32 | " SearchableField, \n", 33 | " SearchIndex, \n", 34 | " SemanticConfiguration, \n", 35 | " PrioritizedFields, \n", 36 | " SemanticField, \n", 37 | " SearchField, \n", 38 | " SemanticSettings, \n", 39 | " VectorSearch, \n", 40 | " HnswVectorSearchAlgorithmConfiguration\n", 41 | ")\n", 42 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 43 | "from ast import literal_eval" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Load environment variables and keys " 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 17, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from dotenv import dotenv_values\n", 60 | "\n", 61 | "# specify the name of the .env file name \n", 62 | "env_name = \"../../llm.env\" # change to your own .env file name\n", 63 | "config = dotenv_values(env_name)\n", 64 | "\n", 65 | "# Azure OpenAI\n", 66 | "openai.api_type = config[\"OPENAI_API_TYPE\"] #\"azure\"\n", 67 | "openai.api_key = config['OPENAI_API_KEY']\n", 68 | "openai.api_base = config['OPENAI_API_BASE']\n", 69 | "openai.api_version = config['OPENAI_API_VERSION']\n", 70 | "\n", 71 | "## Cog Search\n", 72 | "cogsearch_name = config[\"COGSEARCH_NAME\"]\n", 73 | "index_name = config[\"COGSEARCH_INDEX_NAME\"]\n", 74 | "key = config[\"COGSEARCH_API_KEY\"]\n", 75 | "service_endpoint = \"https://\"+config[\"COGSEARCH_NAME\"] + \".search.windows.net\"\n", 76 | "\n", 77 | "credential = AzureKeyCredential(key)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 34, 83 | "metadata": {}, 84 | 
"outputs": [], 85 | "source": [ 86 | "def createEmbeddings(text):\n", 87 | " response = openai.Embedding.create(input=text , engine=config[\"OPENAI_DEPLOYMENT_EMBEDDING\"])\n", 88 | " embeddings = response['data'][0]['embedding']\n", 89 | " return embeddings" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Store the embeddings in Azure Cognitive Search Vector Store\n", 97 | "\n", 98 | "[AzureCogSearch](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) provides a simple interface to create a vector database, store and retrieve data using vector search. You can read more about [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main) more about Vector Search.\n", 99 | "\n", 100 | "There are two steps to store data in AzureCogSearch vector database:\n", 101 | "- First, we create the index (or schema) of the vector database\n", 102 | "- Second, we add the chunked documents and their embeddings to the vector datastore" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 18, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "df_chunks_embedding = pd.read_csv('AnalyzedPDF/ChunksEmbedding.csv')" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 19, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/html": [ 122 | "
[HTML rendering of the DataFrame preview removed; the same rows appear in the text/plain output below.]
" 187 | ], 188 | "text/plain": [ 189 | " Id Ticker Year Quarter \\\n", 190 | "0 1 MSFT 23 1 \n", 191 | "1 2 MSFT 23 1 \n", 192 | "2 3 MSFT 23 1 \n", 193 | "\n", 194 | " Chunk PageNumber LineNumber \\\n", 195 | "0 Microsoft FY23 First Quarter Earnings Conferen... 1 1 \n", 196 | "1 On the Microsoft Investor Relations website, y... 1 9 \n", 197 | "2 GAAP. They are included as additional clarifyi... 1 17 \n", 198 | "\n", 199 | " Embedding \n", 200 | "0 [-0.022691456601023674, -0.028929658234119415,... \n", 201 | "1 [-0.022940216585993767, -0.008343684487044811,... \n", 202 | "2 [-0.01130777969956398, -0.0038822712376713753,... " 203 | ] 204 | }, 205 | "execution_count": 19, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "df_chunks_embedding.head(3)\n", 212 | "#columns should look like the following with order preserved\n", 213 | "#Id, Chunk, PageNumber, LineNumber, DocId, Embedding" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 20, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | " rag_prop_j_3 created\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "\n", 231 | "# Create a search index\n", 232 | "index_client = SearchIndexClient(\n", 233 | " endpoint=service_endpoint, credential=credential)\n", 234 | "fields = [\n", 235 | " SimpleField(name=\"Id\", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),\n", 236 | " SearchableField(name=\"Ticker\", type=SearchFieldDataType.String, filterable=True),\n", 237 | " SearchableField(name=\"Year\", type=SearchFieldDataType.String, filterable=True),\n", 238 | " SearchableField(name=\"Quarter\", type=SearchFieldDataType.String, filterable=True),\n", 239 | " SearchableField(name=\"Chunk\", type=SearchFieldDataType.String, searchable=True),\n", 240 | " SearchableField(name=\"PageNumber\", type=SearchFieldDataType.String, filterable=True),\n", 241 | " SearchableField(name=\"LineNumber\", type=SearchFieldDataType.String, filterable=True),\n", 242 | " \n", 243 | " SearchField(name=\"Embedding\", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),\n", 244 | " searchable=True, vector_search_dimensions=1536, vector_search_configuration=\"my-vector-config\"),\n", 245 | "]\n", 246 | "\n", 247 | "vector_search = VectorSearch(\n", 248 | " algorithm_configurations=[\n", 249 | " HnswVectorSearchAlgorithmConfiguration(\n", 250 | " name=\"my-vector-config\",\n", 251 | " kind=\"hnsw\",\n", 252 | " parameters={\n", 253 | " \"m\": 4,\n", 254 | " \"efConstruction\": 400,\n", 255 | " \"efSearch\": 500,\n", 256 | " \"metric\": \"cosine\"\n", 257 | " }\n", 258 | " )\n", 259 | " ]\n", 260 | ")\n", 261 | "\n", 262 | "semantic_config = SemanticConfiguration(\n", 263 | " name=\"my-semantic-config\",\n", 264 | " prioritized_fields=PrioritizedFields(\n", 265 | " title_field=SemanticField(field_name=\"Ticker\"),\n", 266 | " prioritized_content_fields=[SemanticField(field_name=\"Chunk\")]\n", 267 | " )\n", 268 | ")\n", 269 | "\n", 270 | "# Create the semantic settings with the configuration\n", 271 | "semantic_settings = SemanticSettings(configurations=[semantic_config])\n", 272 | "\n", 273 | "# Create the search index with the semantic settings\n", 274 | "index = SearchIndex(name=index_name, fields=fields,\n", 275 | " vector_search=vector_search, semantic_settings=semantic_settings)\n", 276 | "result = index_client.create_or_update_index(index)\n", 277 | "print(f' 
{result.name} created')" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 31, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "Uploaded 442 payload\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "\n", 295 | "## Upload data to Index\n", 296 | "def batch_append_payload(df, search_client):\n", 297 | " \"\"\"append payload for batch insertion (note: max 1000 rows per insertion) of embeddings to Cognitive Search\"\"\"\n", 298 | " value_list = []\n", 299 | " for index, row in df.iterrows():\n", 300 | " value_list.append(\n", 301 | " {\n", 302 | " \"Id\": str(index),\n", 303 | " \"Ticker\": row[\"Ticker\"],\n", 304 | " \"Year\": str(row[\"Year\"]),\n", 305 | " \"Quarter\": str(row[\"Quarter\"]),\n", 306 | " \"Chunk\": row[\"Chunk\"],\n", 307 | " \"PageNumber\": str(row[\"PageNumber\"]),\n", 308 | " \"LineNumber\": str(row[\"LineNumber\"]),\n", 309 | " \"Embedding\": literal_eval(row['Embedding']),\n", 310 | " }\n", 311 | " )\n", 312 | " \n", 313 | "# print(len(value_list))\n", 314 | " \n", 315 | " if len(value_list) >= 1000:\n", 316 | " result = search_client.upload_documents(value_list)\n", 317 | " print(f\"Uploaded {len(value_list)} payload\")\n", 318 | " value_list = []\n", 319 | " result = search_client.upload_documents(value_list)\n", 320 | " print(f\"Uploaded {len(value_list)} payload\")\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | "# print('payload of size {}'.format(len(value_list)))\n", 325 | "\n", 326 | " return value_list\n", 327 | "\n", 328 | "\n", 329 | "search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)\n", 330 | "payload = batch_append_payload(df_chunks_embedding, search_client)\n", 331 | " \n", 332 | "# print(f\"Uploaded {len(payload)} payload\") \n" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "# Search Types 1: Pure Vector Search" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 51, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "MSFT\n", 352 | "2\n", 353 | "23\n", 354 | "Microsoft FY23 Second Quarter Earnings Conference Call Brett Iversen, Satya Nadella, Amy Hood Tuesday, January 24, 2023 BRETT IVERSEN: Good afternoon and thank you for joining us today. On the call with me are Satya Nadella, chairman and chief executive officer, Amy Hood, chief financial officer, Alice Jolla, chief accounting officer, and Keith Dolliver, deputy general counsel. 
On the Microsoft Investor Relations website, you can find our earnings press release and financial summary slide deck, which is intended to \n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "# Pure Vector Search\n", 360 | "query = \"Microsoft earnings call for year 2022 for Quarter 2\" \n", 361 | " \n", 362 | "search_client = SearchClient(service_endpoint, index_name, credential=credential)\n", 363 | "vector = Vector(value=createEmbeddings(query), k=2, fields=\"Embedding\")\n", 364 | " \n", 365 | "results = search_client.search( \n", 366 | " search_text=None, \n", 367 | " vectors=[vector],\n", 368 | "# select=[\"Ticker\", \"Quarter\", \"Year\"],\n", 369 | ")\n", 370 | "\n", 371 | "# results\n", 372 | " \n", 373 | "for result in results: \n", 374 | " print(result['Ticker'])\n", 375 | " print(result['Quarter'])\n", 376 | " print(result['Year'])\n", 377 | " print(result['Chunk'])\n", 378 | " break" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "# Search Types 2: Pure Filter" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 52, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | "Ticker: MSFT\n", 398 | "Quarter: 1\n", 399 | "Year: 23\n", 400 | "Microsoft FY23 First Quarter Earnings Conference Call Brett Iversen, Satya Nadella, Amy Hood Tuesday, October 25, 2022 BRETT IVERSEN: Good afternoon and thank you for joining us today. On the call with me are Satya Nadella, chairman and chief executive officer, Amy Hood, chief financial officer, Alice Jolla, chief accounting officer, and Keith Dolliver, deputy general counsel. On the Microsoft Investor Relations website, you can find our earnings press release and financial summary slide deck, which is intended to \n", 401 | "\n" 402 | ] 403 | } 404 | ], 405 | "source": [ 406 | "results = search_client.search( \n", 407 | " search_text=None, \n", 408 | " filter=\"(Ticker eq 'MSFT') and (Year eq '23') and (Quarter eq '1') \",\n", 409 | ") \n", 410 | "\n", 411 | "for result in results:\n", 412 | " print(f\"Ticker: {result['Ticker']}\")\n", 413 | " print(f\"Quarter: {result['Quarter']}\") \n", 414 | " print(f\"Year: {result['Year']}\") \n", 415 | " print(result['Chunk'])\n", 416 | " print()\n", 417 | " break\n" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "# Search Types 3: Vector Search with filters" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 53, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "Ticker: MSFT\n", 437 | "Quarter: 1\n", 438 | "Year: 23\n", 439 | "you're still seeing digitization. This is still the tailwind that helps customers solve problems. This is still the way to build growth and leverage in your business. And yet, you still want to optimize your workloads. You still want to run them the most efficiently so that you can then make room for new workload growth. We saw that across all segments. If there was one segment where I may have seen it a bit more, I would say, in the small or mid-sized segment of the market, that tends to be more through partner. We rely on partners to help customers do those same optimizations and prepare workloads. But it is that one point I know that people are focused on. 
\n", 440 | "\n" 441 | ] 442 | } 443 | ], 444 | "source": [ 445 | "# Pure Vector Search with Filter\n", 446 | "query = \"What are the KPIs?\" \n", 447 | " \n", 448 | "search_client = SearchClient(service_endpoint, index_name, credential=credential) \n", 449 | "vector = Vector(value=createEmbeddings(query), k=5, fields=\"Embedding\") \n", 450 | "\n", 451 | "results = search_client.search( \n", 452 | " search_text=None, \n", 453 | " vectors=[vector],\n", 454 | " filter=\"(Ticker eq 'MSFT') and (Year eq '23') and (Quarter eq '1') \",\n", 455 | "# select=[\"Ticker\", \"Quarter\", \"Year\"],\n", 456 | ") \n", 457 | " \n", 458 | "for result in results:\n", 459 | " print(f\"Ticker: {result['Ticker']}\")\n", 460 | " print(f\"Quarter: {result['Quarter']}\") \n", 461 | " print(f\"Year: {result['Year']}\") \n", 462 | " print(result['Chunk'])\n", 463 | " print()\n", 464 | "\n", 465 | " break" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "# Search Types 4: Hybrid Search with filters" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 54, 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "name": "stdout", 482 | "output_type": "stream", 483 | "text": [ 484 | "Ticker: MSFT\n", 485 | "Quarter: 1\n", 486 | "Year: 23\n", 487 | "AMY HOOD: Thanks, Keith, and I do appreciate you asking about that one point, because I do know it is a point of focus every quarter. And what I would say is there is some inherent volatility to that number. A point here or there, and you've heard me say it when we've been a point better, and you've heard me say it when we've been a point worse. And I want to focus mostly on what and how we see the number, which is that it is still a very large growth rate with growth across all segments and with growth across all geos. That was, to the question, generally in line with where we expected. 
And what we did see through the quarter is a real focus both by customers, but also by our sales and customer success teams on \n", 488 | "\n" 489 | ] 490 | } 491 | ], 492 | "source": [ 493 | "# Pure Vector Search with Filter\n", 494 | "query = \"What are the KPIs?\" \n", 495 | " \n", 496 | "search_client = SearchClient(service_endpoint, index_name, credential=credential) \n", 497 | "vector = Vector(value=createEmbeddings(query), k=5, fields=\"Embedding\") \n", 498 | "\n", 499 | "results = search_client.search( \n", 500 | " search_text=query, \n", 501 | " vectors=[vector],\n", 502 | " filter=\"(Ticker eq 'MSFT') and (Year eq '23') and (Quarter eq '1') \",\n", 503 | "# select=[\"Ticker\", \"Quarter\", \"Year\"],\n", 504 | " top = 3\n", 505 | ") \n", 506 | " \n", 507 | "for result in results:\n", 508 | " print(f\"Ticker: {result['Ticker']}\")\n", 509 | " print(f\"Quarter: {result['Quarter']}\") \n", 510 | " print(f\"Year: {result['Year']}\") \n", 511 | " print(result['Chunk'])\n", 512 | " print()\n", 513 | " break" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [] 522 | } 523 | ], 524 | "metadata": { 525 | "kernelspec": { 526 | "display_name": "nanogpt", 527 | "language": "python", 528 | "name": "nanogpt" 529 | }, 530 | "language_info": { 531 | "codemirror_mode": { 532 | "name": "ipython", 533 | "version": 3 534 | }, 535 | "file_extension": ".py", 536 | "mimetype": "text/x-python", 537 | "name": "python", 538 | "nbconvert_exporter": "python", 539 | "pygments_lexer": "ipython3", 540 | "version": "3.8.16" 541 | } 542 | }, 543 | "nbformat": 4, 544 | "nbformat_minor": 2 545 | } 546 | --------------------------------------------------------------------------------