├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── blogs
    ├── opensearch-data-ingestion
    │   ├── 2_kb_to_vectordb_opensearch.ipynb
    │   └── blog_post.qmd
    └── rag
    │   ├── README.md
    │   ├── api
    │       ├── app
    │       │   ├── __init__.py
    │       │   ├── api
    │       │   │   ├── __init__.py
    │       │   │   └── api_v1
    │       │   │   │   ├── __init__.py
    │       │   │   │   ├── api.py
    │       │   │   │   └── endpoints
    │       │   │   │       ├── __init__.py
    │       │   │   │       ├── fastapi_request.py
    │       │   │   │       ├── initialize.py
    │       │   │   │       ├── llm_ep.py
    │       │   │   │       └── sm_helper.py
    │       │   ├── main.py
    │       │   └── requirements.txt
    │       └── deploy.sh
    │   ├── app
    │       ├── requirements.txt
    │       └── webapp.py
    │   ├── blog_post.html
    │   ├── blog_post.md
    │   ├── blog_post.qmd
    │   ├── data_ingestion_to_vectordb.ipynb
    │   ├── img
    │       ├── ML-14328-Amit.png
    │       ├── ML-14328-architecture.drawio
    │       ├── ML-14328-architecture.png
    │       ├── ML-14328-cfn-delete.png
    │       ├── ML-14328-cfn-outputs.png
    │       ├── ML-14328-cloudformation-launch-stack.png
    │       ├── ML-14328-ntuteja.png
    │       ├── ML-14328-service-quota.png
    │       ├── ML-14328-sm-nb-jl.png
    │       ├── ML-14328-sm-nb-path.png
    │       ├── ML-14328-sm-nb-runall.png
    │       ├── ML-14328-streamlit-app.png
    │       └── ML-14328-xinhuang.jpg
    │   └── template.yml
└── workshop
    ├── 0_deploy_models.ipynb
    └── 1_kb_to_vectordb.ipynb


/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__/
 2 | blogs/rag/api/deps/
 3 | blogs/rag/api/function.zip
 4 | function.zip
 5 | env/
 6 | .ipynb_checkpoints/
 7 | 1.py
 8 | RAG_AWS_OPEN_SEARCH_XIN_HUANG_0419.ipynb
 9 | rag.yaml
10 | rag2.yaml
11 | blogs/rag/container/
12 | blogs/rag/docs/
13 | blogs/rag/faiss_index/
14 | blogs/rag/sagemaker.readthedocs.io/
15 | blogs/rag/scripts/
16 | opensearch.yml
17 | 1.*
18 | 4.yaml
19 | blogs\opensearch-data-ingestion\blog_post.qmd
20 | blogs/rag/ML-14328 Blog Post.docx
21 | blogs/rag/blog_post.docx
22 | blogs/rag/blog_post1.htm
23 | blogs/rag/blog_post1_files/
24 | blogs/rag/~$og_post1.htm
25 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT No Attribution
 2 | 
 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so.
10 | 
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17 | 
18 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # LLM Apps Workshop
 2 | 
 3 | This repository provides the source code for Large Language Model (LLM) based applications that are used for blog posts, workshop and demos.
 4 | 
 5 | [Amazon Sagemaker Jumpstart](https://docs.aws.amazon.com/sagemaker/latest/dg/studio-jumpstart.html) makes it simple to host LLMs as [Sagemaker Endpoints](https://docs.aws.amazon.com/sagemaker/latest/dg/realtime-endpoints.html) enabling uses-cases for inference (text to text generation) and embeddings generation.
 6 | 
 7 | The LLM applications in this repo include inference, generating embeddings, question answer (zero-shot and few-shot learning, prompt engineering), retrieval augmented generation (RAG) and domain adapted fine-tuning (coming soon..).
 8 | 
 9 | ## Repository structure
10 | 
11 | The code in this repo is organized into the following sub-folders, each having its own README.md file.
12 | 
13 | ```.
14 | ├── README.md
15 | ├── blogs/
16 | ├── blogs/rag/
17 | ├── blogs/rag/api
18 | ├── blogs/rag/app
19 | ├── workshop/
20 | ```
21 | 
22 | ### Blogs
23 | 
24 | - [Building a Powerful Question Answering Bot with Amazon SageMaker JumpStart, Amazon OpenSearch, Streamlit, and LangChain: A Step-by-Step Guide](./blogs/rag/blog_post.md)
25 | 
26 | ## Security
27 | 
28 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
29 | 
30 | ## License
31 | 
32 | This library is licensed under the MIT-0 License. See the [LICENSE](./LICENSE) file.
33 | 


--------------------------------------------------------------------------------
/blogs/opensearch-data-ingestion/blog_post.qmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Building a Powerful Question Answering Bot with Amazon SageMaker, Amazon OpenSearch, Streamlit, and LangChain: A Step-by-Step Guide"
  3 | format:
  4 |   html:
  5 |     embed-resources: true
  6 |     output-file: blog_post.html
  7 |     theme: cosmo
  8 |     code-copy: true
  9 |     code-line-numbers: true
 10 |     highlight-style: github
 11 |   gfm: 
 12 |     output-file: blog_post.md
 13 | ---
 14 | 
 15 | _Amit Arora_, _Xin Huang_, _Navneet Tuteja_
 16 | 
 17 | One of the most common applications of Generative AI (GenAI) and Large Language Models (LLMs) in an enterprise environment is answering questions based on the enterprise's knowledge corpus. Pre-trained foundation models (FMs) perform well at Natural Language Understanding (NLU) tasks such summarization, text generation and question answering on a broad variety of topics but either struggle to provide accurate (without hallucinations) answers or completely fail at answering questions about content that they have not seen as part of their training data. Furthermore, FMs are trained with a point in time snapshot of data and have no inherent ability to access fresh data at inference time, without this ability they might provide responses that are potentially incorrect or inadequate.
 18 | 
 19 | A commonly used approach to address the above mentioned problem is to use a technique called Retrieval Augumented Generation (RAG). In the RAG approach we convert the user question into vector embeddings using an LLM and then do a similarity search for these embeddings in a pre-populated vector database holding the embeddings for the enterprise knowledge corpus. A small number of similar documents (typically three) is added as context along with the user question to the "prompt" provided to another LLM and then that LLM generates an answer to the user question using information provided as context in the prompt. RAG models were introduced by [Lewis et al.](https://arxiv.org/abs/2005.11401) in 2020 as a model where parametric memory is a pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a pre-trained neural retriever.
 20 | 
 21 | In this blog post we provide a step-by-step guide with all the building blocks for creating an enterprise ready RAG application such as a question answering bot. We use a combination of different AWS services, open-source foundation models ([FLAN-T5 XXL](https://huggingface.co/google/flan-t5-xxl) for text generation and [GPT-j-6B](https://huggingface.co/EleutherAI/gpt-j-6b) for embeddings) and packages such as [LangChain](https://python.langchain.com/en/latest/index.html) for interfacing with all the components and [Streamlit](https://streamlit.io/) for building the bot frontend. 
 22 | 
 23 | We provide a cloud formation template to stand up all the resources required for building this solution and then demonstrate how to use LangChain for tying everything together from interfacing with LLMs hosted on SageMaker, to chunking of knowledge base documents and ingesting document embeddings into OpenSearch and implementing the question answer task,
 24 | 
 25 | We can use the same architecture to swap the open-source models with the [Amazon Titan](https://aws.amazon.com/bedrock/titan/) models. After [Amazon Bedrock](https://aws.amazon.com/bedrock/) launches, we will publish a follow-up post showing how to implement similar GenAI applications using Amazon Bedrock, so stay tuned.
 26 | 
 27 | ## Solution overview
 28 | 
 29 | We use the [SageMaker docs](https://sagemaker.readthedocs.io) as the knowledge corpus for this post. We convert the html pages on this site into smaller overalapping chunks of information and then convert these chunks into embeddings using the gpt-j-6b model and store the embeddings into OpenSearch. We implement the RAG functionality inside an AWS Lambda function with an Amazon API Gateway frontend. We implement a chatbot application in Streamlit which invokes the Lambda via the API Gateway and the Lambda does a similarity search for the user question with the embeddings in OpenSearch. The matching documents (chunks) are added to the prompt as context by the Lambda and then the Lambda use the flan-t5-xxl model deployed as a SageMaker Endpoint to generate an answer to the user question. All code for this post is available in the [GitHub repo](https://github.com/aws-samples/llm-apps-workshop/tree/main/blogs/rag).
 30 | 
 31 | 
 32 | The following figure represents the high-level architecture of the proposed solution.
 33 | 
 34 | ![Architecture](img/ML-14328-architecture.png){#fig-architecture}
 35 | 
 36 | As illustrated in the architecture diagram, we use the following AWS services:
 37 | 
 38 | - [Amazon SageMaker](https://aws.amazon.com/pm/sagemaker) and [Amazon SageMaker JumpStart](https://aws.amazon.com/sagemaker/jumpstart/) for hosting the two LLMs.
 39 | - [Amazon OpenSearch Service](https://aws.amazon.com/opensearch-service/) for storing the embeddings of the enterprise knowledge corpus and doing similarity search with user questions.
 40 | - [AWS Lambda](https://aws.amazon.com/lambda/) for implementing the RAG functionality and exposing it as a REST endpoint via the [Amazon API Gateway](https://aws.amazon.com/api-gateway/).
 41 | - [Amazon SageMaker Processing Jobs](https://docs.aws.amazon.com/sagemaker/latest/dg/processing-job.html) for large scale data ingestion into OpenSearch.
 42 | - [Amazon SageMaker Studio](https://aws.amazon.com/sagemaker/studio/) for hosting the Streamlit application.
 43 | - [AWS IAM](https://aws.amazon.com/iam/) roles and policies for access management.
 44 | - [AWS CloudFormation](https://aws.amazon.com/cloudformation/) for creating the entire solution stack through infrastructure as code.
 45 | 
 46 | In terms of open-source packages used in this solution, we use [LangChain](https://python.langchain.com/en/latest/index.html) for interfacing OpenSearch and SageMaker, and [FastAPI](https://github.com/tiangolo/fastapi) for implementing the REST API interface in the Lambda.
 47 | 
 48 | The workflow for instantiating the solution presented in this blog in your own AWS account is as follows:
 49 | 
 50 | 1. Run the AWS CloudFormation template provided with this blog in your account. This will create all the necessary infrastructure resources needed for this solution. 
 51 | 
 52 | 1. Run the [`data_ingestion_to_vectordb.ipynb`](./data_ingestion_to_vectordb.ipynb) notebook in SageMaker Notebooks. This will ingest data from [SageMaker docs](https://sagemaker.readthedocs.io) into an OpenSearch index.
 53 | 
 54 | 1. Run the Streamlit application on a Terminal in SageMaker Studio and open the URL for the application in a new browser tab. 
 55 | 
 56 | 1. Ask your questions about SageMaker via the chat interface provided by the Streamlit app and view the responses generated by the LLM. 
 57 | 
 58 | These steps are discussed in detail in the sections below.
 59 | 
 60 | ### Prerequisites
 61 | 
 62 | To implement the solution provided in this post, you should have an [AWS account](https://signin.aws.amazon.com/signin?redirect_uri=https%3A%2F%2Fportal.aws.amazon.com%2Fbilling%2Fsignup%2Fresume&client_id=signup) and familiarity with LLMs, OpenSearch and SageMaker.
 63 | 
 64 | #### Use AWS Cloud Formation to create the solution stack
 65 | 
 66 | We use AWS CloudFormation to create a SageMaker notebook called `aws-llm-apps-blog` and an IAM role called `LLMAppsBlogIAMRole`. Choose **Launch Stack** for the Region you want to deploy resources to. **This template takes about 15 minutes to run completely**.
 67 | 
 68 |    |AWS Region                |     Link        |
 69 |    |:------------------------:|:-----------:|
 70 |    |us-east-1 (N. Virginia)    | [<img src="./img/cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
 71 |    |us-east-2 (Ohio)          | [<img src="./img/cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=us-east-2#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
 72 |    |us-west-1 (N. California) | [<img src="./img/cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=us-west-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
 73 |    |eu-west-1 (Dublin)        | [<img src="./img/cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=eu-west-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
 74 |    |ap-northeast-1 (Tokyo)    | [<img src="./img/cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
 75 | 
 76 | #### Ingest the data into OpenSearch
 77 | 
 78 | To ingest the data, complete the following steps:
 79 | 
 80 | 1. On the SageMaker console, choose **Notebooks** in the navigation pane.
 81 | 
 82 | 1. Select the notebook aws-llm-apps-blog and choose **Open JupyterLab**.
 83 | 
 84 |     ![Open JupyterLab](img/ML-14328-sm-nb-jl.png){#fig-open-jl}
 85 | 
 86 | 1. Choose `data_ingestion_to_vectordb.ipynb` to open it in JupyterLab. This notebook will ingest the [SageMaker docs](https://sagemaker.readthedocs.io) to an OpenSearch index called `llm_apps_workshop_embeddings`.
 87 |     ![Open Data Inestion Notebook](img/ML-14328-sm-nb-path.png){#fig-open-data-ingestion-nb}
 88 | 
 89 | 1. Once the notebook is open, then, on the Run menu, choose **Run All Cells** to run the code in this notebook. This will download the dataset locally into the notebook and then ingest it into the OpenSearch index. This notebook takes about 20 minutes to run. The notebook also ingests the data into another vector database called [`FAISS`](https://github.com/facebookresearch/faiss) for illustration purposes, the FAISS index files are saved locally and the uploaded to S3 so that they can optionally be used by the Lambda function as an illustration of using an alternate vector database. 
 90 | 
 91 |     ![Notebook Run All Cells](img/ML-14328-sm-nb-runall){#fig-notebook-run-all-cells}
 92 | 
 93 |     The following code snippets in the notebook show the use of LangChain to ingest the dataset into OpenSearch. See the `data_ingestion_to_vectordb.ipynb` notebook for the full code.
 94 | 
 95 |     Split the dataset into shards that can be ingested in parallel to speed up the ingestion process.
 96 | 
 97 |     ```{.python}
 98 |     loader = ReadTheDocsLoader(args.input_data_dir)
 99 |     text_splitter = RecursiveCharacterTextSplitter(
100 |         # Set a really small chunk size, just to show.
101 |         chunk_size=args.chunk_size_for_doc_split,
102 |         chunk_overlap=args.chunk_overlap_for_doc_split,
103 |         length_function=len,
104 |     )
105 |     
106 |     # Stage one: read all the docs, split them into chunks. 
107 |     st = time.time() 
108 |     logger.info('Loading documents ...')
109 |     docs = loader.load()
110 |     
111 |     # add a custom metadata field, such as timestamp
112 |     for doc in docs:
113 |         doc.metadata['timestamp'] = time.time()
114 |         doc.metadata['embeddings_model'] = args.embeddings_model_endpoint_name
115 |     chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs])
116 |     et = time.time() - st
117 |     logger.info(f'Time taken: {et} seconds. {len(chunks)} chunks generated') 
118 |     
119 |     
120 |     db_shards = (len(chunks) // MAX_OS_DOCS_PER_PUT) + 1
121 |     logger.info(f'Loading chunks into vector store ... using {db_shards} shards') 
122 |     ```
123 | 
124 |     The `from_documents` function from `langchain.vectorstores.OpenSearchVectorSearch` creates an OpenSearch `k-NN` index so that it is setup for storing embeddings and doing a `similarity_search`.
125 | 
126 |     ```{.python}
127 |     # by default langchain would create a k-NN index and the embeddings would be ingested as a k-NN vector type
128 |     docsearch = OpenSearchVectorSearch.from_documents(index_name=args.opensearch_index_name,
129 |                                                       documents=shards[0],
130 |                                                       embedding=embeddings,
131 |                                                       opensearch_url=args.opensearch_cluster_domain,
132 |                                                       http_auth=http_auth)
133 |     ```
134 | 
135 |     Once the OpenSearch index is setup by ingesting the first shard, subsequent shards are ingested in parallel via the `add_documents` function from `langchain.vectorstores.OpenSearchVectorSearch`.
136 | 
137 |     ```{.python}
138 |         
139 |     def process_shard(shard: List[Document],
140 |                   embeddings_model_endpoint_name: str,
141 |                   aws_region: str, 
142 |                   os_index_name: str,
143 |                   os_domain_ep: str,
144 |                   os_http_auth: Tuple) -> int: 
145 |         logger.info(f'Starting process_shard of {len(shard)} chunks.')
146 |         st = time.time()
147 |         embeddings = create_sagemaker_embeddings_from_js_model(embeddings_model_endpoint_name, aws_region)
148 |         docsearch = OpenSearchVectorSearch(index_name=os_index_name,
149 |                                            embedding_function=embeddings,
150 |                                            opensearch_url=os_domain_ep,
151 |                                            http_auth=os_http_auth)    
152 |         docsearch.add_documents(documents=shard)
153 |         et = time.time() - st
154 |         logger.info(f'Shard completed in {et} seconds.')
155 |         return 0
156 |     ```
157 | 
158 |     LangChain interfaces with the SageMaker Endpoint for creating embeddings of the document chunks using the `langchain.embeddings.SagemakerEndpointEmbeddings` and the `langchain.embeddings.sagemkaer_endpoint.EmbeddingsContentHandler` class, both of which are extended by the code in this notebook. 
159 | 
160 |     Here are a few salient points to note about ingesting data into OpenSearch via LangChain.
161 | 
162 |     - LangChain ingests the data into OpenSearch via the [`_bulk`](https://opensearch.org/docs/1.2/opensearch/rest-api/document-apis/bulk/) API, 500 documents are ingested in a single PUT request.
163 |     - LangChain creates an OpenSearch index with a schema that has a `metadata` field that includes `source` (corresponding to the filename of the chunk), a `text` field for the raw text and `vector_field` for the embeddings.
164 |     - Refer to `opensearch_vector_search.py` in the LangChain [GitHub repo](https://github.com/hwchase17/langchain/) for further customization of the OpenSearch index parameters.
165 | 
166 | 1. We use Sagemkaer Processing Job launched from this notebook to ingest data into Openearch. The code fragments seen above are part of a Python script which is provided to the SageMaker Procesing Job to run in a distributed fashiom. 
167 |   - We create a custom container in which we will install the `langchain` and `opensearch-py` Python packges and then upload this container image to Amazon Elastic Container Registry (ECR).
168 |   - We sse the Sagemaker `ScriptProcessor` class to create a Sagemaker Processing job that will run on multiple nodes.
169 |       - The data files available in S3 are automatically distributed across in the Sagemaker Processing Job instances by setting `s3_data_distribution_type='ShardedByS3Key'` as part of the `ProcessingInput` provided to the processing job.
170 |       - Each node processes a subset of the files and this brings down the overall time required to ingest the data into Opensearch.
171 |       - Each node also uses Python `multiprocessing` to internally also parallelize the file processing. Thus, **there are two levels of parallelization happening, one at the cluster level where individual nodes are distributing the work (files) amongst themselves and another at the node level where the files in a node are also split between multiple processes running on the node**.
172 | 
173 |   ```{.python}
174 |   # setup the ScriptProcessor with the above parameters
175 |   processor = ScriptProcessor(base_job_name=base_job_name,
176 |                               image_uri=image_uri,
177 |                               role=aws_role,
178 |                               instance_type=instance_type,
179 |                               instance_count=instance_count,
180 |                               command=["python3"],
181 |                               tags=tags)
182 | 
183 |   # setup input from S3, note the ShardedByS3Key, this ensures that 
184 |   # each instance gets a random and equal subset of the files in S3.
185 |   inputs = [ProcessingInput(source=f"s3://{bucket}/{app_name}/{DOMAIN}",
186 |                             destination='/opt/ml/processing/input_data',
187 |                             s3_data_distribution_type='ShardedByS3Key',
188 |                             s3_data_type='S3Prefix')]
189 | 
190 | 
191 |   logger.info(f"creating an opensearch index with name={opensearch_index}")
192 |   # ready to run the processing job
193 |   st = time.time()
194 |   processor.run(code="container/load_data_into_opensearch.py",
195 |                 inputs=inputs,
196 |                 outputs=[],
197 |                 arguments=["--opensearch-cluster-domain", opensearch_domain_endpoint,
198 |                           "--opensearch-secretid", os_creds_secretid_in_secrets_manager,
199 |                           "--opensearch-index-name", opensearch_index,
200 |                           "--aws-region", aws_region,
201 |                           "--embeddings-model-endpoint-name", embeddings_model_endpoint_name,
202 |                           "--chunk-size-for-doc-split", str(CHUNK_SIZE_FOR_DOC_SPLIT),
203 |                           "--chunk-overlap-for-doc-split", str(CHUNK_OVERLAP_FOR_DOC_SPLIT),
204 |                           "--input-data-dir", "/opt/ml/processing/input_data",
205 |                           "--create-index-hint-file", CREATE_OS_INDEX_HINT_FILE,
206 |                           "--process-count", "2"])
207 |   ```
208 | 
209 | 1. Close the notebook after all cells run without any error. Your data is now available in OpenSearch. The following screenshot shows the `california_housing` table created in OpenSearch.
210 | 
211 |    ![OpenSearch Index](img/ML-14328-opensearch-index.png){#fig-opensearch-index}
212 | 
213 | ## Clean up
214 | 
215 | To avoid incurring future charges, delete the resources. You can do this by deleting the CloudFormation template used to create the IAM role and SageMaker notebook.
216 | 
217 | ![Cleaning Up](img/cfn-delete.png){#fig-cleaning-up-2}
218 | 
219 | 
220 | ## Conclusion
221 | 
222 | In this post, we showed ..
223 | 
224 | We encourage you to learn more by exploring the [Amazon SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/) and building a solution using the sample implementation provided in this post and a dataset relevant to your business. If you have questions or suggestions, leave a comment.
225 | 
226 |   *  *  *  *  *
227 | 
228 | ## Author bio
229 | 
230 | <img style="float: left; margin: 0 10px 0 0;" src="img/ML-14328-Amit.png">Amit Arora is an AI and ML specialist architect at Amazon Web Services, helping enterprise customers use cloud-based machine learning services to rapidly scale their innovations. He is also an adjunct lecturer in the MS data science and analytics program at Georgetown University in Washington D.C.
231 | 
232 | <br><br>
233 | 
234 | <img style="float: left; margin: 0 10px 0 0;" src="img/ML-14328-xinhuang.jpg">Dr. Xin Huang is a Senior Applied Scientist for Amazon SageMaker JumpStart and Amazon SageMaker built-in algorithms. He focuses on developing scalable machine learning algorithms. His research interests are in the area of natural language processing, explainable deep learning on tabular data, and robust analysis of non-parametric space-time clustering. He has published many papers in ACL, ICDM, KDD conferences, and Royal Statistical Society: Series A..
235 | 
236 | <br><br>
237 | 
238 | <img style="float: left; margin: 0 10px 0 0;" src="img/ML-14328-ntuteja.jfif">Navneet Tuteja is a Data Specialist at Amazon Web Services. Before joining AWS, Navneet worked as a facilitator for organizations seeking to modernize their data architectures and implement comprehensive AI/ML solutions. She holds an engineering degree from Thapar University, as well as a master's degree in statistics from Texas A&M University.
239 | 


--------------------------------------------------------------------------------
/blogs/rag/README.md:
--------------------------------------------------------------------------------
 1 | # QA with LLM and RAG
 2 | 
 3 | A question answer task on a corpus of enterprise specific data is a common use-case in an enterprise scenario. If the data to be used for this task is publicly available then chances are that a pre-trained foundation large language model (LLM) will be able to provide a reasonable response to the question but this approach suffers from the following problems: a) the LLM is trained with a point in time snapshot of the data so its response will not be current, b) the LLM could hallucinate i.e. provide convincing looking responses that are factually incorrect and c) most importantly, the model may never have seen the enterprise specific data and is therefore not able to provide a useful response.
 4 | 
 5 | All of these problems can be solved by using one of the following approaches:
 6 | 
 7 | 1. Use Retrieval Augmented Generation (RAG) i.e. consult the enterprise specific knowledge corpus to find specific chunks of data (text) that are likely to contain answers to the question asked and then include this relevant data as context along with the question in the "prompt" provided to the LLM.
 8 | 
 9 | 1. As an additional step we could also Fine-tune the LLM on a question answering task using the enterprise specific knowledge corpus and then use RAG. The fine-tuned model now already has better baseline understanding of the enterprise data than the pre-trained LLM and in combination with RAG it can consult the most up to date version of the knowledge corpus to provide the best response to a question.
10 | 
11 | The following diagram shows a potential architecture of this solution for a virtual agent assist platform.
12 | 
13 | ![](images/finetuning_llm_and_rag.png)
14 | 
15 | Here is a screenshot of a Chatbot app built on this architecture.
16 | ![](images/chatbot.png)
17 | 
18 | ## Installation
19 | 
20 | Follow the steps listed below to create and run the RAG solution. The [blog_post.md](./blog_post.md) describes this solution in detail.
21 | 
22 | 1. Launch the cloud formation template included in this repository using one of the buttons from the table below. The cloud formation template will create an IAM role called `LLMAppsBlogIAMRole` and a SageMaker Notebook called `aws-llm-apps-blog` that we will use for running the code in this repository. This cloud formation template creates the Amazon OpenSearch cluster, LLM endpoints for text generation and embeddings and a Amazon SageMaker Notebook with this repository cloned to run the next steps.
23 | 
24 | 
25 |    |AWS Region                |     Link        |
26 |    |:------------------------:|:-----------:|
27 |    |us-east-1 (N. Virginia)    | [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
28 |    |us-west-2 (Oregon)          | [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
29 |    |eu-west-1 (Dublin)        | [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=eu-west-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
30 |    |ap-northeast-1 (Tokyo)    | [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
31 | 
32 | 1. Once the cloud formation stack has been created successfully, open the Outputs tab of the stack and note the URL for the API Gateway we will be needing it to the run a RAG query later on.
33 | 
34 | 1. Open the `aws-llm-apps-blog` SageMaker Notebook created by the cloud formation template and then find the [`data_ingestion_to_vectordb.ipynb`](data_ingestion_to_vectordb.ipynb) file and double click on it.
35 | 
36 | 1. Do a `Run All` for this notebook. It will ingest the data (embeddings) into the OpenSearch cluster and once that is done, we are now ready to ask some questions via the `/rag` endpoint of the Lambda function.
37 | 
38 | 1. Query the API Gateway `/rag` endpoint using the following command. The endpoint can be seen on the Outputs tab of the cloud formation stack, it is value of the `LLMAppAPIEndpoint` key.
39 | 
40 |     ```{{bash}}
41 |     curl -X POST "https://replace-with-your-api-gw-url/prod/api/v1/llm/rag" -H  "accept: application/json" -H  "Content-Type: application/json" -d "{\"q\":\"Which versions of XGBoost does SageMaker support?\"}"
42 |     ```
43 | 1. Run the [`streamlit`](https://streamlit.io/) app for the chatbot on `SageMaker Studio`. On `Sagemaker Studio` create a new `Terminal` and run the following commands:
44 | 
45 |     ```{{bash}}
46 |     git clone https://github.com/aws-samples/llm-apps-workshop    
47 |     cd llm-apps-workshop/blogs/rag/app
48 |     pip install -r requirements.txt
49 |     streamlit run webapp.py    
50 |     ```
51 |     This will start a streamlit app on SageMaker Studio, you can access the app by opening the following URL in a new browser tab `https://replace-with-your-studio-domain.studio.replace-with-your-region.sagemaker.aws/jupyter/default/proxy/8501/webapp`
52 | 
53 | ### Building your version of the Lambda
54 | 
55 | 1. Open a new Terminal on the SageMaker Notebook and change to the `rag/api` directory using the following command:
56 | 
57 |     ```
58 |     cd /home/ec2-user/SageMaker/repos/llm-apps-workshop/blogs/rag/api
59 |     ```
60 | 
61 | 1. Create a `conda` environment for `Python 3.9`.
62 | 
63 |     ```{{bash}}
64 | 
65 |     conda create -n py39 python=3.9 -y
66 | 
67 |     # activate the environment
68 |     source activate py39
69 |     ```
70 | 
71 | 1. Package and upload `function.zip` to the SageMaker bucket for your region.
72 | 
73 |     ```{{bash}}
74 |     ./deploy.sh
75 |     ```
76 | 
77 | 1. Update the code for the Lambda function to point to the S3 file uploaded in the step above.
78 | 


--------------------------------------------------------------------------------
/blogs/rag/api/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/api/app/__init__.py


--------------------------------------------------------------------------------
/blogs/rag/api/app/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/api/app/api/__init__.py


--------------------------------------------------------------------------------
/blogs/rag/api/app/api/api_v1/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/api/app/api/api_v1/__init__.py


--------------------------------------------------------------------------------
/blogs/rag/api/app/api/api_v1/api.py:
--------------------------------------------------------------------------------
1 | 
2 | from .endpoints import llm_ep
3 | from fastapi import APIRouter
4 | 
5 | router = APIRouter()
6 | router.include_router(llm_ep.router, prefix="/llm", tags=["llm"])
7 | 


--------------------------------------------------------------------------------
/blogs/rag/api/app/api/api_v1/endpoints/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/api/app/api/api_v1/endpoints/__init__.py


--------------------------------------------------------------------------------
/blogs/rag/api/app/api/api_v1/endpoints/fastapi_request.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import boto3
 3 | from enum import Enum
 4 | from pydantic import BaseModel
 5 | 
 6 | ACCOUNT_ID = boto3.client("sts").get_caller_identity()["Account"]
 7 | REGION = boto3.Session().region_name
 8 | 
 9 | class Text2TextModelName(str, Enum):
10 |     flant5xxl = "flan-t5-xxl"
11 | 
12 | class EmbeddingsModelName(str, Enum):
13 |     gptj6b = "gpt-j-6b"
14 | 
15 | class VectorDBType(str, Enum):
16 |     OPENSEARCH = "opensearch"
17 |     FAISS = "faiss"
18 | 
19 | class Request(BaseModel):
20 |     q: str
21 |     max_length: int = 500
22 |     num_return_sequences: int = 1
23 |     top_k: int = 250
24 |     top_p: float = 0.95
25 |     do_sample: bool = False
26 |     temperature: float = 1
27 |     verbose: bool = False
28 |     max_matching_docs: int = 3  
29 |     text_generation_model: Text2TextModelName = Text2TextModelName.flant5xxl
30 |     embeddings_generation_model: EmbeddingsModelName = EmbeddingsModelName.gptj6b
31 |     vectordb_s3_path: str = f"s3://sagemaker-{REGION}-{ACCOUNT_ID}/{os.environ.get('APP_NAME')}/faiss_index/"
32 |     vectordb_type: VectorDBType = VectorDBType.OPENSEARCH
33 | 
34 | SAGEMAKER_ENDPOINT_MAPPING = {
35 |     Text2TextModelName.flant5xxl: os.environ.get('TEXT2TEXT_ENDPOINT_NAME'),
36 |     EmbeddingsModelName.gptj6b: os.environ.get('EMBEDDING_ENDPOINT_NAME'),
37 | }
38 | 


--------------------------------------------------------------------------------
/blogs/rag/api/app/api/api_v1/endpoints/initialize.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import boto3
  4 | import logging
  5 | from typing import List, Callable
  6 | from urllib.parse import urlparse
  7 | from langchain.vectorstores import FAISS
  8 | from langchain.vectorstores import OpenSearchVectorSearch
  9 | from langchain.embeddings import SagemakerEndpointEmbeddings
 10 | from langchain.llms.sagemaker_endpoint import SagemakerEndpoint
 11 | from langchain.llms.sagemaker_endpoint import LLMContentHandler
 12 | from langchain.llms.sagemaker_endpoint import ContentHandlerBase
 13 | from .fastapi_request import SAGEMAKER_ENDPOINT_MAPPING, Request
 14 | from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
 15 | 
 16 | logger = logging.getLogger(__name__)
 17 | class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings):
 18 |     def embed_documents(
 19 |         self, texts: List[str], chunk_size: int = 5
 20 |     ) -> List[List[float]]:
 21 |         """Compute doc embeddings using a SageMaker Inference Endpoint.
 22 | 
 23 |         Args:
 24 |             texts: The list of texts to embed.
 25 |             chunk_size: The chunk size defines how many input texts will
 26 |                 be grouped together as request. If None, will use the
 27 |                 chunk size specified by the class.
 28 | 
 29 |         Returns:
 30 |             List of embeddings, one for each text.
 31 |         """
 32 |         results = []
 33 |         #print(f"length of texts = {len(texts)}")
 34 |         _chunk_size = len(texts) if chunk_size > len(texts) else chunk_size
 35 |         
 36 |         for i in range(0, len(texts), _chunk_size):
 37 |             response = self._embedding_func(texts[i : i + _chunk_size])
 38 |             #print(response)
 39 |             results.extend(response)
 40 |         return results
 41 | 
 42 | class ContentHandlerForEmbeddings(EmbeddingsContentHandler):
 43 |     """
 44 |     encode input string as utf-8 bytes, read the embeddings
 45 |     from the output
 46 |     """
 47 |     content_type = "application/json"
 48 |     accepts = "application/json"
 49 |     def transform_input(self, prompt: str, model_kwargs = {}) -> bytes:
 50 |         input_str = json.dumps({"text_inputs": prompt, **model_kwargs})
 51 |         return input_str.encode('utf-8') 
 52 | 
 53 |     def transform_output(self, output: bytes) -> str:
 54 |         response_json = json.loads(output.read().decode("utf-8"))
 55 |         embeddings = response_json["embedding"]
 56 |         if len(embeddings) == 1:
 57 |             return [embeddings[0]]
 58 |         return embeddings
 59 | 
 60 | class ContentHandlerForTextGeneration(LLMContentHandler):
 61 |     content_type = "application/json"
 62 |     accepts = "application/json"
 63 | 
 64 |     def transform_input(self, prompt: str, model_kwargs = {}) -> bytes:
 65 |         input_str = json.dumps({"text_inputs": prompt, **model_kwargs})
 66 |         return input_str.encode('utf-8')
 67 | 
 68 |     def transform_output(self, output: bytes) -> str:
 69 |         response_json = json.loads(output.read().decode("utf-8"))
 70 |         return response_json["generated_texts"][0]
 71 | 
 72 | def _create_sagemaker_embeddings(endpoint_name: str, region: str = "us-east-1") -> SagemakerEndpointEmbeddingsJumpStart:
 73 |     # create a content handler object which knows how to serialize
 74 |     # and deserialize communication with the model endpoint
 75 |     content_handler = ContentHandlerForEmbeddings()
 76 | 
 77 |     # read to create the Sagemaker embeddings, we are providing
 78 |     # the Sagemaker endpoint that will be used for generating the
 79 |     # embeddings to the class
 80 |     embeddings = SagemakerEndpointEmbeddingsJumpStart( 
 81 |         endpoint_name=endpoint_name,
 82 |         region_name=region, 
 83 |         content_handler=content_handler
 84 |     )
 85 |     logger.info(f"embeddings type={type(embeddings)}")
 86 | 
 87 |     return embeddings
 88 | 
 89 | def _get_credentials(secret_id: str, region_name: str) -> str:
 90 | 
 91 |     client = boto3.client('secretsmanager', region_name=region_name)
 92 |     response = client.get_secret_value(SecretId=secret_id)
 93 |     secrets_value = json.loads(response['SecretString'])
 94 |     return secrets_value
 95 | 
 96 | def load_vector_db_opensearch(secret_id: str,
 97 |                               region: str,
 98 |                               opensearch_domain_endpoint: str,
 99 |                               opensearch_index: str,
100 |                               embeddings_model: str) -> OpenSearchVectorSearch:
101 |     logger.info(f"load_vector_db_opensearch, secret_id={secret_id}, region={region}, "
102 |                 f"opensearch_domain_endpoint={opensearch_domain_endpoint}, opensearch_index={opensearch_index}, "  
103 |                 f"embeddings_model={embeddings_model}")
104 |     opensearch_domain_endpoint = f"https://{opensearch_domain_endpoint}"
105 |     embeddings_model_endpoint = SAGEMAKER_ENDPOINT_MAPPING[embeddings_model]
106 |     logger.info(f"embeddings_model_endpoint={embeddings_model_endpoint}, opensearch_domain_endpoint={opensearch_domain_endpoint}")
107 |     creds = _get_credentials(secret_id, region)
108 |     http_auth = (creds['username'], creds['password'])
109 |     vector_db = OpenSearchVectorSearch(index_name=opensearch_index,
110 |                                        embedding_function=_create_sagemaker_embeddings(embeddings_model_endpoint,
111 |                                                                                        region),
112 |                                        opensearch_url=opensearch_domain_endpoint,
113 |                                        http_auth=http_auth)
114 |     logger.info(f"returning handle to OpenSearchVectorSearch, vector_db={vector_db}")
115 |     return vector_db
116 | 
117 | def load_vector_db_faiss(vectordb_s3_path: str, vectordb_local_path: str, embeddings_endpoint_name: str, region: str) -> FAISS:
118 |     os.makedirs(vectordb_local_path, exist_ok=True)
119 |     # download the vectordb files from S3
120 |     # note that the following code is only applicable to FAISS
121 |     # would need to be enhanced to support other vector dbs
122 |     vectordb_files = ["index.pkl", "index.faiss"]
123 |     for vdb_file in vectordb_files:        
124 |         s3 = boto3.client('s3')
125 |         fpath = os.path.join(vectordb_local_path, vdb_file)
126 |         with open(fpath, 'wb') as f:
127 |             parsed = urlparse(vectordb_s3_path)
128 |             bucket = parsed.netloc
129 |             path =  os.path.join(parsed.path[1:], vdb_file)
130 |             logger.info(f"going to download from bucket={bucket}, path={path}, to {fpath}")
131 |             s3.download_fileobj(bucket, path, f)
132 |             logger.info(f"after downloading from bucket={bucket}, path={path}, to {fpath}")
133 | 
134 |     # files are downloaded, lets load the vectordb
135 |     logger.info("creating a Sagemaker embeddings object to hydrate the vector db")
136 |     embeddings = _create_sagemaker_embeddings(SAGEMAKER_ENDPOINT_MAPPING[embeddings_endpoint_name], region)
137 |     vector_db = FAISS.load_local(vectordb_local_path, embeddings)
138 |     logger.info(f"vector db hydrated, type={type(vector_db)} it has {vector_db.index.ntotal} embeddings")
139 | 
140 |     return vector_db
141 | 
142 | def setup_sagemaker_endpoint_for_text_generation(req: Request, region: str = "us-east-1") -> Callable:
143 |     parameters = {
144 |     "max_length": req.max_length,
145 |     "num_return_sequences": req.num_return_sequences,
146 |     "top_k": req.top_k,
147 |     "top_p": req.top_p,
148 |     "do_sample": req.do_sample,
149 |     "temperature": req.temperature,}
150 |     
151 |     endpoint_name = req.text_generation_model
152 |     content_handler = ContentHandlerForTextGeneration()    
153 |     print(f"SAGEMAKER_ENDPOINT_MAPPING[{endpoint_name}]={SAGEMAKER_ENDPOINT_MAPPING[endpoint_name]}")
154 |     sm_llm = SagemakerEndpoint(
155 |         endpoint_name=SAGEMAKER_ENDPOINT_MAPPING[endpoint_name],
156 |         region_name=region,
157 |         model_kwargs=parameters,
158 |         content_handler=content_handler)
159 |     return sm_llm
160 | 
161 | 


--------------------------------------------------------------------------------
/blogs/rag/api/app/api/api_v1/endpoints/llm_ep.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import boto3
  4 | import logging
  5 | from typing import Any, Dict
  6 | from fastapi import APIRouter
  7 | from urllib.parse import urlparse
  8 | from langchain import PromptTemplate
  9 | from .fastapi_request import (Request,
 10 |                               Text2TextModelName,
 11 |                               EmbeddingsModelName,
 12 |                               VectorDBType)
 13 | from .sm_helper import query_sm_endpoint
 14 | from langchain.chains.question_answering import load_qa_chain
 15 | from .initialize import (setup_sagemaker_endpoint_for_text_generation,
 16 |                         load_vector_db_faiss,
 17 |                         load_vector_db_opensearch)
 18 | 
 19 | logging.getLogger().setLevel(logging.INFO)
 20 | logger = logging.getLogger()
 21 | #logging.basicConfig(format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s', level=logging.INFO)
 22 | 
 23 | # initialize the vector db as a global variable so that it
 24 | # can persist across lambda invocations
 25 | VECTOR_DB_DIR = os.path.join("/tmp", "_vectordb")
 26 | _vector_db = None
 27 | _current_vectordb_type = None
 28 | _sm_llm = None
 29 | 
 30 | router = APIRouter()
 31 | 
 32 | def _init(req: Request):
 33 |     # vector db is a global static variable, so that it only
 34 |     # created once across multiple lambda invocations, if possible
 35 |     global _vector_db
 36 |     global _current_vectordb_type
 37 |     logger.info(f"req.vectordb_type={req.vectordb_type}, _vector_db={_vector_db}")
 38 |     if req.vectordb_type != _current_vectordb_type:
 39 |         logger.info(f"req.vectordb_type={req.vectordb_type} does not match _current_vectordb_type={_current_vectordb_type}, "
 40 |                     f"resetting _vector_db")
 41 |         _vector_db = None
 42 | 
 43 |     if req.vectordb_type == VectorDBType.OPENSEARCH and _vector_db is None:
 44 |         # ARN of the secret is of the following format arn:aws:secretsmanager:region:account_id:secret:my_path/my_secret_name-autoid
 45 |         os_creds_secretid_in_secrets_manager = "-".join(os.environ.get('OPENSEARCH_SECRET').split(":")[-1].split('-')[:-1])
 46 |         _vector_db = load_vector_db_opensearch(os_creds_secretid_in_secrets_manager,
 47 |                                                boto3.Session().region_name,
 48 |                                                os.environ.get('OPENSEARCH_DOMAIN_ENDPOINT'),
 49 |                                                os.environ.get('OPENSEARCH_INDEX'),
 50 |                                                req.embeddings_generation_model)
 51 |     elif req.vectordb_type == VectorDBType.FAISS and _vector_db is None:
 52 |         logger.info(f"vector db does not exist, creating it now")
 53 |         _vector_db = load_vector_db_faiss(req.vectordb_s3_path,
 54 |                                           VECTOR_DB_DIR,
 55 |                                           req.embeddings_generation_model,
 56 |                                           boto3.Session().region_name)
 57 |         logger.info("after creating vector db")
 58 |     elif _vector_db is not None:
 59 |         logger.info(f"seems like vector db already exists...")
 60 |     else:
 61 |         logger.error(f"req.vectordb_type={req.vectordb_type} which is not supported, _vector_db={_vector_db}")
 62 | 
 63 |     # just like the vector db the sagemaker endpoint used for
 64 |     # text generation is also global and shared across invocations
 65 |     # if possible
 66 |     global _sm_llm
 67 |     if _sm_llm is None:
 68 |         logger.info(f"SM LLM endpoint is not setup, setting it up now")
 69 |         _sm_llm = setup_sagemaker_endpoint_for_text_generation(req,
 70 |                                                                boto3.Session().region_name)
 71 |         logger.info("after setting up sagemaker llm endpoint")
 72 |     else:
 73 |         logger.info(f"sagemaker llm endpoint already exists..")
 74 | 
 75 | 
 76 | @router.post("/text2text")
 77 | async def llm_textgen(req: Request) -> Dict[str, Any]:
 78 |     # dump the received request for debugging purposes
 79 |     logger.info(f"req={req}")
 80 | 
 81 |     # initialize vector db and Sagemaker Endpoint
 82 |     _init(req)
 83 | 
 84 |     # now that we have the matching docs, lets pack them as a context
 85 |     # into the prompt and ask the LLM to generate a response
 86 |     answer = query_sm_endpoint(req)
 87 |     resp = {'question': req.q, 'answer': answer}
 88 |     return resp
 89 | 
 90 | @router.post("/rag")
 91 | async def rag_handler(req: Request) -> Dict[str, Any]:
 92 |     # dump the received request for debugging purposes
 93 |     logger.info(f"req={req}")
 94 | 
 95 |     # initialize vector db and Sagemaker Endpoint
 96 |     _init(req)
 97 | 
 98 |     # Use the vector db to find similar documents to the query
 99 |     # the vector db call would automatically convert the query text
100 |     # into embeddings
101 |     docs = _vector_db.similarity_search(req.q, k=req.max_matching_docs)
102 |     logger.info(f"here are the {req.max_matching_docs} closest matching docs to the query=\"{req.q}\"")
103 |     for d in docs:
104 |         logger.info(f"---------")
105 |         logger.info(d)
106 |         logger.info(f"---------")
107 | 
108 |     # now that we have the matching docs, lets pack them as a context
109 |     # into the prompt and ask the LLM to generate a response
110 |     prompt_template = """Answer based on context:\n\n{context}\n\n{question}"""
111 | 
112 |     prompt = PromptTemplate(
113 |         template=prompt_template, input_variables=["context", "question"]
114 |     )
115 |     logger.info(f"prompt sent to llm = \"{prompt}\"")
116 |     chain = load_qa_chain(llm=_sm_llm, prompt=prompt)
117 |     answer = chain({"input_documents": docs, "question": req.q}, return_only_outputs=True)['output_text']
118 |     logger.info(f"answer received from llm,\nquestion: \"{req.q}\"\nanswer: \"{answer}\"")
119 |     resp = {'question': req.q, 'answer': answer}
120 |     if req.verbose is True:
121 |         resp['docs'] = docs
122 | 
123 |     return resp
124 | 


--------------------------------------------------------------------------------
/blogs/rag/api/app/api/api_v1/endpoints/sm_helper.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import json
 3 | import boto3
 4 | import logging
 5 | from typing import List, Dict
 6 | from .fastapi_request import (Request,
 7 |                               SAGEMAKER_ENDPOINT_MAPPING)
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | 
11 | def query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type="application/json") -> Dict:
12 |     client = boto3.client("runtime.sagemaker")
13 |     response = client.invoke_endpoint(
14 |         EndpointName=endpoint_name, ContentType=content_type, Body=encoded_json
15 |     )
16 |     return response
17 | 
18 | def parse_response_model_flan_t5(query_response) -> List:
19 |     model_predictions = json.loads(query_response["Body"].read())
20 |     generated_text = model_predictions["generated_texts"]
21 |     return generated_text
22 | 
23 | def query_sm_endpoint(req: Request) -> List:
24 |     payload = {
25 |         "text_inputs": req.q,
26 |         "max_length": req.max_length,
27 |         "num_return_sequences": req.num_return_sequences,
28 |         "top_k": req.top_k,
29 |         "top_p": req.top_p,
30 |         "do_sample": req.do_sample,
31 |     }
32 | 
33 |     endpoint_name = req.text_generation_model
34 |     query_response = query_endpoint_with_json_payload(
35 |             json.dumps(payload).encode("utf-8"), endpoint_name=SAGEMAKER_ENDPOINT_MAPPING[endpoint_name]
36 |         )
37 | 
38 |     generated_texts = parse_response_model_flan_t5(query_response)
39 |     logger.info(f"the generated output is: {generated_texts}")
40 |     return generated_texts
41 | 


--------------------------------------------------------------------------------
/blogs/rag/api/app/main.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import subprocess
 3 | from mangum import Mangum
 4 | from fastapi import FastAPI
 5 | from app.api.api_v1.api import router as api_router
 6 | 
 7 | app = FastAPI()
 8 | 
 9 | @app.get("/")
10 | async def root():
11 |     return {"message": "API for question answering bot"}
12 | 
13 | app.include_router(api_router, prefix="/api/v1")
14 | handler = Mangum(app)
15 | 


--------------------------------------------------------------------------------
/blogs/rag/api/app/requirements.txt:
--------------------------------------------------------------------------------
 1 | fastapi==0.95.1
 2 | mangum==0.9.2
 3 | uvicorn==0.11.8
 4 | langchain==0.2.10
 5 | boto3==1.26.113
 6 | faiss-cpu==1.7.3
 7 | numpy==1.24.2
 8 | opensearch-py==2.2.0
 9 | 
10 | 


--------------------------------------------------------------------------------
/blogs/rag/api/deploy.sh:
--------------------------------------------------------------------------------
 1 | # Build the function.zip file for a Lambda function.
 2 | # 1. Package all the dependencies as listed in requirements.txt.
 3 | # 2. Special handling for numpy as we need the version that works on Amazon Linux.
 4 | # 3. Remove boto3 to reduce the size of the package (uncompressed < 250MB) as it is already included in the Lambda runtime.
 5 | # 4. pip install with the --no-cache-dir option to reduce the size of the package.
 6 | 
 7 | # all dependencies used by the lambda are installed in the deps folder
 8 | DEPS_DIR=deps
 9 | 
10 | # numpy fies from pypi, note this URL is specifically for Python 3.9, would need to be changed for a different Python version
11 | NUMPY_WHL_URL=https://files.pythonhosted.org/packages/f4/f4/45e6e3f7a23b9023554903a122c95585e9787f9403d386bafb7a95d24c9b/numpy-1.24.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
12 | NUMPY_WHL=`basename $NUMPY_WHL_URL`
13 | 
14 | # useful constants 
15 | FN_ZIP_FILE=function.zip
16 | REQS_TXT_PATH=app/requirements.txt
17 | APP_DIR=app
18 | APP_NAME=llm-apps-workshop
19 | 
20 | # derive bucket name to put the function.zip in, to be used if the caller did not provide a bucket name
21 | ACCOUNT_ID=`aws sts get-caller-identity --output text --query 'Account'`
22 | REGION=`aws ec2 describe-availability-zones --output text --query 'AvailabilityZones[0].[RegionName]'`
23 | FN_BUCKET_NAME=sagemaker-$REGION-$ACCOUNT_ID
24 | 
25 | if [ -z "$1" ]
26 | then
27 |   echo "bucket name from $FN_ZIP_FILE not provided as input, going with the default $FN_BUCKET_NAME"
28 |   fn_bucket=$FN_BUCKET_NAME
29 | else
30 |   echo "bucket name provided as $1, going to use that for uploading $FN_ZIP_FILE"
31 |   fn_bucket=$1
32 | fi
33 | 
34 | # remove any existing depenencies dir or function.zip to start fresh
35 | echo going to remove $DEPS_DIR and $FN_ZIP_FILE
36 | rm -rf $DEPS_DIR
37 | rm -f $FN_ZIP_FILE
38 | 
39 | # get all the dependencies in a dir
40 | echo going to pip install dependencies listed in $REQS_TXT_PATH
41 | pip install -r $REQS_TXT_PATH --no-cache-dir --target=$DEPS_DIR
42 | echo done installing dependencies
43 | 
44 | # clean up the dependencies dir to remove numpy because we will be installing it manually via the .whl file
45 | cd $DEPS_DIR
46 | rm -rf numpy*
47 | echo going to wget numpy via $NUMPY_WHL_URL
48 | wget $NUMPY_WHL_URL
49 | echo goig to install numpy via $NUMPY_WHL
50 | unzip $NUMPY_WHL  
51 | echo installed numpy via $NUMPY_WHL_URL
52 | 
53 | # remove boto3, it is is already included in the lambda python runtime
54 | rm -rf boto*
55 | rm -f $NUMPY_WHL
56 | echo delete boto and $NUMPY_WHL
57 | 
58 | # zip up the dependencies
59 | echo going to package dependencies in $FN_ZIP_FILE
60 | rm -rf `find . -name .ipynb_checkpoints`
61 | zip -r9 ../$FN_ZIP_FILE .
62 | cd -
63 | 
64 | # add the app files (Lambda code) to the zip file
65 | echo going to package $APP_DIR in $FN_ZIP_FILE
66 | zip -g ./$FN_ZIP_FILE -r $APP_DIR
67 | 
68 | # upload the function.zip to s3 so that it is available for a Lambda deployment
69 | echo going to upload $FN_ZIP_FILE to $fn_bucket
70 | ls -ltr $FN_ZIP_FILE 
71 | aws s3 cp $FN_ZIP_FILE s3://$fn_bucket/$APP_NAME/
72 | 
73 | echo "all done"
74 | 


--------------------------------------------------------------------------------
/blogs/rag/app/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit==1.37.0
2 | boto3==1.26.129
3 | 


--------------------------------------------------------------------------------
/blogs/rag/app/webapp.py:
--------------------------------------------------------------------------------
  1 | """
  2 | A simple web application to implement a chatbot. This app uses Streamlit 
  3 | for the UI and the Python requests package to talk to an API endpoint that
  4 | implements text generation and Retrieval Augmented Generation (RAG) using LLMs
  5 | and Amazon OpenSearch as the vector database.
  6 | """
  7 | import boto3
  8 | import streamlit as st
  9 | import requests as req
 10 | from typing import List, Tuple, Dict
 11 | 
 12 | # utility functions
 13 | def get_cfn_outputs(stackname: str) -> List:
 14 |     cfn = boto3.client('cloudformation')
 15 |     outputs = {}
 16 |     for output in cfn.describe_stacks(StackName=stackname)['Stacks'][0]['Outputs']:
 17 |         outputs[output['OutputKey']] = output['OutputValue']
 18 |     return outputs
 19 | 
 20 | # global constants
 21 | STREAMLIT_SESSION_VARS: List[Tuple] = [("generated", []), ("past", []), ("input", ""), ("stored_session", [])]
 22 | HTTP_OK: int = 200
 23 | 
 24 | # two options for the chatbot, 1) get answer directly from the LLM
 25 | # 2) use RAG (find documents similar to the user query and then provide
 26 | # those as context to the LLM).
 27 | MODE_RAG: str = 'RAG'
 28 | MODE_TEXT2TEXT: str = 'Text Generation'
 29 | MODE_VALUES: List[str] = [MODE_RAG, MODE_TEXT2TEXT]
 30 | 
 31 | # Currently we use the flan-t5-xxl for text generation
 32 | # and gpt-j-6b for embeddings but in future we could support more
 33 | TEXT2TEXT_MODEL_LIST: List[str] = ["flan-t5-xxl"]
 34 | EMBEDDINGS_MODEL_LIST: List[str] = ["gpt-j-6b"]
 35 | 
 36 | # if running this app on a compute environment that has
 37 | # IAM cloudformation::DescribeStacks access read the 
 38 | # stack outputs to get the name of the LLM endpoint
 39 | CFN_ACCESS = False
 40 | if CFN_ACCESS is True:
 41 |     CFN_STACK_NAME: str = "llm-apps-blog-rag"
 42 |     outputs = get_cfn_outputs(CFN_STACK_NAME)
 43 | else:
 44 |     # create an outputs dictionary with keys of interest
 45 |     # the key value would need to be edited manually before
 46 |     # running this app
 47 |     outputs: Dict = {}
 48 |     # REPLACE __API_GW_ENDPOINT__ WITH ACTUAL API GW ENDPOINT URL
 49 |     outputs["LLMAppAPIEndpoint"] = "__API_GW_ENDPOINT__"
 50 | 
 51 | # API endpoint
 52 | # this is retrieved from the cloud formation template that was
 53 | # used to create this solution
 54 | api: str = outputs.get("LLMAppAPIEndpoint")
 55 | api_rag_ep: str = f"{api}/api/v1/llm/rag"
 56 | api_text2text_ep: str = f"{api}/api/v1/llm/text2text"
 57 | print(f"api_rag_ep={api_rag_ep}\napi_text2text_ep={api_text2text_ep}")
 58 | 
 59 | ####################
 60 | # Streamlit code
 61 | ####################
 62 | 
 63 | # Page title
 64 | st.set_page_config(page_title='Virtual assistant for knowledge base 👩‍💻', layout='wide')
 65 | 
 66 | # keep track of conversations by using streamlit_session
 67 | _ = [st.session_state.setdefault(k, v) for k,v in STREAMLIT_SESSION_VARS]
 68 | 
 69 | # Define function to get user input
 70 | def get_user_input() -> str:
 71 |     """
 72 |     Returns the text entered by the user
 73 |     """
 74 |     print(st.session_state)    
 75 |     input_text = st.text_input("You: ",
 76 |                                st.session_state["input"],
 77 |                                key="input",
 78 |                                placeholder="Ask me a question and I will consult the knowledge base to answer...", 
 79 |                                label_visibility='hidden')
 80 |     return input_text
 81 | 
 82 | 
 83 | # sidebar with options
 84 | with st.sidebar.expander("⚙️", expanded=True):
 85 |     text2text_model = st.selectbox(label='Text2Text Model', options=TEXT2TEXT_MODEL_LIST)
 86 |     embeddings_model = st.selectbox(label='Embeddings Model', options=EMBEDDINGS_MODEL_LIST)
 87 |     mode = st.selectbox(label='Mode', options=MODE_VALUES)
 88 |     
 89 | 
 90 | # streamlit app layout sidebar + main panel
 91 | # the main panel has a title, a sub header and user input textbox
 92 | # and a text area for response and history
 93 | st.title("👩‍💻 Virtual assistant for a knowledge base")
 94 | st.subheader(f" Powered by :blue[{TEXT2TEXT_MODEL_LIST[0]}] for text generation and :blue[{EMBEDDINGS_MODEL_LIST[0]}] for embeddings")
 95 | 
 96 | # get user input
 97 | user_input: str = get_user_input()
 98 | 
 99 | # based on the selected mode type call the appropriate API endpoint
100 | if user_input:
101 |     # headers for request and response encoding, same for both endpoints
102 |     headers: Dict = {"accept": "application/json", "Content-Type": "application/json"}
103 |     output: str = None
104 |     if mode == MODE_TEXT2TEXT:        
105 |         data = {"q": user_input}
106 |         resp = req.post(api_text2text_ep, headers=headers, json=data)
107 |         if resp.status_code != HTTP_OK:
108 |             output = resp.text
109 |         else:
110 |             output = resp.json()['answer'][0]
111 |     elif mode == MODE_RAG:        
112 |         data = {"q": user_input, "verbose": True}
113 |         resp = req.post(api_rag_ep, headers=headers, json=data)
114 |         if resp.status_code != HTTP_OK:
115 |             output = resp.text
116 |         else:
117 |             resp = resp.json()
118 |             sources = [d['metadata']['source'] for d in resp['docs']]
119 |             output = f"{resp['answer']} \n \n Sources: {sources}"
120 |     else:
121 |         print("error")
122 |         output = f"unhandled mode value={mode}"
123 |     st.session_state.past.append(user_input)  
124 |     st.session_state.generated.append(output) 
125 | 
126 | 
127 | # download the chat history
128 | download_str: List = []
129 | with st.expander("Conversation", expanded=True):
130 |     for i in range(len(st.session_state['generated'])-1, -1, -1):
131 |         st.info(st.session_state["past"][i],icon="❓") 
132 |         st.success(st.session_state["generated"][i], icon="👩‍💻")
133 |         download_str.append(st.session_state["past"][i])
134 |         download_str.append(st.session_state["generated"][i])
135 |     
136 |     download_str = '\n'.join(download_str)
137 |     if download_str:
138 |         st.download_button('Download', download_str)
139 | 


--------------------------------------------------------------------------------
/blogs/rag/blog_post.md:
--------------------------------------------------------------------------------
  1 | Build a powerful question answering bot with Amazon SageMaker, Amazon
  2 | OpenSearch Service, Streamlit, and LangChain
  3 | ================
  4 | 
  5 | *Amit Arora*, *Xin Huang*, *Navneet Tuteja*
  6 | 
  7 | One of the most common applications of generative AI and large language
  8 | models (LLMs) in an enterprise environment is answering questions based
  9 | on the enterprise’s knowledge corpus. [Amazon
 10 | Lex](https://aws.amazon.com/lex/) provides the framework for building
 11 | [AI based
 12 | chatbots](https://aws.amazon.com/solutions/retail/ai-for-chatbots).
 13 | Pre-trained foundation models (FMs) perform well at natural language
 14 | understanding (NLU) tasks such summarization, text generation and
 15 | question answering on a broad variety of topics but either struggle to
 16 | provide accurate (without hallucinations) answers or completely fail at
 17 | answering questions about content that they haven’t seen as part of
 18 | their training data. Furthermore, FMs are trained with a point in time
 19 | snapshot of data and have no inherent ability to access fresh data at
 20 | inference time; without this ability they might provide responses that
 21 | are potentially incorrect or inadequate.
 22 | 
 23 | A commonly used approach to address this problem is to use a technique
 24 | called Retrieval Augmented Generation (RAG). In the RAG-based approach
 25 | we convert the user question into vector embeddings using an LLM and
 26 | then do a similarity search for these embeddings in a pre-populated
 27 | vector database holding the embeddings for the enterprise knowledge
 28 | corpus. A small number of similar documents (typically three) is added
 29 | as context along with the user question to the “prompt” provided to
 30 | another LLM and then that LLM generates an answer to the user question
 31 | using information provided as context in the prompt. RAG models were
 32 | introduced by [Lewis et al.](https://arxiv.org/abs/2005.11401) in 2020
 33 | as a model where parametric memory is a pre-trained seq2seq model and
 34 | the non-parametric memory is a dense vector index of Wikipedia, accessed
 35 | with a pre-trained neural retriever. To understand the overall structure
 36 | of a RAG-based approach, refer to [Question answering using Retrieval
 37 | Augmented Generation with foundation models in Amazon SageMaker
 38 | JumpStart](https://aws.amazon.com/blogs/machine-learning/question-answering-using-retrieval-augmented-generation-with-foundation-models-in-amazon-sagemaker-jumpstart/).
 39 | 
 40 | In this post we provide a step-by-step guide with all the building
 41 | blocks for creating an enterprise ready RAG application such as a
 42 | question answering bot. We use a combination of different AWS services,
 43 | open-source foundation models ([FLAN-T5
 44 | XXL](https://huggingface.co/google/flan-t5-xxl) for text generation and
 45 | [GPT-j-6B](https://huggingface.co/EleutherAI/gpt-j-6b) for embeddings)
 46 | and packages such as
 47 | [LangChain](https://python.langchain.com/en/latest/index.html) for
 48 | interfacing with all the components and
 49 | [Streamlit](https://streamlit.io/) for building the bot frontend.
 50 | 
 51 | We provide an AWS Cloud Formation template to stand up all the resources
 52 | required for building this solution. We then demonstrate how to use
 53 | LangChain for tying everything together:
 54 | 
 55 | - Interfacing with LLMs hosted on Amazon SageMaker.
 56 | - Chunking of knowledge base documents.
 57 | - Ingesting document embeddings into Amazon OpenSearch Service.
 58 | - Implementing the question answering task.
 59 | 
 60 | We can use the same architecture to swap the open-source models with the
 61 | [Amazon Titan](https://aws.amazon.com/bedrock/titan/) models. After
 62 | [Amazon Bedrock](https://aws.amazon.com/bedrock/) launches, we will
 63 | publish a follow-up post showing how to implement similar generative AI
 64 | applications using Amazon Bedrock, so stay tuned.
 65 | 
 66 | ## Solution overview
 67 | 
 68 | We use the [SageMaker docs](https://sagemaker.readthedocs.io) as the
 69 | knowledge corpus for this post. We convert the HTML pages on this site
 70 | into smaller overlapping chunks (to retain some context continuity
 71 | between chunks) of information and then convert these chunks into
 72 | embeddings using the gpt-j-6b model and store the embeddings in
 73 | OpenSearch Service. We implement the RAG functionality inside an AWS
 74 | Lambda function with Amazon API Gateway to handle routing all requests
 75 | to the Lambda. We implement a chatbot application in Streamlit which
 76 | invokes the function via the API Gateway and the function does a
 77 | similarity search in the OpenSearch Service index for the embeddings of
 78 | user question. The matching documents (chunks) are added to the prompt
 79 | as context by the Lambda function and then the function uses the
 80 | flan-t5-xxl model deployed as a SageMaker endpoint to generate an answer
 81 | to the user question. All the code for this post is available in the
 82 | [GitHub
 83 | repo](https://github.com/aws-samples/llm-apps-workshop/tree/main/blogs/rag).
 84 | 
 85 | The following figure represents the high-level architecture of the
 86 | proposed solution.
 87 | 
 88 | <figure>
 89 | <img src="img/ML-14328-architecture.png" id="fig-architecture"
 90 | alt="Figure 1: Architecture" />
 91 | <figcaption aria-hidden="true">Figure 1: Architecture</figcaption>
 92 | </figure>
 93 | 
 94 | Step-by-step explanation:
 95 | 
 96 | 1.  The User provides a question via the Streamlit web application.
 97 | 2.  The Streamlit application invokes the API Gateway endpoint REST API.
 98 | 3.  The API Gateway invokes the Lambda function.
 99 | 4.  The function invokes the SageMaker endpoint to convert user question
100 |     into embeddings.
101 | 5.  The function invokes invokes an OpenSearch Service API to find
102 |     similar documents to the user question.
103 | 6.  The function creates a “prompt” with the user query and the “similar
104 |     documents” as context and asks the SageMaker endpoint to generate a
105 |     response.
106 | 7.  The response is provided from the function to the API Gateway.
107 | 8.  The API Gateway provides the response to the Streamlit application.
108 | 9.  The User is able to view the response on the Streamlit application,
109 | 
110 | As illustrated in the architecture diagram, we use the following AWS
111 | services:
112 | 
113 | - [SageMaker](https://aws.amazon.com/pm/sagemaker) and [Amazon SageMaker
114 |   JumpStart](https://aws.amazon.com/sagemaker/jumpstart/) for hosting
115 |   the two LLMs.
116 | - [OpenSearch Service](https://aws.amazon.com/opensearch-service/) for
117 |   storing the embeddings of the enterprise knowledge corpus and doing
118 |   similarity search with user questions.
119 | - [Lambda](https://aws.amazon.com/lambda/) for implementing the RAG
120 |   functionality and exposing it as a REST endpoint via the [API
121 |   Gateway](https://aws.amazon.com/api-gateway/).
122 | - [Amazon SageMaker Processing
123 |   jobs](https://docs.aws.amazon.com/sagemaker/latest/dg/processing-job.html)
124 |   for large scale data ingestion into OpenSearch.
125 | - [Amazon SageMaker Studio](https://aws.amazon.com/sagemaker/studio/)
126 |   for hosting the Streamlit application.
127 | - [AWS Identity and Access Management](https://aws.amazon.com/iam/)
128 |   roles and policies for access management.
129 | - [AWS CloudFormation](https://aws.amazon.com/cloudformation/) for
130 |   creating the entire solution stack through infrastructure as code.
131 | 
132 | In terms of open-source packages used in this solution, we use
133 | [LangChain](https://python.langchain.com/en/latest/index.html) for
134 | interfacing with OpenSearch Service and SageMaker, and
135 | [FastAPI](https://github.com/tiangolo/fastapi) for implementing the REST
136 | API interface in the Lambda.
137 | 
138 | The workflow for instantiating the solution presented in this post in
139 | your own AWS account is as follows:
140 | 
141 | 1.  Run the CloudFormation template provided with this post in your
142 |     account. This will create all the necessary infrastructure resources
143 |     needed for this solution:
144 | 
145 |     1.  SageMaker endpoints for the LLMs
146 |     2.  OpenSearch Service cluster
147 |     3.  API Gateway
148 |     4.  Lambda function
149 |     5.  SageMaker Notebook
150 |     6.  IAM roles
151 | 
152 | 2.  Run the
153 |     [`data_ingestion_to_vectordb.ipynb`](./data_ingestion_to_vectordb.ipynb)
154 |     notebook in the SageMaker notebook to ingest data from [SageMaker
155 |     docs](https://sagemaker.readthedocs.io) into an OpenSearch Service
156 |     index.
157 | 
158 | 3.  Run the Streamlit application on a terminal in Studio and open the
159 |     URL for the application in a new browser tab.
160 | 
161 | 4.  Ask your questions about SageMaker via the chat interface provided
162 |     by the Streamlit app and view the responses generated by the LLM.
163 | 
164 | These steps are discussed in detail in the following sections.
165 | 
166 | ### Prerequisites
167 | 
168 | To implement the solution provided in this post, you should have an [AWS
169 | account](https://signin.aws.amazon.com/signin?redirect_uri=https%3A%2F%2Fportal.aws.amazon.com%2Fbilling%2Fsignup%2Fresume&client_id=signup)
170 | and familiarity with LLMs, OpenSearch Service and SageMaker.
171 | 
172 | We need access to accelerated instances (GPUs) for hosting the LLMs.
173 | This solution uses one instance each of `ml.g5.12xlarge` and
174 | `ml.g5.24xlarge`; you can check the availability of these instances in
175 | your AWS account and request these instances as needed via a
176 | `Sevice Quota` increase request as shown in the following screenshot.
177 | 
178 | <figure>
179 | <img src="img/ML-14328-service-quota.png"
180 | id="fig-service-quota-increase"
181 | alt="Figure 2: Service Quota Increase Request" />
182 | <figcaption aria-hidden="true">Figure 2: Service Quota Increase
183 | Request</figcaption>
184 | </figure>
185 | 
186 | #### Use AWS Cloud Formation to create the solution stack
187 | 
188 | We use AWS CloudFormation to create a SageMaker notebook called
189 | `aws-llm-apps-blog` and an IAM role called `LLMAppsBlogIAMRole`. Choose
190 | **Launch Stack** for the Region you want to deploy resources to. All
191 | parameters needed by the CloudFormation template have default values
192 | already filled in, except for the OpenSearch Service password which
193 | you’d have to provide. Make a note of the OpenSearch Service username
194 | and password, we use those in subsequent steps. **This template takes
195 | about 15 minutes to complete**.
196 | 
197 | |   AWS Region   |                                                                                                                                      Link                                                                                                                                      |
198 | |:--------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
199 | |   us-east-1    |   [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml)    |
200 | |   us-west-2    |   [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml)    |
201 | |   eu-west-1    |   [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=eu-west-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml)    |
202 | | ap-northeast-1 | [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
203 | 
204 | After the stack is created successfully, navigate to the stack’s
205 | `Outputs` tab on the AWS CloudFormation console and note the values for
206 | `OpenSearchDomainEndpoint` and `LLMAppAPIEndpoint`. We use those in the
207 | subsequent steps.
208 | 
209 | <figure>
210 | <img src="img/ML-14328-cfn-outputs.png" id="fig-cfn-outputs"
211 | alt="Figure 3: Cloud Formation Stack Outputs" />
212 | <figcaption aria-hidden="true">Figure 3: Cloud Formation Stack
213 | Outputs</figcaption>
214 | </figure>
215 | 
216 | #### Ingest the data into OpenSearch Service
217 | 
218 | To ingest the data, complete the following steps:
219 | 
220 | 1.  On the SageMaker console, choose **Notebooks** in the navigation
221 |     pane.
222 | 
223 | 2.  Select the notebook aws-llm-apps-blog and choose **Open
224 |     JupyterLab**.
225 | 
226 |     <figure>
227 |     <img src="img/ML-14328-sm-nb-jl.png" id="fig-open-jl"
228 |     alt="Figure 4: Open JupyterLab" />
229 |     <figcaption aria-hidden="true">Figure 4: Open JupyterLab</figcaption>
230 |     </figure>
231 | 
232 | 3.  Choose
233 |     [`data_ingestion_to_vectordb.ipynb`](./data_ingestion_to_vectordb.ipynb)
234 |     to open it in JupyterLab. This notebook will ingest the [SageMaker
235 |     docs](https://sagemaker.readthedocs.io) to an OpenSearch Service
236 |     index called `llm_apps_workshop_embeddings`.
237 | 
238 |     <figure>
239 |     <img src="img/ML-14328-sm-nb-path.png" id="fig-open-data-ingestion-nb"
240 |     alt="Figure 5: Open Data Ingestion Notebook" />
241 |     <figcaption aria-hidden="true">Figure 5: Open Data Ingestion
242 |     Notebook</figcaption>
243 |     </figure>
244 | 
245 | 4.  When the notebook is open, on the Run menu, choose **Run All Cells**
246 |     to run the code in this notebook. This will download the dataset
247 |     locally into the notebook and then ingest it into the OpenSearch
248 |     Service index. This notebook takes about 20 minutes to run. The
249 |     notebook also ingests the data into another vector database called
250 |     [`FAISS`](https://github.com/facebookresearch/faiss). The FAISS
251 |     index files are saved locally and the uploaded to Amazon Simple
252 |     Storage Service (S3) so that they can optionally be used by the
253 |     Lambda function as an illustration of using an alternate vector
254 |     database.
255 | 
256 |     <figure>
257 |     <img src="img/ML-14328-sm-nb-runall.png" id="fig-notebook-run-all-cells"
258 |     alt="Figure 6: Notebook Run All Cells" />
259 |     <figcaption aria-hidden="true">Figure 6: Notebook Run All
260 |     Cells</figcaption>
261 |     </figure>
262 | 
263 |     Now we’re ready to split the documents into chunks, which can then
264 |     be converted into embeddings to be ingested into OpenSearch. We use
265 |     the LangChain `RecursiveCharacterTextSplitter` class to chunk the
266 |     documents and then use the LangChain
267 |     `SagemakerEndpointEmbeddingsJumpStart` class to convert these chunks
268 |     into embeddings using the gpt-j-6b LLM. We store the embeddings in
269 |     OpenSearch Service via the LangChain `OpenSearchVectorSearch` class.
270 |     We package this code into Python scripts that are provided to the
271 |     SageMaker Processing Job via a custom container. See the
272 |     [`data_ingestion_to_vectordb.ipynb`](https://github.com/aws-samples/llm-apps-workshop/blob/main/blogs/rag/data_ingestion_to_vectordb.ipynb)
273 |     notebook for the full code.
274 | 
275 |     1.  Create a custom container, then install in it the `LangChain`
276 |         and `opensearch-py` Python packages.
277 |     2.  Upload this container image to Amazon Elastic Container Registry
278 |         (ECR).
279 |     3.  We use the SageMaker `ScriptProcessor` class to create a
280 |         SageMaker Processing job that will run on multiple nodes.
281 |         - The data files available in Amazon S3 are automatically
282 |           distributed across in the SageMaker Processing job instances
283 |           by setting `s3_data_distribution_type='ShardedByS3Key'` as
284 |           part of the `ProcessingInput` provided to the processing job.
285 |         - Each node processes a subset of the files and this brings down
286 |           the overall time required to ingest the data into OpenSearch
287 |           Service.
288 |         - Each node also uses Python `multiprocessing` to internally
289 |           also parallelize the file processing. Therefore, **there are
290 |           two levels of parallelization happening, one at the cluster
291 |           level where individual nodes are distributing the work (files)
292 |           amongst themselves and another at the node level where the
293 |           files in a node are also split between multiple processes
294 |           running on the node**.
295 | 
296 |     ``` python
297 |     # setup the ScriptProcessor with the above parameters
298 |     processor = ScriptProcessor(base_job_name=base_job_name,
299 |                                 image_uri=image_uri,
300 |                                 role=aws_role,
301 |                                 instance_type=instance_type,
302 |                                 instance_count=instance_count,
303 |                                 command=["python3"],
304 |                                 tags=tags)
305 | 
306 |     # setup input from S3, note the ShardedByS3Key, this ensures that 
307 |     # each instance gets a random and equal subset of the files in S3.
308 |     inputs = [ProcessingInput(source=f"s3://{bucket}/{app_name}/{DOMAIN}",
309 |                               destination='/opt/ml/processing/input_data',
310 |                               s3_data_distribution_type='ShardedByS3Key',
311 |                               s3_data_type='S3Prefix')]
312 | 
313 | 
314 |     logger.info(f"creating an opensearch index with name={opensearch_index}")
315 |     # ready to run the processing job
316 |     st = time.time()
317 |     processor.run(code="container/load_data_into_opensearch.py",
318 |                   inputs=inputs,
319 |                   outputs=[],
320 |                   arguments=["--opensearch-cluster-domain", opensearch_domain_endpoint,
321 |                             "--opensearch-secretid", os_creds_secretid_in_secrets_manager,
322 |                             "--opensearch-index-name", opensearch_index,
323 |                             "--aws-region", aws_region,
324 |                             "--embeddings-model-endpoint-name", embeddings_model_endpoint_name,
325 |                             "--chunk-size-for-doc-split", str(CHUNK_SIZE_FOR_DOC_SPLIT),
326 |                             "--chunk-overlap-for-doc-split", str(CHUNK_OVERLAP_FOR_DOC_SPLIT),
327 |                             "--input-data-dir", "/opt/ml/processing/input_data",
328 |                             "--create-index-hint-file", CREATE_OS_INDEX_HINT_FILE,
329 |                             "--process-count", "2"])
330 |     ```
331 | 
332 | 5.  Close the notebook after all cells run without any error. Your data
333 |     is now available in OpenSearch Service. Enter the following URL in
334 |     your browser’s address bar to get a count of documents in the
335 |     `llm_apps_workshop_embeddings` index. Use the OpenSearch Service
336 |     domain endpoint from the CloudFormation stack outputs in the URL
337 |     below. You’d be prompted for the OpenSearch Service username and
338 |     password, these are available from the CloudFormations stack.
339 | 
340 |         https://<your-opensearch-domain-endpoint>/llm_apps_workshop_embeddings/_count
341 | 
342 |     The browser window should show an output similar to the following.
343 |     This output shows that 5,667 documents were ingested into the
344 |     `llm_apps_workshop_embeddings` index.
345 |     `{"count":5667,"_shards":{"total":5,"successful":5,"skipped":0,"failed":0}}`
346 | 
347 | ### Run the Streamlit application in Studio
348 | 
349 | Now we’re ready to run the Streamlit web application for our question
350 | answering bot. This application allows the user to ask a question and
351 | then fetches the answer via the `/llm/rag` REST API endpoint provided by
352 | the Lambda function.
353 | 
354 | Studio provides a convenient platform to host the Streamlit web
355 | application. The following steps describes how to run the Streamlit app
356 | on Studio. Alternatively, you could also follow the same procedure to
357 | run the app on your laptop.
358 | 
359 | 1.  Open Studio and then open a new terminal.
360 | 
361 | 2.  Run the following commands on the terminal to clone the code
362 |     repository for this post and install the Python packages needed by
363 |     the application:
364 | 
365 |     ``` bash
366 |     git clone https://github.com/aws-samples/llm-apps-workshop
367 |     cd llm-apps-workshop/blogs/rag/app
368 |     pip install -r requirements.txt
369 |     ```
370 | 
371 | 3.  The API Gateway endpoint URL that is available from the
372 |     CloudFormation stack output needs to be set in the webapp.py file.
373 |     This is done by running the following `sed` command. Replace the
374 |     `<replace-with-LLMAppAPIEndpoint-value-from-cloudformation-stack-outputs>`
375 |     in the shell commands with the value of the `LLMAppAPIEndpoint`
376 |     field from the CloudFormation stack output and then run the
377 |     following commands to start a Streamlit app on Studio.
378 | 
379 |     ``` bash
380 |     EP=<replace-with-LLMAppAPIEndpoint-value-from-cloudformation-stack-outputs>
381 |     # replace __API_GW_ENDPOINT__ with  output from the cloud formation stack
382 |     sed -i "s|__API_GW_ENDPOINT__|$EP|g" webapp.py
383 |     streamlit run webapp.py    
384 |     ```
385 | 
386 | 4.  When the application runs successfully, you’ll see an output similar
387 |     to the following (the IP addresses you will see will be different
388 |     from the ones shown in this example). **Note the port number
389 |     (typically 8501) from the output** to use as part of the URL for app
390 |     in the next step.
391 | 
392 |     ``` bash
393 |     sagemaker-user@studio$ streamlit run webapp.py 
394 | 
395 |     Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
396 | 
397 | 
398 |       You can now view your Streamlit app in your browser.
399 | 
400 |       Network URL: http://169.255.255.2:8501
401 |       External URL: http://52.4.240.77:8501
402 |     ```
403 | 
404 | 5.  You can access the app in a new browser tab using a URL that is
405 |     similar to your Studio domain URL. For example, if your Studio URL
406 |     is
407 |     `https://d-randomidentifier.studio.us-east-1.sagemaker.aws/jupyter/default/lab?`
408 |     then the URL for your Streamlit app will be
409 |     `https://d-randomidentifier.studio.us-east-1.sagemaker.aws/jupyter/default/proxy/8501/webapp`
410 |     (notice that *lab* is replaced with *proxy/8501/webapp*). If the
411 |     port number noted in the previous step is different from 8501 then
412 |     use that instead of 8501 in the URL for the Streamlit app. The
413 |     following screenshot shows the app with a couple of user questions.
414 |     <img src="img/ML-14328-streamlit-app.png" id="fig-qa-bot"
415 |     alt="Question answering bot" />
416 | 
417 | ### A closer look at the RAG implementation in the Lambda function
418 | 
419 | Now that we have the application working end to end, lets take a closer
420 | look at the Lambda function. The Lambda function uses
421 | [`FastAPI`](https://fastapi.tiangolo.com/lo/) to implement the REST API
422 | for RAG and the [`Mangum`](https://pypi.org/project/mangum/) package to
423 | wrap the API with a handler that we package and deploy in the function.
424 | We use the API Gateway to route all incoming requests to invoke the
425 | function and handle the routing internally within our application.
426 | 
427 | The following code snippet shows how we find documents in the OpenSearch
428 | index that are similar to the user question and then create a prompt by
429 | combining the question and the similar documents. This prompt is then
430 | provided to the LLM for generating an answer to the user question.
431 | 
432 | ``` python
433 | 
434 | @router.post("/rag")
435 | async def rag_handler(req: Request) -> Dict[str, Any]:
436 |     # dump the received request for debugging purposes
437 |     logger.info(f"req={req}")
438 | 
439 |     # initialize vector db and SageMaker Endpoint
440 |     _init(req)
441 | 
442 |     # Use the vector db to find similar documents to the query
443 |     # the vector db call would automatically convert the query text
444 |     # into embeddings
445 |     docs = _vector_db.similarity_search(req.q, k=req.max_matching_docs)
446 |     logger.info(f"here are the {req.max_matching_docs} closest matching docs to the query=\"{req.q}\"")
447 |     for d in docs:
448 |         logger.info(f"---------")
449 |         logger.info(d)
450 |         logger.info(f"---------")
451 | 
452 |     # now that we have the matching docs, lets pack them as a context
453 |     # into the prompt and ask the LLM to generate a response
454 |     prompt_template = """Answer based on context:\n\n{context}\n\n{question}"""
455 | 
456 |     prompt = PromptTemplate(
457 |         template=prompt_template, input_variables=["context", "question"]
458 |     )
459 |     logger.info(f"prompt sent to llm = \"{prompt}\"")
460 |     chain = load_qa_chain(llm=_sm_llm, prompt=prompt)
461 |     answer = chain({"input_documents": docs, "question": req.q}, return_only_outputs=True)['output_text']
462 |     logger.info(f"answer received from llm,\nquestion: \"{req.q}\"\nanswer: \"{answer}\"")
463 |     resp = {'question': req.q, 'answer': answer}
464 |     if req.verbose is True:
465 |         resp['docs'] = docs
466 | 
467 |     return resp
468 | ```
469 | 
470 | ## Clean up
471 | 
472 | To avoid incurring future charges, delete the resources. You can do this
473 | by deleting the CloudFormation stack as shown in the following
474 | screenshot.
475 | 
476 | <figure>
477 | <img src="img/ML-14328-cfn-delete.png" id="fig-cleaning-up-2"
478 | alt="Figure 7: Cleaning Up" />
479 | <figcaption aria-hidden="true">Figure 7: Cleaning Up</figcaption>
480 | </figure>
481 | 
482 | ## Conclusion
483 | 
484 | In this post, we showed how to create an enterprise ready RAG solution
485 | using a combination of AWS service, open-source LLMs and open-source
486 | Python packages.
487 | 
488 | We encourage you to learn more by exploring
489 | [JumpStart](https://aws.amazon.com/sagemaker/jumpstart/), [Amazon
490 | Titan](https://aws.amazon.com/bedrock/titan/) models, [Amazon
491 | Bedrock](https://aws.amazon.com/bedrock/), and [OpenSearch
492 | Service](https://aws.amazon.com/opensearch-service/) and building a
493 | solution using the sample implementation provided in this post and a
494 | dataset relevant to your business. If you have questions or suggestions,
495 | leave a comment.
496 | 
497 | ------------------------------------------------------------------------
498 | 
499 | ## Author bio
500 | 
501 | <img style="float: left; margin: 0 10px 0 0;" src="img/ML-14328-Amit.png">Amit
502 | Arora is an AI and ML Specialist Architect at Amazon Web Services,
503 | helping enterprise customers use cloud-based machine learning services
504 | to rapidly scale their innovations. He is also an adjunct lecturer in
505 | the MS data science and analytics program at Georgetown University in
506 | Washington D.C.
507 | 
508 | <br><br>
509 | 
510 | <img style="float: left; margin: 0 10px 0 0;" src="img/ML-14328-xinhuang.jpg">Dr. Xin
511 | Huang is a Senior Applied Scientist for Amazon SageMaker JumpStart and
512 | Amazon SageMaker built-in algorithms. He focuses on developing scalable
513 | machine learning algorithms. His research interests are in the area of
514 | natural language processing, explainable deep learning on tabular data,
515 | and robust analysis of non-parametric space-time clustering. He has
516 | published many papers in ACL, ICDM, KDD conferences, and Royal
517 | Statistical Society: Series A.
518 | 
519 | <br><br>
520 | 
521 | <img style="float: left; margin: 0 10px 0 0;" src="img/ML-14328-ntuteja.png">Navneet
522 | Tuteja is a Data Specialist at Amazon Web Services. Before joining AWS,
523 | Navneet worked as a facilitator for organizations seeking to modernize
524 | their data architectures and implement comprehensive AI/ML solutions.
525 | She holds an engineering degree from Thapar University, as well as a
526 | master’s degree in statistics from Texas A&M University.
527 | 


--------------------------------------------------------------------------------
/blogs/rag/blog_post.qmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Build a powerful question answering bot with Amazon SageMaker, Amazon OpenSearch Service, Streamlit, and LangChain"
  3 | format:
  4 |   html:
  5 |     embed-resources: true
  6 |     output-file: blog_post.html
  7 |     theme: cosmo
  8 |     code-copy: true
  9 |     code-line-numbers: true
 10 |     highlight-style: github
 11 |   docx:
 12 |     embed-resources: true
 13 |     output-file: blog_post.docx
 14 |     theme: cosmo
 15 |     code-copy: true
 16 |     code-line-numbers: true
 17 |     highlight-style: github
 18 |   gfm: 
 19 |     output-file: blog_post.md
 20 | ---
 21 | 
 22 | _Amit Arora_, _Xin Huang_, _Navneet Tuteja_
 23 | 
 24 | One of the most common applications of generative AI and large language models (LLMs) in an enterprise environment is answering questions based on the enterprise’s knowledge corpus. [Amazon Lex](https://aws.amazon.com/lex/) provides the framework for building [AI based chatbots](https://aws.amazon.com/solutions/retail/ai-for-chatbots). Pre-trained foundation models (FMs) perform well at natural language understanding (NLU) tasks such summarization, text generation and question answering on a broad variety of topics but either struggle to provide accurate (without hallucinations) answers or completely fail at answering questions about content that they haven't seen as part of their training data. Furthermore, FMs are trained with a point in time snapshot of data and have no inherent ability to access fresh data at inference time; without this ability they might provide responses that are potentially incorrect or inadequate.
 25 | 
 26 | A commonly used approach to address this problem is to use a technique called Retrieval Augmented Generation (RAG). In the RAG-based approach we convert the user question into vector embeddings using an LLM and then do a similarity search for these embeddings in a pre-populated vector database holding the embeddings for the enterprise knowledge corpus. A small number of similar documents (typically three) is added as context along with the user question to the "prompt" provided to another LLM and then that LLM generates an answer to the user question using information provided as context in the prompt. RAG models were introduced by [Lewis et al.](https://arxiv.org/abs/2005.11401) in 2020 as a model where parametric memory is a pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a pre-trained neural retriever. To understand the overall structure of a RAG-based approach, refer to [Question answering using Retrieval Augmented Generation with foundation models in Amazon SageMaker JumpStart](https://aws.amazon.com/blogs/machine-learning/question-answering-using-retrieval-augmented-generation-with-foundation-models-in-amazon-sagemaker-jumpstart/).
 27 | 
 28 | In this post we provide a step-by-step guide with all the building blocks for creating an enterprise ready RAG application such as a question answering bot. We use a combination of different AWS services, open-source foundation models ([FLAN-T5 XXL](https://huggingface.co/google/flan-t5-xxl) for text generation and [GPT-j-6B](https://huggingface.co/EleutherAI/gpt-j-6b) for embeddings) and packages such as [LangChain](https://python.langchain.com/en/latest/index.html) for interfacing with all the components and [Streamlit](https://streamlit.io/) for building the bot frontend. 
 29 | 
 30 | We provide an AWS Cloud Formation template to stand up all the resources required for building this solution. We then demonstrate how to use LangChain for tying everything together:
 31 | 
 32 |   - Interfacing with LLMs hosted on Amazon SageMaker.
 33 |   - Chunking of knowledge base documents.
 34 |   - Ingesting document embeddings into Amazon OpenSearch Service.
 35 |   - Implementing the question answering task.
 36 | 
 37 | We can use the same architecture to swap the open-source models with the [Amazon Titan](https://aws.amazon.com/bedrock/titan/) models. After [Amazon Bedrock](https://aws.amazon.com/bedrock/) launches, we will publish a follow-up post showing how to implement similar generative AI applications using Amazon Bedrock, so stay tuned.
 38 | 
 39 | ## Solution overview
 40 | 
 41 | We use the [SageMaker docs](https://sagemaker.readthedocs.io) as the knowledge corpus for this post. We convert the HTML pages on this site into smaller overlapping chunks (to retain some context continuity between chunks) of information and then convert these chunks into embeddings using the gpt-j-6b model and store the embeddings in OpenSearch Service. We implement the RAG functionality inside an AWS Lambda function with Amazon API Gateway to handle routing all requests to the Lambda. We implement a chatbot application in Streamlit which invokes the function via the API Gateway and the function does a similarity search in the OpenSearch Service index for the embeddings of user question. The matching documents (chunks) are added to the prompt as context by the Lambda function and then the function uses the flan-t5-xxl model deployed as a SageMaker endpoint to generate an answer to the user question. All the code for this post is available in the [GitHub repo](https://github.com/aws-samples/llm-apps-workshop/tree/main/blogs/rag).
 42 | 
 43 | 
 44 | The following figure represents the high-level architecture of the proposed solution.
 45 | 
 46 | ![Architecture](img/ML-14328-architecture.png){#fig-architecture}
 47 | 
 48 | Step-by-step explanation:
 49 | 
 50 | 1. The User provides a question via the Streamlit web application.
 51 | 1. The Streamlit application invokes the API Gateway endpoint REST API.
 52 | 1. The API Gateway invokes the Lambda function.
 53 | 1. The function invokes the SageMaker endpoint to convert user question into embeddings.
 54 | 1. The function invokes invokes an OpenSearch Service API to find similar documents to the user question.
 55 | 1. The function creates a "prompt" with the user query and the "similar documents" as context and asks the SageMaker endpoint to generate a response.
 56 | 1. The response is provided from the function to the API Gateway.
 57 | 1. The API Gateway provides the response to the Streamlit application.
 58 | 1. The User is able to view the response on the Streamlit application,
 59 | 
 60 | As illustrated in the architecture diagram, we use the following AWS services:
 61 | 
 62 | - [SageMaker](https://aws.amazon.com/pm/sagemaker) and [Amazon SageMaker JumpStart](https://aws.amazon.com/sagemaker/jumpstart/) for hosting the two LLMs.
 63 | - [OpenSearch Service](https://aws.amazon.com/opensearch-service/) for storing the embeddings of the enterprise knowledge corpus and doing similarity search with user questions.
 64 | - [Lambda](https://aws.amazon.com/lambda/) for implementing the RAG functionality and exposing it as a REST endpoint via the [API Gateway](https://aws.amazon.com/api-gateway/).
 65 | - [Amazon SageMaker Processing jobs](https://docs.aws.amazon.com/sagemaker/latest/dg/processing-job.html) for large scale data ingestion into OpenSearch.
 66 | - [Amazon SageMaker Studio](https://aws.amazon.com/sagemaker/studio/) for hosting the Streamlit application.
 67 | - [AWS Identity and Access Management](https://aws.amazon.com/iam/) roles and policies for access management.
 68 | - [AWS CloudFormation](https://aws.amazon.com/cloudformation/) for creating the entire solution stack through infrastructure as code.
 69 | 
 70 | In terms of open-source packages used in this solution, we use [LangChain](https://python.langchain.com/en/latest/index.html) for interfacing with OpenSearch Service and SageMaker, and [FastAPI](https://github.com/tiangolo/fastapi) for implementing the REST API interface in the Lambda.
 71 | 
 72 | The workflow for instantiating the solution presented in this post in your own AWS account is as follows:
 73 | 
 74 | 1. Run the CloudFormation template provided with this post in your account. This will create all the necessary infrastructure resources needed for this solution: 
 75 |     a. SageMaker endpoints for the LLMs
 76 |     a. OpenSearch Service cluster
 77 |     a. API Gateway
 78 |     a. Lambda function
 79 |     a. SageMaker Notebook
 80 |     a. IAM roles
 81 | 
 82 | 1. Run the [`data_ingestion_to_vectordb.ipynb`](./data_ingestion_to_vectordb.ipynb) notebook in the SageMaker notebook to ingest data from [SageMaker docs](https://sagemaker.readthedocs.io) into an OpenSearch Service index.
 83 | 
 84 | 1. Run the Streamlit application on a terminal in Studio and open the URL for the application in a new browser tab. 
 85 | 
 86 | 1. Ask your questions about SageMaker via the chat interface provided by the Streamlit app and view the responses generated by the LLM. 
 87 | 
 88 | These steps are discussed in detail in the following sections.
 89 | 
 90 | ### Prerequisites
 91 | 
 92 | To implement the solution provided in this post, you should have an [AWS account](https://signin.aws.amazon.com/signin?redirect_uri=https%3A%2F%2Fportal.aws.amazon.com%2Fbilling%2Fsignup%2Fresume&client_id=signup) and familiarity with LLMs, OpenSearch Service and SageMaker.
 93 | 
 94 | We need access to accelerated instances (GPUs) for hosting the LLMs. This solution uses one instance each of `ml.g5.12xlarge` and `ml.g5.24xlarge`; you can check the availability of these instances in your AWS account and request these instances as needed via a `Sevice Quota` increase request as shown in the following screenshot.
 95 | 
 96 | ![Service Quota Increase Request](img/ML-14328-service-quota.png){#fig-service-quota-increase}
 97 | 
 98 | #### Use AWS Cloud Formation to create the solution stack
 99 | 
100 | We use AWS CloudFormation to create a SageMaker notebook called `aws-llm-apps-blog` and an IAM role called `LLMAppsBlogIAMRole`. Choose **Launch Stack** for the Region you want to deploy resources to. All parameters needed by the CloudFormation template have default values already filled in, except for the OpenSearch Service password which you'd have to provide. Make a note of the OpenSearch Service username and password, we use those in subsequent steps. **This template takes about 15 minutes to complete**.
101 | 
102 |   |AWS Region                |     Link    |
103 |   |:------------------------:|:-----------:|
104 |   |us-east-1  | [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml)|
105 |   |us-west-2          | [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
106 |   |eu-west-1      | [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=eu-west-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
107 |   |ap-northeast-1  | [<img src="./img/ML-14328-cloudformation-launch-stack.png">](https://console.aws.amazon.com/cloudformation/home?region=ap-northeast-1#/stacks/new?stackName=llm-apps-blog-rag&templateURL=https://aws-blogs-artifacts-public.s3.amazonaws.com/artifacts/ML-14328/template.yml) |
108 | 
109 | After the stack is created successfully, navigate to the stack's `Outputs` tab on the AWS CloudFormation console and note the values for `OpenSearchDomainEndpoint` and `LLMAppAPIEndpoint`. We use those in the subsequent steps.
110 | 
111 | ![Cloud Formation Stack Outputs](img/ML-14328-cfn-outputs.png){#fig-cfn-outputs}
112 | 
113 | #### Ingest the data into OpenSearch Service
114 | 
115 | To ingest the data, complete the following steps:
116 | 
117 | 1. On the SageMaker console, choose **Notebooks** in the navigation pane.
118 | 
119 | 1. Select the notebook aws-llm-apps-blog and choose **Open JupyterLab**.
120 | 
121 |     ![Open JupyterLab](img/ML-14328-sm-nb-jl.png){#fig-open-jl}
122 | 
123 | 1. Choose [`data_ingestion_to_vectordb.ipynb`](./data_ingestion_to_vectordb.ipynb) to open it in JupyterLab. This notebook will ingest the [SageMaker docs](https://sagemaker.readthedocs.io) to an OpenSearch Service index called `llm_apps_workshop_embeddings`.
124 | 
125 |     ![Open Data Ingestion Notebook](img/ML-14328-sm-nb-path.png){#fig-open-data-ingestion-nb}
126 | 
127 | 1. When the notebook is open, on the Run menu, choose **Run All Cells** to run the code in this notebook. This will download the dataset locally into the notebook and then ingest it into the OpenSearch Service index. This notebook takes about 20 minutes to run. The notebook also ingests the data into another vector database called [`FAISS`](https://github.com/facebookresearch/faiss). The FAISS index files are saved locally and the uploaded to Amazon Simple Storage Service (S3) so that they can optionally be used by the Lambda function as an illustration of using an alternate vector database.
128 | 
129 |     ![Notebook Run All Cells](img/ML-14328-sm-nb-runall){#fig-notebook-run-all-cells}
130 | 
131 |     Now we're ready to split the documents into chunks, which can then be converted into embeddings to be ingested into OpenSearch. We use the LangChain `RecursiveCharacterTextSplitter` class to chunk the documents and then use the LangChain `SagemakerEndpointEmbeddingsJumpStart` class to convert these chunks into embeddings using the gpt-j-6b LLM. We store the embeddings in OpenSearch Service via the LangChain `OpenSearchVectorSearch` class. We package this code into Python scripts that are provided to the SageMaker Processing Job via a custom container. See the [`data_ingestion_to_vectordb.ipynb`](https://github.com/aws-samples/llm-apps-workshop/blob/main/blogs/rag/data_ingestion_to_vectordb.ipynb) notebook for the full code. 
132 |     a. Create a custom container, then install in it the `LangChain` and `opensearch-py` Python packages. 
133 |     a. Upload this container image to Amazon Elastic Container Registry (ECR).
134 |     a. We use the SageMaker `ScriptProcessor` class to create a SageMaker Processing job that will run on multiple nodes.
135 |         * The data files available in Amazon S3 are automatically distributed across in the SageMaker Processing job instances by setting `s3_data_distribution_type='ShardedByS3Key'` as part of the `ProcessingInput` provided to the processing job.
136 |         * Each node processes a subset of the files and this brings down the overall time required to ingest the data into OpenSearch Service.
137 |         * Each node also uses Python `multiprocessing` to internally also parallelize the file processing. Therefore, **there are two levels of parallelization happening, one at the cluster level where individual nodes are distributing the work (files) amongst themselves and another at the node level where the files in a node are also split between multiple processes running on the node**.
138 | 
139 |     ```{.python}
140 |     # setup the ScriptProcessor with the above parameters
141 |     processor = ScriptProcessor(base_job_name=base_job_name,
142 |                                 image_uri=image_uri,
143 |                                 role=aws_role,
144 |                                 instance_type=instance_type,
145 |                                 instance_count=instance_count,
146 |                                 command=["python3"],
147 |                                 tags=tags)
148 | 
149 |     # setup input from S3, note the ShardedByS3Key, this ensures that 
150 |     # each instance gets a random and equal subset of the files in S3.
151 |     inputs = [ProcessingInput(source=f"s3://{bucket}/{app_name}/{DOMAIN}",
152 |                               destination='/opt/ml/processing/input_data',
153 |                               s3_data_distribution_type='ShardedByS3Key',
154 |                               s3_data_type='S3Prefix')]
155 | 
156 | 
157 |     logger.info(f"creating an opensearch index with name={opensearch_index}")
158 |     # ready to run the processing job
159 |     st = time.time()
160 |     processor.run(code="container/load_data_into_opensearch.py",
161 |                   inputs=inputs,
162 |                   outputs=[],
163 |                   arguments=["--opensearch-cluster-domain", opensearch_domain_endpoint,
164 |                             "--opensearch-secretid", os_creds_secretid_in_secrets_manager,
165 |                             "--opensearch-index-name", opensearch_index,
166 |                             "--aws-region", aws_region,
167 |                             "--embeddings-model-endpoint-name", embeddings_model_endpoint_name,
168 |                             "--chunk-size-for-doc-split", str(CHUNK_SIZE_FOR_DOC_SPLIT),
169 |                             "--chunk-overlap-for-doc-split", str(CHUNK_OVERLAP_FOR_DOC_SPLIT),
170 |                             "--input-data-dir", "/opt/ml/processing/input_data",
171 |                             "--create-index-hint-file", CREATE_OS_INDEX_HINT_FILE,
172 |                             "--process-count", "2"])
173 |     ```
174 | 
175 | 1. Close the notebook after all cells run without any error. Your data is now available in OpenSearch Service. Enter the following URL in your browser's address bar to get a count of documents in the `llm_apps_workshop_embeddings` index. Use the OpenSearch Service domain endpoint from the CloudFormation stack outputs in the URL below. You'd be prompted for the OpenSearch Service username and password, these are available from the CloudFormations stack.
176 | 
177 |     ```
178 |     https://<your-opensearch-domain-endpoint>/llm_apps_workshop_embeddings/_count
179 |     ```
180 | 
181 |     The browser window should show an output similar to the following. This output shows that 5,667 documents were ingested into the `llm_apps_workshop_embeddings` index.
182 |     ```
183 |     {"count":5667,"_shards":{"total":5,"successful":5,"skipped":0,"failed":0}}
184 |     ```
185 | 
186 | ### Run the Streamlit application in Studio
187 | 
188 | Now we're ready to run the Streamlit web application for our question answering bot. This application allows the user to ask a question and then fetches the answer via the `/llm/rag` REST API endpoint provided by the Lambda function. 
189 | 
190 | Studio provides a convenient platform to host the Streamlit web application. The following steps describes how to run the Streamlit app on Studio. Alternatively, you could also follow the same procedure to run the app on your laptop.
191 | 
192 | 1. Open Studio and then open a new terminal.
193 | 
194 | 1. Run the following commands on the terminal to clone the code repository for this post and install the Python packages needed by the application:
195 | 
196 |     ```{.bash}
197 |     git clone https://github.com/aws-samples/llm-apps-workshop
198 |     cd llm-apps-workshop/blogs/rag/app
199 |     pip install -r requirements.txt
200 |     ```
201 | 1. The API Gateway endpoint URL that is available from the CloudFormation stack output needs to be set in the webapp.py file. This is done by running the following `sed` command. Replace the `<replace-with-LLMAppAPIEndpoint-value-from-cloudformation-stack-outputs>` in the shell commands with the value of the `LLMAppAPIEndpoint` field from the CloudFormation stack output and then run the following commands to start a Streamlit app on Studio. 
202 | 
203 |     ```{.bash}
204 |     EP=<replace-with-LLMAppAPIEndpoint-value-from-cloudformation-stack-outputs>
205 |     # replace __API_GW_ENDPOINT__ with  output from the cloud formation stack
206 |     sed -i "s|__API_GW_ENDPOINT__|$EP|g" webapp.py
207 |     streamlit run webapp.py    
208 |     ```
209 | 
210 | 1. When the application runs successfully, you'll see an output similar to the following (the IP addresses you will see will be different from the ones shown in this example). **Note the port number (typically 8501) from the output** to use as part of the URL for app in the next step.
211 | 
212 |     ```{.bash}
213 |     sagemaker-user@studio$ streamlit run webapp.py 
214 | 
215 |     Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
216 | 
217 | 
218 |       You can now view your Streamlit app in your browser.
219 | 
220 |       Network URL: http://169.255.255.2:8501
221 |       External URL: http://52.4.240.77:8501
222 |     ```
223 | 
224 | 1. You can access the app in a new browser tab using a URL that is similar to your Studio domain URL. For example, if your Studio URL is `https://d-randomidentifier.studio.us-east-1.sagemaker.aws/jupyter/default/lab?` then the URL for your Streamlit app will be `https://d-randomidentifier.studio.us-east-1.sagemaker.aws/jupyter/default/proxy/8501/webapp` (notice that _lab_ is replaced with _proxy/8501/webapp_). If the port number noted in the previous step is different from 8501 then use that instead of 8501 in the URL for the Streamlit app. The following screenshot shows the app with a couple of user questions.
225 |     ![Question answering bot](img/ML-14328-streamlit-app.png){#fig-qa-bot}
226 | 
227 | ### A closer look at the RAG implementation in the Lambda function
228 | 
229 | Now that we have the application working end to end, lets take a closer look at the Lambda function. The Lambda function uses [`FastAPI`](https://fastapi.tiangolo.com/lo/) to implement the REST API for RAG and the [`Mangum`](https://pypi.org/project/mangum/) package to wrap the API with a handler that we package and deploy in the function. We use the API Gateway to route all incoming requests to invoke the function and handle the routing internally within our application.
230 | 
231 | The following code snippet shows how we find documents in the OpenSearch index that are similar to the user question and then create a prompt by combining the question and the similar documents. This prompt is then provided to the LLM for generating an answer to the user question.
232 | 
233 | ```{.python}
234 | 
235 | @router.post("/rag")
236 | async def rag_handler(req: Request) -> Dict[str, Any]:
237 |     # dump the received request for debugging purposes
238 |     logger.info(f"req={req}")
239 | 
240 |     # initialize vector db and SageMaker Endpoint
241 |     _init(req)
242 | 
243 |     # Use the vector db to find similar documents to the query
244 |     # the vector db call would automatically convert the query text
245 |     # into embeddings
246 |     docs = _vector_db.similarity_search(req.q, k=req.max_matching_docs)
247 |     logger.info(f"here are the {req.max_matching_docs} closest matching docs to the query=\"{req.q}\"")
248 |     for d in docs:
249 |         logger.info(f"---------")
250 |         logger.info(d)
251 |         logger.info(f"---------")
252 | 
253 |     # now that we have the matching docs, lets pack them as a context
254 |     # into the prompt and ask the LLM to generate a response
255 |     prompt_template = """Answer based on context:\n\n{context}\n\n{question}"""
256 | 
257 |     prompt = PromptTemplate(
258 |         template=prompt_template, input_variables=["context", "question"]
259 |     )
260 |     logger.info(f"prompt sent to llm = \"{prompt}\"")
261 |     chain = load_qa_chain(llm=_sm_llm, prompt=prompt)
262 |     answer = chain({"input_documents": docs, "question": req.q}, return_only_outputs=True)['output_text']
263 |     logger.info(f"answer received from llm,\nquestion: \"{req.q}\"\nanswer: \"{answer}\"")
264 |     resp = {'question': req.q, 'answer': answer}
265 |     if req.verbose is True:
266 |         resp['docs'] = docs
267 | 
268 |     return resp
269 | ```
270 | 
271 | ## Clean up
272 | 
273 | To avoid incurring future charges, delete the resources. You can do this by deleting the CloudFormation stack as shown in the following screenshot.
274 | 
275 | ![Cleaning Up](img/ML-14328-cfn-delete.png){#fig-cleaning-up-2}
276 | 
277 | 
278 | ## Conclusion
279 | 
280 | In this post, we showed how to create an enterprise ready RAG solution using a combination of AWS service, open-source LLMs and open-source Python packages.
281 | 
282 | We encourage you to learn more by exploring [JumpStart](https://aws.amazon.com/sagemaker/jumpstart/), [Amazon Titan](https://aws.amazon.com/bedrock/titan/) models, [Amazon Bedrock](https://aws.amazon.com/bedrock/), and [OpenSearch Service](https://aws.amazon.com/opensearch-service/) and building a solution using the sample implementation provided in this post and a dataset relevant to your business. If you have questions or suggestions, leave a comment.
283 | 
284 |   *  *  *  *  *
285 | 
286 | ## Author bio
287 | 
288 | <img style="float: left; margin: 0 10px 0 0;" src="img/ML-14328-Amit.png">Amit Arora is an AI and ML Specialist Architect at Amazon Web Services, helping enterprise customers use cloud-based machine learning services to rapidly scale their innovations. He is also an adjunct lecturer in the MS data science and analytics program at Georgetown University in Washington D.C.
289 | 
290 | <br><br>
291 | 
292 | <img style="float: left; margin: 0 10px 0 0;" src="img/ML-14328-xinhuang.jpg">Dr. Xin Huang is a Senior Applied Scientist for Amazon SageMaker JumpStart and Amazon SageMaker built-in algorithms. He focuses on developing scalable machine learning algorithms. His research interests are in the area of natural language processing, explainable deep learning on tabular data, and robust analysis of non-parametric space-time clustering. He has published many papers in ACL, ICDM, KDD conferences, and Royal Statistical Society: Series A.
293 | 
294 | <br><br>
295 | 
296 | <img style="float: left; margin: 0 10px 0 0;" src="img/ML-14328-ntuteja.png">Navneet Tuteja is a Data Specialist at Amazon Web Services. Before joining AWS, Navneet worked as a facilitator for organizations seeking to modernize their data architectures and implement comprehensive AI/ML solutions. She holds an engineering degree from Thapar University, as well as a master's degree in statistics from Texas A&M University.
297 | 


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-Amit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-Amit.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-architecture.drawio:
--------------------------------------------------------------------------------
  1 | <mxfile host="app.diagrams.net" modified="2023-05-20T12:09:39.706Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" etag="MHZr-pvM9M6JJszZioBI" version="21.2.9" type="device" pages="2">
  2 |   <diagram name="Page-1" id="wL2zpgHCOEQd1yw8bdDc">
  3 |     <mxGraphModel dx="1434" dy="796" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
  4 |       <root>
  5 |         <mxCell id="0" />
  6 |         <mxCell id="1" parent="0" />
  7 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-2" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Fine-tuned LLM&lt;br&gt;&lt;/b&gt;(trained, hosted on Amazon SageMaker)&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
  8 |           <mxGeometry x="480" y="470" width="170" height="90" as="geometry" />
  9 |         </mxCell>
 10 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-1" value="" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#4AB29A;gradientDirection=north;fillColor=#116D5B;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.sagemaker;" parent="1" vertex="1">
 11 |           <mxGeometry x="614" y="526" width="30" height="30" as="geometry" />
 12 |         </mxCell>
 13 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-21" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="glEfhbe2DrdjLcbNgMK2-4" target="glEfhbe2DrdjLcbNgMK2-7" edge="1">
 14 |           <mxGeometry relative="1" as="geometry" />
 15 |         </mxCell>
 16 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-4" value="&lt;b&gt;&lt;font style=&quot;font-size: 13px;&quot;&gt;Vector Database&lt;/font&gt;&lt;/b&gt;" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
 17 |           <mxGeometry x="430" y="610" width="154" height="90" as="geometry" />
 18 |         </mxCell>
 19 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-35" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="glEfhbe2DrdjLcbNgMK2-5" target="glEfhbe2DrdjLcbNgMK2-6" edge="1">
 20 |           <mxGeometry relative="1" as="geometry" />
 21 |         </mxCell>
 22 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-5" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;Enterprise Knowledge corpus&lt;br&gt;&lt;/b&gt;(Amazon S3)&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
 23 |           <mxGeometry x="933" y="610" width="154" height="90" as="geometry" />
 24 |         </mxCell>
 25 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-34" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="glEfhbe2DrdjLcbNgMK2-6" target="glEfhbe2DrdjLcbNgMK2-4" edge="1">
 26 |           <mxGeometry relative="1" as="geometry" />
 27 |         </mxCell>
 28 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-6" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;LLM for generating embeddings&lt;br&gt;&lt;/b&gt;(Amazon SageMaker)&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
 29 |           <mxGeometry x="670" y="610" width="200" height="90" as="geometry" />
 30 |         </mxCell>
 31 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-7" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;&lt;b&gt;VEP LLM Assist Endpoint&lt;br&gt;&lt;/b&gt;(Amazon API Gateway + AWS Lambda)&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
 32 |           <mxGeometry x="191" y="470" width="164" height="90" as="geometry" />
 33 |         </mxCell>
 34 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-8" value="" style="endArrow=classic;html=1;rounded=0;" parent="1" edge="1">
 35 |           <mxGeometry width="50" height="50" relative="1" as="geometry">
 36 |             <mxPoint x="90" y="498" as="sourcePoint" />
 37 |             <mxPoint x="190" y="498" as="targetPoint" />
 38 |           </mxGeometry>
 39 |         </mxCell>
 40 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-9" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;Customer Question&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
 41 |           <mxGeometry x="110" y="463" width="60" height="30" as="geometry" />
 42 |         </mxCell>
 43 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-10" value="" style="endArrow=classic;html=1;rounded=0;" parent="1" edge="1">
 44 |           <mxGeometry width="50" height="50" relative="1" as="geometry">
 45 |             <mxPoint x="188" y="537" as="sourcePoint" />
 46 |             <mxPoint x="88" y="536" as="targetPoint" />
 47 |           </mxGeometry>
 48 |         </mxCell>
 49 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-11" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;LLM Generated Response&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
 50 |           <mxGeometry x="105" y="549" width="60" height="30" as="geometry" />
 51 |         </mxCell>
 52 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-12" value="" style="endArrow=classic;html=1;rounded=0;" parent="1" edge="1">
 53 |           <mxGeometry width="50" height="50" relative="1" as="geometry">
 54 |             <mxPoint x="370" y="498" as="sourcePoint" />
 55 |             <mxPoint x="460" y="498" as="targetPoint" />
 56 |           </mxGeometry>
 57 |         </mxCell>
 58 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-14" value="Documents" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
 59 |           <mxGeometry x="872" y="626" width="60" height="30" as="geometry" />
 60 |         </mxCell>
 61 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-16" value="" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#4AB29A;gradientDirection=north;fillColor=#116D5B;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.sagemaker;" parent="1" vertex="1">
 62 |           <mxGeometry x="830" y="665" width="30" height="30" as="geometry" />
 63 |         </mxCell>
 64 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-19" value="" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#60A337;gradientDirection=north;fillColor=#277116;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.s3;" parent="1" vertex="1">
 65 |           <mxGeometry x="1053" y="666" width="28" height="28" as="geometry" />
 66 |         </mxCell>
 67 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-22" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;Prompt&lt;br&gt;(customer question + context)&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
 68 |           <mxGeometry x="355" y="460" width="125" height="30" as="geometry" />
 69 |         </mxCell>
 70 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-24" value="" style="endArrow=classic;html=1;rounded=0;" parent="1" edge="1">
 71 |           <mxGeometry width="50" height="50" relative="1" as="geometry">
 72 |             <mxPoint x="464" y="540.5" as="sourcePoint" />
 73 |             <mxPoint x="364" y="539.5" as="targetPoint" />
 74 |           </mxGeometry>
 75 |         </mxCell>
 76 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-25" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;LLM Generated Response&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
 77 |           <mxGeometry x="384" y="550" width="60" height="30" as="geometry" />
 78 |         </mxCell>
 79 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-26" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;Documents relevant to customer question&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
 80 |           <mxGeometry x="290" y="614" width="120" height="30" as="geometry" />
 81 |         </mxCell>
 82 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-27" value="" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#945DF2;gradientDirection=north;fillColor=#5A30B5;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.api_gateway;" parent="1" vertex="1">
 83 |           <mxGeometry x="320" y="526" width="30" height="30" as="geometry" />
 84 |         </mxCell>
 85 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-32" value="&lt;font style=&quot;font-size: 13px;&quot;&gt;Embeddings&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
 86 |           <mxGeometry x="599" y="626" width="60" height="30" as="geometry" />
 87 |         </mxCell>
 88 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-36" value="&lt;h1&gt;LLM Assisted Virtual Agent Platform&lt;/h1&gt;&lt;p style=&quot;font-size: 14px;&quot;&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;A pre-trained Generative AI model fine-tuned on an enterprise&#39;s knowledge corpus provides responses to customer queries.&amp;nbsp;&lt;/font&gt;&lt;/p&gt;&lt;p style=&quot;font-size: 14px;&quot;&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;The entire knowledge corpus is converted to embeddings. Most relevant embeddings are included as &quot;context&quot; in the &quot;prompt&quot; provided to the Generative AI model.&lt;/font&gt;&lt;/p&gt;" style="text;html=1;strokeColor=none;fillColor=none;spacing=5;spacingTop=-20;whiteSpace=wrap;overflow=hidden;rounded=0;" parent="1" vertex="1">
 89 |           <mxGeometry x="80" y="270" width="470" height="150" as="geometry" />
 90 |         </mxCell>
 91 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-37" value="1" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=13;" parent="1" vertex="1">
 92 |           <mxGeometry x="170" y="450" width="20" height="20" as="geometry" />
 93 |         </mxCell>
 94 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-39" value="2" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=13;" parent="1" vertex="1">
 95 |           <mxGeometry x="400" y="624" width="20" height="20" as="geometry" />
 96 |         </mxCell>
 97 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-40" value="3" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=13;" parent="1" vertex="1">
 98 |           <mxGeometry x="444" y="450" width="20" height="20" as="geometry" />
 99 |         </mxCell>
100 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-41" value="4" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=13;" parent="1" vertex="1">
101 |           <mxGeometry x="446" y="547" width="20" height="20" as="geometry" />
102 |         </mxCell>
103 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-42" value="5" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=13;" parent="1" vertex="1">
104 |           <mxGeometry x="167" y="547" width="20" height="20" as="geometry" />
105 |         </mxCell>
106 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-44" value="" style="verticalLabelPosition=bottom;shadow=0;dashed=0;align=center;html=1;verticalAlign=top;strokeWidth=1;shape=mxgraph.mockup.markup.curlyBrace;strokeColor=#999999;fontSize=13;rotation=-180;" parent="1" vertex="1">
107 |           <mxGeometry x="540.08" y="720" width="551" height="20" as="geometry" />
108 |         </mxCell>
109 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-45" value="&lt;span style=&quot;font-size: 13px;&quot;&gt;Periodic (offline) ingestion of knowledge base into a vector database&lt;/span&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
110 |           <mxGeometry x="695.27" y="750" width="230.54" height="30" as="geometry" />
111 |         </mxCell>
112 |         <mxCell id="glEfhbe2DrdjLcbNgMK2-46" value="" style="outlineConnect=0;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;shape=mxgraph.aws3.elasticsearch_service;fillColor=#F58534;gradientColor=none;fontSize=13;" parent="1" vertex="1">
113 |           <mxGeometry x="552" y="665" width="26" height="31" as="geometry" />
114 |         </mxCell>
115 |       </root>
116 |     </mxGraphModel>
117 |   </diagram>
118 |   <diagram id="m2dm4Ois7a044IRhhdzs" name="Page-2">
119 |     <mxGraphModel dx="2284" dy="796" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
120 |       <root>
121 |         <mxCell id="0" />
122 |         <mxCell id="1" parent="0" />
123 |         <mxCell id="cLbmFV5fhnx_uU40nDEj-4" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=1.5;" parent="1" edge="1">
124 |           <mxGeometry relative="1" as="geometry">
125 |             <mxPoint x="150" y="287" as="sourcePoint" />
126 |             <mxPoint x="210" y="287" as="targetPoint" />
127 |           </mxGeometry>
128 |         </mxCell>
129 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-7" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.75;exitDx=0;exitDy=0;strokeWidth=1.5;" parent="1" source="BONjwN0Ef_ad3_QLhuGD-3" edge="1">
130 |           <mxGeometry relative="1" as="geometry">
131 |             <mxPoint x="-40" y="319" as="targetPoint" />
132 |           </mxGeometry>
133 |         </mxCell>
134 |         <mxCell id="BONjwN0Ef_ad3_QLhuGD-3" value="" style="shape=image;html=1;verticalAlign=top;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;imageAspect=0;aspect=fixed;image=https://cdn4.iconfinder.com/data/icons/essentials-74/24/004_-_Website-128.png" parent="1" vertex="1">
135 |           <mxGeometry x="80" y="259" width="80" height="80" as="geometry" />
136 |         </mxCell>
137 |         <mxCell id="cLbmFV5fhnx_uU40nDEj-5" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;strokeWidth=1.5;" parent="1" edge="1">
138 |           <mxGeometry relative="1" as="geometry">
139 |             <mxPoint x="288" y="287" as="sourcePoint" />
140 |             <mxPoint x="370" y="287" as="targetPoint" />
141 |           </mxGeometry>
142 |         </mxCell>
143 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-6" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.75;exitDx=0;exitDy=0;exitPerimeter=0;entryX=1;entryY=0.75;entryDx=0;entryDy=0;strokeWidth=1.5;" parent="1" source="VfcQOOTff2Z6IAM3ECGz-2" target="BONjwN0Ef_ad3_QLhuGD-3" edge="1">
144 |           <mxGeometry relative="1" as="geometry" />
145 |         </mxCell>
146 |         <mxCell id="VfcQOOTff2Z6IAM3ECGz-2" value="" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#945DF2;gradientDirection=north;fillColor=#5A30B5;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.api_gateway;" parent="1" vertex="1">
147 |           <mxGeometry x="210" y="260" width="78" height="78" as="geometry" />
148 |         </mxCell>
149 |         <mxCell id="cLbmFV5fhnx_uU40nDEj-6" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;startArrow=classic;startFill=1;strokeWidth=1.5;" parent="1" source="VfcQOOTff2Z6IAM3ECGz-3" target="0dFsV1McNjQj3HDfaPEC-1" edge="1">
150 |           <mxGeometry relative="1" as="geometry">
151 |             <Array as="points">
152 |               <mxPoint x="409" y="378" />
153 |               <mxPoint x="410" y="378" />
154 |             </Array>
155 |           </mxGeometry>
156 |         </mxCell>
157 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-8" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.75;exitDx=0;exitDy=0;exitPerimeter=0;strokeWidth=1.5;" parent="1" source="VfcQOOTff2Z6IAM3ECGz-3" edge="1">
158 |           <mxGeometry relative="1" as="geometry">
159 |             <mxPoint x="290" y="319" as="targetPoint" />
160 |           </mxGeometry>
161 |         </mxCell>
162 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-18" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.25;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0;entryY=0.25;entryDx=0;entryDy=0;entryPerimeter=0;strokeWidth=1.5;startArrow=classic;startFill=1;" parent="1" source="VfcQOOTff2Z6IAM3ECGz-3" target="qsVVxSyBkCnicMPbibAm-1" edge="1">
163 |           <mxGeometry relative="1" as="geometry" />
164 |         </mxCell>
165 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-21" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.75;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0;entryY=0.75;entryDx=0;entryDy=0;entryPerimeter=0;strokeWidth=1.5;startArrow=classic;startFill=1;" parent="1" source="VfcQOOTff2Z6IAM3ECGz-3" target="BONjwN0Ef_ad3_QLhuGD-1" edge="1">
166 |           <mxGeometry relative="1" as="geometry">
167 |             <Array as="points">
168 |               <mxPoint x="480" y="319" />
169 |               <mxPoint x="480" y="459" />
170 |             </Array>
171 |           </mxGeometry>
172 |         </mxCell>
173 |         <mxCell id="VfcQOOTff2Z6IAM3ECGz-3" value="" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#F78E04;gradientDirection=north;fillColor=#D05C17;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.lambda;" parent="1" vertex="1">
174 |           <mxGeometry x="371" y="260" width="78" height="78" as="geometry" />
175 |         </mxCell>
176 |         <mxCell id="qsVVxSyBkCnicMPbibAm-1" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;Amazon SageMaker Endpoint &lt;br&gt;(LLM for Text Generation)&lt;/font&gt;" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#4AB29A;gradientDirection=north;fillColor=#116D5B;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.sagemaker;" parent="1" vertex="1">
177 |           <mxGeometry x="550" y="260" width="78" height="78" as="geometry" />
178 |         </mxCell>
179 |         <mxCell id="0dFsV1McNjQj3HDfaPEC-1" value="Amazon OpenSearch Service &#xa;(Vector database)" style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=default;verticalAlign=top;aspect=fixed;imageAspect=0;image=data:image/png,iVBORw0KGgoAAAANSUhEUgAAAIYAAACGCAIAAACXG2XGAAAAAXNSR0IArs4c6QAAHT1JREFUeJztnXmcFMXZx3/Vc+zO3gd7sOxyL7ciEhQPREgUVDQqmChRg0c8E2NiYg7fGE18NUaTN5+8Rt/kVV/PoKiJBsUDRREU8UAQUMMl7C7Hsvc1O7Mz0/X+0d3VVdXVs7Mze6Fbn8/M9HQ/dT3fqXqqnqruIZRSDIXBFLSBLsBQkIMXwM8XbIOmASBigyHmBwFAQKGBISRUum591UAAUO6URuwEjVianIIUCwAh3LEhZr1Rlq8GjYsiRjIjUA2AIEYJ0QQhAj5bqzwAEbQhpG1qRBGL8lfEtEEg6kqlPlz/eI5VPF2HoBMAMItEKQxF6AB0q2LSdeurDgqbLQWg2zUzY+lyClIsAHx3Sg0x642wfHXogtqkPpgCIDoAQYxQqgtCFHy2VnkAKmhDSNvUiCIW4a+IaYNC1JVKfYDxe7BSHKICDAIqGv9liIpQlQSoUGWsFKjAQsJdG6LCV6U7KkDvU9EEOTPFISrAwFHRwJ2wD4eo8FWxqNgK6TsqtnkfosIn7kJFUEifUdHkE0NUBpQKZPM+RIVPfICoOMx7f1OhQqxBRkWt376lwk8VB4YKJzn4qCAeFdpHVDSuWtYFO94QFTkrjgr6ggpsWzJEhdgkBpYKZ0u+8lQwKKgI6yVDVAYFFd68S9eGqAwMFU2QE6/1ARW+fENUFFTA+bj6h4qU2ZFDxTVWb1OxbcmXnQpSpBIvVu9S4W3Jl5qKUK7BTUVcVRyiwokNCBUoVhWHqHBiA0LFy8kRChDrkLLdLKCghJr7XywxXtQ41HVoGiXCziNLioIQCkJ0Ck03e0vCGivLjAIEOqgGQijTOtEBzUqJgFAKnUCzjo1yWbGGjfBPmp2p+QjkPWpmdCPwlyJd9NN1wbb6GEAp5XcHUYAQHVQDCNWpuduI6NA1qgn6ZrGsqli1pMTafCSkLanYjkUp9fKaO9KppGVrV9xTlpbR4/2CMxdmPXDtQarLmhsAKvFWFY/AHqyo3J8EDwC5Rd7MfA8r70D1YDoopZSbKh75VEgyOKwgFHjAqIC3JUbPcoT3YHZ4+s7aPVs6bYWrdrQe8/WsBVcWsPM6ceuL0G89mGVLvixUWBsCEOnSI2GdX/zjFGwWPxoR20I8/fYHFWtbNV+PI7wHkwIfC1K3Il+0ziv6IuEr+r4H05zXvjxUEpmvOMsl6xf9SQWcj+tLQcXxq49PxQFxUFDhbInYux2JdkUdxFhyReVAATL5hIzxMwOaF9I4zDUSSFeH/uGLHU210ZTtCo9EvnZkUIEOotn5CbUhlFBhbh+PihW5ZIxv8U+HwVX7rqFiatrDPz6cirU3hCTzfuT1YAAo14NJgTKxuD0YX5qiCl8SPAAUjTR/3yn2YF5Ljh/spd5WoI7VN20FADXbih3YNCVOW+FBMu8k33g623Q9pobNQlrA400D03OKI2OYSGT9pk7FPVZqVAArURUVua1YGopLhS+GNLfBo7+sra+JAM77Iu2555yluSctzrbOpjxfsW3JIKDiSyNjpmX6061BIAGzrpL+iPUWjdCqzzrDQb7/M4PhajFmkegBFfUJxyzSfYyQAhUjNy/sH+kAU7nirtGjpgQUlYwbGg9G/nL9vkiXonvh5/ZQUlEFFRWCxKikPLcH5B0qst1O3dq7xxKtfU6BJwkeAAqG+0rG+hUTcSM/cRYpW3su6E5rzyVjfnQzXwFSnq+A3+04sFQ0T1KjHACAphFpNMUyQlwqUnDqV/LRQkVFbFcUqVKBw+2IgenB+PD2sw3bN7TxZ4yHF5jVAwBUTA6ccWWxLWH0GJAqauXn0oMphsHyKWqlwaXm6MHkKKl4J7kRl3it/6lwdWrY31W1PWgRsD41yj+bQlyqogCgy797ys8i3eyKQqNOKqR/qBhHGuReZsB6MLFKwiwSANWJYxYpR1F6drvvwbgU1OmK+mEpO3owR3mS6sG8VgL873kg2opKI5aUVRydEDZfkSTNRNHZHmtvjkVCeqjD4ufeVoSOjgDuI2Pi3lbEYgirMkm0FfC2ZGCpCBXj1BKHihHCQT0UNPYxYN/24J3f3g1Yz8lhlYlHxQxEN1To2lbcqMiSqVIR3Y69QQUusRRUJFGhJNyURUlFj2H7O20frW7ZvakjFuVFqNvcXkFFCooxgtBhqamoxgjJUTGOvOA8Mw7NJUElTiyZCndZ1WvFpbJ7S9vuLR3GVSKLJEqFxhy5xv3Vu1FRFD45KgAMJIRSnXtKVRJUSkalnXpBSUaWhyuh22jEvBaL0c82tm16vVlBxbbA3bQV5gfzaKRkdFr5hED+cF9uodefrmleEg3rHa2xlrpo7RddB3aH2ptiEpVNq1s0H+ZfXJCR7dEJZTqPo18VFXVIkAqFJGZ1XClSWXbb2JKR6a4EXML0uXmHq0I1O0KKibOuG09ti0/F58eUE3OmnpQ9YWamP9DNlqFDe8Kfv9+x+Y3WptoIo/LhqpbP3+345o3FVl4utiQxKuIel4SoOMQ4W5IKlSR4GGFYeVrNjk7V8LUbKunZ3pPPKzj+rPzMXI8zqjKUjk0rHZs299sFezYH1z7duO/TkJF4e3P0ydsOah5AHFlYGVpH8ahYMt34wRKiIpj3pKmww1BHLNjm7KGF4PGR3EKfmaOZjEyEcrNIiFSIByeeUzB/aVEgWw2jsz0W6tBjUZqWoWVkej1+4SohGDcjY9yMjJ0fdaz6a33joYjROPSYNYsUi8GPjNVUpJInRgUA1WQqhqoN80415q5IjooV1j9ft+rhA4Y++c1sVuoASHFF2s0PT3JElVWhpDJshO+Cn1SMnCw0Sl3HF1s6dn7csXdbZ111OGzOSMw880t9w8emj52eMen4zLxiH4tVOTPz+v/OXP1o3XsvtriNjC0VxqXirEkCVGAYEicV1kqSpQK56XK5ctCEWIIc70NypiBSqZyV+Z1fVPCulGBr7J0XGj96pbm1MWp7XLhVLwBNhyLNtZHPNrS/9D8YfXTGyefnT5yVaUT3puGMq4oqJgee/1NtJEINKg37u/QYtd2gcdftnUMBN48LpdTuFNypwBpxgZLkqKg6VF6nSiqSmKP1cy4dm8qshQXn/XC4Zj1PNRal655reOvp+q6grhyD8VTYyHjvJ8G924Ll49PPura4fILZ1KbNycor9j5+64FQp04o3b8j/OpD9WdcVWQXqOd7XKCiomhjKiq2Iw/iIkJCT96h1iWxRPyhwgutpCKd4kQpMPP0/PNvLGM8DleH7/v+nlf/73A4qAvSCj8Y5RI0K1mzK/S3m6pWP1LP1tXLJ6ZfeseItIChQLpxZeum15krmlNN/N0UDh1QHTGHa9np8Cc6eD+YJmWXHBXXTN2ouNRBFYtWzsha/KMy9qP87L22+36w59DesBCzh1QosO65xkduqWGusBGVaRfcXGrFoq/+rb6lIcq+sliJ7HGRjnpKxWwlA0jFGfvA3hCLlV/kW3pLBWsfm99sfvy31V1h4WGLSVPZu73zoZ9VB1vNIWLlzIxvXFJoxAp3xl76S51Rj55QoaB2buxs4lTA++9SoiIlLxdWpiLF4L9vXd9S/VmQXbrgpvIMa7D77w/aVvx+vx5VuOiTplK7L/zE7fujZpPDyUvyKyamG7F2fNDReCDCJdI9FSq+JUNFmpfwZiy+tYdq7cUU5Xxd7veBCYVjVaQULz98iKUwa0H+uBlZhlDDwfDf76yO6VQ5X/Gna1NPzp50XPbwsek5BV5NI+0t0cN7w7s2d3yytq29Jaq09kYla3aEXrjv0OKbSgEQgm/+sPj+71frOqUEeowCbNXLqlx3e1wI98YyNah4hPorrD34EVePqDiSE0rFj7MUd1G4EsHn77fWVYcAQgl8fnL6shKW4vK7akLBmHNkTDyYu6Rw7gXD0rOEmWN+ui+/xDfx+KyFVxR/+Erz6kfrQx0xNypb3modf2zG9Hk5AIrK/ceelv3hq602Br3fqADMvPOzhQR7sLiDDsF2yD2YshKUAlj3z3rzO8XsRYVskv/eiw1VnwflFHQ9p8B79e9HLbisROLBB4+XHL8o/9LfVMTvwV55qI6Z+rkXFXis9BxrkVwsHXBMs7g8kuzBbPMeh4q9TpkIFalfl6XkCOx7e3N01+Z2VqmTzhlmHEXD+muP1XIpmrrMGea7/s9jRk3N5FMLtsb2bgvu/Kj90L4wG+PqMfryg7WIa1faW2Lrn2s0zucUeCfNzrLF3KjAuYRFU6bC2RIdVOP2CEhb1HXr3kxlDyaVS+gebCm+BxMLBAJg6zstesz84U2YkVVYZjqn3lvV2NEStTI354W+gGfZHaNYMwKw/d22t5+pr/rMvD+RABm5nmPm5c1fOmzdcw1Vn3fEn0VCx8YXm+csKTC8A7MW5mxb16Ze9ZJu7nZUnlW1pz2Y7eNKnYoKR3wqcjUA7Pig1bIrZNopeezae6sa+bSNFM+6vKRsjDn9jnVhxR9rtqxtBackCgRbYu8+37DpjZZQewzofm4f6tC3rWubuSAXwKijMjKyPcG2mEwFxFjy6xsqgOUUsnWqi41IuvVJ6MFc92dI8xW3Hky+tvfToBWLTp5lbnw+uCdUuzck9XsFw/3HLypgKTx5Z9WWN5vNr46RcagtapW++5Hx5rfMSbumofJrGWadrUca84ZE+KpUQlI9GMz/s5HNVGJU4u2a6TGV1oZIa6O5sTe/yJ9fYvZaOz5qgzQXAj1hUSGbPL73YuP2Da2AvfMogfkKv/dVoFL9aSebo1RMSrclEqTiosaeUGE+rl6iwpm7eFT4GMaJWJSya8PH2b73PVvbTTGOyrSTc8z6ROlrj9famSRKBW5UYlG9eocxtENZZbpdZ7hSkaufMhXbIvQOFaLa7eqkIlZEvEaLuTXK2qqwXSkKCmTleQtKzTa0c1N7sCUKcWRsSidLpa6qyzguKPWBrzP6iYrtdkQKVKRSpUglK88eRzXVhrmyU1Dkl9rLhDU7O6WRMZAqlaZa04mSkePxegnQDRWF7zE1KoLbEclScYR4VNyiMNHMHHOeFg3rsQgV1A3qT7NbdntzRMogdSpswgjAH9CotYnUlnC2FWcmKVCR3Y7obypxvJOIRFjRbXWHQvbaflrAI8bqBSoR7ikextpid1S49KVZZM+pQPyXn/hU7K+9QcVFChRA1DT18Kdrkqop0Hywi0mPGJ9hX06ainUbsSGWlm5LxyL2xmJATUWuTcpUNIhn3KkgDhVerPt7W7qjEjTn6vB4iT+DSKpua4nWHwgZ5yqPzfSZ/VgcKvLN3TIVCFTYWoCuo6M1JugX3VFxjIyToGLdzIAUqdghoTuO2HeVVNPhMDtdXJbuVPXW9a3GiUCWZ/aiQj4xFRX0iErBCHNw0d4ctUT6lQrb1D+IqByuspFYcxRB1RtWNljzGJyxrKRoZBpLQha1Cpo4ldLR5hC89oswJ+JORTSNVnZJU+HMeypU7EuWYYlPBUKgFEjL8DCpqh0dTHjc9GxezIjdeCi8/vkG45s3Tbv67rElo9lUxp2K4/EUTiqBTG/pGBPwgd0hUURNRaiJ+9w+cSrCVrJeoJLoHUd2lsZHINNjzhApDXfqNdYUetKsHH5TGlPeqof21+w0nb6ZuV6vj8/JhQq6p1I5K4vltnNTu0Ok76lY91vzP+BkqEilSo7K6GlZTGrrO6YbMTvfO+Fr2c5+Lxahj9z2RV1NGMCqBw/W7AhC9IOZopYeE6Qy8+u5xreuTr3q05Dr+kpfUtHkKylSEQeN6I4K3yVPm53Lrmx+q4mJnXJeEQQTZB4213bdd+PO15fXrn2ujhVcpoIeUCmq8I+bYS6IfbK2NRaLcWLuVITqW5eSp8Lbkl6ikoB3UiyQdTTpuBx/uilXVxPasamVnR8xIQAVlY7m6CsPH+RSpKCCmOJlVklBZd6Fw1iv9f4rzY5BQfdUKFKlAtmW9B8VMQEAgM+vzTi1gH1d++xhJrH4++WiheAOZXXLYoqXWSWByshJgenzctnFGfNzpPkKS81KXzJDXH6pUbHnWfKVnlDhQ5LrKwQATllSwqr52caW3VvMBaXRU7NO/mZRPCq8knpOxevHuTcO55f4TjinYNHVJQlSkSuDlKiw2yXjUKF9SIV90ymAEWMDk2bmsCq/8ECNbtXtnGtGjBiX4UoFKVE5+5rhw0fLdy2deG7BWVeV9oxKAp78+FQg/q+iGxX0LhWJidTwF11TzgpctSO4Znmt8cXr175317j8Er+tbmfPIVOR66GkcsriguPOzDeO25qiKx84xGKddF7+md/rYVtJlQrn4+oVKvzWI1fvpMxRoFI+LuPoOfmswKsePfDFdmN+gJxC33X3ji8o9Tv06kZFUQ/pddJ5hWdcUWoWWKdP313z7guNK+7dz2KdfH7BGVcWx6Ei1CQRT353VIR9Jr1DJTmfMfclM9d6sgugR/WHb93TfNj0/haWpd/w54ljpmSmTkXz4Kyrhp99jW1CVt5/cNeWIIDNb7Q8+8cDLNacxYULr+g/KpzbESlQkYqVBBWJK/fR1tT1l5/sbGkwF/tyCn3X/deE+ReVmjd8IhkqBWX+q+8Ze8pi5rLE6scOb1jZCEvdm1Y3/+NPNpVTlhQuuKyfqIhuRyRJRVEsdyp8HNf4oie/rqbzvh/taDho+gE9XrLoyrIf/3XS5OOy0UMqgSzPgmUlN/3vhNFTM5jgqgcPvfHkYUAYGX/4avM//3yQpTj3W4Wnf7dIRUUotvmRNBX5GSrWgxYosfeCGbVxu7+Y38otl46Lxe/Sk265ku4VFxLg7qWrq+n847WfX3bb2PHHmI7IsjGB7901fv+u4LsrGzavbQq1xZyb+finVpRPSP/aaQWzFhRId8gvv7t6y5stYLXlHnL4wctNGsG5Nww3rpx64TBKsfqxOvZAUHlVkX94ibEVke1MBIhOoXW/S094Hpe1EZz0lIoSiRsVKRDn83rsBHn9oqM1cv9Pd566uPjMy8u8flOtI8ZnXPCjjMU3lFf9O/jF1rbaqnDjwa7OTl3vomkBLTPXW1SRXjYuUHlMVm6RD6ow67T87e+0RLu42nJUNq5q0jzknOvNIcC8i4ZRitcft6nUVXfxqaVGhSERWoi9YTJxKqKOhS3+CVKRb3g0tgnLv3roMX3Nitqt7zSfdUX59Ll5zDJrHjJ6SuboKcKW7TihtSGSmev1eAmAyplZl9466rHb90UjaiobVjYSDWdfa1KZv3SYrtM1T9YbVD5+syW70LvgMvt206SpGJ0tN1W0rYl0LiG7wuu3B+sriiNWL6WFAEDr9ocf/c3ue6/+dMNL9V2d7uM4Vdi/p/Ppe6rvvPjzJ+7Yx9bBJs7KvvTXo7w+sSCcXXn3hcaX/lbLrnzj4qJ5S4cxu/L2sw2vPWK5Pp2lT9iuCNu0pV99j9uKGIwbh+O0FU6W6vJfu3GRuF8b31aMvdoHdnWu+MO+f91fPfmEvIkzcypnZBeU+pUJhYOx6h2duz5u/2Rdc121OUbY9k7rk/9Z9Z1bRhptZeKs7EtuHfXY7ftiUXVbWf+PBqLhzCvNu5BOu6QIOt58ymwra59pIGyoJO3JT6itgN1YxJv3FKjYCrZHxnGoSKrXXQYI3VIhQKhT/3hN48drmgiBP6AVlaXnlfh8fo/PT7pCentzrKUh0njAWjkmVncIAHTr+pYn76y6+JaRxuagScdlX/KrUY//1pXKumcbNI0svNx8zOdp3y2ilL71dINB5a2nG+B2p0T3VACYt7B4WRVTocIHNgZLiAo1Cil6VYSuSE0lLeCJRHQ9auuXUtLVqe/fHdy/26l+9RgMoFvXtTx5Z9V3fmlSmTw7++L/GPnEHVVuVNauqNc0nL7MpHL6smKqY+0zDfwYLCUq1qqirdjk7IoUeroW6TT4opjCroyZmvnbZ6af94OKypk5Rs/jSNvNGnGnAYB+8nbL3++qYrdsTTkh5+JbRnpsB4JRJduuvPlU/erH6tiVBZcXz1lSmKgnvxu7AnCriug3Kk5TDkgtw5m4Qr+Zud655xVfd0/lyMnmQCstQysdE8jK81ozj0SpbFnbsvx31czrPOXEnKW/jEdlzd/rXn/CprLw8uIpJ2T3FhV+VRG9Q8U60SOfsTLEo+JsEMCIcYGbH5z8m+eO/t2Lx0w6LkcRS4xRMjqdtbDNbzU/xVGZdlLORT+vsH02Zlltdb/xRN2a5fVmBQmW3DQ8kOVJnQocq4roDSpqP5jbA+HgHBkTt8TjrRu5pu5OZfTUzEtvHc2ofPxm8/K7bSpHzcld+ot4VFY/enj9P8yb9tIzPbPPzgeQOhXnqiKSoCIoQIwl3aatlOYLZ8XiRBKhQiFRofIYQUVF16eckHPJr20qm9c0P/X7aiZx1Jzci35WrkmPTuOovPzQoUbL8zZ5dpZVlpSo8KuKqmjGW9JUKBCXCoXjodbWlW6oqHPlDURi1h6YOjvnkl+Nsh7ART9+Q6By9Ny8C3/uSkWP4d8fmA9iLa5I45yliVKhEKmIq4ou0RQSCVOx3nRRWbykrqJijAjjUFFm6ixFolROzM0exjxgdNPrTSvutalMn5v37ZtdqUS6zGJ6fIRPONG1SMcuXmlVUR2tB3ZFOIYrFTEoL3VDxdHaXNKOS0WcJPGpf/iaQOWYeXnf+qm1S8Yutw5gzDRzvNdaH+F1h2SpcLYkJSqCDohKDPEHV6rbNbprK2IK0hjBNZKbNZI7vg9fa3rmDzaVGfPzLry5Qmorx87LqZhk/hPOzs0dEHUHBxVhdcTMkFMNoeD+eIkqZu4QJ5/uc3sWTruo/NTzy9y0ZgT3R/CZYckNo86/bmR8GQ//FzQqTJffPo7dOmTm60zE777cAQD0g1ebCCFLflxuFHnG1/Nyi3yrH689+EU4K9c7a2H+HGtdklJsXNnIisO7Giicf7jmEIH5qBzIbsekqVjB59d8fhcvoioof/U+vwa1/1AddErNonCp+QNaT9KwCyRRef+VRkKw+EcmlbFHZ159z1hnvPdeaty/J8wUkTQViDcz2G897cGqd3UkV/39u4PGGKylsau9JZJEIpEuva4mZBSwribMjG2PQntLtNVa23f2YBtfblxxb3Usqvj1GOHTDa0vPnAQ4P15Vpek6sEsSUcnBwAglNJls1YLT/K13qz0ededQgxAUVn6/AvKA1mOf7WFcII/isXotnebtq63GjshFZWZJ55V7E/3qBJh8YVeLxKhm9Y0WJsiCQHGTc+eMb/A59fUKYilMBMJ6xtXNRzY0ykkLviMAZCRUzLOva6sYmIGHzfYFluzvG79c+Y03nxEmMiOLzUBhQbxKae8eult/6q0kAjRkAQV4iIGQIP9qBxJll+3l8Y/fCwmRgkA4am6omUSchaLz8vJ/72n8Bk7qwFCCMrGB8YelZlT6AsF9UO7Ond90hEJU0EsJSr0tn9V8v+rKD7/182uQCEmvDk6ZP4BRpKs9GA8ngofy3b4U1BirHpZkMQHVfM5i8XnllvkVQK3VRnZrlBKDuzqPLCr09az9aLslNvfqyZmV8BtGrIKYX91sSsuYsKbo0Pm54NxRsaOdRPFHyESR6w4Hhex+OK8xnVkHM9nzF+TXvaH2x9EU2ol6GpXiPVULUV5BCpijkNU2LH0sj+S+dtu87rGy7tScYwKhqiwY+llfyT1Z+pmK+F1a8dR61fM+KtNRUiRe9kfSVHhVxXFVDFERRRzUJFT5F72Rw+pEP4JjZQ6Uh2iAlGsX6gIS1iKVIeoWMfqavQBFU0WcKY6RIWJKqvRy1Qgm3d1qkNUmKiyGr1IRWne1al+xahQdaw+pwLelhwRVPht931KBQNHRZwqDnoq0HHEUuEk41AR/6j7q0tFUW6rpr1HRTx0pyK6Hb+qVDBoqMDaNCSqe4jKgFJhm4a+QlS4peDBSEX4IxIuxS8zFTK4qWhwlGcwUVHH+nJTkcz7YKOC1Knwm6H7kYpbrG6oQJgqqot3xFPBwFCJE6sbKppCRi7eEBXu2KppX1EB/h+9O2KWEOaaQAAAAABJRU5ErkJggg==;fontSize=14;" parent="1" vertex="1">
180 |           <mxGeometry x="370" y="418" width="80" height="80" as="geometry" />
181 |         </mxCell>
182 |         <mxCell id="BONjwN0Ef_ad3_QLhuGD-1" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;Amazon SageMaker Endpoint&lt;br&gt;(LLM for Embeddings)&lt;/font&gt;" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#4AB29A;gradientDirection=north;fillColor=#116D5B;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.sagemaker;" parent="1" vertex="1">
183 |           <mxGeometry x="560" y="400" width="78" height="78" as="geometry" />
184 |         </mxCell>
185 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-3" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=1;entryY=0.5;entryDx=0;entryDy=0;entryPerimeter=0;strokeColor=#6666FF;strokeWidth=1.5;" parent="1" source="BONjwN0Ef_ad3_QLhuGD-2" target="cLbmFV5fhnx_uU40nDEj-1" edge="1">
186 |           <mxGeometry relative="1" as="geometry" />
187 |         </mxCell>
188 |         <mxCell id="BONjwN0Ef_ad3_QLhuGD-2" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;Amazon S3 &lt;br&gt;(Enterprise knowledge corpus)&lt;/font&gt;" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#60A337;gradientDirection=north;fillColor=#277116;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.s3;" parent="1" vertex="1">
189 |           <mxGeometry x="772" y="590" width="78" height="78" as="geometry" />
190 |         </mxCell>
191 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-4" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=1;entryDx=0;entryDy=0;entryPerimeter=0;startArrow=classic;startFill=1;strokeColor=#6666FF;strokeWidth=1.5;" parent="1" source="cLbmFV5fhnx_uU40nDEj-1" target="BONjwN0Ef_ad3_QLhuGD-1" edge="1">
192 |           <mxGeometry relative="1" as="geometry" />
193 |         </mxCell>
194 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-5" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=1;entryDx=0;entryDy=0;strokeColor=#6666FF;strokeWidth=1.5;" parent="1" source="cLbmFV5fhnx_uU40nDEj-1" target="0dFsV1McNjQj3HDfaPEC-1" edge="1">
195 |           <mxGeometry relative="1" as="geometry" />
196 |         </mxCell>
197 |         <mxCell id="cLbmFV5fhnx_uU40nDEj-1" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;Amazon SageMaker Processing Job&lt;br&gt;&lt;/font&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;(Documents to embeddings)&lt;/font&gt;" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#4AB29A;gradientDirection=north;fillColor=#116D5B;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.sagemaker;" parent="1" vertex="1">
198 |           <mxGeometry x="560" y="590" width="78" height="78" as="geometry" />
199 |         </mxCell>
200 |         <mxCell id="cLbmFV5fhnx_uU40nDEj-2" value="" style="endArrow=classic;html=1;rounded=0;strokeWidth=1.5;" parent="1" edge="1">
201 |           <mxGeometry width="50" height="50" relative="1" as="geometry">
202 |             <mxPoint x="-40" y="286" as="sourcePoint" />
203 |             <mxPoint x="79" y="286" as="targetPoint" />
204 |           </mxGeometry>
205 |         </mxCell>
206 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-13" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;Web application&lt;br&gt;(Streamlit)&lt;br&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
207 |           <mxGeometry x="62" y="341" width="120" height="30" as="geometry" />
208 |         </mxCell>
209 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-14" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;Amazon API Gateway&lt;br&gt;(Proxy for GenAI REST APIs)&lt;br&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
210 |           <mxGeometry x="189" y="357" width="120" height="30" as="geometry" />
211 |         </mxCell>
212 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-15" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;AWS Lambda (RAG)&lt;br&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
213 |           <mxGeometry x="344" y="343" width="120" height="30" as="geometry" />
214 |         </mxCell>
215 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-19" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;&lt;b&gt;User question&lt;/b&gt;&lt;br&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
216 |           <mxGeometry x="-40" y="256" width="120" height="30" as="geometry" />
217 |         </mxCell>
218 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-20" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;&lt;b&gt;LLM Generated response&lt;/b&gt;&lt;br&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
219 |           <mxGeometry x="-50" y="326" width="120" height="30" as="geometry" />
220 |         </mxCell>
221 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-22" value="&lt;b&gt;2&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;" parent="1" vertex="1">
222 |           <mxGeometry x="171" y="256" width="20" height="20" as="geometry" />
223 |         </mxCell>
224 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-24" value="&lt;b&gt;3&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;" parent="1" vertex="1">
225 |           <mxGeometry x="310" y="256" width="20" height="20" as="geometry" />
226 |         </mxCell>
227 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-25" value="&lt;b&gt;5&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;" parent="1" vertex="1">
228 |           <mxGeometry x="417" y="380" width="20" height="20" as="geometry" />
229 |         </mxCell>
230 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-26" value="&lt;b&gt;4&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;" parent="1" vertex="1">
231 |           <mxGeometry x="488" y="387" width="20" height="20" as="geometry" />
232 |         </mxCell>
233 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-27" value="&lt;b&gt;6&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;" parent="1" vertex="1">
234 |           <mxGeometry x="497" y="250" width="20" height="20" as="geometry" />
235 |         </mxCell>
236 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-29" value="&lt;b&gt;1&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;" parent="1" vertex="1">
237 |           <mxGeometry y="240" width="20" height="20" as="geometry" />
238 |         </mxCell>
239 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-30" value="&lt;b&gt;7&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;" parent="1" vertex="1">
240 |           <mxGeometry x="320" y="331" width="20" height="20" as="geometry" />
241 |         </mxCell>
242 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-32" value="&lt;b&gt;8&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;" parent="1" vertex="1">
243 |           <mxGeometry x="179" y="326" width="20" height="20" as="geometry" />
244 |         </mxCell>
245 |         <mxCell id="qRE-cFeZ8KGbCTFtKMaO-1" value="&lt;span style=&quot;font-size: 24px;&quot;&gt;&lt;b&gt;Architecture for a RAG-based LLM powered question answer bot&lt;/b&gt;&lt;/span&gt;&lt;h1 style=&quot;border-color: var(--border-color);&quot;&gt;&lt;span style=&quot;background-color: initial; font-size: 14px; font-weight: 400;&quot;&gt;A pre-trained Generative AI model provides &quot;generative&quot; and &quot;factual&quot; responses from an enterprise&#39;s knowledge corpus to User questions.&amp;nbsp;&lt;/span&gt;&lt;br&gt;&lt;/h1&gt;&lt;h1&gt;&lt;p style=&quot;border-color: var(--border-color); font-weight: 400; font-size: 14px;&quot;&gt;&lt;font style=&quot;border-color: var(--border-color);&quot;&gt;The enterprise knowledge corpus is converted to embeddings. Most relevant embeddings are included as &quot;context&quot; in the &quot;prompt&quot; provided to the generative AI model.&lt;/font&gt;&lt;/p&gt;&lt;/h1&gt;" style="text;html=1;strokeColor=none;fillColor=none;spacing=5;spacingTop=-20;whiteSpace=wrap;overflow=hidden;rounded=0;" parent="1" vertex="1">
246 |           <mxGeometry x="70" y="20" width="650" height="190" as="geometry" />
247 |         </mxCell>
248 |         <mxCell id="qRE-cFeZ8KGbCTFtKMaO-2" value="&lt;b&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;AWS Account&lt;/font&gt;&lt;/b&gt;" style="points=[[0,0],[0.25,0],[0.5,0],[0.75,0],[1,0],[1,0.25],[1,0.5],[1,0.75],[1,1],[0.75,1],[0.5,1],[0.25,1],[0,1],[0,0.75],[0,0.5],[0,0.25]];outlineConnect=0;gradientColor=none;html=1;whiteSpace=wrap;fontSize=12;fontStyle=0;container=1;pointerEvents=0;collapsible=0;recursiveResize=0;shape=mxgraph.aws4.group;grIcon=mxgraph.aws4.group_account;strokeColor=#CD2264;fillColor=none;verticalAlign=top;align=left;spacingLeft=30;fontColor=#CD2264;dashed=0;" parent="1" vertex="1">
249 |           <mxGeometry x="70" y="220" width="840" height="510" as="geometry" />
250 |         </mxCell>
251 |         <mxCell id="qRE-cFeZ8KGbCTFtKMaO-7" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;Cloud Formation&lt;/font&gt;" style="outlineConnect=0;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;shape=mxgraph.aws3.cloudformation;fillColor=#759C3E;gradientColor=none;" parent="qRE-cFeZ8KGbCTFtKMaO-2" vertex="1">
252 |           <mxGeometry x="760" y="20" width="60" height="66" as="geometry" />
253 |         </mxCell>
254 |         <mxCell id="qRE-cFeZ8KGbCTFtKMaO-6" value="&lt;font style=&quot;font-size: 14px;&quot;&gt;IAM&lt;/font&gt;" style="sketch=0;outlineConnect=0;fontColor=#232F3E;gradientColor=none;fillColor=#BF0816;strokeColor=none;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;pointerEvents=1;shape=mxgraph.aws4.role;" parent="qRE-cFeZ8KGbCTFtKMaO-2" vertex="1">
255 |           <mxGeometry x="660" y="31" width="78" height="44" as="geometry" />
256 |         </mxCell>
257 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-11" value="&lt;b&gt;&lt;font style=&quot;font-size: 14px;&quot;&gt;Real-time flow on user query&lt;/font&gt;&lt;/b&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="qRE-cFeZ8KGbCTFtKMaO-2" vertex="1">
258 |           <mxGeometry x="90" y="420" width="199" height="30" as="geometry" />
259 |         </mxCell>
260 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-12" value="&lt;b&gt;&lt;font color=&quot;#6666ff&quot; style=&quot;font-size: 14px;&quot;&gt;Offline data ingestion&lt;/font&gt;&lt;/b&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="qRE-cFeZ8KGbCTFtKMaO-2" vertex="1">
261 |           <mxGeometry x="86" y="460" width="160" height="30" as="geometry" />
262 |         </mxCell>
263 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-9" value="" style="endArrow=classic;html=1;rounded=0;strokeWidth=2;" parent="qRE-cFeZ8KGbCTFtKMaO-2" edge="1">
264 |           <mxGeometry width="50" height="50" relative="1" as="geometry">
265 |             <mxPoint x="17" y="434.5" as="sourcePoint" />
266 |             <mxPoint x="86" y="434.5" as="targetPoint" />
267 |           </mxGeometry>
268 |         </mxCell>
269 |         <mxCell id="q3Vhbwtdkttvv8X8PN4a-10" value="" style="endArrow=classic;html=1;rounded=0;strokeColor=#6666FF;strokeWidth=2;" parent="qRE-cFeZ8KGbCTFtKMaO-2" edge="1">
270 |           <mxGeometry width="50" height="50" relative="1" as="geometry">
271 |             <mxPoint x="17" y="474.5" as="sourcePoint" />
272 |             <mxPoint x="86" y="474.5" as="targetPoint" />
273 |           </mxGeometry>
274 |         </mxCell>
275 |         <mxCell id="qRE-cFeZ8KGbCTFtKMaO-10" value="&lt;b&gt;9&lt;/b&gt;" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;" parent="1" vertex="1">
276 |           <mxGeometry y="362" width="20" height="20" as="geometry" />
277 |         </mxCell>
278 |       </root>
279 |     </mxGraphModel>
280 |   </diagram>
281 | </mxfile>
282 | 


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-architecture.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-cfn-delete.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-cfn-delete.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-cfn-outputs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-cfn-outputs.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-cloudformation-launch-stack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-cloudformation-launch-stack.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-ntuteja.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-ntuteja.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-service-quota.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-service-quota.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-sm-nb-jl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-sm-nb-jl.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-sm-nb-path.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-sm-nb-path.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-sm-nb-runall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-sm-nb-runall.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-streamlit-app.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-streamlit-app.png


--------------------------------------------------------------------------------
/blogs/rag/img/ML-14328-xinhuang.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/llm-apps-workshop/fe82a61c338750c0ece33438ade7e66d4f56b8ce/blogs/rag/img/ML-14328-xinhuang.jpg


--------------------------------------------------------------------------------
/blogs/rag/template.yml:
--------------------------------------------------------------------------------
  1 | AWSTemplateFormatVersion: 2010-09-09
  2 | Description: Template to provision OpenSearch cluster and SageMaker Notebook for semantic search
  3 | 
  4 | Metadata:
  5 |   AWS::CloudFormation::Interface:
  6 |     ParameterGroups:
  7 |       - Label:
  8 |           default: Required Parameters
  9 |         Parameters:
 10 |           - AppName          
 11 |           - OpenSearchIndexName
 12 |           - LambdaFunctionName
 13 |           - APIGatewayName
 14 |           - SageMakerNotebookName
 15 |     ParameterLabels:      
 16 |       AppName:
 17 |         default: Name of the overall application
 18 |       OpenSearchIndexName:
 19 |         default: OpenSearch index name
 20 |       LambdaFunctionName:
 21 |         default: Lambda function name
 22 |       APIGatewayName:
 23 |         default: API gateway name
 24 |       SageMakerNotebookName:
 25 |         default: Name of SageMaker Notebook Instance
 26 |       
 27 | Parameters:
 28 |   VpcId:
 29 |     Type: AWS::EC2::VPC::Id
 30 |     Description: ID of the existing VPC where OpenSearch will be deployed.
 31 |   SubnetIds:
 32 |     Type: List<AWS::EC2::Subnet::Id>
 33 |     Description: List of Subnet IDs for the OpenSearch domain. Provide one subnet ID per Availability Zone.
 34 |   SecurityGroupIds:
 35 |     Type: List<AWS::EC2::SecurityGroup::Id>
 36 |     Description: List of Security Group IDs to associate with the OpenSearch domain.
 37 |   OpenSearchIndexName:
 38 |     Default: llm_apps_workshop_embeddings
 39 |     Type: String    
 40 |     Description: Name of the OpenSearch index for storing embeddings.
 41 |   AppName:
 42 |     Default: llm-apps-blog
 43 |     Type: String
 44 |     AllowedValues: [llm-apps-blog]
 45 |     Description: Name of the overall application, this is used while creating the ML model endpoint.
 46 |   LambdaFunctionName:
 47 |     Default: LLMApp
 48 |     Type: String
 49 |     AllowedPattern: '[a-zA-Z0-9]+[a-zA-Z0-9-]+[a-zA-Z0-9]+'
 50 |     Description: Name of the Lambda function for LLM Apps
 51 |   APIGatewayName:
 52 |     Default: LLMAppAPIGW
 53 |     Type: String
 54 |     AllowedPattern: '[a-zA-Z0-9]+[a-zA-Z0-9-]+[a-zA-Z0-9]+'
 55 |     Description: Name of the Lambda function for LLM Apps
 56 |   SageMakerNotebookName:
 57 |     Default: aws-llm-apps-blog
 58 |     Type: String
 59 |     Description: Enter name of SageMaker Notebook instance. The notebook name must _not_ already exist in your AWS account/region.
 60 |     MinLength: 1
 61 |     MaxLength: 63
 62 |     AllowedPattern: ^[a-z0-9](-*[a-z0-9])*
 63 |     ConstraintDescription: Must be lowercase or numbers with a length of 1-63 characters.
 64 |   SageMakerIAMRole:
 65 |     Description: Name of IAM role that will be created by this cloud formation template. The role name must _not_ already exist in your AWS account.
 66 |     Type: String
 67 |     Default: "LLMAppsBlogIAMRole"   
 68 | 
 69 |   ApiStageName:
 70 |     Description: The APi Gateway API stage name
 71 |     Type: String
 72 |     Default: prod
 73 | 
 74 | Mappings: 
 75 |   RegionMap: 
 76 |     us-east-1:
 77 |       BUCKET: aws-blogs-artifacts-public
 78 |       KEY: "artifacts/ML-14328/function.zip"
 79 |     us-west-2:
 80 |       BUCKET: aws-bigdata-blog-replica-us-west-2
 81 |       KEY: "artifacts/ML-14328/function.zip"
 82 |     eu-west-1:
 83 |       BUCKET: aws-bigdata-blog-replica-eu-west-1
 84 |       KEY: "artifacts/ML-14328/function.zip"
 85 |     eu-west-2:
 86 |       BUCKET: aws-bigdata-blog-replica-eu-west-2
 87 |       KEY: "artifacts/ML-14328/function.zip"
 88 |     ap-northeast-1:
 89 |       BUCKET: aws-bigdata-blog-replica-ap-northeast-1
 90 |       KEY: "artifacts/ML-14328/function.zip"
 91 | 
 92 | Resources:
 93 |   OpenSearchSecret:
 94 |     Type: AWS::SecretsManager::Secret
 95 |     Properties:
 96 |       Name: !Sub OpenSearchSecret-${AWS::StackName}
 97 |       Description: OpenSearch username and password
 98 |       GenerateSecretString:
 99 |         PasswordLength: 32
100 |         ExcludeCharacters: '"@/\'
101 |         RequireEachIncludedType: true
102 |         ExcludePunctuation: false
103 |         ExcludeLowercase: false
104 |         ExcludeNumbers: false
105 |         ExcludeUppercase: false
106 |         SecretStringTemplate: '{"username":"opensearchuser"}'
107 |         GenerateStringKey: password
108 | 
109 |   CodeRepository:
110 |     Type: AWS::SageMaker::CodeRepository
111 |     Properties:
112 |       GitConfig:
113 |           RepositoryUrl: https://github.com/aws-samples/llm-apps-workshop
114 | 
115 |   NotebookInstance:
116 |     Type: AWS::SageMaker::NotebookInstance
117 |     Properties:
118 |       SubnetId: !Select 
119 |         - 0
120 |         - !Ref SubnetIds
121 |       SecurityGroupIds: !Ref SecurityGroupIds
122 |       NotebookInstanceName: !Ref SageMakerNotebookName
123 |       InstanceType: ml.t3.2xlarge
124 |       RoleArn: !GetAtt Role.Arn
125 |       DefaultCodeRepository: !GetAtt CodeRepository.CodeRepositoryName
126 | 
127 |   Role:
128 |     Type: AWS::IAM::Role
129 |     Properties:
130 |       RoleName: !Ref SageMakerIAMRole
131 |       Policies:
132 |         - PolicyName: CustomNotebookAccess
133 |           PolicyDocument:
134 |             Version: 2012-10-17
135 |             Statement:
136 |               - Sid: ReadFromOpenSearch
137 |                 Effect: Allow
138 |                 Action:
139 |                   - "es:ESHttp*"
140 |                 Resource:
141 |                   - !Sub arn:aws:es:${AWS::Region}:${AWS::AccountId}:domain/*
142 |               - Sid: ReadSecretFromSecretsManager
143 |                 Effect: Allow
144 |                 Action:
145 |                   - "secretsmanager:GetSecretValue"
146 |                 Resource: !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:*"
147 |               - Sid: ReadWriteFromECR
148 |                 Effect: Allow
149 |                 Action:
150 |                   - "ecr:BatchGetImage"
151 |                   - "ecr:BatchCheckLayerAvailability"
152 |                   - "ecr:CompleteLayerUpload"
153 |                   - "ecr:DescribeImages"
154 |                   - "ecr:DescribeRepositories"
155 |                   - "ecr:GetDownloadUrlForLayer"
156 |                   - "ecr:InitiateLayerUpload"
157 |                   - "ecr:ListImages"
158 |                   - "ecr:PutImage"
159 |                   - "ecr:UploadLayerPart"
160 |                   - "ecr:CreateRepository"
161 |                   - "ecr:GetAuthorizationToken"
162 |                   - "ec2:DescribeAvailabilityZones"
163 |                 Resource: "*"
164 |       ManagedPolicyArns:
165 |         - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
166 |         - arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
167 |         - arn:aws:iam::aws:policy/TranslateReadOnly
168 |       AssumeRolePolicyDocument:
169 |         Version: 2012-10-17
170 |         Statement:
171 |           - Effect: Allow
172 |             Principal:
173 |               Service:
174 |               - sagemaker.amazonaws.com
175 |             Action:
176 |               - 'sts:AssumeRole'
177 | 
178 | 
179 |   OpenSearchServiceDomain:
180 |     Type: AWS::OpenSearchService::Domain
181 |     Properties:
182 |       VPCOptions:
183 |         SubnetIds: !Ref SubnetIds
184 |         SecurityGroupIds: !Ref SecurityGroupIds
185 |       AccessPolicies:
186 |         Version: 2012-10-17
187 |         Statement:
188 |           - Effect: Allow
189 |             Principal:
190 |               AWS: '*'
191 |             Action: 'es:*'
192 |             Resource: !Sub arn:aws:es:${AWS::Region}:${AWS::AccountId}:domain/*/*
193 |       EngineVersion: 'OpenSearch_2.5'
194 |       ClusterConfig:
195 |         InstanceType: "r6g.xlarge.search"
196 |         InstanceCount: 2 # To specify an even number of data nodes
197 |       EBSOptions:
198 |         EBSEnabled: True
199 |         VolumeSize: 20
200 |         VolumeType: 'gp3'
201 |       AdvancedSecurityOptions:
202 |         AnonymousAuthEnabled: False
203 |         Enabled: True
204 |         InternalUserDatabaseEnabled: True
205 |         MasterUserOptions:
206 |           MasterUserName: !Sub "{{resolve:secretsmanager:OpenSearchSecret-${AWS::StackName}:SecretString:username}}"
207 |           MasterUserPassword: !Sub "{{resolve:secretsmanager:OpenSearchSecret-${AWS::StackName}:SecretString:password}}"
208 |       NodeToNodeEncryptionOptions:
209 |         Enabled: True
210 |       EncryptionAtRestOptions:
211 |         Enabled: True
212 |         KmsKeyId: alias/aws/es
213 |       DomainEndpointOptions:
214 |         EnforceHTTPS: True
215 | 
216 |   LLMEndpoint:
217 |     Type: "AWS::SageMaker::Endpoint"
218 |     Properties:
219 |       EndpointName: !Sub
220 |       - '${AppName}-flan-t5-xxl-endpoint-${RandomGUID}'
221 |       - { RandomGUID: !Select [0, !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId ]]]] }
222 |       EndpointConfigName: !GetAtt LLMEndpointConfig.EndpointConfigName
223 | 
224 |   LLMEndpointConfig:
225 |     Type: "AWS::SageMaker::EndpointConfig"
226 |     Properties:
227 |       EndpointConfigName: !Sub
228 |       - '${AppName}-flan-t5-xxl-endpoint-${RandomGUID}'
229 |       - { RandomGUID: !Select [0, !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId ]]]] }
230 |       ProductionVariants:
231 |         - InitialInstanceCount: 1
232 |           InitialVariantWeight: 1.0
233 |           InstanceType: "ml.g5.12xlarge"
234 |           ModelName: !GetAtt LLMModel.ModelName
235 |           VariantName: !GetAtt LLMModel.ModelName
236 |     Metadata:
237 |       cfn_nag:
238 |         rules_to_suppress:
239 |           - id: W1200
240 |             reason: Solution does not have KMS encryption enabled by default
241 | 
242 |   LLMModel:
243 |     Type: "AWS::SageMaker::Model"
244 |     Properties:
245 |       ModelName: !Sub
246 |       - '${AppName}-flan-t5-xxl-model-${RandomGUID}'
247 |       - { RandomGUID: !Select [0, !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId ]]]] }
248 |       PrimaryContainer:
249 |         ModelDataUrl: !Sub "s3://jumpstart-cache-prod-${AWS::Region}/huggingface-infer/prepack/v1.0.1/infer-prepack-huggingface-text2text-flan-t5-xxl.tar.gz"
250 |         Image: !Sub "763104351884.dkr.ecr.${AWS::Region}.amazonaws.com/pytorch-inference:1.12.0-gpu-py38"
251 |         Environment: {"TS_DEFAULT_WORKERS_PER_MODEL": "1"}
252 |         Mode: "SingleModel"
253 |       ExecutionRoleArn: !GetAtt Role.Arn
254 | 
255 |   EmbeddingEndpoint:
256 |     Type: "AWS::SageMaker::Endpoint"
257 |     Properties:
258 |       EndpointName: !Sub
259 |       - '${AppName}-gpt-j-6b-endpoint-${RandomGUID}'
260 |       - { RandomGUID: !Select [0, !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId ]]]] }
261 |       EndpointConfigName: !GetAtt EmbeddingEndpointConfig.EndpointConfigName
262 | 
263 |   EmbeddingEndpointConfig:
264 |     Type: "AWS::SageMaker::EndpointConfig"
265 |     Properties:
266 |       EndpointConfigName: !Sub
267 |       - '${AppName}-gpt-j-6b-endppoint-${RandomGUID}'
268 |       - { RandomGUID: !Select [0, !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId ]]]] }
269 |       ProductionVariants:
270 |         - InitialInstanceCount: 1
271 |           InitialVariantWeight: 1.0
272 |           InstanceType: "ml.g5.24xlarge"
273 |           ModelName: !GetAtt EmbeddingModel.ModelName
274 |           VariantName: !GetAtt EmbeddingModel.ModelName
275 |     Metadata:
276 |       cfn_nag:
277 |         rules_to_suppress:
278 |           - id: W1200
279 |             reason: Solution does not have KMS encryption enabled by default
280 | 
281 |   EmbeddingModel:
282 |     Type: "AWS::SageMaker::Model"
283 |     Properties:
284 |       ModelName: !Sub
285 |       - '${AppName}-gpt-j-6b-model-${RandomGUID}'
286 |       - { RandomGUID: !Select [0, !Split ["-", !Select [2, !Split ["/", !Ref AWS::StackId ]]]] }
287 |       PrimaryContainer:
288 |         ModelDataUrl: !Sub "s3://jumpstart-cache-prod-${AWS::Region}/huggingface-infer/prepack/v1.0.0/infer-prepack-huggingface-textembedding-gpt-j-6b.tar.gz"
289 |         Image: !Sub "763104351884.dkr.ecr.${AWS::Region}.amazonaws.com/pytorch-inference:1.12.0-gpu-py38"
290 |         Environment: {"TS_DEFAULT_WORKERS_PER_MODEL": "2"}
291 |         Mode: "SingleModel"
292 |       ExecutionRoleArn: !GetAtt Role.Arn
293 | 
294 |   APIGateway:
295 |       Type: "AWS::ApiGateway::RestApi"
296 |       Properties:
297 |         Name: !Sub ${APIGatewayName}
298 |         Description: "LLMApp API Gateway"
299 | 
300 |   ProxyResource:
301 |     Type: "AWS::ApiGateway::Resource"
302 |     Properties:
303 |       ParentId: !GetAtt APIGateway.RootResourceId
304 |       RestApiId: !Ref APIGateway
305 |       PathPart: '{proxy+}'
306 | 
307 |   APIGatewayRootMethod:
308 |     Type: "AWS::ApiGateway::Method"
309 |     Properties:
310 |       AuthorizationType: NONE
311 |       HttpMethod: ANY
312 |       Integration:
313 |         IntegrationHttpMethod: POST
314 |         Type: AWS_PROXY
315 |         IntegrationResponses:
316 |           -
317 |             StatusCode: 200
318 |         Uri: !Sub
319 |           - "arn:aws:apigateway:${AWS::Region}:lambda:path/2015-03-31/functions/${lambdaArn}/invocations"
320 |           - lambdaArn: !GetAtt "LLMAppFunction.Arn"
321 |       ResourceId: !Ref ProxyResource
322 |       RestApiId: !Ref "APIGateway"
323 | 
324 | 
325 |   APIGatewayDeployment:
326 |     Type: "AWS::ApiGateway::Deployment"
327 |     DependsOn:
328 |       - "APIGatewayRootMethod"
329 |     Properties:
330 |       RestApiId: !Ref "APIGateway"
331 |       StageName: !Ref ApiStageName
332 | 
333 |   LLMAppFunction:
334 |     Type: AWS::Lambda::Function
335 |     DependsOn: OpenSearchServiceDomain
336 |     Properties:
337 |       FunctionName: !Ref LambdaFunctionName
338 |       Description: Lambda function for providing an LLM app (text generation, RAG) endpoint.
339 |       Runtime: python3.9
340 |       Code:
341 |         S3Bucket: !FindInMap [RegionMap, !Ref "AWS::Region", BUCKET]
342 |         S3Key: !FindInMap [RegionMap, !Ref "AWS::Region", KEY]
343 |       Handler: app.main.handler
344 |       MemorySize: 512
345 |       Timeout: 60
346 |       Role: !GetAtt LambdaIAMRole.Arn
347 |       Tags: 
348 |         -
349 |           Key: Project
350 |           Value: !Ref AppName
351 |         -
352 |           Key: Version
353 |           Value: v3
354 |       Environment:
355 |         Variables:
356 |           TEXT2TEXT_ENDPOINT_NAME: !GetAtt  LLMEndpoint.EndpointName
357 |           EMBEDDING_ENDPOINT_NAME: !GetAtt  EmbeddingEndpoint.EndpointName
358 |           OPENSEARCH_SECRET: !Ref OpenSearchSecret
359 |           OPENSEARCH_DOMAIN_ENDPOINT: !GetAtt OpenSearchServiceDomain.DomainEndpoint
360 |           OPENSEARCH_INDEX: !Ref OpenSearchIndexName
361 |           APP_NAME: !Ref AppName
362 | 
363 |   LambdaApiGatewayInvoke:
364 |     Type: "AWS::Lambda::Permission"
365 |     Properties:
366 |       Action: "lambda:InvokeFunction"
367 |       FunctionName: !GetAtt "LLMAppFunction.Arn"
368 |       Principal: "apigateway.amazonaws.com"
369 |       SourceArn: !Sub "arn:aws:execute-api:${AWS::Region}:${AWS::AccountId}:${APIGateway}/*/*/*"
370 | 
371 |   LambdaIAMRole:
372 |     Type: "AWS::IAM::Role"
373 |     Properties:
374 |       AssumeRolePolicyDocument:
375 |         Version: "2012-10-17"
376 |         Statement:
377 |           - Action:
378 |               - "sts:AssumeRole"
379 |             Effect: "Allow"
380 |             Principal:
381 |               Service:
382 |                 - "lambda.amazonaws.com"
383 |       ManagedPolicyArns:
384 |         - arn:aws:iam::aws:policy/AmazonS3FullAccess
385 |       Policies:
386 |         - PolicyDocument:
387 |             Version: "2012-10-17"
388 |             Statement:
389 |               - Action:
390 |                   - "logs:CreateLogGroup"
391 |                   - "logs:CreateLogStream"
392 |                   - "logs:PutLogEvents"
393 |                 Effect: "Allow"
394 |                 Resource:
395 |                   - !Sub "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/${LambdaFunctionName}:*"
396 |           PolicyName: "lambda"
397 |         - PolicyDocument:
398 |             Version: "2012-10-17"
399 |             Statement:
400 |               - Sid: Logging
401 |                 Action:
402 |                   - "logs:CreateLogGroup"
403 |                   - "logs:CreateLogStream"
404 |                   - "logs:PutLogEvents"
405 |                 Effect: "Allow"
406 |                 Resource: "*"
407 |               - Sid: AllowSageMakerInvoke
408 |                 Effect: Allow
409 |                 Action:
410 |                   - sagemaker:InvokeEndpoint
411 |                 Resource: !Sub "arn:aws:sagemaker:${AWS::Region}:${AWS::AccountId}:endpoint/*"
412 |               - Sid: ReadSecretManagerSecret
413 |                 Effect: Allow
414 |                 Action:
415 |                   - secretsmanager:GetSecretValue
416 |                 Resource: !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:*"
417 |                 
418 |           PolicyName: "lambdaVPC"
419 |       
420 | 
421 |   lambdaLogGroup:
422 |     Type: "AWS::Logs::LogGroup"
423 |     Properties:
424 |       LogGroupName: !Sub "/aws/lambda/${LambdaFunctionName}"
425 |       RetentionInDays: 90
426 | 
427 | Outputs:
428 |   OpenSearchDomainEndpoint:
429 |     Description: OpenSearch domain endpoint
430 |     Value:
431 |       'Fn::GetAtt':
432 |         - OpenSearchServiceDomain
433 |         - DomainEndpoint
434 | 
435 |   OpenSourceDomainArn:
436 |     Description: OpenSearch domain ARN
437 |     Value:
438 |       'Fn::GetAtt':
439 |         - OpenSearchServiceDomain
440 |         - Arn
441 | 
442 |   OpenSearchDomainName:
443 |     Description: OpenSearch domain name
444 |     Value: !Ref OpenSearchServiceDomain
445 | 
446 |   Region:
447 |     Description: Deployed Region
448 |     Value: !Ref AWS::Region
449 | 
450 |   SageMakerNotebookURL:
451 |     Description: SageMaker Notebook Instance
452 |     Value: !Join
453 |       - ''
454 |       - - !Sub 'https://console.aws.amazon.com/sagemaker/home?region=${AWS::Region}#/notebook-instances/openNotebook/'
455 |         - !GetAtt NotebookInstance.NotebookInstanceName
456 |         - '?view=classic'
457 | 
458 |   LLMEndpointName:
459 |     Description: Name of the LLM endpoint
460 |     Value: !GetAtt LLMEndpoint.EndpointName
461 | 
462 |   EmbeddingEndpointName:
463 |     Description: Name of the LLM endpoint
464 |     Value: !GetAtt EmbeddingEndpoint.EndpointName
465 |   
466 |   OpenSearchSecret:
467 |     Description: Name of the OpenSearch secret in Secrets Manager
468 |     Value: !Ref OpenSearchSecret
469 | 
470 |   LLMAppAPIEndpoint:
471 |     Description: "API (prod) stage endpoint"
472 |     Value: !Sub "https://${APIGateway}.execute-api.${AWS::Region}.amazonaws.com/${ApiStageName}/"
473 | 


--------------------------------------------------------------------------------
/workshop/0_deploy_models.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "a07b4596-d0ba-453b-b578-6de2953eccf2",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Deploy Models\n",
  9 |     "\n",
 10 |     "This notebook deploys two large language models (LLMs): one for generating embeddings and another for a question answering task. These models are then used in the `embed_data.ipynb` notebook and the `qa-w-rag-finetuned-llm` Lambda function."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "id": "6454bf8a-e910-489e-a255-853932c0a67d",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "In this notebook we are deploying the [`FLAN-T5 XXL`](https://huggingface.co/google/flan-t5-xxl) model as the LLM for generating the response to questions and the [`GPT-J-6B`](https://huggingface.co/EleutherAI/gpt-j-6b) as the LLM for generating embeddings."
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 1,
 24 |    "id": "b7844155-3341-475f-b68d-7b4092e47b35",
 25 |    "metadata": {
 26 |     "tags": []
 27 |    },
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "!pip install --upgrade sagemaker --quiet"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 2,
 36 |    "id": "a62ece34-eed7-4fa4-b74b-80302150b432",
 37 |    "metadata": {
 38 |     "tags": []
 39 |    },
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "import sys\n",
 43 |     "import time\n",
 44 |     "import logging\n",
 45 |     "import sagemaker, boto3, json\n",
 46 |     "from sagemaker.model import Model\n",
 47 |     "from sagemaker.session import Session\n",
 48 |     "from sagemaker.predictor import Predictor\n",
 49 |     "from sagemaker.utils import name_from_base\n",
 50 |     "from sagemaker import image_uris, model_uris, script_uris, hyperparameters"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 3,
 56 |    "id": "fe90e81a-08ef-4d03-8059-11fafba14c4c",
 57 |    "metadata": {
 58 |     "tags": []
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "# global constants\n",
 63 |     "APP_NAME = \"qa-w-rag\"\n",
 64 |     "EMBEDDING_MODEL = \"huggingface-textembedding-gpt-j-6b\""
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 4,
 70 |    "id": "c55b4ca0-dedf-4706-b9df-04d7a98a57d6",
 71 |    "metadata": {
 72 |     "tags": []
 73 |    },
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "logger = logging.getLogger()\n",
 77 |     "logging.basicConfig(format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s', level=logging.INFO, stream=sys.stderr)"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 5,
 83 |    "id": "aee0c720-4641-4321-acc3-13a6069d8013",
 84 |    "metadata": {
 85 |     "tags": []
 86 |    },
 87 |    "outputs": [
 88 |     {
 89 |      "name": "stderr",
 90 |      "output_type": "stream",
 91 |      "text": [
 92 |       "2023-05-02 14:22:28,427,credentials,MainProcess,INFO,Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole\n",
 93 |       "2023-05-02 14:22:28,766,credentials,MainProcess,INFO,Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole\n",
 94 |       "2023-05-02 14:22:28,872,232507896,MainProcess,INFO,aws_role=arn:aws:iam::015469603702:role/SageMakerRepoRole, aws_region=us-east-1\n"
 95 |      ]
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "sagemaker_session = Session()\n",
100 |     "aws_role = sagemaker_session.get_caller_identity_arn()\n",
101 |     "aws_region = boto3.Session().region_name\n",
102 |     "sess = sagemaker.Session()\n",
103 |     "model_version = \"*\"\n",
104 |     "logger.info(f\"aws_role={aws_role}, aws_region={aws_region}\")"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 6,
110 |    "id": "50e1c44e-28bf-4caf-a298-621e4687aff7",
111 |    "metadata": {
112 |     "tags": []
113 |    },
114 |    "outputs": [],
115 |    "source": [
116 |     "def parse_response_model_flan_t5(query_response):\n",
117 |     "    model_predictions = json.loads(query_response[\"Body\"].read())\n",
118 |     "    generated_text = model_predictions[\"generated_texts\"]\n",
119 |     "    return generated_text"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 7,
125 |    "id": "d58a8b8c-ca42-4557-867d-51f2489d064e",
126 |    "metadata": {
127 |     "tags": []
128 |    },
129 |    "outputs": [],
130 |    "source": [
131 |     "MODEL_CONFIG_LIST = [\n",
132 |     "    {\n",
133 |     "        \"model_id\": \"huggingface-text2text-flan-t5-xxl\",\n",
134 |     "        \"model_version\": \"*\",\n",
135 |     "        \"instance_type\": \"ml.g5.12xlarge\",\n",
136 |     "        \"instance_count\": 1,\n",
137 |     "        \"env\": {\"TS_DEFAULT_WORKERS_PER_MODEL\": \"1\"},\n",
138 |     "        \"predictor_cls\": Predictor\n",
139 |     "    },\n",
140 |     "    {\n",
141 |     "        \"model_id\": \"huggingface-textembedding-gpt-j-6b\",\n",
142 |     "        \"model_version\": \"*\",\n",
143 |     "        \"instance_type\": \"ml.g5.24xlarge\",\n",
144 |     "        \"instance_count\": 1,\n",
145 |     "        \"env\": {\"TS_DEFAULT_WORKERS_PER_MODEL\": \"2\"},\n",
146 |     "    }\n",
147 |     "]"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 8,
153 |    "id": "0e7655ad-50c0-42d1-9b1e-47bd081ccd46",
154 |    "metadata": {
155 |     "tags": []
156 |    },
157 |    "outputs": [
158 |     {
159 |      "name": "stderr",
160 |      "output_type": "stream",
161 |      "text": [
162 |       "2023-05-02 14:22:28,893,2580055255,MainProcess,INFO,going to deploy model={'model_id': 'huggingface-text2text-flan-t5-xxl', 'model_version': '*', 'instance_type': 'ml.g5.12xlarge', 'instance_count': 1, 'env': {'TS_DEFAULT_WORKERS_PER_MODEL': '1'}, 'predictor_cls': <class 'sagemaker.predictor.Predictor'>}, endpoint_name=qa-w-rag-huggingface-text2text-flan-t5--2023-05-02-14-22-28-892\n",
163 |       "2023-05-02 14:22:28,928,credentials,MainProcess,INFO,Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole\n",
164 |       "2023-05-02 14:22:29,436,2580055255,MainProcess,INFO,deploy_image_uri=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.12.0-gpu-py38, model_uri=s3://jumpstart-cache-prod-us-east-1/huggingface-infer/prepack/v1.0.2/infer-prepack-huggingface-text2text-flan-t5-xxl.tar.gz\n",
165 |       "2023-05-02 14:22:29,532,session,MainProcess,INFO,Creating model with name: qa-w-rag-huggingface-text2text-flan-t5--2023-05-02-14-22-28-892\n",
166 |       "2023-05-02 14:22:30,448,session,MainProcess,INFO,Creating endpoint-config with name qa-w-rag-huggingface-text2text-flan-t5--2023-05-02-14-22-28-892\n",
167 |       "2023-05-02 14:22:30,961,session,MainProcess,INFO,Creating endpoint with name qa-w-rag-huggingface-text2text-flan-t5--2023-05-02-14-22-28-892\n"
168 |      ]
169 |     },
170 |     {
171 |      "name": "stdout",
172 |      "output_type": "stream",
173 |      "text": [
174 |       "-------------!"
175 |      ]
176 |     },
177 |     {
178 |      "name": "stderr",
179 |      "output_type": "stream",
180 |      "text": [
181 |       "2023-05-02 14:29:33,163,2580055255,MainProcess,INFO,\u001b[1mmodel=huggingface-text2text-flan-t5-xxl has been deployed successfully at endpoint=qa-w-rag-huggingface-text2text-flan-t5--2023-05-02-14-22-28-892, took 424.27027797698975seconds\u001b[0m\n",
182 |       "\n",
183 |       "2023-05-02 14:29:33,163,2580055255,MainProcess,INFO,going to deploy model={'model_id': 'huggingface-textembedding-gpt-j-6b', 'model_version': '*', 'instance_type': 'ml.g5.24xlarge', 'instance_count': 1, 'env': {'TS_DEFAULT_WORKERS_PER_MODEL': '2'}}, endpoint_name=qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163\n",
184 |       "2023-05-02 14:29:33,325,2580055255,MainProcess,INFO,deploy_image_uri=763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.12.0-gpu-py38, model_uri=s3://jumpstart-cache-prod-us-east-1/huggingface-infer/prepack/v1.0.0/infer-prepack-huggingface-textembedding-gpt-j-6b.tar.gz\n",
185 |       "2023-05-02 14:29:33,353,session,MainProcess,INFO,Creating model with name: qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163\n",
186 |       "2023-05-02 14:29:34,205,session,MainProcess,INFO,Creating endpoint-config with name qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163\n",
187 |       "2023-05-02 14:29:34,661,session,MainProcess,INFO,Creating endpoint with name qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163\n"
188 |      ]
189 |     },
190 |     {
191 |      "name": "stdout",
192 |      "output_type": "stream",
193 |      "text": [
194 |       "-----------!"
195 |      ]
196 |     },
197 |     {
198 |      "name": "stderr",
199 |      "output_type": "stream",
200 |      "text": [
201 |       "2023-05-02 14:35:36,697,2580055255,MainProcess,INFO,\u001b[1mmodel=huggingface-textembedding-gpt-j-6b has been deployed successfully at endpoint=qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163, took 363.5339434146881seconds\u001b[0m\n",
202 |       "\n"
203 |      ]
204 |     }
205 |    ],
206 |    "source": [
207 |     "newline, bold, unbold = \"\\n\", \"\\033[1m\", \"\\033[0m\"\n",
208 |     "\n",
209 |     "for model in MODEL_CONFIG_LIST: \n",
210 |     "    start = time.time()\n",
211 |     "    endpoint_name = name_from_base(f\"{APP_NAME}-{model['model_id']}\")\n",
212 |     "    logger.info(f\"going to deploy model={model}, endpoint_name={endpoint_name}\")    \n",
213 |     "    # Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.\n",
214 |     "    deploy_image_uri = image_uris.retrieve(\n",
215 |     "        region=None,\n",
216 |     "        framework=None,  # automatically inferred from model_id\n",
217 |     "        image_scope=\"inference\",\n",
218 |     "        model_id=model['model_id'],\n",
219 |     "        model_version=model['model_version'],\n",
220 |     "        instance_type=model['instance_type'],\n",
221 |     "    )\n",
222 |     "    # Retrieve the model uri.\n",
223 |     "    model_uri = model_uris.retrieve(\n",
224 |     "        model_id=model['model_id'], model_version=model['model_version'], model_scope=\"inference\"\n",
225 |     "    )\n",
226 |     "    logger.info(f\"deploy_image_uri={deploy_image_uri}, model_uri={model_uri}\")\n",
227 |     "    model_inference = Model(\n",
228 |     "        image_uri=deploy_image_uri,\n",
229 |     "        model_data=model_uri,\n",
230 |     "        role=aws_role,\n",
231 |     "        predictor_cls=model.get(\"predictor_cls\"),\n",
232 |     "        name=endpoint_name,\n",
233 |     "        env=model['env'],\n",
234 |     "    )\n",
235 |     "    model_predictor_inference = model_inference.deploy(\n",
236 |     "        initial_instance_count=model['instance_count'],\n",
237 |     "        instance_type=model['instance_type'],\n",
238 |     "        predictor_cls=model.get(\"predictor_cls\"),\n",
239 |     "        endpoint_name=endpoint_name,\n",
240 |     "    )\n",
241 |     "    time_taken = time.time() - start\n",
242 |     "    logger.info(f\"{bold}model={model['model_id']} has been deployed successfully at endpoint={endpoint_name}, took {time_taken}seconds{unbold}{newline}\")\n",
243 |     "    model[\"endpoint_name\"] = endpoint_name"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": 9,
249 |    "id": "1bc50a11-5630-4dc4-8b3b-a3dcf70e69a3",
250 |    "metadata": {
251 |     "tags": []
252 |    },
253 |    "outputs": [
254 |     {
255 |      "name": "stderr",
256 |      "output_type": "stream",
257 |      "text": [
258 |       "2023-05-02 14:35:36,705,4061947677,MainProcess,INFO,EMBEDDING_MODEL=huggingface-textembedding-gpt-j-6b,   embedding_model_endpoint_name=qa-w-rag-huggingface-textembedding-gpt--2023-05-02-14-29-33-163\n"
259 |      ]
260 |     }
261 |    ],
262 |    "source": [
263 |     "embedding_model_endpoint_name = None\n",
264 |     "for model in MODEL_CONFIG_LIST:\n",
265 |     "    if model['model_id'] == EMBEDDING_MODEL:\n",
266 |     "        embedding_model_endpoint_name = model['endpoint_name']\n",
267 |     "        logger.info(f\"EMBEDDING_MODEL={EMBEDDING_MODEL},   embedding_model_endpoint_name={embedding_model_endpoint_name}\")\n"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 10,
273 |    "id": "630e26d5-345b-48af-95d5-be13bb17d3a4",
274 |    "metadata": {
275 |     "tags": []
276 |    },
277 |    "outputs": [
278 |     {
279 |      "name": "stdout",
280 |      "output_type": "stream",
281 |      "text": [
282 |       "Stored 'embedding_model_endpoint_name' (str)\n"
283 |      ]
284 |     }
285 |    ],
286 |    "source": [
287 |     "%store embedding_model_endpoint_name"
288 |    ]
289 |   }
290 |  ],
291 |  "metadata": {
292 |   "availableInstances": [
293 |    {
294 |     "_defaultOrder": 0,
295 |     "_isFastLaunch": true,
296 |     "category": "General purpose",
297 |     "gpuNum": 0,
298 |     "hideHardwareSpecs": false,
299 |     "memoryGiB": 4,
300 |     "name": "ml.t3.medium",
301 |     "vcpuNum": 2
302 |    },
303 |    {
304 |     "_defaultOrder": 1,
305 |     "_isFastLaunch": false,
306 |     "category": "General purpose",
307 |     "gpuNum": 0,
308 |     "hideHardwareSpecs": false,
309 |     "memoryGiB": 8,
310 |     "name": "ml.t3.large",
311 |     "vcpuNum": 2
312 |    },
313 |    {
314 |     "_defaultOrder": 2,
315 |     "_isFastLaunch": false,
316 |     "category": "General purpose",
317 |     "gpuNum": 0,
318 |     "hideHardwareSpecs": false,
319 |     "memoryGiB": 16,
320 |     "name": "ml.t3.xlarge",
321 |     "vcpuNum": 4
322 |    },
323 |    {
324 |     "_defaultOrder": 3,
325 |     "_isFastLaunch": false,
326 |     "category": "General purpose",
327 |     "gpuNum": 0,
328 |     "hideHardwareSpecs": false,
329 |     "memoryGiB": 32,
330 |     "name": "ml.t3.2xlarge",
331 |     "vcpuNum": 8
332 |    },
333 |    {
334 |     "_defaultOrder": 4,
335 |     "_isFastLaunch": true,
336 |     "category": "General purpose",
337 |     "gpuNum": 0,
338 |     "hideHardwareSpecs": false,
339 |     "memoryGiB": 8,
340 |     "name": "ml.m5.large",
341 |     "vcpuNum": 2
342 |    },
343 |    {
344 |     "_defaultOrder": 5,
345 |     "_isFastLaunch": false,
346 |     "category": "General purpose",
347 |     "gpuNum": 0,
348 |     "hideHardwareSpecs": false,
349 |     "memoryGiB": 16,
350 |     "name": "ml.m5.xlarge",
351 |     "vcpuNum": 4
352 |    },
353 |    {
354 |     "_defaultOrder": 6,
355 |     "_isFastLaunch": false,
356 |     "category": "General purpose",
357 |     "gpuNum": 0,
358 |     "hideHardwareSpecs": false,
359 |     "memoryGiB": 32,
360 |     "name": "ml.m5.2xlarge",
361 |     "vcpuNum": 8
362 |    },
363 |    {
364 |     "_defaultOrder": 7,
365 |     "_isFastLaunch": false,
366 |     "category": "General purpose",
367 |     "gpuNum": 0,
368 |     "hideHardwareSpecs": false,
369 |     "memoryGiB": 64,
370 |     "name": "ml.m5.4xlarge",
371 |     "vcpuNum": 16
372 |    },
373 |    {
374 |     "_defaultOrder": 8,
375 |     "_isFastLaunch": false,
376 |     "category": "General purpose",
377 |     "gpuNum": 0,
378 |     "hideHardwareSpecs": false,
379 |     "memoryGiB": 128,
380 |     "name": "ml.m5.8xlarge",
381 |     "vcpuNum": 32
382 |    },
383 |    {
384 |     "_defaultOrder": 9,
385 |     "_isFastLaunch": false,
386 |     "category": "General purpose",
387 |     "gpuNum": 0,
388 |     "hideHardwareSpecs": false,
389 |     "memoryGiB": 192,
390 |     "name": "ml.m5.12xlarge",
391 |     "vcpuNum": 48
392 |    },
393 |    {
394 |     "_defaultOrder": 10,
395 |     "_isFastLaunch": false,
396 |     "category": "General purpose",
397 |     "gpuNum": 0,
398 |     "hideHardwareSpecs": false,
399 |     "memoryGiB": 256,
400 |     "name": "ml.m5.16xlarge",
401 |     "vcpuNum": 64
402 |    },
403 |    {
404 |     "_defaultOrder": 11,
405 |     "_isFastLaunch": false,
406 |     "category": "General purpose",
407 |     "gpuNum": 0,
408 |     "hideHardwareSpecs": false,
409 |     "memoryGiB": 384,
410 |     "name": "ml.m5.24xlarge",
411 |     "vcpuNum": 96
412 |    },
413 |    {
414 |     "_defaultOrder": 12,
415 |     "_isFastLaunch": false,
416 |     "category": "General purpose",
417 |     "gpuNum": 0,
418 |     "hideHardwareSpecs": false,
419 |     "memoryGiB": 8,
420 |     "name": "ml.m5d.large",
421 |     "vcpuNum": 2
422 |    },
423 |    {
424 |     "_defaultOrder": 13,
425 |     "_isFastLaunch": false,
426 |     "category": "General purpose",
427 |     "gpuNum": 0,
428 |     "hideHardwareSpecs": false,
429 |     "memoryGiB": 16,
430 |     "name": "ml.m5d.xlarge",
431 |     "vcpuNum": 4
432 |    },
433 |    {
434 |     "_defaultOrder": 14,
435 |     "_isFastLaunch": false,
436 |     "category": "General purpose",
437 |     "gpuNum": 0,
438 |     "hideHardwareSpecs": false,
439 |     "memoryGiB": 32,
440 |     "name": "ml.m5d.2xlarge",
441 |     "vcpuNum": 8
442 |    },
443 |    {
444 |     "_defaultOrder": 15,
445 |     "_isFastLaunch": false,
446 |     "category": "General purpose",
447 |     "gpuNum": 0,
448 |     "hideHardwareSpecs": false,
449 |     "memoryGiB": 64,
450 |     "name": "ml.m5d.4xlarge",
451 |     "vcpuNum": 16
452 |    },
453 |    {
454 |     "_defaultOrder": 16,
455 |     "_isFastLaunch": false,
456 |     "category": "General purpose",
457 |     "gpuNum": 0,
458 |     "hideHardwareSpecs": false,
459 |     "memoryGiB": 128,
460 |     "name": "ml.m5d.8xlarge",
461 |     "vcpuNum": 32
462 |    },
463 |    {
464 |     "_defaultOrder": 17,
465 |     "_isFastLaunch": false,
466 |     "category": "General purpose",
467 |     "gpuNum": 0,
468 |     "hideHardwareSpecs": false,
469 |     "memoryGiB": 192,
470 |     "name": "ml.m5d.12xlarge",
471 |     "vcpuNum": 48
472 |    },
473 |    {
474 |     "_defaultOrder": 18,
475 |     "_isFastLaunch": false,
476 |     "category": "General purpose",
477 |     "gpuNum": 0,
478 |     "hideHardwareSpecs": false,
479 |     "memoryGiB": 256,
480 |     "name": "ml.m5d.16xlarge",
481 |     "vcpuNum": 64
482 |    },
483 |    {
484 |     "_defaultOrder": 19,
485 |     "_isFastLaunch": false,
486 |     "category": "General purpose",
487 |     "gpuNum": 0,
488 |     "hideHardwareSpecs": false,
489 |     "memoryGiB": 384,
490 |     "name": "ml.m5d.24xlarge",
491 |     "vcpuNum": 96
492 |    },
493 |    {
494 |     "_defaultOrder": 20,
495 |     "_isFastLaunch": false,
496 |     "category": "General purpose",
497 |     "gpuNum": 0,
498 |     "hideHardwareSpecs": true,
499 |     "memoryGiB": 0,
500 |     "name": "ml.geospatial.interactive",
501 |     "supportedImageNames": [
502 |      "sagemaker-geospatial-v1-0"
503 |     ],
504 |     "vcpuNum": 0
505 |    },
506 |    {
507 |     "_defaultOrder": 21,
508 |     "_isFastLaunch": true,
509 |     "category": "Compute optimized",
510 |     "gpuNum": 0,
511 |     "hideHardwareSpecs": false,
512 |     "memoryGiB": 4,
513 |     "name": "ml.c5.large",
514 |     "vcpuNum": 2
515 |    },
516 |    {
517 |     "_defaultOrder": 22,
518 |     "_isFastLaunch": false,
519 |     "category": "Compute optimized",
520 |     "gpuNum": 0,
521 |     "hideHardwareSpecs": false,
522 |     "memoryGiB": 8,
523 |     "name": "ml.c5.xlarge",
524 |     "vcpuNum": 4
525 |    },
526 |    {
527 |     "_defaultOrder": 23,
528 |     "_isFastLaunch": false,
529 |     "category": "Compute optimized",
530 |     "gpuNum": 0,
531 |     "hideHardwareSpecs": false,
532 |     "memoryGiB": 16,
533 |     "name": "ml.c5.2xlarge",
534 |     "vcpuNum": 8
535 |    },
536 |    {
537 |     "_defaultOrder": 24,
538 |     "_isFastLaunch": false,
539 |     "category": "Compute optimized",
540 |     "gpuNum": 0,
541 |     "hideHardwareSpecs": false,
542 |     "memoryGiB": 32,
543 |     "name": "ml.c5.4xlarge",
544 |     "vcpuNum": 16
545 |    },
546 |    {
547 |     "_defaultOrder": 25,
548 |     "_isFastLaunch": false,
549 |     "category": "Compute optimized",
550 |     "gpuNum": 0,
551 |     "hideHardwareSpecs": false,
552 |     "memoryGiB": 72,
553 |     "name": "ml.c5.9xlarge",
554 |     "vcpuNum": 36
555 |    },
556 |    {
557 |     "_defaultOrder": 26,
558 |     "_isFastLaunch": false,
559 |     "category": "Compute optimized",
560 |     "gpuNum": 0,
561 |     "hideHardwareSpecs": false,
562 |     "memoryGiB": 96,
563 |     "name": "ml.c5.12xlarge",
564 |     "vcpuNum": 48
565 |    },
566 |    {
567 |     "_defaultOrder": 27,
568 |     "_isFastLaunch": false,
569 |     "category": "Compute optimized",
570 |     "gpuNum": 0,
571 |     "hideHardwareSpecs": false,
572 |     "memoryGiB": 144,
573 |     "name": "ml.c5.18xlarge",
574 |     "vcpuNum": 72
575 |    },
576 |    {
577 |     "_defaultOrder": 28,
578 |     "_isFastLaunch": false,
579 |     "category": "Compute optimized",
580 |     "gpuNum": 0,
581 |     "hideHardwareSpecs": false,
582 |     "memoryGiB": 192,
583 |     "name": "ml.c5.24xlarge",
584 |     "vcpuNum": 96
585 |    },
586 |    {
587 |     "_defaultOrder": 29,
588 |     "_isFastLaunch": true,
589 |     "category": "Accelerated computing",
590 |     "gpuNum": 1,
591 |     "hideHardwareSpecs": false,
592 |     "memoryGiB": 16,
593 |     "name": "ml.g4dn.xlarge",
594 |     "vcpuNum": 4
595 |    },
596 |    {
597 |     "_defaultOrder": 30,
598 |     "_isFastLaunch": false,
599 |     "category": "Accelerated computing",
600 |     "gpuNum": 1,
601 |     "hideHardwareSpecs": false,
602 |     "memoryGiB": 32,
603 |     "name": "ml.g4dn.2xlarge",
604 |     "vcpuNum": 8
605 |    },
606 |    {
607 |     "_defaultOrder": 31,
608 |     "_isFastLaunch": false,
609 |     "category": "Accelerated computing",
610 |     "gpuNum": 1,
611 |     "hideHardwareSpecs": false,
612 |     "memoryGiB": 64,
613 |     "name": "ml.g4dn.4xlarge",
614 |     "vcpuNum": 16
615 |    },
616 |    {
617 |     "_defaultOrder": 32,
618 |     "_isFastLaunch": false,
619 |     "category": "Accelerated computing",
620 |     "gpuNum": 1,
621 |     "hideHardwareSpecs": false,
622 |     "memoryGiB": 128,
623 |     "name": "ml.g4dn.8xlarge",
624 |     "vcpuNum": 32
625 |    },
626 |    {
627 |     "_defaultOrder": 33,
628 |     "_isFastLaunch": false,
629 |     "category": "Accelerated computing",
630 |     "gpuNum": 4,
631 |     "hideHardwareSpecs": false,
632 |     "memoryGiB": 192,
633 |     "name": "ml.g4dn.12xlarge",
634 |     "vcpuNum": 48
635 |    },
636 |    {
637 |     "_defaultOrder": 34,
638 |     "_isFastLaunch": false,
639 |     "category": "Accelerated computing",
640 |     "gpuNum": 1,
641 |     "hideHardwareSpecs": false,
642 |     "memoryGiB": 256,
643 |     "name": "ml.g4dn.16xlarge",
644 |     "vcpuNum": 64
645 |    },
646 |    {
647 |     "_defaultOrder": 35,
648 |     "_isFastLaunch": false,
649 |     "category": "Accelerated computing",
650 |     "gpuNum": 1,
651 |     "hideHardwareSpecs": false,
652 |     "memoryGiB": 61,
653 |     "name": "ml.p3.2xlarge",
654 |     "vcpuNum": 8
655 |    },
656 |    {
657 |     "_defaultOrder": 36,
658 |     "_isFastLaunch": false,
659 |     "category": "Accelerated computing",
660 |     "gpuNum": 4,
661 |     "hideHardwareSpecs": false,
662 |     "memoryGiB": 244,
663 |     "name": "ml.p3.8xlarge",
664 |     "vcpuNum": 32
665 |    },
666 |    {
667 |     "_defaultOrder": 37,
668 |     "_isFastLaunch": false,
669 |     "category": "Accelerated computing",
670 |     "gpuNum": 8,
671 |     "hideHardwareSpecs": false,
672 |     "memoryGiB": 488,
673 |     "name": "ml.p3.16xlarge",
674 |     "vcpuNum": 64
675 |    },
676 |    {
677 |     "_defaultOrder": 38,
678 |     "_isFastLaunch": false,
679 |     "category": "Accelerated computing",
680 |     "gpuNum": 8,
681 |     "hideHardwareSpecs": false,
682 |     "memoryGiB": 768,
683 |     "name": "ml.p3dn.24xlarge",
684 |     "vcpuNum": 96
685 |    },
686 |    {
687 |     "_defaultOrder": 39,
688 |     "_isFastLaunch": false,
689 |     "category": "Memory Optimized",
690 |     "gpuNum": 0,
691 |     "hideHardwareSpecs": false,
692 |     "memoryGiB": 16,
693 |     "name": "ml.r5.large",
694 |     "vcpuNum": 2
695 |    },
696 |    {
697 |     "_defaultOrder": 40,
698 |     "_isFastLaunch": false,
699 |     "category": "Memory Optimized",
700 |     "gpuNum": 0,
701 |     "hideHardwareSpecs": false,
702 |     "memoryGiB": 32,
703 |     "name": "ml.r5.xlarge",
704 |     "vcpuNum": 4
705 |    },
706 |    {
707 |     "_defaultOrder": 41,
708 |     "_isFastLaunch": false,
709 |     "category": "Memory Optimized",
710 |     "gpuNum": 0,
711 |     "hideHardwareSpecs": false,
712 |     "memoryGiB": 64,
713 |     "name": "ml.r5.2xlarge",
714 |     "vcpuNum": 8
715 |    },
716 |    {
717 |     "_defaultOrder": 42,
718 |     "_isFastLaunch": false,
719 |     "category": "Memory Optimized",
720 |     "gpuNum": 0,
721 |     "hideHardwareSpecs": false,
722 |     "memoryGiB": 128,
723 |     "name": "ml.r5.4xlarge",
724 |     "vcpuNum": 16
725 |    },
726 |    {
727 |     "_defaultOrder": 43,
728 |     "_isFastLaunch": false,
729 |     "category": "Memory Optimized",
730 |     "gpuNum": 0,
731 |     "hideHardwareSpecs": false,
732 |     "memoryGiB": 256,
733 |     "name": "ml.r5.8xlarge",
734 |     "vcpuNum": 32
735 |    },
736 |    {
737 |     "_defaultOrder": 44,
738 |     "_isFastLaunch": false,
739 |     "category": "Memory Optimized",
740 |     "gpuNum": 0,
741 |     "hideHardwareSpecs": false,
742 |     "memoryGiB": 384,
743 |     "name": "ml.r5.12xlarge",
744 |     "vcpuNum": 48
745 |    },
746 |    {
747 |     "_defaultOrder": 45,
748 |     "_isFastLaunch": false,
749 |     "category": "Memory Optimized",
750 |     "gpuNum": 0,
751 |     "hideHardwareSpecs": false,
752 |     "memoryGiB": 512,
753 |     "name": "ml.r5.16xlarge",
754 |     "vcpuNum": 64
755 |    },
756 |    {
757 |     "_defaultOrder": 46,
758 |     "_isFastLaunch": false,
759 |     "category": "Memory Optimized",
760 |     "gpuNum": 0,
761 |     "hideHardwareSpecs": false,
762 |     "memoryGiB": 768,
763 |     "name": "ml.r5.24xlarge",
764 |     "vcpuNum": 96
765 |    },
766 |    {
767 |     "_defaultOrder": 47,
768 |     "_isFastLaunch": false,
769 |     "category": "Accelerated computing",
770 |     "gpuNum": 1,
771 |     "hideHardwareSpecs": false,
772 |     "memoryGiB": 16,
773 |     "name": "ml.g5.xlarge",
774 |     "vcpuNum": 4
775 |    },
776 |    {
777 |     "_defaultOrder": 48,
778 |     "_isFastLaunch": false,
779 |     "category": "Accelerated computing",
780 |     "gpuNum": 1,
781 |     "hideHardwareSpecs": false,
782 |     "memoryGiB": 32,
783 |     "name": "ml.g5.2xlarge",
784 |     "vcpuNum": 8
785 |    },
786 |    {
787 |     "_defaultOrder": 49,
788 |     "_isFastLaunch": false,
789 |     "category": "Accelerated computing",
790 |     "gpuNum": 1,
791 |     "hideHardwareSpecs": false,
792 |     "memoryGiB": 64,
793 |     "name": "ml.g5.4xlarge",
794 |     "vcpuNum": 16
795 |    },
796 |    {
797 |     "_defaultOrder": 50,
798 |     "_isFastLaunch": false,
799 |     "category": "Accelerated computing",
800 |     "gpuNum": 1,
801 |     "hideHardwareSpecs": false,
802 |     "memoryGiB": 128,
803 |     "name": "ml.g5.8xlarge",
804 |     "vcpuNum": 32
805 |    },
806 |    {
807 |     "_defaultOrder": 51,
808 |     "_isFastLaunch": false,
809 |     "category": "Accelerated computing",
810 |     "gpuNum": 1,
811 |     "hideHardwareSpecs": false,
812 |     "memoryGiB": 256,
813 |     "name": "ml.g5.16xlarge",
814 |     "vcpuNum": 64
815 |    },
816 |    {
817 |     "_defaultOrder": 52,
818 |     "_isFastLaunch": false,
819 |     "category": "Accelerated computing",
820 |     "gpuNum": 4,
821 |     "hideHardwareSpecs": false,
822 |     "memoryGiB": 192,
823 |     "name": "ml.g5.12xlarge",
824 |     "vcpuNum": 48
825 |    },
826 |    {
827 |     "_defaultOrder": 53,
828 |     "_isFastLaunch": false,
829 |     "category": "Accelerated computing",
830 |     "gpuNum": 4,
831 |     "hideHardwareSpecs": false,
832 |     "memoryGiB": 384,
833 |     "name": "ml.g5.24xlarge",
834 |     "vcpuNum": 96
835 |    },
836 |    {
837 |     "_defaultOrder": 54,
838 |     "_isFastLaunch": false,
839 |     "category": "Accelerated computing",
840 |     "gpuNum": 8,
841 |     "hideHardwareSpecs": false,
842 |     "memoryGiB": 768,
843 |     "name": "ml.g5.48xlarge",
844 |     "vcpuNum": 192
845 |    }
846 |   ],
847 |   "instance_type": "ml.m5.large",
848 |   "kernelspec": {
849 |    "display_name": "conda_python3",
850 |    "language": "python",
851 |    "name": "conda_python3"
852 |   },
853 |   "language_info": {
854 |    "codemirror_mode": {
855 |     "name": "ipython",
856 |     "version": 3
857 |    },
858 |    "file_extension": ".py",
859 |    "mimetype": "text/x-python",
860 |    "name": "python",
861 |    "nbconvert_exporter": "python",
862 |    "pygments_lexer": "ipython3",
863 |    "version": "3.10.8"
864 |   }
865 |  },
866 |  "nbformat": 4,
867 |  "nbformat_minor": 5
868 | }
869 | 


--------------------------------------------------------------------------------