├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── app ├── README.md ├── app.py ├── env_vars.sh ├── images │ ├── ai-icon.png │ └── user-icon.png ├── pgvector_chat_flan_xl.py ├── pgvector_chat_llama2.py ├── qa-with-llm-and-rag.png └── requirements.txt ├── cdk_stacks ├── .gitignore ├── README.md ├── app.py ├── cdk.context.json ├── cdk.json ├── rag_with_pgvector │ ├── __init__.py │ ├── aurora_postgresql.py │ ├── sm_embedding_endpoint.py │ ├── sm_llm_endpoint.py │ ├── sm_studio.py │ └── vpc.py ├── rag_with_pgvector_arch.svg ├── requirements.txt └── source.bat └── data_ingestion_to_vectordb ├── container ├── Dockerfile ├── credentials.py ├── load_data_into_pgvector.py └── sm_helper.py ├── data_ingestion_to_pgvector.ipynb └── scripts └── get_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | Untitled*.ipynb 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | .DS_Store 108 | .idea/ 109 | bin/ 110 | lib64 111 | pyvenv.cfg 112 | *.bak 113 | share/ 114 | cdk.out/ 115 | cdk.context.json* 116 | zap/ 117 | 118 | */.gitignore 119 | */setup.py 120 | */source.bat 121 | 122 | */*/.gitignore 123 | */*/setup.py 124 | */*/source.bat 125 | 126 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute to. Since our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](LICENSE) file for our project's licensing.
We will ask you to confirm the licensing of your contribution.
60 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT No Attribution
 2 | 
 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so.
10 | 
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17 | 
18 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # QA with LLM and RAG (Retrieval Augmented Generation)
 2 | 
 3 | This project is a Question Answering application with Large Language Models (LLMs) and Amazon Aurora PostgreSQL using [pgvector](https://github.com/pgvector/pgvector). An application using the RAG (Retrieval Augmented Generation) approach retrieves the information most relevant to the user's request from the enterprise knowledge base or content, bundles it as context along with the user's request in a prompt, and then sends it to the LLM to get a GenAI response.
 4 | 
 5 | LLMs have limitations on the maximum word count of the input prompt; therefore, choosing the right passages among thousands or millions of documents in the enterprise has a direct impact on the LLM's accuracy.
 6 | 
 7 | In this project, Amazon Aurora PostgreSQL with pgvector is used as the knowledge base.
 8 | 
 9 | The overall architecture is as follows:
10 | 
11 | ![rag_with_pgvector_arch](./cdk_stacks/rag_with_pgvector_arch.svg)
12 | 
13 | ### Overall Workflow
14 | 
15 | 1. Deploy the CDK stacks (for more information, see [here](./cdk_stacks/README.md)). They provision:
16 |    - A SageMaker Studio domain in a private VPC.
17 |    - A SageMaker Endpoint for text generation.
18 |    - A SageMaker Endpoint for generating embeddings.
19 |    - An Amazon Aurora PostgreSQL cluster for storing embeddings.
20 |    - The Aurora PostgreSQL cluster's access credentials (username and password), stored in AWS Secrets Manager under a name such as `RAGPgVectorStackAuroraPostg-xxxxxxxxxxxx`.
21 | 2. Open JupyterLab in SageMaker Studio and then open a new terminal.
22 | 3. Run the following command in the terminal to clone the code repository for this project:
23 |    ```
24 |    git clone --depth=1 https://github.com/aws-samples/rag-with-amazon-postgresql-using-pgvector-and-sagemaker.git
25 |    ```
26 | 4. Open the `data_ingestion_to_pgvector.ipynb` notebook and run it. (For more information, see [here](./data_ingestion_to_vectordb/data_ingestion_to_pgvector.ipynb))
27 | 5. Run the Streamlit application.
(For more information, see [here](./app/README.md)) 28 | 29 | ### References 30 | 31 | * [Leverage pgvector and Amazon Aurora PostgreSQL for Natural Language Processing, Chatbots and Sentiment Analysis (2023-07-13)](https://aws.amazon.com/blogs/database/leverage-pgvector-and-amazon-aurora-postgresql-for-natural-language-processing-chatbots-and-sentiment-analysis/) 32 | * [Accelerate HNSW indexing and searching with pgvector on Amazon Aurora PostgreSQL-compatible edition and Amazon RDS for PostgreSQL (2023-11-06)](https://aws.amazon.com/blogs/database/accelerate-hnsw-indexing-and-searching-with-pgvector-on-amazon-aurora-postgresql-compatible-edition-and-amazon-rds-for-postgresql/) 33 | * [Optimize generative AI applications with pgvector indexing: A deep dive into IVFFlat and HNSW techniques (2024-03-15)](https://aws.amazon.com/blogs/database/optimize-generative-ai-applications-with-pgvector-indexing-a-deep-dive-into-ivfflat-and-hnsw-techniques/) 34 | * [Improve the performance of generative AI workloads on Amazon Aurora with Optimized Reads and pgvector (2024-02-09)](https://aws.amazon.com/blogs/database/accelerate-generative-ai-workloads-on-amazon-aurora-with-optimized-reads-and-pgvector/) 35 | * [Building AI-powered search in PostgreSQL using Amazon SageMaker and pgvector (2023-05-03)](https://aws.amazon.com/blogs/database/building-ai-powered-search-in-postgresql-using-amazon-sagemaker-and-pgvector/) 36 | * [Build Streamlit apps in Amazon SageMaker Studio (2023-04-11)](https://aws.amazon.com/blogs/machine-learning/build-streamlit-apps-in-amazon-sagemaker-studio/) 37 | * [Quickly build high-accuracy Generative AI applications on enterprise data using Amazon Kendra, LangChain, and large language models (2023-05-03)](https://aws.amazon.com/blogs/machine-learning/quickly-build-high-accuracy-generative-ai-applications-on-enterprise-data-using-amazon-kendra-langchain-and-large-language-models/) 38 | * [(github) Amazon Kendra Retriver Samples](https://github.com/aws-samples/amazon-kendra-langchain-extensions/tree/main/kendra_retriever_samples) 39 | * [Question answering using Retrieval Augmented Generation with foundation models in Amazon SageMaker JumpStart (2023-05-02)](https://aws.amazon.com/blogs/machine-learning/question-answering-using-retrieval-augmented-generation-with-foundation-models-in-amazon-sagemaker-jumpstart/) 40 | * [Use proprietary foundation models from Amazon SageMaker JumpStart in Amazon SageMaker Studio (2023-06-27)](https://aws.amazon.com/blogs/machine-learning/use-proprietary-foundation-models-from-amazon-sagemaker-jumpstart-in-amazon-sagemaker-studio/) 41 | * [LangChain](https://python.langchain.com/docs/get_started/introduction.html) - A framework for developing applications powered by language models. 
42 | * [Streamlit](https://streamlit.io/) - A faster way to build and share data apps 43 | * [rag-with-amazon-kendra-and-sagemaker](https://github.com/aws-samples/aws-kr-startup-samples/tree/main/gen-ai/rag-with-amazon-kendra-and-sagemaker) - Question Answering application with Large Language Models (LLMs) and Amazon Kendra 44 | * [rag-with-amazon-opensearch-and-sagemaker](https://github.com/aws-samples/rag-with-amazon-opensearch-and-sagemaker) - Question Answering application with Large Language Models (LLMs) and Amazon OpenSearch Service 45 | * [rag-with-amazon-opensearch-serverless](https://github.com/aws-samples/rag-with-amazon-opensearch-serverless) - Question Answering application with Large Language Models (LLMs) and Amazon OpenSearch Serverless Service 46 | * [Pgvector changelog - v0.4.0 (2023-01-11)](https://github.com/pgvector/pgvector/blob/master/CHANGELOG.md#040-2023-01-11) 47 | > Increased max dimensions for vector from `1024` to `16000`
48 | > Increased max dimensions for index from `1024` to `2000`
49 | 
50 | ## Security
51 | 
52 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
53 | 
54 | ## License
55 | 
56 | This library is licensed under the MIT-0 License. See the LICENSE file.
57 | 
--------------------------------------------------------------------------------
/app/README.md:
--------------------------------------------------------------------------------
 1 | ## Run the Streamlit application in Studio
 2 | 
 3 | Now we're ready to run the Streamlit web application for our question answering bot.
 4 | 
 5 | SageMaker Studio provides a convenient platform to host the Streamlit web application. The following steps describe how to run the Streamlit app on SageMaker Studio. Alternatively, you could follow the same procedure to run the app on an Amazon EC2 instance or AWS Cloud9 in your AWS account.
 6 | 
 7 | 1. Open JupyterLab and then open a new **Terminal**.
 8 | 2. Run the following commands in the terminal to clone the code repository for this project and install the Python packages needed by the application:
 9 |    ```
10 |    git clone --depth=1 https://github.com/aws-samples/rag-with-amazon-postgresql-using-pgvector-and-sagemaker.git
11 |    cd rag-with-amazon-postgresql-using-pgvector-and-sagemaker/app
12 |    python -m venv .env
13 |    source .env/bin/activate
14 |    pip install -r requirements.txt
15 |    ```
16 | 3. In the shell, set the following environment variables with the values that are available from the CloudFormation stack output.
17 |    ```
18 |    export AWS_REGION=us-east-1
19 |    export PGVECTOR_SECRET_ID="your-postgresql-secret-id"
20 |    export COLLECTION_NAME="llm_rag_embeddings"
21 |    export EMBEDDING_ENDPOINT_NAME="your-sagemaker-endpoint-for-embedding-model"
22 |    export TEXT2TEXT_ENDPOINT_NAME="your-sagemaker-endpoint-for-text-generation-model"
23 |    ```
24 |    :information_source: `COLLECTION_NAME` can be found in the [data ingestion to vectordb](../data_ingestion_to_vectordb/data_ingestion_to_pgvector.ipynb) step.
25 | 4. Run the application with `streamlit run app.py`. When the application runs successfully, you'll see output similar to the following (the IP addresses you see will be different from the ones shown in this example). Note the port number (typically `8501`) from the output; it is used as part of the URL for the app in the next step.
26 |    ```
27 |    sagemaker-user@studio$ streamlit run app.py
28 | 
29 |    Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
30 | 
31 |    You can now view your Streamlit app in your browser.
32 | 
33 |    Network URL: http://169.255.255.2:8501
34 |    External URL: http://52.4.240.77:8501
35 |    ```
36 | 5. You can access the app in a new browser tab using a URL similar to your Studio domain URL. For example, if your Studio URL is `https://d-randomidentifier.studio.us-east-1.sagemaker.aws/jupyter/default/lab?` then the URL for your Streamlit app will be `https://d-randomidentifier.studio.us-east-1.sagemaker.aws/jupyter/default/proxy/8501/app` (notice that `lab` is replaced with `proxy/8501/app`). If the port number noted in the previous step is different from `8501`, then use it instead of `8501` in the URL for the Streamlit app.
37 | 
38 | The following screenshot shows the app with a couple of user questions.
(e.g., `What are the versions of XGBoost supported by Amazon SageMaker?`) 39 | 40 | ![qa-with-llm-and-rag](./qa-with-llm-and-rag.png) 41 | 42 | ## References 43 | 44 | * [Leverage pgvector and Amazon Aurora PostgreSQL for Natural Language Processing, Chatbots and Sentiment Analysis (2023-07-13)](https://aws.amazon.com/blogs/database/leverage-pgvector-and-amazon-aurora-postgresql-for-natural-language-processing-chatbots-and-sentiment-analysis/) 45 | * [Building AI-powered search in PostgreSQL using Amazon SageMaker and pgvector (2023-05-03)](https://aws.amazon.com/blogs/database/building-ai-powered-search-in-postgresql-using-amazon-sagemaker-and-pgvector/) 46 | * [Use proprietary foundation models from Amazon SageMaker JumpStart in Amazon SageMaker Studio (2023-06-27)](https://aws.amazon.com/blogs/machine-learning/use-proprietary-foundation-models-from-amazon-sagemaker-jumpstart-in-amazon-sagemaker-studio/) 47 | * [Build Streamlit apps in Amazon SageMaker Studio (2023-04-11)](https://aws.amazon.com/blogs/machine-learning/build-streamlit-apps-in-amazon-sagemaker-studio/) 48 | * [Quickly build high-accuracy Generative AI applications on enterprise data using Amazon Kendra, LangChain, and large language models (2023-05-02)](https://aws.amazon.com/blogs/machine-learning/quickly-build-high-accuracy-generative-ai-applications-on-enterprise-data-using-amazon-kendra-langchain-and-large-language-models/) 49 | * [sagemaker-huggingface-inference-toolkit](https://github.com/aws/sagemaker-huggingface-inference-toolkit) - SageMaker Hugging Face Inference Toolkit is an open-source library for serving 🤗 Transformers and Diffusers models on Amazon SageMaker. 50 | * [LangChain](https://python.langchain.com/docs/get_started/introduction.html) - A framework for developing applications powered by language models. 
51 | * [Streamlit](https://streamlit.io/) - A faster way to build and share data apps 52 | -------------------------------------------------------------------------------- /app/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=4 shiftwidth=4 softtabstop=4 expandtab 4 | 5 | import os 6 | import streamlit as st 7 | import uuid 8 | 9 | import pgvector_chat_flan_xl as flanxl 10 | import pgvector_chat_llama2 as llama2 11 | 12 | PROVIDER_NAME = os.environ.get('PROVIDER_NAME', 'llama2') 13 | 14 | USER_ICON = "images/user-icon.png" 15 | AI_ICON = "images/ai-icon.png" 16 | MAX_HISTORY_LENGTH = 5 17 | PROVIDER_MAP = { 18 | 'flanxl': 'Flan XL', 19 | 'llama2': 'Llama2 7B', 20 | } 21 | 22 | # Check if the user ID is already stored in the session state 23 | if 'user_id' in st.session_state: 24 | user_id = st.session_state['user_id'] 25 | 26 | # If the user ID is not yet stored in the session state, generate a random UUID 27 | else: 28 | user_id = str(uuid.uuid4()) 29 | st.session_state['user_id'] = user_id 30 | 31 | 32 | if 'llm_chain' not in st.session_state: 33 | llm_app = llama2 if PROVIDER_NAME == 'llama2' else flanxl 34 | st.session_state['llm_app'] = llm_app 35 | st.session_state['llm_chain'] = llm_app.build_chain() 36 | 37 | if 'chat_history' not in st.session_state: 38 | st.session_state['chat_history'] = [] 39 | 40 | if "chats" not in st.session_state: 41 | st.session_state.chats = [ 42 | { 43 | 'id': 0, 44 | 'question': '', 45 | 'answer': '' 46 | } 47 | ] 48 | 49 | if "questions" not in st.session_state: 50 | st.session_state.questions = [] 51 | 52 | if "answers" not in st.session_state: 53 | st.session_state.answers = [] 54 | 55 | if "input" not in st.session_state: 56 | st.session_state.input = "" 57 | 58 | 59 | st.markdown(""" 60 | 75 | """, unsafe_allow_html=True) 76 | 77 | 78 | def write_logo(): 79 | col1, col2, col3 = st.columns([5, 1, 5]) 80 | with col2: 81 | st.image(AI_ICON, use_column_width='always') 82 | 83 | 84 | def write_top_bar(): 85 | col1, col2, col3 = st.columns([1,10,2]) 86 | with col1: 87 | st.image(AI_ICON, use_column_width='always') 88 | with col2: 89 | selected_provider = PROVIDER_NAME 90 | if selected_provider in PROVIDER_MAP: 91 | provider = PROVIDER_MAP[selected_provider] 92 | else: 93 | provider = selected_provider.capitalize() 94 | header = f"An AI App powered by Amazon Aurora Postgresql with pgvector and {provider}!" 95 | st.write(f"

<h3 style='display: inline-block;'>{header}</h3>
", unsafe_allow_html=True) 96 | with col3: 97 | clear = st.button("Clear Chat") 98 | return clear 99 | 100 | 101 | clear = write_top_bar() 102 | 103 | if clear: 104 | st.session_state.questions = [] 105 | st.session_state.answers = [] 106 | st.session_state.input = "" 107 | st.session_state["chat_history"] = [] 108 | 109 | 110 | def handle_input(): 111 | input = st.session_state.input 112 | question_with_id = { 113 | 'question': input, 114 | 'id': len(st.session_state.questions) 115 | } 116 | st.session_state.questions.append(question_with_id) 117 | 118 | chat_history = st.session_state["chat_history"] 119 | if len(chat_history) == MAX_HISTORY_LENGTH: 120 | chat_history = chat_history[:-1] 121 | 122 | llm_chain = st.session_state['llm_chain'] 123 | chain = st.session_state['llm_app'] 124 | result = chain.run_chain(llm_chain, input, chat_history) 125 | answer = result['answer'] 126 | chat_history.append((input, answer)) 127 | 128 | document_list = [] 129 | if 'source_documents' in result: 130 | for d in result['source_documents']: 131 | if not (d.metadata['source'] in document_list): 132 | document_list.append((d.metadata['source'])) 133 | 134 | st.session_state.answers.append({ 135 | 'answer': result, 136 | 'sources': document_list, 137 | 'id': len(st.session_state.questions) 138 | }) 139 | st.session_state.input = "" 140 | 141 | 142 | def write_user_message(md): 143 | col1, col2 = st.columns([1,12]) 144 | 145 | with col1: 146 | st.image(USER_ICON, use_column_width='always') 147 | with col2: 148 | st.warning(md['question']) 149 | 150 | 151 | def render_result(result): 152 | answer, sources = st.tabs(['Answer', 'Sources']) 153 | with answer: 154 | render_answer(result['answer']) 155 | with sources: 156 | if 'source_documents' in result: 157 | render_sources(result['source_documents']) 158 | else: 159 | render_sources([]) 160 | 161 | 162 | def render_answer(answer): 163 | col1, col2 = st.columns([1,12]) 164 | with col1: 165 | st.image(AI_ICON, use_column_width='always') 166 | with col2: 167 | st.info(answer['answer']) 168 | 169 | 170 | def render_sources(sources): 171 | col1, col2 = st.columns([1,12]) 172 | with col2: 173 | with st.expander("Sources"): 174 | for s in sources: 175 | st.write(s) 176 | 177 | 178 | # Each answer will have context of the question asked in order to associate the provided feedback with the respective question 179 | def write_chat_message(md, q): 180 | chat = st.container() 181 | with chat: 182 | render_answer(md['answer']) 183 | render_sources(md['sources']) 184 | 185 | 186 | with st.container(): 187 | for (q, a) in zip(st.session_state.questions, st.session_state.answers): 188 | write_user_message(q) 189 | write_chat_message(a, q) 190 | 191 | st.markdown('---') 192 | input = st.text_input("You are talking to an AI, ask any question.", key="input", on_change=handle_input) 193 | -------------------------------------------------------------------------------- /app/env_vars.sh: -------------------------------------------------------------------------------- 1 | export AWS_REGION="your-aws-region" 2 | export PGVECTOR_SECRET_ID="your-postgresql-secret" 3 | export COLLECTION_NAME="llm_rag_embeddings" 4 | export EMBEDDING_ENDPOINT_NAME="your-sagemaker-endpoint-for-embedding-model" 5 | export TEXT2TEXT_ENDPOINT_NAME="your-sagemaker-endpoint-for-text-generation-model" -------------------------------------------------------------------------------- /app/images/ai-icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/rag-with-amazon-postgresql-using-pgvector-and-sagemaker/1b5ca45eff14b162e8be28cb179338e1ad4d7bbd/app/images/ai-icon.png -------------------------------------------------------------------------------- /app/images/user-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/rag-with-amazon-postgresql-using-pgvector-and-sagemaker/1b5ca45eff14b162e8be28cb179338e1ad4d7bbd/app/images/user-icon.png -------------------------------------------------------------------------------- /app/pgvector_chat_flan_xl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=4 shiftwidth=4 softtabstop=4 expandtab 4 | 5 | import os 6 | import json 7 | import logging 8 | import sys 9 | from typing import List 10 | import urllib 11 | 12 | import boto3 13 | 14 | from langchain_postgres import PGVector 15 | from langchain_community.embeddings import SagemakerEndpointEmbeddings 16 | from langchain_community.embeddings.sagemaker_endpoint import EmbeddingsContentHandler 17 | 18 | from langchain_community.llms import SagemakerEndpoint 19 | from langchain_community.llms.sagemaker_endpoint import LLMContentHandler 20 | 21 | from langchain.prompts import PromptTemplate 22 | from langchain.chains import ConversationalRetrievalChain 23 | 24 | logger = logging.getLogger() 25 | logging.basicConfig(format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s', level=logging.INFO, stream=sys.stderr) 26 | 27 | 28 | class bcolors: 29 | HEADER = '\033[95m' 30 | OKBLUE = '\033[94m' 31 | OKCYAN = '\033[96m' 32 | OKGREEN = '\033[92m' 33 | WARNING = '\033[93m' 34 | FAIL = '\033[91m' 35 | ENDC = '\033[0m' 36 | BOLD = '\033[1m' 37 | UNDERLINE = '\033[4m' 38 | 39 | 40 | MAX_HISTORY_LENGTH = 5 41 | 42 | 43 | def _create_sagemaker_embeddings(endpoint_name: str, region: str = "us-east-1") -> SagemakerEndpointEmbeddings: 44 | 45 | class ContentHandlerForEmbeddings(EmbeddingsContentHandler): 46 | """ 47 | encode input string as utf-8 bytes, read the embeddings 48 | from the output 49 | """ 50 | 51 | content_type = "application/json" 52 | accepts = "application/json" 53 | 54 | def transform_input(self, prompt: str, model_kwargs={}) -> bytes: 55 | input_str = json.dumps({"text_inputs": prompt, **model_kwargs}) 56 | return input_str.encode('utf-8') 57 | 58 | def transform_output(self, output: bytes) -> str: 59 | response_json = json.loads(output.read().decode("utf-8")) 60 | embeddings = response_json["embedding"] 61 | if len(embeddings) == 1: 62 | return [embeddings[0]] 63 | return embeddings 64 | 65 | # create a content handler object which knows how to serialize 66 | # and deserialize communication with the model endpoint 67 | content_handler = ContentHandlerForEmbeddings() 68 | 69 | # read to create the Sagemaker embeddings, we are providing 70 | # the Sagemaker endpoint that will be used for generating the 71 | # embeddings to the class 72 | # 73 | embeddings = SagemakerEndpointEmbeddings( 74 | endpoint_name=endpoint_name, 75 | region_name=region, 76 | content_handler=content_handler 77 | ) 78 | logger.info(f"embeddings type={type(embeddings)}") 79 | 80 | return embeddings 81 | 82 | 83 | def _get_credentials(secret_id: str, region_name: str = 'us-east-1') -> str: 84 | client = boto3.client('secretsmanager', region_name=region_name) 85 | response = client.get_secret_value(SecretId=secret_id) 86 | 
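    # The 'SecretString' of an RDS-managed secret is a JSON document; the keys
    # read in build_chain() below ('username', 'password', 'host', 'port')
    # follow the standard RDS secret layout, e.g. (values illustrative):
    #   {"username": "postgres", "password": "...", "host": "...", "port": 5432}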
secrets_value = json.loads(response['SecretString']) 87 | return secrets_value 88 | 89 | 90 | def build_chain(): 91 | region = os.environ["AWS_REGION"] 92 | embeddings_model_endpoint = os.environ["EMBEDDING_ENDPOINT_NAME"] 93 | text2text_model_endpoint = os.environ["TEXT2TEXT_ENDPOINT_NAME"] 94 | 95 | pgvector_secret_id = os.environ["PGVECTOR_SECRET_ID"] 96 | secret = _get_credentials(pgvector_secret_id, region) 97 | db_username = secret['username'] 98 | db_password = urllib.parse.quote_plus(secret['password']) 99 | db_port = secret['port'] 100 | db_host = secret['host'] 101 | 102 | CONNECTION_STRING = PGVector.connection_string_from_db_params( 103 | driver = 'psycopg', 104 | user = db_username, 105 | password = db_password, 106 | host = db_host, 107 | port = db_port, 108 | database = '' 109 | ) 110 | 111 | collection_name = os.environ["COLLECTION_NAME"] 112 | 113 | class ContentHandler(LLMContentHandler): 114 | content_type = "application/json" 115 | accepts = "application/json" 116 | 117 | def transform_input(self, prompt: str, model_kwargs: dict) -> bytes: 118 | input_str = json.dumps({"inputs": prompt, **model_kwargs}) 119 | return input_str.encode('utf-8') 120 | 121 | def transform_output(self, output: bytes) -> str: 122 | response_json = json.loads(output.read().decode("utf-8")) 123 | return response_json[0]["generated_text"] 124 | 125 | content_handler = ContentHandler() 126 | 127 | model_kwargs = { 128 | "max_length": 500, 129 | "num_return_sequences": 1, 130 | "top_k": 250, 131 | "top_p": 0.95, 132 | "do_sample": False, 133 | "temperature": 1 134 | } 135 | 136 | llm = SagemakerEndpoint( 137 | endpoint_name=text2text_model_endpoint, 138 | region_name=region, 139 | model_kwargs=model_kwargs, 140 | content_handler=content_handler 141 | ) 142 | 143 | vectorstore = PGVector( 144 | collection_name=collection_name, 145 | connection=CONNECTION_STRING, 146 | embeddings=_create_sagemaker_embeddings(embeddings_model_endpoint, region) 147 | ) 148 | retriever = vectorstore.as_retriever() 149 | 150 | prompt_template = """Answer based on context:\n\n{context}\n\n{question}""" 151 | 152 | PROMPT = PromptTemplate( 153 | template=prompt_template, input_variables=["context", "question"] 154 | ) 155 | 156 | condense_qa_template = """ 157 | Given the following conversation and a follow up question, rephrase the follow up question 158 | to be a standalone question. 159 | 160 | Chat History: 161 | {chat_history} 162 | Follow Up Input: {question} 163 | Standalone question:""" 164 | standalone_question_prompt = PromptTemplate.from_template(condense_qa_template) 165 | 166 | qa = ConversationalRetrievalChain.from_llm( 167 | llm=llm, 168 | retriever=retriever, 169 | condense_question_prompt=standalone_question_prompt, 170 | return_source_documents=True, 171 | combine_docs_chain_kwargs={"prompt":PROMPT} 172 | ) 173 | 174 | logger.info(f"\ntype('qa'): \"{type(qa)}\"\n") 175 | return qa 176 | 177 | 178 | def run_chain(chain, prompt: str, history=[]): 179 | return chain.invoke({"question": prompt, "chat_history": history}) 180 | 181 | 182 | if __name__ == "__main__": 183 | chat_history = [] 184 | qa = build_chain() 185 | print(bcolors.OKBLUE + "Hello! How can I help you?" + bcolors.ENDC) 186 | print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." 
+ bcolors.ENDC) 187 | print(">", end=" ", flush=True) 188 | for query in sys.stdin: 189 | if (query.strip().lower().startswith("new search:")): 190 | query = query.strip().lower().replace("new search:","") 191 | chat_history = [] 192 | elif (len(chat_history) == MAX_HISTORY_LENGTH): 193 | chat_history.pop(0) 194 | result = run_chain(qa, query, chat_history) 195 | chat_history.append((query, result["answer"])) 196 | print(bcolors.OKGREEN + result['answer'] + bcolors.ENDC) 197 | if 'source_documents' in result: 198 | print(bcolors.OKGREEN + 'Sources:') 199 | for d in result['source_documents']: 200 | print(d.metadata['source']) 201 | print(bcolors.ENDC) 202 | print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC) 203 | print(">", end=" ", flush=True) 204 | print(bcolors.OKBLUE + "Bye" + bcolors.ENDC) -------------------------------------------------------------------------------- /app/pgvector_chat_llama2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=4 shiftwidth=4 softtabstop=4 expandtab 4 | 5 | import os 6 | import json 7 | import logging 8 | import sys 9 | from typing import List 10 | import urllib 11 | 12 | import boto3 13 | 14 | from langchain_postgres import PGVector 15 | from langchain_community.embeddings import SagemakerEndpointEmbeddings 16 | from langchain_community.embeddings.sagemaker_endpoint import EmbeddingsContentHandler 17 | 18 | from langchain_community.llms import SagemakerEndpoint 19 | from langchain_community.llms.sagemaker_endpoint import LLMContentHandler 20 | 21 | from langchain.prompts import PromptTemplate 22 | from langchain.chains import ConversationalRetrievalChain 23 | 24 | logger = logging.getLogger() 25 | logging.basicConfig(format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s', level=logging.INFO, stream=sys.stderr) 26 | 27 | 28 | class bcolors: 29 | HEADER = '\033[95m' 30 | OKBLUE = '\033[94m' 31 | OKCYAN = '\033[96m' 32 | OKGREEN = '\033[92m' 33 | WARNING = '\033[93m' 34 | FAIL = '\033[91m' 35 | ENDC = '\033[0m' 36 | BOLD = '\033[1m' 37 | UNDERLINE = '\033[4m' 38 | 39 | 40 | MAX_HISTORY_LENGTH = 5 41 | 42 | 43 | class SagemakerEndpointEmbeddingsJumpStart(SagemakerEndpointEmbeddings): 44 | def embed_documents( 45 | self, texts: List[str], chunk_size: int = 5 46 | ) -> List[List[float]]: 47 | """Compute doc embeddings using a SageMaker Inference Endpoint. 48 | 49 | Args: 50 | texts: The list of texts to embed. 51 | chunk_size: The chunk size defines how many input texts will 52 | be grouped together as request. If None, will use the 53 | chunk size specified by the class. 54 | 55 | Returns: 56 | List of embeddings, one for each text. 
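        Example (illustrative):
            embs = embeddings.embed_documents(["What is Amazon SageMaker?"], chunk_size=5)
            # embs holds one embedding vector (a list of floats) per input text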
57 | """ 58 | results = [] 59 | 60 | _chunk_size = len(texts) if chunk_size > len(texts) else chunk_size 61 | for i in range(0, len(texts), _chunk_size): 62 | response = self._embedding_func(texts[i : i + _chunk_size]) 63 | results.extend(response) 64 | return results 65 | 66 | 67 | def _create_sagemaker_embeddings(endpoint_name: str, region: str = "us-east-1") -> SagemakerEndpointEmbeddingsJumpStart: 68 | 69 | class ContentHandlerForEmbeddings(EmbeddingsContentHandler): 70 | """ 71 | encode input string as utf-8 bytes, read the embeddings 72 | from the output 73 | """ 74 | content_type = "application/json" 75 | accepts = "application/json" 76 | def transform_input(self, prompt: str, model_kwargs = {}) -> bytes: 77 | input_str = json.dumps({"text_inputs": prompt, **model_kwargs}) 78 | return input_str.encode('utf-8') 79 | 80 | def transform_output(self, output: bytes) -> str: 81 | response_json = json.loads(output.read().decode("utf-8")) 82 | embeddings = response_json["embedding"] 83 | if len(embeddings) == 1: 84 | return [embeddings[0]] 85 | return embeddings 86 | 87 | # create a content handler object which knows how to serialize 88 | # and deserialize communication with the model endpoint 89 | content_handler = ContentHandlerForEmbeddings() 90 | 91 | # read to create the Sagemaker embeddings, we are providing 92 | # the Sagemaker endpoint that will be used for generating the 93 | # embeddings to the class 94 | embeddings = SagemakerEndpointEmbeddingsJumpStart( 95 | endpoint_name=endpoint_name, 96 | region_name=region, 97 | content_handler=content_handler 98 | ) 99 | logger.info(f"embeddings type={type(embeddings)}") 100 | 101 | return embeddings 102 | 103 | 104 | def _get_credentials(secret_id: str, region_name: str) -> str: 105 | client = boto3.client('secretsmanager', region_name=region_name) 106 | response = client.get_secret_value(SecretId=secret_id) 107 | secrets_value = json.loads(response['SecretString']) 108 | return secrets_value 109 | 110 | 111 | def build_chain(): 112 | region = os.environ["AWS_REGION"] 113 | embeddings_model_endpoint = os.environ["EMBEDDING_ENDPOINT_NAME"] 114 | text2text_model_endpoint = os.environ["TEXT2TEXT_ENDPOINT_NAME"] 115 | 116 | pgvector_secret_id = os.environ["PGVECTOR_SECRET_ID"] 117 | secret = _get_credentials(pgvector_secret_id, region) 118 | db_username = secret['username'] 119 | db_password = urllib.parse.quote_plus(secret['password']) 120 | db_port = secret['port'] 121 | db_host = secret['host'] 122 | 123 | CONNECTION_STRING = PGVector.connection_string_from_db_params( 124 | driver = 'psycopg', 125 | user = db_username, 126 | password = db_password, 127 | host = db_host, 128 | port = db_port, 129 | database = '' 130 | ) 131 | 132 | collection_name = os.environ["COLLECTION_NAME"] 133 | 134 | # https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/llama-2-chat-completion.ipynb 135 | class ContentHandler(LLMContentHandler): 136 | content_type = "application/json" 137 | accepts = "application/json" 138 | 139 | def transform_input(self, prompt: str, model_kwargs: dict) -> bytes: 140 | system_prompt = "You are a helpful assistant. Always answer to questions as helpfully as possible." 
\ 141 | " If you don't know the answer to a question, say I don't know the answer" 142 | 143 | payload = { 144 | "inputs": [ 145 | [ 146 | {"role": "system", "content": system_prompt}, 147 | {"role": "user", "content": prompt}, 148 | ], 149 | ], 150 | "parameters": model_kwargs, 151 | } 152 | input_str = json.dumps(payload) 153 | return input_str.encode("utf-8") 154 | 155 | def transform_output(self, output: bytes) -> str: 156 | response_json = json.loads(output.read().decode("utf-8")) 157 | content = response_json[0]["generation"]["content"] 158 | return content 159 | 160 | content_handler = ContentHandler() 161 | 162 | # https://github.com/aws/amazon-sagemaker-examples/blob/main/introduction_to_amazon_algorithms/jumpstart-foundation-models/llama-2-text-completion.ipynb 163 | model_kwargs = { 164 | "max_new_tokens": 256, 165 | "top_p": 0.9, 166 | "temperature": 0.6, 167 | "return_full_text": False, 168 | } 169 | 170 | llm = SagemakerEndpoint( 171 | endpoint_name=text2text_model_endpoint, 172 | region_name=region, 173 | model_kwargs=model_kwargs, 174 | endpoint_kwargs={"CustomAttributes": "accept_eula=true"}, 175 | content_handler=content_handler 176 | ) 177 | 178 | vectorstore = PGVector( 179 | collection_name=collection_name, 180 | connection=CONNECTION_STRING, 181 | embeddings=_create_sagemaker_embeddings(embeddings_model_endpoint, region) 182 | ) 183 | retriever = vectorstore.as_retriever() 184 | 185 | prompt_template = """Answer based on context:\n\n{context}\n\n{question}""" 186 | 187 | PROMPT = PromptTemplate( 188 | template=prompt_template, input_variables=["context", "question"] 189 | ) 190 | 191 | condense_qa_template = """ 192 | Given the following conversation and a follow up question, rephrase the follow up question 193 | to be a standalone question. 194 | 195 | Chat History: 196 | {chat_history} 197 | Follow Up Input: {question} 198 | Standalone question:""" 199 | standalone_question_prompt = PromptTemplate.from_template(condense_qa_template) 200 | 201 | qa = ConversationalRetrievalChain.from_llm( 202 | llm=llm, 203 | retriever=retriever, 204 | condense_question_prompt=standalone_question_prompt, 205 | return_source_documents=True, 206 | combine_docs_chain_kwargs={"prompt":PROMPT}, 207 | verbose=False 208 | ) 209 | 210 | logger.info(f"\ntype('qa'): \"{type(qa)}\"\n") 211 | return qa 212 | 213 | 214 | def run_chain(chain, prompt: str, history=[]): 215 | return chain.invoke({"question": prompt, "chat_history": history}) 216 | 217 | 218 | if __name__ == "__main__": 219 | chat_history = [] 220 | qa = build_chain() 221 | print(bcolors.OKBLUE + "Hello! How can I help you?" + bcolors.ENDC) 222 | print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." + bcolors.ENDC) 223 | print(">", end=" ", flush=True) 224 | for query in sys.stdin: 225 | if (query.strip().lower().startswith("new search:")): 226 | query = query.strip().lower().replace("new search:","") 227 | chat_history = [] 228 | elif (len(chat_history) == MAX_HISTORY_LENGTH): 229 | chat_history.pop(0) 230 | result = run_chain(qa, query, chat_history) 231 | chat_history.append((query, result["answer"])) 232 | print(bcolors.OKGREEN + result['answer'] + bcolors.ENDC) 233 | if 'source_documents' in result: 234 | print(bcolors.OKGREEN + '\nSources:') 235 | for d in result['source_documents']: 236 | print(d.metadata['source']) 237 | print(bcolors.ENDC) 238 | print(bcolors.OKCYAN + "Ask a question, start a New search: or CTRL-D to exit." 
+ bcolors.ENDC)
239 |         print(">", end=" ", flush=True)
240 |     print(bcolors.OKBLUE + "Bye" + bcolors.ENDC)
--------------------------------------------------------------------------------
/app/qa-with-llm-and-rag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/rag-with-amazon-postgresql-using-pgvector-and-sagemaker/1b5ca45eff14b162e8be28cb179338e1ad4d7bbd/app/qa-with-llm-and-rag.png
--------------------------------------------------------------------------------
/app/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3>=1.26.159
2 | langchain>=0.3,<0.4
3 | langchain-community>=0.3,<0.4
4 | pgvector==0.2.5
5 | psycopg[binary]==3.1.19
6 | SQLAlchemy==2.0.28
7 | streamlit==1.37.0
--------------------------------------------------------------------------------
/cdk_stacks/.gitignore:
--------------------------------------------------------------------------------
 1 | *.swp
 2 | package-lock.json
 3 | __pycache__
 4 | .pytest_cache
 5 | .venv
 6 | *.egg-info
 7 | 
 8 | # CDK asset staging directory
 9 | .cdk.staging
10 | cdk.out
--------------------------------------------------------------------------------
/cdk_stacks/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # RAG Application CDK Python project!
 3 | 
 4 | ![rag_with_pgvector_arch](./rag_with_pgvector_arch.svg)
 5 | 
 6 | This is a QA application with LLMs and RAG, set up as a CDK development project in Python.
 7 | 
 8 | The `cdk.json` file tells the CDK Toolkit how to execute your app.
 9 | 
10 | This project is set up like a standard Python project. The initialization
11 | process also creates a virtualenv within this project, stored under the `.venv`
12 | directory. To create the virtualenv it assumes that there is a `python3`
13 | (or `python` for Windows) executable in your path with access to the `venv`
14 | package. If for any reason the automatic creation of the virtualenv fails,
15 | you can create the virtualenv manually.
16 | 
17 | To manually create a virtualenv on MacOS and Linux:
18 | 
19 | ```
20 | $ python3 -m venv .venv
21 | ```
22 | 
23 | After the init process completes and the virtualenv is created, you can use the following
24 | step to activate your virtualenv.
25 | 
26 | ```
27 | $ source .venv/bin/activate
28 | ```
29 | 
30 | If you are on a Windows platform, you would activate the virtualenv like this:
31 | 
32 | ```
33 | % .venv\Scripts\activate.bat
34 | ```
35 | 
36 | Once the virtualenv is activated, you can install the required dependencies.
37 | 
38 | ```
39 | (.venv) $ pip install -r requirements.txt
40 | ```
41 | 
42 | To add additional dependencies, for example other CDK libraries, just add
43 | them to your `setup.py` file and rerun the `pip install -r requirements.txt`
44 | command.
45 | 
46 | Before synthesizing the CloudFormation template, you should properly set the CDK context configuration file, `cdk.context.json`.
47 | 
48 | For example:
49 | 
50 | ```
 51 | {
 52 |   "db_cluster_name": "postgresql-cluster-name",
 53 |   "jumpstart_model_info": {
 54 |     "model_id": "huggingface-text2text-flan-t5-xl",
 55 |     "version": "2.1.0"
 56 |   },
 57 |   "sagemaker_studio_domain_name": "sagemaker-studio-domain-name"
 58 | }
 59 | ```
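
These context values are read inside the CDK stacks through the construct tree's context API; for instance, `rag_with_pgvector/aurora_postgresql.py` calls `self.node.try_get_context('db_cluster_name')`. A minimal sketch of how a stack might consume the values shown above (the `jumpstart_model_info` lookup here is illustrative):

```
db_cluster_name = self.node.try_get_context('db_cluster_name')
model_info = self.node.try_get_context('jumpstart_model_info')
model_id, model_version = model_info['model_id'], model_info['version']
```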
60 | 
61 | :information_source: The `model_id` and `version` provided by SageMaker JumpStart can be found in the [**SageMaker Built-in Algorithms with pre-trained Model Table**](https://sagemaker.readthedocs.io/en/stable/doc_utils/pretrainedmodels.html).
62 | 
63 | > :warning: **Important**: Make sure the Docker daemon is running.
64 | > Otherwise you will encounter the following errors:
65 | 
66 | ```
67 | ERROR: Cannot connect to the Docker daemon at unix://$HOME/.docker/run/docker.sock. Is the docker daemon running?
68 | jsii.errors.JavaScriptError:
69 |   Error: docker exited with status 1
70 | ```
71 | 
72 | At this point you can synthesize the CloudFormation template for this code.
73 | 
74 | ```
75 | (.venv) $ export CDK_DEFAULT_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
76 | (.venv) $ export CDK_DEFAULT_REGION=$(aws configure get region)
77 | (.venv) $ cdk synth --all
78 | ```
79 | 
80 | Now we will be able to deploy all the CDK stacks at once like this:
81 | 
82 | ```
83 | (.venv) $ cdk deploy --require-approval never --all
84 | ```
85 | 
86 | Or, we can provision each CDK stack one at a time like this:
87 | 
88 | #### Step 1: List all CDK Stacks
89 | 
90 | ```
91 | (.venv) $ cdk list
92 | RAGVpcStack
93 | RAGSageMakerStudioStack
94 | RAGPgVectorStack
95 | EmbeddingEndpointStack
96 | LLMEndpointStack
97 | ```
98 | 
99 | #### Step 2: Create the Aurora PostgreSQL cluster
100 | 
101 | ```
102 | (.venv) $ cdk deploy --require-approval never RAGVpcStack RAGPgVectorStack
103 | ```
104 | 
105 | #### Step 3: Create SageMaker Studio
106 | 
107 | ```
108 | (.venv) $ cdk deploy --require-approval never RAGSageMakerStudioStack
109 | ```
110 | 
111 | #### Step 4: Deploy the LLM Embedding Endpoint
112 | 
113 | ```
114 | (.venv) $ cdk deploy --require-approval never EmbeddingEndpointStack
115 | ```
116 | 
117 | #### Step 5: Deploy the Text Generation LLM Endpoint
118 | 
119 | ```
120 | (.venv) $ cdk deploy --require-approval never LLMEndpointStack
121 | ```
122 | 
123 | **Once all CDK stacks have been successfully created, proceed with the remaining steps of the [overall workflow](../README.md#overall-workflow).**
124 | 
125 | 
126 | ## Clean Up
127 | 
128 | Delete the CloudFormation stacks by running the command below.
129 | 
130 | ```
131 | (.venv) $ cdk destroy --all
132 | ```
133 | 
134 | ## Useful commands
135 | 
136 |  * `cdk ls`          list all stacks in the app
137 |  * `cdk synth`       emits the synthesized CloudFormation template
138 |  * `cdk deploy`      deploy this stack to your default AWS account/region
139 |  * `cdk diff`        compare deployed stack with current state
140 |  * `cdk docs`        open CDK documentation
141 | 
142 | Enjoy!
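
Once the stacks are deployed, you can inspect a stack's outputs (for example, the Aurora cluster endpoint and the Secrets Manager secret name that the application expects as `PGVECTOR_SECRET_ID`). A sketch using the AWS CLI, with the stack name taken from the `cdk list` output above:

```
(.venv) $ aws cloudformation describe-stacks --stack-name RAGPgVectorStack \
            --query "Stacks[0].Outputs" --output table
```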
143 | 144 | ## References 145 | 146 | * [Leverage pgvector and Amazon Aurora PostgreSQL for Natural Language Processing, Chatbots and Sentiment Analysis (2023-07-13)](https://aws.amazon.com/blogs/database/leverage-pgvector-and-amazon-aurora-postgresql-for-natural-language-processing-chatbots-and-sentiment-analysis/) 147 | * [Building AI-powered search in PostgreSQL using Amazon SageMaker and pgvector (2023-05-02)](https://aws.amazon.com/blogs/database/building-ai-powered-search-in-postgresql-using-amazon-sagemaker-and-pgvector/) 148 | * [Use proprietary foundation models from Amazon SageMaker JumpStart in Amazon SageMaker Studio (2023-06-27)](https://aws.amazon.com/blogs/machine-learning/use-proprietary-foundation-models-from-amazon-sagemaker-jumpstart-in-amazon-sagemaker-studio/) 149 | * [SageMaker Built-in Algorithms with pre-trained Model Table](https://sagemaker.readthedocs.io/en/stable/doc_utils/pretrainedmodels.html) 150 | * [AWS Deep Learning Containers Images](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html) 151 | * [Securing Amazon SageMaker Studio connectivity using a private VPC (2020-10-22)](https://aws.amazon.com/blogs/machine-learning/securing-amazon-sagemaker-studio-connectivity-using-a-private-vpc/) 152 | * [Connect SageMaker Studio Notebooks in a VPC to External Resources](https://docs.aws.amazon.com/sagemaker/latest/dg/studio-notebooks-and-internet-access.html) 153 | * [Give SageMaker Processing Jobs Access to Resources in Your Amazon VPC](https://docs.aws.amazon.com/sagemaker/latest/dg/process-vpc.html) 154 | * **Configure the VPC Security Group** 155 | * In distributed processing, you must allow communication between the different containers in the same processing job. To do that, configure a rule for your security group that allows inbound connections between members of the same security group. 156 | * [Using the Amazon SageMaker Studio Image Build CLI to build container images from your Studio notebooks (2020-09-14)](https://aws.amazon.com/blogs/machine-learning/using-the-amazon-sagemaker-studio-image-build-cli-to-build-container-images-from-your-studio-notebooks/) 157 | * [How can I troubleshoot the InternalServerError response on Amazon SageMaker? 
- AWS re:Post](https://repost.aws/knowledge-center/sagemaker-http-500-internal-server-error) 158 | -------------------------------------------------------------------------------- /cdk_stacks/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | 9 | from rag_with_pgvector import ( 10 | VpcStack, 11 | AuroraPostgresqlStack, 12 | SageMakerStudioStack, 13 | EmbeddingEndpointStack, 14 | LLMEndpointStack 15 | ) 16 | 17 | APP_ENV = cdk.Environment( 18 | account=os.environ["CDK_DEFAULT_ACCOUNT"], 19 | region=os.environ["CDK_DEFAULT_REGION"] 20 | ) 21 | 22 | app = cdk.App() 23 | 24 | vpc_stack = VpcStack(app, 'RAGVpcStack', 25 | env=APP_ENV) 26 | 27 | aurora_pgsql_stack = AuroraPostgresqlStack(app, 'RAGPgVectorStack', 28 | vpc_stack.vpc, 29 | env=APP_ENV 30 | ) 31 | aurora_pgsql_stack.add_dependency(vpc_stack) 32 | 33 | sm_studio_stack = SageMakerStudioStack(app, 'RAGSageMakerStudioStack', 34 | vpc_stack.vpc, 35 | aurora_pgsql_stack.sg_rds_client, 36 | env=APP_ENV 37 | ) 38 | sm_studio_stack.add_dependency(aurora_pgsql_stack) 39 | 40 | sm_embedding_endpoint = EmbeddingEndpointStack(app, 'EmbeddingEndpointStack', 41 | env=APP_ENV 42 | ) 43 | sm_embedding_endpoint.add_dependency(sm_studio_stack) 44 | 45 | sm_llm_endpoint = LLMEndpointStack(app, 'LLMEndpointStack', 46 | env=APP_ENV 47 | ) 48 | sm_llm_endpoint.add_dependency(sm_studio_stack) 49 | 50 | app.synth() 51 | -------------------------------------------------------------------------------- /cdk_stacks/cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "db_cluster_name": "rag-pgvector-demo", 3 | "jumpstart_model_info": { 4 | "model_id": "meta-textgeneration-llama-2-7b-f", 5 | "version": "2.0.1" 6 | }, 7 | "sagemaker_studio_domain_name": "llm-app-rag-pgvector" 8 | } 9 | -------------------------------------------------------------------------------- /cdk_stacks/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 19 | "@aws-cdk/core:checkSecretUsage": true, 20 | "@aws-cdk/core:target-partitions": [ 21 | "aws", 22 | "aws-cn" 23 | ], 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 29 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 30 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 31 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 32 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 33 | "@aws-cdk/core:enablePartitionLiterals": true, 34 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 35 | "@aws-cdk/aws-iam:standardizedServicePrincipals": true, 36 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 37 | "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, 38 | 
"@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, 39 | "@aws-cdk/aws-route53-patters:useCertificate": true, 40 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false, 41 | "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, 42 | "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, 43 | "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, 44 | "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, 45 | "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, 46 | "@aws-cdk/aws-redshift:columnId": true, 47 | "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, 48 | "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, 49 | "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, 50 | "@aws-cdk/aws-kms:aliasNameRef": true, 51 | "@aws-cdk/core:includePrefixInUniqueNameGeneration": true 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /cdk_stacks/rag_with_pgvector/__init__.py: -------------------------------------------------------------------------------- 1 | from .vpc import VpcStack 2 | from .aurora_postgresql import AuroraPostgresqlStack 3 | from .sm_studio import SageMakerStudioStack 4 | from .sm_embedding_endpoint import EmbeddingEndpointStack 5 | from .sm_llm_endpoint import LLMEndpointStack -------------------------------------------------------------------------------- /cdk_stacks/rag_with_pgvector/aurora_postgresql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | 7 | import aws_cdk as cdk 8 | from aws_cdk import ( 9 | Stack, 10 | aws_ec2, 11 | aws_logs, 12 | aws_rds 13 | ) 14 | from constructs import Construct 15 | 16 | class AuroraPostgresqlStack(Stack): 17 | 18 | def __init__(self, scope: Construct, construct_id: str, vpc, **kwargs) -> None: 19 | super().__init__(scope, construct_id, **kwargs) 20 | 21 | sg_postgresql_client = aws_ec2.SecurityGroup(self, 'PostgreSQLClientSG', 22 | vpc=vpc, 23 | allow_all_outbound=True, 24 | description='security group for postgresql client', 25 | security_group_name='postgresql-client-sg' 26 | ) 27 | cdk.Tags.of(sg_postgresql_client).add('Name', 'postgresql-client-sg') 28 | 29 | sg_postgresql_server = aws_ec2.SecurityGroup(self, 'PostgreSQLServerSG', 30 | vpc=vpc, 31 | allow_all_outbound=True, 32 | description='security group for postgresql', 33 | security_group_name='postgresql-server-sg' 34 | ) 35 | sg_postgresql_server.add_ingress_rule(peer=sg_postgresql_server, connection=aws_ec2.Port.all_tcp(), 36 | description='postgresql-server-sg') 37 | sg_postgresql_server.add_ingress_rule(peer=sg_postgresql_client, connection=aws_ec2.Port.tcp(5432), 38 | description='postgresql-client-sg') 39 | cdk.Tags.of(sg_postgresql_server).add('Name', 'postgresql-server-sg') 40 | 41 | rds_subnet_group = aws_rds.SubnetGroup(self, 'PostgreSQLSubnetGroup', 42 | description='subnet group for postgresql', 43 | subnet_group_name=f'{self.stack_name}-aurora-postgresql', 44 | vpc_subnets=aws_ec2.SubnetSelection(subnet_type=aws_ec2.SubnetType.PRIVATE_WITH_EGRESS), 45 | vpc=vpc 46 | ) 47 | 48 | db_cluster_name = self.node.try_get_context('db_cluster_name') 49 | rds_credentials = aws_rds.Credentials.from_generated_secret("postgres") 50 | 51 | AURORA_POSTGRES_ENGINE_VERSION = aws_rds.AuroraPostgresEngineVersion.VER_15_3 52 | rds_engine = 
aws_rds.DatabaseClusterEngine.aurora_postgres(version=AURORA_POSTGRES_ENGINE_VERSION) 53 | 54 | #XXX: https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/AuroraPostgreSQL.Reference.ParameterGroups.html#AuroraPostgreSQL.Reference.Parameters.Cluster 55 | rds_cluster_param_group = aws_rds.ParameterGroup(self, 'AuroraPostgreSQLClusterParamGroup', 56 | engine=rds_engine, 57 | description=f'Custom cluster parameter group for aurora-postgresql{AURORA_POSTGRES_ENGINE_VERSION.aurora_postgres_major_version}', 58 | parameters={ 59 | 'log_min_duration_statement': '15000', # 15 sec 60 | 'default_transaction_isolation': 'read committed', 61 | 'client_encoding': 'UTF8', 62 | 'rds.allowed_extensions': '*', 63 | 'shared_preload_libraries': 'pg_stat_statements,pg_similarity' 64 | } 65 | ) 66 | 67 | #XXX: https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/AuroraPostgreSQL.Reference.ParameterGroups.html#AuroraPostgreSQL.Reference.Parameters.Instance 68 | rds_db_param_group = aws_rds.ParameterGroup(self, 'AuroraPostgreSQLDBParamGroup', 69 | engine=rds_engine, 70 | description=f'Custom parameter group for aurora-postgresql{AURORA_POSTGRES_ENGINE_VERSION.aurora_postgres_major_version}', 71 | parameters={ 72 | 'log_min_duration_statement': '15000', # 15 sec 73 | 'default_transaction_isolation': 'read committed', 74 | 'rds.allowed_extensions': '*', 75 | 'shared_preload_libraries': 'pg_stat_statements,pg_similarity' 76 | } 77 | ) 78 | 79 | db_cluster = aws_rds.DatabaseCluster(self, 'AuroraPostgresDBCluster', 80 | engine=rds_engine, 81 | credentials=rds_credentials, # A username of 'admin' (or 'postgres' for PostgreSQL) and SecretsManager-generated password 82 | writer=aws_rds.ClusterInstance.provisioned("Writer", 83 | instance_type=aws_ec2.InstanceType.of(aws_ec2.InstanceClass.MEMORY6_GRAVITON, aws_ec2.InstanceSize.LARGE), 84 | parameter_group=rds_db_param_group, 85 | auto_minor_version_upgrade=False, 86 | ), 87 | readers=[ 88 | aws_rds.ClusterInstance.provisioned("Reader", 89 | instance_type=aws_ec2.InstanceType.of(aws_ec2.InstanceClass.MEMORY6_GRAVITON, aws_ec2.InstanceSize.LARGE), 90 | parameter_group=rds_db_param_group, 91 | auto_minor_version_upgrade=False 92 | ) 93 | ], 94 | parameter_group=rds_cluster_param_group, 95 | cloudwatch_logs_retention=aws_logs.RetentionDays.THREE_DAYS, 96 | cluster_identifier=db_cluster_name, 97 | subnet_group=rds_subnet_group, 98 | backup=aws_rds.BackupProps( 99 | retention=cdk.Duration.days(3), 100 | preferred_window="03:00-04:00" 101 | ), 102 | security_groups=[sg_postgresql_server], 103 | vpc=vpc, 104 | vpc_subnets=aws_ec2.SubnetSelection(subnet_type=aws_ec2.SubnetType.PRIVATE_WITH_EGRESS) 105 | ) 106 | db_cluster.apply_removal_policy(cdk.RemovalPolicy.DESTROY) #XXX: For testing 107 | 108 | self.rds_credentials = db_cluster.secret 109 | self.sg_rds_client = sg_postgresql_client 110 | 111 | 112 | cdk.CfnOutput(self, 'DBClusterId', value=db_cluster.cluster_identifier, export_name='VectorDBClusterId') 113 | cdk.CfnOutput(self, 'DBClusterEndpoint', value=db_cluster.cluster_endpoint.socket_address, export_name='VectorDBClusterEndpoint') 114 | cdk.CfnOutput(self, 'DBClusterReadEndpoint', value=db_cluster.cluster_read_endpoint.socket_address, export_name='VectorDBClusterReadEndpoint') 115 | #XXX: https://docs.aws.amazon.com/cdk/api/latest/python/aws_cdk.aws_secretsmanager/README.html 116 | # secret_arn="arn:aws:secretsmanager:::secret:-" 117 | cdk.CfnOutput(self, 'DBSecret', value=db_cluster.secret.secret_name, export_name='VectorDBSecret') 118 | cdk.CfnOutput(self, 
'DBClientSecurityGroupId', value=sg_postgresql_client.security_group_id, export_name='VectorDBClientSecurityGroupId') 119 | 120 | -------------------------------------------------------------------------------- /cdk_stacks/rag_with_pgvector/sm_embedding_endpoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Stack 12 | ) 13 | from constructs import Construct 14 | 15 | from cdklabs.generative_ai_cdk_constructs import ( 16 | CustomSageMakerEndpoint, 17 | DeepLearningContainerImage, 18 | SageMakerInstanceType, 19 | ) 20 | 21 | random.seed(47) 22 | 23 | 24 | class EmbeddingEndpointStack(Stack): 25 | 26 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 27 | super().__init__(scope, construct_id, **kwargs) 28 | 29 | bucket_name = f'jumpstart-cache-prod-{cdk.Aws.REGION}' 30 | key_name = 'huggingface-infer/prepack/v1.0.0/infer-prepack-huggingface-textembedding-gpt-j-6b-fp16.tar.gz' 31 | 32 | RANDOM_GUID = ''.join(random.sample(string.digits, k=7)) 33 | endpoint_name = f"gpt-j-6b-fp16-endpoint-{RANDOM_GUID}" 34 | 35 | #XXX: https://github.com/awslabs/generative-ai-cdk-constructs/blob/main/src/patterns/gen-ai/aws-model-deployment-sagemaker/README_custom_sagemaker_endpoint.md 36 | self.embedding_endpoint = CustomSageMakerEndpoint(self, 'EmbeddingEndpoint', 37 | model_id='gpt-j-6b-fp16', 38 | instance_type=SageMakerInstanceType.ML_G5_2_XLARGE, 39 | container=DeepLearningContainerImage.from_deep_learning_container_image( 40 | 'pytorch-inference', 41 | '1.12.0-gpu-py38' 42 | ), 43 | model_data_url=f's3://{bucket_name}/{key_name}', 44 | endpoint_name=endpoint_name, 45 | instance_count=1, 46 | # volume_size_in_gb=100 47 | ) 48 | 49 | cdk.CfnOutput(self, 'EmbeddingEndpointName', 50 | value=self.embedding_endpoint.cfn_endpoint.endpoint_name, 51 | export_name=f'{self.stack_name}-EmbeddingEndpointName') 52 | cdk.CfnOutput(self, 'EmbeddingEndpointArn', 53 | value=self.embedding_endpoint.endpoint_arn, 54 | export_name=f'{self.stack_name}-EmbeddingEndpointArn') 55 | -------------------------------------------------------------------------------- /cdk_stacks/rag_with_pgvector/sm_llm_endpoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Stack 12 | ) 13 | from constructs import Construct 14 | 15 | from cdklabs.generative_ai_cdk_constructs import ( 16 | JumpStartSageMakerEndpoint, 17 | JumpStartModel, 18 | SageMakerInstanceType 19 | ) 20 | 21 | random.seed(47) 22 | 23 | 24 | def name_from_base(base, max_length=63): 25 | unique = ''.join(random.sample(string.digits, k=7)) 26 | max_length = 63 27 | trimmed_base = base[: max_length - len(unique) - 1] 28 | return "{}-{}".format(trimmed_base, unique) 29 | 30 | 31 | class LLMEndpointStack(Stack): 32 | 33 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 34 | super().__init__(scope, construct_id, **kwargs) 35 | 36 | jumpstart_model = self.node.try_get_context('jumpstart_model_info') 37 | model_id, model_version = jumpstart_model.get('model_id', 'meta-textgeneration-llama-2-7b-f'), jumpstart_model.get('version', 
'2.0.1') 38 | model_name = f"{model_id.upper().replace('-', '_')}_{model_version.replace('.', '_')}" 39 | 40 | llm_endpoint_name = name_from_base(model_id.replace('/', '-').replace('.', '-')) 41 | 42 | #XXX: Available JumpStart Model List 43 | # https://github.com/awslabs/generative-ai-cdk-constructs/blob/main/src/patterns/gen-ai/aws-model-deployment-sagemaker/jumpstart-model.ts 44 | llm_endpoint = JumpStartSageMakerEndpoint(self, 'LLMEndpoint', 45 | model=JumpStartModel.of(model_name), 46 | accept_eula=True, 47 | instance_type=SageMakerInstanceType.ML_G5_2_XLARGE, 48 | endpoint_name=llm_endpoint_name 49 | ) 50 | 51 | cdk.CfnOutput(self, 'LLMEndpointName', 52 | value=llm_endpoint.cfn_endpoint.endpoint_name, 53 | export_name=f'{self.stack_name}-LLMEndpointName') 54 | cdk.CfnOutput(self, 'LLMEndpointArn', 55 | value=llm_endpoint.endpoint_arn, 56 | export_name=f'{self.stack_name}-LLMEndpointArn') 57 | -------------------------------------------------------------------------------- /cdk_stacks/rag_with_pgvector/sm_studio.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import random 6 | import string 7 | 8 | import aws_cdk as cdk 9 | 10 | from aws_cdk import ( 11 | Stack, 12 | aws_ec2, 13 | aws_iam, 14 | aws_sagemaker 15 | ) 16 | from constructs import Construct 17 | 18 | random.seed(47) 19 | 20 | class SageMakerStudioStack(Stack): 21 | 22 | def __init__(self, scope: Construct, construct_id: str, vpc, sg_rds_client, **kwargs) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | sagemaker_execution_policy_doc = aws_iam.PolicyDocument() 26 | sagemaker_execution_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 27 | "effect": aws_iam.Effect.ALLOW, 28 | "resources": ["arn:aws:s3:::*"], 29 | "actions": [ 30 | "s3:GetObject", 31 | "s3:PutObject", 32 | "s3:DeleteObject", 33 | "s3:ListBucket" 34 | ] 35 | })) 36 | 37 | sagemaker_custom_access_policy_doc = aws_iam.PolicyDocument() 38 | sagemaker_custom_access_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 39 | "effect": aws_iam.Effect.ALLOW, 40 | "resources": [f"arn:aws:secretsmanager:{cdk.Aws.REGION}:{cdk.Aws.ACCOUNT_ID}:secret:*"], 41 | "actions": ["secretsmanager:GetSecretValue"] 42 | })) 43 | 44 | sagemaker_docker_build_policy_doc = aws_iam.PolicyDocument() 45 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 46 | "effect": aws_iam.Effect.ALLOW, 47 | "resources": ["*"], 48 | "actions": ["ecr:GetAuthorizationToken"] 49 | })) 50 | 51 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 52 | "effect": aws_iam.Effect.ALLOW, 53 | "resources": ["*"], 54 | "actions": [ 55 | "ecr:BatchGetImage", 56 | "ecr:BatchCheckLayerAvailability", 57 | "ecr:CompleteLayerUpload", 58 | "ecr:DescribeImages", 59 | "ecr:DescribeRepositories", 60 | "ecr:GetDownloadUrlForLayer", 61 | "ecr:InitiateLayerUpload", 62 | "ecr:ListImages", 63 | "ecr:PutImage", 64 | "ecr:UploadLayerPart", 65 | "ecr:CreateRepository", 66 | "ecr:GetAuthorizationToken", 67 | "ec2:DescribeAvailabilityZones" 68 | ] 69 | })) 70 | 71 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 72 | "effect": aws_iam.Effect.ALLOW, 73 | "resources": ["arn:aws:codebuild:*:*:project/sagemaker-studio*"], 74 | "actions": [ 75 | "codebuild:DeleteProject", 76 | "codebuild:CreateProject", 77 | "codebuild:BatchGetBuilds", 78 | "codebuild:StartBuild" 79 | ] 80 | 
})) 81 | 82 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 83 | "effect": aws_iam.Effect.ALLOW, 84 | "resources": ["arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*"], 85 | "actions": ["logs:CreateLogStream"], 86 | })) 87 | 88 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 89 | "effect": aws_iam.Effect.ALLOW, 90 | "resources": ["arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*"], 91 | "actions": [ 92 | "logs:GetLogEvents", 93 | "logs:PutLogEvents" 94 | ] 95 | })) 96 | 97 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 98 | "effect": aws_iam.Effect.ALLOW, 99 | "resources": ["*"], 100 | "actions": ["logs:CreateLogGroup"] 101 | })) 102 | 103 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 104 | "effect": aws_iam.Effect.ALLOW, 105 | "resources": ["arn:aws:s3:::sagemaker-*/*"], 106 | "actions": [ 107 | "s3:GetObject", 108 | "s3:DeleteObject", 109 | "s3:PutObject" 110 | ] 111 | })) 112 | 113 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 114 | "effect": aws_iam.Effect.ALLOW, 115 | "resources": ["arn:aws:s3:::sagemaker*"], 116 | "actions": ["s3:CreateBucket"], 117 | })) 118 | 119 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 120 | "effect": aws_iam.Effect.ALLOW, 121 | "resources": ["*"], 122 | "actions": [ 123 | "iam:GetRole", 124 | "iam:ListRoles" 125 | ] 126 | })) 127 | 128 | sagemaker_docker_build_policy_doc.add_statements(aws_iam.PolicyStatement(**{ 129 | "effect": aws_iam.Effect.ALLOW, 130 | "resources": ["arn:aws:iam::*:role/*"], 131 | "conditions": { 132 | "StringLikeIfExists": { 133 | "iam:PassedToService": [ 134 | "codebuild.amazonaws.com" 135 | ] 136 | } 137 | }, 138 | "actions": ["iam:PassRole"] 139 | })) 140 | 141 | sagemaker_execution_role = aws_iam.Role(self, 'SageMakerExecutionRole', 142 | role_name='AmazonSageMakerStudioExecutionRole-{suffix}'.format(suffix=''.join(random.choices((string.digits), k=5))), 143 | assumed_by=aws_iam.ServicePrincipal('sagemaker.amazonaws.com'), 144 | path='/', 145 | inline_policies={ 146 | 'sagemaker-execution-policy': sagemaker_execution_policy_doc, 147 | 'sagemaker-custom-access-policy': sagemaker_custom_access_policy_doc, 148 | 'sagemaker-docker-build-policy': sagemaker_docker_build_policy_doc, 149 | }, 150 | managed_policies=[ 151 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSageMakerFullAccess'), 152 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonSageMakerCanvasFullAccess'), 153 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AWSCloudFormationReadOnlyAccess'), 154 | # aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonVPCReadOnlyAccess'), 155 | aws_iam.ManagedPolicy.from_aws_managed_policy_name('AmazonRDSReadOnlyAccess'), 156 | ] 157 | ) 158 | 159 | #XXX: To use the sm-docker CLI, the Amazon SageMaker execution role used by the Studio notebook 160 | # environment should have a trust policy with CodeBuild 161 | sagemaker_execution_role.assume_role_policy.add_statements(aws_iam.PolicyStatement(**{ 162 | "effect": aws_iam.Effect.ALLOW, 163 | "principals": [aws_iam.ServicePrincipal('codebuild.amazonaws.com')], 164 | "actions": ["sts:AssumeRole"] 165 | })) 166 | 167 | sm_studio_user_settings = aws_sagemaker.CfnDomain.UserSettingsProperty( 168 | execution_role=sagemaker_execution_role.role_arn 169 | ) 170 | 171 | sg_sagemaker_domain = aws_ec2.SecurityGroup(self, 'SageMakerDomainSG', 172 | 
vpc=vpc, 173 | allow_all_outbound=True, 174 | description='security group for sagemaker studio domain', 175 | security_group_name='sagemaker-domain-sg' 176 | ) 177 | sg_sagemaker_domain.add_ingress_rule(peer=sg_sagemaker_domain, connection=aws_ec2.Port.all_tcp(), 178 | description='All traffic within the sagemaker domain security group') 179 | sg_sagemaker_domain.add_ingress_rule(peer=aws_ec2.Peer.ipv4("0.0.0.0/0"), connection=aws_ec2.Port.tcp(443), 180 | description='https') 181 | cdk.Tags.of(sg_sagemaker_domain).add('Name', 'sagemaker-domain-sg') 182 | 183 | sm_studio_domain_name = self.node.try_get_context('sagemaker_studio_domain_name') or 'llm-app-rag-pgvector' 184 | 185 | sagemaker_studio_domain = aws_sagemaker.CfnDomain(self, 'SageMakerStudioDomain', 186 | auth_mode='IAM', # [SSO | IAM] 187 | default_user_settings=sm_studio_user_settings, 188 | domain_name=sm_studio_domain_name, 189 | subnet_ids=vpc.select_subnets(subnet_type=aws_ec2.SubnetType.PRIVATE_WITH_EGRESS).subnet_ids, 190 | vpc_id=vpc.vpc_id, 191 | app_network_access_type='VpcOnly', # [PublicInternetOnly | VpcOnly] 192 | domain_settings=aws_sagemaker.CfnDomain.DomainSettingsProperty( 193 | security_group_ids=[sg_sagemaker_domain.security_group_id] 194 | ) 195 | ) 196 | 197 | #XXX: https://docs.aws.amazon.com/sagemaker/latest/dg/studio-jl.html#studio-jl-set 198 | sagemaker_jupyterlab_arn = self.node.try_get_context('sagmaker_jupyterlab_arn') 199 | 200 | default_user_settings = aws_sagemaker.CfnUserProfile.UserSettingsProperty( 201 | jupyter_server_app_settings=aws_sagemaker.CfnUserProfile.JupyterServerAppSettingsProperty( 202 | default_resource_spec=aws_sagemaker.CfnUserProfile.ResourceSpecProperty( 203 | #XXX: JupyterServer apps only support the system value. 204 | instance_type="system", 205 | sage_maker_image_arn=sagemaker_jupyterlab_arn 206 | ) 207 | ), 208 | security_groups=[ 209 | sg_sagemaker_domain.security_group_id, 210 | sg_rds_client.security_group_id 211 | ] 212 | ) 213 | 214 | sagemaker_user_profile = aws_sagemaker.CfnUserProfile(self, 'SageMakerStudioUserProfile', 215 | domain_id=sagemaker_studio_domain.attr_domain_id, 216 | user_profile_name='default-user', 217 | user_settings=default_user_settings 218 | ) 219 | 220 | 221 | cdk.CfnOutput(self, 'DomainUrl', value=sagemaker_studio_domain.attr_url, 222 | export_name=f'{self.stack_name}-DomainUrl') 223 | cdk.CfnOutput(self, 'DomainId', value=sagemaker_user_profile.domain_id, 224 | export_name=f'{self.stack_name}-DomainId') 225 | cdk.CfnOutput(self, 'UserProfileName', value=sagemaker_user_profile.user_profile_name, 226 | export_name=f'{self.stack_name}-UserProfileName') 227 | cdk.CfnOutput(self, 'DomainSecurityGroupId', value=sg_sagemaker_domain.security_group_id, 228 | export_name=f'{self.stack_name}-DomainSecurityGroupId') -------------------------------------------------------------------------------- /cdk_stacks/rag_with_pgvector/vpc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # vim: tabstop=2 shiftwidth=2 softtabstop=2 expandtab 4 | 5 | import os 6 | import aws_cdk as cdk 7 | 8 | from aws_cdk import ( 9 | Stack, 10 | aws_ec2, 11 | ) 12 | from constructs import Construct 13 | 14 | 15 | class VpcStack(Stack): 16 | 17 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 18 | super().__init__(scope, construct_id, **kwargs) 19 | 20 | #XXX: To create the CDK stacks in an existing VPC, 21 | # uncomment the code below and 22 | # 
comment out the vpc = aws_ec2.Vpc(..) code, 23 | # then pass -c vpc_name=your-existing-vpc to the cdk command, 24 | # for example, 25 | # cdk -c vpc_name=your-existing-vpc synth 26 | # 27 | if str(os.environ.get('USE_DEFAULT_VPC', 'false')).lower() == 'true': 28 | vpc_name = self.node.try_get_context('vpc_name') or 'default' 29 | self.vpc = aws_ec2.Vpc.from_lookup(self, 'ExistingVPC', 30 | is_default=True, 31 | vpc_name=vpc_name 32 | ) 33 | else: 34 | #XXX: To use more than 2 AZs, be sure to specify the account and region on your stack. 35 | #XXX: https://docs.aws.amazon.com/cdk/api/latest/python/aws_cdk.aws_ec2/Vpc.html 36 | self.vpc = aws_ec2.Vpc(self, 'RAGAppVPC', 37 | ip_addresses=aws_ec2.IpAddresses.cidr("10.0.0.0/16"), 38 | max_azs=3, 39 | 40 | # 'subnetConfiguration' specifies the "subnet groups" to create. 41 | # Every subnet group will have a subnet for each AZ, so this 42 | # configuration will create `2 groups × 3 AZs = 6` subnets. 43 | subnet_configuration=[ 44 | { 45 | "cidrMask": 20, 46 | "name": "Public", 47 | "subnetType": aws_ec2.SubnetType.PUBLIC, 48 | }, 49 | { 50 | "cidrMask": 20, 51 | "name": "Private", 52 | "subnetType": aws_ec2.SubnetType.PRIVATE_WITH_EGRESS 53 | } 54 | ], 55 | gateway_endpoints={ 56 | "S3": aws_ec2.GatewayVpcEndpointOptions( 57 | service=aws_ec2.GatewayVpcEndpointAwsService.S3 58 | ) 59 | } 60 | ) 61 | 62 | cdk.CfnOutput(self, 'VPCID', value=self.vpc.vpc_id, 63 | export_name=f'{self.stack_name}-VPCID') 64 | -------------------------------------------------------------------------------- /cdk_stacks/rag_with_pgvector_arch.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
[Architecture diagram: a Streamlit web application takes the user question and returns the LLM-generated response. Real-time flow on user query: the app calls an Amazon SageMaker endpoint (LLM for embeddings) and an Amazon SageMaker endpoint (LLM for text generation), and retrieves matching documents from Amazon Aurora Postgresql (vector database). Offline data ingestion: an Amazon SageMaker processing job converts the knowledge corpus in Amazon S3 into embeddings and loads them into the vector database. All components run in a single AWS account.]
-------------------------------------------------------------------------------- /cdk_stacks/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.171.1 2 | constructs>=10.0.0,<11.0.0 3 | cdklabs.generative-ai-cdk-constructs==0.1.286 -------------------------------------------------------------------------------- /cdk_stacks/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /data_ingestion_to_vectordb/container/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10.13-slim 2 | 3 | # pip leaves the install caches populated which uses a 4 | # significant amount of space. These optimizations save a fair 5 | # amount of space in the image, which reduces start up time. 6 | RUN pip --no-cache-dir install -U pip 7 | RUN pip --no-cache-dir install boto3==1.33.9 \ 8 | langchain==0.2.5 \ 9 | langchain-community==0.2.4 \ 10 | langchain-postgres==0.0.7 \ 11 | SQLAlchemy==2.0.28 \ 12 | psycopg[binary]==3.1.19 \ 13 | pgvector==0.2.5 \ 14 | beautifulsoup4==4.12.3 15 | 16 | 17 | # Include python script for retrieving credentials 18 | # from AWS SecretsManager and Sagemaker helper classes 19 | ADD credentials.py /code/ 20 | ADD sm_helper.py /code/ 21 | 22 | # Set some environment variables. PYTHONUNBUFFERED keeps Python from buffering our standard 23 | # output stream, which means that logs can be delivered to the user quickly. PYTHONDONTWRITEBYTECODE 24 | # keeps Python from writing the .pyc files, which are unnecessary in this case. 25 | # (No PATH changes are needed here; the processing job invokes the script with python3 directly.)
26 | ENV PYTHONUNBUFFERED=TRUE 27 | ENV PYTHONDONTWRITEBYTECODE=TRUE -------------------------------------------------------------------------------- /data_ingestion_to_vectordb/container/credentials.py: -------------------------------------------------------------------------------- 1 | """ 2 | Retrieve the database credentials (username, password, host, port) stored in a given AWS Secrets Manager secret 3 | """ 4 | import json 5 | import boto3 6 | 7 | def get_credentials(secret_id: str, region_name: str) -> dict: 8 | 9 | client = boto3.client('secretsmanager', region_name=region_name) 10 | response = client.get_secret_value(SecretId=secret_id) 11 | secrets_value = json.loads(response['SecretString']) 12 | 13 | return secrets_value -------------------------------------------------------------------------------- /data_ingestion_to_vectordb/container/load_data_into_pgvector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | # this is needed because credentials.py and sm_helper.py 5 | # are in the /code directory of the custom container we are going 6 | # to create for the Sagemaker Processing Job 7 | sys.path.insert(1, '/code') 8 | 9 | import glob 10 | import time 11 | import logging 12 | import argparse 13 | import multiprocessing as mp 14 | from functools import partial 15 | 16 | import urllib 17 | 18 | import numpy as np 19 | 20 | from langchain_community.document_loaders import ReadTheDocsLoader 21 | from langchain_postgres import PGVector 22 | from langchain.text_splitter import RecursiveCharacterTextSplitter 23 | 24 | from credentials import get_credentials 25 | from sm_helper import create_sagemaker_embeddings_from_js_model 26 | 27 | 28 | logger = logging.getLogger() 29 | logging.basicConfig(format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s', level=logging.INFO, stream=sys.stderr) 30 | 31 | 32 | def process_shard(shard, embeddings_model_endpoint_name, aws_region, collection_name, connection_string) -> int: 33 | logger.info(f'Starting process_shard of {len(shard)} chunks.') 34 | st = time.time() 35 | 36 | embeddings = create_sagemaker_embeddings_from_js_model(embeddings_model_endpoint_name, aws_region) 37 | 38 | vectordb = PGVector.from_existing_index( 39 | embedding=embeddings, 40 | collection_name=collection_name, 41 | connection=connection_string) 42 | 43 | vectordb.add_documents(documents=shard) 44 | 45 | et = time.time() - st 46 | logger.info(f'Shard completed in {et} seconds.') 47 | return 0 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | 53 | parser.add_argument("--pgvector-secretid", type=str, default=None) 54 | parser.add_argument("--pgvector-collection-name", type=str, default=None) 55 | 56 | parser.add_argument("--aws-region", type=str, default="us-east-1") 57 | parser.add_argument("--embeddings-model-endpoint-name", type=str, default=None) 58 | parser.add_argument("--chunk-size-for-doc-split", type=int, default=500) 59 | parser.add_argument("--chunk-overlap-for-doc-split", type=int, default=30) 60 | parser.add_argument("--input-data-dir", type=str, default="/opt/ml/processing/input_data") 61 | parser.add_argument("--max-docs-per-put", type=int, default=10) 62 | parser.add_argument("--process-count", type=int, default=1) 63 | parser.add_argument("--create-index-hint-file", type=str, default="_create_index_hint") 64 | 65 | args, _ = parser.parse_known_args() 66 | logger.info("Received arguments {}".format(args)) 67 | 68 | # list all the files 69 | files =
glob.glob(os.path.join(args.input_data_dir, "*.*")) 70 | logger.info(f"there are {len(files)} files to process in the {args.input_data_dir} folder") 71 | 72 | # retrieve secret to talk to Amazon Aurora Postgresql 73 | secret = get_credentials(args.pgvector_secretid, args.aws_region) 74 | db_username = secret['username'] 75 | db_password = urllib.parse.quote_plus(secret['password']) 76 | db_port = secret['port'] 77 | db_host = secret['host'] 78 | 79 | CONNECTION_STRING = PGVector.connection_string_from_db_params( 80 | driver = 'psycopg', 81 | user = db_username, 82 | password = db_password, 83 | host = db_host, 84 | port = db_port, 85 | database = '' 86 | ) 87 | 88 | logger.info(f'input-data-dir: {args.input_data_dir}') 89 | loader = ReadTheDocsLoader(args.input_data_dir) 90 | text_splitter = RecursiveCharacterTextSplitter( 91 | # Set a really small chunk size, just to show. 92 | chunk_size=args.chunk_size_for_doc_split, 93 | chunk_overlap=args.chunk_overlap_for_doc_split, 94 | length_function=len, 95 | ) 96 | 97 | # Stage one: read all the docs, split them into chunks. 98 | st = time.time() 99 | 100 | logger.info('Loading documents ...') 101 | docs = loader.load() 102 | logger.info(f'{len(docs)} documents have been loaded') 103 | 104 | # add a custom metadata field, such as timestamp 105 | for doc in docs: 106 | doc.metadata['timestamp'] = time.time() 107 | doc.metadata['embeddings_model'] = args.embeddings_model_endpoint_name 108 | chunks = text_splitter.create_documents([doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs]) 109 | 110 | et = time.time() - st 111 | logger.info(f'Time taken: {et} seconds. {len(chunks)} chunks generated') 112 | 113 | db_shards = (len(chunks) // args.max_docs_per_put) + 1 114 | print(f'Loading chunks into vector store ... 
using {db_shards} shards') 115 | 116 | st = time.time() 117 | shards = np.array_split(chunks, db_shards) 118 | 119 | path = os.path.join(args.input_data_dir, args.create_index_hint_file) 120 | if os.path.isfile(path) is True: 121 | logger.info(f"{path} file is present, " 122 | f"will try to create the {args.pgvector_collection_name} collection") 123 | 124 | embeddings = create_sagemaker_embeddings_from_js_model(args.embeddings_model_endpoint_name, args.aws_region) 125 | _ = PGVector(collection_name=args.pgvector_collection_name, 126 | connection=CONNECTION_STRING, 127 | embeddings=embeddings) 128 | else: 129 | logger.info(f"{path} file is not present, " 130 | f"will wait for some other node to create the {args.pgvector_collection_name} collection") 131 | time.sleep(5) 132 | 133 | with mp.Pool(processes = args.process_count) as pool: 134 | results = pool.map(partial(process_shard, 135 | embeddings_model_endpoint_name=args.embeddings_model_endpoint_name, 136 | aws_region=args.aws_region, 137 | collection_name=args.pgvector_collection_name, 138 | connection_string=CONNECTION_STRING), 139 | shards) 140 | 141 | et = time.time() - st 142 | logger.info(f'run time in seconds: {et:.2f}') 143 | logger.info("all done") 144 | -------------------------------------------------------------------------------- /data_ingestion_to_vectordb/container/sm_helper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for using Sagemaker Endpoint via langchain 3 | """ 4 | import json 5 | import logging 6 | from typing import List 7 | 8 | from langchain_community.embeddings import SagemakerEndpointEmbeddings 9 | from langchain_community.embeddings.sagemaker_endpoint import EmbeddingsContentHandler 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def create_sagemaker_embeddings_from_js_model(embeddings_model_endpoint_name: str, aws_region: str = 'us-east-1') -> SagemakerEndpointEmbeddings: 15 | 16 | # class for serializing/deserializing requests/responses to/from the embeddings model 17 | class ContentHandler(EmbeddingsContentHandler): 18 | content_type = "application/json" 19 | accepts = "application/json" 20 | 21 | def transform_input(self, prompt: str, model_kwargs={}) -> bytes: 22 | input_str = json.dumps({"text_inputs": prompt, **model_kwargs}) 23 | return input_str.encode('utf-8') 24 | 25 | def transform_output(self, output: bytes) -> List[List[float]]: 26 | response_json = json.loads(output.read().decode("utf-8")) 27 | embeddings = response_json["embedding"] 28 | if len(embeddings) == 1: 29 | return [embeddings[0]] 30 | return embeddings 31 | 32 | # all set to create the objects for the ContentHandler and 33 | # SagemakerEndpointEmbeddings classes 34 | content_handler = ContentHandler() 35 | 36 | # note the name of the embeddings model's Sagemaker endpoint, this is the model that we would 37 | # be using for generating the embeddings 38 | embeddings = SagemakerEndpointEmbeddings( 39 | endpoint_name=embeddings_model_endpoint_name, 40 | region_name=aws_region, 41 | content_handler=content_handler 42 | ) 43 | return embeddings -------------------------------------------------------------------------------- /data_ingestion_to_vectordb/data_ingestion_to_pgvector.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "93179240-9c5f-4ba6-a1c7-3a981624f794", 6 | "metadata": {}, 7 | "source": [ 8 | "# Ingest massive amounts of data to a Vector DB (Amazon Aurora Postgresql with
pgvector)\n", 9 | "**_Use of Amazon Aurora Postgresql as a vector database for storing embeddings_**\n", 10 | "\n", 11 | "This notebook works well on `ml.t3.medium` instance with `Python3` kernel from **JupyterLab** or `Data Science 2.0` kernel from **SageMaker Studio Classic**.\n", 12 | "\n", 13 | "Here is a list of packages that are used in this notebook.\n", 14 | "\n", 15 | "```\n", 16 | "!pip list | grep -E -w \"sagemaker_studio_image_build|ipython-sql|langchain|psycopg|pgvector|numpy|sh\"\n", 17 | "-----------------------------------------------------------------------------------------------------\n", 18 | "ipython-sql 0.5.0\n", 19 | "langchain 0.2.5\n", 20 | "langchain-community 0.2.4\n", 21 | "langchain-core 0.2.43\n", 22 | "langchain-postgres 0.0.7\n", 23 | "langchain-text-splitters 0.2.4\n", 24 | "numpy 1.26.4\n", 25 | "pgvector 0.2.5\n", 26 | "psycopg 3.1.19\n", 27 | "psycopg-binary 3.1.19\n", 28 | "psycopg-pool 3.2.4\n", 29 | "sagemaker_studio_image_build 0.6.0\n", 30 | "sh 2.0.4\n", 31 | "```" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "79aae52c-cd7a-4637-a07d-9c0131dc7d0a", 37 | "metadata": {}, 38 | "source": [ 39 | "## Step 1: Setup\n", 40 | "Install the required packages." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "87e64f84-b7ac-427d-b5a8-cf98b430be9b", 47 | "metadata": { 48 | "tags": [] 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "%%capture --no-stderr\n", 53 | "\n", 54 | "!pip install -U langchain==0.2.5\n", 55 | "!pip install -U langchain-community==0.2.4\n", 56 | "!pip install -U langchain-postgres==0.0.7\n", 57 | "!pip install -U SQLAlchemy==2.0.28\n", 58 | "!pip install -U pgvector==0.2.5\n", 59 | "!pip install -U psycopg[binary]==3.1.19\n", 60 | "!pip install -U ipython-sql==0.5.0\n", 61 | "!pip install -U sh==2.0.4\n", 62 | "!pip install -U sagemaker-studio-image-build==0.6.0" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "d88757ba-7ae1-4efb-9c02-ab17ec22e79a", 69 | "metadata": { 70 | "tags": [] 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "!pip list | grep -E -w \"sagemaker_studio_image_build|ipython-sql|langchain|psycopg|pgvector|numpy|sh\"" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "c017bc3f-e507-4f0c-b640-ea774c5ea9c8", 80 | "metadata": {}, 81 | "source": [ 82 | "## Step 2: Download the data from the web and upload to S3\n", 83 | "\n", 84 | "In this step we use `wget` to crawl a Python documentation style website data. All files other than `html`, `txt` and `md` are removed. **This data download would take a few minutes**." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "5c2b8c14-0ffc-4090-adf1-c2a8a1bdebaa", 91 | "metadata": { 92 | "tags": [] 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "WEBSITE = \"https://sagemaker.readthedocs.io/en/stable/\"\n", 97 | "DOMAIN = \"sagemaker.readthedocs.io\"\n", 98 | "DATA_DIR = \"docs\"" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "0eb232ee-6b62-4718-9104-345fe7978703", 105 | "metadata": { 106 | "tags": [] 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "!python ./scripts/get_data.py --website {WEBSITE} --domain {DOMAIN} --output-dir {DATA_DIR}" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "8ee1fbb8-583a-4c41-a831-715e4250ff3c", 117 | "metadata": { 118 | "tags": [] 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "import boto3\n", 123 | "import sagemaker\n", 124 | "\n", 125 | "sagemaker_session = sagemaker.session.Session()\n", 126 | "aws_region = boto3.Session().region_name\n", 127 | "bucket = sagemaker_session.default_bucket()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "6c127969-4abc-4a31-8829-c00bee321a95", 134 | "metadata": { 135 | "tags": [] 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "CREATE_OS_INDEX_HINT_FILE = \"_create_index_hint\"\n", 140 | "app_name = 'llm-app-rag'" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "25217f27-4995-4da5-8fc4-b1b9533185b5", 147 | "metadata": { 148 | "tags": [] 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "# create a dummy file called _create_index_hint to provide a hint for Postgresql index creation\n", 153 | "# this is needed for the Sagemaker Processing Job when there are multiple instance nodes\n", 154 | "# all running the same code for data ingestion but only one node needs to create the index\n", 155 | "!touch {DATA_DIR}/{CREATE_OS_INDEX_HINT_FILE}\n", 156 | "\n", 157 | "# upload this data to S3, to be used when we run the Sagemaker Processing Job\n", 158 | "!aws s3 cp --recursive {DATA_DIR}/ s3://{bucket}/{app_name}/{DOMAIN}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "id": "743e8296", 164 | "metadata": {}, 165 | "source": [ 166 | "## Step 3: Setup Aurora Postgresql with pgvector" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "28b236a7-b5d2-494e-a3e3-baec94adc3d4", 173 | "metadata": { 174 | "tags": [] 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "import sys\n", 179 | "import logging\n", 180 | "\n", 181 | "\n", 182 | "logger = logging.getLogger()\n", 183 | "logging.basicConfig(format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s', level=logging.INFO, stream=sys.stderr)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "0c8d9a38-ae89-44af-83db-657dd7e851d5", 190 | "metadata": { 191 | "tags": [] 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "import json\n", 196 | "from typing import List\n", 197 | "import boto3\n", 198 | "\n", 199 | "\n", 200 | "def get_cfn_outputs(stack_name: str, region_name: str = 'us-east-1') -> dict:\n", 201 | " cfn = boto3.client('cloudformation', region_name=region_name)\n", 202 | " outputs = {}\n", 203 | " for output in cfn.describe_stacks(StackName=stack_name)['Stacks'][0]['Outputs']:\n", 204 | " outputs[output['OutputKey']] = output['OutputValue']\n", 205 | " return outputs\n", 206 | "\n", 207
| "def get_secret_name(stack_name: str, region_name: str = 'us-east-1'):\n", 208 | " cf_client = boto3.client('cloudformation', region_name=region_name)\n", 209 | " response = cf_client.describe_stacks(StackName=stack_name)\n", 210 | " outputs = response[\"Stacks\"][0][\"Outputs\"]\n", 211 | "\n", 212 | " secrets = [e for e in outputs if e['ExportName'] == 'VectorDBSecret'][0]\n", 213 | " secret_name = secrets['OutputValue']\n", 214 | " return secret_name\n", 215 | "\n", 216 | "def get_secret(secret_name: str, region_name: str = 'us-east-1'):\n", 217 | " client = boto3.client('secretsmanager', region_name=region_name)\n", 218 | " get_secret_value_response = client.get_secret_value(SecretId=secret_name)\n", 219 | " secret = get_secret_value_response['SecretString']\n", 220 | "\n", 221 | " return json.loads(secret)\n", 222 | "\n", 223 | "def get_db_subnet_ids(stack_name: str, region_name: str = 'us-east-1'):\n", 224 | " cfn_outputs = get_cfn_outputs(stack_name, region_name)\n", 225 | " db_cluster_id = cfn_outputs['DBClusterId']\n", 226 | "\n", 227 | " rds_client = boto3.client('rds', region_name=region_name)\n", 228 | " db_cluster_info = rds_client.describe_db_clusters(DBClusterIdentifier=db_cluster_id)\n", 229 | " db_subnet_group_name = db_cluster_info['DBClusters'][0]['DBSubnetGroup']\n", 230 | " db_subnet_info = rds_client.describe_db_subnet_groups(DBSubnetGroupName=db_subnet_group_name)\n", 231 | " db_subnet_ids = [e['SubnetIdentifier'] for e in db_subnet_info['DBSubnetGroups'][0]['Subnets']]\n", 232 | "\n", 233 | " return db_subnet_ids" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "41d74214", 239 | "metadata": {}, 240 | "source": [ 241 | "##### Create the pgvector extension on your Aurora PostgreSQL database (DB) cluster\n", 242 | "\n", 243 | "[pgvector](https://github.com/pgvector/pgvector) is an open-source extension for PostgreSQL that adds the ability to store and search over ML-generated vector embeddings. pgvector provides different capabilities that let you identify both exact and approximate nearest neighbors. It’s designed to work seamlessly with other PostgreSQL features, including indexing and querying. Using ChatGPT and other LLM tooling often requires storing the output of these systems, i.e., vector embeddings, in a permanent storage system for retrieval at a later time." 
244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "40e73e00", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "%config SqlMagic.style = '_DEPRECATED_DEFAULT' # Ensure that the SqlMagic style is compatible with the previous version" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "id": "387c9ff5", 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "%load_ext sql" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "71514284", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "import urllib\n", 274 | "\n", 275 | "CFN_STACK_NAME = \"RAGPgVectorStack\" # name of CloudFormation stack\n", 276 | "\n", 277 | "secret_name = get_secret_name(CFN_STACK_NAME)\n", 278 | "secret = get_secret(secret_name)\n", 279 | "\n", 280 | "db_username = secret['username']\n", 281 | "db_password = urllib.parse.quote_plus(secret['password'])\n", 282 | "db_port = secret['port']\n", 283 | "db_host = secret['host']\n", 284 | "\n", 285 | "driver = 'psycopg'\n", 286 | "\n", 287 | "connection_string = f\"postgresql+{driver}://{db_username}:{db_password}@{db_host}:{db_port}/\"\n", 288 | "connection_string" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "ab4e9407", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "%sql $connection_string" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "id": "ed914bc2", 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "%%sql\n", 309 | "\n", 310 | "CREATE EXTENSION IF NOT EXISTS vector;" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "ca3e0253", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "%%sql\n", 321 | "\n", 322 | "SELECT typname\n", 323 | "FROM pg_type\n", 324 | "WHERE typname = 'vector';" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "id": "b62a4e9f", 330 | "metadata": {}, 331 | "source": [ 332 | "## Step 4: Load data into Aurora Postgresql with pgvector\n", 333 | "\n", 334 | "- Option 1) Parallel loading data with SageMaker Processing Job\n", 335 | "- Option 2) Sequential loading data with Document Loader" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "id": "2a04dfb8", 341 | "metadata": {}, 342 | "source": [ 343 | "### Option 1) Parallel loading data with SageMaker Processing Job\n", 344 | "\n", 345 | "We now have a working script that can ingest data into Aurora Postgresql. But for this to work for massive amounts of data we need to scale up the processing by running this code in a distributed fashion. We will do this using a Sagemaker Processing Job. This involves the following steps:\n", 346 | "\n", 347 | "1. Create a custom container in which we will install the `langchain`, `psycopg` and `pgvector` packages and then upload this container image to Amazon Elastic Container Registry (ECR).\n", 348 | "2. 
Use the Sagemaker `ScriptProcessor` class to create a Sagemaker Processing job that will run on multiple nodes.\n", 349 | " - The data files available in S3 are automatically distributed across the Sagemaker Processing Job instances by setting `s3_data_distribution_type='ShardedByS3Key'` as part of the `ProcessingInput` provided to the processing job.\n", 350 | " - Each node processes a subset of the files and this brings down the overall time required to ingest the data into Aurora Postgresql.\n", 351 | " - Each node also uses Python `multiprocessing` to parallelize the file processing internally. Thus, **there are two levels of parallelization happening, one at the cluster level where individual nodes are distributing the work (files) amongst themselves and another at the node level where the files in a node are also split between multiple processes running on the node**." 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "id": "5f48c660", 357 | "metadata": {}, 358 | "source": [ 359 | "### Create custom container\n", 360 | "\n", 361 | "We will now create a container locally and push the container image to ECR. **The container creation process takes about 1 minute**.\n", 362 | "\n", 363 | "1. The container includes all the Python packages we need, i.e. `langchain`, `psycopg`, `pgvector` and `beautifulsoup4`.\n", 364 | "2. The container also includes the `credentials.py` script for retrieving credentials from Secrets Manager and `sm_helper.py` for helping to create SageMaker endpoint classes that langchain uses." 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "id": "372cd2a4", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "DOCKER_IMAGE = \"load-data-pgvector-custom\"\n", 375 | "DOCKER_IMAGE_TAG = \"latest\"" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "id": "c80dd828", 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "!cd ./container && sm-docker build . --repository {DOCKER_IMAGE}:{DOCKER_IMAGE_TAG}" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "id": "eab03176", 391 | "metadata": {}, 392 | "source": [ 393 | "### Create and run the Sagemaker Processing Job\n", 394 | "\n", 395 | "Now we will run the Sagemaker Processing Job to ingest the data into Aurora Postgresql."
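,
"\n",
"\n",
"The two levels of parallelism described above can be summarized with this sketch (simplified from `container/load_data_into_pgvector.py`, which defines `process_shard` and the variables used here):\n",
"\n",
"```python\n",
"import multiprocessing as mp\n",
"from functools import partial\n",
"\n",
"import numpy as np\n",
"\n",
"# Level 1: S3 sharding -- each processing instance sees only its own subset of\n",
"# the input files, because ProcessingInput is configured with\n",
"# s3_data_distribution_type='ShardedByS3Key' (see the cell below).\n",
"\n",
"# Level 2: within one instance, split that node's chunks into shards and\n",
"# ingest them with a pool of worker processes.\n",
"shards = np.array_split(chunks, db_shards)\n",
"with mp.Pool(processes=process_count) as pool:\n",
"    pool.map(partial(process_shard,\n",
"                     embeddings_model_endpoint_name=embeddings_model_endpoint_name,\n",
"                     aws_region=aws_region,\n",
"                     collection_name=pgvector_collection_name,\n",
"                     connection_string=CONNECTION_STRING),\n",
"             shards)\n",
"```"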
396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "id": "e52cfadf", 401 | "metadata": {}, 402 | "source": [ 403 | "##### Load the embeddings and LLM into Aurora PostgreSQL DB cluster" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "id": "477c8f74-faa9-4c4a-a6b6-de1b5699f955", 410 | "metadata": { 411 | "tags": [] 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "CFN_STACK_NAME = 'EmbeddingEndpointStack'\n", 416 | "\n", 417 | "cfn_stack_outputs = get_cfn_outputs(CFN_STACK_NAME, aws_region)\n", 418 | "embeddings_model_endpoint_name = cfn_stack_outputs['EmbeddingEndpointName']" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "id": "29861415-9ad9-44bc-b2f2-ca8bf61dd40c", 425 | "metadata": { 426 | "tags": [] 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "CFN_STACK_NAME = \"RAGPgVectorStack\"\n", 431 | "\n", 432 | "pgvector_secret_id = get_secret_name(CFN_STACK_NAME, aws_region)\n", 433 | "pgvector_collection_name = 'llm_rag_embeddings'" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "id": "1749f0fa-2df2-4289-9033-f429aac6e2f6", 440 | "metadata": { 441 | "tags": [] 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "account_id = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n", 446 | "aws_role = sagemaker_session.get_caller_identity_arn()" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "id": "15d12e82-239c-4844-9e2c-73bd25c49167", 453 | "metadata": { 454 | "tags": [] 455 | }, 456 | "outputs": [], 457 | "source": [ 458 | "CHUNK_SIZE_FOR_DOC_SPLIT = 500\n", 459 | "CHUNK_OVERLAP_FOR_DOC_SPLIT = 20" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "id": "64826d30", 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "db_subnet_ids = get_db_subnet_ids('RAGPgVectorStack', aws_region)\n", 470 | "db_client_security_group_id = get_cfn_outputs('RAGPgVectorStack', aws_region)['DBClientSecurityGroupId']\n", 471 | "sagemaker_domain_security_group_id = get_cfn_outputs('RAGSageMakerStudioStack', aws_region)['DomainSecurityGroupId']" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "87843222", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "from sagemaker.network import NetworkConfig\n", 482 | "\n", 483 | "\n", 484 | "# For more information, see https://docs.aws.amazon.com/sagemaker/latest/dg/process-vpc.html\n", 485 | "network_config = NetworkConfig(security_group_ids=[sagemaker_domain_security_group_id,\n", 486 | " db_client_security_group_id],\n", 487 | " subnets=db_subnet_ids)" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "id": "afb7373c-e80f-4d1a-a8dc-0dc79fb28e8a", 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "import time\n", 498 | "\n", 499 | "from sagemaker.processing import (\n", 500 | " ProcessingInput,\n", 501 | " ScriptProcessor\n", 502 | ")\n", 503 | "\n", 504 | "# setup the parameters for the job\n", 505 | "base_job_name = f\"{app_name}-job\"\n", 506 | "tags = [{\"Key\": \"data\", \"Value\": \"embeddings-for-llm-apps\"}]\n", 507 | "\n", 508 | "# use the custom container we just created\n", 509 | "image_uri = f\"{account_id}.dkr.ecr.{aws_region}.amazonaws.com/{DOCKER_IMAGE}:{DOCKER_IMAGE_TAG}\"\n", 510 | "\n", 511 | "# instance type and count determined via trial and error: how much overall processing time\n", 512 
| "# and what compute cost works best for your use-case\n", 513 | "instance_type = \"ml.m5.xlarge\"\n", 514 | "instance_count = 3\n", 515 | "logger.info(f\"base_job_name={base_job_name}, tags={tags}, image_uri={image_uri}, instance_type={instance_type}, instance_count={instance_count}\")\n", 516 | "\n", 517 | "# setup the ScriptProcessor with the above parameters\n", 518 | "processor = ScriptProcessor(base_job_name=base_job_name,\n", 519 | " image_uri=image_uri,\n", 520 | " role=aws_role,\n", 521 | " instance_type=instance_type,\n", 522 | " instance_count=instance_count,\n", 523 | " command=[\"python3\"],\n", 524 | " tags=tags,\n", 525 | " network_config=network_config)\n", 526 | "\n", 527 | "# setup input from S3, note the ShardedByS3Key, this ensures that\n", 528 | "# each instance gets a random and equal subset of the files in S3.\n", 529 | "inputs = [ProcessingInput(source=f\"s3://{bucket}/{app_name}/{DOMAIN}\",\n", 530 | " destination='/opt/ml/processing/input_data',\n", 531 | " s3_data_distribution_type='ShardedByS3Key',\n", 532 | " s3_data_type='S3Prefix')]\n", 533 | "\n", 534 | "\n", 535 | "logger.info(f\"creating an pgvector collection with name={pgvector_collection_name}\")\n", 536 | "\n", 537 | "# ready to run the processing job\n", 538 | "st = time.time()\n", 539 | "processor.run(code=\"container/load_data_into_pgvector.py\",\n", 540 | " inputs=inputs,\n", 541 | " outputs=[],\n", 542 | " arguments=[\"--pgvector-secretid\", pgvector_secret_id,\n", 543 | " \"--pgvector-collection-name\", pgvector_collection_name,\n", 544 | " \"--aws-region\", aws_region,\n", 545 | " \"--embeddings-model-endpoint-name\", embeddings_model_endpoint_name,\n", 546 | " \"--chunk-size-for-doc-split\", str(CHUNK_SIZE_FOR_DOC_SPLIT),\n", 547 | " \"--chunk-overlap-for-doc-split\", str(CHUNK_OVERLAP_FOR_DOC_SPLIT),\n", 548 | " \"--input-data-dir\", \"/opt/ml/processing/input_data\",\n", 549 | " \"--create-index-hint-file\", CREATE_OS_INDEX_HINT_FILE,\n", 550 | " \"--process-count\", \"2\"])\n", 551 | "\n", 552 | "time_taken = time.time() - st\n", 553 | "logger.info(f\"processing job completed, total time taken={time_taken}s\")\n", 554 | "\n", 555 | "preprocessing_job_description = processor.jobs[-1].describe()\n", 556 | "logger.info(preprocessing_job_description)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "id": "319bb2e5", 562 | "metadata": {}, 563 | "source": [ 564 | "### Option 2) Sequential loading data with Document Loader" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "id": "79edc1b8", 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "%%capture --no-stderr\n", 575 | "\n", 576 | "!pip install -U beautifulsoup4==4.12.3" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "id": "fd657ba8", 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "from langchain_community.document_loaders import ReadTheDocsLoader\n", 587 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 588 | "\n", 589 | "\n", 590 | "loader = ReadTheDocsLoader(DATA_DIR)\n", 591 | "text_splitter = RecursiveCharacterTextSplitter(\n", 592 | " chunk_size=CHUNK_SIZE_FOR_DOC_SPLIT,\n", 593 | " chunk_overlap=CHUNK_OVERLAP_FOR_DOC_SPLIT,\n", 594 | " length_function=len,\n", 595 | ")\n", 596 | "\n", 597 | "docs = loader.load()\n", 598 | "\n", 599 | "# add a custom metadata field, such as timestamp\n", 600 | "for doc in docs:\n", 601 | " doc.metadata['timestamp'] = time.time()\n", 602 | " 
doc.metadata['embeddings_model'] = embeddings_model_endpoint_name" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "id": "5a7281c8", 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "chunks = text_splitter.create_documents(\n", 614 | " [doc.page_content for doc in docs],\n", 615 | " metadatas=[doc.metadata for doc in docs]\n", 616 | ")" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "id": "4478ff35", 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "import numpy as np\n", 627 | "\n", 628 | "\n", 629 | "MAX_DOCS_PER_PUT = 10\n", 630 | "\n", 631 | "db_shards = (len(chunks) // MAX_DOCS_PER_PUT) + 1\n", 632 | "shards = np.array_split(chunks, db_shards)\n", 633 | "print(f'Loading chunks into vector store ... using {db_shards} shards')" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "id": "a8382f10", 640 | "metadata": {}, 641 | "outputs": [], 642 | "source": [ 643 | "import urllib\n", 644 | "from langchain_postgres import PGVector\n", 645 | "from container.credentials import get_credentials\n", 646 | "\n", 647 | "\n", 648 | "secret = get_credentials(pgvector_secret_id, aws_region)\n", 649 | "db_username = secret['username']\n", 650 | "db_password = urllib.parse.quote_plus(secret['password'])\n", 651 | "db_port = secret['port']\n", 652 | "db_host = secret['host']\n", 653 | "\n", 654 | "CONNECTION_STRING = PGVector.connection_string_from_db_params(\n", 655 | " driver='psycopg',\n", 656 | " user=db_username,\n", 657 | " password=db_password,\n", 658 | " host=db_host,\n", 659 | " port=db_port,\n", 660 | " database=''\n", 661 | ")" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "id": "de7cf804", 668 | "metadata": {}, 669 | "outputs": [], 670 | "source": [ 671 | "from container.sm_helper import create_sagemaker_embeddings_from_js_model\n", 672 | "\n", 673 | "\n", 674 | "embeddings = create_sagemaker_embeddings_from_js_model(\n", 675 | " embeddings_model_endpoint_name,\n", 676 | " aws_region\n", 677 | ")\n", 678 | "\n", 679 | "vectordb = PGVector(\n", 680 | " collection_name=pgvector_collection_name,\n", 681 | " connection=CONNECTION_STRING,\n", 682 | " embeddings=embeddings\n", 683 | ")" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "id": "33716979", 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [ 693 | "%%time\n", 694 | "import time\n", 695 | "\n", 696 | "\n", 697 | "for i, shard in enumerate(shards):\n", 698 | " vectordb.add_documents(documents=shard)\n", 699 | " print(f\"[{i}] shard is added.\")\n", 700 | " time.sleep(0.3)" 701 | ] 702 | }, 703 | { 704 | "cell_type": "markdown", 705 | "id": "1e444161-262e-44e5-ad31-e490a763be4e", 706 | "metadata": {}, 707 | "source": [ 708 | "## Step 5: Do a similarity search for user input against documents (embeddings) in Aurora Postgresql " 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "id": "294a7292-8bdb-4d11-a23d-130a4a039cd2", 715 | "metadata": { 716 | "tags": [] 717 | }, 718 | "outputs": [], 719 | "source": [ 720 | "import urllib\n", 721 | "\n", 722 | "from langchain_postgres import PGVector\n", 723 | "\n", 724 | "from container.credentials import get_credentials\n", 725 | "from container.sm_helper import create_sagemaker_embeddings_from_js_model\n", 726 | "\n", 727 | "\n", 728 | "secret = get_credentials(pgvector_secret_id, aws_region)\n", 729 | "\n", 730 |
"db_username = secret['username']\n", 730 | "db_password = urllib.parse.quote_plus(secret['password'])\n", 731 | "db_port = secret['port']\n", 732 | "db_host = secret['host']\n", 733 | "\n", 734 | "connection_string = PGVector.connection_string_from_db_params(\n", 735 | " driver='psycopg',\n", 736 | " user=db_username,\n", 737 | " password=db_password,\n", 738 | " host=db_host,\n", 739 | " port=db_port,\n", 740 | " database=''\n", 741 | ")\n", 742 | "\n", 743 | "docsearch = PGVector.from_existing_index(\n", 744 | " embedding=create_sagemaker_embeddings_from_js_model(embeddings_model_endpoint_name,\n", 745 | " aws_region),\n", 746 | " collection_name=pgvector_collection_name,\n", 747 | " connection=connection_string)\n", 748 | "\n", 749 | "q = \"Which XGBoost versions does SageMaker support?\"\n", 750 | "docs = docsearch.similarity_search(q, k=3)\n", 751 | "for doc in docs:\n", 752 | " logger.info(\"----------\")\n", 753 | " logger.info(f\"content=\\\"{doc.page_content}\\\",\\nmetadata=\\\"{doc.metadata}\\\"\")" 754 | ] 755 | }, 756 | { 757 | "cell_type": "markdown", 758 | "id": "6e29eae5-c463-4153-9167-e4628c74d13c", 759 | "metadata": { 760 | "tags": [] 761 | }, 762 | "source": [ 763 | "## Cleanup\n", 764 | "\n", 765 | "To avoid incurring future charges, delete the resources. You can do this by deleting the CloudFormation template used to create the IAM role and SageMaker notebook." 766 | ] 767 | }, 768 | { 769 | "cell_type": "markdown", 770 | "id": "59ce3fe8-bb71-4e22-a551-2475eb2d16b7", 771 | "metadata": {}, 772 | "source": [ 773 | "---\n", 774 | "\n", 775 | "## Conclusion\n", 776 | "In this notebook we were able to see how to use LLMs deployed on a SageMaker Endpoint to generate embeddings and then ingest those embeddings into Aurora Postgresql and finally do a similarity search for user input to the documents (embeddings) stored in Aurora Postgresql. We used langchain as an abstraction layer to talk to both the SageMaker Endpoint as well as Aurora Postgresql." 
777 | ] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "id": "53386268-0cf9-4a37-b3d0-711fba1e5585", 782 | "metadata": {}, 783 | "source": [ 784 | "---\n", 785 | "\n", 786 | "## Appendix" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": null, 792 | "id": "7332274c-586d-4cf9-838f-ea7e0cbe6c0f", 793 | "metadata": { 794 | "tags": [] 795 | }, 796 | "outputs": [], 797 | "source": [ 798 | "from container.sm_helper import create_sagemaker_embeddings_from_js_model\n", 799 | "\n", 800 | "CFN_STACK_NAME = 'EmbeddingEndpointStack'\n", 801 | "cfn_stack_outputs = get_cfn_outputs(CFN_STACK_NAME, aws_region)\n", 802 | "embeddings_model_endpoint_name = cfn_stack_outputs['EmbeddingEndpointName']\n", 803 | "\n", 804 | "embeddings = create_sagemaker_embeddings_from_js_model(embeddings_model_endpoint_name, aws_region)\n", 805 | "\n", 806 | "text = \"This is a sample query.\"\n", 807 | "query_result = embeddings.embed_query(text)\n", 808 | "\n", 809 | "print(query_result)\n", 810 | "print(f\"length: {len(query_result)}\")" 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "id": "dd881bab", 816 | "metadata": {}, 817 | "source": [ 818 | "## References\n", 819 | "\n", 820 | " * [Leverage pgvector and Amazon Aurora PostgreSQL for Natural Language Processing, Chatbots and Sentiment Analysis](https://aws.amazon.com/blogs/database/leverage-pgvector-and-amazon-aurora-postgresql-for-natural-language-processing-chatbots-and-sentiment-analysis/)\n", 821 | " * [Building AI-powered search in PostgreSQL using Amazon SageMaker and pgvector](https://aws.amazon.com/blogs/database/building-ai-powered-search-in-postgresql-using-amazon-sagemaker-and-pgvector/)\n", 822 | " * [Using the Amazon SageMaker Studio Image Build CLI to build container images from your Studio notebooks](https://aws.amazon.com/blogs/machine-learning/using-the-amazon-sagemaker-studio-image-build-cli-to-build-container-images-from-your-studio-notebooks/)\n", 823 | " * [Give SageMaker Processing Jobs Access to Resources in Your Amazon VPC](https://docs.aws.amazon.com/sagemaker/latest/dg/process-vpc.html)\n", 824 | " * **Configure the VPC Security Group**\n", 825 | " * In distributed processing, you must allow communication between the different containers in the same processing job. To do that, configure a rule for your security group that allows inbound connections between members of the same security group.\n", 826 | " * [How can I troubleshoot the InternalServerError response on Amazon SageMaker? - AWS re:Post](https://repost.aws/knowledge-center/sagemaker-http-500-internal-server-error)\n", 827 | " * [LangChain](https://python.langchain.com/docs/get_started/introduction.html) - A framework for developing applications powered by language models." 
828 | ] 829 | } 830 | ], 831 | "metadata": { 832 | "availableInstances": [ 833 | { 834 | "_defaultOrder": 0, 835 | "_isFastLaunch": true, 836 | "category": "General purpose", 837 | "gpuNum": 0, 838 | "hideHardwareSpecs": false, 839 | "memoryGiB": 4, 840 | "name": "ml.t3.medium", 841 | "vcpuNum": 2 842 | }, 843 | { 844 | "_defaultOrder": 1, 845 | "_isFastLaunch": false, 846 | "category": "General purpose", 847 | "gpuNum": 0, 848 | "hideHardwareSpecs": false, 849 | "memoryGiB": 8, 850 | "name": "ml.t3.large", 851 | "vcpuNum": 2 852 | }, 853 | { 854 | "_defaultOrder": 2, 855 | "_isFastLaunch": false, 856 | "category": "General purpose", 857 | "gpuNum": 0, 858 | "hideHardwareSpecs": false, 859 | "memoryGiB": 16, 860 | "name": "ml.t3.xlarge", 861 | "vcpuNum": 4 862 | }, 863 | { 864 | "_defaultOrder": 3, 865 | "_isFastLaunch": false, 866 | "category": "General purpose", 867 | "gpuNum": 0, 868 | "hideHardwareSpecs": false, 869 | "memoryGiB": 32, 870 | "name": "ml.t3.2xlarge", 871 | "vcpuNum": 8 872 | }, 873 | { 874 | "_defaultOrder": 4, 875 | "_isFastLaunch": true, 876 | "category": "General purpose", 877 | "gpuNum": 0, 878 | "hideHardwareSpecs": false, 879 | "memoryGiB": 8, 880 | "name": "ml.m5.large", 881 | "vcpuNum": 2 882 | }, 883 | { 884 | "_defaultOrder": 5, 885 | "_isFastLaunch": false, 886 | "category": "General purpose", 887 | "gpuNum": 0, 888 | "hideHardwareSpecs": false, 889 | "memoryGiB": 16, 890 | "name": "ml.m5.xlarge", 891 | "vcpuNum": 4 892 | }, 893 | { 894 | "_defaultOrder": 6, 895 | "_isFastLaunch": false, 896 | "category": "General purpose", 897 | "gpuNum": 0, 898 | "hideHardwareSpecs": false, 899 | "memoryGiB": 32, 900 | "name": "ml.m5.2xlarge", 901 | "vcpuNum": 8 902 | }, 903 | { 904 | "_defaultOrder": 7, 905 | "_isFastLaunch": false, 906 | "category": "General purpose", 907 | "gpuNum": 0, 908 | "hideHardwareSpecs": false, 909 | "memoryGiB": 64, 910 | "name": "ml.m5.4xlarge", 911 | "vcpuNum": 16 912 | }, 913 | { 914 | "_defaultOrder": 8, 915 | "_isFastLaunch": false, 916 | "category": "General purpose", 917 | "gpuNum": 0, 918 | "hideHardwareSpecs": false, 919 | "memoryGiB": 128, 920 | "name": "ml.m5.8xlarge", 921 | "vcpuNum": 32 922 | }, 923 | { 924 | "_defaultOrder": 9, 925 | "_isFastLaunch": false, 926 | "category": "General purpose", 927 | "gpuNum": 0, 928 | "hideHardwareSpecs": false, 929 | "memoryGiB": 192, 930 | "name": "ml.m5.12xlarge", 931 | "vcpuNum": 48 932 | }, 933 | { 934 | "_defaultOrder": 10, 935 | "_isFastLaunch": false, 936 | "category": "General purpose", 937 | "gpuNum": 0, 938 | "hideHardwareSpecs": false, 939 | "memoryGiB": 256, 940 | "name": "ml.m5.16xlarge", 941 | "vcpuNum": 64 942 | }, 943 | { 944 | "_defaultOrder": 11, 945 | "_isFastLaunch": false, 946 | "category": "General purpose", 947 | "gpuNum": 0, 948 | "hideHardwareSpecs": false, 949 | "memoryGiB": 384, 950 | "name": "ml.m5.24xlarge", 951 | "vcpuNum": 96 952 | }, 953 | { 954 | "_defaultOrder": 12, 955 | "_isFastLaunch": false, 956 | "category": "General purpose", 957 | "gpuNum": 0, 958 | "hideHardwareSpecs": false, 959 | "memoryGiB": 8, 960 | "name": "ml.m5d.large", 961 | "vcpuNum": 2 962 | }, 963 | { 964 | "_defaultOrder": 13, 965 | "_isFastLaunch": false, 966 | "category": "General purpose", 967 | "gpuNum": 0, 968 | "hideHardwareSpecs": false, 969 | "memoryGiB": 16, 970 | "name": "ml.m5d.xlarge", 971 | "vcpuNum": 4 972 | }, 973 | { 974 | "_defaultOrder": 14, 975 | "_isFastLaunch": false, 976 | "category": "General purpose", 977 | "gpuNum": 0, 978 | "hideHardwareSpecs": false, 979 | "memoryGiB": 
32, 980 | "name": "ml.m5d.2xlarge", 981 | "vcpuNum": 8 982 | }, 983 | { 984 | "_defaultOrder": 15, 985 | "_isFastLaunch": false, 986 | "category": "General purpose", 987 | "gpuNum": 0, 988 | "hideHardwareSpecs": false, 989 | "memoryGiB": 64, 990 | "name": "ml.m5d.4xlarge", 991 | "vcpuNum": 16 992 | }, 993 | { 994 | "_defaultOrder": 16, 995 | "_isFastLaunch": false, 996 | "category": "General purpose", 997 | "gpuNum": 0, 998 | "hideHardwareSpecs": false, 999 | "memoryGiB": 128, 1000 | "name": "ml.m5d.8xlarge", 1001 | "vcpuNum": 32 1002 | }, 1003 | { 1004 | "_defaultOrder": 17, 1005 | "_isFastLaunch": false, 1006 | "category": "General purpose", 1007 | "gpuNum": 0, 1008 | "hideHardwareSpecs": false, 1009 | "memoryGiB": 192, 1010 | "name": "ml.m5d.12xlarge", 1011 | "vcpuNum": 48 1012 | }, 1013 | { 1014 | "_defaultOrder": 18, 1015 | "_isFastLaunch": false, 1016 | "category": "General purpose", 1017 | "gpuNum": 0, 1018 | "hideHardwareSpecs": false, 1019 | "memoryGiB": 256, 1020 | "name": "ml.m5d.16xlarge", 1021 | "vcpuNum": 64 1022 | }, 1023 | { 1024 | "_defaultOrder": 19, 1025 | "_isFastLaunch": false, 1026 | "category": "General purpose", 1027 | "gpuNum": 0, 1028 | "hideHardwareSpecs": false, 1029 | "memoryGiB": 384, 1030 | "name": "ml.m5d.24xlarge", 1031 | "vcpuNum": 96 1032 | }, 1033 | { 1034 | "_defaultOrder": 20, 1035 | "_isFastLaunch": false, 1036 | "category": "General purpose", 1037 | "gpuNum": 0, 1038 | "hideHardwareSpecs": true, 1039 | "memoryGiB": 0, 1040 | "name": "ml.geospatial.interactive", 1041 | "supportedImageNames": [ 1042 | "sagemaker-geospatial-v1-0" 1043 | ], 1044 | "vcpuNum": 0 1045 | }, 1046 | { 1047 | "_defaultOrder": 21, 1048 | "_isFastLaunch": true, 1049 | "category": "Compute optimized", 1050 | "gpuNum": 0, 1051 | "hideHardwareSpecs": false, 1052 | "memoryGiB": 4, 1053 | "name": "ml.c5.large", 1054 | "vcpuNum": 2 1055 | }, 1056 | { 1057 | "_defaultOrder": 22, 1058 | "_isFastLaunch": false, 1059 | "category": "Compute optimized", 1060 | "gpuNum": 0, 1061 | "hideHardwareSpecs": false, 1062 | "memoryGiB": 8, 1063 | "name": "ml.c5.xlarge", 1064 | "vcpuNum": 4 1065 | }, 1066 | { 1067 | "_defaultOrder": 23, 1068 | "_isFastLaunch": false, 1069 | "category": "Compute optimized", 1070 | "gpuNum": 0, 1071 | "hideHardwareSpecs": false, 1072 | "memoryGiB": 16, 1073 | "name": "ml.c5.2xlarge", 1074 | "vcpuNum": 8 1075 | }, 1076 | { 1077 | "_defaultOrder": 24, 1078 | "_isFastLaunch": false, 1079 | "category": "Compute optimized", 1080 | "gpuNum": 0, 1081 | "hideHardwareSpecs": false, 1082 | "memoryGiB": 32, 1083 | "name": "ml.c5.4xlarge", 1084 | "vcpuNum": 16 1085 | }, 1086 | { 1087 | "_defaultOrder": 25, 1088 | "_isFastLaunch": false, 1089 | "category": "Compute optimized", 1090 | "gpuNum": 0, 1091 | "hideHardwareSpecs": false, 1092 | "memoryGiB": 72, 1093 | "name": "ml.c5.9xlarge", 1094 | "vcpuNum": 36 1095 | }, 1096 | { 1097 | "_defaultOrder": 26, 1098 | "_isFastLaunch": false, 1099 | "category": "Compute optimized", 1100 | "gpuNum": 0, 1101 | "hideHardwareSpecs": false, 1102 | "memoryGiB": 96, 1103 | "name": "ml.c5.12xlarge", 1104 | "vcpuNum": 48 1105 | }, 1106 | { 1107 | "_defaultOrder": 27, 1108 | "_isFastLaunch": false, 1109 | "category": "Compute optimized", 1110 | "gpuNum": 0, 1111 | "hideHardwareSpecs": false, 1112 | "memoryGiB": 144, 1113 | "name": "ml.c5.18xlarge", 1114 | "vcpuNum": 72 1115 | }, 1116 | { 1117 | "_defaultOrder": 28, 1118 | "_isFastLaunch": false, 1119 | "category": "Compute optimized", 1120 | "gpuNum": 0, 1121 | "hideHardwareSpecs": false, 1122 | 
"memoryGiB": 192, 1123 | "name": "ml.c5.24xlarge", 1124 | "vcpuNum": 96 1125 | }, 1126 | { 1127 | "_defaultOrder": 29, 1128 | "_isFastLaunch": true, 1129 | "category": "Accelerated computing", 1130 | "gpuNum": 1, 1131 | "hideHardwareSpecs": false, 1132 | "memoryGiB": 16, 1133 | "name": "ml.g4dn.xlarge", 1134 | "vcpuNum": 4 1135 | }, 1136 | { 1137 | "_defaultOrder": 30, 1138 | "_isFastLaunch": false, 1139 | "category": "Accelerated computing", 1140 | "gpuNum": 1, 1141 | "hideHardwareSpecs": false, 1142 | "memoryGiB": 32, 1143 | "name": "ml.g4dn.2xlarge", 1144 | "vcpuNum": 8 1145 | }, 1146 | { 1147 | "_defaultOrder": 31, 1148 | "_isFastLaunch": false, 1149 | "category": "Accelerated computing", 1150 | "gpuNum": 1, 1151 | "hideHardwareSpecs": false, 1152 | "memoryGiB": 64, 1153 | "name": "ml.g4dn.4xlarge", 1154 | "vcpuNum": 16 1155 | }, 1156 | { 1157 | "_defaultOrder": 32, 1158 | "_isFastLaunch": false, 1159 | "category": "Accelerated computing", 1160 | "gpuNum": 1, 1161 | "hideHardwareSpecs": false, 1162 | "memoryGiB": 128, 1163 | "name": "ml.g4dn.8xlarge", 1164 | "vcpuNum": 32 1165 | }, 1166 | { 1167 | "_defaultOrder": 33, 1168 | "_isFastLaunch": false, 1169 | "category": "Accelerated computing", 1170 | "gpuNum": 4, 1171 | "hideHardwareSpecs": false, 1172 | "memoryGiB": 192, 1173 | "name": "ml.g4dn.12xlarge", 1174 | "vcpuNum": 48 1175 | }, 1176 | { 1177 | "_defaultOrder": 34, 1178 | "_isFastLaunch": false, 1179 | "category": "Accelerated computing", 1180 | "gpuNum": 1, 1181 | "hideHardwareSpecs": false, 1182 | "memoryGiB": 256, 1183 | "name": "ml.g4dn.16xlarge", 1184 | "vcpuNum": 64 1185 | }, 1186 | { 1187 | "_defaultOrder": 35, 1188 | "_isFastLaunch": false, 1189 | "category": "Accelerated computing", 1190 | "gpuNum": 1, 1191 | "hideHardwareSpecs": false, 1192 | "memoryGiB": 61, 1193 | "name": "ml.p3.2xlarge", 1194 | "vcpuNum": 8 1195 | }, 1196 | { 1197 | "_defaultOrder": 36, 1198 | "_isFastLaunch": false, 1199 | "category": "Accelerated computing", 1200 | "gpuNum": 4, 1201 | "hideHardwareSpecs": false, 1202 | "memoryGiB": 244, 1203 | "name": "ml.p3.8xlarge", 1204 | "vcpuNum": 32 1205 | }, 1206 | { 1207 | "_defaultOrder": 37, 1208 | "_isFastLaunch": false, 1209 | "category": "Accelerated computing", 1210 | "gpuNum": 8, 1211 | "hideHardwareSpecs": false, 1212 | "memoryGiB": 488, 1213 | "name": "ml.p3.16xlarge", 1214 | "vcpuNum": 64 1215 | }, 1216 | { 1217 | "_defaultOrder": 38, 1218 | "_isFastLaunch": false, 1219 | "category": "Accelerated computing", 1220 | "gpuNum": 8, 1221 | "hideHardwareSpecs": false, 1222 | "memoryGiB": 768, 1223 | "name": "ml.p3dn.24xlarge", 1224 | "vcpuNum": 96 1225 | }, 1226 | { 1227 | "_defaultOrder": 39, 1228 | "_isFastLaunch": false, 1229 | "category": "Memory Optimized", 1230 | "gpuNum": 0, 1231 | "hideHardwareSpecs": false, 1232 | "memoryGiB": 16, 1233 | "name": "ml.r5.large", 1234 | "vcpuNum": 2 1235 | }, 1236 | { 1237 | "_defaultOrder": 40, 1238 | "_isFastLaunch": false, 1239 | "category": "Memory Optimized", 1240 | "gpuNum": 0, 1241 | "hideHardwareSpecs": false, 1242 | "memoryGiB": 32, 1243 | "name": "ml.r5.xlarge", 1244 | "vcpuNum": 4 1245 | }, 1246 | { 1247 | "_defaultOrder": 41, 1248 | "_isFastLaunch": false, 1249 | "category": "Memory Optimized", 1250 | "gpuNum": 0, 1251 | "hideHardwareSpecs": false, 1252 | "memoryGiB": 64, 1253 | "name": "ml.r5.2xlarge", 1254 | "vcpuNum": 8 1255 | }, 1256 | { 1257 | "_defaultOrder": 42, 1258 | "_isFastLaunch": false, 1259 | "category": "Memory Optimized", 1260 | "gpuNum": 0, 1261 | "hideHardwareSpecs": false, 1262 | 
"memoryGiB": 128, 1263 | "name": "ml.r5.4xlarge", 1264 | "vcpuNum": 16 1265 | }, 1266 | { 1267 | "_defaultOrder": 43, 1268 | "_isFastLaunch": false, 1269 | "category": "Memory Optimized", 1270 | "gpuNum": 0, 1271 | "hideHardwareSpecs": false, 1272 | "memoryGiB": 256, 1273 | "name": "ml.r5.8xlarge", 1274 | "vcpuNum": 32 1275 | }, 1276 | { 1277 | "_defaultOrder": 44, 1278 | "_isFastLaunch": false, 1279 | "category": "Memory Optimized", 1280 | "gpuNum": 0, 1281 | "hideHardwareSpecs": false, 1282 | "memoryGiB": 384, 1283 | "name": "ml.r5.12xlarge", 1284 | "vcpuNum": 48 1285 | }, 1286 | { 1287 | "_defaultOrder": 45, 1288 | "_isFastLaunch": false, 1289 | "category": "Memory Optimized", 1290 | "gpuNum": 0, 1291 | "hideHardwareSpecs": false, 1292 | "memoryGiB": 512, 1293 | "name": "ml.r5.16xlarge", 1294 | "vcpuNum": 64 1295 | }, 1296 | { 1297 | "_defaultOrder": 46, 1298 | "_isFastLaunch": false, 1299 | "category": "Memory Optimized", 1300 | "gpuNum": 0, 1301 | "hideHardwareSpecs": false, 1302 | "memoryGiB": 768, 1303 | "name": "ml.r5.24xlarge", 1304 | "vcpuNum": 96 1305 | }, 1306 | { 1307 | "_defaultOrder": 47, 1308 | "_isFastLaunch": false, 1309 | "category": "Accelerated computing", 1310 | "gpuNum": 1, 1311 | "hideHardwareSpecs": false, 1312 | "memoryGiB": 16, 1313 | "name": "ml.g5.xlarge", 1314 | "vcpuNum": 4 1315 | }, 1316 | { 1317 | "_defaultOrder": 48, 1318 | "_isFastLaunch": false, 1319 | "category": "Accelerated computing", 1320 | "gpuNum": 1, 1321 | "hideHardwareSpecs": false, 1322 | "memoryGiB": 32, 1323 | "name": "ml.g5.2xlarge", 1324 | "vcpuNum": 8 1325 | }, 1326 | { 1327 | "_defaultOrder": 49, 1328 | "_isFastLaunch": false, 1329 | "category": "Accelerated computing", 1330 | "gpuNum": 1, 1331 | "hideHardwareSpecs": false, 1332 | "memoryGiB": 64, 1333 | "name": "ml.g5.4xlarge", 1334 | "vcpuNum": 16 1335 | }, 1336 | { 1337 | "_defaultOrder": 50, 1338 | "_isFastLaunch": false, 1339 | "category": "Accelerated computing", 1340 | "gpuNum": 1, 1341 | "hideHardwareSpecs": false, 1342 | "memoryGiB": 128, 1343 | "name": "ml.g5.8xlarge", 1344 | "vcpuNum": 32 1345 | }, 1346 | { 1347 | "_defaultOrder": 51, 1348 | "_isFastLaunch": false, 1349 | "category": "Accelerated computing", 1350 | "gpuNum": 1, 1351 | "hideHardwareSpecs": false, 1352 | "memoryGiB": 256, 1353 | "name": "ml.g5.16xlarge", 1354 | "vcpuNum": 64 1355 | }, 1356 | { 1357 | "_defaultOrder": 52, 1358 | "_isFastLaunch": false, 1359 | "category": "Accelerated computing", 1360 | "gpuNum": 4, 1361 | "hideHardwareSpecs": false, 1362 | "memoryGiB": 192, 1363 | "name": "ml.g5.12xlarge", 1364 | "vcpuNum": 48 1365 | }, 1366 | { 1367 | "_defaultOrder": 53, 1368 | "_isFastLaunch": false, 1369 | "category": "Accelerated computing", 1370 | "gpuNum": 4, 1371 | "hideHardwareSpecs": false, 1372 | "memoryGiB": 384, 1373 | "name": "ml.g5.24xlarge", 1374 | "vcpuNum": 96 1375 | }, 1376 | { 1377 | "_defaultOrder": 54, 1378 | "_isFastLaunch": false, 1379 | "category": "Accelerated computing", 1380 | "gpuNum": 8, 1381 | "hideHardwareSpecs": false, 1382 | "memoryGiB": 768, 1383 | "name": "ml.g5.48xlarge", 1384 | "vcpuNum": 192 1385 | }, 1386 | { 1387 | "_defaultOrder": 55, 1388 | "_isFastLaunch": false, 1389 | "category": "Accelerated computing", 1390 | "gpuNum": 8, 1391 | "hideHardwareSpecs": false, 1392 | "memoryGiB": 1152, 1393 | "name": "ml.p4d.24xlarge", 1394 | "vcpuNum": 96 1395 | }, 1396 | { 1397 | "_defaultOrder": 56, 1398 | "_isFastLaunch": false, 1399 | "category": "Accelerated computing", 1400 | "gpuNum": 8, 1401 | "hideHardwareSpecs": false, 1402 | 
"memoryGiB": 1152, 1403 | "name": "ml.p4de.24xlarge", 1404 | "vcpuNum": 96 1405 | } 1406 | ], 1407 | "instance_type": "ml.t3.medium", 1408 | "kernelspec": { 1409 | "display_name": "Python 3 (Data Science 2.0)", 1410 | "language": "python", 1411 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:123456789012:image/sagemaker-data-science-38" 1412 | }, 1413 | "language_info": { 1414 | "codemirror_mode": { 1415 | "name": "ipython", 1416 | "version": 3 1417 | }, 1418 | "file_extension": ".py", 1419 | "mimetype": "text/x-python", 1420 | "name": "python", 1421 | "nbconvert_exporter": "python", 1422 | "pygments_lexer": "ipython3", 1423 | "version": "3.8.13" 1424 | } 1425 | }, 1426 | "nbformat": 4, 1427 | "nbformat_minor": 5 1428 | } 1429 | -------------------------------------------------------------------------------- /data_ingestion_to_vectordb/scripts/get_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import traceback 4 | 5 | from sh import cp, find, mkdir, wget 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument("--domain", type=str, default="sagemaker.readthedocs.io") 12 | parser.add_argument("--website", type=str, default="https://sagemaker.readthedocs.io/en/stable/") 13 | parser.add_argument("--output-dir", type=str, default="docs") 14 | parser.add_argument("--dryrun", action='store_true') 15 | args, _ = parser.parse_known_args() 16 | 17 | WEBSITE, DOMAIN, KB_DIR = (args.website, args.domain, args.output_dir) 18 | 19 | if args.dryrun: 20 | print(f"WEBSITE={WEBSITE}, DOMAIN={DOMAIN}, OUTPUT_DIR={KB_DIR}", file=sys.stderr) 21 | sys.exit(0) 22 | 23 | mkdir('-p', KB_DIR) 24 | 25 | try: 26 | WGET_ARGUMENTS = f"-e robots=off --recursive --no-clobber --page-requisites --html-extension --convert-links --restrict-file-names=windows --domains {DOMAIN} --no-parent {WEBSITE}" 27 | wget_argument_list = WGET_ARGUMENTS.split() 28 | wget(*wget_argument_list) 29 | except Exception as ex: 30 | traceback.print_exc() 31 | 32 | results = find(DOMAIN, '-name', '*.html') 33 | html_files = results.strip('\n').split('\n') 34 | for each in html_files: 35 | flat_i = each.replace('/', '-') 36 | cp(each, f"{KB_DIR}/{flat_i}") 37 | 38 | print(f"There are {len(html_files)} files in {KB_DIR} directory", file=sys.stderr) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() --------------------------------------------------------------------------------