├── lambda ├── utils │ ├── __init__.py │ └── bedrock.py ├── source_database_insert_sample_data.py └── embedding_function.py ├── .npmignore ├── .gitignore ├── diagrams ├── text-embeddings-pipeline.jpg └── text-embeddings-pipeline.drawio ├── screenshots └── start-dms-replication-task.png ├── .eslintrc.json ├── CODE_OF_CONDUCT.md ├── prepare-lambda-package.sh ├── prepare-lambda-package.ps1 ├── tsconfig.json ├── sample.txt ├── package.json ├── lib ├── vpc.ts ├── bastion-host.ts ├── vector-store.ts ├── embedding-function.ts ├── source-database.ts └── data-extraction.ts ├── LICENSE ├── bin └── text-embeddings-pipeline.ts ├── cdk.json ├── CONTRIBUTING.md └── README.md /lambda/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | *.ts 2 | !*.d.ts 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.js 2 | *.d.ts 3 | node_modules 4 | 5 | # CDK asset staging directory 6 | .cdk.staging 7 | cdk.out 8 | 9 | lambda_package* -------------------------------------------------------------------------------- /diagrams/text-embeddings-pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/text-embeddings-pipeline-for-rag/HEAD/diagrams/text-embeddings-pipeline.jpg -------------------------------------------------------------------------------- /screenshots/start-dms-replication-task.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/text-embeddings-pipeline-for-rag/HEAD/screenshots/start-dms-replication-task.png -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "browser": true, 4 | "es2021": true 5 | }, 6 | "extends": "standard-with-typescript", 7 | "parserOptions": { 8 | "ecmaVersion": "latest", 9 | "sourceType": "module" 10 | }, 11 | "rules": { 12 | "no-new": 0 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /prepare-lambda-package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -r lambda_package 4 | rm lambda_package.zip 5 | 6 | pip install \ 7 | --platform manylinux2014_aarch64 \ 8 | --target="./lambda_package" \ 9 | --implementation cp \ 10 | --python-version 3.11 \ 11 | --only-binary=:all: --upgrade \ 12 | boto3 langchain pgvector psycopg2-binary 13 | 14 | cp -r lambda/* lambda_package 15 | rm -r lambda_package/__pycache__ 16 | 17 | cd lambda_package 18 | zip -r ../lambda_package.zip . 19 | 20 | cd .. 
-------------------------------------------------------------------------------- /prepare-lambda-package.ps1: -------------------------------------------------------------------------------- 1 | if (Test-Path -LiteralPath "lambda_package") { 2 | rm lambda_package -r -force 3 | } 4 | rm -fo lambda_package.zip 5 | 6 | pip install --no-user --platform manylinux2014_aarch64 --target="lambda_package" --implementation cp --python-version 3.11 --only-binary=:all: --upgrade boto3 langchain pgvector psycopg2-binary 7 | 8 | Copy-Item -Path "lambda\*" -Destination "lambda_package" -Recurse 9 | rm lambda_package\__pycache__ -r -force 10 | 11 | Add-Type -Assembly "System.IO.Compression.FileSystem" ; 12 | [System.IO.Compression.ZipFile]::CreateFromDirectory("lambda_package", "lambda_package.zip") -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "commonjs", 5 | "lib": [ 6 | "es2020", 7 | "dom" 8 | ], 9 | "declaration": true, 10 | "strict": true, 11 | "noImplicitAny": true, 12 | "strictNullChecks": true, 13 | "noImplicitThis": true, 14 | "alwaysStrict": true, 15 | "noUnusedLocals": false, 16 | "noUnusedParameters": false, 17 | "noImplicitReturns": true, 18 | "noFallthroughCasesInSwitch": false, 19 | "inlineSourceMap": true, 20 | "inlineSources": true, 21 | "experimentalDecorators": true, 22 | "strictPropertyInitialization": false, 23 | "typeRoots": [ 24 | "./node_modules/@types" 25 | ] 26 | }, 27 | "exclude": [ 28 | "node_modules", 29 | "cdk.out" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /sample.txt: -------------------------------------------------------------------------------- 1 | AWS Health provides improved visibility into planned lifecycle events 2 | 3 | Posted On: Nov 9, 2023 4 | 5 | AWS Health introduces new features to 
help you manage planned lifecycle events, such as Amazon EKS Kubernetes version end of standard support, Amazon RDS certificate rotations, and end of support for other open source software. AWS Health is the authoritative source of information about service events and scheduled changes affecting your AWS cloud resources. 6 | 7 | These new features provide timely visibility into upcoming planned lifecycle events, a standardized data format that allows you to prepare and take actions, as well as the ability to dynamically track the completion of required actions at the resource-level. AWS Health also provides organization-wide visibility into planned lifecycle events for teams that manage workloads across the company. 8 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "text-embeddings-pipeline", 3 | "version": "0.1.0", 4 | "bin": { 5 | "text-embeddings-pipeline": "bin/text-embeddings-pipeline.js" 6 | }, 7 | "scripts": { 8 | "build": "tsc", 9 | "watch": "tsc -w", 10 | "cdk": "cdk" 11 | }, 12 | "devDependencies": { 13 | "@types/node": "20.9.4", 14 | "@typescript-eslint/eslint-plugin": "^6.12.0", 15 | "@typescript-eslint/parser": "^6.12.0", 16 | "aws-cdk": "2.110.1", 17 | "eslint": "^8.54.0", 18 | "eslint-config-standard-with-typescript": "^40.0.0", 19 | "eslint-plugin-import": "^2.29.0", 20 | "eslint-plugin-n": "^16.3.1", 21 | "eslint-plugin-promise": "^6.1.1", 22 | "ts-node": "^10.9.1", 23 | "typescript": "~5.3.2" 24 | }, 25 | "dependencies": { 26 | "aws-cdk-lib": "2.110.1", 27 | "constructs": "^10.3.0", 28 | "source-map-support": "^0.5.21" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /lib/vpc.ts: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | import { type Construct } from 'constructs' 5 | import { Stack, type StackProps, aws_ec2 as ec2 } from 'aws-cdk-lib' 6 | 7 | export class Vpc extends Stack { 8 | public Vpc: ec2.Vpc 9 | 10 | constructor (scope: Construct, id: string, props?: StackProps) { 11 | super(scope, id, props) 12 | 13 | this.Vpc = new ec2.Vpc(this, 'vpc', { 14 | ipAddresses: ec2.IpAddresses.cidr('20.0.0.0/16'), 15 | maxAzs: 2, 16 | natGateways: 1, 17 | vpcName: id, 18 | subnetConfiguration: [ 19 | { 20 | cidrMask: 24, 21 | name: 'Public', 22 | subnetType: ec2.SubnetType.PUBLIC 23 | }, 24 | { 25 | cidrMask: 24, 26 | name: 'Private', 27 | subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS 28 | } 29 | ] 30 | }) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
17 | 18 | -------------------------------------------------------------------------------- /bin/text-embeddings-pipeline.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | // SPDX-License-Identifier: MIT-0 5 | 6 | import 'source-map-support/register' 7 | import * as cdk from 'aws-cdk-lib' 8 | 9 | import { Vpc } from '../lib/vpc' 10 | import { VectorStore } from '../lib/vector-store' 11 | import { EmbeddingFunction } from '../lib/embedding-function' 12 | import { SourceDatabase } from '../lib/source-database' 13 | import { DataExtraction } from '../lib/data-extraction' 14 | import { BastionHost } from '../lib/bastion-host' 15 | 16 | const app = new cdk.App() 17 | const prefix = 'text-embeddings-pipeline-' 18 | 19 | const vpc = new Vpc(app, prefix + 'vpc', {}).Vpc 20 | 21 | new VectorStore(app, prefix + 'vector-store', vpc, {}) 22 | 23 | const embeddingFunction = new EmbeddingFunction( 24 | app, 25 | prefix + 'embedding-function', 26 | vpc, 27 | {} 28 | ) 29 | 30 | new BastionHost(app, prefix + 'bastion-host', vpc, {}) 31 | 32 | const sourceDatabase = new SourceDatabase( 33 | app, 34 | prefix + 'source-database', 35 | vpc, 36 | {} 37 | ) 38 | 39 | const dataExtraction = new DataExtraction( 40 | app, 41 | prefix + 'data-extraction', 42 | vpc, 43 | embeddingFunction.S3Bucket, 44 | {} 45 | ) 46 | dataExtraction.addDependency(sourceDatabase) 47 | dataExtraction.addDependency(embeddingFunction) 48 | -------------------------------------------------------------------------------- /lib/bastion-host.ts: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | import { type Construct } from 'constructs' 5 | import { Stack, type StackProps, aws_ec2 as ec2 } from 'aws-cdk-lib' 6 | 7 | export class BastionHost extends Stack { 8 | constructor (scope: Construct, id: string, vpc: ec2.Vpc, props?: StackProps) { 9 | super(scope, id, props) 10 | 11 | const securityGroup = new ec2.SecurityGroup(this, id + '-sg', { 12 | vpc, 13 | allowAllOutbound: true 14 | }) 15 | securityGroup.addIngressRule( 16 | ec2.Peer.anyIpv4(), 17 | ec2.Port.tcp(22), 18 | 'Allow SSH access' 19 | ) 20 | 21 | const instance = new ec2.Instance(this, id, { 22 | blockDevices: [ 23 | { 24 | deviceName: '/dev/xvda', 25 | volume: ec2.BlockDeviceVolume.ebs(8, { 26 | deleteOnTermination: true, 27 | encrypted: true, 28 | volumeType: ec2.EbsDeviceVolumeType.GP3 29 | }) 30 | } 31 | ], 32 | instanceName: id, 33 | instanceType: ec2.InstanceType.of( 34 | ec2.InstanceClass.T4G, 35 | ec2.InstanceSize.MICRO 36 | ), 37 | keyName: 'EC2DefaultKeyPair', 38 | machineImage: ec2.MachineImage.latestAmazonLinux2023({ 39 | cpuType: ec2.AmazonLinuxCpuType.ARM_64 40 | }), 41 | securityGroup, 42 | vpc, 43 | vpcSubnets: { subnetType: ec2.SubnetType.PUBLIC } 44 | }) 45 | 46 | instance.addUserData( 47 | ['sudo dnf update -y', 'sudo dnf install postgresql15 -y'].join('\n') 48 | ) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /lambda/source_database_insert_sample_data.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | 4 | import json 5 | import psycopg2 6 | import boto3 7 | from botocore.exceptions import ClientError 8 | 9 | def lambda_handler(event, context): 10 | # Retrieve database credentials from AWS Secrets Manager 11 | db_credential = get_db_credential() 12 | 13 | # Create a connection to the database and a cursor 14 | conn = psycopg2.connect(dbname=db_credential["username"], user=db_credential["username"], password=db_credential["password"], host=db_credential["host"], port=int(db_credential["port"])) 15 | cur = conn.cursor() 16 | 17 | # Create table and insert sample data 18 | cur.execute("CREATE TABLE faqs (question VARCHAR(250), answer VARCHAR(2000));") 19 | cur.execute("INSERT INTO faqs (question, answer) VALUES (%s, %s)", ("What is text embeddings pipeline?", "Text embeddings pipeline allows you to create embeddings of your contextual knowledge and store it in a vector store.")) 20 | conn.commit() 21 | 22 | # Close cursor and connection 23 | cur.close() 24 | conn.close() 25 | 26 | def get_db_credential(): 27 | secret_name = "text-embeddings-pipeline-source-database" 28 | 29 | # Create a Secrets Manager client 30 | session = boto3.session.Session() 31 | client = session.client( 32 | service_name='secretsmanager' 33 | ) 34 | 35 | try: 36 | get_secret_value_response = client.get_secret_value( 37 | SecretId=secret_name 38 | ) 39 | except ClientError as e: 40 | # For a list of exceptions thrown, see 41 | # https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html 42 | raise e 43 | 44 | # Decrypts secret using the associated KMS key. 45 | return json.loads(get_secret_value_response['SecretString']) -------------------------------------------------------------------------------- /lib/vector-store.ts: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | import { type Construct } from 'constructs' 5 | import { 6 | Stack, 7 | type StackProps, 8 | aws_ec2 as ec2, 9 | aws_rds as rds, 10 | Duration 11 | } from 'aws-cdk-lib' 12 | import { RetentionDays } from 'aws-cdk-lib/aws-logs' 13 | 14 | export class VectorStore extends Stack { 15 | public VectorStore: rds.DatabaseInstance 16 | 17 | constructor (scope: Construct, id: string, vpc: ec2.Vpc, props?: StackProps) { 18 | super(scope, id, props) 19 | 20 | // Create a security group for the RDS PostgreSQL database instance that is used as a vector store 21 | const vectorStoreSg = new ec2.SecurityGroup(this, 'vector-store-sg', { 22 | vpc, 23 | description: 'Allow connection to RDS PostgreSQL Database Instance', 24 | allowAllOutbound: true, 25 | disableInlineRules: true, 26 | securityGroupName: id 27 | }) 28 | // This will add the rule as an external cloud formation construct 29 | vectorStoreSg.addIngressRule( 30 | ec2.Peer.anyIpv4(), 31 | ec2.Port.tcp(5432), 32 | 'Allow connection to RDS PostgreSQL Database Instance' 33 | ) 34 | 35 | // Create an RDS PostgreSQL database instance that is used as a vector store 36 | this.VectorStore = new rds.DatabaseInstance(this, 'vector-store', { 37 | engine: rds.DatabaseInstanceEngine.postgres({ 38 | version: rds.PostgresEngineVersion.of('16.1', '16') 39 | }), 40 | // Generate the secret with admin username `postgres` and random password 41 | credentials: rds.Credentials.fromGeneratedSecret('postgres', { 42 | secretName: id 43 | }), 44 | allocatedStorage: 50, 45 | backupRetention: Duration.days(0), 46 | caCertificate: rds.CaCertificate.RDS_CA_RDS2048_G1, 47 | cloudwatchLogsRetention: RetentionDays.ONE_DAY, 48 | deleteAutomatedBackups: true, 49 | instanceIdentifier: id, 50 | instanceType: ec2.InstanceType.of( 51 | ec2.InstanceClass.T4G, 52 | ec2.InstanceSize.MEDIUM 53 | ), 54 | securityGroups: [vectorStoreSg], 55 | storageType: rds.StorageType.GP3, 56 | vpcSubnets: { subnetType: 
ec2.SubnetType.PRIVATE_WITH_EGRESS }, 57 | vpc 58 | }) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /lambda/utils/bedrock.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | """Helper utilities for working with Amazon Bedrock from Python notebooks""" 4 | # Python Built-Ins: 5 | import os 6 | from typing import Optional 7 | 8 | # External Dependencies: 9 | import boto3 10 | from botocore.config import Config 11 | 12 | 13 | def get_bedrock_client( 14 | assumed_role: Optional[str] = None, 15 | region: Optional[str] = None, 16 | runtime: Optional[bool] = True, 17 | ): 18 | """Create a boto3 client for Amazon Bedrock, with optional configuration overrides 19 | 20 | Parameters 21 | ---------- 22 | assumed_role : 23 | Optional ARN of an AWS IAM role to assume for calling the Bedrock service. If not 24 | specified, the current active credentials will be used. 25 | region : 26 | Optional name of the AWS Region in which the service should be called (e.g. "us-east-1"). 27 | If not specified, AWS_REGION or AWS_DEFAULT_REGION environment variable will be used. 28 | runtime : 29 | Optional choice of getting different client to perform operations with the Amazon Bedrock service. 
30 | """ 31 | if region is None: 32 | target_region = os.environ.get("AWS_REGION", os.environ.get("AWS_DEFAULT_REGION")) 33 | else: 34 | target_region = region 35 | 36 | print(f"Create new client (region: {target_region})") 37 | session_kwargs = {"region_name": target_region} 38 | client_kwargs = {**session_kwargs} 39 | 40 | profile_name = os.environ.get("AWS_PROFILE") 41 | if profile_name: 42 | print(f" Using profile: {profile_name}") 43 | session_kwargs["profile_name"] = profile_name 44 | 45 | retry_config = Config( 46 | region_name=target_region, 47 | retries={ 48 | "max_attempts": 10, 49 | "mode": "standard", 50 | }, 51 | ) 52 | session = boto3.Session(**session_kwargs) 53 | 54 | if assumed_role: 55 | print(f" Using role: {assumed_role}", end='') 56 | sts = session.client("sts") 57 | response = sts.assume_role( 58 | RoleArn=str(assumed_role), 59 | RoleSessionName="langchain-llm-1" 60 | ) 61 | print(" ... successful!") 62 | client_kwargs["aws_access_key_id"] = response["Credentials"]["AccessKeyId"] 63 | client_kwargs["aws_secret_access_key"] = response["Credentials"]["SecretAccessKey"] 64 | client_kwargs["aws_session_token"] = response["Credentials"]["SessionToken"] 65 | 66 | if runtime: 67 | service_name='bedrock-runtime' 68 | else: 69 | service_name='bedrock' 70 | 71 | bedrock_client = session.client( 72 | service_name=service_name, 73 | config=retry_config, 74 | **client_kwargs 75 | ) 76 | 77 | print("boto3 Bedrock client successfully created! 
" + str(bedrock_client._endpoint)) 78 | return bedrock_client 79 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "npx ts-node --prefer-ts-exts bin/text-embeddings-pipeline.ts", 3 | "watch": { 4 | "include": ["**"], 5 | "exclude": [ 6 | "README.md", 7 | "cdk*.json", 8 | "**/*.d.ts", 9 | "**/*.js", 10 | "tsconfig.json", 11 | "package*.json", 12 | "yarn.lock", 13 | "node_modules", 14 | "test" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 19 | "@aws-cdk/core:checkSecretUsage": true, 20 | "@aws-cdk/core:target-partitions": ["aws", "aws-cn"], 21 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 22 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 23 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 24 | "@aws-cdk/aws-iam:minimizePolicies": true, 25 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 26 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 27 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 28 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 29 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 30 | "@aws-cdk/core:enablePartitionLiterals": true, 31 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 32 | "@aws-cdk/aws-iam:standardizedServicePrincipals": true, 33 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 34 | "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, 35 | "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, 36 | "@aws-cdk/aws-route53-patters:useCertificate": true, 37 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false, 38 | "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, 39 | "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, 40 | 
"@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, 41 | "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, 42 | "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, 43 | "@aws-cdk/aws-redshift:columnId": true, 44 | "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, 45 | "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, 46 | "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, 47 | "@aws-cdk/aws-kms:aliasNameRef": true, 48 | "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, 49 | "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, 50 | "@aws-cdk/aws-efs:denyAnonymousAccess": true, 51 | "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, 52 | "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, 53 | "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, 54 | "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, 55 | "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, 56 | "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, 57 | "@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 
8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. 
As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /lib/embedding-function.ts: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | import { type Construct } from 'constructs' 5 | import { 6 | Stack, 7 | type StackProps, 8 | RemovalPolicy, 9 | aws_ec2 as ec2, 10 | aws_iam as iam, 11 | aws_lambda as _lambda, 12 | aws_s3 as s3, 13 | aws_s3_notifications as s3Notifications, 14 | Duration 15 | } from 'aws-cdk-lib' 16 | 17 | export class EmbeddingFunction extends Stack { 18 | public S3Bucket: s3.Bucket 19 | 20 | constructor (scope: Construct, id: string, vpc: ec2.Vpc, props?: StackProps) { 21 | super(scope, id, props) 22 | 23 | // Create an S3 bucket to store the objects 24 | this.S3Bucket = new s3.Bucket(this, 's3-bucket', { 25 | blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL, 26 | encryption: s3.BucketEncryption.S3_MANAGED, 27 | removalPolicy: RemovalPolicy.DESTROY, 28 | bucketName: id + '-bucket' 29 | }) 30 | 31 | // IAM role for Lambda 32 | const lambdaExecutionRole = new iam.Role(this, 'lambda-execution-role', { 33 | assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'), 34 | roleName: id, 35 | managedPolicies: [ 36 | iam.ManagedPolicy.fromAwsManagedPolicyName( 37 | 'service-role/AWSLambdaBasicExecutionRole' 38 | ) 39 | ] 40 | }) 41 | // Required to get the secret values from AWS Secrets Manager 42 | lambdaExecutionRole.addToPolicy( 43 | new iam.PolicyStatement({ 44 | resources: [ 45 | 'arn:aws:secretsmanager:' + 46 | this.region + 47 | ':' + 48 | this.account + 49 | ':secret:text-embeddings-pipeline-vector-store-*' 50 | ], 51 | actions: ['secretsmanager:GetSecretValue'] 52 | }) 53 | ) 54 | // Required to get the object content from the Amazon S3 bucket 55 | lambdaExecutionRole.addToPolicy( 56 | new iam.PolicyStatement({ 57 | resources: [this.S3Bucket.bucketArn + '/*'], 58 | actions: ['s3:GetObject*'] 59 | }) 60 | ) 61 | // Required to invoke a model in Amazon Bedrock and be part of the VPC 62 | lambdaExecutionRole.addToPolicy( 63 | new iam.PolicyStatement({ 64 | resources: ['*'], 65 | actions: [ 66 | 'bedrock:InvokeModel', 67 | 
'ec2:*NetworkInterface', 68 | 'ec2:DescribeNetworkInterfaces' 69 | ] 70 | }) 71 | ) 72 | 73 | // Create a Lambda function to convert text into embeddings 74 | const embeddingFunction = new _lambda.Function(this, 'embedding-function', { 75 | architecture: _lambda.Architecture.ARM_64, 76 | code: _lambda.Code.fromAsset('lambda_package.zip'), 77 | handler: 'embedding_function.lambda_handler', 78 | runtime: _lambda.Runtime.PYTHON_3_11, 79 | functionName: id, 80 | memorySize: 512, 81 | role: lambdaExecutionRole, 82 | timeout: Duration.seconds(15), 83 | vpc, 84 | vpcSubnets: { subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS } 85 | }) 86 | 87 | // Event trigger for .txt files 88 | this.S3Bucket.addEventNotification( 89 | s3.EventType.OBJECT_CREATED, 90 | new s3Notifications.LambdaDestination(embeddingFunction), 91 | { 92 | suffix: '.txt' 93 | } 94 | ) 95 | // Event trigger for .csv files 96 | this.S3Bucket.addEventNotification( 97 | s3.EventType.OBJECT_CREATED, 98 | new s3Notifications.LambdaDestination(embeddingFunction), 99 | { 100 | suffix: '.csv' 101 | } 102 | ) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /lambda/embedding_function.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | 4 | import os 5 | import json 6 | import boto3 7 | from botocore.exceptions import ClientError 8 | import langchain 9 | from langchain.embeddings import BedrockEmbeddings 10 | from langchain.text_splitter import RecursiveCharacterTextSplitter 11 | from langchain.vectorstores.pgvector import PGVector 12 | 13 | from langchain.indexes import SQLRecordManager 14 | from langchain.indexes import index 15 | import sqlalchemy 16 | 17 | from utils import bedrock 18 | 19 | def lambda_handler(event, context): 20 | # Get content of uploaded object 21 | s3 = boto3.client('s3') 22 | s3_details = event["Records"][0]["s3"] 23 | response = s3.get_object(Bucket=s3_details["bucket"]["name"], Key=s3_details["object"]["key"]) 24 | content = response['Body'].read().decode('utf-8') 25 | 26 | # Set up client for Amazon Bedrock 27 | boto3_bedrock = bedrock.get_bedrock_client(region="us-east-1") 28 | br_embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1", client=boto3_bedrock) 29 | 30 | splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0) 31 | docs = splitter.create_documents([content], metadatas=[{"source": s3_details["object"]["key"]}]); 32 | print(f"Number of documents after split and chunking = {len(docs)}") 33 | 34 | # Retrieve database credentials from AWS Secrets Manager 35 | db_credential = get_db_credential() 36 | pgvector_connection_string = PGVector.connection_string_from_db_params( 37 | driver="psycopg2", 38 | host=db_credential["host"], 39 | port=int(db_credential["port"]), 40 | database=db_credential["username"], 41 | user=db_credential["username"], 42 | password=db_credential["password"], 43 | ) 44 | 45 | # Record Manager is used to load and keep in sync documents from any source into a vector store 46 | # https://blog.langchain.dev/syncing-data-sources-to-vector-stores/ 47 | collection_name = "knowledge_base" 48 | namespace = f"pgvector/{collection_name}" 49 | record_manager = SQLRecordManager( 
50 | namespace, engine=sqlalchemy.create_engine("postgresql+psycopg2://postgres:" + db_credential["password"] + "@" + db_credential["host"] + "/postgres") 51 | ) 52 | record_manager.create_schema() 53 | 54 | # Create vector store 55 | vectorstore_pgvector_aws = PGVector(pgvector_connection_string, br_embeddings, collection_name=collection_name) 56 | 57 | # Create embeddings and store in vector store 58 | index( 59 | docs_source=docs, 60 | record_manager=record_manager, 61 | vector_store=vectorstore_pgvector_aws, 62 | cleanup="incremental", 63 | source_id_key="source" 64 | ) 65 | 66 | # Performing a query for testing 67 | print("Performing a query for testing") 68 | print("-" * 35) 69 | query = "How do the new features of AWS Health help me?" 70 | docs_with_score = vectorstore_pgvector_aws.similarity_search_with_score(query) 71 | for doc, score in docs_with_score: 72 | print("Score: ", score) 73 | print(doc.page_content) 74 | print("-" * 35) 75 | 76 | print("boto3: " + boto3.__version__ + ", langchain: " + langchain.__version__) 77 | 78 | def get_db_credential(): 79 | secret_name = "text-embeddings-pipeline-vector-store" 80 | 81 | # Create a Secrets Manager client 82 | session = boto3.session.Session() 83 | client = session.client( 84 | service_name='secretsmanager' 85 | ) 86 | 87 | try: 88 | get_secret_value_response = client.get_secret_value( 89 | SecretId=secret_name 90 | ) 91 | except ClientError as e: 92 | # For a list of exceptions thrown, see 93 | # https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html 94 | raise e 95 | 96 | # Decrypts secret using the associated KMS key. 97 | return json.loads(get_secret_value_response['SecretString']) -------------------------------------------------------------------------------- /lib/source-database.ts: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: MIT-0

import { type Construct } from 'constructs'
import {
  Stack,
  type StackProps,
  aws_ec2 as ec2,
  aws_rds as rds,
  aws_iam as iam,
  aws_lambda as _lambda,
  Duration
} from 'aws-cdk-lib'
import { RetentionDays } from 'aws-cdk-lib/aws-logs'
import * as triggers from 'aws-cdk-lib/triggers'

/**
 * Stack that provisions the source RDS PostgreSQL instance and a Lambda
 * function — triggered once after deployment — that inserts sample data
 * into it. The database acts as the DMS replication source.
 */
export class SourceDatabase extends Stack {
  // The source RDS instance, exposed so sibling stacks (e.g. DMS) can reference it.
  public Database: rds.DatabaseInstance

  /**
   * @param scope CDK app or parent construct.
   * @param id    Stack id; reused as instance identifier, secret name, SG name and role name.
   * @param vpc   VPC whose PRIVATE_WITH_EGRESS subnets host the database and Lambda.
   * @param props Optional standard stack props.
   */
  constructor (scope: Construct, id: string, vpc: ec2.Vpc, props?: StackProps) {
    super(scope, id, props)

    // Security group for source database
    const securityGroup = new ec2.SecurityGroup(this, 'source-database-sg', {
      vpc,
      description: 'Allow connection to RDS PostgreSQL Database Instance',
      allowAllOutbound: true,
      disableInlineRules: true,
      securityGroupName: id
    })
    // This will add the rule as an external cloud formation construct
    // NOTE(review): open to anyIpv4 on 5432 — the README flags this sample's
    // anti-patterns; tighten to specific CIDRs/SGs before production use.
    securityGroup.addIngressRule(
      ec2.Peer.anyIpv4(),
      ec2.Port.tcp(5432),
      'Allow connection to RDS PostgreSQL Database Instance'
    )

    // Creates the source database. Engine version must be supported by DMS version
    this.Database = new rds.DatabaseInstance(this, 'source-database', {
      engine: rds.DatabaseInstanceEngine.postgres({
        version: rds.PostgresEngineVersion.VER_15_4
      }),
      // Generate the secret with admin username `postgres` and random password
      credentials: rds.Credentials.fromGeneratedSecret('postgres', {
        secretName: id
      }),
      allocatedStorage: 50,
      // Backups disabled and automated backups deleted on purpose (cost-saving
      // sample configuration; see README warning).
      backupRetention: Duration.days(0),
      caCertificate: rds.CaCertificate.RDS_CA_RDS2048_G1,
      cloudwatchLogsRetention: RetentionDays.ONE_DAY,
      deleteAutomatedBackups: true,
      instanceIdentifier: id,
      instanceType: ec2.InstanceType.of(
        ec2.InstanceClass.T4G,
        ec2.InstanceSize.MEDIUM
      ),
      securityGroups: [securityGroup],
      storageType: rds.StorageType.GP3,
      vpcSubnets: { subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS },
      vpc
    })

    // Execution role for the sample-data Lambda (CloudWatch Logs via the
    // AWS-managed basic execution policy).
    const lambdaExecutionRole = new iam.Role(this, 'lambda-execution-role', {
      assumedBy: new iam.ServicePrincipal('lambda.amazonaws.com'),
      roleName: id,
      managedPolicies: [
        iam.ManagedPolicy.fromAwsManagedPolicyName(
          'service-role/AWSLambdaBasicExecutionRole'
        )
      ]
    })
    // Required to get the secret values from AWS Secrets Manager
    lambdaExecutionRole.addToPolicy(
      new iam.PolicyStatement({
        resources: [this.Database.secret?.secretArn ?? ''],
        actions: ['secretsmanager:GetSecretValue']
      })
    )
    // Required to be part of the VPC
    lambdaExecutionRole.addToPolicy(
      new iam.PolicyStatement({
        resources: ['*'],
        actions: ['ec2:*NetworkInterface', 'ec2:DescribeNetworkInterfaces']
      })
    )

    // Create a Lambda function to insert sample data into the source database.
    // The deployment asset (lambda_package.zip) is produced by
    // prepare-lambda-package.sh / .ps1 before `cdk deploy`.
    const insertSampleDataFunction = new _lambda.Function(
      this,
      'source-database-insert-sample-data-function',
      {
        architecture: _lambda.Architecture.ARM_64,
        code: _lambda.Code.fromAsset('lambda_package.zip'),
        handler: 'source_database_insert_sample_data.lambda_handler',
        runtime: _lambda.Runtime.PYTHON_3_11,
        functionName: id + '-insert-sample-data',
        memorySize: 128,
        role: lambdaExecutionRole,
        timeout: Duration.seconds(15),
        vpc,
        vpcSubnets: { subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS }
      }
    )

    // Trigger the Lambda function above once the stack is deployed.
    // Explicit dependencies ensure the database (and its secret) exist before
    // the trigger invokes the function.
    const functionTrigger = new triggers.Trigger(
      this,
      'source-database-insert-sample-data-trigger',
      {
        handler: insertSampleDataFunction
      }
    )
    functionTrigger.node.addDependency(this.Database)
    functionTrigger.node.addDependency(insertSampleDataFunction)
  }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text Embeddings Pipeline for Retrieval Augmented Generation (RAG)

This solution is a pipeline to convert contextual knowledge stored in documents and databases into text embeddings, and store them in a vector store. Applications built with Large Language Models (LLMs) can perform a similarity search on the vector store to retrieve the contextual knowledge before generating a response.
This technique is known as Retrieval Augmented Generation (RAG), and it is often used to improve the quality and accuracy of the responses. 4 | 5 | ## ❗ Warning ❗ 6 | 7 | - **Review and change the configurations before using it for production**: the current configuration should not be used for production without further review and adaptation. Many anti-patterns are adopted to save cost, such as disabling backups and multi-AZ. 8 | 9 | - **Be mindful of the costs incurred**: while this solution is developed to be cost-effective, please be mindful of the costs incurred. 10 | 11 | ## Architecture 12 | 13 | !["Architecture"](diagrams/text-embeddings-pipeline.jpg "Architecture") 14 | 15 | ## Prerequisites 16 | 17 | 1. [AWS CDK CLI installed](https://docs.aws.amazon.com/cdk/v2/guide/cli.html) 18 | 2. [AWS CLI set up with a default profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html#cli-configure-files-methods) 19 | 3. [Python v3.11 installed](https://www.python.org/downloads/) 20 | 21 | ## Setup 22 | 23 | 1. Clone this repository. 24 | 25 | 2. Create an EC2 Key Pair named "EC2DefaultKeyPair" in your AWS account. 26 | 27 | 3. Install dependencies. 28 | 29 | ```bash 30 | npm install 31 | ``` 32 | 33 | 4. Bootstrap your AWS account with CDK Toolkit (if not done for your AWS account yet). 34 | 35 | ```bash 36 | cdk bootstrap 37 | ``` 38 | 39 | 5. Package Lambda function and its dependencies. 40 | 41 | - macOS: `sh prepare-lambda-package.sh` 42 | - Windows: `.\prepare-lambda-package.ps1` 43 | 44 | 6. Deploy the CDK stacks. 45 | 46 | ```bash 47 | cdk deploy --all --require-approval never 48 | ``` 49 | 50 | 7. While waiting for the previous step to complete, go to [Amazon Bedrock](https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/modelaccess) in us-east-1 and grant access to "Amazon Titan Embeddings G1 - Text". 51 | 52 | ## Walkthrough 53 | 54 | 1. There are two ways to upload data to the S3 bucket. 
55 | 56 | - (a) Upload a .txt file with some content (sample.txt is an example) to the S3 bucket created by one of the stacks. 57 | 58 | - (b) Start the DMS replication task in the AWS management console. The data from the source database will be replicated to the S3 bucket and stored in .csv files. 59 | 60 | !["Start DMS replication task"](screenshots/start-dms-replication-task.png "Start DMS replication task") 61 | 62 | The Lambda function will create text embeddings of the content in .txt / .csv files and store them in the vector store. 63 | 64 | 2. Connect (SSH or instance connect) to the bastion host. Run the following command (and provide the password) to authenticate, replacing `<vector-store-endpoint>` with the endpoint of the vector store database. The credentials can be found in the "text-embeddings-pipeline-vector-store" secret in AWS Secrets Manager. 65 | 66 | ```bash 67 | psql --port=5432 --dbname=postgres --username=postgres --host=<vector-store-endpoint> 68 | ``` 69 | 70 | 3. Run the ```\dt``` command to list the database tables. Tables with names starting with the prefix "langchain" are created by LangChain automatically as it creates and stores the embeddings. 71 | 72 | ``` 73 | List of relations 74 | Schema | Name | Type | Owner 75 | --------+-------------------------+-------+---------- 76 | public | langchain_pg_collection | table | postgres 77 | public | langchain_pg_embedding | table | postgres 78 | public | upsertion_record | table | postgres 79 | (3 rows) 80 | ``` 81 | 82 | 4. The documents and embeddings are stored in the "langchain_pg_embedding" table. You can see the truncated values (actual values are too long) by running the following commands. 83 | 84 | - ```SELECT embedding::varchar(80) FROM langchain_pg_embedding;``` 85 | ``` 86 | embedding 87 | ---------------------------------------------------------------------------------- 88 | [-0.005340576,-0.61328125,0.13769531,0.7890625,0.4296875,-0.13671875,-0.01379394 ... 89 | [0.59375,-0.23339844,0.45703125,-0.14257812,-0.18164062,0.0030517578,-0.00933837 ...
90 | (2 rows) 91 | ``` 92 | - ```SELECT document::varchar(80) FROM langchain_pg_embedding;``` 93 | ``` 94 | document 95 | ---------------------------------------------------------------------------------- 96 | What is text embeddings pipeline?,Text embeddings pipeline allows you to create ... 97 | AWS Health provides improved visibility into planned lifecycle events ... 98 | (2 rows) 99 | ``` 100 | 101 | ## Clean Up 102 | 103 | 1. Destroy all CDK stacks. 104 | 105 | ```bash 106 | cdk destroy --all 107 | ``` 108 | 109 | ## Security 110 | 111 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 112 | 113 | ## License 114 | 115 | This library is licensed under the MIT-0 License. See the LICENSE file. -------------------------------------------------------------------------------- /lib/data-extraction.ts: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: MIT-0

import { type Construct } from 'constructs'
import {
  Stack,
  type StackProps,
  aws_ec2 as ec2,
  aws_iam as iam,
  aws_dms as dms,
  type aws_s3 as s3
} from 'aws-cdk-lib'

/**
 * Stack that wires up AWS DMS to extract data from the source PostgreSQL
 * database into an S3 bucket: IAM roles, a replication subnet group and
 * instance, source/target endpoints, and a full-load replication task
 * (started manually from the console, per the README walkthrough).
 */
export class DataExtraction extends Stack {
  /**
   * @param scope    CDK app or parent construct.
   * @param id       Stack id; reused for DMS resource identifiers.
   * @param vpc      VPC whose private subnets host the replication instance.
   * @param s3Bucket Target bucket that receives the replicated .csv files.
   * @param props    Optional standard stack props.
   */
  constructor (
    scope: Construct,
    id: string,
    vpc: ec2.Vpc,
    s3Bucket: s3.Bucket,
    props?: StackProps
  ) {
    super(scope, id, props)

    // This role is required for DMS to work.
    // NOTE(review): the name `dms-vpc-role` is service-mandated — DMS looks it
    // up by this exact name to manage VPC resources.
    const dmsVpcRole = new iam.Role(this, 'dms-vpc-role', {
      assumedBy: new iam.ServicePrincipal('dms.amazonaws.com'),
      roleName: 'dms-vpc-role',
      managedPolicies: [
        iam.ManagedPolicy.fromManagedPolicyArn(
          this,
          'dms-vpc-management-role-policy',
          'arn:aws:iam::aws:policy/service-role/AmazonDMSVPCManagementRole'
        )
      ]
    })

    // This role is required for DMS to get secret from Secrets Manager.
    // Assumed by the regional DMS principal; scoped to the source-database secret.
    const secretManagerAccessRole = new iam.Role(
      this,
      'dms-secret-manager-access-role',
      {
        assumedBy: new iam.ServicePrincipal(
          'dms.' + this.region + '.amazonaws.com'
        ),
        roleName: 'dms-secret-manager-access',
        inlinePolicies: {
          dmsSecretManagerAccess: new iam.PolicyDocument({
            statements: [
              new iam.PolicyStatement({
                resources: [
                  'arn:aws:secretsmanager:' +
                    this.region +
                    ':' +
                    this.account +
                    ':secret:text-embeddings-pipeline-source-database-*'
                ],
                actions: ['secretsmanager:GetSecretValue']
              })
            ]
          })
        }
      }
    )

    // This role is required for DMS to put object in S3 (bucket and its objects).
    const s3AccessRole = new iam.Role(this, 'dms-s3-access-role', {
      assumedBy: new iam.ServicePrincipal(
        'dms.' + this.region + '.amazonaws.com'
      ),
      roleName: 'dms-s3-access',
      inlinePolicies: {
        dmsS3Access: new iam.PolicyDocument({
          statements: [
            new iam.PolicyStatement({
              resources: [s3Bucket.bucketArn, s3Bucket.bucketArn + '/*'],
              actions: [
                's3:ListBucket',
                's3:PutObject',
                's3:PutObjectTagging',
                's3:DeleteObject'
              ]
            })
          ]
        })
      }
    })

    // Create a subnet group for the replication instance
    const replicationSubnetGroup = new dms.CfnReplicationSubnetGroup(
      this,
      'dms-subnet-group',
      {
        replicationSubnetGroupIdentifier: id,
        replicationSubnetGroupDescription:
          'Subnets that have access to source and target.',
        subnetIds: vpc.privateSubnets.map((s) => s.subnetId)
      }
    )
    // dms-vpc-role must exist before any DMS networking resource is created.
    replicationSubnetGroup.node.addDependency(dmsVpcRole)

    // Create a security group for the replication instance
    const replicationInstanceSecurityGroup = new ec2.SecurityGroup(
      this,
      'replication-instance-sg',
      {
        securityGroupName: id + '-replication-instance',
        vpc
      }
    )
    // NOTE(review): allows all TCP from anyIpv4 — a deliberate sample
    // anti-pattern (see README warning); restrict before production use.
    replicationInstanceSecurityGroup.addIngressRule(
      ec2.Peer.anyIpv4(),
      ec2.Port.allTcp(),
      'Allow all connection'
    )

    // Launch a replication instance in the subnet group
    const replicationInstance = new dms.CfnReplicationInstance(
      this,
      'dms-replication-instance',
      {
        replicationInstanceIdentifier: id,
        replicationInstanceClass: 'dms.t3.small',
        replicationSubnetGroupIdentifier:
          replicationSubnetGroup.replicationSubnetGroupIdentifier,
        vpcSecurityGroupIds: [replicationInstanceSecurityGroup.securityGroupId],
        publiclyAccessible: false
      }
    )
    replicationInstance.node.addDependency(dmsVpcRole)
    replicationInstance.node.addDependency(replicationSubnetGroup)

    // Source Database Endpoint — credentials resolved at runtime from the
    // Secrets Manager secret via the access role above.
    const source = new dms.CfnEndpoint(this, 'dms-source', {
      endpointIdentifier: id + '-source',
      endpointType: 'source',
      databaseName: 'postgres',
      engineName: 'postgres',
      sslMode: 'require',

      postgreSqlSettings: {
        secretsManagerSecretId: 'text-embeddings-pipeline-source-database',
        secretsManagerAccessRoleArn: secretManagerAccessRole.roleArn
      }
    })
    source.node.addDependency(dmsVpcRole)

    // Target Endpoint — writes replicated rows into the S3 bucket.
    const target = new dms.CfnEndpoint(this, 'dms-target', {
      endpointIdentifier: id + '-target',
      endpointType: 'target',
      engineName: 's3',

      s3Settings: {
        bucketName: s3Bucket.bucketName,
        serviceAccessRoleArn: s3AccessRole.roleArn
      }
    })
    target.node.addDependency(dmsVpcRole)

    // Replication Task — one-shot full load of every table in every schema.
    // It is not started automatically; the README has the user start it.
    new dms.CfnReplicationTask(this, 'dms-task', {
      replicationTaskIdentifier: id,
      replicationInstanceArn: replicationInstance.ref,

      migrationType: 'full-load',
      sourceEndpointArn: source.ref,
      targetEndpointArn: target.ref,
      tableMappings: JSON.stringify({
        rules: [
          {
            'rule-type': 'selection',
            'rule-id': '1',
            'rule-name': '1',
            'object-locator': {
              'schema-name': '%',
              'table-name': '%'
            },
            'rule-action': 'include'
          }
        ]
      })
    })
  }
}
--------------------------------------------------------------------------------
/diagrams/text-embeddings-pipeline.drawio:
--------------------------------------------------------------------------------
86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | --------------------------------------------------------------------------------