├── .gitignore ├── .npmignore ├── README.md ├── bin ├── dynamodb-mass-migration.ts └── run.ts ├── cdk.json ├── jest.config.js ├── lib ├── asl.ts ├── dynamodb-mass-migration-stack.ts └── migrationFunction.ts ├── package-lock.json ├── package.json ├── stepfunctions_graph.png └── tsconfig.json /.gitignore: -------------------------------------------------------------------------------- 1 | *.js 2 | !jest.config.js 3 | *.d.ts 4 | node_modules 5 | 6 | # CDK asset staging directory 7 | .cdk.staging 8 | cdk.out 9 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | *.ts 2 | !*.d.ts 3 | 4 | # CDK asset staging directory 5 | .cdk.staging 6 | cdk.out 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Massively parallel migrations in DynamoDB 2 | 3 | ![Step Function definition](./stepfunctions_graph.png) 4 | 5 | Thanks to recent accouncement of [Step Functions Distributed Map](https://aws.amazon.com/blogs/aws/step-functions-distributed-map-a-serverless-solution-for-large-scale-parallel-data-processing/), we can now run 10,000 of parallel executions in Step Functions. This is especially useful for transforming/migrating big datasets in DynamoDB. 6 | 7 | This repo contains a sample [AWS CDK](https://docs.aws.amazon.com/cdk/v2/guide/home.html) code for performing such a migration. 8 | 9 | Because AWS CDK does not support Step Functions Distributed Map yet, we need to use ASL definition directly. This is a bit tricky, but not too hard. 10 | 11 | ## How to use 12 | 13 | 1. Clone project, install dependencies 14 | 15 | ```bash 16 | git clone https://github.com/dynobase/dynamodb-mass-migration 17 | cd dynamodb-mass-migration 18 | npm i 19 | ``` 20 | 21 | 2. Go to `lib/migrationFunction.ts` and adjust your migration logic inside `transformFn`. By default, it just adds `updatedAt` attribute to each item. 22 | 23 | 3. Deploy the stack: 24 | 25 | ```bash 26 | AWS_PROFILE=your-profile \ 27 | AWS_REGION=us-east-1 \ 28 | npx cdk deploy --require-approval never 29 | ``` 30 | 31 | 4. After deploying, invoke the migrating state machine: 32 | 33 | > **Important IAM note:** Make sure that State Machine and Transform Function have proper IAM permissions to access and manipulate DynamoDB table. 34 | 35 | ```bash 36 | SFN_ARN= \ 37 | TABLE_NAME= \ 38 | TOTAL_SEGMENTS=100 \ # number of segments to split the table, should be less than 10,000 39 | npx ts-node bin/run.ts 40 | ``` 41 | 42 | This will start the state machine with following payload: 43 | 44 | ```json 45 | { 46 | "prewarm": false, 47 | "map": [ 48 | { 49 | "tableName": "", 50 | "totalSegments": "100", 51 | "segment": "0" // Or 1, 2, ..., 99 52 | } // ... 99 more items 53 | ] 54 | } 55 | ``` 56 | 57 | ### Table pre-warming 58 | 59 | DynamoDB tables with On-demand mode can handle up to 2,000 WCU and 6,000 RCU, or spikes up to 200% of its previous high-water mark. 60 | 61 | If table is in provisioned mode and your migration is expected to consume more, this project has a feature to [pre-warm the table before starting the migration](https://aws.amazon.com/blogs/database/running-spiky-workloads-and-optimizing-costs-by-more-than-90-using-amazon-dynamodb-on-demand-capacity-mode/). 62 | 63 | By providing `PREWARM=true`, it will set your table's RCU to 4,000 (customizable via `PREWARM_WCU` env var) and RCU to 12,000 (customizable via `PREWARM_RCU` env var). 64 | 65 | If you don't need to pre-warm the table, you can pass `PREWARM=false` to the `run.ts` script or simply skip this setting. 66 | 67 | **Todo:** add a feature to automatically set the table back to its original capacity after the migration. 68 | 69 | ## Useful links: 70 | 71 | - https://theburningmonk.com/2019/03/understanding-the-scaling-behaviour-of-dynamodb-ondemand-tables/ 72 | - https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.ReadWriteCapacityMode.html#HowItWorks.InitialThroughput 73 | - https://aws.amazon.com/blogs/database/running-spiky-workloads-and-optimizing-costs-by-more-than-90-using-amazon-dynamodb-on-demand-capacity-mode/ 74 | -------------------------------------------------------------------------------- /bin/dynamodb-mass-migration.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import 'source-map-support/register'; 3 | import * as cdk from 'aws-cdk-lib'; 4 | import { DynamodbMassMigrationStack } from '../lib/dynamodb-mass-migration-stack'; 5 | 6 | const app = new cdk.App(); 7 | new DynamodbMassMigrationStack(app, 'DynamodbMassMigrationStack', { 8 | /* If you don't specify 'env', this stack will be environment-agnostic. 9 | * Account/Region-dependent features and context lookups will not work, 10 | * but a single synthesized template can be deployed anywhere. */ 11 | 12 | /* Uncomment the next line to specialize this stack for the AWS Account 13 | * and Region that are implied by the current CLI configuration. */ 14 | // env: { account: process.env.CDK_DEFAULT_ACCOUNT, region: process.env.CDK_DEFAULT_REGION }, 15 | 16 | /* Uncomment the next line if you know exactly what Account and Region you 17 | * want to deploy the stack to. */ 18 | // env: { account: '123456789012', region: 'us-east-1' }, 19 | 20 | /* For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html */ 21 | }); -------------------------------------------------------------------------------- /bin/run.ts: -------------------------------------------------------------------------------- 1 | import { StepFunctions } from "aws-sdk"; 2 | 3 | interface MapItem { 4 | segment: string; 5 | totalSegments: number; 6 | tableName: string; 7 | } 8 | interface Payload { 9 | map: MapItem[]; 10 | prewarm: boolean; 11 | prewarmWCU?: number; 12 | prewarmRCU?: number; 13 | } 14 | 15 | const stateMachineArn = process.env.SFN_ARN; 16 | const totalSegments = process.env.TOTAL_SEGMENTS 17 | ? parseInt(process.env.TOTAL_SEGMENTS, 10) 18 | : 100; 19 | const tableName = process.env.TABLE_NAME ?? "some-table"; 20 | 21 | if (!stateMachineArn) { 22 | throw new Error("SFN_ARN not set"); 23 | } 24 | 25 | const region = stateMachineArn.split(":")[3]; 26 | 27 | const sfn = new StepFunctions({ region }); 28 | 29 | const map: MapItem[] = new Array(totalSegments).fill(1).map((_, i) => ({ 30 | segment: i.toString(), 31 | totalSegments, 32 | tableName, 33 | })); 34 | 35 | const payload: Payload = { 36 | map, 37 | prewarm: false, 38 | }; 39 | 40 | if (process.env.PREWARM === "true") { 41 | payload.prewarm = true; 42 | payload.prewarmWCU = parseInt(process.env.PREWARM_WCU ?? "4000", 10); 43 | payload.prewarmRCU = parseInt(process.env.PREWARM_RCU ?? "12000", 10); 44 | 45 | console.log( 46 | `Pre-warming table with ${payload.prewarmWCU} WCU and ${payload.prewarmRCU} RCU` 47 | ); 48 | } 49 | 50 | sfn 51 | .startExecution({ 52 | stateMachineArn, 53 | input: JSON.stringify(payload), 54 | }) 55 | .promise() 56 | .then((r) => { 57 | const detailsUrl = `https://${region}.console.aws.amazon.com/states/home?region=${region}#/v2/executions/details/${r.executionArn}`; 58 | 59 | console.log( 60 | `Migration started! 61 | See the process in your browser here: ${detailsUrl}` 62 | ); 63 | }) 64 | .catch(console.error); 65 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "npx ts-node --prefer-ts-exts bin/dynamodb-mass-migration.ts", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "**/*.d.ts", 11 | "**/*.js", 12 | "tsconfig.json", 13 | "package*.json", 14 | "yarn.lock", 15 | "node_modules", 16 | "test" 17 | ] 18 | }, 19 | "context": { 20 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 21 | "@aws-cdk/core:stackRelativeExports": true, 22 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 23 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 24 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 25 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 26 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 27 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 28 | "@aws-cdk/core:checkSecretUsage": true, 29 | "@aws-cdk/aws-iam:minimizePolicies": true, 30 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 31 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 32 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 33 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 34 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 35 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 36 | "@aws-cdk/core:enablePartitionLiterals": true, 37 | "@aws-cdk/core:target-partitions": [ 38 | "aws", 39 | "aws-cn" 40 | ] 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | testEnvironment: 'node', 3 | roots: ['/test'], 4 | testMatch: ['**/*.test.ts'], 5 | transform: { 6 | '^.+\\.tsx?$': 'ts-jest' 7 | } 8 | }; 9 | -------------------------------------------------------------------------------- /lib/asl.ts: -------------------------------------------------------------------------------- 1 | export const asl = (lambdaArn: string, resultsBucket: string) => ({ 2 | Comment: "Parallel migration state machine", 3 | StartAt: "DescribeTable", 4 | States: { 5 | DescribeTable: { 6 | Type: "Task", 7 | Next: "Should scale table's throughput?", 8 | Parameters: { 9 | "TableName.$": "$.map[0].tableName", 10 | }, 11 | Resource: "arn:aws:states:::aws-sdk:dynamodb:describeTable", 12 | ResultPath: "$.tableDescription", 13 | }, 14 | "Should scale table's throughput?": { 15 | Type: "Choice", 16 | Choices: [ 17 | { 18 | Or: [ 19 | { 20 | Not: { 21 | Variable: "$.prewarm", 22 | BooleanEquals: true, 23 | }, 24 | }, 25 | { 26 | Variable: 27 | "$.tableDescription.Table.BillingModeSummary.BillingMode", 28 | StringEquals: "PAY_PER_REQUEST", 29 | }, 30 | ], 31 | Next: "Parallel Migration", 32 | }, 33 | ], 34 | Default: "Update table throughput", 35 | Comment: 36 | "On-demand tables can process 2,000 write request units or 6,000 read request units immediately. If more is needed, you can pre-warm your table", 37 | }, 38 | "Update table throughput": { 39 | Type: "Task", 40 | Next: "Wait 10 seconds", 41 | Parameters: { 42 | "TableName.$": "$.map[0].tableName", 43 | BillingMode: "PROVISIONED", 44 | ProvisionedThroughput: { 45 | "ReadCapacityUnits.$": "$.prewarmRCU", 46 | "WriteCapacityUnits.$": "$.prewarmWCU", 47 | }, 48 | }, 49 | Resource: "arn:aws:states:::aws-sdk:dynamodb:updateTable", 50 | ResultPath: null, 51 | Comment: 52 | "https://aws.amazon.com/blogs/database/running-spiky-workloads-and-optimizing-costs-by-more-than-90-using-amazon-dynamodb-on-demand-capacity-mode/", 53 | }, 54 | "Wait 10 seconds": { 55 | Type: "Wait", 56 | Seconds: 10, 57 | Next: "Check Table's status", 58 | Comment: "Poll for table readiness", 59 | }, 60 | "Check Table's status": { 61 | Type: "Task", 62 | Next: "Is table's capacity already provisioned?", 63 | Parameters: { 64 | "TableName.$": "$.map[0].tableName", 65 | }, 66 | Resource: "arn:aws:states:::aws-sdk:dynamodb:describeTable", 67 | ResultPath: "$.tableDescription", 68 | }, 69 | "Is table's capacity already provisioned?": { 70 | Type: "Choice", 71 | Choices: [ 72 | { 73 | Variable: "$.tableDescription.Table.TableStatus", 74 | StringEquals: "UPDATING", 75 | Next: "Wait 10 seconds", 76 | Comment: "Is table ready?", 77 | }, 78 | ], 79 | Default: "Parallel Migration", 80 | }, 81 | "Parallel Migration": { 82 | Type: "Map", 83 | ItemProcessor: { 84 | ProcessorConfig: { 85 | Mode: "DISTRIBUTED", 86 | ExecutionType: "EXPRESS", 87 | }, 88 | StartAt: "Transform Function", 89 | States: { 90 | "Transform Function": { 91 | Type: "Task", 92 | Resource: "arn:aws:states:::lambda:invoke", 93 | OutputPath: "$.Payload", 94 | Parameters: { 95 | "Payload.$": "$", 96 | FunctionName: lambdaArn, 97 | }, 98 | Retry: [ 99 | { 100 | ErrorEquals: [ 101 | "Lambda.ServiceException", 102 | "Lambda.AWSLambdaException", 103 | "Lambda.SdkClientException", 104 | "Lambda.TooManyRequestsException", 105 | ], 106 | IntervalSeconds: 2, 107 | MaxAttempts: 3, 108 | BackoffRate: 2, 109 | }, 110 | ], 111 | End: true, 112 | }, 113 | }, 114 | }, 115 | End: true, 116 | Label: "ParallelMigration", 117 | ResultWriter: { 118 | Resource: "arn:aws:states:::s3:putObject", 119 | Parameters: { 120 | Bucket: resultsBucket, 121 | Prefix: "results", 122 | }, 123 | }, 124 | Retry: [ 125 | { 126 | ErrorEquals: ["States.ALL"], 127 | BackoffRate: 1, 128 | IntervalSeconds: 1, 129 | MaxAttempts: 3, 130 | }, 131 | ], 132 | InputPath: "$.map", 133 | }, 134 | }, 135 | }); 136 | -------------------------------------------------------------------------------- /lib/dynamodb-mass-migration-stack.ts: -------------------------------------------------------------------------------- 1 | import * as cdk from "aws-cdk-lib"; 2 | import { CfnOutput } from "aws-cdk-lib"; 3 | import { AttributeType, BillingMode, Table } from "aws-cdk-lib/aws-dynamodb"; 4 | import { PolicyStatement } from "aws-cdk-lib/aws-iam"; 5 | import { NodejsFunction } from "aws-cdk-lib/aws-lambda-nodejs"; 6 | import { Bucket } from "aws-cdk-lib/aws-s3"; 7 | import { 8 | CfnStateMachine, 9 | Pass, 10 | StateMachine, 11 | } from "aws-cdk-lib/aws-stepfunctions"; 12 | import { Construct } from "constructs"; 13 | import { asl } from "./asl"; 14 | export class DynamodbMassMigrationStack extends cdk.Stack { 15 | constructor(scope: Construct, id: string, props?: cdk.StackProps) { 16 | super(scope, id, props); 17 | 18 | const table = new Table(this, "SampleItemsTable", { 19 | partitionKey: { 20 | name: "id", 21 | type: AttributeType.STRING, 22 | }, 23 | billingMode: BillingMode.PAY_PER_REQUEST, 24 | }); 25 | 26 | const resultsBucket = new Bucket(this, "ResultsBucket", { 27 | removalPolicy: cdk.RemovalPolicy.DESTROY, 28 | }); 29 | 30 | const migrationFunction = new NodejsFunction(this, "MigrationFunction", { 31 | entry: "./lib/migrationFunction.ts", 32 | }); 33 | table.grantFullAccess(migrationFunction); 34 | 35 | const stateMachine = new StateMachine(this, "MigrationStateMachine", { 36 | definition: new Pass(this, "StartState"), 37 | }); 38 | const cfnStatemachine = stateMachine.node.defaultChild as CfnStateMachine; 39 | 40 | resultsBucket.grantReadWrite(stateMachine); 41 | migrationFunction.grantInvoke(stateMachine); 42 | table.grantFullAccess(stateMachine); 43 | stateMachine.addToRolePolicy( 44 | new PolicyStatement({ 45 | actions: ["states:StartExecution"], 46 | resources: ["*"], 47 | }) 48 | ); 49 | 50 | cfnStatemachine.definitionString = JSON.stringify( 51 | asl(migrationFunction.functionArn, resultsBucket.bucketName) 52 | ); 53 | 54 | new CfnOutput(this, "TransformFunctionArn", { 55 | value: migrationFunction.functionArn, 56 | }); 57 | new CfnOutput(this, "MigrationMachineArn", { 58 | value: stateMachine.stateMachineArn, 59 | }); 60 | new CfnOutput(this, "SampleItemsTableName", { 61 | value: table.tableName, 62 | }); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /lib/migrationFunction.ts: -------------------------------------------------------------------------------- 1 | import { DynamoDB } from "aws-sdk"; 2 | 3 | const ddb = new DynamoDB.DocumentClient(); 4 | 5 | interface Event { 6 | segment: string; 7 | totalSegments: string; 8 | tableName: string; 9 | } 10 | 11 | export const handler = async ({ 12 | segment, 13 | totalSegments, 14 | tableName, 15 | }: Event): Promise => { 16 | if (!tableName) { 17 | throw new Error("tableName not set"); 18 | } 19 | if (!segment) { 20 | throw new Error("segment not set"); 21 | } 22 | if (!totalSegments) { 23 | throw new Error("totalSegments not set"); 24 | } 25 | const parsedTotalSegments = parseInt(totalSegments, 10); 26 | 27 | if (parsedTotalSegments > 10000) { 28 | throw new Error("totalSegments must be less than 10,000"); 29 | } 30 | 31 | let firstRun = true; 32 | let nextPageToken; 33 | let totalItemsProcessed = 0; 34 | 35 | do { 36 | firstRun = false; 37 | const scanResult = await ddb 38 | .scan({ 39 | TableName: tableName, 40 | TotalSegments: parsedTotalSegments, 41 | Segment: parseInt(segment, 10), 42 | }) 43 | .promise(); 44 | nextPageToken = scanResult.LastEvaluatedKey; 45 | totalItemsProcessed += scanResult.Count ?? 0; 46 | 47 | await Promise.all( 48 | (scanResult.Items ?? []).map((i) => transformFn(i, tableName)) 49 | ); 50 | } while (firstRun || nextPageToken); 51 | 52 | const result = { 53 | segment, 54 | totalProcessed: totalItemsProcessed, 55 | tableName, 56 | totalSegments, 57 | }; 58 | 59 | console.log(result); 60 | return result; 61 | }; 62 | 63 | async function transformFn(item: any, tableName: string): Promise { 64 | await ddb 65 | .put({ 66 | TableName: tableName, 67 | Item: { 68 | ...item, 69 | updatedAt: new Date().toISOString(), 70 | }, 71 | }) 72 | .promise(); 73 | 74 | return item; 75 | } 76 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dynamodb-mass-migration", 3 | "version": "0.1.0", 4 | "bin": { 5 | "dynamodb-mass-migration": "bin/dynamodb-mass-migration.js" 6 | }, 7 | "scripts": { 8 | "build": "tsc", 9 | "watch": "tsc -w", 10 | "test": "jest", 11 | "cdk": "cdk" 12 | }, 13 | "devDependencies": { 14 | "@types/jest": "^27.5.2", 15 | "@types/node": "10.17.27", 16 | "@types/prettier": "2.6.0", 17 | "aws-cdk": "2.50.0", 18 | "aws-sdk": "^2.1267.0", 19 | "esbuild": "^0.15.16", 20 | "jest": "^27.5.1", 21 | "ts-jest": "^27.1.4", 22 | "ts-node": "^10.9.1", 23 | "typescript": "~3.9.7" 24 | }, 25 | "dependencies": { 26 | "aws-cdk-lib": "2.50.0", 27 | "constructs": "^10.0.0", 28 | "source-map-support": "^0.5.21" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /stepfunctions_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Dynobase/dynamodb-mass-migrations/916f5d8a89cd4baca7e6e28d3d07448c7b83d5da/stepfunctions_graph.png -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2018", 4 | "module": "commonjs", 5 | "lib": [ 6 | "es2018" 7 | ], 8 | "declaration": true, 9 | "strict": true, 10 | "noImplicitAny": true, 11 | "strictNullChecks": true, 12 | "noImplicitThis": true, 13 | "alwaysStrict": true, 14 | "noUnusedLocals": false, 15 | "noUnusedParameters": false, 16 | "noImplicitReturns": true, 17 | "noFallthroughCasesInSwitch": false, 18 | "inlineSourceMap": true, 19 | "inlineSources": true, 20 | "experimentalDecorators": true, 21 | "strictPropertyInitialization": false, 22 | "typeRoots": [ 23 | "./node_modules/@types" 24 | ] 25 | }, 26 | "exclude": [ 27 | "node_modules", 28 | "cdk.out" 29 | ] 30 | } 31 | --------------------------------------------------------------------------------