├── .gitignore ├── example ├── load_items.py └── serverless.yml ├── serverless.yml ├── handler.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Distribution / packaging 2 | .Python 3 | env/ 4 | build/ 5 | develop-eggs/ 6 | dist/ 7 | downloads/ 8 | eggs/ 9 | .eggs/ 10 | lib/ 11 | lib64/ 12 | parts/ 13 | sdist/ 14 | var/ 15 | *.egg-info/ 16 | .installed.cfg 17 | *.egg 18 | 19 | # Serverless directories 20 | .serverless -------------------------------------------------------------------------------- /example/load_items.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import click 3 | 4 | client = boto3.client('dynamodb', region_name='us-east-1') 5 | 6 | @click.command() 7 | @click.option('--count', help="Number of items to insert", type=int, required=True) 8 | def insert_items(count): 9 | for i in range(count): 10 | client.put_item( 11 | TableName='scanner-test-table', 12 | Item={ 13 | "Id": { "S": str(i) } 14 | } 15 | ) 16 | 17 | print("Inserted {count} items to table".format(count=count)) 18 | 19 | 20 | if __name__ == "__main__": 21 | insert_items() 22 | -------------------------------------------------------------------------------- /example/serverless.yml: -------------------------------------------------------------------------------- 1 | service: scanner-test 2 | 3 | provider: 4 | name: aws 5 | runtime: python3.7 6 | stage: dev 7 | region: us-east-1 8 | 9 | resources: 10 | Resources: 11 | ScannedTable: 12 | Type: AWS::DynamoDB::Table 13 | Properties: 14 | TableName: "scanner-test-table" 15 | AttributeDefinitions: 16 | - AttributeName: "Id" 17 | AttributeType: "S" 18 | BillingMode: PAY_PER_REQUEST 19 | KeySchema: 20 | - AttributeName: "Id" 21 | KeyType: "HASH" 22 | IngestKinesisStream: 23 | Type: AWS::Kinesis::Stream 24 | Properties: 25 | Name: "scanner-test-stream" 26 | ShardCount: 1 27 | Outputs: 28 | ScannedTableArn: 29 | Value: { Fn::GetAtt: [ScannedTable, Arn] } 30 | IngestStreamArn: 31 | Value: { Fn::GetAtt: [IngestKinesisStream, Arn] } 32 | -------------------------------------------------------------------------------- /serverless.yml: -------------------------------------------------------------------------------- 1 | service: serverless-dynamodb-scanner 2 | 3 | custom: 4 | dynamodbTableArn: '' 5 | kinesisStreamArn: '' 6 | 7 | provider: 8 | name: aws 9 | runtime: python3.7 10 | stage: dev 11 | region: us-east-1 12 | iamRoleStatements: 13 | - Effect: "Allow" 14 | Action: 15 | - "dynamodb:Scan" 16 | Resource: ${self:custom.dynamodbTableArn} 17 | - Effect: "Allow" 18 | Action: 19 | - "kinesis:PutRecords" 20 | Resource: ${self:custom.kinesisStreamArn} 21 | - Effect: "Allow" 22 | Action: 23 | - "ssm:GetParameter" 24 | - "ssm:PutParameter" 25 | Resource: 'arn:aws:ssm:*:*:parameter/ddbScanner/lastEvaluatedKey' 26 | - Effect: "Allow" 27 | Action: 28 | - "lambda:InvokeFunction" 29 | Resource: "arn:aws:lambda:*:*:function:ScannerLambdaFunction" 30 | environment: 31 | TABLE_ARN: ${self:custom.dynamodbTableArn} 32 | STREAM_ARN: ${self:custom.kinesisStreamArn} 33 | 34 | packaging: 35 | exclude: 36 | - example/** 37 | 38 | functions: 39 | scanner: 40 | handler: handler.scanner 41 | timeout: 900 42 | -------------------------------------------------------------------------------- /handler.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import secrets 4 | 5 | import boto3 6 | 7 | kinesis = boto3.client('kinesis') 8 | dynamodb = boto3.client('dynamodb') 9 | ssm = boto3.client('ssm') 10 | awslambda = boto3.client('lambda') 11 | 12 | LAST_EVALUATED_PARAMETER = '/ddbScanner/lastEvaluatedKey' 13 | 14 | def scanner(event, context): 15 | config = get_config() 16 | last_evaluated_key = config['last_evaluated_key'] 17 | 18 | while True: 19 | params = { 20 | 'TableName': config['table_name'], 21 | 'Limit': 500 22 | } 23 | if last_evaluated_key: 24 | params['ExclusiveStartKey'] = last_evaluated_key 25 | 26 | resp = dynamodb.scan(**params) 27 | records = [] 28 | for item in resp['Items']: 29 | records.append({ 30 | 'Data': json.dumps(item), 31 | 'PartitionKey': secrets.token_hex(12) # Partition key shouldn't matter much here. 32 | }) 33 | 34 | put_records(config['stream_name'], records) 35 | 36 | if 'LastEvaluatedKey' not in resp: 37 | # Scan is complete. Time to finish. 38 | break 39 | ssm.put_parameter( 40 | Name=LAST_EVALUATED_PARAMETER, 41 | Value=json.dumps(resp['LastEvaluatedKey']), 42 | Type='String', 43 | Overwrite=True 44 | ) 45 | last_evaluated_key = resp['LastEvaluatedKey'] 46 | 47 | if context.get_remaining_time_in_millis() < 10000: 48 | awslambda.invoke( 49 | FunctionName=context.function_name, 50 | InvocationType='Event', 51 | ) 52 | return 53 | 54 | print('Scan complete!') 55 | return 56 | 57 | 58 | def get_config(): 59 | table_arn = os.environ['TABLE_ARN'] 60 | table_name = table_arn.split('/')[1] 61 | stream_arn = os.environ['STREAM_ARN'] 62 | stream_name = stream_arn.split('/')[1] 63 | 64 | try: 65 | resp = ssm.get_parameter( 66 | Name=LAST_EVALUATED_PARAMETER 67 | ) 68 | last_evaluated_key = json.loads(resp['Parameter']['Value']) 69 | except ssm.exceptions.ParameterNotFound: 70 | last_evaluated_key = '' 71 | 72 | return { 73 | 'table_name': table_name, 74 | 'stream_name': stream_name, 75 | 'last_evaluated_key': last_evaluated_key 76 | } 77 | 78 | 79 | def put_records(stream_name, records): 80 | resp = kinesis.put_records( 81 | Records=records, 82 | StreamName=stream_name 83 | ) 84 | 85 | if resp['FailedRecordCount'] == 0: 86 | return 87 | 88 | failed_records = [] 89 | for i, record in enumerate(resp['Records']): 90 | if 'ErrorCode' in record: 91 | failed_records.append(records[i]) 92 | 93 | return put_records(stream_name, failed_records) 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Serverless DynamoDB Scanner 2 | 3 | This is a Serverless application that scans a given DynamoDB table and inserts every item into a Kinesis Stream. You can then process the Kinesis stream, allowing you to perform an operation on all existing items in a DynamoDB table. 4 | 5 | It was inspired by a tweet from [Eric Hammond](https://twitter.com/esh): 6 | 7 |
8 | 9 | ## Usage: 10 | 11 | This project uses the [Serverless Framework](https://github.com/serverless/serverless) to deploy a Lambda function and associated AWS resources. 12 | 13 | To use it, follow these steps: 14 | 15 | 1. Install the Framework and create your service: 16 | 17 | ```bash 18 | # Make sure you have the Serverless Framework installed 19 | $ npm install -g serverless 20 | 21 | $ sls create --template-url https://github.com/alexdebrie/serverless-dynamodb-scanner --path serverless-dynamodb-scanner 22 | 23 | $ cd serverless-dynamodb-scanner 24 | ``` 25 | 26 | 2. Update the configuration in `serverless.yml`. 27 | 28 | Add the ARN of the DynamoDB table you want to scan and the ARN of the Kinesis stream where you want the config added: 29 | 30 | ```yml 31 | # serverless.yml 32 | 33 | custom: 34 | dynamodbTableArn: 'arn:aws:dynamodb:us-east-1:123456789012:table/my_table' 35 | kinesisStreamArn: 'arn:aws:kinesis:us-east-1:123456789012:stream/my-stream' 36 | 37 | ... 38 | ``` 39 | 40 | 3. Deploy your service: 41 | 42 | ```bash 43 | $ sls deploy 44 | ``` 45 | 46 | 4. When you're ready, kick off your scan by invoking the function: 47 | 48 | ```bash 49 | $ sls invoke -f scanner 50 | ``` 51 | 52 | 53 | ## How does it work? 54 | 55 | The basic workflow is as follows: 56 | 57 |  58 | 59 | > A diagram that's missing a few steps ¯\\\_(ツ)_/¯ 60 | 61 | 1. Inside the Lambda function, check [AWS SSM](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-paramstore.html) for a `LastEvaluatedKey` parameter that would be sent with our Scan call. 62 | 63 | If no parameter exist in SSM, we're just starting the scan. 64 | 65 | 2. Make a `Scan` call to our DynamoDB table. 66 | 67 | 3. Insert the items returned from our Scan into our Kinesis Stream via a `PutRecords` call. 68 | 69 | 4. If the `Scan` call did not return a `LastEvaluatedKey`, our Scan is done! We can exit the function. 70 | 71 | 5. If the `Scan` did return a `LastEvaluatedKey`, store the value in SSM. 72 | 73 | 6. Do a time check -- if our function has less than 15 seconds of execution time left, we'll invoke another instance of our function and exit the loop for this one. Our next function will pick up where our scan left off by using the `LastEvaluatedKey` in SSM. 74 | --------------------------------------------------------------------------------We really want to run some code against every item in a DynamoDB table.
— Eric Hammond (@esh) February 5, 2019
Surely there's a sample project somewhere that scans a DynamoDB table, feeds records into a Kinesis Data Stream, which triggers an AWS Lambda function?
We can scale DynamoDB and Kinesis manually. https://t.co/ZyAiLfLpWh