├── .gitignore
├── Makefile
├── Readme.md
├── index.js
├── package.json
└── terraform
    ├── bucket.tf
    ├── dynamo.tf
    ├── iam.tf
    ├── lambda.tf
    └── variables.tf
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.envrc
node_modules
build.zip
*.tfstate
*.tfstate.backup
*.tfvars
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------

provision: build
	terraform apply -target="aws_lambda_function.segment-s3-dynamo" \
		-target="aws_dynamodb_table.segment-s3-dynamo" \
		-target="aws_iam_role.segment-s3-dynamo-lambda" ./terraform
	terraform apply ./terraform

build: clean
	npm install
	zip -r build.zip index.js node_modules

clean:
	rm -f build.zip

update: build
	terraform destroy -target=aws_lambda_function.segment-s3-dynamo ./terraform
	terraform apply ./terraform

.PHONY: provision build clean update
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------

# S3-Dynamo-Lambda connector

With Segment's S3 integration, you can host an entire analytics pipeline without having to worry about individual instances or hosting your own infrastructure. Here's how to set up an integration that automatically tallies events in Dynamo every hour, without requiring any hosting on your side.

## Getting Started

First you'll want to download and install [terraform][]. We'll use it to automatically provision and set up our infrastructure using the files in this repo's [./terraform][] directory.

If you haven't already, you'll want to create an AWS account. For the rest of this setup process, you won't have to create **any** AWS infrastructure manually. Our terraform scripts will take care of all of it for you.

[terraform]: https://terraform.io/downloads.html
[./terraform]: https://github.com/segmentio/s3-dynamo-lambda/tree/master/terraform
[direnv]: http://direnv.net/

## Setting up your project

Before running the Terraform script, you'll want to download your access keys from the AWS Security Credentials dashboard. You'll typically want to add them to your `.bashrc`, or use a tool like [`direnv`][direnv] to add them to your environment variables:

    export AWS_ACCESS_KEY_ID="xxxxxxxxx"
    export AWS_SECRET_ACCESS_KEY="xxxxxxxx"
    export AWS_REGION="us-east-1"

Next, clone this repo:

    git clone git@github.com:segmentio/s3-dynamo-lambda.git

Terraform also needs to know a few specific variables, which you'll want to save in a `terraform.tfvars` file in the top-level directory of the repo you just cloned. You'll need to supply the name of the bucket you'd like to add, your AWS Account ID (a 12-digit number found in your AWS Security Credentials dashboard), and the region where you want to add your infrastructure (Segment's S3 worker only runs in us-east-1 for now, so stick with us-east-1). It should look something like this:

    bucket_name = "your-desired-bucket-name"

From there, just run `make`. This will spin up your S3 bucket, Lambda function, and Dynamo instance, all with the appropriate permissions.

    $ make

You'll also **need to enable an event notification for your bucket** (which hasn't been added to terraform yet). You can enable it in the AWS S3 Console, [following the instructions in the AWS docs](http://docs.aws.amazon.com/AmazonS3/latest/UG/SettingBucketNotifications.html#SettingBucketNotifications-enable-events). You'll want to trigger an event for *all `ObjectCreated` events*, and route it to the Lambda function that terraform has created.
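
If you'd rather script this step than click through the console, an AWS CLI sketch along these lines should also work. `{YOUR_BUCKET}` and `{YOUR_ACCOUNT_ID}` are placeholders to fill in, and the statement id is arbitrary:

    # allow S3 to invoke the Lambda function that terraform created
    aws lambda add-permission \
      --function-name segment-s3-dynamo \
      --statement-id s3-invoke \
      --action "lambda:InvokeFunction" \
      --principal s3.amazonaws.com \
      --source-arn "arn:aws:s3:::{YOUR_BUCKET}"

    # route all ObjectCreated events on the bucket to that function
    aws s3api put-bucket-notification-configuration \
      --bucket {YOUR_BUCKET} \
      --notification-configuration '{
        "LambdaFunctionConfigurations": [{
          "LambdaFunctionArn": "arn:aws:lambda:us-east-1:{YOUR_ACCOUNT_ID}:function:segment-s3-dynamo",
          "Events": ["s3:ObjectCreated:*"]
        }]
      }'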

Finally, you'll want to add your bucket to the S3 integration for your Segment project:

![](https://cloudup.com/cSdeplmW4Vs+)

And that's it. You're done! A totally hosted analytics pipeline, updated every hour, on the hour. Your DynamoDB table of event counts can be found here: [https://console.aws.amazon.com/dynamodb/home?region=us-east-1#explore:name=Events](https://console.aws.amazon.com/dynamodb/home?region=us-east-1#explore:name=Events)

When the next hour strikes, query away, my friend!

## Background: The Lambda function

We've stored our example lambda function in the [index.js](https://github.com/segmentio/s3-dynamo-lambda/blob/master/index.js) file. It reads our S3 event logs, splits the newline-separated JSON, and adds the counts of the different events to Dynamo.

If you want to update the lambda function, simply change the code and then run `make update`. The meat of the event handling happens in the `handleEvent` function.

## Testing

If you're testing the lambda function, it's easiest to use the CLI:

    aws lambda invoke \
      --invocation-type RequestResponse \
      --function-name segment-s3-dynamo \
      --region us-east-1 \
      --log-type Tail \
      --payload file://file.json \
      output.txt

Where your payload file looks something like:

```json
{
  "Records": [
    {
      "s3": {
        "object": {
          "key": "{FILE_PATH}"
        },
        "bucket": {
          "arn": "arn:aws:s3:::{YOUR_BUCKET}",
          "name": "{YOUR_BUCKET}"
        }
      }
    }
  ]
}
```
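
The handler expects the object at `{FILE_PATH}` to be a gzipped file of newline-delimited Segment events, so put a small fixture in your bucket before invoking. A minimal sketch, assuming a key of `logs/sample-events.gz` (the key and the event contents here are just examples):

    # build a tiny gzipped log of newline-delimited track events
    printf '%s\n' \
      '{"type":"track","event":"Signed Up","timestamp":"2015-09-01T17:04:00.000Z"}' \
      '{"type":"track","event":"Signed Up","timestamp":"2015-09-01T17:42:00.000Z"}' \
      | gzip > sample-events.gz

    # upload it, then use this key as {FILE_PATH} in the payload above
    aws s3 cp sample-events.gz s3://{YOUR_BUCKET}/logs/sample-events.gz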

--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------

/**
 * Module dependencies
 */

var datemath = require('date-math');
var through2 = require('through2');
var pipe = require('multipipe');
var WaitGroup = require('waitgroup');
var AWS = require('aws-sdk');
var split = require('split');
var zlib = require('zlib');


var dynamo = new AWS.DynamoDB({ region: 'us-east-1' });
var s3 = new AWS.S3();

/**
 * The main event handler.
 *
 * Streams the file from S3 and invokes `handleEvent` on each event in the file.
 *
 * @param {Object} s3Event
 * @param {Object} context
 */

exports.handler = function(s3Event, context) {
  var bucket = s3Event.Records[0].s3.bucket.name;
  var key = s3Event.Records[0].s3.object.key;
  var wg = new WaitGroup;

  console.log('Received S3 event, downloading file...');

  /**
   * Event Extraction Pipeline
   *
   * - incr wg
   * - open stream
   * - unzip stream
   * - decode Buffer chunks to String
   * - buffer strings to newlines
   * - emit parsed events
   * - on each event, incr wg -> flush to dynamo -> decr wg
   * - decr wg on stream close
   */

  wg.add();
  pipe(
    s3.getObject({ Bucket: bucket, Key: key }).createReadStream(),
    zlib.createGunzip(),
    stringify(),
    split(parse)
  ).on('data', handleEvent)
   .on('error', handleError)
   .on('end', function(){
     wg.done();
   });

  wg.wait(function() {
    console.log('Finished Flush!');
    context.done();
  });

  /**
   * The segment event handler
   *
   * Takes a segment event and inserts/increments a record in Dynamo in the
   * following format:
   *
   *   [<event>.<hour>] = count
   *
   * @param {Object} event
   */

  function handleEvent(event) {
    if (event.type !== 'track') return;

    var floored = datemath.hour.floor(new Date(event.timestamp));
    var Hour = floored.getTime().toString();
    var Name = event.event;

    console.log('Event: ', Name);

    wg.add();
    dynamo.updateItem({
      Key: {
        Name: { S: Name },
        Timestamp: { N: Hour }
      },
      TableName: 'Events',
      AttributeUpdates: {
        Count: {
          Value: { N: '1' },
          Action: 'ADD'
        }
      }
    }, function(err) {
      if (err) console.log('Error Flushing', Name, ':', err);
      else console.log('Flushed:', Name);
      wg.done();
    });
  }
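
  // To make the write above concrete: a 'Signed Up' track event with
  // timestamp 2015-09-01T17:23:00.000Z is floored to the hour and ends up
  // as an item shaped roughly like (values here are illustrative):
  //
  //   { Name: { S: 'Signed Up' }, Timestamp: { N: '1441126800000' }, Count: { N: '3' } }
  //
  // where Count is bumped by one for every matching event that gets flushed.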

  // decode Buffer chunks to strings
  function stringify() {
    return through2(function(data, _, cb) {
      cb(null, data.toString('utf8'));
    });
  }

  // take lines emitted from `split` and parse them
  function parse(str) {
    return str === '' ? null : JSON.parse(str.trim());
  }

  // handle stream errors (just bail, for now :p)
  function handleError(err) {
    console.log('Error:', err);
    console.log('Exiting...');
    context.done(err);
  }
}
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "s3-dynamo-lambda",
  "version": "1.0.0",
  "description": "Example application demonstrating the use of Terraform and AWS Lambda with the Segment AWS S3 integration.",
  "main": "segment.js",
  "dependencies": {
    "aws-sdk": "^2.1.42",
    "waitgroup": "^1.1.0",
    "date-math": "0.0.1",
    "multipipe": "^0.1.2",
    "split": "^1.0.0",
    "through2": "^2.0.0"
  },
  "devDependencies": {},
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "git+ssh://git@github.com/segmentio/s3-dynamo-lambda.git"
  },
  "keywords": [
    "Lambda",
    "Segment",
    "S3",
    "AWS",
    "DynamoDB"
  ],
  "author": "Calvin French-Owen, Chris Sperandio",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/segmentio/s3-dynamo-lambda/issues"
  },
  "homepage": "https://github.com/segmentio/s3-dynamo-lambda#readme"
}
--------------------------------------------------------------------------------
/terraform/bucket.tf:
--------------------------------------------------------------------------------

/**
 * The S3 bucket that Segment uploads our event logs to.
 */

resource "aws_s3_bucket" "segment-s3-dynamo-bucket" {
  bucket = "${var.bucket_name}"
  depends_on = ["aws_iam_role.segment-s3-dynamo-lambda", "aws_dynamodb_table.segment-s3-dynamo"]
  policy = <<EOF
--------------------------------------------------------------------------------
/terraform/dynamo.tf:
--------------------------------------------------------------------------------

/**
 * The Dynamo table which stores the hourly event counts.
 */

resource "aws_dynamodb_table" "segment-s3-dynamo" {
  name = "Events"
  read_capacity = 20
  write_capacity = 20
  hash_key = "Name"
  range_key = "Timestamp"
  attribute {
    name = "Name"
    type = "S"
  }
  attribute {
    name = "Timestamp"
    type = "N"
  }
}
--------------------------------------------------------------------------------
/terraform/iam.tf:
--------------------------------------------------------------------------------

resource "aws_iam_role" "segment-s3-dynamo-lambda" {
  name = "segment-s3-dynamo-lambda"
  assume_role_policy = <<EOF