├── .gitignore ├── .project ├── LICENSE.txt ├── README.md ├── requirements.txt ├── samples ├── DynamoDBExport │ ├── DynamoDBTableExport.json │ ├── DynamoDBtoCSV.json │ └── readme.md ├── DynamoDBExportJava │ ├── .gitignore │ ├── pom.xml │ ├── readme.md │ └── src │ │ └── main │ │ ├── java │ │ └── com │ │ │ └── amazonaws │ │ │ └── datapipelinesamples │ │ │ └── ddbexport │ │ │ ├── CommandLineArgParser.java │ │ │ ├── DDBExportPipelineCreator.java │ │ │ ├── DDBExportPipelineObjectCreator.java │ │ │ ├── Main.java │ │ │ └── PipelineMonitor.java │ │ └── resources │ │ └── log4j2.xml ├── DynamoDBImport │ ├── XMLtoDynamoDBImport.json │ └── readme.md ├── DynamoDBImportCSV │ ├── CSVtoDynamoDB.json │ └── README.md ├── DynamoDBToRedshiftConvertDataUsingHive │ ├── DynamoDBtoRedshiftHiveCSV.json │ └── README.md ├── EFSBackup │ ├── 1-Node-EFSBackupPipeline.json │ ├── 1-Node-EFSRestorePipeline.json │ ├── 2-Node-EFSBackupPipeline.json │ ├── 2-Node-EFSRestorePipeline.json │ ├── 3-Node-EFSBackupPipeline.json │ ├── README.md │ ├── efs-backup-end.sh │ ├── efs-backup-init.sh │ ├── efs-backup-rsync.sh │ ├── efs-backup.sh │ ├── efs-restore-rsync.sh │ └── efs-restore.sh ├── ExampleTemplate │ └── README.md ├── HadoopTerasort │ ├── README.md │ ├── TeraSortHadoopBenchmark.json │ ├── process-jhist.sh │ └── setup.py ├── InvokeLambda │ ├── README.md │ └── invokelambda.json ├── LoadTsvFilesInS3ToRedshift │ ├── LoadTsvFilesInS3ToRedshift.json │ └── README.md ├── OnDemandWithLamdaFunctions │ ├── lambda_function.py │ ├── ondemand.json │ └── readme.md ├── RDStoRedshiftSqoop │ ├── RDStoRedshiftSqoop.json │ ├── README.md │ └── setup │ │ ├── RdsToRedshiftSqoopSample.py │ │ ├── Setup.py │ │ ├── SetupPipelineDefinition.py │ │ ├── Teardown.py │ │ ├── Utilities.py │ │ └── setup.json ├── RDStoS3 │ ├── RDStoS3Pipeline.json │ ├── README.md │ └── setup │ │ ├── RDStoS3Sample.py │ │ ├── Setup.py │ │ ├── SetupPipelineDefinition.py │ │ ├── Teardown.py │ │ └── Utilities.py ├── RedshiftCopyActivityFromDynamoDBTable │ ├── RedshiftCopyActivityFromDynamoDBTable.json │ └── readme.md ├── RedshiftToRDS │ ├── RedshiftToRDS_WithoutRDSCreate.json │ ├── RedshiftToRDS_withTableCreate.json │ └── readme.md ├── S3ToRdsSqoop │ ├── README.md │ └── sqoop_activity.json ├── S3TsvFilesToRedshiftTablesIfReady │ ├── S3TsvFilesToRedshiftTablesIfReady.json │ └── readme.md ├── SQLActivityWithTimeout │ ├── README.md │ ├── pipeline.json │ └── setup │ │ ├── SQLActivitySample.py │ │ ├── Setup.py │ │ ├── SetupPipelineDefinition.py │ │ ├── Teardown.py │ │ ├── Utilities.py │ │ └── setup.json ├── ShellCommandWithFTP │ ├── README.md │ ├── data │ ├── ftpcommands │ └── pipeline.json ├── ShellCommandWithS3StagingDirectory │ ├── README.md │ ├── bashscript.sh │ └── shellcommandwiths3stagingdir.json ├── SimplePigActivity │ ├── pig_activity_sample.json │ └── readme.md ├── SparkPiMaximizeResourceAllocation │ ├── SparkPi-maximizeResource.json │ └── readme.md ├── billing │ ├── readme.md │ └── template.json ├── diagnose │ ├── README.md │ └── diagnose_pipeline.json ├── dynamo-db-export-as-csv │ ├── ddb-to-csv.json │ └── readme.md ├── dynamo-db-export │ ├── DynamoDB-export.json │ ├── example-parameters.json │ └── readme.md ├── dynamo-db-to-redshift │ ├── dynamo-db-to-redshift.json │ └── readme.md ├── dynamodb-to-dynamodb-crossregion │ ├── README.md │ └── pipeline.json ├── dynamodb-to-dynamodb │ ├── README.md │ └── pipeline.json ├── hadoop-activity │ ├── README.md │ └── hadoop-activity-world-count-fair.json ├── helloworld │ ├── README.md │ ├── helloworld.json │ └── setup.py ├── 
json-to-dynamodb │ ├── README.md │ ├── customers.json │ ├── definition.json │ └── json_to_ddb.q ├── kinesis │ ├── README.md │ ├── hive-scripts │ │ ├── create-table-from-kinesis-stream.q │ │ ├── script-runner.sh │ │ └── write-kinesis-to-s3.q │ ├── kinesis-to-s3.json │ └── setup │ │ ├── append-to-stream.sh │ │ └── setup-script.sh ├── oracle-backup │ ├── README.md │ ├── definition.json │ ├── parameters.json │ └── values.json └── rds-to-rds-copy │ └── readme.md └── setup ├── logo └── datapipelinelogo.jpeg ├── stacker.py └── stacker_tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | data-pipeline-samples 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2011-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | awscli==1.8.12 2 | boto3==1.1.4 3 | botocore==1.2.10 4 | colorama==0.3.3 5 | docutils==0.12 6 | futures==2.2.0 7 | jmespath==0.9.0 8 | pyasn1==0.1.9 9 | python-dateutil==2.4.2 10 | rsa==3.2 11 | six==1.10.0 12 | wheel==0.24.0 13 | -------------------------------------------------------------------------------- /samples/DynamoDBExport/DynamoDBTableExport.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "occurrences": "1", 5 | "period": "1 Day", 6 | "name": "RunOnce", 7 | "id": "DefaultSchedule", 8 | "type": "Schedule", 9 | "startAt": "FIRST_ACTIVATION_DATE_TIME", 10 | "maxActiveInstances" : "1" 11 | }, 12 | { 13 | "failureAndRerunMode": "CASCADE", 14 | "schedule": { 15 | "ref": "DefaultSchedule" 16 | }, 17 | "resourceRole": "DataPipelineDefaultResourceRole", 18 | "role": "DataPipelineDefaultRole", 19 | "pipelineLogUri": "s3://", 20 | "scheduleType": "cron", 21 | "name": "Default", 22 | "id": "Default" 23 | }, 24 | { 25 | "maximumRetries": "2", 26 | "name": "TableBackupActivity", 27 | "step": "s3://dynamodb-emr-us-east-1/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')},#{myDDBTableName},#{myDDBReadThroughputRatio}", 28 | "id": "TableBackupActivity", 29 | "runsOn": { 30 | "ref": "EmrClusterForBackup" 31 | }, 32 | "type": "EmrActivity" 33 | }, 34 | { 35 | "bootstrapAction": "s3://elasticmapreduce/bootstrap-actions/configure-hadoop, --yarn-key-value, yarn.nodemanager.resource.memory-mb=12800,--yarn-key-value,yarn.scheduler.minimum-allocation-mb=256,--mapred-key-value,mapreduce.map.memory.mb=500,--mapred-key-value,mapreduce.map.java.opts=-Xmx400M,--mapred-key-value,mapreduce.job.reduce.slowstart.completedmaps=1,--mapred-key-value,mapreduce.map.speculative=false", 36 | "name": "EmrClusterForBackup", 37 | "amiVersion": "3.8.0", 38 | "id": "EmrClusterForBackup", 39 | "type": "EmrCluster", 40 | "masterInstanceType": "m1.medium", 41 | "coreInstanceType": "#{myInstanceType}", 42 | "coreInstanceCount": "#{myInstanceCount}", 43 | "terminateAfter" : "12 hours" 44 | } 45 | ], 46 | "parameters": [ 47 | { 48 | "description": "OutputS3folder", 49 | "id": "myOutputS3Loc", 50 | "type": "AWS::S3::ObjectKey" 51 | }, 52 | { 53 | "default": "0.2", 54 | "watermark": "Valuebetween0.1-1.0", 55 | "description": "DynamoDB Read Throughput Ratio", 56 | "id": "myDDBReadThroughputRatio", 57 | "type": "Double" 58 | }, 59 | { 60 | "description": "DynamoDB Table Name", 61 | "id": "myDDBTableName", 62 | "type": "String" 63 | }, 64 | { 65 | "description": "Instance Type", 66 | "id": "myInstanceType", 67 | "watermark" : "Use m1.medium if Read Capacity Units for the job <= 900. Else use m3.xlarge", 68 | "type": "String", 69 | "default": "m3.xlarge" 70 | }, 71 | { 72 | "description": "Instance Count", 73 | "watermark" : "(Read Capacity Units / 300) for m1.medium if RCU <= 900. 
Else (RCU / 1500) for m3.xlarge", 74 | "id": "myInstanceCount", 75 | "type": "Integer", 76 | "default": "1" 77 | }, 78 | { 79 | "description" : "Burst IOPs", 80 | "watermark" : "Add IOPS to the DDB table by this percent for the duration of the export job", 81 | "id" : "myBurstIOPS", 82 | "type" : "Double", 83 | "default" : "0.0" 84 | } 85 | ] 86 | } 87 | -------------------------------------------------------------------------------- /samples/DynamoDBExport/DynamoDBtoCSV.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment" : "Activity used to run the hive script to export data to CSV", 5 | "output": { 6 | "ref": "DataNodeId_cnlSW" 7 | }, 8 | "input": { 9 | "ref": "DataNodeId_1ERqq" 10 | }, 11 | "name": "TableBackupActivity", 12 | "hiveScript": "DROP TABLE IF EXISTS tempHiveTable;\n\nDROP TABLE IF EXISTS s3TempTable;\n\nCREATE EXTERNAL TABLE tempHiveTable (#{myS3ColMapping})\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"#{myDDBTableName}\", \"dynamodb.column.mapping\" = \"#{myDDBTableColMapping}\");\n \nCREATE EXTERNAL TABLE s3TempTable (#{myS3ColMapping})\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'\nLOCATION '#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}';\n \nINSERT OVERWRITE TABLE s3TempTable SELECT * FROM tempHiveTable;", 13 | "runsOn": { "ref" : "EmrClusterForBackup" }, 14 | "id": "TableBackupActivity", 15 | "type": "HiveActivity" 16 | }, 17 | { 18 | "period": "1 days", 19 | "name": "Every 1 day", 20 | "id": "DefaultSchedule", 21 | "type": "Schedule", 22 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 23 | }, 24 | { 25 | "myComment" : "The DynamoDB table from which we need to export data from", 26 | "dataFormat": { 27 | "ref": "DDBExportFormat" 28 | }, 29 | "name": "DynamoDB", 30 | "id": "DataNodeId_1ERqq", 31 | "type": "DynamoDBDataNode", 32 | "tableName": "#{myDDBTableName}" 33 | }, 34 | { 35 | "failureAndRerunMode": "CASCADE", 36 | "schedule": { 37 | "ref": "DefaultSchedule" 38 | }, 39 | "resourceRole": "DataPipelineDefaultResourceRole", 40 | "role": "DataPipelineDefaultRole", 41 | "pipelineLogUri": "#{myLogUri}", 42 | "scheduleType": "cron", 43 | "name": "Default", 44 | "id": "Default" 45 | }, 46 | { 47 | "name": "EmrClusterForBackup", 48 | "coreInstanceType": "m1.medium", 49 | "coreInstanceCount": "1", 50 | "masterInstanceType": "m1.medium", 51 | "amiVersion": "3.3.2", 52 | "id": "EmrClusterForBackup", 53 | "type": "EmrCluster", 54 | "terminateAfter": "2 Hours" 55 | }, 56 | { 57 | "myComment" : "The S3 path to which we export data to", 58 | "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}/", 59 | "dataFormat": { 60 | "ref": "DataFormatId_xqWRk" 61 | }, 62 | "name": "S3DataNode", 63 | "id": "DataNodeId_cnlSW", 64 | "type": "S3DataNode" 65 | }, 66 | { 67 | "myComment" : "Format for the S3 Path", 68 | "name": "DefaultDataFormat1", 69 | "column": "not_used STRING", 70 | "id": "DataFormatId_xqWRk", 71 | "type": "CSV" 72 | }, 73 | { 74 | "myComment" : "Format for the DynamoDB table", 75 | "name": "DDBExportFormat", 76 | "id": "DDBExportFormat", 77 | "column": "not_used STRING", 78 | "type": "DynamoDBExportDataFormat" 79 | } 80 | ], 81 | "parameters": [ 82 | { 83 | "description": "Output S3 folder", 84 | "id": "myOutputS3Loc", 85 | "type": "AWS::S3::ObjectKey" 86 | }, 87 | { 88 | "description": "DynamoDB table name", 89 | "id": "myDDBTableName", 90 | "type": 
"String" 91 | }, 92 | { 93 | "description": "S3 to DynamoDB Column Mapping", 94 | "id": "myDDBTableColMapping", 95 | "type": "String" 96 | }, 97 | { 98 | "description": "S3 Column Mappings", 99 | "id": "myS3ColMapping", 100 | "type": "String" 101 | }, 102 | { 103 | "description": "DataPipeline Log Uri", 104 | "id": "myLogUri", 105 | "type": "String" 106 | } 107 | ] 108 | } 109 | -------------------------------------------------------------------------------- /samples/DynamoDBExport/readme.md: -------------------------------------------------------------------------------- 1 | #DynamoDB to CSV export 2 | 3 | ##About the sample 4 | The pipeline definition is used for exporting DynamoDB data to a CSV format. 5 | 6 | ##Running the pipeline 7 | 8 | Example DynamoDB table with keys: customer_id, income, demographics, financial 9 | 10 | User needs to provide: 11 | 12 | 1. Output S3 folder: The s3 folder prefix to which the CSV data is to be exported. 13 | 2. DynamoDB read throughput ratio: The throughput to be used for the export operation. 14 | 3. DynamoDB table name: The table name from which we need to export the data. 15 | 4. S3 Column Mappings: A comma seperated column definitions. For example, customer_id string, income string, demographics string, financial string 16 | 5. S3 to DynamoDB Column Mapping: A comma separated mapping of S3 to DynamoDB for e.g. customer_id:customer_id,income:income,demographics:demographics,financial:financial. Please take care of not using spaces in between the commas. 17 | 6. Log Uri: S3 log path to capture the pipeline logs. 18 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | *.iml 3 | dependency-reduced-pom.xml 4 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | DynamoDBExportSample 8 | DynamoDBExportSample 9 | 0.1 10 | 11 | 12 | 13 | com.amazonaws 14 | aws-java-sdk 15 | 1.10.33 16 | 17 | 18 | com.google.guava 19 | guava 20 | 19.0-rc2 21 | 22 | 23 | commons-cli 24 | commons-cli 25 | 1.3.1 26 | 27 | 28 | org.apache.logging.log4j 29 | log4j-api 30 | 2.4.1 31 | 32 | 33 | org.apache.logging.log4j 34 | log4j-core 35 | 2.4.1 36 | 37 | 38 | 39 | 40 | 41 | 42 | org.apache.maven.plugins 43 | maven-shade-plugin 44 | 2.3 45 | 46 | 47 | package 48 | 49 | shade 50 | 51 | 52 | 53 | 54 | com.amazonaws.datapipelinesamples.ddbexport.Main 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | org.apache.maven.plugins 63 | maven-compiler-plugin 64 | 3.3 65 | 66 | 1.8 67 | 1.8 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/readme.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline DynamoDB Export Java Sample 2 | 3 | ## Overview 4 | 5 | This sample makes it easy to create a pipeline that uses the latest DynamoDB export template EMR activity. You provide 6 | parameters and the tool will create the pipeline and run and monitor it once so you can verify that it is healthy. 7 | 8 | This sample also provides an example application using the AWS Data Pipeline Java SDK. It demonstrates how to 9 | create, run and monitor a pipeline. 
10 | 11 | ## Prerequisites 12 | 13 | You must have the AWS CLI and the default IAM roles set up in order to run the sample. Please see the 14 | [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions on how to do this. 15 | 16 | 17 | ## Getting started 18 | 19 | Build: mvn clean package

20 | View parameters description: java -jar path/to/DynamoDBExportSample-0.1.jar help

21 | Run: java -jar path/to/DynamoDBExportSample-0.1.jar <-yourParam foo> 22 | 23 | ## Example 24 | 25 | Create and run on a pipeline that runs once per day: 26 | 27 | java -jar /Users/foobar/DynamoDBExportJava/target/DynamoDBExportSample-0.1.jar -credentialsFile 28 | /Users/foobar/.aws/credentials -myDDBTableName footable -myOutputS3Location s3://foobar/ddb-exports -schedule daily 29 | -myLogsS3Location s3://foobar/logs -myDDBRegion us-east-1 30 | 31 | Create and run on a pipeline that runs once: 32 | 33 | java -jar /Users/foobar/DynamoDBExportJava/target/DynamoDBExportSample-0.1.jar -credentialsFile 34 | /Users/foobar/.aws/credentials -myDDBTableName footable -myOutputS3Location s3://foobar/ddb-exports -schedule once 35 | -myLogsS3Location s3://foobar/logs -myDDBRegion us-east-1 36 | 37 | ## Disclaimer 38 | 39 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for 40 | production environments. Users should carefully inspect code samples before running them. 41 | 42 | Use at your own risk. 43 | 44 | Licensed under the MIT-0 License. 45 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/src/main/java/com/amazonaws/datapipelinesamples/ddbexport/CommandLineArgParser.java: -------------------------------------------------------------------------------- 1 | package com.amazonaws.datapipelinesamples.ddbexport; 2 | 3 | import org.apache.commons.cli.CommandLine; 4 | import org.apache.commons.cli.CommandLineParser; 5 | import org.apache.commons.cli.DefaultParser; 6 | import org.apache.commons.cli.HelpFormatter; 7 | import org.apache.commons.cli.Options; 8 | import org.apache.commons.cli.ParseException; 9 | import org.apache.logging.log4j.LogManager; 10 | import org.apache.logging.log4j.Logger; 11 | 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | 15 | public class CommandLineArgParser { 16 | private static final Logger logger = LogManager.getLogger(CommandLineArgParser.class); 17 | 18 | public static Map parseParameters(final String[] args) { 19 | Options params = new Options(); 20 | params.addOption("myDDBTableName", true, "Dynamo DB source table that will be exported (REQUIRED)"); 21 | params.addOption("myOutputS3Location", true, "S3 bucket where the export will be stored (REQUIRED)"); 22 | params.addOption("myLogsS3Location", true, "S3 bucket where the logs will be stored (REQUIRED)"); 23 | params.addOption("schedule", true, "Schedule to run pipeline on. Options are: once or daily (REQUIRED)"); 24 | params.addOption("credentialsFile", true, "Path to AWS credentials file. ex: /Users/foo/.aws/credentials " + 25 | "(REQUIRED)"); 26 | params.addOption("myDDBRegion", true, "Region to run pipeline in. 
Default: us-east-1 (Optional)"); 27 | 28 | return getParamsMap(args, params); 29 | } 30 | 31 | private static Map getParamsMap(final String[] args, final Options params) { 32 | CommandLineParser parser = new DefaultParser(); 33 | CommandLine cmd; 34 | Map paramsMap = new HashMap<>(); 35 | 36 | try { 37 | cmd = parser.parse(params, args); 38 | addToMapIfPreset(cmd, "credentialsFile", true, paramsMap); 39 | addToMapIfPreset(cmd, "myDDBTableName", true, paramsMap); 40 | addToMapIfPreset(cmd, "myOutputS3Location", true, paramsMap); 41 | addToMapIfPreset(cmd, "myLogsS3Location", true, paramsMap); 42 | addToMapIfPreset(cmd, "schedule", true, paramsMap); 43 | addToMapIfPreset(cmd, "myDDBRegion", false, paramsMap); 44 | } catch (ParseException | RuntimeException e) { 45 | logger.error(e.getMessage()); 46 | printHelp(params); 47 | throw new RuntimeException(); 48 | } 49 | 50 | return paramsMap; 51 | } 52 | 53 | private static void printHelp(final Options params) { 54 | HelpFormatter formatter = new HelpFormatter(); 55 | formatter.printHelp("maven", params); 56 | } 57 | 58 | private static void addToMapIfPreset(final CommandLine cmd, final String paramName, final boolean required, 59 | final Map paramsMap) { 60 | if(cmd.hasOption(paramName)) { 61 | paramsMap.put(paramName, cmd.getOptionValue(paramName)); 62 | } else if (required) { 63 | logger.error("Unable to find required parameter: " + paramName); 64 | throw new RuntimeException(); 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/src/main/java/com/amazonaws/datapipelinesamples/ddbexport/Main.java: -------------------------------------------------------------------------------- 1 | package com.amazonaws.datapipelinesamples.ddbexport; 2 | 3 | import com.amazonaws.auth.AWSCredentials; 4 | import com.amazonaws.auth.profile.ProfileCredentialsProvider; 5 | import com.amazonaws.services.datapipeline.DataPipelineClient; 6 | 7 | import java.util.Map; 8 | 9 | public class Main { 10 | 11 | private static DataPipelineClient dataPipelineClient; 12 | 13 | public static void main(String args[]) { 14 | Map params = CommandLineArgParser.parseParameters(args); 15 | 16 | dataPipelineClient = getClient(params.get("credentialsFile")); 17 | 18 | String pipelineId = DDBExportPipelineCreator.createPipeline(dataPipelineClient); 19 | 20 | DDBExportPipelineCreator.putPipelineDefinition(dataPipelineClient, pipelineId, params); 21 | 22 | DDBExportPipelineCreator.activatePipeline(dataPipelineClient, pipelineId); 23 | 24 | PipelineMonitor.monitorPipelineUntilCompleted(dataPipelineClient, pipelineId, "TableBackupActivity"); 25 | } 26 | 27 | private static DataPipelineClient getClient(final String profileName) { 28 | AWSCredentials credentials = new ProfileCredentialsProvider(profileName, "default").getCredentials(); 29 | return new DataPipelineClient(credentials); 30 | } 31 | } -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/src/main/java/com/amazonaws/datapipelinesamples/ddbexport/PipelineMonitor.java: -------------------------------------------------------------------------------- 1 | package com.amazonaws.datapipelinesamples.ddbexport; 2 | 3 | import com.amazonaws.services.datapipeline.DataPipelineClient; 4 | import com.amazonaws.services.datapipeline.model.DescribeObjectsRequest; 5 | import com.amazonaws.services.datapipeline.model.DescribeObjectsResult; 6 | import com.amazonaws.services.datapipeline.model.Field; 7 | import 
com.amazonaws.services.datapipeline.model.QueryObjectsRequest; 8 | import com.amazonaws.services.datapipeline.model.QueryObjectsResult; 9 | import org.apache.logging.log4j.LogManager; 10 | import org.apache.logging.log4j.Logger; 11 | 12 | import java.util.Timer; 13 | import java.util.TimerTask; 14 | import java.util.stream.Collectors; 15 | 16 | public class PipelineMonitor { 17 | 18 | private static final Logger logger = LogManager.getLogger(DDBExportPipelineCreator.class); 19 | 20 | public static void monitorPipelineUntilCompleted(final DataPipelineClient dataPipelineClient, 21 | final String pipelineId, final String activityName) { 22 | Timer timer = new Timer(); 23 | int thirtySeconds = 30 * 1000; 24 | timer.schedule(new TimerTask() { 25 | @Override 26 | public void run() { 27 | QueryObjectsRequest queryObjectsRequest = new QueryObjectsRequest().withPipelineId(pipelineId) 28 | .withSphere("INSTANCE"); 29 | QueryObjectsResult result = dataPipelineClient.queryObjects(queryObjectsRequest); 30 | 31 | if(result.getIds().size() <= 0) { 32 | logger.info("Creating pipeline object execution graph"); 33 | return; 34 | } 35 | 36 | String emrActivityId = result.getIds().stream().filter(r -> r.contains(activityName)) 37 | .collect(Collectors.joining("\n")); 38 | DescribeObjectsResult describeObjectsResult = dataPipelineClient 39 | .describeObjects(new DescribeObjectsRequest().withObjectIds(emrActivityId) 40 | .withPipelineId(pipelineId)); 41 | 42 | String status = ""; 43 | for(Field field : describeObjectsResult.getPipelineObjects().get(0).getFields()) { 44 | if (field.getKey().equals("@status")) { 45 | logger.info(field.getKey() + "=" + field.getStringValue()); 46 | status = field.getStringValue(); 47 | } 48 | } 49 | 50 | if (status.equals("CANCELED") || status.equals("FINISHED") || status.equals("FAILED")) { 51 | this.cancel(); 52 | timer.cancel(); 53 | } 54 | } 55 | }, 0, thirtySeconds); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/DynamoDBImport/XMLtoDynamoDBImport.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "name": "EmrClusterForBackup", 5 | "coreInstanceType": "m1.medium", 6 | "coreInstanceCount": "1", 7 | "masterInstanceType": "m1.medium", 8 | "amiVersion": "3.3.2", 9 | "id": "EmrClusterForBackup", 10 | "type": "EmrCluster", 11 | "terminateAfter": "2 Hours" 12 | }, 13 | { 14 | "period": "1 days", 15 | "name": "Every 1 day", 16 | "id": "DefaultSchedule", 17 | "type": "Schedule", 18 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 19 | }, 20 | { 21 | "name": "DefaultDataFormat1", 22 | "column": "not_used STRING", 23 | "id": "DataFormatId_xqWRk", 24 | "myComment": "Format for the S3 Path", 25 | "type": "CSV" 26 | }, 27 | { 28 | "failureAndRerunMode": "CASCADE", 29 | "schedule": { 30 | "ref": "DefaultSchedule" 31 | }, 32 | "resourceRole": "DataPipelineDefaultResourceRole", 33 | "role": "DataPipelineDefaultRole", 34 | "pipelineLogUri": "#{myLogUri}", 35 | "scheduleType": "cron", 36 | "name": "Default", 37 | "id": "Default" 38 | }, 39 | { 40 | "name": "ShellCommandActivityCp", 41 | "runsOn": { "ref" : "EmrClusterForBackup" }, 42 | "id": "ActivityId_zrRQz", 43 | "type": "ShellCommandActivity", 44 
| "command": "aws s3 cp s3://data-pipeline-samples/dynamodbxml/input/serde.xml /home/hadoop/serde-#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}.xml" 45 | }, 46 | { 47 | "dataFormat": { 48 | "ref": "DDBExportFormat" 49 | }, 50 | "name": "DynamoDB", 51 | "id": "DataNodeId_1ERqq", 52 | "type": "DynamoDBDataNode", 53 | "myComment": "The DynamoDB table from which we need to export data from", 54 | "tableName": "customers" 55 | }, 56 | { 57 | "column": "not_used STRING", 58 | "name": "DDBExportFormat", 59 | "id": "DDBExportFormat", 60 | "type": "DynamoDBExportDataFormat", 61 | "myComment": "Format for the DynamoDB table" 62 | }, 63 | { 64 | "directoryPath": "s3://data-pipeline-samples/dynamodbxml/input", 65 | "dataFormat": { 66 | "ref": "DataFormatId_xqWRk" 67 | }, 68 | "name": "S3DataNode", 69 | "id": "DataNodeId_cnlSW", 70 | "type": "S3DataNode", 71 | "myComment": "The S3 path to which we export data to" 72 | }, 73 | { 74 | "output": { 75 | "ref": "DataNodeId_1ERqq" 76 | }, 77 | "input": { 78 | "ref": "DataNodeId_cnlSW" 79 | }, 80 | "dependsOn": { 81 | "ref": "ActivityId_zrRQz" 82 | }, 83 | "name": "TableBackupActivity", 84 | "hiveScript": "add jar s3://data-pipeline-samples/dynamodbxml/hivexmlserde-1.0.5.3.jar;\nDROP TABLE IF EXISTS xml_bank;\nCREATE EXTERNAL TABLE xml_bank(customer_id STRING, income string, demographics string, financial string)\nROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'\nWITH SERDEPROPERTIES (\n\"column.xpath.customer_id\"=\"/record/@customer_id\",\n\"column.xpath.income\"=\"/record/income/text()\",\n\"column.xpath.demographics\"=\"/record/demographics/*\",\n\"column.xpath.financial\"=\"/record/financial/*\"\n)\nSTORED AS\nINPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'\nTBLPROPERTIES (\n\"xmlinput.start\"=\"\"\n);\nLOAD DATA LOCAL inpath '/home/hadoop/serde-#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}.xml' overwrite into table xml_bank;\nDROP TABLE IF EXISTS hiveTableName;\nCREATE EXTERNAL TABLE hiveTableName (col1 string, col2 string, col3 string, col4 string)\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"customers\", \n\"dynamodb.column.mapping\" = \"col1:customer_id,col2:income,col3:demographics,col4:financial\"); \nINSERT OVERWRITE TABLE hiveTableName SELECT * FROM xml_bank;", 85 | "runsOn": { "ref" : "EmrClusterForBackup" }, 86 | "id": "TableBackupActivity", 87 | "type": "HiveActivity", 88 | "myComment": "Activity used to run the hive script to export data to CSV" 89 | } 90 | ] 91 | } 92 | -------------------------------------------------------------------------------- /samples/DynamoDBImport/readme.md: -------------------------------------------------------------------------------- 1 | #XML to DynamoDB Import 2 | 3 | ##Running the sample pipeline 4 | The json format could be either directly imported in the Console -> Create Pipeline or used in the aws datapipeline cli.
5 | The pipeline definition copies an example XML file from s3://data-pipeline-samples/dynamodbxml/input/serde.xml to the local filesystem. This step is required for creating a temporary XML table using Hive. The Hive script is configured to run against a DynamoDB table with the keys "customer_id, financial, income, demographics". Finally, it imports the data from the temporary XML table into DynamoDB.
6 | The data from the XML file is parsed using the Hive XML SerDe; each output column is mapped to an XPath expression, as shown in the excerpt below.
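For readability, this is the core of the Hive DDL that is embedded (JSON-escaped) in the hiveScript field of the pipeline definition above; each column.xpath property is an XPath expression evaluated against a record element of the input XML (excerpt only, not the full script):

```sql
-- Excerpt of the hiveScript in XMLtoDynamoDBImport.json, reformatted for readability.
CREATE EXTERNAL TABLE xml_bank(customer_id STRING, income string, demographics string, financial string)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
  "column.xpath.customer_id"="/record/@customer_id",
  "column.xpath.income"="/record/income/text()",
  "column.xpath.demographics"="/record/demographics/*",
  "column.xpath.financial"="/record/financial/*"
)
STORED AS
  INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat';
```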
7 | The end result is that the data is available in the DynamoDB table.
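As a concrete starting point, the commands below are a minimal sketch (not part of the original sample) of registering and activating this definition with the AWS CLI. They assume the definition is saved locally as XMLtoDynamoDBImport.json and that the default Data Pipeline IAM roles exist; the pipeline name and S3 log path are placeholders, and if the #{myLogUri} reference is not resolved via --parameter-values in your setup, replace it with a literal S3 path in the JSON before uploading.

```sh
# Hypothetical names and paths; adjust to your account before running.
aws datapipeline create-pipeline --name xml-to-dynamodb-import --unique-id xml-to-dynamodb-import
aws datapipeline put-pipeline-definition \
    --pipeline-id <pipeline-id-from-create-pipeline> \
    --pipeline-definition file://XMLtoDynamoDBImport.json \
    --parameter-values myLogUri=s3://your-bucket/datapipeline-logs/
aws datapipeline activate-pipeline --pipeline-id <pipeline-id-from-create-pipeline>
```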
8 | 9 | 10 | -------------------------------------------------------------------------------- /samples/DynamoDBImportCSV/CSVtoDynamoDB.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment" : "Activity used to run the hive script to import CSV data", 5 | "output": { 6 | "ref": "DataNodeId_cnlSW" 7 | }, 8 | "input": { 9 | "ref": "DataNodeId_1ERqq" 10 | }, 11 | "name": "TableRestoreActivity", 12 | "hiveScript": "DROP TABLE IF EXISTS tempHiveTable;\n\nDROP TABLE IF EXISTS s3TempTable;\n\nCREATE EXTERNAL TABLE tempHiveTable (#{myDDBColDefn})\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"#{myDDBTableName}\", \"dynamodb.column.mapping\" = \"#{myDDBTableColMapping}\");\n \nCREATE EXTERNAL TABLE s3TempTable (#{myS3ColMapping})\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\\n' LOCATION '#{myInputS3Loc}';\n \nINSERT OVERWRITE TABLE tempHiveTable SELECT * FROM s3TempTable;", 13 | "id": "TableRestoreActivity", 14 | "runsOn": { "ref" : "EmrClusterForRestore" }, 15 | "stage": "false", 16 | "type": "HiveActivity" 17 | }, 18 | { 19 | "myComment" : "The DynamoDB table from which we need to import data from", 20 | "dataFormat": { 21 | "ref": "DDBExportFormat" 22 | }, 23 | "name": "DynamoDB", 24 | "id": "DataNodeId_1ERqq", 25 | "type": "DynamoDBDataNode", 26 | "tableName": "#{myDDBTableName}" 27 | }, 28 | { 29 | "failureAndRerunMode": "CASCADE", 30 | "resourceRole": "DataPipelineDefaultResourceRole", 31 | "role": "DataPipelineDefaultRole", 32 | "pipelineLogUri": "#{myLogUri}", 33 | "scheduleType": "ONDEMAND", 34 | "name": "Default", 35 | "id": "Default" 36 | }, 37 | { 38 | "name": "EmrClusterForRestore", 39 | "coreInstanceType": "m1.medium", 40 | "coreInstanceCount": "1", 41 | "masterInstanceType": "m1.medium", 42 | "releaseLabel": "emr-4.4.0", 43 | "id": "EmrClusterForRestore", 44 | "type": "EmrCluster", 45 | "terminateAfter": "2 Hours" 46 | }, 47 | { 48 | "myComment" : "The S3 path from which we import data from", 49 | "directoryPath": "#{myInputS3Loc}", 50 | "dataFormat": { 51 | "ref": "DataFormatId_xqWRk" 52 | }, 53 | "name": "S3DataNode", 54 | "id": "DataNodeId_cnlSW", 55 | "type": "S3DataNode" 56 | }, 57 | { 58 | "myComment" : "Format for the S3 Path", 59 | "name": "DefaultDataFormat1", 60 | "column": "not_used STRING", 61 | "id": "DataFormatId_xqWRk", 62 | "type": "CSV" 63 | }, 64 | { 65 | "myComment" : "Format for the DynamoDB table", 66 | "name": "DDBExportFormat", 67 | "id": "DDBExportFormat", 68 | "column": "not_used STRING", 69 | "type": "DynamoDBExportDataFormat" 70 | } 71 | ], 72 | "parameters": [ 73 | { 74 | "description": "Input S3 folder", 75 | "id": "myInputS3Loc", 76 | "default": "s3://datapipeline-sample-csv/", 77 | "type": "AWS::S3::ObjectKey" 78 | }, 79 | { 80 | "description": "DynamoDB table name", 81 | "id": "myDDBTableName", 82 | "type": "String" 83 | }, 84 | { 85 | "description": "S3 to DynamoDB Column Mapping", 86 | "id": "myDDBTableColMapping", 87 | "default" : "id:id,age:age,job:job,marital:marital,education:education,default:default,housing:housing,loan:loan,contact:contact,month:month,day_of_week:day_of_week,duration:duration,campaign:campaign,pdays:pdays,previous:previous,poutcome:poutcome,emp_var_rate:emp_var_rate,cons_price_idx:cons_price_idx,cons_conf_idx:cons_conf_idx,euribor3m:euribor3m,nr_employed:nr_employed,y:y", 88 | "type": "String" 89 | }, 90 | { 91 | "description": "S3 Column Mappings", 92 | "id": 
"myS3ColMapping", 93 | "default" : "id string,age int,job string,marital string,education string,default string,housing string,loan string,contact string,month string,day_of_week string,duration int,campaign int,pdays int,previous int,poutcome string,emp_var_rate double,cons_price_idx double,cons_conf_idx double,euribor3m double,nr_employed double,y int", 94 | "type": "String" 95 | }, 96 | { 97 | "description": "DynamoDB Column Mappings", 98 | "id": "myDDBColDefn", 99 | "default" : "id string,age bigint,job string,marital string,education string,default string,housing string,loan string,contact string,month string,day_of_week string,duration bigint,campaign bigint,pdays bigint,previous bigint,poutcome string,emp_var_rate double,cons_price_idx double,cons_conf_idx double,euribor3m double,nr_employed double,y bigint", 100 | "type": "String" 101 | }, 102 | { 103 | "description": "DataPipeline Log Uri", 104 | "id": "myLogUri", 105 | "type": "AWS::S3::ObjectKey" 106 | } 107 | ] 108 | } 109 | -------------------------------------------------------------------------------- /samples/DynamoDBImportCSV/README.md: -------------------------------------------------------------------------------- 1 | #DynamoDB to CSV import 2 | 3 | ##About the sample 4 | The pipeline definition is used to import DynamoDB data to a CSV format. 5 | 6 | ##Running the pipeline 7 | 8 | Example DynamoDB table with keys: id 9 | 10 | User needs to provide: 11 | 12 | 1. Input S3 folder: The s3 folder prefix from which the CSV data is to be imported. 13 | 2. DynamoDB read throughput ratio: The throughput to be used for the import operation. 14 | 3. DynamoDB table name: The table name from which we need to import the data. 15 | 4. S3 Column Mappings: A comma seperated column definitions. For example, customer_id string, income string, demographics string, financial string 16 | 4. Dynamodb Column Mappings: A comma seperated column definitions. For example, customer_id string, income string, demographics string, financial string 17 | 5. S3 to DynamoDB Column Mapping: A comma separated mapping of S3 to DynamoDB for e.g. customer_id:customer_id,income:income,demographics:demographics,financial:financial. Please take care of not using spaces in between the commas. 18 | 6. Log Uri: S3 log path to capture the pipeline logs. 
19 | -------------------------------------------------------------------------------- /samples/EFSBackup/1-Node-EFSBackupPipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects" : [ 3 | { 4 | "id" : "Default", 5 | "scheduleType" : "cron", 6 | "failureAndRerunMode" : "CASCADE", 7 | "schedule" : { 8 | "ref" : "DefaultSchedule" 9 | }, 10 | "name" : "Default", 11 | "role" : "DataPipelineDefaultRole", 12 | "resourceRole" : "DataPipelineDefaultResourceRole" 13 | }, 14 | { 15 | "id" : "EC2ResourceObj", 16 | "terminateAfter" : "70 Minutes", 17 | "instanceType" : "#{myInstanceType}", 18 | "name" : "EC2ResourceObj", 19 | "type" : "Ec2Resource", 20 | "securityGroupIds" : [ 21 | "#{mySrcSecGroupID}", 22 | "#{myBackupSecGroupID}" 23 | ], 24 | "subnetId" : "#{mySubnetID}", 25 | "associatePublicIpAddress" : "true", 26 | "imageId" : "#{myImageID}" 27 | }, 28 | { 29 | "id" : "DefaultSchedule", 30 | "name" : "Every Day", 31 | "startAt" : "FIRST_ACTIVATION_DATE_TIME", 32 | "type" : "Schedule", 33 | "period" : "1 Days" 34 | }, 35 | { 36 | "id" : "ShellCommandActivityObj", 37 | "name" : "ShellCommandActivityObj", 38 | "runsOn" : { 39 | "ref" : "EC2ResourceObj" 40 | }, 41 | "command" : "#{myShellCmd}", 42 | "scriptArgument" : [ 43 | "#{myEfsSource}", 44 | "#{myEfsBackup}", 45 | "#{myInterval}", 46 | "#{myRetainedBackups}", 47 | "#{myEfsID}" 48 | ], 49 | "type" : "ShellCommandActivity", 50 | "stage" : "true" 51 | } 52 | ], 53 | "parameters" : [ 54 | { 55 | "id" : "myShellCmd", 56 | "default" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-backup.sh\nchmod a+x efs-backup.sh\n./efs-backup.sh $1 $2 $3 $4 $5", 57 | "description" : "Shell command to run.", 58 | "type" : "String" 59 | }, 60 | { 61 | "id" : "myInstanceType", 62 | "default" : "m3.medium", 63 | "description" : "Instance type for creating backups.", 64 | "allowedValues" : [ 65 | "t1.micro", 66 | "m3.medium", 67 | "m3.large", 68 | "m3.xlarge", 69 | "m3.2xlarge", 70 | "c3.large", 71 | "c3.xlarge", 72 | "c3.2xlarge", 73 | "c3.4xlarge", 74 | "c3.8xlarge" 75 | ], 76 | "type" : "String" 77 | }, 78 | { 79 | "id" : "mySubnetID", 80 | "default" : "subnet-1234abcd", 81 | "description" : "VPC subnet for your backup EC2 instance (ideally the same subnet used for the production EFS mount point).", 82 | "type" : "String" 83 | }, 84 | { 85 | "id" : "mySrcSecGroupID", 86 | "default" : "sg-1111111b", 87 | "description" : "Security group that can connect to the Production EFS mount point.", 88 | "type" : "String" 89 | }, 90 | { 91 | "id" : "myBackupSecGroupID", 92 | "default" : "sg-9999999b", 93 | "description" : "Security group that can connect to the Backup EFS mount point.", 94 | "type" : "String" 95 | }, 96 | { 97 | "id" : "myInterval", 98 | "default" : "daily", 99 | "description" : "Interval for backups.", 100 | "allowedValues" : [ 101 | "hourly", 102 | "daily", 103 | "weekly", 104 | "monthly" 105 | ], 106 | "type" : "String" 107 | }, 108 | { 109 | "id" : "myRetainedBackups", 110 | "default" : "7", 111 | "description" : "Number of backups to retain.", 112 | "type" : "Integer" 113 | }, 114 | { 115 | "id" : "myEfsID", 116 | "default" : "backup-fs-12345678", 117 | "description" : "Name for the directory that will contain your backups.", 118 | "type" : "String" 119 | }, 120 | { 121 | "id" : "myEfsSource", 122 | "default" : "10.0.1.32:/", 123 | "description" : "Production EFS mount target IP address.", 124 | "type" : "String" 125 | }, 126 | { 127 | "id" : 
"myEfsBackup", 128 | "default" : "10.0.1.75:/", 129 | "description" : "Backup EFS mount target IP address.", 130 | "type" : "String" 131 | }, 132 | { 133 | "id" : "myImageID", 134 | "default" : "ami-12345678", 135 | "description" : "AMI ID for the EC2 instance.", 136 | "type" : "String" 137 | } 138 | ] 139 | } -------------------------------------------------------------------------------- /samples/EFSBackup/1-Node-EFSRestorePipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects" : [ 3 | { 4 | "id" : "Default", 5 | "scheduleType" : "cron", 6 | "failureAndRerunMode" : "CASCADE", 7 | "schedule" : { 8 | "ref" : "DefaultSchedule" 9 | }, 10 | "name" : "Default", 11 | "role" : "DataPipelineDefaultRole", 12 | "resourceRole" : "DataPipelineDefaultResourceRole" 13 | }, 14 | { 15 | "id" : "EC2ResourceObj", 16 | "terminateAfter" : "70 Minutes", 17 | "instanceType" : "#{myInstanceType}", 18 | "name" : "EC2ResourceObj", 19 | "type" : "Ec2Resource", 20 | "securityGroupIds" : [ 21 | "#{mySrcSecGroupID}", 22 | "#{myBackupSecGroupID}" 23 | ], 24 | "subnetId" : "#{mySubnetID}", 25 | "associatePublicIpAddress" : "true", 26 | "imageId" : "#{myImageID}" 27 | }, 28 | { 29 | "id" : "DefaultSchedule", 30 | "name" : "Every Day", 31 | "startAt" : "FIRST_ACTIVATION_DATE_TIME", 32 | "type" : "Schedule", 33 | "occurrences" : "1", 34 | "period" : "1 Days" 35 | }, 36 | { 37 | "id" : "ShellCommandActivityObj", 38 | "name" : "ShellCommandActivityObj", 39 | "runsOn" : { 40 | "ref" : "EC2ResourceObj" 41 | }, 42 | "command" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-restore.sh\nchmod a+x efs-restore.sh\n./efs-restore.sh $1 $2 $3 $4 $5", 43 | "scriptArgument" : [ 44 | "#{myEfsSource}", 45 | "#{myEfsBackup}", 46 | "#{myInterval}", 47 | "#{myBackup}", 48 | "#{myEfsID}" 49 | ], 50 | "type" : "ShellCommandActivity", 51 | "stage" : "true" 52 | } 53 | ], 54 | "parameters" : [ 55 | { 56 | "id" : "myInstanceType", 57 | "default" : "m3.large", 58 | "description" : "Instance type for performing the restore.", 59 | "allowedValues" : [ 60 | "t1.micro", 61 | "m3.medium", 62 | "m3.large", 63 | "m3.xlarge", 64 | "m3.2xlarge", 65 | "c3.large", 66 | "c3.xlarge", 67 | "c3.2xlarge", 68 | "c3.4xlarge", 69 | "c3.8xlarge" 70 | ], 71 | "type" : "String" 72 | }, 73 | { 74 | "id" : "mySubnetID", 75 | "default" : "subnet-1234abcd", 76 | "description" : "VPC subnet for your restoration EC2 instance (ideally the same subnet used for the backup EFS mount point).", 77 | "type" : "String" 78 | }, 79 | { 80 | "id" : "mySrcSecGroupID", 81 | "default" : "sg-1111111b", 82 | "description" : "Security group that can connect to the Production EFS mount point.", 83 | "type" : "String" 84 | }, 85 | { 86 | "id" : "myBackupSecGroupID", 87 | "default" : "sg-9999999b", 88 | "description" : "Security group that can connect to the Backup EFS mount point.", 89 | "type" : "String" 90 | }, 91 | { 92 | "id" : "myInterval", 93 | "default" : "daily", 94 | "description" : "Interval that you chose for the backup your going to restore.", 95 | "allowedValues" : [ 96 | "hourly", 97 | "daily", 98 | "weekly", 99 | "monthly" 100 | ], 101 | "type" : "String" 102 | }, 103 | { 104 | "id" : "myBackup", 105 | "default" : "0", 106 | "description" : "Backup number to restore (0 = the most recent backup).", 107 | "type" : "Integer" 108 | }, 109 | { 110 | "id" : "myEfsID", 111 | "default" : "backup-fs-12345678", 112 | "description" : "Name for the directory that already 
contains your backups.", 113 | "type" : "String" 114 | }, 115 | { 116 | "id" : "myEfsSource", 117 | "default" : "10.0.1.32:/", 118 | "description" : "Production EFS mount target IP address.", 119 | "type" : "String" 120 | }, 121 | { 122 | "id" : "myEfsBackup", 123 | "default" : "10.0.1.75:/", 124 | "description" : "Backup EFS mount target IP address.", 125 | "type" : "String" 126 | }, 127 | { 128 | "id" : "myImageID", 129 | "default" : "ami-12345678", 130 | "description" : "AMI ID for the EC2 instance.", 131 | "type" : "String" 132 | } 133 | ] 134 | } -------------------------------------------------------------------------------- /samples/EFSBackup/2-Node-EFSRestorePipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects" : [ 3 | { 4 | "id" : "Default", 5 | "scheduleType" : "cron", 6 | "failureAndRerunMode" : "CASCADE", 7 | "schedule" : { 8 | "ref" : "DefaultSchedule" 9 | }, 10 | "name" : "Default", 11 | "role" : "DataPipelineDefaultRole", 12 | "resourceRole" : "DataPipelineDefaultResourceRole" 13 | }, 14 | { 15 | "id" : "EC2Resource1", 16 | "terminateAfter" : "70 Minutes", 17 | "instanceType" : "#{myInstanceType}", 18 | "name" : "EC2Resource1", 19 | "type" : "Ec2Resource", 20 | "securityGroupIds" : [ 21 | "#{mySrcSecGroupID}", 22 | "#{myBackupSecGroupID}" 23 | ], 24 | "subnetId" : "#{mySubnetID}", 25 | "associatePublicIpAddress" : "true", 26 | "imageId" : "#{myImageID}" 27 | }, 28 | { 29 | "id" : "EC2Resource2", 30 | "terminateAfter" : "70 Minutes", 31 | "instanceType" : "#{myInstanceType}", 32 | "name" : "EC2Resource2", 33 | "type" : "Ec2Resource", 34 | "securityGroupIds" : [ 35 | "#{mySrcSecGroupID}", 36 | "#{myBackupSecGroupID}" 37 | ], 38 | "subnetId" : "#{mySubnetID}", 39 | "associatePublicIpAddress" : "true", 40 | "imageId" : "#{myImageID}" 41 | }, 42 | { 43 | "id" : "DefaultSchedule", 44 | "name" : "RunOnce", 45 | "startAt" : "FIRST_ACTIVATION_DATE_TIME", 46 | "type" : "Schedule", 47 | "occurrences" : "1", 48 | "period" : "1 Days" 49 | }, 50 | { 51 | "id" : "RestorePart1", 52 | "name" : "RestorePart1", 53 | "runsOn" : { 54 | "ref" : "EC2Resource1" 55 | }, 56 | "command" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-restore-rsync.sh\nchmod a+x efs-restore-rsync.sh\n./efs-restore-rsync.sh $1 $2 $3 $4 $5 $6 $7", 57 | "scriptArgument" : [ 58 | "#{myEfsSource}", 59 | "#{myEfsBackup}", 60 | "#{myInterval}", 61 | "#{myBackup}", 62 | "#{myEfsID}", 63 | "1", 64 | "2" 65 | ], 66 | "type" : "ShellCommandActivity", 67 | "stage" : "true" 68 | }, 69 | { 70 | "id" : "RestorePart2", 71 | "name" : "RestorePart2", 72 | "runsOn" : { 73 | "ref" : "EC2Resource2" 74 | }, 75 | "command" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-restore-rsync.sh\nchmod a+x efs-restore-rsync.sh\n./efs-restore-rsync.sh $1 $2 $3 $4 $5 $6 $7", 76 | "scriptArgument" : [ 77 | "#{myEfsSource}", 78 | "#{myEfsBackup}", 79 | "#{myInterval}", 80 | "#{myBackup}", 81 | "#{myEfsID}", 82 | "0", 83 | "2" 84 | ], 85 | "type" : "ShellCommandActivity", 86 | "stage" : "true" 87 | } 88 | ], 89 | "parameters" : [ 90 | { 91 | "id" : "myInstanceType", 92 | "default" : "m3.large", 93 | "description" : "Instance type for performing the restore.", 94 | "allowedValues" : [ 95 | "t1.micro", 96 | "m3.medium", 97 | "m3.large", 98 | "m3.xlarge", 99 | "m3.2xlarge", 100 | "c3.large", 101 | "c3.xlarge", 102 | "c3.2xlarge", 103 | "c3.4xlarge", 104 | "c3.8xlarge" 105 | ], 106 | "type" : 
"String" 107 | }, 108 | { 109 | "id" : "mySubnetID", 110 | "default" : "subnet-1234abcd", 111 | "description" : "VPC subnet for your restoration EC2 instance (ideally the same subnet used for the backup EFS mount point).", 112 | "type" : "String" 113 | }, 114 | { 115 | "id" : "mySrcSecGroupID", 116 | "default" : "sg-1111111b", 117 | "description" : "Security group that can connect to the Production EFS mount point.", 118 | "type" : "String" 119 | }, 120 | { 121 | "id" : "myBackupSecGroupID", 122 | "default" : "sg-9999999b", 123 | "description" : "Security group that can connect to the Backup EFS mount point.", 124 | "type" : "String" 125 | }, 126 | { 127 | "id" : "myInterval", 128 | "default" : "daily", 129 | "description" : "Interval for backups.", 130 | "allowedValues" : [ 131 | "hourly", 132 | "daily", 133 | "weekly", 134 | "monthly" 135 | ], 136 | "type" : "String" 137 | }, 138 | { 139 | "id" : "myBackup", 140 | "default" : "0", 141 | "description" : "Backup number to restore (0 = the most recent backup).", 142 | "type" : "Integer" 143 | }, 144 | { 145 | "id" : "myEfsID", 146 | "default" : "backup-fs-12345678", 147 | "description" : "Name for the directory that already contains your backups", 148 | "type" : "String" 149 | }, 150 | { 151 | "id" : "myEfsSource", 152 | "default" : "10.0.1.32:/", 153 | "description" : "Production EFS mount target IP address.", 154 | "type" : "String" 155 | }, 156 | { 157 | "id" : "myEfsBackup", 158 | "default" : "10.0.1.75:/", 159 | "description" : "Backup EFS mount target IP address.", 160 | "type" : "String" 161 | }, 162 | { 163 | "id" : "myImageID", 164 | "default" : "ami-12345678", 165 | "description" : "AMI ID for the EC2 instance.", 166 | "type" : "String" 167 | } 168 | ] 169 | } -------------------------------------------------------------------------------- /samples/EFSBackup/efs-backup-end.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Input arguments 4 | interval=$1 5 | efsid=$2 6 | 7 | echo "sudo touch /mnt/backups/$efsid/$interval.0/" 8 | sudo touch /mnt/backups/$efsid/$interval.0/ 9 | echo "$interval: completed successfully" 10 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-backup-init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Initialization of EFS backup 3 | 4 | # Input arguments 5 | source=$1 6 | destination=$2 7 | interval=$3 8 | retain=$4 9 | efsid=$5 10 | 11 | # Prepare system for rsync 12 | echo 'sudo yum -y install nfs-utils' 13 | sudo yum -y install nfs-utils 14 | if [ ! -d /backup ]; then 15 | echo 'sudo mkdir /backup' 16 | sudo mkdir /backup 17 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 18 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 19 | fi 20 | if [ ! 
-d /mnt/backups ]; then 21 | echo 'sudo mkdir /mnt/backups' 22 | sudo mkdir /mnt/backups 23 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 24 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 25 | fi 26 | 27 | # we need to decrement retain because we start counting with 0 and we need to remove the oldest backup 28 | let "retain=$retain-1" 29 | if sudo test -d /mnt/backups/$efsid/$interval.$retain; then 30 | echo "sudo rm -rf /mnt/backups/$efsid/$interval.$retain" 31 | sudo rm -rf /mnt/backups/$efsid/$interval.$retain 32 | fi 33 | 34 | # Rotate all previous backups (except the first one), up one level 35 | for x in `seq $retain -1 2`; do 36 | if sudo test -d /mnt/backups/$efsid/$interval.$[$x-1]; then 37 | echo "sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x" 38 | sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x 39 | fi 40 | done 41 | 42 | # Copy first backup with hard links, then replace first backup with new backup 43 | if sudo test -d /mnt/backups/$efsid/$interval.0 ; then 44 | echo "sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1" 45 | sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1 46 | fi 47 | if [ ! -d /mnt/backups/$efsid ]; then 48 | echo "sudo mkdir -p /mnt/backups/$efsid" 49 | sudo mkdir -p /mnt/backups/$efsid 50 | echo "sudo chmod 700 /mnt/backups/$efsid" 51 | sudo chmod 700 /mnt/backups/$efsid 52 | fi 53 | if [ ! -d /mnt/backups/efsbackup-logs ]; then 54 | echo "sudo mkdir -p /mnt/backups/efsbackup-logs" 55 | sudo mkdir -p /mnt/backups/efsbackup-logs 56 | echo "sudo chmod 700 /mnt/backups/efsbackup-logs" 57 | sudo chmod 700 /mnt/backups/efsbackup-logs 58 | fi 59 | if [ -f /tmp/efs-backup.log ]; then 60 | echo "sudo rm /tmp/efs-backup.log" 61 | sudo rm /tmp/efs-backup.log 62 | fi 63 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-backup-rsync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Input arguments 4 | source=$1 5 | destination=$2 6 | interval=$3 7 | retain=$4 8 | efsid=$5 9 | clientNum=$6 10 | numClients=$7 11 | 12 | 13 | # Prepare system for rsync 14 | echo 'sudo yum -y install nfs-utils' 15 | sudo yum -y install nfs-utils 16 | if [ ! -d /backup ]; then 17 | echo 'sudo mkdir /backup' 18 | sudo mkdir /backup 19 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 20 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 21 | fi 22 | if [ ! -d /mnt/backups ]; then 23 | echo 'sudo mkdir /mnt/backups' 24 | sudo mkdir /mnt/backups 25 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 26 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 27 | fi 28 | 29 | if [ -f /tmp/efs-backup.log ]; then 30 | echo "sudo rm /tmp/efs-backup.log" 31 | sudo rm /tmp/efs-backup.log 32 | fi 33 | 34 | #Copy all content this node is responsible for 35 | for myContent in `sudo ls -a --ignore . --ignore .. 
/backup/ | awk 'NR%'$numClients==$clientNum`; do 36 | echo "sudo rsync -s -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/$myContent /mnt/backups/$efsid/$interval.0/" 37 | sudo rsync -s -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/"$myContent" /mnt/backups/$efsid/$interval.0/ 38 | rsyncStatus=$? 39 | done 40 | 41 | if [ -f /tmp/efs-backup.log ]; then 42 | echo "sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log" 43 | sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log 44 | fi 45 | exit $rsyncStatus 46 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Example would be to run this script as follows: 3 | # Every 6 hours; retain last 4 backups 4 | # efs-backup.sh $src $dst hourly 4 efs-12345 5 | # Once a day; retain last 31 days 6 | # efs-backup.sh $src $dst daily 31 efs-12345 7 | # Once a week; retain 4 weeks of backup 8 | # efs-backup.sh $src $dst weekly 7 efs-12345 9 | # Once a month; retain 3 months of backups 10 | # efs-backup.sh $src $dst monthly 3 efs-12345 11 | # 12 | # Snapshots will look like: 13 | # $dst/$efsid/hourly.0-3; daily.0-30; weekly.0-3; monthly.0-2 14 | 15 | 16 | # Input arguments 17 | source=$1 18 | destination=$2 19 | interval=$3 20 | retain=$4 21 | efsid=$5 22 | 23 | # Prepare system for rsync 24 | #echo 'sudo yum -y update' 25 | #sudo yum -y update 26 | echo 'sudo yum -y install nfs-utils' 27 | sudo yum -y install nfs-utils 28 | echo 'sudo mkdir /backup' 29 | sudo mkdir /backup 30 | echo 'sudo mkdir /mnt/backups' 31 | sudo mkdir /mnt/backups 32 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 33 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 34 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 35 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 36 | 37 | # we need to decrement retain because we start counting with 0 and we need to remove the oldest backup 38 | let "retain=$retain-1" 39 | if sudo test -d /mnt/backups/$efsid/$interval.$retain; then 40 | echo "sudo rm -rf /mnt/backups/$efsid/$interval.$retain" 41 | sudo rm -rf /mnt/backups/$efsid/$interval.$retain 42 | fi 43 | 44 | 45 | # Rotate all previous backups (except the first one), up one level 46 | for x in `seq $retain -1 2`; do 47 | if sudo test -d /mnt/backups/$efsid/$interval.$[$x-1]; then 48 | echo "sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x" 49 | sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x 50 | fi 51 | done 52 | 53 | # Copy first backup with hard links, then replace first backup with new backup 54 | if sudo test -d /mnt/backups/$efsid/$interval.0 ; then 55 | echo "sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1" 56 | sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1 57 | fi 58 | if [ ! 
-d /mnt/backups/$efsid ]; then 59 | echo "sudo mkdir -p /mnt/backups/$efsid" 60 | sudo mkdir -p /mnt/backups/$efsid 61 | echo "sudo chmod 700 /mnt/backups/$efsid" 62 | sudo chmod 700 /mnt/backups/$efsid 63 | fi 64 | if [ ! -d /mnt/backups/efsbackup-logs ]; then 65 | echo "sudo mkdir -p /mnt/backups/efsbackup-logs" 66 | sudo mkdir -p /mnt/backups/efsbackup-logs 67 | echo "sudo chmod 700 /mnt/backups/efsbackup-logs" 68 | sudo chmod 700 /mnt/backups/efsbackup-logs 69 | fi 70 | echo "sudo rm /tmp/efs-backup.log" 71 | sudo rm /tmp/efs-backup.log 72 | echo "sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/ /mnt/backups/$efsid/$interval.0/" 73 | sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/ /mnt/backups/$efsid/$interval.0/ 74 | rsyncStatus=$? 75 | echo "sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-`date +%Y%m%d-%H%M`.log" 76 | sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-`date +%Y%m%d-%H%M`.log 77 | echo "sudo touch /mnt/backups/$efsid/$interval.0/" 78 | sudo touch /mnt/backups/$efsid/$interval.0/ 79 | exit $rsyncStatus 80 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-restore-rsync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Input arguments 4 | source=$1 5 | destination=$2 6 | interval=$3 7 | backupNum=$4 8 | efsid=$5 9 | clientNum=$6 10 | numClients=$7 11 | 12 | 13 | # Prepare system for rsync 14 | echo 'sudo yum -y install nfs-utils' 15 | sudo yum -y install nfs-utils 16 | 17 | if [ ! -d /backup ]; then 18 | echo 'sudo mkdir /backup' 19 | sudo mkdir /backup 20 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 21 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 22 | fi 23 | if [ ! -d /mnt/backups ]; then 24 | echo 'sudo mkdir /mnt/backups' 25 | sudo mkdir /mnt/backups 26 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 27 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 28 | fi 29 | 30 | if [ -f /tmp/efs-restore.log ]; then 31 | echo "sudo rm /tmp/efs-restore.log" 32 | sudo rm /tmp/efs-restore.log 33 | fi 34 | 35 | #Copy all content this node is responsible for 36 | for myContent in `sudo ls -a --ignore . --ignore .. /mnt/backups/$efsid/$interval.$backupNum | awk 'NR%'$numClients==$clientNum`; do 37 | echo "sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum /backup/" 38 | sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum/$myContent /backup/ 39 | rsyncStatus=$? 
40 | done 41 | 42 | if [ -f /tmp/efs-restore.log ]; then 43 | echo "sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log" 44 | sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log 45 | fi 46 | exit $rsyncStatus 47 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-restore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Input arguments 4 | source=$1 5 | destination=$2 6 | interval=$3 7 | backupNum=$4 8 | efsid=$5 9 | 10 | # Prepare system for rsync 11 | echo 'sudo yum -y install nfs-utils' 12 | sudo yum -y install nfs-utils 13 | echo 'sudo mkdir /backup' 14 | sudo mkdir /backup 15 | echo 'sudo mkdir /mnt/backups' 16 | sudo mkdir /mnt/backups 17 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 18 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 19 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 20 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 21 | 22 | if ! sudo test -d /mnt/backups/$efsid/$interval.$backupNum/; then 23 | echo "EFS Backup $efsid/$interval.$backupNum does not exist!" 24 | exit 1 25 | fi 26 | 27 | echo "sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum/ /backup/" 28 | sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum/ /backup/ 29 | rsyncStatus=$? 30 | echo "sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-`date +%Y%m%d-%H%M`.log" 31 | sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-`date +%Y%m%d-%H%M`.log 32 | exit $rsyncStatus 33 | -------------------------------------------------------------------------------- /samples/ExampleTemplate/README.md: -------------------------------------------------------------------------------- 1 | # {{Example Name}} 2 | 3 | {{Description of activities performed in the example}} 4 | 5 | ## Parameters 6 | 7 | Parameter | Required | Description 8 | ----------|----------|------------ 9 | {{Parameter Name}} | {{yes/no}} | {{Description}} {{Example or Default}} 10 | 11 | ## Setup (Optional) 12 | 13 | You can use the setup script in the sample directory to create {{resources}} to use in this example. 14 | You can skip this step if you have {{resources}} that you want to use. The script will take a minute 15 | to complete, and when it's finished it will print the resource identifier of the 16 | {{resources}} that it created. 17 | 18 | ```sh 19 | $> python setup.py 20 | ``` 21 | 22 | If the script fails with an ImportError, you may need to [set up your virtualenv](https://github.com/awslabs/data-pipeline-samples#setup). 23 | 24 | ## Running this sample 25 | 26 | Create a new pipeline. Throughout this section we assume that the {{Example Directory}} sample directory is 27 | your current working directory.
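Before creating the pipeline, you may want to confirm that the AWS CLI is configured for the account and region you intend to use. The commands below are only a minimal sanity check (they assume the AWS CLI is already installed and configured):

```sh
$> aws sts get-caller-identity   # shows the account and IAM identity the CLI will use
$> aws configure get region      # shows the default region for CLI commands
```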
28 | 29 | ```sh 30 | $> aws datapipeline create-pipeline --name {{example_name}} --unique-id {{example_name}} 31 | # { 32 | # "pipelineId": "df-03971252U4AVY60545T7" 33 | # } 34 | ``` 35 | 36 | Upload the [pipeline definition](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html). Use the `pipelineId` that was returned by the `create-pipeline` 37 | command. Specify the name of an S3 bucket where the output from pipline activites will be stored. 38 | This will either be the bucket name that was printed by the setup script or another bucket that 39 | you've created. You can also specify any optional parameters for this example here. 40 | 41 | 42 | ```sh 43 | $> aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file://TeraSortHadoopBenchmark.json {{--parameter-values values}} 44 | # { 45 | # "errored": false, 46 | # "validationWarnings": [], 47 | # "validationErrors": [] 48 | # } 49 | ``` 50 | 51 | Activate the pipeline. Use the `pipelineId` that was returned by the `create-pipeline` command. 52 | 53 | ```sh 54 | $> aws datapipeline activate-pipeline --pipeline-id 55 | ``` 56 | 57 | Optionally, check the status of your running pipeline. Use the `pipelineId` that was returned by the 58 | `create-pipeline` command. When the pipeline has completed, the Status Ended column in the output 59 | from this command will show FINISHED for all pipeine nodes. 60 | 61 | ```sh 62 | 63 | >$ aws datapipeline list-runs --pipeline-id 64 | # {{example output}} 65 | 66 | ``` 67 | 68 | {{what happens when the pipeline is finished}} 69 | 70 | ## Next steps 71 | 72 | {{things to try next}} 73 | 74 | Once the pipeline is completed, you can delete it with the following command. If you try to run the 75 | sample again without deleting, you may receive errors or unexpected behavior. 76 | 77 | ```sh 78 | $> aws datapipeline delete-pipeline --pipeline-id 79 | ``` 80 | 81 | The resources used by this example will incur normal charges. If you provisioned resources using the 82 | setup script, you can free them by running the following command in the sample directory. 83 | 84 | ```sh 85 | $> python setup.py --teardown 86 | ``` 87 | 88 | ## Disclaimer 89 | 90 | The samples in this repository are meant to help users get started with Data Pipeline. They may not 91 | be sufficient for production environments. Users should carefully inspect samples before running 92 | them. 93 | 94 | *Use at your own risk.* 95 | 96 | Licensed under the MIT-0 License. 97 | -------------------------------------------------------------------------------- /samples/HadoopTerasort/process-jhist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Number of arguments: $#" 4 | echo "Arguments: $@" 5 | S3_Target=$1 6 | echo "S3 Target output path: $S3_Target" 7 | 8 | # -------------------------------------------------------------- 9 | # TeraSort Benchmark JHIST Publish Script 10 | # This script is a reference script. 11 | # TeraSortHadoopBenchmark pipeline uses the script hosted at: s3://datapipeline-us-east-1/sample-scripts/HadoopTeraSort/process-jhist.sh 12 | # -------------------------------------------------------------- 13 | 14 | # -------------------------------------------------------------- 15 | # Any code, applications, scripts, templates, proofs of concept, documentation and other items provided by AWS under this SOW are AWS Content, as defined in the Agreement, and are provided for illustration purposes only. 
All such AWS Content is provided solely at the option of AWS, and is subject to the terms of the Addendum and the Agreement. Customer is solely responsible for using, deploying, testing, and supporting any code and applications provided by AWS under the current SOW. 16 | # -------------------------------------------------------------- 17 | 18 | # -------------------------------------------------------------- 19 | # CHANGE LOG: 20 | # -------------------------------------------------------------- 21 | # 2015-04-28 RG v0.1 - Initial script 22 | # 2015-04-28 RG v0.2 - Added TeraSort & TeraValidate JHIST Processing Activities 23 | # 2015-09-01 AR v0.3 - Output to S3 target path 24 | # 2015-11-19 JT v0.4 - Update file name parsing and use mapred command 25 | # -------------------------------------------------------------- 26 | 27 | # -------------------------------------------------------------- 28 | # Define Variables 29 | # -------------------------------------------------------------- 30 | 31 | 32 | 33 | 34 | # -------------------------------------------------------------- 35 | # Process JHIST File 36 | # -------------------------------------------------------------- 37 | 38 | path_to_jhist() { 39 | # perl incantation to extract the path from the ls command 40 | # via: http://stackoverflow.com/questions/21569172/how-to-list-only-file-name-in-hdfs 41 | hdfs dfs -ls -R / | grep $1 | perl -wlne 'print +(split " ",$_,8)[7]' 42 | } 43 | 44 | TeraGen=$(path_to_jhist TeraGen) 45 | mapred job -history all $TeraGen > TeraGen-results.txt 46 | 47 | TeraSort=$(path_to_jhist TeraSort) 48 | mapred job -history all $TeraSort > TeraSort-results.txt 49 | 50 | TeraValidate=$(path_to_jhist TeraValidate) 51 | mapred job -history all ${TeraValidate} > TeraValidate-results.txt 52 | 53 | # -------------------------------------------------------------- 54 | # Copy to S3 55 | # -------------------------------------------------------------- 56 | 57 | gensecondline=`sed -n '2{p;q}' TeraGen-results.txt`; 58 | genjob=${gensecondline:12} 59 | date=$(date +"%m-%d-%y") 60 | aws s3 cp TeraGen-results.txt $S3_Target/$date-$genjob/results/ 61 | aws s3 cp TeraSort-results.txt $S3_Target/$date-$genjob/results/ 62 | aws s3 cp TeraValidate-results.txt $S3_Target/$date-$genjob/results/ 63 | aws s3 cp /home/hadoop/conf $S3_Target/$date-$genjob/conf/ --recursive 64 | 65 | exit 0 66 | -------------------------------------------------------------------------------- /samples/HadoopTerasort/setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("../../setup") 3 | 4 | from stacker import Stacker 5 | 6 | s = Stacker( 7 | "dpl-samples-hadoop-terasort", 8 | { 9 | "Resources": { 10 | "S3Bucket": { 11 | "Type": "AWS::S3::Bucket", 12 | "DeletionPolicy": "Delete" 13 | } 14 | } 15 | }) 16 | 17 | s.run(sys.argv) 18 | -------------------------------------------------------------------------------- /samples/InvokeLambda/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline InvokeLambda Sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a Shell Command Activity pipeline that invokes AWS Lambda function. 6 | 7 | ## Prerequisites 8 | 9 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 
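You will also need an existing Lambda function for the pipeline to invoke. The pipeline's ShellCommandActivity simply runs `aws lambda invoke` against the function name you supply, so one way to confirm your setup ahead of time is to invoke the function once from your own shell. This is just a sanity check; replace the placeholder function name and region with your own values:

```sh
$> aws lambda get-function --function-name <your-function-name> --region us-east-1
$> aws lambda invoke --function-name <your-function-name> --region us-east-1 outfile.txt
```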
10 | 11 | ## Run this sample pipeline using the AWS CLI 12 | 13 | ```sh 14 | $> aws datapipeline create-pipeline --name invoke_lambda_pipeline --unique-id invoke_lambda_pipeline 15 | ``` 16 | 17 | You receive a pipelineId like this. 18 | ```sh 19 | # ----------------------------------------- 20 | # | CreatePipeline | 21 | # +-------------+--------------------------+ 22 | # | pipelineId | | 23 | # +-------------+--------------------------+ 24 | ``` 25 | 26 | ```sh 27 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://invokelambda.json --parameter-values myLambdaFunction= myS3LogsPath=s3:///path --pipeline-id 28 | ``` 29 | 30 | You receive a validation messages like this 31 | ```sh 32 | # ----------------------- 33 | # |PutPipelineDefinition| 34 | # +-----------+---------+ 35 | # | errored | False | 36 | # +-----------+---------+ 37 | ``` 38 | 39 | Now activate the pipeline 40 | ```sh 41 | $> aws datapipeline activate-pipeline --pipeline-id 42 | ``` 43 | 44 | Check the status of your pipeline 45 | ``` 46 | >$ aws datapipeline list-runs --pipeline-id 47 | ``` 48 | 49 | You will receive status information on the pipeline. 50 | ```sh 51 | Name Scheduled Start Status 52 | ID Started Ended 53 | --------------------------------------------------------------------------------------------------- 54 | 1. Invoke_Lambda_Activity 2016-03-23T18:40:31 WAITING_FOR_RUNNER 55 | @Invoke_Lambda_Activity_2016-03-23T18:40:31 2016-03-23T18:40:35 56 | 57 | 2. New_EC2Instance 2016-03-23T18:40:31 CREATING 58 | @New_EC2Instance_2016-03-23T18:40:31 2016-03-23T18:40:36 59 | 60 | ``` 61 | 62 | 63 | ## Disclaimer 64 | 65 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 66 | 67 | Use at your own risk. 68 | 69 | Licensed under the MIT-0 License. 70 | -------------------------------------------------------------------------------- /samples/InvokeLambda/invokelambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "scheduleType": "ONDEMAND" 12 | }, 13 | { 14 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 15 | 16 | "type": "Ec2Resource", 17 | "id": "New_EC2Instance", 18 | "name": "New_EC2Instance", 19 | "terminateAfter": "1 Hour", 20 | "imageId": "#{myImageId}", 21 | "region": "#{myRegion}", 22 | "instanceType": "#{myInstanceType}", 23 | "resourceRole": "DataPipelineDefaultResourceRole", 24 | "role": "DataPipelineDefaultRole" 25 | }, 26 | { 27 | "myComment": "This object is a ShellCommandActivity. It is used to specify the command linux shell command that will be invoked. In this case, it invokes Lambda Function.", 28 | 29 | "id": "Invoke_Lambda_Activity", 30 | "name": "Invoke_Lambda_Activity", 31 | "type": "ShellCommandActivity", 32 | "runsOn": { 33 | "ref": "New_EC2Instance" 34 | }, 35 | "command": "aws lambda --region #{myRegion} invoke --function-name #{myLambdaFunction} outfile.txt" 36 | } 37 | ], 38 | "parameters": [ 39 | { 40 | "myComment": "This Parameter specifies the S3 logging path for the pipeline. 
It is used by the 'Default' object to set the 'pipelineLogUri' value. Using Parameters helps users avoid hard coding variables in pipeline definitions. Users can instead supply these parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline-definition'.", 41 | 42 | "id" : "myS3LogsPath", 43 | "type" : "AWS::S3::ObjectKey", 44 | "description" : "S3 path for pipeline logs." 45 | }, 46 | { 47 | "myComment": "This Parameter specifies the Lambda function name.", 48 | 49 | "id" : "myLambdaFunction", 50 | "type" : "String", 51 | "description" : "Lambda Function name" 52 | }, 53 | { 54 | "myComment": "This Parameter specifies region", 55 | 56 | "id" : "myRegion", 57 | "type" : "String", 58 | "default" : "us-east-1", 59 | "description" : "Region" 60 | }, 61 | { 62 | "myComment": "This Parameter specifies image id", 63 | 64 | "id" : "myImageId", 65 | "type" : "String", 66 | "default" : "ami-8fcee4e5", 67 | "description" : "Image Id" 68 | }, 69 | { 70 | "myComment": "This Parameter specifies instance type", 71 | 72 | "id" : "myInstanceType", 73 | "type" : "String", 74 | "default" : "m3.medium", 75 | "description" : "Instance Type" 76 | } 77 | ] 78 | } 79 | -------------------------------------------------------------------------------- /samples/LoadTsvFilesInS3ToRedshift/LoadTsvFilesInS3ToRedshift.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "failureAndRerunMode": "CASCADE", 5 | "schedule": { 6 | "ref": "DefaultSchedule" 7 | }, 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "s3://insert-here-log-location-for-DPL", 11 | "scheduleType": "cron", 12 | "name": "Default", 13 | "id": "Default" 14 | }, 15 | { 16 | "occurrences": "1", 17 | "period": "1 Day", 18 | "name": "RunOnce", 19 | "id": "DefaultSchedule", 20 | "type": "Schedule", 21 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 22 | }, 23 | { 24 | "output": { 25 | "ref": "DestRedshiftTable" 26 | }, 27 | "input": { 28 | "ref": "Input_S3_Tsv_Files" 29 | }, 30 | "name": "ExportS3ToRedshift", 31 | "runsOn": { 32 | "ref": "Ec2Instance" 33 | }, 34 | "id": "RedshiftLoadActivity", 35 | "type": "RedshiftCopyActivity", 36 | "insertMode": "OVERWRITE_EXISTING" 37 | }, 38 | { 39 | "connectionString": "#{myRedshiftJdbcConnectStr}", 40 | "databaseName": "#{myRedshiftDbName}", 41 | "*password": "#{myRedshiftPassword}", 42 | "name": "RedshiftCluster", 43 | "id": "RedshiftCluster", 44 | "type": "RedshiftDatabase", 45 | "username": "#{myRedshiftUsername}" 46 | }, 47 | { 48 | "filePath": "#{myInputTsvFilesS3Loc}", 49 | "name": "Input_S3_Tsv_Files", 50 | "id": "Input_S3_Tsv_Files", 51 | "dataFormat": { 52 | "ref": "DataFormatId_tsv" 53 | }, 54 | "type": "S3DataNode" 55 | }, 56 | { 57 | "securityGroupIds": "#{myRedshiftSecurityGrpIds}", 58 | "instanceType": "m3.xlarge", 59 | "name": "Ec2Instance", 60 | "associatePublicIpAddress": "true", 61 | "id": "Ec2Instance", 62 | "type": "Ec2Resource", 63 | "region": "us-east-1", 64 | "terminateAfter": "10 Hours", 65 | "availabilityZone": "us-east-1a" 66 | }, 67 | { 68 | "database": { 69 | "ref": "RedshiftCluster" 70 | }, 71 | "name": "DestRedshiftTable", 72 | "id": "DestRedshiftTable", 73 | "schemaName": "schemaNameInRedshift", 74 | "type": "RedshiftDataNode", 75 | "tableName": "DestRedshiftTableName" 76 | }, 77 | { 78 | "name": "S3TRDataFormat", 79 | "id": "DataFormatId_tsv", 80 | "type": "TSV" 81 | } 82 | ], 83 | "parameters": [ 84 | { 85 | 
"description": "Redshift password", 86 | "id": "*myRedshiftPassword", 87 | "type": "String" 88 | }, 89 | { 90 | "description": "Redshift database name", 91 | "id": "myRedshiftDbName", 92 | "type": "String" 93 | }, 94 | { 95 | "watermark": "security group id. E.g.,", 96 | "helpText": "The names of one or more security groups that are assigned to the Redshift cluster.", 97 | "description": "Security group Id(s)", 98 | "isArray": "true", 99 | "id": "myRedshiftSecurityGrpIds", 100 | "type": "String" 101 | }, 102 | { 103 | "description": "Redshift username", 104 | "id": "myRedshiftUsername", 105 | "type": "String" 106 | }, 107 | { 108 | "allowedValues": "OVERWRITE_EXISTING", 109 | "default": "OVERWRITE_EXISTING", 110 | "helpLink": "https://docs.aws.amazon.com/console/datapipeline/redshiftcopyactivity", 111 | "helpText": "Determines how to handle pre-existing data in the target table that overlaps with rows in the data to be loaded.", 112 | "description": "Table insert mode", 113 | "id": "myInsertMode", 114 | "type": "String" 115 | }, 116 | { 117 | "helpText": "The name of an existing table or a new table that will be created based on the create table SQL query parameter below.", 118 | "description": "Redshift table name", 119 | "id": "myRedshiftTableName", 120 | "type": "String" 121 | }, 122 | { 123 | "helpText": "The S3 folder where one or more tsv input files are located.", 124 | "description": "Input S3 folder", 125 | "id": "myInputTsvFilesS3Loc", 126 | "type": "AWS::S3::ObjectKey", 127 | "watermark" : "s3://tsv-files-insert-loc/2015-10-27-01-00-29" 128 | }, 129 | { 130 | "watermark": "jdbc:postgresql://endpoint:port/database?tcpKeepAlive=true", 131 | "description": "Redshift JDBC connection string", 132 | "id": "myRedshiftJdbcConnectStr", 133 | "type": "String" 134 | } 135 | ] 136 | } 137 | -------------------------------------------------------------------------------- /samples/LoadTsvFilesInS3ToRedshift/README.md: -------------------------------------------------------------------------------- 1 | #Data Pipeline Load Tab Separated Files in S3 to Redshift 2 | 3 | ##About the sample 4 | This pipeline definition when imported would instruct Redshift to load TSV files under the specified S3 Path into a specified Redshift Table. Table insert mode is OVERWRITE_EXISTING. 5 | 6 | ##Running this sample 7 | The pipeline requires the following user input point: 8 | 9 | 1. The S3 folder where the input TSV files are located. 10 | 2. Redshift connection info along with the target table name. 11 | 3. Redshift Cluster security group id(s). 12 | 13 | 14 | ## Prerequisites 15 | 16 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 17 | TSV files under a S3 folder path is the input for this pipeline. Redshift Cluster and Table must already exist. 18 | 19 | 20 | 21 | ## Run this sample pipeline using the AWS CLI 22 | 23 | ```sh 24 | $> aws datapipeline create-pipeline --name copy_tsv_to_redshift_pipeline --unique-id copy_tsv_to_redshift_pipeline 25 | ``` 26 | 27 | You receive a pipelineId like this. 
28 | ```sh 29 | # ----------------------------------------- 30 | # | CreatePipeline | 31 | # +-------------+--------------------------+ 32 | # | pipelineId | | 33 | # +-------------+--------------------------+ 34 | ``` 35 | 36 | ```sh 37 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json --parameter-values 38 | myInputTsvFilesS3Loc= myRedshiftJdbcConnectStr= myRedshiftUsername= myRedshiftPassword= 39 | myRedshiftTableName= myRedshiftSecurityGrpIds= --pipeline-id 40 | ``` 41 | 42 | You receive a validation messages like this 43 | ```sh 44 | # ----------------------- 45 | # |PutPipelineDefinition| 46 | # +-----------+---------+ 47 | # | errored | False | 48 | # +-----------+---------+ 49 | ``` 50 | 51 | Now activate the pipeline 52 | ```sh 53 | $> aws datapipeline activate-pipeline --pipeline-id 54 | ``` 55 | 56 | Check the status of your pipeline 57 | ```sh 58 | >$ aws datapipeline list-runs --pipeline-id 59 | ``` 60 | 61 | You will receive status information on the pipeline. 62 | 63 | 64 | ## Disclaimer 65 | 66 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 67 | 68 | Use at your own risk. 69 | 70 | Licensed under the MIT-0 License. 71 | -------------------------------------------------------------------------------- /samples/OnDemandWithLamdaFunctions/lambda_function.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import json 4 | import urllib 5 | import boto3 6 | 7 | print('Loading function') 8 | 9 | client = boto3.client('datapipeline') 10 | pipeline_id = 'df-123456789' 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | response = client.activate_pipeline(pipelineId=pipeline_id) 15 | return response 16 | except Exception as e: 17 | print(e) 18 | raise e 19 | -------------------------------------------------------------------------------- /samples/OnDemandWithLamdaFunctions/ondemand.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects 5 | in the pipeline", 6 | 7 | "id": "Default", 8 | "failureAndRerunMode":"cascade", 9 | "resourceRole": "DataPipelineDefaultResourceRole", 10 | "role": "DataPipelineDefaultRole", 11 | "pipelineLogUri": "#{myS3LogsPath}", 12 | "scheduleType": "ondemand", 13 | }, 14 | { 15 | "myComment": "This object is used to create the Amazon EC2 Instance that activities 16 | in the pipeline will be executed on.", 17 | 18 | "id": "A_Fresh_NewEC2Instance", 19 | "type": "Ec2Resource", 20 | "terminateAfter": "1 Hour" 21 | }, 22 | { 23 | "myComment": "This object is a ShellCommandActivity. It is used to specify the linux 24 | shell command that will be invoked. In this case it is simply running the 'echo' command, 25 | but it can be used to run any command that is accessible on in the commandline shell of the 26 | Instance that runs on.", 27 | 28 | "id": "ShellCommandActivity_HelloWorld", 29 | "runsOn": { 30 | "ref": "A_Fresh_NewEC2Instance" 31 | }, 32 | "type": "ShellCommandActivity", 33 | "command": "echo 'Hello World!'" 34 | } 35 | ], 36 | "parameters": [ 37 | { 38 | "myComment": "Pipeline Parameters are placeholders for variables that a user can specify 39 | when uploading or activating the pipeline. 
In this example, we create a Parameter 40 | called 'myS3LogsPath' which is used to provide an S3 location for output logs. It is 41 | referenced above in the 'Default' object to set the 'pipelineLogUri' value. Parameters 42 | help users avoid hard coding variables in pipeline definitions. Users can supply these 43 | parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline 44 | activate-pipeline'.", 45 | 46 | "id" : "myS3LogsPath", 47 | "type" : "AWS::S3::ObjectKey", 48 | "description" : "S3 path for pipeline logs." 49 | } 50 | ] 51 | } -------------------------------------------------------------------------------- /samples/OnDemandWithLamdaFunctions/readme.md: -------------------------------------------------------------------------------- 1 | #### This sample shows how to create a Lambda function that responds to S3 create object events on an S3 bucket and/or a CloudWatch Scheduled Event. 2 | 3 | The following Python code defines an AWS Lambda function to run an on-demand pipeline. This code is in a file called lambda_function.py. You simply need to set the ``pipeline_id`` variable to the id of your on-demand pipeline. 4 | 5 | ```python 6 | from __future__ import print_function 7 | 8 | import json 9 | import urllib 10 | import boto3 11 | 12 | print('Loading function') 13 | 14 | client = boto3.client('datapipeline') 15 | pipeline_id = 'df-123456789' 16 | 17 | def lambda_handler(event, context): 18 | try: 19 | response = client.activate_pipeline(pipelineId=pipeline_id) 20 | return response 21 | except Exception as e: 22 | print(e) 23 | raise e 24 | ``` 25 | ### Step 1: Create the on-demand pipeline 26 | *Make sure the pipeline is created in a region that supports Lambda.* 27 | 28 | Create the pipeline: 29 | 30 | ```sh 31 | $> aws datapipeline create-pipeline --name on_demand_lamda --unique-id on_demand_lamda 32 | ``` 33 | 34 | Upload the pipeline definition: 35 | 36 | ```sh 37 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://ondemand.json \ 38 | --parameter-values myS3LogsPath= --pipeline-id 39 | ``` 40 | 41 | Activate the pipeline to make sure it runs successfully: 42 | 43 | ```sh 44 | $> aws datapipeline activate-pipeline --pipeline-id 45 | ``` 46 | 47 | Check the status of your pipeline: 48 | ``` 49 | >$ aws datapipeline list-runs --pipeline-id 50 | ``` 51 | 52 | ### Step 2: Create the Lambda function 53 | 54 | 55 | ```sh 56 | >$ aws lambda create-function --function-name --runtime python2.7 \ 57 | --role --handler lambda_function.lambda_handler \ 58 | --zip-file file:///zip-with-lambda-fn-code.zip --publish --timeout 10 59 | ``` 60 | 61 | See this link for reference on the Lambda create-function command: 62 | http://docs.aws.amazon.com/cli/latest/reference/lambda/create-function.html 63 | 64 | ### Step 3: Set up an event source for the Lambda function 65 | 66 | ##### Set up an S3 bucket to call the Lambda function when objects are created 67 | 68 | Create the S3 bucket: 69 | 70 | ```sh 71 | $> aws s3 mb 72 | ``` 73 | 74 | Run the following Lambda add-permission command to grant the Amazon S3 service principal permission to perform the lambda:InvokeFunction action: 75 | 76 | ```sh 77 | $> aws lambda add-permission --function-name \ 78 | --region --statement-id --action "lambda:InvokeFunction" \ 79 | --principal s3.amazonaws.com --source-arn \ 80 | --source-account --profile adminuser 81 | ``` 82 | 83 | See this link for reference on the lambda add-permission command: 84 | 
http://docs.aws.amazon.com/cli/latest/reference/lambda/add-permission.html 85 | 86 | Add the notification on S3 and have it call the Lamda function: 87 | 88 | \*Make sure your notification configuration contains ``s3:ObjectCreated:*`` events 89 | 90 | ```sh 91 | $> aws s3api put-bucket-notification-configuration --bucket --notification-configuration 92 | ``` 93 | 94 | See this link for reference on the s3api put-bucket-notification-configuration command: 95 | http://docs.aws.amazon.com/cli/latest/reference/s3api/put-bucket-notification-configuration.html 96 | 97 | Upload a file to the S3 bucket and make validate the lamda function activated your pipeline: 98 | 99 | ```sh 100 | $> aws s3 cp 101 | $> aws datapipeline list-runs --pipeline-id 102 | ``` 103 | 104 | ##### OR Add a CRON schedule using Cloudwatch Scheduled Events 105 | 106 | This is only possible in the Lamda console. Instructions here: http://docs.aws.amazon.com/lambda/latest/dg/with-scheduled-events.html 107 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/RDStoRedshiftSqoop.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline.", 5 | 6 | "id": "Default", 7 | "name": "Default", 8 | "failureAndRerunMode": "CASCADE", 9 | "schedule": { 10 | "ref": "DefaultSchedule" 11 | }, 12 | "resourceRole": "DataPipelineDefaultResourceRole", 13 | "role": "DataPipelineDefaultRole", 14 | "scheduleType": "cron", 15 | "pipelineLogUri": "#{myS3LogsPath}" 16 | }, 17 | { 18 | "myComment": "This object is used to run the Sqoop activity that extracts data from RDS.", 19 | 20 | "name": "Sqoop", 21 | "id": "ActivityId_wQhxe", 22 | "runsOn": { 23 | "ref": "ResourceId_z9RNH" 24 | }, 25 | "type": "ShellCommandActivity", 26 | "command": "/usr/bin/sqoop import --connect jdbc:mysql://#{myRdsEndpoint}/millionsongs --table songs --target-dir #{myS3StagingPath} --username dplcustomer --password Dplcustomer1" 27 | }, 28 | { 29 | "myComment": "This object is used to specify the copy activity for moving data from S3 to Redshift.", 30 | 31 | "output": { 32 | "ref": "DataNodeId_7EqZ7" 33 | }, 34 | "input": { 35 | "ref": "DataNodeId_ImmS9" 36 | }, 37 | "dependsOn": { 38 | "ref": "ActivityId_wQhxe" 39 | }, 40 | "name": "CopyToRedshift", 41 | "id": "ActivityId_6OGtu", 42 | "runsOn": { 43 | "ref": "ResourceId_z9RNH" 44 | }, 45 | "type": "RedshiftCopyActivity", 46 | "insertMode": "TRUNCATE" 47 | }, 48 | { 49 | "myComment": "This object is used to control the task schedule.", 50 | 51 | "occurrences": "1", 52 | "period": "1 Day", 53 | "name": "RunOnce", 54 | "id": "DefaultSchedule", 55 | "type": "Schedule", 56 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 57 | }, 58 | { 59 | "myComment": "This object provides connection information for the Redshift cluster.", 60 | 61 | "connectionString": "jdbc:postgresql://#{myRedshiftEndpoint}:5439/dev", 62 | "*password": "Dplcustomer1", 63 | "name": "DefaultRedshiftDatabase1", 64 | "id": "RedshiftDatabaseId_S34X5", 65 | "type": "RedshiftDatabase", 66 | "username": "dplcustomer" 67 | }, 68 | { 69 | "myComment": "This object is used to provide information for the EMR cluster bootstrap", 70 | 71 | "bootstrapAction": "s3://data-pipeline-samples/sqoop-activity/install_sqoop_ba.sh", 72 | "name": "HadoopCluster", 73 | "id": "ResourceId_z9RNH", 74 | "amiVersion": "3.8.0", 75 | "type": "EmrCluster", 76 | "terminateAfter": "1 Hour" 77 | }, 
78 | { 79 | "myComment": "This object provides information on the S3 staging data.", 80 | 81 | "directoryPath": "#{myS3StagingPath}", 82 | "name": "S3Input", 83 | "id": "DataNodeId_ImmS9", 84 | "type": "S3DataNode" 85 | }, 86 | { 87 | "myComment": "This object contains information about the Redshift database.", 88 | 89 | "createTableSql": "create table IF NOT EXISTS songs (track_id varchar(2048) not null distkey sortkey, title varchar(2048), song_id varchar(2048), release_name varchar(2048), artist_id varchar(2048), artist_mbid varchar(2048), artist_name varchar(2048), duration float, artist_familiarity float, artist_hotness float, year int);", 90 | "database": { 91 | "ref": "RedshiftDatabaseId_S34X5" 92 | }, 93 | "primaryKeys": "track_id", 94 | "name": "Redshift", 95 | "id": "DataNodeId_7EqZ7", 96 | "type": "RedshiftDataNode", 97 | "tableName": "songs" 98 | } 99 | ], 100 | "parameters": [] 101 | } 102 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/setup/Setup.py: -------------------------------------------------------------------------------- 1 | from RdsToRedshiftSqoopSample import RDStoRedshiftSqoopSample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | import sys 6 | 7 | 8 | if __name__ == '__main__': 9 | check_working_directory() 10 | parser = argparse.ArgumentParser(description='Setup for RDS to Redshift Sqoop pipeline sample') 11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path") 12 | args = parser.parse_args() 13 | s3_bucket_path = args.s3_bucket_path 14 | 15 | sample = RDStoRedshiftSqoopSample() 16 | 17 | if s3_bucket_path is None: 18 | sample.create_s3_bucket() 19 | elif not sample.validate_s3_bucket_path(s3_bucket_path): 20 | sys.exit(0) 21 | 22 | sample.create_rds_instance() 23 | sample.create_redshift_cluster() 24 | sample.run_setup_datapipeline() 25 | sample.print_setup_results() 26 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/setup/SetupPipelineDefinition.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class SetupPipelineDefinitionHelper(object): 5 | 6 | def __init__(self): 7 | with open("setup.json", "r") as setup: 8 | pipeline_string = setup.read().replace('\n', '') 9 | self.pipeline_definition = json.loads(pipeline_string) 10 | 11 | def get_setup_pipeline_objects(self): 12 | return self.pipeline_definition['objects'] 13 | 14 | def get_setup_pipeline_parameters(self): 15 | return self.pipeline_definition['parameters'] 16 | 17 | def get_setup_pipeline_parameter_values(self): 18 | return self.pipeline_definition['parameterValues'] 19 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/setup/Teardown.py: -------------------------------------------------------------------------------- 1 | from RdsToRedshiftSqoopSample import RDStoRedshiftSqoopSample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | 6 | 7 | if __name__ == '__main__': 8 | check_working_directory() 9 | 10 | parser = argparse.ArgumentParser(description='Teardown for RDS to Redshift Sqoop pipeline sample') 11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path") 12 | parser.add_argument('--rds-instance-id', action="store", dest="rds_instance_id") 13 | parser.add_argument('--redshift-cluster-id', action="store", dest="redshift_cluster_id") 14 | args = 
parser.parse_args() 15 | 16 | sample = RDStoRedshiftSqoopSample() 17 | 18 | if args.rds_instance_id is not None: 19 | sample.destroy_rds(args.rds_instance_id) 20 | 21 | if args.redshift_cluster_id is not None: 22 | sample.destroy_redshift(args.redshift_cluster_id) 23 | 24 | if args.s3_bucket_path is not None: 25 | sample.destroy_s3_bucket(args.s3_bucket_path) 26 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/setup/Utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def check_working_directory(): 6 | current_folder_path, current_folder_name = os.path.split(os.getcwd()) 7 | if current_folder_name == 'RDStoRedshiftSqoop': 8 | os.chdir('setup') 9 | elif current_folder_name != 'setup': 10 | print 'ERROR: please run the setup script from data-pipeline-samples/samples/RDStoRedshiftSqoop/setup' 11 | sys.exit(0) 12 | -------------------------------------------------------------------------------- /samples/RDStoS3/RDStoS3Pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "scheduleType": "cron", 12 | "schedule": { 13 | "ref": "DefaultSchedule" 14 | } 15 | }, 16 | { 17 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it. For this example, we are using it to specify that our pipeline will run immediately upon activation. Also, we are using the 'occurrences' option specify that the pipeline should only be run once. You can have multiple schedules defined in a pipeline.", 18 | 19 | "type": "Schedule", 20 | "id": "DefaultSchedule", 21 | "occurrences": "1", 22 | "period": "1 Day", 23 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 24 | }, 25 | { 26 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 27 | "name": "Ec2Instance", 28 | "type": "Ec2Resource", 29 | "id": "Ec2Instance", 30 | "instanceType": "t1.micro", 31 | "securityGroups": "#{myEc2RdsSecurityGrps}", 32 | "terminateAfter": "2 Hours", 33 | "resourceRole": "DataPipelineDefaultResourceRole", 34 | "role": "DataPipelineDefaultRole" 35 | }, 36 | { 37 | "myComment": "This object is a CopyActivity. It is used to define the work that will be done to copy the data from the database to S3.", 38 | "name": "RDStoS3CopyActivity", 39 | "type": "CopyActivity", 40 | "id": "RDStoS3CopyActivity", 41 | "output": { 42 | "ref": "S3OutputLocation" 43 | }, 44 | "input": { 45 | "ref": "SourceRDSTable" 46 | }, 47 | "runsOn": { 48 | "ref": "Ec2Instance" 49 | } 50 | }, 51 | { 52 | "myComment": "This object is a mysql datanode. 
It is used to represent the database which is the input datanode.", 53 | "name": "SourceRDSTable", 54 | "type": "MySqlDataNode", 55 | "id": "SourceRDSTable", 56 | "connectionString": "#{myRDSConnectStr}", 57 | "*password": "#{*myRDSPassword}", 58 | "table": "#{myRDSTableName}", 59 | "selectQuery": "select * from #{table}", 60 | "username": "#{myRDSUsername}" 61 | }, 62 | { 63 | "myComment": "This object is a S3 datanode. It is used to represent the S3 directory where the data will be stored.", 64 | "name": "S3OutputLocation", 65 | "type": "S3DataNode", 66 | "id": "S3OutputLocation", 67 | "directoryPath": "#{myOutputS3Path}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}" 68 | } 69 | ], 70 | "parameters": [ 71 | { 72 | "id": "*myRDSPassword", 73 | "type": "String", 74 | "description": "RDS MySQL password" 75 | }, 76 | { 77 | "watermark" : "s3://mybucket", 78 | "id": "myS3LogsPath", 79 | "type": "AWS::S3::ObjectKey", 80 | "description": "S3 folder for logs" 81 | }, 82 | { 83 | "watermark" : "s3://mybucket", 84 | "id": "myOutputS3Path", 85 | "type": "AWS::S3::ObjectKey", 86 | "description": "Output S3 folder" 87 | }, 88 | { 89 | "watermark" : "sg-12345", 90 | "id": "myEc2RdsSecurityGrps", 91 | "type": "String", 92 | "description": "RDS MySQL security group(s)", 93 | "isArray": "true", 94 | "optional": "true" 95 | }, 96 | { 97 | "id": "myRDSUsername", 98 | "type": "String", 99 | "description": "RDS MySQL username" 100 | }, 101 | { 102 | "id": "myRDSTableName", 103 | "type": "String", 104 | "description": "RDS MySQL table name" 105 | }, 106 | { 107 | "watermark" : "jdbc:mysql://connectionstring:3306/dbname", 108 | "id": "myRDSConnectStr", 109 | "type": "String", 110 | "description": "RDS MySQL connection string" 111 | } 112 | ] 113 | } 114 | -------------------------------------------------------------------------------- /samples/RDStoS3/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline RDStoS3 Sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a pipeline that outputs a MySQL table in csv format from a RDS database to an S3 bucket. 6 | 7 | The project provides scripts for setting up the resources for the pipeline, installing the [data set](http://aws.amazon.com/datasets/6468931156960467), and destroying the resources. The project also provides the [pipeline definition file](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html) which is used to create the pipeline and the AWS CLI commands for creating and executing the pipeline. See the instructions below to get started. 8 | 9 | *Note: Normal AWS charges apply for the resources created by the script. Make sure to run the teardown script as soon as you are done with the sample.* 10 | 11 | ## Prerequisites 12 | 13 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 14 | 15 | You must also provide the S3Path of a S3 bucket with write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 16 | 17 | ## Step 1: Priming this sample 18 | 19 | Run the following commands to run the setup script. The AWS resources that will be created are a RDS MySQL database and optionally an S3 bucket. 20 | 21 | The script takes an *optional* parameter for an S3 path for outputting the data from S3. 
If you choose to provide your own S3 path, the bucket must be in the same region as the one configured for your AWS CLI. Finally, please make sure the S3 bucket has a policy that allows writes to it. 22 | 23 | If the path is not provided, the script will create the S3 bucket for you. 24 | 25 | *Setup and teardown scripts are located in the setup directory under the RDStoS3 directory in the samples directory.* 26 | ```sh 27 | $> cd /data-pipeline-samples/samples/RDStoS3 28 | $> python setup/Setup.py --s3-path [s3://optional/path/to/s3/location] 29 | ``` 30 | 31 | ## Step 2: Run this sample pipeline using the AWS CLI 32 | 33 | ```sh 34 | $> aws datapipeline create-pipeline --name rds_to_s3_pipeline --unique-id rds_to_s3_pipeline 35 | ``` 36 | 37 | You receive a pipelineId like this. 38 | ```sh 39 | # ----------------------------------------- 40 | # | CreatePipeline | 41 | # +-------------+--------------------------+ 42 | # | pipelineId | | 43 | # +-------------+--------------------------+ 44 | ``` 45 | 46 | ```sh 47 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://RDStoS3Pipeline.json --parameter-values myOutputS3Path= myS3LogsPath= '*myRDSPassword=' myRDSUsername= myRDSTableName= myRDSConnectStr= --pipeline-id 48 | ``` 49 | 50 | You receive validation messages like this 51 | ```sh 52 | # ----------------------- 53 | # |PutPipelineDefinition| 54 | # +-----------+---------+ 55 | # | errored | False | 56 | # +-----------+---------+ 57 | ``` 58 | 59 | Now activate the pipeline 60 | ```sh 61 | $> aws datapipeline activate-pipeline --pipeline-id 62 | ``` 63 | 64 | Check the status of your pipeline 65 | ```sh 66 | >$ aws datapipeline list-runs --pipeline-id 67 | ``` 68 | 69 | You will receive status information on the pipeline. 70 | ```sh 71 | # Name Scheduled Start Status 72 | # ID Started Ended 73 | #--------------------------------------------------------------------------------------------------- 74 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 75 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 76 | # 77 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 78 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 79 | # 80 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 81 | ``` 82 | 83 | Let the pipeline complete, then check the output S3 bucket for the output CSV file. 84 | 85 | ## Step 3: IMPORTANT! Tear down this sample 86 | 87 | *Note: The setup script will provide the teardown command with parameters at the end of the execution.* 88 | 89 | ```sh 90 | $> python setup/Teardown.py --rds-instance-id --s3-path [s3://optional/path/to/s3/bucket/created/by/setup] 91 | ``` 92 | 93 | ## Disclaimer 94 | 95 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 96 | 97 | Use at your own risk. 98 | 99 | Licensed under the MIT-0 License.
100 | -------------------------------------------------------------------------------- /samples/RDStoS3/setup/Setup.py: -------------------------------------------------------------------------------- 1 | from RDStoS3Sample import RDStoS3Sample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | import sys 6 | 7 | 8 | if __name__ == '__main__': 9 | check_working_directory() 10 | parser = argparse.ArgumentParser(description='Setup for RDS to S3 pipeline sample') 11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path") 12 | args = parser.parse_args() 13 | s3_bucket_path = args.s3_bucket_path 14 | 15 | sample = RDStoS3Sample() 16 | 17 | if s3_bucket_path is None: 18 | sample.create_s3_bucket() 19 | elif not sample.validate_s3_bucket_path(s3_bucket_path): 20 | sys.exit(0) 21 | 22 | sample.create_rds_instance() 23 | sample.run_setup_datapipeline() 24 | sample.print_setup_results() 25 | -------------------------------------------------------------------------------- /samples/RDStoS3/setup/SetupPipelineDefinition.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class SetupPipelineDefinitionHelper(object): 5 | 6 | def __init__(self): 7 | with open("setup.json", "r") as setup: 8 | pipeline_string = setup.read().replace('\n', '') 9 | self.pipeline_definition = json.loads(pipeline_string) 10 | 11 | def get_setup_pipeline_objects(self): 12 | return self.pipeline_definition['objects'] 13 | 14 | def get_setup_pipeline_parameters(self): 15 | return self.pipeline_definition['parameters'] 16 | 17 | def get_setup_pipeline_parameter_values(self): 18 | return self.pipeline_definition['parameterValues'] 19 | -------------------------------------------------------------------------------- /samples/RDStoS3/setup/Teardown.py: -------------------------------------------------------------------------------- 1 | from RDStoS3Sample import RDStoS3Sample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | 6 | 7 | if __name__ == '__main__': 8 | check_working_directory() 9 | 10 | parser = argparse.ArgumentParser(description='Teardown for RDS to S3 pipeline sample') 11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path") 12 | parser.add_argument('--rds-instance-id', action="store", dest="rds_instance_id") 13 | parser.add_argument('--redshift-cluster-id', action="store", dest="redshift_cluster_id") 14 | args = parser.parse_args() 15 | 16 | sample = RDStoS3Sample() 17 | 18 | if args.rds_instance_id is not None: 19 | sample.destroy_rds(args.rds_instance_id) 20 | 21 | 22 | if args.s3_bucket_path is not None: 23 | sample.destroy_s3_bucket(args.s3_bucket_path) 24 | -------------------------------------------------------------------------------- /samples/RDStoS3/setup/Utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def check_working_directory(): 6 | current_folder_path, current_folder_name = os.path.split(os.getcwd()) 7 | if current_folder_name == 'RDStoS3': 8 | os.chdir('setup') 9 | elif current_folder_name != 'setup': 10 | print 'ERROR: please run the setup script from data-pipeline-samples/samples/RDStoS3/setup' 11 | sys.exit(0) 12 | -------------------------------------------------------------------------------- /samples/RedshiftCopyActivityFromDynamoDBTable/RedshiftCopyActivityFromDynamoDBTable.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline.", 5 | 6 | "name": "Default", 7 | "id": "Default", 8 | "failureAndRerunMode": "CASCADE", 9 | "schedule": { 10 | "ref": "DefaultSchedule" 11 | }, 12 | "resourceRole": "DataPipelineDefaultResourceRole", 13 | "role": "DataPipelineDefaultRole", 14 | "pipelineLogUri": "#{myLogUri}", 15 | "scheduleType": "cron" 16 | }, 17 | { 18 | "myComment": "This object provides connection information for the Redshift cluster.", 19 | 20 | "name": "DefaultDatabase1", 21 | "id": "DatabaseId_Kw7C9", 22 | "connectionString": "#{myConnectionString}", 23 | "databaseName": "#{myRedshiftDatabase}", 24 | "*password": "#{myRedshiftPassword}", 25 | "type": "RedshiftDatabase", 26 | "username": "#{myRedshiftUsername}" 27 | }, 28 | { 29 | "myComment": "This object is used to provide the resource where the copy job is invoked.", 30 | 31 | "name": "DefaultResource1", 32 | "id": "ResourceId_idL0Y", 33 | "resourceRole": "DataPipelineDefaultResourceRole", 34 | "role": "DataPipelineDefaultRole", 35 | "type": "Ec2Resource", 36 | "terminateAfter": "1 Hour" 37 | }, 38 | { 39 | "myComment": "This object is used to specify the copy activity for moving data from DynamoDB to Redshift.", 40 | 41 | "name": "CopyFromDDBToRedshift", 42 | "id": "ActivityId_vmVn4", 43 | "database": { 44 | "ref": "DatabaseId_Kw7C9" 45 | }, 46 | "runsOn": { 47 | "ref": "ResourceId_idL0Y" 48 | }, 49 | "type": "SqlActivity", 50 | "script": "#{myScript}" 51 | }, 52 | { 53 | "myComment": "This object is used to control the task schedule.", 54 | 55 | "name": "RunOnce", 56 | "id": "DefaultSchedule", 57 | "occurrences": "1", 58 | "period": "1 Day", 59 | "type": "Schedule", 60 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 61 | } 62 | ], 63 | "parameters": [] 64 | } 65 | -------------------------------------------------------------------------------- /samples/RedshiftCopyActivityFromDynamoDBTable/readme.md: -------------------------------------------------------------------------------- 1 | #RedshiftCopyActivityFromDynamoDBTable Sample 2 | 3 | This sample demonstrates how you can use Data Pipeline's RedshiftCopyActivity to copy data from a DynamoDB table to a Redshift table. This sample was motivated by a use case that requires the user to provide AWS credentials to access the DynamoDB table. It is assumed that the owner of the DynamoDB table has granted the user read access to the table. To make this sample to work, you must ensure you have the following: 4 | 5 | * Connection string for the destination Redshift cluster, e.g. jdbc:redshift://_hostname_:5439/_database_ 6 | * Redshift database name 7 | * Redshift username and password. This user must have write access to the table where data will be copied to. 8 | * DynamoDB table name. Note that both the table name and column names must match on both sides of the copy. 9 | * AWS credentials, i.e the access key and the secret key, to access the DynamoDB table. 10 | * DynamoDB table read ratio. 11 | * S3 location to direct log messages generated by Data Pipeline. 12 | 13 | You will need to provide the above information in the "put-pipeline-definition" command below. 14 | 15 | ##Running this sample 16 | 17 | ```sh 18 | $> aws datapipeline create-pipeline --name redshift_copy_from_dynamodb_pipeline --unique-id redshift_copy_from_dynamodb_pipeline 19 | 20 | # You receive a pipeline activity like this. 
21 | # ----------------------------------------- 22 | # | CreatePipeline | 23 | # +-------------+--------------------------+ 24 | # | pipelineId | df-0554887H4KXKTY59MRJ | 25 | # +-------------+--------------------------+ 26 | 27 | #now upload the pipeline definition 28 | 29 | $> aws datapipeline put-pipeline-definition --pipeline-id df-0554887H4KXKTY59MRJ \ 30 | --pipeline-definition file://samples/RedshiftCopyActivitySample/RedshiftCopyActivitySample.json \ 31 | --parameter-values myConnectionString= myRedshiftDatabase= \ 32 | myRedshiftUsername= myRedshiftPassword= \ 33 | myScript="copy from 'dynamodb://' credentials 'aws_access_key_id=;aws_secret_access_key=' readratio ;" \ 34 | myLogUri="" 35 | 36 | # You receive a validation messages like this 37 | 38 | # ----------------------- 39 | # |PutPipelineDefinition| 40 | # +-----------+---------+ 41 | # | errored | False | 42 | # +-----------+---------+ 43 | 44 | #now activate the pipeline 45 | $> aws datapipeline activate-pipeline --pipeline-id df-0554887H4KXKTY59MRJ 46 | 47 | 48 | #check the status of your pipeline 49 | 50 | >$ aws datapipeline list-runs --pipeline-id df-0554887H4KXKTY59MRJ 51 | # Name Scheduled Start Status 52 | # ID Started Ended 53 | #--------------------------------------------------------------------------------------------------- 54 | # 1. ActivityId_vmVn4 2015-11-06T23:52:04 WAITING_FOR_RUNNER 55 | # @ActivityId_vmVn4_2015-11-06T23:52:04 2015-11-06T23:52:11 56 | # 57 | # 2. ResourceId_idL0Y 2015-11-06T23:52:04 CREATING 58 | # @ResourceId_idL0Y_2015-11-06T23:52:04 2015-11-06T23:52:11 59 | ``` 60 | 61 | ##Related documentation 62 | https://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-redshiftcopyactivity.html 63 | 64 | -------------------------------------------------------------------------------- /samples/S3ToRdsSqoop/README.md: -------------------------------------------------------------------------------- 1 | # Loading a CSV file stored in S3 into an RDS MySQL instance 2 | 3 | This sample uses [sqoop](http://sqoop.apache.org/) to load a CSV filed stored in [S3](https://aws.amazon.com/s3/) into a [MySQL](https://www.mysql.com/) or [MariaDB](https://mariadb.org/) database instance managed by [RDS](https://aws.amazon.com/rds/). Sqoop is a specialized tool that uses [hadoop](http://hadoop.apache.org/) to transfer bulk data in and out of relational databases. It completes this task more quickly than Data Pipeline's built-in CopyActivity, but it is also more resource intensive. The sample takes advantage of built-in support for sqoop in [EMR](https://aws.amazon.com/emr/) 5.0. 4 | 5 | ## Parameters 6 | 7 | Parameter | Required | Description 8 | ----------|----------|------------ 9 | myEmrMasterInstanceType | no | The EC2 instance type to use for the master node in the EMR cluster. Default: m2.xlarge 10 | myEmrCoreInstanceType | no | The EC2 instance type to use for the core nodes in the EMR cluster. Default: m2.xlarge 11 | myEmrCoreInstanceCount | no | The number of core nodes to launch in the EMR cluster. Default: 2 12 | myRdsEndpoint | yes | DNS endpoint for target RDS instance. The value should include the port number. Example: test.xyzw.us-east-1.rds.amazonaws.com:3306 13 | myRdsDatabaseName | yes | Name of the target MySQL or MariaDB database. 14 | myRdsTableName | yes | Name of the database table that the CSV will be imported into. 15 | myRdsUsername | yes | User name to use to connect to RDS. 16 | \*myRdsPassword | yes | Password to use to connect to RDS. 
17 | myS3InputDataLocation | yes | S3 path to folder where the CSV data is stored. Example: s3://example-s3-path/folder-containing-csv-data/ 18 | myPipelineLogUri | yes | S3 folder where log data generated by this pipeline will be written. Example: s3://example-s3-path/folder-to-contain-log-files/ 19 | 20 | ## Prerequisites 21 | 22 | This template assumes that you have already created an RDS instance running MySQL or MariaDB. Inside the instance you will need a database and table where the records will be inserted. You will need to know the database name, the table name, the database user name and password, and the DNS endpoint of the RDS instance. You can use the RDS console to view the DNS endpoint and the master user name and to modify the master password as needed. You will need to use the MySQL command-line tool or a graphical client like [MySQL Workbench](https://www.mysql.com/products/workbench/) to create the target database and table. See [here](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_ConnectToInstance.html) for more information on connecting to MySQL on RDS. Note that the schema of the table where you will be importing records should match the schema in the CSV file (i.e., it should have the same number of columns and appropriate column types). 23 | 24 | ## Running this sample 25 | 26 | Create a new pipeline. Throughout this section we assume that the S3ToRdsSqoop sample directory is 27 | your current working directory. 28 | 29 | ```sh 30 | $> aws datapipeline create-pipeline --name s3-to-rds-sqoop --unique-id s3-to-rds-sqoop 31 | # { 32 | # "pipelineId": "df-03971252U4AVY60545T7" 33 | # } 34 | ``` 35 | 36 | Upload the [pipeline definition](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html). Use the `pipelineId` that was returned by the `create-pipeline` command. Specify the required parameters. 37 | 38 | ```sh 39 | $> aws datapipeline put-pipeline-definition --pipeline-id \ 40 | --pipeline-definition file://sqoop_activity.json \ 41 | --parameter-values myRdsEndpoint= myRdsDatabaseName= myRdsTableName= \ 42 | myRdsUsername= '*myRdsPassword=' myS3InputDataLocation= myPipelineLogUri= 43 | # { 44 | # "errored": false, 45 | # "validationWarnings": [], 46 | # "validationErrors": [] 47 | # } 48 | ``` 49 | 50 | Activate the pipeline. Use the `pipelineId` that was returned by the `create-pipeline` command. 51 | 52 | ```sh 53 | $> aws datapipeline activate-pipeline --pipeline-id 54 | ``` 55 | 56 | Optionally, check the status of your running pipeline. Use the `pipelineId` that was returned by the 57 | `create-pipeline` command. When the pipeline has completed, the Status Ended column in the output 58 | from this command will show FINISHED for all pipeline nodes. 59 | 60 | ```sh 61 | 62 | >$ aws datapipeline list-runs --pipeline-id 63 | 64 | ``` 65 | 66 | Once the pipeline has completed, you should be able to see the imported records in MySQL by running a SELECT query using the MySQL command-line tool or a graphical client. 67 | 68 | ## Next steps 69 | 70 | In addition to the required parameters, there are optional parameters to set the EC2 instance types launched by the EMR cluster as well as the number of core nodes to launch. Changing these parameters may improve the performance of the import job. 71 | 72 | Once the pipeline has completed, you can delete it with the following command.
73 | 74 | ```sh 75 | $> aws datapipeline delete-pipeline --pipeline-id 76 | ``` 77 | 78 | The resources used by this example will incur normal charges. If you created any resources specifically to test this pipeline, you may wish to delete them now. 79 | 80 | ## Disclaimer 81 | 82 | The samples in this repository are meant to help users get started with Data Pipeline. They may not 83 | be sufficient for production environments. Users should carefully inspect samples before running 84 | them. 85 | 86 | *Use at your own risk.* 87 | 88 | Licensed under the MIT-0 License. 89 | -------------------------------------------------------------------------------- /samples/S3ToRdsSqoop/sqoop_activity.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "The default object sets global properties for the pipeline.", 5 | "id": "Default", 6 | "name": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myPipelineLogUri}", 11 | "scheduleType": "ONDEMAND" 12 | }, 13 | { 14 | "myComment": "An EMR cluster where the Sqoop job will be run. These parameters can be edited to create a more powerful cluster.", 15 | "id": "MyEmrCluster", 16 | "name": "MyEmrCluster", 17 | "type": "EmrCluster", 18 | "masterInstanceType": "#{myEmrMasterInstanceType}", 19 | "coreInstanceType": "#{myEmrCoreInstanceType}", 20 | "coreInstanceCount": "#{myEmrCoreInstanceCount}", 21 | "releaseLabel": "emr-5.0.0", 22 | "applications": ["sqoop"], 23 | "terminateAfter": "12 hours" 24 | }, 25 | { 26 | "myComment": "S3 folder where the input CSV is stored.", 27 | "id": "S3InputDataLocation", 28 | "name": "S3InputDataLocation", 29 | "directoryPath": "#{myS3InputDataLocation}", 30 | "type": "S3DataNode" 31 | }, 32 | { 33 | "myComment": "The shell command to invoke sqoop to copy the CSV into RDS. This template assumes that the target database is either MySQL or MariaDB and that the target table has already been created.", 34 | "id": "SqoopActivity", 35 | "name": "SqoopActivity", 36 | "runsOn": { 37 | "ref": "MyEmrCluster" 38 | }, 39 | "input": { 40 | "ref": "S3InputDataLocation" 41 | }, 42 | "type": "ShellCommandActivity", 43 | "command": "sqoop export --connect jdbc:mariadb://#{myRdsEndpoint}/#{myRdsDatabaseName} --driver org.mariadb.jdbc.Driver --table #{myRdsTableName} --username #{myRdsUsername} --password #{*myRdsPassword} --export-dir #{myS3InputDataLocation}" 44 | } 45 | ], 46 | "parameters": [ 47 | { 48 | "id": "myEmrMasterInstanceType", 49 | "type": "String", 50 | "default": "m2.xlarge", 51 | "description": "The EC2 instance type to use for the master node in the EMR cluster" 52 | }, 53 | { 54 | "id": "myEmrCoreInstanceType", 55 | "type": "String", 56 | "default": "m2.xlarge", 57 | "description": "The EC2 instance type to use for the core nodes in the EMR cluster" 58 | }, 59 | { 60 | "id": "myEmrCoreInstanceCount", 61 | "type": "String", 62 | "default": "2", 63 | "description": "The number of core nodes to launch in the EMR cluster" 64 | }, 65 | { 66 | "id": "myRdsEndpoint", 67 | "type": "String", 68 | "description": "DNS endpoint for target RDS instance. The value should include the port number. 
Example: test.xyzw.us-east-1.rds.amazonaws.com:3306" 69 | }, 70 | { 71 | "id": "myRdsDatabaseName", 72 | "type": "String", 73 | "description": "Name of the target MySQL or MariaDB database" 74 | }, 75 | { 76 | "id": "myRdsTableName", 77 | "type": "String", 78 | "description": "Name of the database table that the CSV will be imported into" 79 | }, 80 | { 81 | "id": "myRdsUsername", 82 | "type": "String", 83 | "description": "User name to use to connect to RDS" 84 | }, 85 | { 86 | "id": "*myRdsPassword", 87 | "type": "String", 88 | "description": "Password to use to connect to RDS" 89 | }, 90 | { 91 | "id": "myS3InputDataLocation", 92 | "type": "AWS::S3::ObjectKey", 93 | "description": "S3 path to folder where the CSV data is stored" 94 | }, 95 | { 96 | "id": "myPipelineLogUri", 97 | "type": "AWS::S3::ObjectKey", 98 | "description": "S3 folder where log data generated by this pipeline will be written" 99 | } 100 | ] 101 | } 102 | -------------------------------------------------------------------------------- /samples/S3TsvFilesToRedshiftTablesIfReady/readme.md: -------------------------------------------------------------------------------- 1 | #Data Pipeline Load Tab Separated Files in S3 to Redshift if file exists 2 | 3 | ##About the sample 4 | This pipeline definition when imported would instruct Redshift to load two TSV files from given two S3 location, into two different Redshift Table. Two copy activities are independent, each will start once the input s3 file exists. Table insert mode is OVERWRITE_EXISTING. 5 | 6 | ##Running this sample 7 | The pipeline requires the following user input point: 8 | 9 | 1. Redshift connection info 10 | 2. The S3 file locations where the input TSV files are located. 11 | 2. Redshift target table names of each S3 file to copy to. 12 | 3. Redshift Cluster security group id(s). 13 | 14 | 15 | ## Prerequisites 16 | 17 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 18 | Redshift Cluster and Table must already exist. 19 | S3 tsv file locations are input for this pipeline, RedshiftCopy activity will start only when input S3 file exists. 20 | 21 | 22 | ## Run this sample pipeline using the AWS CLI 23 | 24 | ```sh 25 | $> aws datapipeline create-pipeline --name s3_if_ready_to_redshift --unique-id s3_if_ready_to_redshift 26 | ``` 27 | 28 | You receive a pipelineId like this. 29 | ```sh 30 | # ----------------------------------------- 31 | # | CreatePipeline | 32 | # +-------------+--------------------------+ 33 | # | pipelineId | | 34 | # +-------------+--------------------------+ 35 | ``` 36 | 37 | ```sh 38 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://S3TsvFilesToRedshiftTablesIfReady.json --pipeline-id \ 39 | --parameter-values myRedshiftUsername= \*myRedshiftPassword= \ 40 | myRedshiftDbName= \ 41 | myRedshiftSecurityGrpIds= \ 42 | myRedshiftJdbcConnectStr=\ 43 | myInputTsvFilesS3Loc_1=\ 44 | myDestRedshiftTable_1=
\ 45 | myInputTsvFilesS3Loc_2=<s3://myInputTsvFilesS3Loc_2.csv>\ 46 | myDestRedshiftTable_2=
\ 47 | myLogUri= 48 | 49 | ``` 50 | 51 | You receive a validation message like this 52 | ```sh 53 | # ----------------------- 54 | # |PutPipelineDefinition| 55 | # +-----------+---------+ 56 | # | errored | False | 57 | # +-----------+---------+ 58 | ``` 59 | 60 | Now activate the pipeline 61 | ```sh 62 | $> aws datapipeline activate-pipeline --pipeline-id 63 | ``` 64 | 65 | Check the status of your pipeline 66 | ```sh 67 | >$ aws datapipeline list-runs --pipeline-id 68 | ``` 69 | 70 | You will receive status information on the pipeline. 71 | 72 | 73 | ## Disclaimer 74 | 75 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 76 | 77 | Use at your own risk. 78 | 79 | Licensed under the MIT-0 License. 80 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline SQL Activity with timeout sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a pipeline that uses the SQL activity to execute queries defined in a .sql script file 6 | that is stored on S3. The SQL queries are executed against an RDS MySQL database instance. 7 | 8 | The sample also demonstrates setting an explicit timeout on the attempt of the SQL activity (attemptTimeout: "1 hour") in the pipeline definition json file. This field can be set appropriately based on the expected run time of the activity attempt. 9 | 10 | The project provides scripts for setting up the RDS database for the sample, importing a [data set](http://aws.amazon.com/datasets/6468931156960467) (pipeline.json), and destroying the RDS database. The project also provides the [pipeline definition file](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html) which is used to create the pipeline and the AWS CLI commands for creating and executing the pipeline. See the instructions below to get started. 11 | 12 | *Note: Normal AWS charges apply for the resources created by the script. Make sure to run the teardown script as soon as you are done with the sample.* 13 | 14 | ## Prerequisites 15 | 16 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 17 | 18 | ## Step 1: Priming this sample 19 | 20 | Run the following commands to run the setup script. 21 | 22 | *Setup and teardown scripts are located in the setup directory.* 23 | ```sh 24 | $> cd /data-pipeline-samples/samples/SQLActivityWithTimeout 25 | $> python setup/Setup.py 26 | ``` 27 | 28 | ## Step 2: Run this sample pipeline using the AWS CLI 29 | 30 | ```sh 31 | $> aws datapipeline create-pipeline --name sql_activity_pipeline --unique-id sql_activity_pipeline 32 | ``` 33 | 34 | You receive a pipelineId like this.
35 | ```sh 36 | # ----------------------------------------- 37 | # | CreatePipeline | 38 | # +-------------+--------------------------+ 39 | # | pipelineId | | 40 | # +-------------+--------------------------+ 41 | ``` 42 | 43 | ```sh 44 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json --parameter-values 45 | myS3LogsPath= myRDSUsername= myRDSPassword= 46 | myRDSId= --pipeline-id 47 | ``` 48 | 49 | You receive a validation messages like this 50 | ```sh 51 | # ----------------------- 52 | # |PutPipelineDefinition| 53 | # +-----------+---------+ 54 | # | errored | False | 55 | # +-----------+---------+ 56 | ``` 57 | 58 | Now activate the pipeline 59 | ```sh 60 | $> aws datapipeline activate-pipeline --pipeline-id 61 | ``` 62 | 63 | Check the status of your pipeline 64 | ```sh 65 | >$ aws datapipeline list-runs --pipeline-id 66 | ``` 67 | 68 | You will receive status information on the pipeline. For example... 69 | ```sh 70 | # Name Scheduled Start Status 71 | # ID Started Ended 72 | #--------------------------------------------------------------------------------------------------- 73 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 74 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 75 | # 76 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 77 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 78 | # 79 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 80 | ``` 81 | 82 | 83 | ## Step 3: IMPORTANT! Tear down this sample 84 | 85 | *Note: The setup script will provide the teardown command with parameters at end of the execution.* 86 | 87 | ```sh 88 | $> python setup/Teardown.py --rds-instance-id 89 | ``` 90 | 91 | ## Disclaimer 92 | 93 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 94 | 95 | Use at your own risk. 96 | 97 | Licensed under the MIT-0 License. 98 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "scheduleType": "cron", 12 | "schedule": { 13 | "ref": "DefaultSchedule" 14 | } 15 | }, 16 | { 17 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. 
In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it.", 18 | 19 | "type": "Schedule", 20 | "id": "DefaultSchedule", 21 | "occurrences": "1", 22 | "period": "1 Day", 23 | "name": "RunOnce", 24 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 25 | }, 26 | { 27 | "myComment": "This object is a SqlActivity which can be used to query data from a relational database.", 28 | 29 | "type": "SqlActivity", 30 | "id": "ActivityId_wRpKi", 31 | "name": "DefaultActivity1", 32 | "database": { 33 | "ref": "DatabaseId_rAZmM" 34 | }, 35 | "reportProgressTimeout": "1 Hour", 36 | "attemptTimeout": "1 Hour", 37 | "scriptUri": "s3://data-pipeline-samples/sqlactivity/script.sql", 38 | "runsOn": { 39 | "ref": "ResourceId_nEzqN" 40 | } 41 | }, 42 | { 43 | "myComment": "This object defines the RDS Datbase resource that the SQL query will run on.", 44 | 45 | "type": "RdsDatabase", 46 | "id": "DatabaseId_rAZmM", 47 | "rdsInstanceId" : "#{myRDSId}", 48 | "name": "DefaultDatabase1", 49 | "username": "#{myRDSUsername}", 50 | "*password": "#{myRDSPassword}" 51 | }, 52 | { 53 | "myComment": "This object defines the EC2 resource ", 54 | 55 | "type": "Ec2Resource", 56 | "id": "ResourceId_nEzqN", 57 | "name": "DefaultResource1", 58 | "resourceRole": "DataPipelineDefaultResourceRole", 59 | "role": "DataPipelineDefaultRole", 60 | "terminateAfter": "2 Hours" 61 | } 62 | ], 63 | "parameters": [ 64 | { 65 | "type": "String", 66 | "id": "myRDSId", 67 | "description": "RDS instance id" 68 | }, 69 | { 70 | "type": "String", 71 | "id": "myRDSUsername", 72 | "description": "RDS MySQL username" 73 | }, 74 | { 75 | "type": "String", 76 | "id": "myRDSPassword", 77 | "description": "RDS MySQL password" 78 | }, 79 | { 80 | "type": "AWS::S3::ObjectKey", 81 | "id": "myS3LogsPath", 82 | "description": "S3 folder for logs" 83 | } 84 | ] 85 | } -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/setup/Setup.py: -------------------------------------------------------------------------------- 1 | from SQLActivitySample import SQLActivitySample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | import sys 6 | 7 | 8 | if __name__ == '__main__': 9 | check_working_directory() 10 | parser = argparse.ArgumentParser(description='Setup for SQLActivity pipeline sample') 11 | args = parser.parse_args() 12 | 13 | sample = SQLActivitySample() 14 | sample.create_rds_instance() 15 | sample.run_setup_datapipeline() 16 | sample.print_setup_results() 17 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/setup/SetupPipelineDefinition.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class SetupPipelineDefinitionHelper(object): 5 | 6 | def __init__(self): 7 | with open("setup.json", "r") as setup: 8 | pipeline_string = setup.read().replace('\n', '') 9 | self.pipeline_definition = json.loads(pipeline_string) 10 | 11 | def get_setup_pipeline_objects(self): 12 | return self.pipeline_definition['objects'] 13 | 14 | def get_setup_pipeline_parameters(self): 15 | return self.pipeline_definition['parameters'] 16 | 17 | def get_setup_pipeline_parameter_values(self): 18 | return self.pipeline_definition['parameterValues'] 19 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/setup/Teardown.py: 
-------------------------------------------------------------------------------- 1 | from SQLActivitySample import SQLActivitySample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | 6 | 7 | if __name__ == '__main__': 8 | check_working_directory() 9 | 10 | parser = argparse.ArgumentParser(description='Teardown for SQLActivityWithTimeout pipeline sample') 11 | parser.add_argument('--rds-instance-id', action="store", dest="rds_instance_id") 12 | args = parser.parse_args() 13 | 14 | sample = SQLActivitySample() 15 | 16 | if args.rds_instance_id is not None: 17 | sample.destroy_rds(args.rds_instance_id) 18 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/setup/Utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def check_working_directory(): 6 | current_folder_path, current_folder_name = os.path.split(os.getcwd()) 7 | if current_folder_name == 'SQLActivityWithTimeout': 8 | os.chdir('setup') 9 | elif current_folder_name != 'setup': 10 | print 'ERROR: please run the setup script from data-pipeline-samples/samples/SQLActivityWithTimeout/setup' 11 | sys.exit(0) 12 | -------------------------------------------------------------------------------- /samples/ShellCommandWithFTP/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline ShellCommandWith (S)FTP Sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a Shell Command Activity pipeline that uses a (s)ftp server to get files. The sample relies 6 | on having public key authentication configured to access the SFTP server. The sample also uses an input and output s3 7 | bucket for storing input scripts and output results of the shell command. 8 | 9 | The sample includes the pipeline definition, a script of ftp commands and a data file. 10 | 11 | ## Prerequisites 12 | 13 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 14 | 15 | You must also provide the S3Path of a S3 bucket with write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 16 | 17 | You must also upload *ftpcommands* to your s3 input bucket and the *data* file from this directory to your ftp server. 18 | 19 | ## Run this sample pipeline using the AWS CLI 20 | 21 | ```sh 22 | $> aws datapipeline create-pipeline --name shell_command_ftp_pipeline --unique-id shell_command_ftp_pipeline 23 | ``` 24 | 25 | You receive a pipelineId like this.
26 | ```sh 27 | # ----------------------------------------- 28 | # | CreatePipeline | 29 | # +-------------+--------------------------+ 30 | # | pipelineId | | 31 | # +-------------+--------------------------+ 32 | ``` 33 | 34 | ```sh 35 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json --parameter-values myS3InputLoc= myS3OutputLoc= myS3LogsPath= myFTPUser= myFTPHost= --pipeline-id 36 | ``` 37 | 38 | You receive a validation messages like this 39 | ```sh 40 | # ----------------------- 41 | # |PutPipelineDefinition| 42 | # +-----------+---------+ 43 | # | errored | False | 44 | # +-----------+---------+ 45 | ``` 46 | 47 | Now activate the pipeline 48 | ```sh 49 | $> aws datapipeline activate-pipeline --pipeline-id 50 | ``` 51 | 52 | Check the status of your pipeline 53 | ``` 54 | >$ aws datapipeline list-runs --pipeline-id 55 | ``` 56 | 57 | You will receive status information on the pipeline. 58 | ```sh 59 | # Name Scheduled Start Status 60 | # ID Started Ended 61 | #--------------------------------------------------------------------------------------------------- 62 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 63 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 64 | # 65 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 66 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 67 | # 68 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 69 | ``` 70 | 71 | 72 | ## Disclaimer 73 | 74 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 75 | 76 | Use at your own risk. 77 | 78 | Licensed under the MIT-0 License. 79 | -------------------------------------------------------------------------------- /samples/ShellCommandWithFTP/data: -------------------------------------------------------------------------------- 1 | test 2 | test 3 | test 4 | test 5 | test 6 | test 7 | test 8 | test 9 | test 10 | test 11 | test 12 | test 13 | test 14 | test 15 | test 16 | test 17 | test 18 | test 19 | test 20 | test 21 | test 22 | test 23 | test 24 | test 25 | test 26 | test 27 | test 28 | test 29 | -------------------------------------------------------------------------------- /samples/ShellCommandWithFTP/ftpcommands: -------------------------------------------------------------------------------- 1 | cd /var/tmp 2 | get data 3 | exit -------------------------------------------------------------------------------- /samples/ShellCommandWithFTP/pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "name": "Default", 12 | "scheduleType": "cron", 13 | "schedule": { 14 | "ref": "DefaultSchedule" 15 | } 16 | }, 17 | { 18 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. 19 | In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it. 
20 | For this example, we are using it to specify that our pipeline will run immediately upon activation. Also, we are using the 'occurrences' 21 | option to specify that the pipeline should only be run once. You can have multiple schedules defined in a pipeline.", 22 | 23 | "type": "Schedule", 24 | "id": "DefaultSchedule", 25 | "occurrences": "1", 26 | "period": "1 Day", 27 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 28 | }, 29 | { 30 | "myComment": "Pipeline object that represents the S3 bucket node that is used as the input staging directory in this pipeline.", 31 | 32 | "type": "S3DataNode", 33 | "id": "S3InputLocation", 34 | "directoryPath": "#{myS3InputLoc}" 35 | }, 36 | { 37 | "myComment": "Pipeline object that represents the S3 bucket node that is used as the output staging directory in this pipeline.", 38 | 39 | "type": "S3DataNode", 40 | "id": "S3OutputLocation", 41 | "directoryPath": "#{myS3OutputLoc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}" 42 | }, 43 | { 44 | "myComment": "This object is a ShellCommandActivity. It is used to specify the Linux shell command that will be invoked. 45 | ${INPUT1_STAGING_DIR} is the identifier used to refer to the input staging directory and ${OUTPUT1_STAGING_DIR} is used to refer 46 | to the output staging directory.", 47 | 48 | "type": "ShellCommandActivity", 49 | "id": "ShellCommandActivityObj", 50 | "stage": "true", 51 | "input": { 52 | "ref": "S3InputLocation" 53 | }, 54 | "output": { 55 | "ref": "S3OutputLocation" 56 | }, 57 | "runsOn": { 58 | "ref": "EC2ResourceObj" 59 | }, 60 | "command": "sftp -b ${INPUT1_STAGING_DIR}/ftpcommands #{user}@#{host}; wc -l data > ${OUTPUT1_STAGING_DIR}/linecount.txt;" 61 | }, 62 | { 63 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 64 | 65 | "instanceType": "t1.micro", 66 | "name": "EC2ResourceObj", 67 | "id": "EC2ResourceObj", 68 | "type": "Ec2Resource", 69 | "terminateAfter": "20 Minutes" 70 | } 71 | ], 72 | "parameters": [ 73 | { 74 | "myComment": "This Parameter specifies the S3 logging path for the pipeline. 75 | It is used by the 'Default' object to set the 'pipelineLogUri' value. Using Parameters helps users 76 | avoid hard coding variables in pipeline definitions. Users can instead supply these parameters when calling ' 77 | aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline'.", 78 | 79 | "id" : "myS3LogsPath", 80 | "type" : "AWS::S3::ObjectKey", 81 | "description" : "S3 path for pipeline logs."
82 | }, 83 | { 84 | "myComment": "This Parameter specifies the S3 input location for the pipeline.", 85 | 86 | "id": "myS3InputLoc", 87 | "type": "AWS::S3::ObjectKey" 88 | }, 89 | { 90 | "myComment": "This Parameter specifies the S3 output location for the pipeline.", 91 | 92 | "id": "myS3OutputLoc", 93 | "type": "AWS::S3::ObjectKey" 94 | }, 95 | { 96 | "myComment": "This Parameter specifies user for the ftp server", 97 | 98 | "id": "user", 99 | "type": "String" 100 | }, 101 | { 102 | "myComment": "This Parameter specifies the ftp server host", 103 | 104 | "id": "host", 105 | "type": "String" 106 | } 107 | ] 108 | } 109 | -------------------------------------------------------------------------------- /samples/ShellCommandWithS3StagingDirectory/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline ShellCommandWithS3StagingDirectory Sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a Shell Command Activity pipeline that uses a S3 directory for staging. Specifically, this sample runs a script that is located in a s3 bucket and takes an argument string. The script simply prints out the argument to stdout. 6 | 7 | ## Prerequisites 8 | 9 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 10 | 11 | You must also provide the S3Path of a S3 bucket with write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 12 | 13 | You must also upload *bashscript.sh* to your s3 bucket. 14 | 15 | ## Run this sample pipeline using the AWS CLI 16 | 17 | ```sh 18 | $> aws datapipeline create-pipeline --name shell_command_staging_pipeline --unique-id shell_command_staging_pipeline 19 | ``` 20 | 21 | You receive a pipelineId like this. 22 | ```sh 23 | # ----------------------------------------- 24 | # | CreatePipeline | 25 | # +-------------+--------------------------+ 26 | # | pipelineId | | 27 | # +-------------+--------------------------+ 28 | ``` 29 | 30 | ```sh 31 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://shellcommandwiths3stagingdir.json --parameter-values myS3StagingPath= myS3LogsPath= --pipeline-id 32 | ``` 33 | 34 | You receive a validation messages like this 35 | ```sh 36 | # ----------------------- 37 | # |PutPipelineDefinition| 38 | # +-----------+---------+ 39 | # | errored | False | 40 | # +-----------+---------+ 41 | ``` 42 | 43 | Now activate the pipeline 44 | ```sh 45 | $> aws datapipeline activate-pipeline --pipeline-id 46 | ``` 47 | 48 | Check the status of your pipeline 49 | ``` 50 | >$ aws datapipeline list-runs --pipeline-id 51 | ``` 52 | 53 | You will receive status information on the pipeline. 54 | ```sh 55 | # Name Scheduled Start Status 56 | # ID Started Ended 57 | #--------------------------------------------------------------------------------------------------- 58 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 59 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 60 | # 61 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 62 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 63 | # 64 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 65 | ``` 66 | 67 | 68 | ## Disclaimer 69 | 70 | The samples in this repository are meant to help users get started with Data Pipeline. 
They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 71 | 72 | Use at your own risk. 73 | 74 | Licensed under the MIT-0 License. 75 | -------------------------------------------------------------------------------- /samples/ShellCommandWithS3StagingDirectory/bashscript.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo $1 4 | -------------------------------------------------------------------------------- /samples/ShellCommandWithS3StagingDirectory/shellcommandwiths3stagingdir.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "scheduleType": "cron", 12 | "schedule": { 13 | "ref": "DefaultSchedule" 14 | } 15 | }, 16 | { 17 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it. For this example, we are using it to specify that our pipeline will run immediately upon activation. Also, we are using the 'occurrences' option specify that the pipeline should only be run once. You can have multiple schedules defined in a pipeline.", 18 | 19 | "type": "Schedule", 20 | "id": "DefaultSchedule", 21 | "occurrences": "1", 22 | "period": "1 Day", 23 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 24 | }, 25 | { 26 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 27 | 28 | "type": "Ec2Resource", 29 | "id": "New_EC2Instance", 30 | "terminateAfter": "1 Hour", 31 | "resourceRole": "DataPipelineDefaultResourceRole", 32 | "role": "DataPipelineDefaultRole" 33 | }, 34 | { 35 | "myComment": "Pipeline object that represents the S3 bucket node that is used as the staging directory in this pipeline.", 36 | 37 | "type": "S3DataNode", 38 | "id": "New_S3_Datanode", 39 | "directoryPath": "#{myS3StagingPath}" 40 | }, 41 | { 42 | "myComment": "This object is a ShellCommandActivity. It is used to specify the command linux shell command that will be invoked. In this case, it runs a simple bash script which echos the path of the input paramter. ${INPUT1_STAGING_DIR} is the identifier used to refer to the input staging directory.", 43 | 44 | "id": "Shell_Command_Staging_Activity", 45 | "type": "ShellCommandActivity", 46 | "stage": "true", 47 | "input": { 48 | "ref": "New_S3_Datanode" 49 | }, 50 | "runsOn": { 51 | "ref": "New_EC2Instance" 52 | }, 53 | "scriptArgument": [ 54 | "hello world" 55 | ], 56 | "command": "bash -x ${INPUT1_STAGING_DIR}/bashscript.sh $1" 57 | } 58 | ], 59 | "parameters": [ 60 | { 61 | "myComment": "This Parameter specifies the S3 logging path for the pipeline. It is used by the 'Default' object to set the 'pipelineLogUri' value. Using Parameters helps users avoid hard coding variables in pipeline definitions. 
Users can instead supply these parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline-definition'.", 62 | 63 | "id" : "myS3LogsPath", 64 | "type" : "AWS::S3::ObjectKey", 65 | "description" : "S3 path for pipeline logs." 66 | }, 67 | { 68 | "myComment": "This Parameter specifies the S3 path for the input staging directory. This path is represented in the pipeline definition as ${INPUT1_STAGING_DIR}.", 69 | 70 | "id" : "myS3StagingPath", 71 | "type" : "AWS::S3::ObjectKey", 72 | "description" : "S3 path for staging directory." 73 | } 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /samples/SimplePigActivity/pig_activity_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "id": "Default", 5 | "failureAndRerunMode": "CASCADE", 6 | "schedule": { 7 | "ref": "DefaultSchedule" 8 | }, 9 | "resourceRole": "DataPipelineDefaultResourceRole", 10 | "role": "DataPipelineDefaultRole", 11 | "pipelineLogUri": "#{myLogUri}", 12 | "scheduleType": "cron" 13 | }, 14 | { 15 | "id": "PigActivity1", 16 | "input": { 17 | "ref": "InputDataNode1" 18 | }, 19 | "output": { 20 | "ref": "OutputDataNode1" 21 | }, 22 | "stage": "true", 23 | "scriptUri": "#{myPigScriptUri}", 24 | "name": "DefaultActivity1", 25 | "runsOn": { 26 | "ref": "EmrCluster1" 27 | }, 28 | "type": "PigActivity", 29 | "scriptVariable": [ 30 | "column1=First", 31 | "column2=Second", 32 | "three=3" 33 | ], 34 | "generatedScriptsPath": "#{myGeneratedScriptsPath}" 35 | }, 36 | { 37 | "id": "DefaultSchedule", 38 | "occurrences": "1", 39 | "period": "1 Day", 40 | "name": "RunOnce", 41 | "type": "Schedule", 42 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 43 | }, 44 | { 45 | "id": "InputDataNode1", 46 | "directoryPath": "#{myS3Input}", 47 | "dataFormat": { 48 | "ref": "InputDataFormat1" 49 | }, 50 | "type": "S3DataNode" 51 | }, 52 | { 53 | "id": "InputDataFormat1", 54 | "inputRegEx": "^(\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+)", 55 | "column": [ 56 | "First STRING", 57 | "Second STRING", 58 | "Third STRING", 59 | "Fourth STRING", 60 | "Fifth STRING", 61 | "Sixth STRING", 62 | "Seventh STRING", 63 | "Eighth STRING", 64 | "Nineth STRING", 65 | "Tenth STRING" 66 | ], 67 | "type": "RegEx" 68 | }, 69 | { 70 | "id": "OutputDataNode1", 71 | "directoryPath": "#{myS3Output}", 72 | "dataFormat": { 73 | "ref": "OutputDataFormat1" 74 | }, 75 | "type": "S3DataNode" 76 | }, 77 | { 78 | "id": "OutputDataFormat1", 79 | "column": [ 80 | "First STRING", 81 | "Second STRING", 82 | "Third STRING", 83 | "Third STRING", 84 | "Fourth STRING", 85 | "Fifth STRING", 86 | "Sixth STRING", 87 | "Seventh STRING", 88 | "Eighth STRING" 89 | ], 90 | "columnSeparator": "*", 91 | "type": "Custom" 92 | }, 93 | { 94 | "id": "EmrCluster1", 95 | "releaseLabel": "emr-4.2.0", 96 | "type": "EmrCluster", 97 | "terminateAfter": "1 Day" 98 | } 99 | ] 100 | } 101 | -------------------------------------------------------------------------------- /samples/SparkPiMaximizeResourceAllocation/SparkPi-maximizeResource.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "name": "DefaultProperty1", 5 | "id": "PropertyId_jVPFN", 6 | "type": "Property", 7 | "value": "true", 8 | "key": "maximizeResourceAllocation" 9 | }, 10 | { 11 | "name": "DefaultEmrActivity1", 12 | "step": 
"command-runner.jar,spark-submit,--class,org.apache.spark.examples.SparkPi,/usr/lib/spark/lib/spark-examples.jar,100", 13 | "runsOn": { 14 | "ref": "EmrClusterId_mD6dg" 15 | }, 16 | "id": "EmrActivityId_Bo5Zd", 17 | "type": "EmrActivity" 18 | }, 19 | { 20 | "configuration": { 21 | "ref": "EmrConfigurationId_uXera" 22 | }, 23 | "name": "DefaultEmrCluster1", 24 | "coreInstanceCount": "1", 25 | "coreInstanceType": "m3.xlarge", 26 | "releaseLabel": "emr-4.6.0", 27 | "masterInstanceType": "m3.xlarge", 28 | "id": "EmrClusterId_mD6dg", 29 | "type": "EmrCluster", 30 | "terminateAfter": "45 Minutes", 31 | "applications": "spark" 32 | }, 33 | { 34 | "name": "DefaultEmrConfiguration1", 35 | "property": { 36 | "ref": "PropertyId_jVPFN" 37 | }, 38 | "id": "EmrConfigurationId_uXera", 39 | "type": "EmrConfiguration", 40 | "classification": "spark" 41 | }, 42 | { 43 | "failureAndRerunMode": "CASCADE", 44 | "resourceRole": "DataPipelineDefaultResourceRole", 45 | "role": "DataPipelineDefaultRole", 46 | "pipelineLogUri": "#{myPipelineLogUri}", 47 | "scheduleType": "ONDEMAND", 48 | "name": "Default", 49 | "id": "Default" 50 | } 51 | ], 52 | "parameters": [ 53 | { 54 | "id" : "myPipelineLogUri", 55 | "type" : "AWS::S3::ObjectKey", 56 | "description" : "Please specify the logs location" 57 | }] 58 | } -------------------------------------------------------------------------------- /samples/SparkPiMaximizeResourceAllocation/readme.md: -------------------------------------------------------------------------------- 1 | #EMRActivity SparkPi example with maximizeResourceAllocation 2 | 3 | ##About the sample 4 | This Pipeline definition launches an EmrCluster (emr-4.x.x) with [maximizeResourceAllocation](http://docs.aws.amazon.com/ElasticMapReduce/latest/ReleaseGuide/emr-spark-configure.html#d0e17386) with simple [SparkPi](https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala) example in yarn-client mode. Also, it runs on [ONDEMAND](https://aws.amazon.com/about-aws/whats-new/2016/02/now-run-your-aws-data-pipeline-on-demand/) schedule. 5 | 6 | ##Running this sample 7 | The pipeline requires one input point from the customer: 8 | 1. The log folder for the pipeline. 9 | 10 | ##Result 11 | You can view the output (stdout) under 'Emr Step Logs' under EmrActivity. 12 | Pi is roughly 3.141716 -------------------------------------------------------------------------------- /samples/billing/readme.md: -------------------------------------------------------------------------------- 1 | ![Data Pipeline Logo](https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/setup/logo/datapipelinelogo.jpeg) 2 | 3 | Load Detailed AWS Billing logs into a Redshift table 4 | ===================== 5 | 6 | The Load AWS Detailed Billing Report Into Redshift template loads the AWS detailed billing report for the current month stored in an Amazon S3 folder to a Redshift table. If you would like to process files from previous months please pick a schedule that starts in the past, so the scheduled start time can be the timestamp of the CSVs for the period of interest. The input file must be of the .csv.zip format. Existing entries in the Redshift table are updated with data from Amazon S3 and new entries from Amazon S3 data are added to the Redshift table. If the table does not exist, it will be automatically created with the same schema as the AWS detailed billing report. 
The input report file is unzipped and converted to a GZIP file which is stored in the Amazon S3 staging folder before loading to Redshift. 7 | 8 | ## Billing logs format 9 | 10 | This sample specifically targets detailed billing reports for customers who have opted into consolidated billing and have other linked accounts. Their billing logs should have the following fields: 11 | 12 | ```invoice_id 13 | payer_account_id 14 | linked_account_id 15 | record_type 16 | product_name 17 | rate_id 18 | subscription_id 19 | pricing_plan_id 20 | usage_type 21 | operation 22 | availability_zone 23 | reserved_instance 24 | item_description 25 | usage_start_date 26 | usage_end_date 27 | usage_quantity 28 | blended_rate 29 | blended_cost 30 | unblended_rate 31 | unblended_cost 32 | ``` 33 | 34 | ## How it works 35 | 36 | The pipeline will download the billing logs .gzips from the S3 bucket matching the pipeline's scheduled start time into a newly created EC2 instance. A shell script will then uncompress these into a staging bucket in S3. The RedshiftCopyActivity then creates a table in Redshift with columns as listed above and then loads in the staged CSV files. A final cleanup script deletes the temporary staged files in S3. 37 | 38 | ## Different billing formats 39 | 40 | Logs for accounts without consolidated billing or linked accounts will replace 4 fields [blended_rate, blended_cost, unblended_rate, unblended_cost] with 2 fields [rate, cost]. To load these logs into Redshift you must modify the schema of the Redshift table to look similar to the following: 41 | 42 | ```invoice_id varchar(255), payer_account_id varchar(255), linked_account_id varchar(255), record_type varchar(255), product_name varchar(255), rate_id varchar(255), subscription_id varchar(255), pricing_plan_id varchar(255), usage_type varchar(255), operation varchar(255), availability_zone varchar(255), reserved_instance varchar(255), item_description varchar(255), usage_start_date varchar(255), usage_end_date varchar(255), usage_quantity FLOAT, rate FLOAT, cost FLOAT``` 43 | 44 | ## Parameters 45 | 46 | Specifying these parameters is sufficient to get this pipeline to work: 47 | 48 | ``` 49 | "parameters": [ 50 | { 51 | "id": "myS3BillingLogLoc", 52 | "type": "AWS::S3::ObjectKey", 53 | "description": "Input S3 folder for billing report", 54 | "helpText": "S3 folder that has the monthly AWS detailed billing report files with a .csv.zip format." 55 | }, 56 | { 57 | "id": "myS3StagingLoc", 58 | "type": "AWS::S3::ObjectKey", 59 | "description": "S3 staging folder", 60 | "helpText": "Folder to store the unzipped CSV file before loading to Redshift. The S3 folder must be in the same region as the Redshift cluster." 
61 | }, 62 | { 63 | "id": "myRedshiftJdbcConnectStr", 64 | "type": "String", 65 | "description": "Redshift JDBC connection string", 66 | "watermark": "jdbc:postgresql://endpoint:port/database?tcpKeepAlive=true" 67 | }, 68 | { 69 | "id": "myRedshiftUsername", 70 | "type": "String", 71 | "description": "Redshift username" 72 | }, 73 | { 74 | "id": "*myRedshiftPassword", 75 | "type": "String", 76 | "description": "Redshift password" 77 | }, 78 | { 79 | "id": "myRedshiftSecurityGrps", 80 | "type": "String", 81 | "isArray": "true", 82 | "description": "Redshift security group(s)", 83 | "default":"default", 84 | "helpText": "The names of one or more security groups that are assigned to the Redshift cluster.", 85 | "watermark": "security group name" 86 | }, 87 | { 88 | "id": "myRedshiftDbName", 89 | "type": "String", 90 | "description": "Redshift database name" 91 | }, 92 | { 93 | "id": "myRedshiftTableName", 94 | "type": "String", 95 | "description": "Redshift table name", 96 | "helpText": "The name of an existing table or a new table that will be created with the same schema as the AWS detailed billing report." 97 | } 98 | ] 99 | ``` 100 | -------------------------------------------------------------------------------- /samples/diagnose/README.md: -------------------------------------------------------------------------------- 1 | # Diagnosis Tool 2 | The diagnosis tool can be used to do a quick check to test whether your connectivity is fine. It checks for the following: 3 | - Connectivity to different regions 4 | - Connections to S3, DynamoDB, Redshift and RDS. 5 | 6 | ## Instructions 7 | It can be done in two different ways: 8 | 1. Using the terminal 9 | 10 | 2. Using the AWS Data Pipeline Console 11 | 12 | 13 | ###Using the terminal 14 | 1. Download the diagnostics jar file: https://s3.amazonaws.com/data-pipeline-samples/diagnose-sample/Diagnose.jar 15 | 16 | 2. Run the following command (The config option takes in the path and file name of your credentials.json file) 17 | `$> java -jar /Diagnose.jar --config /credentials.json` 18 | 19 | NOTE: If you are running it from an AWS CLI that has been configured with your credentials, you can run just the following command: 20 | `$> java -jar /Diagnose.jar` 21 | 22 | 23 | ###Using the AWS Data Pipeline Console 24 | 1. Download the pipeline definition json file:https://s3.amazonaws.com/data-pipeline-samples/diagnose-sample/diagnose_pipeline.json. 25 | 26 | 3. Use the AWS Data Pipeline console to create a new pipeline and import the definition from the downloaded json file. 27 | 28 | 4. Activate the pipeline and wait for it to finish. 29 | 30 | 5. Once it's finished, open the stdout logs and ensure that all the connectivity checks have been completed successfully. 
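
For reference, the terminal-based check from the first section can be run end to end as shown below. This is only an illustrative sketch: the local paths (/tmp/Diagnose.jar, /path/to/credentials.json) are placeholders, not locations required by the tool.

```sh
# Download the diagnostics jar (URL from the instructions above) and run it
# against a credentials file. Adjust the local paths to your environment.
$> wget https://s3.amazonaws.com/data-pipeline-samples/diagnose-sample/Diagnose.jar -O /tmp/Diagnose.jar
$> java -jar /tmp/Diagnose.jar --config /path/to/credentials.json

# Or, if the AWS CLI on this machine is already configured with credentials:
$> java -jar /tmp/Diagnose.jar
```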
31 | 32 | 33 | -------------------------------------------------------------------------------- /samples/diagnose/diagnose_pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "occurrences": "1", 5 | "period": "1 Day", 6 | "name": "RunOnce", 7 | "id": "DefaultSchedule", 8 | "type": "Schedule", 9 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 10 | }, 11 | { 12 | "failureAndRerunMode": "CASCADE", 13 | "schedule": { 14 | "ref": "DefaultSchedule" 15 | }, 16 | "resourceRole": "DataPipelineDefaultResourceRole", 17 | "role": "DataPipelineDefaultRole", 18 | "pipelineLogUri": "s3://edptestdiagnose/", 19 | "scheduleType": "cron", 20 | "name": "Default", 21 | "id": "Default" 22 | }, 23 | { 24 | "schedule": { 25 | "ref": "DefaultSchedule" 26 | }, 27 | "resourceRole": "DataPipelineDefaultResourceRole", 28 | "role": "DataPipelineDefaultRole", 29 | "name": "DefaultResource1", 30 | "id": "ResourceId_1", 31 | "type": "Ec2Resource" 32 | }, 33 | { 34 | "schedule": { 35 | "ref": "DefaultSchedule" 36 | }, 37 | "name": "DefaultActivity1", 38 | "id": "ActivityId_1", 39 | "runsOn": { 40 | "ref": "ResourceId_1" 41 | }, 42 | "type": "ShellCommandActivity", 43 | "command": "wget https://s3.amazonaws.com/data-pipeline-samples/diagnose-sample/Diagnose.jar;java -jar Diagnose.jar" 44 | } 45 | ], 46 | "parameters": [] 47 | } -------------------------------------------------------------------------------- /samples/dynamo-db-export-as-csv/ddb-to-csv.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "schedule": { 5 | "ref": "ScheduleId_scTIc" 6 | }, 7 | "name": "MyDynamoData", 8 | "id": "DynamoDBDataNodeId_PK5Iq", 9 | "type": "DynamoDBDataNode", 10 | "tableName": "ddbtable" 11 | }, 12 | { 13 | "output": { 14 | "ref": "DataNodeId_KQofW" 15 | }, 16 | "input": { 17 | "ref": "S3DataNodeId_3cbrR" 18 | }, 19 | "schedule": { 20 | "ref": "ScheduleId_scTIc" 21 | }, 22 | "dependsOn": { 23 | "ref": "EmrActivityId_bxl6C" 24 | }, 25 | "stage": "false", 26 | "name": "DDBExporttoCSV", 27 | "hiveScript": "drop table if exists raw_data;\n\nCREATE EXTERNAL TABLE raw_data(accountId string, name string, description string)\nROW FORMAT SERDE 'org.apache.hadoop.hive.dynamodb.DynamoDBExportSerDe'\nLOCATION \"#{input.directoryPath}/#{format(@scheduledStartTime,'YYYY-MM-dd_hh.mm')}\"\nTBLPROPERTIES (\"dynamodb.column.mapping\"=\"accountId:accountId,name:name,description:description\");\n\ndrop table if exists csv_data;\ncreate table csv_data (accountId string, name string, description string)\nrow format delimited\nfields terminated by ',' lines terminated by '\\n'\nlocation '#{output.directoryPath}/';\n\ninsert overwrite table csv_data select * from raw_data;", 28 | "runsOn": { 29 | "ref": "EmrClusterId_auxJq" 30 | }, 31 | "id": "ActivityId_IUO66", 32 | "type": "HiveActivity" 33 | }, 34 | { 35 | "occurrences": "1", 36 | "period": "1 Day", 37 | "name": "ExportSchedule", 38 | "id": "ScheduleId_scTIc", 39 | "type": "Schedule", 40 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 41 | }, 42 | { 43 | "schedule": { 44 | "ref": "ScheduleId_scTIc" 45 | }, 46 | "directoryPath": "s3://bucket/data_as_csv", 47 | "name": "CSVData", 48 | "id": "DataNodeId_KQofW", 49 | "type": "S3DataNode" 50 | }, 51 | { 52 | "resizeClusterMaxInstances": "50", 53 | "maximumRetries": "0", 54 | "runsOn": { 55 | "ref": "EmrClusterId_auxJq" 56 | }, 57 | "type": "EmrActivity", 58 | "output": { 59 | "ref": "S3DataNodeId_3cbrR" 60 | }, 61 | "schedule": { 
62 | "ref": "ScheduleId_scTIc" 63 | }, 64 | "input": { 65 | "ref": "DynamoDBDataNodeId_PK5Iq" 66 | }, 67 | "attemptTimeout": "24 Hours", 68 | "myDynamoDBReadThroughputRatio": "1.0", 69 | "name": "MyExportJob", 70 | "step": "s3://elasticmapreduce/libs/script-runner/script-runner.jar,s3://elasticmapreduce/libs/hive/hive-script,--run-hive-script,--hive-versions,latest,--args,-f,s3://elasticmapreduce/libs/hive/dynamodb/exportDynamoDBTableToS3,-d,DYNAMODB_INPUT_TABLE=#{input.tableName},-d,S3_OUTPUT_BUCKET=#{output.directoryPath}/#{format(@scheduledStartTime,'YYYY-MM-dd_hh.mm')},-d,DYNAMODB_READ_PERCENT=#{myDynamoDBReadThroughputRatio},-d,DYNAMODB_ENDPOINT=dynamodb.us-east-1.amazonaws.com", 71 | "id": "EmrActivityId_bxl6C", 72 | "resizeClusterBeforeRunning": "true" 73 | }, 74 | { 75 | "emrLogUri": "s3://bucket/data_pipeline_logs/export/us-east-1/ddbtable/#{format(@scheduledStartTime,'YYYY-MM-dd_hh.mm')}", 76 | "schedule": { 77 | "ref": "ScheduleId_scTIc" 78 | }, 79 | "installHive": "latest", 80 | "enableDebugging": "true", 81 | "name": "ExportCluster", 82 | "coreInstanceType": "m1.medium", 83 | "coreInstanceCount": "1", 84 | "id": "EmrClusterId_auxJq", 85 | "masterInstanceType": "m1.medium", 86 | "amiVersion": "3.3.2", 87 | "type": "EmrCluster" 88 | }, 89 | { 90 | "failureAndRerunMode": "CASCADE", 91 | "resourceRole": "DataPipelineDefaultResourceRole", 92 | "role": "DataPipelineDefaultRole", 93 | "scheduleType": "CRON", 94 | "name": "Default", 95 | "id": "Default" 96 | }, 97 | { 98 | "schedule": { 99 | "ref": "ScheduleId_scTIc" 100 | }, 101 | "directoryPath": "s3://bucket/backup", 102 | "name": "MyS3Data", 103 | "id": "S3DataNodeId_3cbrR", 104 | "type": "S3DataNode" 105 | } 106 | ], 107 | "parameters": [] 108 | } 109 | -------------------------------------------------------------------------------- /samples/dynamo-db-export-as-csv/readme.md: -------------------------------------------------------------------------------- 1 | This pipeline demonstrates how to export data in dynamoDB as csv data in S3. 2 | 3 | Steps to run the pipeline using the cli. 
4 | 5 | 1) aws datapipeline create-pipeline --name ddb-backup --unique-id some-unique-id 6 | => Returns a pipeline-id 7 | 8 | 2) aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file:///home/user/ddb-to-csv.json 9 | 10 | 3) aws datapipeline activate-pipeline --pipeline-id 11 | -------------------------------------------------------------------------------- /samples/dynamo-db-export/DynamoDB-export.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "occurrences": "1", 5 | "period": "1 Day", 6 | "name": "RunOnce", 7 | "id": "DefaultSchedule", 8 | "type": "Schedule", 9 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 10 | }, 11 | { 12 | "failureAndRerunMode": "CASCADE", 13 | "schedule": { 14 | "ref": "DefaultSchedule" 15 | }, 16 | "resourceRole": "DataPipelineDefaultResourceRole", 17 | "role": "DataPipelineDefaultRole", 18 | "scheduleType": "cron", 19 | "pipelineLogUri" : "#{myOutputS3Loc}/logs", 20 | "name": "Default", 21 | "id": "Default", 22 | "maxActiveInstances" : "1" 23 | }, 24 | { 25 | "maximumRetries": "2", 26 | "name": "TableBackupActivity", 27 | "step": "s3://dynamodb-emr-us-east-1/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')},#{myDDBTableName},#{myDDBReadThroughputRatio}", 28 | "id": "TableBackupActivity", 29 | "runsOn": { 30 | "ref": "EmrClusterForBackup" 31 | }, 32 | "type": "EmrActivity" 33 | }, 34 | { 35 | "bootstrapAction": "s3://elasticmapreduce/bootstrap-actions/configure-hadoop, --yarn-key-value, yarn.nodemanager.resource.memory-mb=12800,--yarn-key-value,yarn.scheduler.minimum-allocation-mb=256,--mapred-key-value,mapreduce.map.memory.mb=500,--mapred-key-value,mapreduce.map.java.opts=-Xmx400M,--mapred-key-value,mapreduce.job.reduce.slowstart.completedmaps=1,--mapred-key-value,mapreduce.map.speculative=false", 36 | "name": "EmrClusterForBackup", 37 | "amiVersion": "3.8.0", 38 | "id": "EmrClusterForBackup", 39 | "type": "EmrCluster", 40 | "masterInstanceType": "m1.medium", 41 | "coreInstanceType": "#{myInstanceType}", 42 | "coreInstanceCount": "#{myInstanceCount}", 43 | "region" : "#{myRegion}", 44 | "terminateAfter" : "12 hours", 45 | "keyPair" : "ramsug-test-desktop" 46 | } 47 | ], 48 | "parameters": [ 49 | { 50 | "description": "OutputS3folder", 51 | "id": "myOutputS3Loc", 52 | "type": "AWS::S3::ObjectKey" 53 | }, 54 | { 55 | "default": "0.2", 56 | "watermark": "Enter value between 0.1 - 1.0", 57 | "description": "DynamoDB Read Throughput Ratio", 58 | "id": "myDDBReadThroughputRatio", 59 | "type": "Double" 60 | }, 61 | { 62 | "description": "DynamoDB Table Name", 63 | "id": "myDDBTableName", 64 | "type": "String" 65 | }, 66 | { 67 | "description": "Instance Type", 68 | "id": "myInstanceType", 69 | "watermark" : "m1.medium if IOPS <= 900. 
Else use m3.xlarge", 70 | "type": "String" 71 | }, 72 | { 73 | "description": "Instance Count", 74 | "watermark" : " (IOPS / 300) for m1.medium.(IOPS / 1500) for m3.xlarge", 75 | "id": "myInstanceCount", 76 | "type": "Integer" 77 | }, 78 | { 79 | "description" : "Region", 80 | "watermark" : "Region of DynamoDB Table/EMR cluster", 81 | "id" : "myRegion", 82 | "type" : "String" 83 | } 84 | ] 85 | } 86 | -------------------------------------------------------------------------------- /samples/dynamo-db-export/example-parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "values": 3 | { 4 | "myOutputS3Loc" : "s3://bucket/directory/", 5 | "myDDBReadThroughputRatio" : "0.5", 6 | "myDDBTableName" : "dynamo-table-name", 7 | "myInstanceType" : "m1.medium", 8 | "myInstanceCount" : "1", 9 | "myRegion" : "eu-west-1" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /samples/dynamo-db-export/readme.md: -------------------------------------------------------------------------------- 1 | This pipeline exports data from a Dynamo DB Table to a S3 location using an EMR Cluster. 2 | 3 | Steps to run the pipeline using the cli. 4 | 5 | 1) aws datapipeline create-pipeline --name ddb-backup --unique-id some-unique-id 6 | => Returns a pipeline-id 7 | 8 | 2) aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file:///home/user/DynamoDB-export.json --parameter-values-uri file:///home/user/example-parameters.json 9 | 10 | 3) aws datapipeline activate-pipeline --pipeline-id 11 | -------------------------------------------------------------------------------- /samples/dynamo-db-to-redshift/dynamo-db-to-redshift.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "id": "DefaultSchedule1", 5 | "startAt": "FIRST_ACTIVATION_DATE_TIME", 6 | "name": "DefaultSchedule1", 7 | "type": "Schedule", 8 | "occurrences" : "1", 9 | "period": "1 day" 10 | }, 11 | { 12 | "id": "DefaultRedshiftDatabase1", 13 | "region": "eu-west-1", 14 | "databaseName": "database_name", 15 | "username": "%Redshift DB Username%", 16 | "name": "DefaultRedshiftDatabase1", 17 | "*password": "%Redshift DB Password%", 18 | "type": "RedshiftDatabase", 19 | "clusterId": "%Redshift DB Cluster ID%" 20 | }, 21 | { 22 | "id": "DefaultDynamoDBDataNode1", 23 | "region": "us-east-1", 24 | "schedule": { 25 | "ref": "DefaultSchedule1" 26 | }, 27 | "tableName": "%Dynamo DB Table Name%", 28 | "name": "DefaultDynamoDBDataNode1", 29 | "type": "DynamoDBDataNode" 30 | }, 31 | { 32 | "id": "DefaultRedshiftCopyActivity1", 33 | "input": { 34 | "ref": "DefaultDynamoDBDataNode1" 35 | }, 36 | "schedule": { 37 | "ref": "DefaultSchedule1" 38 | }, 39 | "insertMode": "KEEP_EXISTING", 40 | "name": "DefaultRedshiftCopyActivity1", 41 | "runsOn": { 42 | "ref": "DefaultEc2Resource1" 43 | }, 44 | "output": { 45 | "ref": "DefaultRedshiftDataNode1" 46 | }, 47 | "type": "RedshiftCopyActivity" 48 | }, 49 | { 50 | "id": "DefaultRedshiftDataNode1", 51 | "schedule": { 52 | "ref": "DefaultSchedule1" 53 | }, 54 | "tableName": "%Redshift DB Table Name%", 55 | "name": "DefaultRedshiftDataNode1", 56 | "type": "RedshiftDataNode", 57 | "database": { 58 | "ref": "DefaultRedshiftDatabase1" 59 | } 60 | }, 61 | { 62 | "id": "Default", 63 | "scheduleType": "CRON", 64 | "failureAndRerunMode": "CASCADE", 65 | "name": "Default", 66 | "role": "DataPipelineDefaultRole", 67 | "resourceRole": 
"DataPipelineDefaultResourceRole" 68 | }, 69 | { 70 | "id" : "DefaultEc2Resource1", 71 | "name": "DefaultEc2Resource1", 72 | "type": "Ec2Resource", 73 | "terminateAfter": "45 minutes", 74 | "actionOnTaskFailure": "terminate", 75 | "actionOnResourceFailure": "retrynone", 76 | "maximumRetries": "0", 77 | "schedule": { "ref": "DefaultSchedule1" }, 78 | "logUri": "s3://logbucket/" 79 | } 80 | ] 81 | } 82 | -------------------------------------------------------------------------------- /samples/dynamo-db-to-redshift/readme.md: -------------------------------------------------------------------------------- 1 | This pipeline demonstrates how to copy data from DynamoDB to Redshift using datapipeline's RedshiftCopyActivity. 2 | Steps to run the pipeline using the cli. 3 | 4 | 1) aws datapipeline create-pipeline --name ddb-backup --unique-id some-unique-id 5 | => Returns a pipeline-id 6 | 7 | 2) aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file:///home/user/dynamo-db-to-redshift.json 8 | 9 | 3) aws datapipeline activate-pipeline --pipeline-id 10 | -------------------------------------------------------------------------------- /samples/dynamodb-to-dynamodb-crossregion/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline DynamoDB to DynamoDB Copy Sample [both src and target tables are in different region] 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a DynamoDB Copy Activity pipeline that uses a S3 directory for temporary backup. Specifically, this sample shows copying data across two dynamodb tables in the different regions [any aws region except eu-central-1]. Temporary S3 folder will be cleared after the copy activity completes. 6 | 7 | ## Prerequisites 8 | 9 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 10 | 11 | You must also provide the S3Path of a S3 bucket with write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 12 | 13 | ## Run this sample pipeline using the AWS CLI 14 | 15 | ```sh 16 | $> aws datapipeline create-pipeline --name dynamodb_copy_pipeline --unique-id dynamodb_copy_pipeline 17 | ``` 18 | 19 | You receive a pipelineId like this. 20 | ```sh 21 | # ----------------------------------------- 22 | # | CreatePipeline | 23 | # +-------------+--------------------------+ 24 | # | pipelineId | | 25 | # +-------------+--------------------------+ 26 | ``` 27 | 28 | ```sh 29 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json \ 30 | --parameter-values myTempS3Folder= myDDBSourceTableName= \ 31 | myDDBDestinationTableName= myDDBSourceRegion= \ 32 | myDDBDestinationRegion= myS3LogsPath= --pipeline-id 33 | ``` 34 | 35 | You receive a validation messages like this 36 | ```sh 37 | # ----------------------- 38 | # |PutPipelineDefinition| 39 | # +-----------+---------+ 40 | # | errored | False | 41 | # +-----------+---------+ 42 | ``` 43 | 44 | Now activate the pipeline 45 | ```sh 46 | $> aws datapipeline activate-pipeline --pipeline-id 47 | ``` 48 | 49 | Check the status of your pipeline 50 | ``` 51 | >$ aws datapipeline list-runs --pipeline-id 52 | ``` 53 | 54 | You will receive status information on the pipeline. 
55 | ```sh 56 | # Name Scheduled Start Status 57 | # ID Started Ended 58 | #--------------------------------------------------------------------------------------------------- 59 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 60 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 61 | # 62 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 63 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 64 | # 65 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 66 | ``` 67 | 68 | 69 | ## Disclaimer 70 | 71 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 72 | 73 | Use at your own risk. 74 | 75 | Licensed under the MIT-0 License. 76 | -------------------------------------------------------------------------------- /samples/dynamodb-to-dynamodb/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline DynamoDB to DynamoDB Copy Sample [source and target tables are in the same region] 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a DynamoDB Copy Activity pipeline that uses an S3 directory for temporary backup. Specifically, this sample shows copying data between two DynamoDB tables in the same region. 6 | 7 | ## Prerequisites 8 | 9 | You must have the AWS CLI and default IAM roles set up in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) of the base repository for instructions on how to do this. 10 | 11 | You must also provide the S3 path of an S3 bucket to which you have write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 12 | 13 | ## Run this sample pipeline using the AWS CLI 14 | 15 | ```sh 16 | $> aws datapipeline create-pipeline --name dynamodb_copy_pipeline --unique-id dynamodb_copy_pipeline 17 | ``` 18 | 19 | You will receive a pipelineId like this. 20 | ```sh 21 | # ----------------------------------------- 22 | # | CreatePipeline | 23 | # +-------------+--------------------------+ 24 | # | pipelineId | | 25 | # +-------------+--------------------------+ 26 | ``` 27 | 28 | ```sh 29 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json \ 30 | --parameter-values myDDBRegion=<region> myTempS3Folder=<s3://your-bucket/temp/> \ 31 | myDDBSourceTableName=<source-table> myDDBDestinationTableName=<destination-table> myS3LogsPath=<s3://your-bucket/logs/> \ 32 | --pipeline-id <pipeline-id> 33 | ``` 34 | 35 | You will receive a validation message like this 36 | ```sh 37 | # ----------------------- 38 | # |PutPipelineDefinition| 39 | # +-----------+---------+ 40 | # | errored | False | 41 | # +-----------+---------+ 42 | ``` 43 | 44 | Now activate the pipeline 45 | ```sh 46 | $> aws datapipeline activate-pipeline --pipeline-id <pipeline-id> 47 | ``` 48 | 49 | Check the status of your pipeline 50 | ```sh 51 | $> aws datapipeline list-runs --pipeline-id <pipeline-id> 52 | ``` 53 | 54 | You will receive status information on the pipeline. 55 | ```sh 56 | # Name Scheduled Start Status 57 | # ID Started Ended 58 | #--------------------------------------------------------------------------------------------------- 59 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 60 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 61 | # 62 | # 2.
ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 63 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 64 | # 65 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 66 | ``` 67 | 68 | 69 | ## Disclaimer 70 | 71 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 72 | 73 | Use at your own risk. 74 | 75 | Licensed under the MIT-0 License. 76 | -------------------------------------------------------------------------------- /samples/hadoop-activity/README.md: -------------------------------------------------------------------------------- 1 | # Hadoop Activity word count example with Fair Scheduler queues 2 | 3 | ## About the sample 4 | When imported, this pipeline definition runs a word count splitter program (s3://elasticmapreduce/samples/wordcount/wordSplitter.py) on the public data set s3://elasticmapreduce/samples/wordcount/input/. The definition contains two Hadoop Activities, each of which runs the splitter program and writes its output to a different S3 folder of the form <s3Prefix>/scheduledStartTime/queue_(1|2). Each activity runs its Hadoop job in its own queue of the Hadoop Fair Scheduler, which is configured with two queues. 5 | 6 | ## Running this sample 7 | The pipeline requires three inputs from the customer (a boto3-based sketch of the deployment steps follows this list): 8 | 9 | 1. The S3 prefix folder where the output of the word splitter will be stored. 10 | 2. The queue configuration (allocations) file for the Fair Scheduler. A sample allocations file can be found at s3://data-pipeline-samples/hadoop-activity/allocations.xml. More details: http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/FairScheduler.html 11 | 3. The log folder for the pipeline.
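The other samples in this repository drive these steps with the AWS CLI. As a rough sketch only, the same flow can be expressed in Python with boto3: the helper below imitates the object-to-field translation that `aws datapipeline put-pipeline-definition` performs on a definition file, and the two `s3://my-bucket/...` values are placeholder locations you would replace with your own. None of this code is part of the sample itself.

```python
import json

import boto3


def to_api_objects(objects):
    """Translate definition-file objects into the pipelineObjects format of the Data Pipeline API."""
    api_objects = []
    for obj in objects:
        fields = []
        for key, value in obj.items():
            if key in ("id", "name"):
                continue
            for v in value if isinstance(value, list) else [value]:
                if isinstance(v, dict) and "ref" in v:
                    fields.append({"key": key, "refValue": v["ref"]})
                else:
                    fields.append({"key": key, "stringValue": str(v)})
        api_objects.append({"id": obj["id"], "name": obj.get("name", obj["id"]), "fields": fields})
    return api_objects


def to_api_parameters(parameters):
    """Translate definition-file parameters into the parameterObjects format of the API."""
    return [{"id": p["id"],
             "attributes": [{"key": k, "stringValue": str(v)} for k, v in p.items() if k != "id"]}
            for p in parameters]


client = boto3.client("datapipeline")
pipeline_id = client.create_pipeline(name="hadoop-activity-fair-scheduler",
                                     uniqueId="hadoop-activity-fair-scheduler")["pipelineId"]

with open("hadoop-activity-world-count-fair.json") as f:
    definition = json.load(f)

client.put_pipeline_definition(
    pipelineId=pipeline_id,
    pipelineObjects=to_api_objects(definition["objects"]),
    parameterObjects=to_api_parameters(definition.get("parameters", [])),
    parameterValues=[
        # Placeholder buckets -- replace with your own output and log locations.
        {"id": "myOutputFolder", "stringValue": "s3://my-bucket/wordcount-output"},
        {"id": "myFairConfig", "stringValue": "s3://data-pipeline-samples/hadoop-activity/allocations.xml"},
        {"id": "myPipelineLogUri", "stringValue": "s3://my-bucket/logs"},
    ])

client.activate_pipeline(pipelineId=pipeline_id)
```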
12 | -------------------------------------------------------------------------------- /samples/hadoop-activity/hadoop-activity-world-count-fair.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "argument": [ 5 | "-files", 6 | "s3://elasticmapreduce/samples/wordcount/wordSplitter.py", 7 | "-mapper", 8 | "wordSplitter.py", 9 | "-reducer", 10 | "aggregate", 11 | "-input", 12 | "s3://elasticmapreduce/samples/wordcount/input/", 13 | "-output", 14 | "#{myOutputFolder}/#{@scheduledStartTime}/queue_1/" 15 | ], 16 | "maximumRetries": "0", 17 | "name": "HadoopActivity_1", 18 | "id": "HadoopActivity_1", 19 | "runsOn": { 20 | "ref": "EmrClusterId_pmtY0" 21 | }, 22 | "jarUri": "/home/hadoop/contrib/streaming/hadoop-streaming.jar", 23 | "type": "HadoopActivity", 24 | "hadoopQueue" : "queue_1" 25 | }, 26 | { 27 | "argument": [ 28 | "-files", 29 | "s3://elasticmapreduce/samples/wordcount/wordSplitter.py", 30 | "-mapper", 31 | "wordSplitter.py", 32 | "-reducer", 33 | "aggregate", 34 | "-input", 35 | "s3://elasticmapreduce/samples/wordcount/input/", 36 | "-output", 37 | "#{myOutputFolder}/#{@scheduledStartTime}/queue_2/" 38 | ], 39 | "maximumRetries": "0", 40 | "name": "HadoopActivity_2", 41 | "id": "HadoopActivity_2", 42 | "runsOn": { 43 | "ref": "EmrClusterId_pmtY0" 44 | }, 45 | "jarUri": "/home/hadoop/contrib/streaming/hadoop-streaming.jar", 46 | "type": "HadoopActivity", 47 | "hadoopQueue" : "queue_2" 48 | }, 49 | { 50 | "bootstrapAction": "s3://datapipeline-us-east-1/us-east-1/bootstrap-actions/latest/TaskRunner/configure-hadoop,--fair-config-copy-file,#{myFairConfig}", 51 | "hadoopSchedulerType": "PARALLEL_FAIR_SCHEDULING", 52 | "name": "DefaultEmrCluster1", 53 | "coreInstanceCount": "2", 54 | "coreInstanceType": "m3.xlarge", 55 | "amiVersion": "3.8.0", 56 | "masterInstanceType": "m3.xlarge", 57 | "id": "EmrClusterId_pmtY0", 58 | "type": "EmrCluster" 59 | }, 60 | { 61 | "occurrences": "1", 62 | "period": "1 Day", 63 | "name": "RunOnce", 64 | "id": "DefaultSchedule", 65 | "type": "Schedule", 66 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 67 | }, 68 | { 69 | "failureAndRerunMode": "CASCADE", 70 | "schedule": { 71 | "ref": "DefaultSchedule" 72 | }, 73 | "resourceRole": "DataPipelineDefaultResourceRole", 74 | "role": "DataPipelineDefaultRole", 75 | "pipelineLogUri": "#{myPipelineLogUri}", 76 | "scheduleType": "cron", 77 | "name": "Default", 78 | "id": "Default" 79 | } 80 | ], 81 | "parameters": [ 82 | { 83 | "id" : "myFairConfig", 84 | "type" : "AWS::S3::ObjectKey", 85 | "description" : "Please choose the fair scheduler configuration" 86 | }, 87 | { 88 | "id" : "myPipelineLogUri", 89 | "type" : "AWS::S3::ObjectKey", 90 | "description" : "Please specify the logs location" 91 | }, 92 | { 93 | "id" : "myOutputFolder", 94 | "type" : "AWS::S3::ObjectKey", 95 | "description" : "Please specify the word count output location" 96 | } 97 | ] 98 | } 99 | -------------------------------------------------------------------------------- /samples/helloworld/README.md: -------------------------------------------------------------------------------- 1 | # Hello World 2 | 3 | This sample defines a [shell command activity](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-shellcommandactivity.html) to echo the text "hello world". The output, along with 4 | the acitivity log, is saved to an [S3](https://aws.amazon.com/s3/) bucket. 5 | 6 | ## Parameters 7 | 8 |
| Parameter | Description |
| --------- | ----------- |
| myS3LogsPath | (Required) An S3 key where the shell output and activity log will be stored. Example: "s3://data-pipeline-samples-12345" |
17 | 18 | ## Setup (Optional) 19 | 20 | You can use the setup script in the HelloWorld sample directory to create an S3 bucket to use in 21 | this example. You can skip this step if you have another S3 bucket that you want to use. The script 22 | will take a minute to complete, and when it's finished it will print the resource identifier of the 23 | S3 bucket that it created. 24 | 25 | ```sh 26 | $> python setup.py 27 | # Creating resources for stack [dpl-samples-hello-world]... 28 | # AWS::S3::Bucket: dpl-samples-hello-world-s3bucket-2bbt69s1j29c 29 | ``` 30 | 31 | ## Running this sample 32 | 33 | Create a new pipeline. Throughout this section we assume that the HelloWorld sample directory is 34 | your current working directory. 35 | 36 | ```sh 37 | $> aws datapipeline create-pipeline --name hello_world_pipeline --unique-id hello_world_pipeline 38 | # { 39 | # "pipelineId": "df-074257336JDKJ6QWQCT4" 40 | # } 41 | ``` 42 | 43 | Upload the pipeline definition. Use the `pipelineId` that was returned by the `create-pipeline` 44 | command. Specify the name of an S3 bucket where the output and activity log will be stored. This 45 | will either be the bucket name that was printed by the setup script or another bucket that you've 46 | created. 47 | 48 | 49 | ```sh 50 | $> aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file://helloworld.json --parameter-values myS3LogsPath="s3://" 51 | # { 52 | # "validationErrors": [], 53 | # "validationWarnings": [], 54 | # "errored": false 55 | # } 56 | ``` 57 | 58 | Activate the pipeline. Use the `pipelineId` that was returned by the `create-pipeline` command. 59 | 60 | ```sh 61 | $> aws datapipeline activate-pipeline --pipeline-id 62 | ``` 63 | 64 | Optionally, check the status of your running pipeline. Use the `pipelineId` that was returned by the 65 | `create-pipeline` command. When the pipeline has completed, the Status Ended column in the output 66 | from this command will show FINISHED for all pipeine nodes. Note that it may take a minute after the 67 | `activate-pipeline` command has completed before the `list-runs` command shows any output. 68 | 69 | ```sh 70 | 71 | >$ aws datapipeline list-runs --pipeline-id 72 | # Name Scheduled Start Status 73 | # ID Started Ended 74 | # --------------------------------------------------------------------------------------------------- 75 | # 1. EC2Resource_HelloWorld 2015-10-14T16:51:56 RUNNING 76 | # @EC2Resource_HelloWorld_2015-10-14T16:51:56 2015-10-14T16:51:59 77 | # 78 | # 2. ShellCommandActivity_HelloWorld 2015-10-14T16:51:56 WAITING_FOR_RUNNER 79 | # @ShellCommandActivity_HelloWorld_2015-10-14T16:51: 2015-10-14T16:51:59 80 | 81 | ``` 82 | 83 | After the pipeline is completed, the output and activity log from the pipeline will be saved to the S3 bucket that you 84 | specified under the following prefix. To view or download these files, navigate to this prefix in 85 | the S3 section of the [AWS Management Console](https://aws.amazon.com/console/). 86 | 87 | s3:///HelloWorld//// 88 | 89 | ## Next steps 90 | 91 | Once the pipeline is completed, you can delete it with the following command. If you try to run the 92 | sample again without deleting, you may receive errors or unexpected behavior. 93 | 94 | ```sh 95 | $> aws datapipeline delete-pipeline --pipeline-id 96 | ``` 97 | 98 | The resources used by this example will incur normal charges. 
If you provisioned resources using the 99 | setup script, you can free them by running the following command in the HelloWorld sample directory. 100 | 101 | ```sh 102 | $> python setup.py --teardown 103 | # Request to delete stack [dpl-samples-hello-world] has been sent 104 | ``` 105 | 106 | 107 | ## Disclaimer 108 | 109 | The samples in this repository are meant to help users get started with Data Pipeline. They may not 110 | be sufficient for production environments. Users should carefully inspect samples before running 111 | them. 112 | 113 | *Use at your own risk.* 114 | 115 | Licensed under the MIT-0 License. 116 | -------------------------------------------------------------------------------- /samples/helloworld/helloworld.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode":"cascade", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}/HelloWorld", 11 | "scheduleType": "cron", 12 | "schedule": { 13 | "ref": "DefaultSchedule" 14 | } 15 | }, 16 | { 17 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it. For this example, we are using it to specify that our pipeline will run immediately upon activation. Also, we are using the 'occurrences' option specify that the pipeline should only be run once. You can have multiple schedules defined in a pipeline.", 18 | 19 | "type": "Schedule", 20 | "id": "DefaultSchedule", 21 | "occurrences": "1", 22 | "period": "1 Day", 23 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 24 | }, 25 | 26 | 27 | { 28 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 29 | 30 | "id": "EC2Resource_HelloWorld", 31 | "type": "Ec2Resource", 32 | "terminateAfter": "1 Hour" 33 | }, 34 | { 35 | "myComment": "This object is a ShellCommandActivity. It is used to specify the command linux shell command that will be invoked. In this case it is simply running the 'echo' command, but it can be used to run any command that is accessible on the in the shell of the Resource that runs on.", 36 | 37 | "id": "ShellCommandActivity_HelloWorld", 38 | "runsOn": { 39 | "ref": "EC2Resource_HelloWorld" 40 | }, 41 | "type": "ShellCommandActivity", 42 | "command": "echo 'Hello World!'" 43 | }, 44 | { 45 | "id": "OutputData", 46 | "name": "OutputData", 47 | "type": "S3DataNode", 48 | "filePath": "s3://joshtok-test/abc" 49 | }, 50 | { 51 | "id": "OutputData2", 52 | "name": "OutputData", 53 | "type": "S3DataNode", 54 | "filePath": "s3://joshtok-test/def" 55 | } 56 | ], 57 | "parameters": [ 58 | { 59 | "myComment": "This Parameter specifies the S3 logging path for the pipeline. It is used by the 'Default' object to set the 'pipelineLogUri' value. Using Parameters helps users avoid hard coding variables in pipeline definitions. Users can instead supply these parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline-definition'.", 60 | 61 | "id" : "myS3LogsPath", 62 | "type" : "AWS::S3::ObjectKey", 63 | "description" : "S3 path for pipeline logs." 
64 | } 65 | ] 66 | } 67 | -------------------------------------------------------------------------------- /samples/helloworld/setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("../../setup") 3 | 4 | from stacker import Stacker 5 | 6 | s = Stacker( 7 | "dpl-samples-hello-world", 8 | { 9 | "Resources": { 10 | "S3Bucket": { 11 | "Type": "AWS::S3::Bucket", 12 | "DeletionPolicy": "Delete" 13 | } 14 | } 15 | }) 16 | 17 | s.run(sys.argv) 18 | -------------------------------------------------------------------------------- /samples/json-to-dynamodb/README.md: -------------------------------------------------------------------------------- 1 | # json-to-dynamodb 2 | Example that loads a json stored in an S3 location into a DynamoDB table 3 | 4 | The pipeline definition reads a customer json file stored in an S3 location and loads the data to a DynamoDB table called customers. 5 | 6 | The load to DynamoDb is done via a hive script [json_to_ddb.q](json_to_ddb.q) that reads the json from the S3 location into an external table and then leverages the `org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler` to move the data from the Hive external table to a DynamoDb table called 'customers'. 7 | 8 | 9 | 10 | ## Disclaimer 11 | 12 | The samples in this repository are meant to help users get started with Data Pipeline. They may not 13 | be sufficient for production environments. Users should carefully inspect samples before running 14 | them. 15 | 16 | *Use at your own risk.* 17 | 18 | Licensed under the MIT-0 License. 19 | -------------------------------------------------------------------------------- /samples/json-to-dynamodb/customers.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "customer_id": 1, 3 | "customer_income": 200 4 | }, 5 | { 6 | "customer_id": 2, 7 | "customer_income": 100 8 | }] 9 | -------------------------------------------------------------------------------- /samples/json-to-dynamodb/definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "name": "EmrClusterForBackup", 5 | "coreInstanceType": "m1.medium", 6 | "coreInstanceCount": "1", 7 | "masterInstanceType": "m1.medium", 8 | "amiVersion": "3.3.2", 9 | "id": "EmrClusterForBackup", 10 | "type": "EmrCluster", 11 | "terminateAfter": "2 Hours" 12 | }, 13 | { 14 | "failureAndRerunMode": "CASCADE", 15 | "schedule": { 16 | "ref": "DefaultSchedule" 17 | }, 18 | "resourceRole": "DataPipelineDefaultResourceRole", 19 | "role": "DataPipelineDefaultRole", 20 | "scheduleType": "ondemand", 21 | "name": "Default", 22 | "id": "Default" 23 | }, 24 | { 25 | "name": "TableBackupActivity", 26 | "scriptUri":"s3://datapipeline-samples/JsonToDynamoDb/json_to_ddb.q", 27 | "runsOn": { 28 | "ref" : "EmrClusterForBackup" 29 | }, 30 | "id": "TableBackupActivity", 31 | "type": "HiveActivity", 32 | "stage":"false", 33 | "myComment": "Activity used to run the hive script to export data" 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /samples/json-to-dynamodb/json_to_ddb.q: -------------------------------------------------------------------------------- 1 | ADD JAR s3://elasticmapreduce/samples/hive-ads/libs/jsonserde.jar; 2 | 3 | DROP TABLE IF EXISTS customer_json; 4 | 5 | CREATE EXTERNAL TABLE customer_json (id STRING, 6 | income STRING) 7 | ROW FORMAT SERDE 'com.amazon.elasticmapreduce.JsonSerde' 8 | WITH 
SERDEPROPERTIES ('paths'='customer_id,customer_income') 9 | LOCATION 's3://datapipeline-samples/JsonToDynamoDb/customers.json'; 10 | 11 | DROP TABLE IF EXISTS customer_hive; 12 | 13 | CREATE EXTERNAL TABLE customer_hive (id STRING, 14 | income STRING) 15 | STORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' 16 | TBLPROPERTIES ("dynamodb.table.name" = "customers", 17 | "dynamodb.column.mapping" = "id:id,income:income"); 18 | 19 | INSERT OVERWRITE TABLE customer_hive SELECT * FROM customer_json; 20 | -------------------------------------------------------------------------------- /samples/kinesis/README.md: -------------------------------------------------------------------------------- 1 | ![Data Pipeline Logo](https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/setup/logo/datapipelinelogo.jpeg) 2 | 3 | Process a Kinesis stream of Apache access logs using EMR 4 | ===================== 5 | This sample sets up a Data Pipeline to run an analysis on a kinesis stream every 15 minutes and store the result in S3. This requires the setup from the EMR [documentation](http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-kinesis.html). 6 | 7 | # Running the sample 8 | 9 | ##Setting up your resources 10 | 11 | The setup script will: 12 | - create a Kinesis stream named AccessLogStream 13 | - create a DynamoDb table called MyEMRKinesisTable 14 | - create a DynamoDb table called MyEMRKinesisTableIteration 15 | - download a kinesis stream appender for sample apache access logs 16 | 17 | ```sh 18 | $> setup/setup-script.sh 19 | ``` 20 | ##Populating your stream 21 | 22 | You can push sample data to your stream by running 23 | 24 | ```sh 25 | $> setup/append-to-stream.sh 26 | ``` 27 | 28 | ##Setting up the pipeline 29 | 30 | The instructions at https://github.com/awslabs/data-pipeline-samples tell you how to create, setup, and activate a pipeline. 
31 | 32 | ```sh 33 | $> aws datapipeline create-pipeline --name kinesis_apache_access_logs --unique-id kinesis_apache_access_logs 34 | $> aws datapipeline put-pipeline-definition --pipeline-id df-0554887H4KXKTY59MRJ --pipeline-definition file://samples/kinesis/kinesis-to-s3.json --parameter-values myS3LogsPath="" myS3Output="" 35 | $> aws datapipeline activate-pipeline --pipeline-id df-0554887H4KXKTY59MRJ 36 | ``` 37 | -------------------------------------------------------------------------------- /samples/kinesis/hive-scripts/create-table-from-kinesis-stream.q: -------------------------------------------------------------------------------- 1 | DROP TABLE apachelog; 2 | 3 | CREATE TABLE apachelog ( 4 | host STRING, 5 | IDENTITY STRING, 6 | USER STRING, 7 | TIME STRING, 8 | request STRING, 9 | STATUS STRING, 10 | SIZE STRING, 11 | referrer STRING, 12 | agent STRING 13 | ) 14 | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' 15 | WITH SERDEPROPERTIES ( 16 | "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") ([0-9]*) ([0-9]*) ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\")" 17 | ) 18 | STORED BY 19 | 'com.amazon.emr.kinesis.hive.KinesisStorageHandler' 20 | TBLPROPERTIES("kinesis.stream.name"="AccessLogStream"); 21 | 22 | CREATE TABLE IF NOT EXISTS apachelog_s3 ( 23 | host STRING, 24 | IDENTITY STRING, 25 | USER STRING, 26 | TIME STRING, 27 | request STRING, 28 | STATUS STRING, 29 | SIZE STRING, 30 | referrer STRING, 31 | agent STRING 32 | ) 33 | PARTITIONED BY(iteration_no int) 34 | LOCATION '${s3Location}'; 35 | -------------------------------------------------------------------------------- /samples/kinesis/hive-scripts/script-runner.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | aws s3 cp s3://data-pipeline-samples/kinesis-apache-access-logs/create-table-from-kinesis-stream.q . 4 | aws s3 cp s3://data-pipeline-samples/kinesis-apache-access-logs//write-kinesis-to-s3.q . 
5 | 6 | S3_LOCATION=$1 7 | 8 | #Read iteration count from DynamoDb if exists 9 | result=$(aws dynamodb get-item --table-name MyEMRKinesisTableIteration --key '{"Hash":{"S": "IterationCount"}}' --attributes-to-get "Count") 10 | if [ -z "$result" ]; 11 | then 12 | ITERATION_COUNT=0 13 | else 14 | ITERATION_COUNT=$(echo $result | grep "S" | sed 's/[^0-9]//g' ) 15 | fi 16 | 17 | echo "Processing with iteration count $ITERATION_COUNT" 18 | 19 | #Run hive scripts 20 | hive -hivevar s3Location=$S3_LOCATION -f create-table-from-kinesis-stream.q 21 | 22 | echo "Completed table creation" 23 | 24 | hive -hivevar iterationNo=$ITERATION_COUNT -f write-kinesis-to-s3.q 25 | 26 | ITERATION_COUNT=$((ITERATION_COUNT+1)) 27 | 28 | echo "Writing iteration count as $ITERATION_COUNT" 29 | 30 | #Write iteration count to DynamoDb 31 | aws dynamodb put-item --table-name MyEMRKinesisTableIteration --item {\"Hash\":{\"S\":\"IterationCount\"}\,\"Count\":{\"S\":\"$ITERATION_COUNT\"}} 32 | -------------------------------------------------------------------------------- /samples/kinesis/hive-scripts/write-kinesis-to-s3.q: -------------------------------------------------------------------------------- 1 | set kinesis.checkpoint.enabled=true; 2 | set kinesis.checkpoint.metastore.table.name=MyEMRKinesisTable; 3 | set kinesis.checkpoint.metastore.hash.key.name=HashKey; 4 | set kinesis.checkpoint.metastore.range.key.name=RangeKey; 5 | set kinesis.checkpoint.logical.name=TestLogicalName; 6 | set kinesis.checkpoint.iteration.no=${iterationNo}; 7 | 8 | INSERT OVERWRITE TABLE apachelog_s3 partition (iteration_no=${hiveconf:kinesis.checkpoint.iteration.no}) SELECT * FROM apachelog; 9 | -------------------------------------------------------------------------------- /samples/kinesis/kinesis-to-s3.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "id": "Default", 5 | "failureAndRerunMode": "cascade", 6 | "resourceRole": "DataPipelineDefaultResourceRole", 7 | "role": "DataPipelineDefaultRole", 8 | "pipelineLogUri": "#{myS3LogsPath}", 9 | "scheduleType": "cron", 10 | "schedule": { 11 | "ref": "DefaultSchedule" 12 | } 13 | }, 14 | { 15 | "type": "Schedule", 16 | "id": "DefaultSchedule", 17 | "period": "15 Minutes", 18 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 19 | }, 20 | { 21 | "schedule": { 22 | "ref": "DefaultSchedule" 23 | }, 24 | "name": "A_Fresh_NewEMRInstance", 25 | "amiVersion": "3.3", 26 | "id": "A_Fresh_NewEMRInstance", 27 | "type": "EmrCluster" 28 | }, 29 | { 30 | "id": "ShellCommandActivity_HelloWorld", 31 | "runsOn": { 32 | "ref": "A_Fresh_NewEMRInstance" 33 | }, 34 | "type": "ShellCommandActivity", 35 | "scriptUri": "s3://data-pipeline-samples/kinesis-apache-access-logs/script-runner.sh", 36 | "scriptArgument": "#{myS3Output}" 37 | } 38 | ], 39 | "parameters": [ 40 | { 41 | "id": "myS3LogsPath", 42 | "type": "AWS::S3::ObjectKey", 43 | "description": "S3 path for pipeline logs." 44 | }, 45 | { 46 | "id": "myS3Output", 47 | "type": "AWS::S3::ObjectKey", 48 | "description": "S3 output path for the processed data, e.g.
s3://mybucket/" 49 | } 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /samples/kinesis/setup/append-to-stream.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Make first append to Kinesis stream 4 | java -cp .:kinesis-log4j-appender-1.0.0.jar com.amazonaws.services.kinesis.log4j.FilePublisher access_log_1 5 | -------------------------------------------------------------------------------- /samples/kinesis/setup/setup-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create Kinesis stream for sample 4 | aws kinesis create-stream --stream-name AccessLogStream --shard-count 2 5 | 6 | # Create DynamoDb table required by EMR to process Kinesis 7 | aws dynamodb create-table --table-name MyEMRKinesisTable --attribute-definitions AttributeName=HashKey,AttributeType=S AttributeName=RangeKey,AttributeType=S --key-schema AttributeName=HashKey,KeyType=HASH AttributeName=RangeKey,KeyType=RANGE --provisioned-throughput ReadCapacityUnits=10,WriteCapacityUnits=10 8 | 9 | #Create DynamoDb table to maintain iterations on Kinesis processing by EMR 10 | aws dynamodb create-table --table-name MyEMRKinesisTableIteration --attribute-definitions AttributeName=Hash,AttributeType=S --key-schema AttributeName=Hash,KeyType=HASH --provisioned-throughput ReadCapacityUnits=1,WriteCapacityUnits=1 11 | 12 | # Download sample kinesis stream appender 13 | wget http://emr-kinesis.s3.amazonaws.com/publisher/kinesis-log4j-appender-1.0.0.jar 14 | 15 | # Download sample access logs 16 | wget http://elasticmapreduce.s3.amazonaws.com/samples/pig-apache/input/access_log_1 17 | -------------------------------------------------------------------------------- /samples/oracle-backup/README.md: -------------------------------------------------------------------------------- 1 | # Oracle-Backup 2 | 3 | This sample pipeline does a daily backup of an Oracle database to S3 in CSV format, under an S3 prefix using the date of the backup. 4 | 5 | It features usage of parameters and expressions for easy pipeline definition re-use, construction of a JDBC connection string for the [`JdbcDatabase`](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-jdbcdatabase.html) object, and to store backups on AWS under the date the pipeline was started (instead of the full timestamp). 6 | 7 | ## Instructions 8 | 9 | 1. The Oracle JDBC driver is not available by default on instances launched using Data Pipeline. In order to use it, you will need to [download the driver](http://www.oracle.com/technetwork/database/features/jdbc/index-091264.html) from Oracle. 10 | 11 | 2. Upload the driver JAR to an S3 bucket. 12 | 13 | 3. Install the [AWS CLI](http://aws.amazon.com/cli/). This is available by default on Amazon Linux instances. 14 | 15 | 4. Configure the credentials with `aws configure`. If using role credentials, then you can skip all fields except for the default region. 16 | 17 | 5. Fill out `values.json` with the appropriate values; there are descriptions of the parameters in `parameters.json` as well as below. 18 | 19 | 6. Create a pipeline either with the AWS Console, or through the CLI. Through the CLI, this can be done with 20 | 21 | `aws datapipeline create-pipeline --name --unique-id ` 22 | 23 | 7. Using the pipeline-id (`df-XXXXXX`), submit the pipeline definition with parameters and values. 
24 | 25 | `aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file://definition.json --parameter-objects file://parameters.json --parameter-values-uri file://values.json` 26 | 27 | 8. Activate the pipeline 28 | `aws datapipeline activate-pipeline --pipeline-id ` 29 | 30 | 31 | ##Parameters 32 | 33 | myBackupLocation: S3 backup location (i.e. `s3://mybucket/backups/oracle`) 34 | 35 | myOracleDriverLocation: S3 location to fetch Oracle JDBC driver from (i.e. `s3://mybucket/ojdbc6.jar`) 36 | 37 | myOracleHost: Oracle host address (i.e. `abc.xyz.us-east-1.rds.amazonaws.com`) 38 | 39 | myOraclePort: Oracle port (i.e. `1521`) 40 | 41 | myOracleDatabase: Oracle SID/database (.i.e. `ORCL`) 42 | 43 | myOracleUser: Oracle user 44 | 45 | myOraclePassword: Password to use 46 | 47 | myOracleTable: Name of the Oracle table to back up 48 | 49 | myTerminateAfter: Terminate instance after a certain amount of time (i.e. `2 Hours`) 50 | 51 | myPipelineLogUri: Log pipeline execution details to an S3 location (i.e. `s3://mybucket/pipelinelogs`) 52 | -------------------------------------------------------------------------------- /samples/oracle-backup/definition.json: -------------------------------------------------------------------------------- 1 | { "objects": 2 | [ { "id": "S3" 3 | , "name": "Backup Location" 4 | , "type": "S3DataNode" 5 | , "directoryPath": "#{myBackupLocation}/#{day(@scheduledStartTime)}/" 6 | , "schedule": 7 | { "ref": "DefaultSchedule" } 8 | } 9 | , { "id": "DefaultSchedule" 10 | , "name": "Every 1 day" 11 | , "type": "Schedule" 12 | , "period": "1 days" 13 | , "startAt": "FIRST_ACTIVATION_DATE_TIME" 14 | } 15 | , { "id": "Instance" 16 | , "name": "Instance" 17 | , "type": "Ec2Resource" 18 | , "role": "DataPipelineDefaultRole" 19 | , "resourceRole": "DataPipelineDefaultResourceRole" 20 | , "terminateAfter": "#{myTerminateAfter}" 21 | , "schedule": 22 | { "ref": "DefaultSchedule" } 23 | } 24 | , { "id": "Default" 25 | , "name": "Default" 26 | , "role": "DataPipelineDefaultRole" 27 | , "resourceRole": "DataPipelineDefaultResourceRole" 28 | , "failureAndRerunMode": "CASCADE" 29 | , "pipelineLogUri": "#{myPipelineLogUri}" 30 | , "scheduleType": "cron" 31 | , "schedule": 32 | { "ref": "DefaultSchedule" } 33 | } 34 | , { "id": "OracleDatabase" 35 | , "name": "Oracle Database" 36 | , "type": "JdbcDatabase" 37 | , "jdbcDriverJarUri": "#{myOracleDriverLocation}" 38 | , "jdbcDriverClass": "oracle.jdbc.OracleDriver" 39 | , "connectionString": "jdbc:oracle:thin:@#{myOracleHost}:#{myOraclePort}:#{myOracleDatabase}" 40 | , "username": "#{myOracleUser}" 41 | , "*password": "#{myOraclePassword}" 42 | } 43 | , { "id": "BackupTable" 44 | , "name": "Back up Oracle table" 45 | , "type": "CopyActivity" 46 | , "input": 47 | { "ref": "Oracle" } 48 | , "output": 49 | { "ref": "S3" } 50 | , "runsOn": 51 | { "ref": "Instance" } 52 | , "schedule": 53 | { "ref": "DefaultSchedule" } 54 | } 55 | , { "id": "Oracle" 56 | , "name": "Oracle" 57 | , "type": "SqlDataNode" 58 | , "table": "#{myOracleTable}" 59 | , "selectQuery": "SELECT * FROM mytable" 60 | , "database": 61 | { "ref": "OracleDatabase" } 62 | , "runsOn": 63 | { "ref": "Instance" } 64 | , "schedule": 65 | { "ref": "DefaultSchedule" } 66 | } 67 | ], 68 | "parameters": [] 69 | } 70 | -------------------------------------------------------------------------------- /samples/oracle-backup/parameters.json: -------------------------------------------------------------------------------- 1 | { "parameters": 2 | [ { "id": 
"myBackupLocation" 3 | , "description": "S3 backup location" 4 | , "type": "AWS::S3::ObjectKey" 5 | } 6 | , { "id": "myOracleDriverLocation" 7 | , "description": "S3 location to fetch Oracle JDBC driver from" 8 | , "type": "AWS::S3::ObjectKey" 9 | } 10 | , { "id": "myOracleHost" 11 | , "description": "Oracle host address" 12 | , "type": "String" 13 | } 14 | , { "id": "myOraclePort" 15 | , "description": "Oracle port" 16 | , "type": "Integer" 17 | } 18 | , { "id": "myOracleDatabase" 19 | , "description": "Oracle SID/database" 20 | , "type": "String" 21 | } 22 | , { "id": "myOracleUser" 23 | , "description": "Oracle user" 24 | , "type": "String" 25 | } 26 | , { "id": "myOraclePassword" 27 | , "description": "Oracle password" 28 | , "type": "String" 29 | } 30 | , { "id": "myOracleTable" 31 | , "description": "Name of the Oracle table to back up" 32 | , "type": "String" 33 | } 34 | , { "id": "myTerminateAfter" 35 | , "description": "Terminate instance after a certain amount of time" 36 | , "type": "String" 37 | } 38 | , { "id": "myPipelineLogUri" 39 | , "description": "Log pipeline execution details to an S3 location" 40 | , "type": "AWS::S3::ObjectKey" 41 | } 42 | ] 43 | } 44 | -------------------------------------------------------------------------------- /samples/oracle-backup/values.json: -------------------------------------------------------------------------------- 1 | { "values": 2 | { "myBackupLocation": "" 3 | , "myOracleDriverLocation": "" 4 | , "myOracleHost": "" 5 | , "myOraclePort": "" 6 | , "myOracleDatabase": "" 7 | , "myOracleUser": "" 8 | , "myOraclePassword": "" 9 | , "myOracleTable": "" 10 | , "myTerminateAfter": "" 11 | , "myPipelineLogUri": "" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /samples/rds-to-rds-copy/readme.md: -------------------------------------------------------------------------------- 1 | This pipeline demonstrates how to copy data from S3 to RDS instances and between RDS instances using datapipeline. Following is the data flow 2 | 3 | S3 -> Mysql -> Oracle -> PostGres -> SqlServer -> S3 4 | 5 | Steps to run the pipeline using the cli. 
6 | 7 | 1) aws datapipeline create-pipeline --name ddb-backup --unique-id some-unique-id 8 | => Returns a pipeline-id 9 | 10 | 2) aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file:///home/user/rds-to-rds-copy.json 11 | 12 | 3) aws datapipeline activate-pipeline --pipeline-id 13 | -------------------------------------------------------------------------------- /setup/logo/datapipelinelogo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-archives/data-pipeline-samples/3be77bd7ddb0021fc36074718da225f84aac21fd/setup/logo/datapipelinelogo.jpeg -------------------------------------------------------------------------------- /setup/stacker.py: -------------------------------------------------------------------------------- 1 | import botocore, boto3, json, sys, time 2 | 3 | 4 | def print_resources(stack): 5 | resources = [] 6 | for summary in stack.resource_summaries.all(): 7 | resources.append((summary.resource_type, summary.physical_resource_id)) 8 | 9 | if len(resources) == 0: 10 | print("No resources") 11 | return 12 | 13 | max_type_length = max(len(res[0]) for res in resources) 14 | format_string = " {{:>{}}}: {{}}".format(max_type_length) 15 | 16 | for res in resources: 17 | print(format_string.format(*res)) 18 | 19 | 20 | def wait_for_status_change(stack, initial_status="CREATE_IN_PROGRESS"): 21 | while stack.stack_status == initial_status: 22 | time.sleep(0.2) 23 | stack.reload() 24 | 25 | 26 | class UnexpectedStateError(Exception): 27 | 28 | def __init__(self, state): 29 | message = "Stack reached unexpected state: {}".format(state) 30 | super(UnexpectedStateError, self).__init__(message) 31 | 32 | 33 | class Stacker(object): 34 | 35 | def __init__(self, stack_name, stack_template, timeout_in_minutes=10, cloudformation=None): 36 | self.stack_name = stack_name 37 | self.stack_template = stack_template 38 | self.timeout_in_minutes = timeout_in_minutes 39 | 40 | if cloudformation: 41 | self.cloudformation = cloudformation 42 | else: 43 | self.cloudformation = boto3.resource("cloudformation") 44 | 45 | 46 | def setup(self, on_complete=None): 47 | print("Creating resources for stack [{}]...".format(self.stack_name)) 48 | 49 | try: 50 | 51 | stack = self.cloudformation.create_stack( 52 | StackName=self.stack_name, 53 | TemplateBody=json.dumps(self.stack_template), 54 | TimeoutInMinutes=self.timeout_in_minutes) 55 | 56 | wait_for_status_change(stack) 57 | 58 | if stack.stack_status == "CREATE_COMPLETE": 59 | print_resources(stack) 60 | 61 | if on_complete: 62 | on_complete() 63 | 64 | return True 65 | else: 66 | raise UnexpectedStateError(stack.stack_status) 67 | 68 | except (UnexpectedStateError, botocore.exceptions.ClientError) as e: 69 | print("ERROR: {}".format(e)) 70 | return False 71 | 72 | 73 | def teardown(self): 74 | stacks = self.cloudformation.stacks.filter(StackName=self.stack_name) 75 | s3 = None 76 | 77 | for s in stacks: 78 | for r in s.resource_summaries.all(): 79 | if r.resource_type == "AWS::S3::Bucket": 80 | if not s3: 81 | s3 = boto3.resource("s3") 82 | 83 | bucket = s3.Bucket(r.physical_resource_id) 84 | for key in bucket.objects.all(): 85 | key.delete() 86 | 87 | s.delete() 88 | 89 | print("Request to delete stack [{}] has been sent".format(self.stack_name)) 90 | 91 | 92 | def run(self, args): 93 | if "--teardown" in args: 94 | self.teardown() 95 | else: 96 | self.setup() 97 | -------------------------------------------------------------------------------- 
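stacker.py is the shared CloudFormation helper that the per-sample setup scripts (for example, samples/helloworld/setup.py) build on. As a hedged illustration of how a new sample's setup script might use it -- the stack name and the SQS queue resource below are hypothetical and do not correspond to any existing sample -- such a script could look like this:

```python
# Hypothetical setup script for a new sample; run from a samples/<name>/ directory.
import sys
sys.path.append("../../setup")

from stacker import Stacker

s = Stacker(
    "dpl-samples-my-sample",            # hypothetical stack name
    {
        "Resources": {
            "InputQueue": {             # illustrative resource; any CloudFormation type works
                "Type": "AWS::SQS::Queue",
                "DeletionPolicy": "Delete"
            }
        }
    },
    timeout_in_minutes=5)


def report():
    # Invoked by Stacker.setup only after the stack reaches CREATE_COMPLETE.
    print("Stack ready; pass the printed resource ids to put-pipeline-definition.")


if "--teardown" in sys.argv:
    s.teardown()                        # empties any S3 buckets in the stack, then deletes it
else:
    s.setup(on_complete=report)
```

Calling `s.run(sys.argv)` instead, as the existing setup scripts do, gives the same setup/teardown dispatch without the completion callback.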
/setup/stacker_tests.py: -------------------------------------------------------------------------------- 1 | import botocore, time, unittest 2 | import stacker 3 | 4 | from unittest.mock import Mock 5 | from threading import Thread 6 | 7 | 8 | 9 | class StatusChanger(object): 10 | 11 | def __init__(self, end_status, change_after_seconds, start_status="CREATE_IN_PROGRESS"): 12 | self.creation_time = time.time() 13 | self.stack_status = start_status 14 | self.end_status = end_status 15 | self.change_after_seconds = change_after_seconds 16 | 17 | self.resource_summaries = Mock() 18 | self.resource_summaries.all = Mock(return_value=[]) 19 | 20 | def reload(self): 21 | call_time = time.time() 22 | if call_time - self.creation_time > self.change_after_seconds: 23 | self.stack_status = self.end_status 24 | 25 | 26 | class TestStacker(unittest.TestCase): 27 | 28 | def setUp(self): 29 | self.cloudformation = Mock() 30 | 31 | def test_stack_status_change(self): 32 | stack = StatusChanger("CREATE_COMPLETE", 1) 33 | stacker.wait_for_status_change(stack) 34 | self.assertEqual(stack.stack_status, "CREATE_COMPLETE") 35 | 36 | def test_unexpected_status(self): 37 | self.cloudformation.create_stack = Mock(return_value=StatusChanger("UNEXPECTED", 1)) 38 | stkr = stacker.Stacker("example", {}, cloudformation=self.cloudformation) 39 | self.assertFalse(stkr.setup()) 40 | 41 | def test_client_error(self): 42 | error_response = { 43 | "Error": { 44 | "Code": "ExampleClientError", 45 | "Message": "Something happened" 46 | } 47 | } 48 | self.cloudformation.create_stack = Mock(side_effect=botocore.exceptions.ClientError(error_response, "CreateStack")) 49 | stkr = stacker.Stacker("example", {}, cloudformation=self.cloudformation) 50 | self.assertFalse(stkr.setup()) 51 | 52 | def test_stack_on_complete_callback(self): 53 | self.cloudformation.create_stack = Mock(return_value=StatusChanger("CREATE_COMPLETE", 0.1)) 54 | stkr = stacker.Stacker("example", {}, cloudformation=self.cloudformation) 55 | 56 | mem = {"called": False} 57 | def callback(): 58 | mem["called"] = True 59 | 60 | stkr.setup(on_complete=callback) 61 | self.assertTrue(mem["called"]) 62 | 63 | 64 | 65 | if __name__ == "__main__": 66 | unittest.main() 67 | --------------------------------------------------------------------------------
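stacker_tests.py exercises the setup path of the helper. As a purely hypothetical companion -- not part of the repository -- a test for the teardown path could be written in the same Mock-based style:

```python
import unittest
from unittest.mock import Mock

import stacker


class TestTeardown(unittest.TestCase):

    def test_teardown_deletes_each_stack(self):
        # A stack with no resources, so the S3-emptying branch is skipped.
        stack = Mock()
        stack.resource_summaries.all = Mock(return_value=[])

        cloudformation = Mock()
        cloudformation.stacks.filter = Mock(return_value=[stack])

        stkr = stacker.Stacker("example", {}, cloudformation=cloudformation)
        stkr.teardown()

        cloudformation.stacks.filter.assert_called_once_with(StackName="example")
        stack.delete.assert_called_once_with()


if __name__ == "__main__":
    unittest.main()
```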