├── .gitignore
├── .project
├── LICENSE.txt
├── README.md
├── requirements.txt
├── samples
│   ├── DynamoDBExport
│   │   ├── DynamoDBTableExport.json
│   │   ├── DynamoDBtoCSV.json
│   │   └── readme.md
│   ├── DynamoDBExportJava
│   │   ├── .gitignore
│   │   ├── pom.xml
│   │   ├── readme.md
│   │   └── src
│   │       └── main
│   │           ├── java
│   │           │   └── com
│   │           │       └── amazonaws
│   │           │           └── datapipelinesamples
│   │           │               └── ddbexport
│   │           │                   ├── CommandLineArgParser.java
│   │           │                   ├── DDBExportPipelineCreator.java
│   │           │                   ├── DDBExportPipelineObjectCreator.java
│   │           │                   ├── Main.java
│   │           │                   └── PipelineMonitor.java
│   │           └── resources
│   │               └── log4j2.xml
│   ├── DynamoDBImport
│   │   ├── XMLtoDynamoDBImport.json
│   │   └── readme.md
│   ├── DynamoDBImportCSV
│   │   ├── CSVtoDynamoDB.json
│   │   └── README.md
│   ├── DynamoDBToRedshiftConvertDataUsingHive
│   │   ├── DynamoDBtoRedshiftHiveCSV.json
│   │   └── README.md
│   ├── EFSBackup
│   │   ├── 1-Node-EFSBackupPipeline.json
│   │   ├── 1-Node-EFSRestorePipeline.json
│   │   ├── 2-Node-EFSBackupPipeline.json
│   │   ├── 2-Node-EFSRestorePipeline.json
│   │   ├── 3-Node-EFSBackupPipeline.json
│   │   ├── README.md
│   │   ├── efs-backup-end.sh
│   │   ├── efs-backup-init.sh
│   │   ├── efs-backup-rsync.sh
│   │   ├── efs-backup.sh
│   │   ├── efs-restore-rsync.sh
│   │   └── efs-restore.sh
│   ├── ExampleTemplate
│   │   └── README.md
│   ├── HadoopTerasort
│   │   ├── README.md
│   │   ├── TeraSortHadoopBenchmark.json
│   │   ├── process-jhist.sh
│   │   └── setup.py
│   ├── InvokeLambda
│   │   ├── README.md
│   │   └── invokelambda.json
│   ├── LoadTsvFilesInS3ToRedshift
│   │   ├── LoadTsvFilesInS3ToRedshift.json
│   │   └── README.md
│   ├── OnDemandWithLamdaFunctions
│   │   ├── lambda_function.py
│   │   ├── ondemand.json
│   │   └── readme.md
│   ├── RDStoRedshiftSqoop
│   │   ├── RDStoRedshiftSqoop.json
│   │   ├── README.md
│   │   └── setup
│   │       ├── RdsToRedshiftSqoopSample.py
│   │       ├── Setup.py
│   │       ├── SetupPipelineDefinition.py
│   │       ├── Teardown.py
│   │       ├── Utilities.py
│   │       └── setup.json
│   ├── RDStoS3
│   │   ├── RDStoS3Pipeline.json
│   │   ├── README.md
│   │   └── setup
│   │       ├── RDStoS3Sample.py
│   │       ├── Setup.py
│   │       ├── SetupPipelineDefinition.py
│   │       ├── Teardown.py
│   │       └── Utilities.py
│   ├── RedshiftCopyActivityFromDynamoDBTable
│   │   ├── RedshiftCopyActivityFromDynamoDBTable.json
│   │   └── readme.md
│   ├── RedshiftToRDS
│   │   ├── RedshiftToRDS_WithoutRDSCreate.json
│   │   ├── RedshiftToRDS_withTableCreate.json
│   │   └── readme.md
│   ├── S3ToRdsSqoop
│   │   ├── README.md
│   │   └── sqoop_activity.json
│   ├── S3TsvFilesToRedshiftTablesIfReady
│   │   ├── S3TsvFilesToRedshiftTablesIfReady.json
│   │   └── readme.md
│   ├── SQLActivityWithTimeout
│   │   ├── README.md
│   │   ├── pipeline.json
│   │   └── setup
│   │       ├── SQLActivitySample.py
│   │       ├── Setup.py
│   │       ├── SetupPipelineDefinition.py
│   │       ├── Teardown.py
│   │       ├── Utilities.py
│   │       └── setup.json
│   ├── ShellCommandWithFTP
│   │   ├── README.md
│   │   ├── data
│   │   ├── ftpcommands
│   │   └── pipeline.json
│   ├── ShellCommandWithS3StagingDirectory
│   │   ├── README.md
│   │   ├── bashscript.sh
│   │   └── shellcommandwiths3stagingdir.json
│   ├── SimplePigActivity
│   │   ├── pig_activity_sample.json
│   │   └── readme.md
│   ├── SparkPiMaximizeResourceAllocation
│   │   ├── SparkPi-maximizeResource.json
│   │   └── readme.md
│   ├── billing
│   │   ├── readme.md
│   │   └── template.json
│   ├── diagnose
│   │   ├── README.md
│   │   └── diagnose_pipeline.json
│   ├── dynamo-db-export-as-csv
│   │   ├── ddb-to-csv.json
│   │   └── readme.md
│   ├── dynamo-db-export
│   │   ├── DynamoDB-export.json
│   │   ├── example-parameters.json
│   │   └── readme.md
│   ├── dynamo-db-to-redshift
│   │   ├── dynamo-db-to-redshift.json
│   │   └── readme.md
│   ├── dynamodb-to-dynamodb-crossregion
│   │   ├── README.md
│   │   └── pipeline.json
│   ├── dynamodb-to-dynamodb
│   │   ├── README.md
│   │   └── pipeline.json
│   ├── hadoop-activity
│   │   ├── README.md
│   │   └── hadoop-activity-world-count-fair.json
│   ├── helloworld
│   │   ├── README.md
│   │   ├── helloworld.json
│   │   └── setup.py
│   ├── json-to-dynamodb
│   │   ├── README.md
│   │   ├── customers.json
│   │   ├── definition.json
│   │   └── json_to_ddb.q
│   ├── kinesis
│   │   ├── README.md
│   │   ├── hive-scripts
│   │   │   ├── create-table-from-kinesis-stream.q
│   │   │   ├── script-runner.sh
│   │   │   └── write-kinesis-to-s3.q
│   │   ├── kinesis-to-s3.json
│   │   └── setup
│   │       ├── append-to-stream.sh
│   │       └── setup-script.sh
│   ├── oracle-backup
│   │   ├── README.md
│   │   ├── definition.json
│   │   ├── parameters.json
│   │   └── values.json
│   └── rds-to-rds-copy
│       └── readme.md
└── setup
    ├── logo
    │   └── datapipelinelogo.jpeg
    ├── stacker.py
    └── stacker_tests.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | .DS_Store
3 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 |     <name>data-pipeline-samples</name>
 4 |     <comment></comment>
 5 |     <projects>
 6 |     </projects>
 7 |     <buildSpec>
 8 |     </buildSpec>
 9 |     <natures>
10 |     </natures>
11 | </projectDescription>
12 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2011-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | awscli==1.8.12
2 | boto3==1.1.4
3 | botocore==1.2.10
4 | colorama==0.3.3
5 | docutils==0.12
6 | futures==2.2.0
7 | jmespath==0.9.0
8 | pyasn1==0.1.9
9 | python-dateutil==2.4.2
10 | rsa==3.2
11 | six==1.10.0
12 | wheel==0.24.0
13 |
--------------------------------------------------------------------------------
/samples/DynamoDBExport/DynamoDBTableExport.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "occurrences": "1",
5 | "period": "1 Day",
6 | "name": "RunOnce",
7 | "id": "DefaultSchedule",
8 | "type": "Schedule",
9 | "startAt": "FIRST_ACTIVATION_DATE_TIME",
10 | "maxActiveInstances" : "1"
11 | },
12 | {
13 | "failureAndRerunMode": "CASCADE",
14 | "schedule": {
15 | "ref": "DefaultSchedule"
16 | },
17 | "resourceRole": "DataPipelineDefaultResourceRole",
18 | "role": "DataPipelineDefaultRole",
19 | "pipelineLogUri": "s3://",
20 | "scheduleType": "cron",
21 | "name": "Default",
22 | "id": "Default"
23 | },
24 | {
25 | "maximumRetries": "2",
26 | "name": "TableBackupActivity",
27 | "step": "s3://dynamodb-emr-us-east-1/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')},#{myDDBTableName},#{myDDBReadThroughputRatio}",
28 | "id": "TableBackupActivity",
29 | "runsOn": {
30 | "ref": "EmrClusterForBackup"
31 | },
32 | "type": "EmrActivity"
33 | },
34 | {
35 | "bootstrapAction": "s3://elasticmapreduce/bootstrap-actions/configure-hadoop, --yarn-key-value, yarn.nodemanager.resource.memory-mb=12800,--yarn-key-value,yarn.scheduler.minimum-allocation-mb=256,--mapred-key-value,mapreduce.map.memory.mb=500,--mapred-key-value,mapreduce.map.java.opts=-Xmx400M,--mapred-key-value,mapreduce.job.reduce.slowstart.completedmaps=1,--mapred-key-value,mapreduce.map.speculative=false",
36 | "name": "EmrClusterForBackup",
37 | "amiVersion": "3.8.0",
38 | "id": "EmrClusterForBackup",
39 | "type": "EmrCluster",
40 | "masterInstanceType": "m1.medium",
41 | "coreInstanceType": "#{myInstanceType}",
42 | "coreInstanceCount": "#{myInstanceCount}",
43 | "terminateAfter" : "12 hours"
44 | }
45 | ],
46 | "parameters": [
47 | {
48 | "description": "OutputS3folder",
49 | "id": "myOutputS3Loc",
50 | "type": "AWS::S3::ObjectKey"
51 | },
52 | {
53 | "default": "0.2",
54 | "watermark": "Valuebetween0.1-1.0",
55 | "description": "DynamoDB Read Throughput Ratio",
56 | "id": "myDDBReadThroughputRatio",
57 | "type": "Double"
58 | },
59 | {
60 | "description": "DynamoDB Table Name",
61 | "id": "myDDBTableName",
62 | "type": "String"
63 | },
64 | {
65 | "description": "Instance Type",
66 | "id": "myInstanceType",
67 | "watermark" : "Use m1.medium if Read Capacity Units for the job <= 900. Else use m3.xlarge",
68 | "type": "String",
69 | "default": "m3.xlarge"
70 | },
71 | {
72 | "description": "Instance Count",
73 | "watermark" : "(Read Capacity Units / 300) for m1.medium if RCU <= 900. Else (RCU / 1500) for m3.xlarge",
74 | "id": "myInstanceCount",
75 | "type": "Integer",
76 | "default": "1"
77 | },
78 | {
79 | "description" : "Burst IOPs",
80 | "watermark" : "Add IOPS to the DDB table by this percent for the duration of the export job",
81 | "id" : "myBurstIOPS",
82 | "type" : "Double",
83 | "default" : "0.0"
84 | }
85 | ]
86 | }
87 |
--------------------------------------------------------------------------------
/samples/DynamoDBExport/DynamoDBtoCSV.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "myComment" : "Activity used to run the hive script to export data to CSV",
5 | "output": {
6 | "ref": "DataNodeId_cnlSW"
7 | },
8 | "input": {
9 | "ref": "DataNodeId_1ERqq"
10 | },
11 | "name": "TableBackupActivity",
12 | "hiveScript": "DROP TABLE IF EXISTS tempHiveTable;\n\nDROP TABLE IF EXISTS s3TempTable;\n\nCREATE EXTERNAL TABLE tempHiveTable (#{myS3ColMapping})\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"#{myDDBTableName}\", \"dynamodb.column.mapping\" = \"#{myDDBTableColMapping}\");\n \nCREATE EXTERNAL TABLE s3TempTable (#{myS3ColMapping})\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'\nLOCATION '#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}';\n \nINSERT OVERWRITE TABLE s3TempTable SELECT * FROM tempHiveTable;",
13 | "runsOn": { "ref" : "EmrClusterForBackup" },
14 | "id": "TableBackupActivity",
15 | "type": "HiveActivity"
16 | },
17 | {
18 | "period": "1 days",
19 | "name": "Every 1 day",
20 | "id": "DefaultSchedule",
21 | "type": "Schedule",
22 | "startAt": "FIRST_ACTIVATION_DATE_TIME"
23 | },
24 | {
25 | "myComment" : "The DynamoDB table from which we need to export data from",
26 | "dataFormat": {
27 | "ref": "DDBExportFormat"
28 | },
29 | "name": "DynamoDB",
30 | "id": "DataNodeId_1ERqq",
31 | "type": "DynamoDBDataNode",
32 | "tableName": "#{myDDBTableName}"
33 | },
34 | {
35 | "failureAndRerunMode": "CASCADE",
36 | "schedule": {
37 | "ref": "DefaultSchedule"
38 | },
39 | "resourceRole": "DataPipelineDefaultResourceRole",
40 | "role": "DataPipelineDefaultRole",
41 | "pipelineLogUri": "#{myLogUri}",
42 | "scheduleType": "cron",
43 | "name": "Default",
44 | "id": "Default"
45 | },
46 | {
47 | "name": "EmrClusterForBackup",
48 | "coreInstanceType": "m1.medium",
49 | "coreInstanceCount": "1",
50 | "masterInstanceType": "m1.medium",
51 | "amiVersion": "3.3.2",
52 | "id": "EmrClusterForBackup",
53 | "type": "EmrCluster",
54 | "terminateAfter": "2 Hours"
55 | },
56 | {
57 | "myComment" : "The S3 path to which we export data to",
58 | "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}/",
59 | "dataFormat": {
60 | "ref": "DataFormatId_xqWRk"
61 | },
62 | "name": "S3DataNode",
63 | "id": "DataNodeId_cnlSW",
64 | "type": "S3DataNode"
65 | },
66 | {
67 | "myComment" : "Format for the S3 Path",
68 | "name": "DefaultDataFormat1",
69 | "column": "not_used STRING",
70 | "id": "DataFormatId_xqWRk",
71 | "type": "CSV"
72 | },
73 | {
74 | "myComment" : "Format for the DynamoDB table",
75 | "name": "DDBExportFormat",
76 | "id": "DDBExportFormat",
77 | "column": "not_used STRING",
78 | "type": "DynamoDBExportDataFormat"
79 | }
80 | ],
81 | "parameters": [
82 | {
83 | "description": "Output S3 folder",
84 | "id": "myOutputS3Loc",
85 | "type": "AWS::S3::ObjectKey"
86 | },
87 | {
88 | "description": "DynamoDB table name",
89 | "id": "myDDBTableName",
90 | "type": "String"
91 | },
92 | {
93 | "description": "S3 to DynamoDB Column Mapping",
94 | "id": "myDDBTableColMapping",
95 | "type": "String"
96 | },
97 | {
98 | "description": "S3 Column Mappings",
99 | "id": "myS3ColMapping",
100 | "type": "String"
101 | },
102 | {
103 | "description": "DataPipeline Log Uri",
104 | "id": "myLogUri",
105 | "type": "String"
106 | }
107 | ]
108 | }
109 |
--------------------------------------------------------------------------------
/samples/DynamoDBExport/readme.md:
--------------------------------------------------------------------------------
 1 | # DynamoDB to CSV export
 2 | 
 3 | ## About the sample
 4 | The pipeline definition exports DynamoDB data to CSV format.
 5 | 
 6 | ## Running the pipeline
 7 | 
 8 | Example DynamoDB table with keys: customer_id, income, demographics, financial
 9 | 
10 | The user needs to provide the following (an example CLI invocation is shown below):
11 | 
12 | 1. Output S3 folder: The S3 folder prefix to which the CSV data is exported.
13 | 2. DynamoDB read throughput ratio: The throughput ratio to use for the export operation.
14 | 3. DynamoDB table name: The table from which the data is exported.
15 | 4. S3 Column Mappings: Comma-separated column definitions, for example: customer_id string, income string, demographics string, financial string
16 | 5. S3 to DynamoDB Column Mapping: A comma-separated mapping of S3 columns to DynamoDB attributes, e.g. customer_id:customer_id,income:income,demographics:demographics,financial:financial. Do not put spaces around the commas.
17 | 6. Log Uri: S3 path used to capture the pipeline logs.
18 |
--------------------------------------------------------------------------------
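The parameters above can be supplied when the DynamoDBtoCSV.json definition is registered with the AWS CLI. A minimal sketch follows; the pipeline name, bucket, and table name are placeholders.

```bash
# Create an empty pipeline and capture its id.
pipeline_id=$(aws datapipeline create-pipeline \
    --name ddb-to-csv-export --unique-id ddb-to-csv-export-token \
    --query pipelineId --output text)

# Register the definition and fill in the parameters described above.
aws datapipeline put-pipeline-definition \
    --pipeline-id "$pipeline_id" \
    --pipeline-definition file://DynamoDBtoCSV.json \
    --parameter-values \
        myOutputS3Loc=s3://my-export-bucket/exports \
        myDDBTableName=customers \
        myS3ColMapping="customer_id string,income string,demographics string,financial string" \
        myDDBTableColMapping=customer_id:customer_id,income:income,demographics:demographics,financial:financial \
        myLogUri=s3://my-export-bucket/logs

# Activate the pipeline.
aws datapipeline activate-pipeline --pipeline-id "$pipeline_id"
```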
/samples/DynamoDBExportJava/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | *.iml
3 | dependency-reduced-pom.xml
4 |
--------------------------------------------------------------------------------
/samples/DynamoDBExportJava/pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <groupId>DynamoDBExportSample</groupId>
 8 |     <artifactId>DynamoDBExportSample</artifactId>
 9 |     <version>0.1</version>
10 | 
11 |     <dependencies>
12 |         <dependency>
13 |             <groupId>com.amazonaws</groupId>
14 |             <artifactId>aws-java-sdk</artifactId>
15 |             <version>1.10.33</version>
16 |         </dependency>
17 |         <dependency>
18 |             <groupId>com.google.guava</groupId>
19 |             <artifactId>guava</artifactId>
20 |             <version>19.0-rc2</version>
21 |         </dependency>
22 |         <dependency>
23 |             <groupId>commons-cli</groupId>
24 |             <artifactId>commons-cli</artifactId>
25 |             <version>1.3.1</version>
26 |         </dependency>
27 |         <dependency>
28 |             <groupId>org.apache.logging.log4j</groupId>
29 |             <artifactId>log4j-api</artifactId>
30 |             <version>2.4.1</version>
31 |         </dependency>
32 |         <dependency>
33 |             <groupId>org.apache.logging.log4j</groupId>
34 |             <artifactId>log4j-core</artifactId>
35 |             <version>2.4.1</version>
36 |         </dependency>
37 |     </dependencies>
38 | 
39 |     <build>
40 |         <plugins>
41 |             <plugin>
42 |                 <groupId>org.apache.maven.plugins</groupId>
43 |                 <artifactId>maven-shade-plugin</artifactId>
44 |                 <version>2.3</version>
45 |                 <executions>
46 |                     <execution>
47 |                         <phase>package</phase>
48 |                         <goals>
49 |                             <goal>shade</goal>
50 |                         </goals>
51 |                         <configuration>
52 |                             <transformers>
53 |                                 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
54 |                                     <mainClass>com.amazonaws.datapipelinesamples.ddbexport.Main</mainClass>
55 |                                 </transformer>
56 |                             </transformers>
57 |                         </configuration>
58 |                     </execution>
59 |                 </executions>
60 |             </plugin>
61 |             <plugin>
62 |                 <groupId>org.apache.maven.plugins</groupId>
63 |                 <artifactId>maven-compiler-plugin</artifactId>
64 |                 <version>3.3</version>
65 |                 <configuration>
66 |                     <source>1.8</source>
67 |                     <target>1.8</target>
68 |                 </configuration>
69 |             </plugin>
70 |         </plugins>
71 |     </build>
72 | 
73 | </project>
--------------------------------------------------------------------------------
/samples/DynamoDBExportJava/readme.md:
--------------------------------------------------------------------------------
1 | # Data Pipeline DynamoDB Export Java Sample
2 |
3 | ## Overview
4 |
5 | This sample makes it easy to create a pipeline that uses the latest DynamoDB export template EMR activity. You provide
6 | the parameters, and the tool creates the pipeline, then runs and monitors it once so you can verify that it is healthy.
7 |
8 | This sample also provides an example application using the AWS Data Pipeline Java SDK. It demonstrates how to
9 | create, run and monitor a pipeline.
10 |
11 | ## Prerequisites
12 |
13 | You must have the AWS CLI and the default IAM roles set up in order to run the sample. Please see the
14 | [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions on how to do this.
15 |
16 |
17 | ## Getting started
18 |
19 | Build: mvn clean package
20 | View parameter descriptions: java -jar path/to/DynamoDBExportSample-0.1.jar help
21 | Run: java -jar path/to/DynamoDBExportSample-0.1.jar <-yourParam foo>
22 |
23 | ## Example
24 |
25 | Create and run a pipeline that runs once per day:
26 |
27 | java -jar /Users/foobar/DynamoDBExportJava/target/DynamoDBExportSample-0.1.jar -credentialsFile
28 | /Users/foobar/.aws/credentials -myDDBTableName footable -myOutputS3Location s3://foobar/ddb-exports -schedule daily
29 | -myLogsS3Location s3://foobar/logs -myDDBRegion us-east-1
30 |
31 | Create and run a pipeline that runs once:
32 |
33 | java -jar /Users/foobar/DynamoDBExportJava/target/DynamoDBExportSample-0.1.jar -credentialsFile
34 | /Users/foobar/.aws/credentials -myDDBTableName footable -myOutputS3Location s3://foobar/ddb-exports -schedule once
35 | -myLogsS3Location s3://foobar/logs -myDDBRegion us-east-1
36 |
37 | ## Disclaimer
38 |
39 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for
40 | production environments. Users should carefully inspect code samples before running them.
41 |
42 | Use at your own risk.
43 |
44 | Licensed under the MIT-0 License.
45 |
--------------------------------------------------------------------------------
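Putting the readme's commands together, a typical session looks like the following sketch; the table name, buckets, and credentials path are placeholders, and the shaded jar is assumed to be under target/ after the Maven build.

```bash
# Build the shaded jar (written to target/ by the maven-shade-plugin).
mvn clean package

# Print the parameter descriptions.
java -jar target/DynamoDBExportSample-0.1.jar help

# Create, run, and monitor a daily export pipeline.
java -jar target/DynamoDBExportSample-0.1.jar \
    -credentialsFile ~/.aws/credentials \
    -myDDBTableName footable \
    -myOutputS3Location s3://foobar/ddb-exports \
    -myLogsS3Location s3://foobar/logs \
    -myDDBRegion us-east-1 \
    -schedule daily
```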
/samples/DynamoDBExportJava/src/main/java/com/amazonaws/datapipelinesamples/ddbexport/CommandLineArgParser.java:
--------------------------------------------------------------------------------
1 | package com.amazonaws.datapipelinesamples.ddbexport;
2 |
3 | import org.apache.commons.cli.CommandLine;
4 | import org.apache.commons.cli.CommandLineParser;
5 | import org.apache.commons.cli.DefaultParser;
6 | import org.apache.commons.cli.HelpFormatter;
7 | import org.apache.commons.cli.Options;
8 | import org.apache.commons.cli.ParseException;
9 | import org.apache.logging.log4j.LogManager;
10 | import org.apache.logging.log4j.Logger;
11 |
12 | import java.util.HashMap;
13 | import java.util.Map;
14 |
15 | public class CommandLineArgParser {
16 | private static final Logger logger = LogManager.getLogger(CommandLineArgParser.class);
17 |
18 |     public static Map<String, String> parseParameters(final String[] args) {
19 | Options params = new Options();
20 | params.addOption("myDDBTableName", true, "Dynamo DB source table that will be exported (REQUIRED)");
21 | params.addOption("myOutputS3Location", true, "S3 bucket where the export will be stored (REQUIRED)");
22 | params.addOption("myLogsS3Location", true, "S3 bucket where the logs will be stored (REQUIRED)");
23 | params.addOption("schedule", true, "Schedule to run pipeline on. Options are: once or daily (REQUIRED)");
24 | params.addOption("credentialsFile", true, "Path to AWS credentials file. ex: /Users/foo/.aws/credentials " +
25 | "(REQUIRED)");
26 | params.addOption("myDDBRegion", true, "Region to run pipeline in. Default: us-east-1 (Optional)");
27 |
28 | return getParamsMap(args, params);
29 | }
30 |
31 |     private static Map<String, String> getParamsMap(final String[] args, final Options params) {
32 | CommandLineParser parser = new DefaultParser();
33 | CommandLine cmd;
34 |         Map<String, String> paramsMap = new HashMap<>();
35 |
36 | try {
37 | cmd = parser.parse(params, args);
38 | addToMapIfPreset(cmd, "credentialsFile", true, paramsMap);
39 | addToMapIfPreset(cmd, "myDDBTableName", true, paramsMap);
40 | addToMapIfPreset(cmd, "myOutputS3Location", true, paramsMap);
41 | addToMapIfPreset(cmd, "myLogsS3Location", true, paramsMap);
42 | addToMapIfPreset(cmd, "schedule", true, paramsMap);
43 | addToMapIfPreset(cmd, "myDDBRegion", false, paramsMap);
44 | } catch (ParseException | RuntimeException e) {
45 | logger.error(e.getMessage());
46 | printHelp(params);
47 | throw new RuntimeException();
48 | }
49 |
50 | return paramsMap;
51 | }
52 |
53 | private static void printHelp(final Options params) {
54 | HelpFormatter formatter = new HelpFormatter();
55 | formatter.printHelp("maven", params);
56 | }
57 |
58 | private static void addToMapIfPreset(final CommandLine cmd, final String paramName, final boolean required,
59 |                                          final Map<String, String> paramsMap) {
60 | if(cmd.hasOption(paramName)) {
61 | paramsMap.put(paramName, cmd.getOptionValue(paramName));
62 | } else if (required) {
63 | logger.error("Unable to find required parameter: " + paramName);
64 | throw new RuntimeException();
65 | }
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/samples/DynamoDBExportJava/src/main/java/com/amazonaws/datapipelinesamples/ddbexport/Main.java:
--------------------------------------------------------------------------------
1 | package com.amazonaws.datapipelinesamples.ddbexport;
2 |
3 | import com.amazonaws.auth.AWSCredentials;
4 | import com.amazonaws.auth.profile.ProfileCredentialsProvider;
5 | import com.amazonaws.services.datapipeline.DataPipelineClient;
6 |
7 | import java.util.Map;
8 |
9 | public class Main {
10 |
11 | private static DataPipelineClient dataPipelineClient;
12 |
13 | public static void main(String args[]) {
14 |         Map<String, String> params = CommandLineArgParser.parseParameters(args);
15 |
16 | dataPipelineClient = getClient(params.get("credentialsFile"));
17 |
18 | String pipelineId = DDBExportPipelineCreator.createPipeline(dataPipelineClient);
19 |
20 | DDBExportPipelineCreator.putPipelineDefinition(dataPipelineClient, pipelineId, params);
21 |
22 | DDBExportPipelineCreator.activatePipeline(dataPipelineClient, pipelineId);
23 |
24 | PipelineMonitor.monitorPipelineUntilCompleted(dataPipelineClient, pipelineId, "TableBackupActivity");
25 | }
26 |
27 | private static DataPipelineClient getClient(final String profileName) {
28 | AWSCredentials credentials = new ProfileCredentialsProvider(profileName, "default").getCredentials();
29 | return new DataPipelineClient(credentials);
30 | }
31 | }
--------------------------------------------------------------------------------
/samples/DynamoDBExportJava/src/main/java/com/amazonaws/datapipelinesamples/ddbexport/PipelineMonitor.java:
--------------------------------------------------------------------------------
1 | package com.amazonaws.datapipelinesamples.ddbexport;
2 |
3 | import com.amazonaws.services.datapipeline.DataPipelineClient;
4 | import com.amazonaws.services.datapipeline.model.DescribeObjectsRequest;
5 | import com.amazonaws.services.datapipeline.model.DescribeObjectsResult;
6 | import com.amazonaws.services.datapipeline.model.Field;
7 | import com.amazonaws.services.datapipeline.model.QueryObjectsRequest;
8 | import com.amazonaws.services.datapipeline.model.QueryObjectsResult;
9 | import org.apache.logging.log4j.LogManager;
10 | import org.apache.logging.log4j.Logger;
11 |
12 | import java.util.Timer;
13 | import java.util.TimerTask;
14 | import java.util.stream.Collectors;
15 |
16 | public class PipelineMonitor {
17 |
18 |     private static final Logger logger = LogManager.getLogger(PipelineMonitor.class);
19 |
20 | public static void monitorPipelineUntilCompleted(final DataPipelineClient dataPipelineClient,
21 | final String pipelineId, final String activityName) {
22 | Timer timer = new Timer();
23 | int thirtySeconds = 30 * 1000;
24 | timer.schedule(new TimerTask() {
25 | @Override
26 | public void run() {
27 | QueryObjectsRequest queryObjectsRequest = new QueryObjectsRequest().withPipelineId(pipelineId)
28 | .withSphere("INSTANCE");
29 | QueryObjectsResult result = dataPipelineClient.queryObjects(queryObjectsRequest);
30 |
31 | if(result.getIds().size() <= 0) {
32 | logger.info("Creating pipeline object execution graph");
33 | return;
34 | }
35 |
36 | String emrActivityId = result.getIds().stream().filter(r -> r.contains(activityName))
37 | .collect(Collectors.joining("\n"));
38 | DescribeObjectsResult describeObjectsResult = dataPipelineClient
39 | .describeObjects(new DescribeObjectsRequest().withObjectIds(emrActivityId)
40 | .withPipelineId(pipelineId));
41 |
42 | String status = "";
43 | for(Field field : describeObjectsResult.getPipelineObjects().get(0).getFields()) {
44 | if (field.getKey().equals("@status")) {
45 | logger.info(field.getKey() + "=" + field.getStringValue());
46 | status = field.getStringValue();
47 | }
48 | }
49 |
50 | if (status.equals("CANCELED") || status.equals("FINISHED") || status.equals("FAILED")) {
51 | this.cancel();
52 | timer.cancel();
53 | }
54 | }
55 | }, 0, thirtySeconds);
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
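PipelineMonitor above polls the pipeline every 30 seconds until the export activity reaches CANCELED, FINISHED, or FAILED. The same status check can be done by hand with the AWS CLI; the pipeline id below is a placeholder.

```bash
# List the pipeline's object instances and their current status.
aws datapipeline list-runs --pipeline-id df-0123456789ABCDEFGHIJ
```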
/samples/DynamoDBExportJava/src/main/resources/log4j2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/samples/DynamoDBImport/XMLtoDynamoDBImport.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "name": "EmrClusterForBackup",
5 | "coreInstanceType": "m1.medium",
6 | "coreInstanceCount": "1",
7 | "masterInstanceType": "m1.medium",
8 | "amiVersion": "3.3.2",
9 | "id": "EmrClusterForBackup",
10 | "type": "EmrCluster",
11 | "terminateAfter": "2 Hours"
12 | },
13 | {
14 | "period": "1 days",
15 | "name": "Every 1 day",
16 | "id": "DefaultSchedule",
17 | "type": "Schedule",
18 | "startAt": "FIRST_ACTIVATION_DATE_TIME"
19 | },
20 | {
21 | "name": "DefaultDataFormat1",
22 | "column": "not_used STRING",
23 | "id": "DataFormatId_xqWRk",
24 | "myComment": "Format for the S3 Path",
25 | "type": "CSV"
26 | },
27 | {
28 | "failureAndRerunMode": "CASCADE",
29 | "schedule": {
30 | "ref": "DefaultSchedule"
31 | },
32 | "resourceRole": "DataPipelineDefaultResourceRole",
33 | "role": "DataPipelineDefaultRole",
34 | "pipelineLogUri": "#{myLogUri}",
35 | "scheduleType": "cron",
36 | "name": "Default",
37 | "id": "Default"
38 | },
39 | {
40 | "name": "ShellCommandActivityCp",
41 | "runsOn": { "ref" : "EmrClusterForBackup" },
42 | "id": "ActivityId_zrRQz",
43 | "type": "ShellCommandActivity",
44 | "command": "aws s3 cp s3://data-pipeline-samples/dynamodbxml/input/serde.xml /home/hadoop/serde-#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}.xml"
45 | },
46 | {
47 | "dataFormat": {
48 | "ref": "DDBExportFormat"
49 | },
50 | "name": "DynamoDB",
51 | "id": "DataNodeId_1ERqq",
52 | "type": "DynamoDBDataNode",
53 | "myComment": "The DynamoDB table from which we need to export data from",
54 | "tableName": "customers"
55 | },
56 | {
57 | "column": "not_used STRING",
58 | "name": "DDBExportFormat",
59 | "id": "DDBExportFormat",
60 | "type": "DynamoDBExportDataFormat",
61 | "myComment": "Format for the DynamoDB table"
62 | },
63 | {
64 | "directoryPath": "s3://data-pipeline-samples/dynamodbxml/input",
65 | "dataFormat": {
66 | "ref": "DataFormatId_xqWRk"
67 | },
68 | "name": "S3DataNode",
69 | "id": "DataNodeId_cnlSW",
70 | "type": "S3DataNode",
71 | "myComment": "The S3 path to which we export data to"
72 | },
73 | {
74 | "output": {
75 | "ref": "DataNodeId_1ERqq"
76 | },
77 | "input": {
78 | "ref": "DataNodeId_cnlSW"
79 | },
80 | "dependsOn": {
81 | "ref": "ActivityId_zrRQz"
82 | },
83 | "name": "TableBackupActivity",
84 | "hiveScript": "add jar s3://data-pipeline-samples/dynamodbxml/hivexmlserde-1.0.5.3.jar;\nDROP TABLE IF EXISTS xml_bank;\nCREATE EXTERNAL TABLE xml_bank(customer_id STRING, income string, demographics string, financial string)\nROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'\nWITH SERDEPROPERTIES (\n\"column.xpath.customer_id\"=\"/record/@customer_id\",\n\"column.xpath.income\"=\"/record/income/text()\",\n\"column.xpath.demographics\"=\"/record/demographics/*\",\n\"column.xpath.financial\"=\"/record/financial/*\"\n)\nSTORED AS\nINPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'\nTBLPROPERTIES (\n\"xmlinput.start\"=\"\"\n);\nLOAD DATA LOCAL inpath '/home/hadoop/serde-#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}.xml' overwrite into table xml_bank;\nDROP TABLE IF EXISTS hiveTableName;\nCREATE EXTERNAL TABLE hiveTableName (col1 string, col2 string, col3 string, col4 string)\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"customers\", \n\"dynamodb.column.mapping\" = \"col1:customer_id,col2:income,col3:demographics,col4:financial\"); \nINSERT OVERWRITE TABLE hiveTableName SELECT * FROM xml_bank;",
85 | "runsOn": { "ref" : "EmrClusterForBackup" },
86 | "id": "TableBackupActivity",
87 | "type": "HiveActivity",
88 | "myComment": "Activity used to run the hive script to export data to CSV"
89 | }
90 | ]
91 | }
92 |
--------------------------------------------------------------------------------
/samples/DynamoDBImport/readme.md:
--------------------------------------------------------------------------------
1 | # XML to DynamoDB Import
2 | 
3 | ## Running the sample pipeline
4 | The JSON definition can either be imported directly in the console (Console -> Create Pipeline) or used with the aws datapipeline CLI (see the example below).
5 | The pipeline definition copies an example XML file from s3://data-pipeline-samples/dynamodbxml/input/serde.xml to the local filesystem. This step is required for creating a temporary XML table using Hive. The Hive script is configured to run against a DynamoDB table with the keys "customer_id, financial, income, demographics". It finally imports the data from the temporary XML table into DynamoDB.
6 | The data from the XML file is parsed using the Hive XML SerDe. The parsing functionality is similar to XPath.
7 | The result is that the data is available in the DynamoDB table.
8 |
9 |
10 |
--------------------------------------------------------------------------------
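A minimal sketch of the CLI route mentioned above; the pipeline name and log bucket are placeholders, and myLogUri fills the #{myLogUri} reference in the definition (if the service rejects the undeclared parameter, substitute the log path directly into the JSON instead).

```bash
# Create the pipeline, register XMLtoDynamoDBImport.json, and activate it.
pipeline_id=$(aws datapipeline create-pipeline \
    --name xml-to-dynamodb-import --unique-id xml-to-dynamodb-import-token \
    --query pipelineId --output text)

aws datapipeline put-pipeline-definition \
    --pipeline-id "$pipeline_id" \
    --pipeline-definition file://XMLtoDynamoDBImport.json \
    --parameter-values myLogUri=s3://my-bucket/datapipeline-logs

aws datapipeline activate-pipeline --pipeline-id "$pipeline_id"
```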
/samples/DynamoDBImportCSV/CSVtoDynamoDB.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "myComment" : "Activity used to run the hive script to import CSV data",
5 | "output": {
6 | "ref": "DataNodeId_cnlSW"
7 | },
8 | "input": {
9 | "ref": "DataNodeId_1ERqq"
10 | },
11 | "name": "TableRestoreActivity",
12 | "hiveScript": "DROP TABLE IF EXISTS tempHiveTable;\n\nDROP TABLE IF EXISTS s3TempTable;\n\nCREATE EXTERNAL TABLE tempHiveTable (#{myDDBColDefn})\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"#{myDDBTableName}\", \"dynamodb.column.mapping\" = \"#{myDDBTableColMapping}\");\n \nCREATE EXTERNAL TABLE s3TempTable (#{myS3ColMapping})\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\\n' LOCATION '#{myInputS3Loc}';\n \nINSERT OVERWRITE TABLE tempHiveTable SELECT * FROM s3TempTable;",
13 | "id": "TableRestoreActivity",
14 | "runsOn": { "ref" : "EmrClusterForRestore" },
15 | "stage": "false",
16 | "type": "HiveActivity"
17 | },
18 | {
19 | "myComment" : "The DynamoDB table from which we need to import data from",
20 | "dataFormat": {
21 | "ref": "DDBExportFormat"
22 | },
23 | "name": "DynamoDB",
24 | "id": "DataNodeId_1ERqq",
25 | "type": "DynamoDBDataNode",
26 | "tableName": "#{myDDBTableName}"
27 | },
28 | {
29 | "failureAndRerunMode": "CASCADE",
30 | "resourceRole": "DataPipelineDefaultResourceRole",
31 | "role": "DataPipelineDefaultRole",
32 | "pipelineLogUri": "#{myLogUri}",
33 | "scheduleType": "ONDEMAND",
34 | "name": "Default",
35 | "id": "Default"
36 | },
37 | {
38 | "name": "EmrClusterForRestore",
39 | "coreInstanceType": "m1.medium",
40 | "coreInstanceCount": "1",
41 | "masterInstanceType": "m1.medium",
42 | "releaseLabel": "emr-4.4.0",
43 | "id": "EmrClusterForRestore",
44 | "type": "EmrCluster",
45 | "terminateAfter": "2 Hours"
46 | },
47 | {
48 | "myComment" : "The S3 path from which we import data from",
49 | "directoryPath": "#{myInputS3Loc}",
50 | "dataFormat": {
51 | "ref": "DataFormatId_xqWRk"
52 | },
53 | "name": "S3DataNode",
54 | "id": "DataNodeId_cnlSW",
55 | "type": "S3DataNode"
56 | },
57 | {
58 | "myComment" : "Format for the S3 Path",
59 | "name": "DefaultDataFormat1",
60 | "column": "not_used STRING",
61 | "id": "DataFormatId_xqWRk",
62 | "type": "CSV"
63 | },
64 | {
65 | "myComment" : "Format for the DynamoDB table",
66 | "name": "DDBExportFormat",
67 | "id": "DDBExportFormat",
68 | "column": "not_used STRING",
69 | "type": "DynamoDBExportDataFormat"
70 | }
71 | ],
72 | "parameters": [
73 | {
74 | "description": "Input S3 folder",
75 | "id": "myInputS3Loc",
76 | "default": "s3://datapipeline-sample-csv/",
77 | "type": "AWS::S3::ObjectKey"
78 | },
79 | {
80 | "description": "DynamoDB table name",
81 | "id": "myDDBTableName",
82 | "type": "String"
83 | },
84 | {
85 | "description": "S3 to DynamoDB Column Mapping",
86 | "id": "myDDBTableColMapping",
87 | "default" : "id:id,age:age,job:job,marital:marital,education:education,default:default,housing:housing,loan:loan,contact:contact,month:month,day_of_week:day_of_week,duration:duration,campaign:campaign,pdays:pdays,previous:previous,poutcome:poutcome,emp_var_rate:emp_var_rate,cons_price_idx:cons_price_idx,cons_conf_idx:cons_conf_idx,euribor3m:euribor3m,nr_employed:nr_employed,y:y",
88 | "type": "String"
89 | },
90 | {
91 | "description": "S3 Column Mappings",
92 | "id": "myS3ColMapping",
93 | "default" : "id string,age int,job string,marital string,education string,default string,housing string,loan string,contact string,month string,day_of_week string,duration int,campaign int,pdays int,previous int,poutcome string,emp_var_rate double,cons_price_idx double,cons_conf_idx double,euribor3m double,nr_employed double,y int",
94 | "type": "String"
95 | },
96 | {
97 | "description": "DynamoDB Column Mappings",
98 | "id": "myDDBColDefn",
99 | "default" : "id string,age bigint,job string,marital string,education string,default string,housing string,loan string,contact string,month string,day_of_week string,duration bigint,campaign bigint,pdays bigint,previous bigint,poutcome string,emp_var_rate double,cons_price_idx double,cons_conf_idx double,euribor3m double,nr_employed double,y bigint",
100 | "type": "String"
101 | },
102 | {
103 | "description": "DataPipeline Log Uri",
104 | "id": "myLogUri",
105 | "type": "AWS::S3::ObjectKey"
106 | }
107 | ]
108 | }
109 |
--------------------------------------------------------------------------------
/samples/DynamoDBImportCSV/README.md:
--------------------------------------------------------------------------------
 1 | # CSV to DynamoDB import
 2 | 
 3 | ## About the sample
 4 | The pipeline definition imports CSV data into a DynamoDB table.
 5 | 
 6 | ## Running the pipeline
 7 | 
 8 | Example DynamoDB table with keys: id
 9 | 
10 | The user needs to provide the following (an example CLI invocation is shown below):
11 | 
12 | 1. Input S3 folder: The S3 folder prefix from which the CSV data is imported.
13 | 2. DynamoDB read throughput ratio: The throughput ratio to use for the import operation.
14 | 3. DynamoDB table name: The table into which the data is imported.
15 | 4. S3 Column Mappings: Comma-separated column definitions, for example: customer_id string, income string, demographics string, financial string
16 | 5. DynamoDB Column Mappings: Comma-separated column definitions, for example: customer_id string, income string, demographics string, financial string
17 | 6. S3 to DynamoDB Column Mapping: A comma-separated mapping of S3 columns to DynamoDB attributes, e.g. customer_id:customer_id,income:income,demographics:demographics,financial:financial. Do not put spaces around the commas.
18 | 7. Log Uri: S3 path used to capture the pipeline logs.
19 |
--------------------------------------------------------------------------------
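As with the export sample above, the definition is registered with the AWS CLI; only the put-pipeline-definition step differs, shown here as a sketch with placeholder bucket and table names (create-pipeline and activate-pipeline are identical to the earlier example, and the column-mapping parameters fall back to the defaults in CSVtoDynamoDB.json unless overridden the same way).

```bash
aws datapipeline put-pipeline-definition \
    --pipeline-id "$pipeline_id" \
    --pipeline-definition file://CSVtoDynamoDB.json \
    --parameter-values \
        myInputS3Loc=s3://my-import-bucket/csv \
        myDDBTableName=customers \
        myLogUri=s3://my-import-bucket/logs
```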
/samples/EFSBackup/1-Node-EFSBackupPipeline.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects" : [
3 | {
4 | "id" : "Default",
5 | "scheduleType" : "cron",
6 | "failureAndRerunMode" : "CASCADE",
7 | "schedule" : {
8 | "ref" : "DefaultSchedule"
9 | },
10 | "name" : "Default",
11 | "role" : "DataPipelineDefaultRole",
12 | "resourceRole" : "DataPipelineDefaultResourceRole"
13 | },
14 | {
15 | "id" : "EC2ResourceObj",
16 | "terminateAfter" : "70 Minutes",
17 | "instanceType" : "#{myInstanceType}",
18 | "name" : "EC2ResourceObj",
19 | "type" : "Ec2Resource",
20 | "securityGroupIds" : [
21 | "#{mySrcSecGroupID}",
22 | "#{myBackupSecGroupID}"
23 | ],
24 | "subnetId" : "#{mySubnetID}",
25 | "associatePublicIpAddress" : "true",
26 | "imageId" : "#{myImageID}"
27 | },
28 | {
29 | "id" : "DefaultSchedule",
30 | "name" : "Every Day",
31 | "startAt" : "FIRST_ACTIVATION_DATE_TIME",
32 | "type" : "Schedule",
33 | "period" : "1 Days"
34 | },
35 | {
36 | "id" : "ShellCommandActivityObj",
37 | "name" : "ShellCommandActivityObj",
38 | "runsOn" : {
39 | "ref" : "EC2ResourceObj"
40 | },
41 | "command" : "#{myShellCmd}",
42 | "scriptArgument" : [
43 | "#{myEfsSource}",
44 | "#{myEfsBackup}",
45 | "#{myInterval}",
46 | "#{myRetainedBackups}",
47 | "#{myEfsID}"
48 | ],
49 | "type" : "ShellCommandActivity",
50 | "stage" : "true"
51 | }
52 | ],
53 | "parameters" : [
54 | {
55 | "id" : "myShellCmd",
56 | "default" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-backup.sh\nchmod a+x efs-backup.sh\n./efs-backup.sh $1 $2 $3 $4 $5",
57 | "description" : "Shell command to run.",
58 | "type" : "String"
59 | },
60 | {
61 | "id" : "myInstanceType",
62 | "default" : "m3.medium",
63 | "description" : "Instance type for creating backups.",
64 | "allowedValues" : [
65 | "t1.micro",
66 | "m3.medium",
67 | "m3.large",
68 | "m3.xlarge",
69 | "m3.2xlarge",
70 | "c3.large",
71 | "c3.xlarge",
72 | "c3.2xlarge",
73 | "c3.4xlarge",
74 | "c3.8xlarge"
75 | ],
76 | "type" : "String"
77 | },
78 | {
79 | "id" : "mySubnetID",
80 | "default" : "subnet-1234abcd",
81 | "description" : "VPC subnet for your backup EC2 instance (ideally the same subnet used for the production EFS mount point).",
82 | "type" : "String"
83 | },
84 | {
85 | "id" : "mySrcSecGroupID",
86 | "default" : "sg-1111111b",
87 | "description" : "Security group that can connect to the Production EFS mount point.",
88 | "type" : "String"
89 | },
90 | {
91 | "id" : "myBackupSecGroupID",
92 | "default" : "sg-9999999b",
93 | "description" : "Security group that can connect to the Backup EFS mount point.",
94 | "type" : "String"
95 | },
96 | {
97 | "id" : "myInterval",
98 | "default" : "daily",
99 | "description" : "Interval for backups.",
100 | "allowedValues" : [
101 | "hourly",
102 | "daily",
103 | "weekly",
104 | "monthly"
105 | ],
106 | "type" : "String"
107 | },
108 | {
109 | "id" : "myRetainedBackups",
110 | "default" : "7",
111 | "description" : "Number of backups to retain.",
112 | "type" : "Integer"
113 | },
114 | {
115 | "id" : "myEfsID",
116 | "default" : "backup-fs-12345678",
117 | "description" : "Name for the directory that will contain your backups.",
118 | "type" : "String"
119 | },
120 | {
121 | "id" : "myEfsSource",
122 | "default" : "10.0.1.32:/",
123 | "description" : "Production EFS mount target IP address.",
124 | "type" : "String"
125 | },
126 | {
127 | "id" : "myEfsBackup",
128 | "default" : "10.0.1.75:/",
129 | "description" : "Backup EFS mount target IP address.",
130 | "type" : "String"
131 | },
132 | {
133 | "id" : "myImageID",
134 | "default" : "ami-12345678",
135 | "description" : "AMI ID for the EC2 instance.",
136 | "type" : "String"
137 | }
138 | ]
139 | }
--------------------------------------------------------------------------------
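In this definition the ShellCommandActivity downloads efs-backup.sh and passes the five pipeline parameters through as positional arguments, in the order listed under scriptArgument. With the default parameter values above, the command that ends up running on the EC2 resource is roughly the following sketch:

```bash
# $1=source EFS mount target, $2=backup EFS mount target, $3=interval,
# $4=number of backups to retain, $5=backup directory name
./efs-backup.sh 10.0.1.32:/ 10.0.1.75:/ daily 7 backup-fs-12345678
```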
/samples/EFSBackup/1-Node-EFSRestorePipeline.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects" : [
3 | {
4 | "id" : "Default",
5 | "scheduleType" : "cron",
6 | "failureAndRerunMode" : "CASCADE",
7 | "schedule" : {
8 | "ref" : "DefaultSchedule"
9 | },
10 | "name" : "Default",
11 | "role" : "DataPipelineDefaultRole",
12 | "resourceRole" : "DataPipelineDefaultResourceRole"
13 | },
14 | {
15 | "id" : "EC2ResourceObj",
16 | "terminateAfter" : "70 Minutes",
17 | "instanceType" : "#{myInstanceType}",
18 | "name" : "EC2ResourceObj",
19 | "type" : "Ec2Resource",
20 | "securityGroupIds" : [
21 | "#{mySrcSecGroupID}",
22 | "#{myBackupSecGroupID}"
23 | ],
24 | "subnetId" : "#{mySubnetID}",
25 | "associatePublicIpAddress" : "true",
26 | "imageId" : "#{myImageID}"
27 | },
28 | {
29 | "id" : "DefaultSchedule",
30 | "name" : "Every Day",
31 | "startAt" : "FIRST_ACTIVATION_DATE_TIME",
32 | "type" : "Schedule",
33 | "occurrences" : "1",
34 | "period" : "1 Days"
35 | },
36 | {
37 | "id" : "ShellCommandActivityObj",
38 | "name" : "ShellCommandActivityObj",
39 | "runsOn" : {
40 | "ref" : "EC2ResourceObj"
41 | },
42 | "command" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-restore.sh\nchmod a+x efs-restore.sh\n./efs-restore.sh $1 $2 $3 $4 $5",
43 | "scriptArgument" : [
44 | "#{myEfsSource}",
45 | "#{myEfsBackup}",
46 | "#{myInterval}",
47 | "#{myBackup}",
48 | "#{myEfsID}"
49 | ],
50 | "type" : "ShellCommandActivity",
51 | "stage" : "true"
52 | }
53 | ],
54 | "parameters" : [
55 | {
56 | "id" : "myInstanceType",
57 | "default" : "m3.large",
58 | "description" : "Instance type for performing the restore.",
59 | "allowedValues" : [
60 | "t1.micro",
61 | "m3.medium",
62 | "m3.large",
63 | "m3.xlarge",
64 | "m3.2xlarge",
65 | "c3.large",
66 | "c3.xlarge",
67 | "c3.2xlarge",
68 | "c3.4xlarge",
69 | "c3.8xlarge"
70 | ],
71 | "type" : "String"
72 | },
73 | {
74 | "id" : "mySubnetID",
75 | "default" : "subnet-1234abcd",
76 | "description" : "VPC subnet for your restoration EC2 instance (ideally the same subnet used for the backup EFS mount point).",
77 | "type" : "String"
78 | },
79 | {
80 | "id" : "mySrcSecGroupID",
81 | "default" : "sg-1111111b",
82 | "description" : "Security group that can connect to the Production EFS mount point.",
83 | "type" : "String"
84 | },
85 | {
86 | "id" : "myBackupSecGroupID",
87 | "default" : "sg-9999999b",
88 | "description" : "Security group that can connect to the Backup EFS mount point.",
89 | "type" : "String"
90 | },
91 | {
92 | "id" : "myInterval",
93 | "default" : "daily",
94 | "description" : "Interval that you chose for the backup your going to restore.",
95 | "allowedValues" : [
96 | "hourly",
97 | "daily",
98 | "weekly",
99 | "monthly"
100 | ],
101 | "type" : "String"
102 | },
103 | {
104 | "id" : "myBackup",
105 | "default" : "0",
106 | "description" : "Backup number to restore (0 = the most recent backup).",
107 | "type" : "Integer"
108 | },
109 | {
110 | "id" : "myEfsID",
111 | "default" : "backup-fs-12345678",
112 | "description" : "Name for the directory that already contains your backups.",
113 | "type" : "String"
114 | },
115 | {
116 | "id" : "myEfsSource",
117 | "default" : "10.0.1.32:/",
118 | "description" : "Production EFS mount target IP address.",
119 | "type" : "String"
120 | },
121 | {
122 | "id" : "myEfsBackup",
123 | "default" : "10.0.1.75:/",
124 | "description" : "Backup EFS mount target IP address.",
125 | "type" : "String"
126 | },
127 | {
128 | "id" : "myImageID",
129 | "default" : "ami-12345678",
130 | "description" : "AMI ID for the EC2 instance.",
131 | "type" : "String"
132 | }
133 | ]
134 | }
--------------------------------------------------------------------------------
/samples/EFSBackup/2-Node-EFSRestorePipeline.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects" : [
3 | {
4 | "id" : "Default",
5 | "scheduleType" : "cron",
6 | "failureAndRerunMode" : "CASCADE",
7 | "schedule" : {
8 | "ref" : "DefaultSchedule"
9 | },
10 | "name" : "Default",
11 | "role" : "DataPipelineDefaultRole",
12 | "resourceRole" : "DataPipelineDefaultResourceRole"
13 | },
14 | {
15 | "id" : "EC2Resource1",
16 | "terminateAfter" : "70 Minutes",
17 | "instanceType" : "#{myInstanceType}",
18 | "name" : "EC2Resource1",
19 | "type" : "Ec2Resource",
20 | "securityGroupIds" : [
21 | "#{mySrcSecGroupID}",
22 | "#{myBackupSecGroupID}"
23 | ],
24 | "subnetId" : "#{mySubnetID}",
25 | "associatePublicIpAddress" : "true",
26 | "imageId" : "#{myImageID}"
27 | },
28 | {
29 | "id" : "EC2Resource2",
30 | "terminateAfter" : "70 Minutes",
31 | "instanceType" : "#{myInstanceType}",
32 | "name" : "EC2Resource2",
33 | "type" : "Ec2Resource",
34 | "securityGroupIds" : [
35 | "#{mySrcSecGroupID}",
36 | "#{myBackupSecGroupID}"
37 | ],
38 | "subnetId" : "#{mySubnetID}",
39 | "associatePublicIpAddress" : "true",
40 | "imageId" : "#{myImageID}"
41 | },
42 | {
43 | "id" : "DefaultSchedule",
44 | "name" : "RunOnce",
45 | "startAt" : "FIRST_ACTIVATION_DATE_TIME",
46 | "type" : "Schedule",
47 | "occurrences" : "1",
48 | "period" : "1 Days"
49 | },
50 | {
51 | "id" : "RestorePart1",
52 | "name" : "RestorePart1",
53 | "runsOn" : {
54 | "ref" : "EC2Resource1"
55 | },
56 | "command" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-restore-rsync.sh\nchmod a+x efs-restore-rsync.sh\n./efs-restore-rsync.sh $1 $2 $3 $4 $5 $6 $7",
57 | "scriptArgument" : [
58 | "#{myEfsSource}",
59 | "#{myEfsBackup}",
60 | "#{myInterval}",
61 | "#{myBackup}",
62 | "#{myEfsID}",
63 | "1",
64 | "2"
65 | ],
66 | "type" : "ShellCommandActivity",
67 | "stage" : "true"
68 | },
69 | {
70 | "id" : "RestorePart2",
71 | "name" : "RestorePart2",
72 | "runsOn" : {
73 | "ref" : "EC2Resource2"
74 | },
75 | "command" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-restore-rsync.sh\nchmod a+x efs-restore-rsync.sh\n./efs-restore-rsync.sh $1 $2 $3 $4 $5 $6 $7",
76 | "scriptArgument" : [
77 | "#{myEfsSource}",
78 | "#{myEfsBackup}",
79 | "#{myInterval}",
80 | "#{myBackup}",
81 | "#{myEfsID}",
82 | "0",
83 | "2"
84 | ],
85 | "type" : "ShellCommandActivity",
86 | "stage" : "true"
87 | }
88 | ],
89 | "parameters" : [
90 | {
91 | "id" : "myInstanceType",
92 | "default" : "m3.large",
93 | "description" : "Instance type for performing the restore.",
94 | "allowedValues" : [
95 | "t1.micro",
96 | "m3.medium",
97 | "m3.large",
98 | "m3.xlarge",
99 | "m3.2xlarge",
100 | "c3.large",
101 | "c3.xlarge",
102 | "c3.2xlarge",
103 | "c3.4xlarge",
104 | "c3.8xlarge"
105 | ],
106 | "type" : "String"
107 | },
108 | {
109 | "id" : "mySubnetID",
110 | "default" : "subnet-1234abcd",
111 | "description" : "VPC subnet for your restoration EC2 instance (ideally the same subnet used for the backup EFS mount point).",
112 | "type" : "String"
113 | },
114 | {
115 | "id" : "mySrcSecGroupID",
116 | "default" : "sg-1111111b",
117 | "description" : "Security group that can connect to the Production EFS mount point.",
118 | "type" : "String"
119 | },
120 | {
121 | "id" : "myBackupSecGroupID",
122 | "default" : "sg-9999999b",
123 | "description" : "Security group that can connect to the Backup EFS mount point.",
124 | "type" : "String"
125 | },
126 | {
127 | "id" : "myInterval",
128 | "default" : "daily",
129 | "description" : "Interval for backups.",
130 | "allowedValues" : [
131 | "hourly",
132 | "daily",
133 | "weekly",
134 | "monthly"
135 | ],
136 | "type" : "String"
137 | },
138 | {
139 | "id" : "myBackup",
140 | "default" : "0",
141 | "description" : "Backup number to restore (0 = the most recent backup).",
142 | "type" : "Integer"
143 | },
144 | {
145 | "id" : "myEfsID",
146 | "default" : "backup-fs-12345678",
147 | "description" : "Name for the directory that already contains your backups",
148 | "type" : "String"
149 | },
150 | {
151 | "id" : "myEfsSource",
152 | "default" : "10.0.1.32:/",
153 | "description" : "Production EFS mount target IP address.",
154 | "type" : "String"
155 | },
156 | {
157 | "id" : "myEfsBackup",
158 | "default" : "10.0.1.75:/",
159 | "description" : "Backup EFS mount target IP address.",
160 | "type" : "String"
161 | },
162 | {
163 | "id" : "myImageID",
164 | "default" : "ami-12345678",
165 | "description" : "AMI ID for the EC2 instance.",
166 | "type" : "String"
167 | }
168 | ]
169 | }
--------------------------------------------------------------------------------
/samples/EFSBackup/efs-backup-end.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Input arguments
4 | interval=$1
5 | efsid=$2
6 |
7 | echo "sudo touch /mnt/backups/$efsid/$interval.0/"
8 | sudo touch /mnt/backups/$efsid/$interval.0/
9 | echo "$interval: completed successfully"
10 |
--------------------------------------------------------------------------------
/samples/EFSBackup/efs-backup-init.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Initialization of EFS backup
3 |
4 | # Input arguments
5 | source=$1
6 | destination=$2
7 | interval=$3
8 | retain=$4
9 | efsid=$5
10 |
11 | # Prepare system for rsync
12 | echo 'sudo yum -y install nfs-utils'
13 | sudo yum -y install nfs-utils
14 | if [ ! -d /backup ]; then
15 | echo 'sudo mkdir /backup'
16 | sudo mkdir /backup
17 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup"
18 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup
19 | fi
20 | if [ ! -d /mnt/backups ]; then
21 | echo 'sudo mkdir /mnt/backups'
22 | sudo mkdir /mnt/backups
23 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups"
24 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups
25 | fi
26 |
27 | # we need to decrement retain because we start counting with 0 and we need to remove the oldest backup
28 | let "retain=$retain-1"
29 | if sudo test -d /mnt/backups/$efsid/$interval.$retain; then
30 | echo "sudo rm -rf /mnt/backups/$efsid/$interval.$retain"
31 | sudo rm -rf /mnt/backups/$efsid/$interval.$retain
32 | fi
33 |
34 | # Rotate all previous backups (except the first one), up one level
35 | for x in `seq $retain -1 2`; do
36 | if sudo test -d /mnt/backups/$efsid/$interval.$[$x-1]; then
37 | echo "sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x"
38 | sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x
39 | fi
40 | done
41 |
42 | # Copy first backup with hard links, then replace first backup with new backup
43 | if sudo test -d /mnt/backups/$efsid/$interval.0 ; then
44 | echo "sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1"
45 | sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1
46 | fi
47 | if [ ! -d /mnt/backups/$efsid ]; then
48 | echo "sudo mkdir -p /mnt/backups/$efsid"
49 | sudo mkdir -p /mnt/backups/$efsid
50 | echo "sudo chmod 700 /mnt/backups/$efsid"
51 | sudo chmod 700 /mnt/backups/$efsid
52 | fi
53 | if [ ! -d /mnt/backups/efsbackup-logs ]; then
54 | echo "sudo mkdir -p /mnt/backups/efsbackup-logs"
55 | sudo mkdir -p /mnt/backups/efsbackup-logs
56 | echo "sudo chmod 700 /mnt/backups/efsbackup-logs"
57 | sudo chmod 700 /mnt/backups/efsbackup-logs
58 | fi
59 | if [ -f /tmp/efs-backup.log ]; then
60 | echo "sudo rm /tmp/efs-backup.log"
61 | sudo rm /tmp/efs-backup.log
62 | fi
63 |
--------------------------------------------------------------------------------
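The rotation in efs-backup-init.sh is easier to see with concrete numbers. The sketch below replays the script's arithmetic for interval=daily and retain=7 using echo instead of the real rm/mv/cp commands:

```bash
#!/bin/bash
# Worked example of the backup rotation for interval=daily, retain=7.
retain=7
let "retain=$retain-1"              # retain is now 6: kept backups are daily.0 .. daily.6
echo "delete daily.$retain"         # the oldest copy, daily.6, is removed first
for x in $(seq $retain -1 2); do    # then daily.5->daily.6, ..., daily.1->daily.2
    echo "move daily.$[$x-1] to daily.$x"
done
echo "hard-link daily.0 to daily.1 (cp -al), then rsync new data into daily.0"
```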
/samples/EFSBackup/efs-backup-rsync.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Input arguments
4 | source=$1
5 | destination=$2
6 | interval=$3
7 | retain=$4
8 | efsid=$5
9 | clientNum=$6
10 | numClients=$7
11 |
12 |
13 | # Prepare system for rsync
14 | echo 'sudo yum -y install nfs-utils'
15 | sudo yum -y install nfs-utils
16 | if [ ! -d /backup ]; then
17 | echo 'sudo mkdir /backup'
18 | sudo mkdir /backup
19 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup"
20 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup
21 | fi
22 | if [ ! -d /mnt/backups ]; then
23 | echo 'sudo mkdir /mnt/backups'
24 | sudo mkdir /mnt/backups
25 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups"
26 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups
27 | fi
28 |
29 | if [ -f /tmp/efs-backup.log ]; then
30 | echo "sudo rm /tmp/efs-backup.log"
31 | sudo rm /tmp/efs-backup.log
32 | fi
33 |
34 | #Copy all content this node is responsible for
35 | for myContent in `sudo ls -a --ignore . --ignore .. /backup/ | awk 'NR%'$numClients==$clientNum`; do
36 | echo "sudo rsync -s -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/$myContent /mnt/backups/$efsid/$interval.0/"
37 | sudo rsync -s -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/"$myContent" /mnt/backups/$efsid/$interval.0/
38 | rsyncStatus=$?
39 | done
40 |
41 | if [ -f /tmp/efs-backup.log ]; then
42 | echo "sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log"
43 | sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log
44 | fi
45 | exit $rsyncStatus
46 |
--------------------------------------------------------------------------------
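The awk filter in efs-backup-rsync.sh is how each node picks its share of the top-level entries: entry number NR is handled by the node whose clientNum equals NR mod numClients (the 2-node restore pipeline passes clientNum=1 and clientNum=0 for its two resources). A small sketch with two nodes and hypothetical directory names:

```bash
# Simulate the split used in the script for numClients=2.
printf '%s\n' dirA dirB dirC dirD dirE > /tmp/entries.txt

# Node with clientNum=1 takes entries 1, 3, 5, ... -> dirA dirC dirE
awk 'NR%2==1' /tmp/entries.txt

# Node with clientNum=0 takes entries 2, 4, ... -> dirB dirD
awk 'NR%2==0' /tmp/entries.txt
```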
/samples/EFSBackup/efs-backup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Example would be to run this script as follows:
3 | # Every 6 hours; retain last 4 backups
4 | # efs-backup.sh $src $dst hourly 4 efs-12345
5 | # Once a day; retain last 31 days
6 | # efs-backup.sh $src $dst daily 31 efs-12345
7 | # Once a week; retain 4 weeks of backup
8 | # efs-backup.sh $src $dst weekly 4 efs-12345
9 | # Once a month; retain 3 months of backups
10 | # efs-backup.sh $src $dst monthly 3 efs-12345
11 | #
12 | # Snapshots will look like:
13 | # $dst/$efsid/hourly.0-3; daily.0-30; weekly.0-3; monthly.0-2
14 |
15 |
16 | # Input arguments
17 | source=$1
18 | destination=$2
19 | interval=$3
20 | retain=$4
21 | efsid=$5
22 |
23 | # Prepare system for rsync
24 | #echo 'sudo yum -y update'
25 | #sudo yum -y update
26 | echo 'sudo yum -y install nfs-utils'
27 | sudo yum -y install nfs-utils
28 | echo 'sudo mkdir /backup'
29 | sudo mkdir /backup
30 | echo 'sudo mkdir /mnt/backups'
31 | sudo mkdir /mnt/backups
32 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup"
33 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup
34 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups"
35 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups
36 |
37 | # we need to decrement retain because we start counting with 0 and we need to remove the oldest backup
38 | let "retain=$retain-1"
39 | if sudo test -d /mnt/backups/$efsid/$interval.$retain; then
40 | echo "sudo rm -rf /mnt/backups/$efsid/$interval.$retain"
41 | sudo rm -rf /mnt/backups/$efsid/$interval.$retain
42 | fi
43 |
44 |
45 | # Rotate all previous backups (except the first one), up one level
46 | for x in `seq $retain -1 2`; do
47 | if sudo test -d /mnt/backups/$efsid/$interval.$[$x-1]; then
48 | echo "sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x"
49 | sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x
50 | fi
51 | done
52 |
53 | # Copy first backup with hard links, then replace first backup with new backup
54 | if sudo test -d /mnt/backups/$efsid/$interval.0 ; then
55 | echo "sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1"
56 | sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1
57 | fi
58 | if [ ! -d /mnt/backups/$efsid ]; then
59 | echo "sudo mkdir -p /mnt/backups/$efsid"
60 | sudo mkdir -p /mnt/backups/$efsid
61 | echo "sudo chmod 700 /mnt/backups/$efsid"
62 | sudo chmod 700 /mnt/backups/$efsid
63 | fi
64 | if [ ! -d /mnt/backups/efsbackup-logs ]; then
65 | echo "sudo mkdir -p /mnt/backups/efsbackup-logs"
66 | sudo mkdir -p /mnt/backups/efsbackup-logs
67 | echo "sudo chmod 700 /mnt/backups/efsbackup-logs"
68 | sudo chmod 700 /mnt/backups/efsbackup-logs
69 | fi
70 | echo "sudo rm /tmp/efs-backup.log"
71 | sudo rm /tmp/efs-backup.log
72 | echo "sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/ /mnt/backups/$efsid/$interval.0/"
73 | sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/ /mnt/backups/$efsid/$interval.0/
74 | rsyncStatus=$?
75 | echo "sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-`date +%Y%m%d-%H%M`.log"
76 | sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-`date +%Y%m%d-%H%M`.log
77 | echo "sudo touch /mnt/backups/$efsid/$interval.0/"
78 | sudo touch /mnt/backups/$efsid/$interval.0/
79 | exit $rsyncStatus
80 |
--------------------------------------------------------------------------------
/samples/EFSBackup/efs-restore-rsync.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Input arguments
4 | source=$1
5 | destination=$2
6 | interval=$3
7 | backupNum=$4
8 | efsid=$5
9 | clientNum=$6
10 | numClients=$7
11 |
12 |
13 | # Prepare system for rsync
14 | echo 'sudo yum -y install nfs-utils'
15 | sudo yum -y install nfs-utils
16 |
17 | if [ ! -d /backup ]; then
18 | echo 'sudo mkdir /backup'
19 | sudo mkdir /backup
20 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup"
21 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup
22 | fi
23 | if [ ! -d /mnt/backups ]; then
24 | echo 'sudo mkdir /mnt/backups'
25 | sudo mkdir /mnt/backups
26 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups"
27 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups
28 | fi
29 |
30 | if [ -f /tmp/efs-restore.log ]; then
31 | echo "sudo rm /tmp/efs-restore.log"
32 | sudo rm /tmp/efs-restore.log
33 | fi
34 |
35 | # Copy all content this node is responsible for
36 | for myContent in `sudo ls -a --ignore . --ignore .. /mnt/backups/$efsid/$interval.$backupNum | awk 'NR%'$numClients==$clientNum`; do
37 | echo "sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum /backup/"
38 | sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum/$myContent /backup/
39 | rsyncStatus=$?
40 | done
41 |
42 | if [ -f /tmp/efs-restore.log ]; then
43 | echo "sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log"
44 | sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log
45 | fi
46 | exit $rsyncStatus
47 |
--------------------------------------------------------------------------------
/samples/EFSBackup/efs-restore.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Input arguments
4 | source=$1
5 | destination=$2
6 | interval=$3
7 | backupNum=$4
8 | efsid=$5
9 |
10 | # Prepare system for rsync
11 | echo 'sudo yum -y install nfs-utils'
12 | sudo yum -y install nfs-utils
13 | echo 'sudo mkdir /backup'
14 | sudo mkdir /backup
15 | echo 'sudo mkdir /mnt/backups'
16 | sudo mkdir /mnt/backups
17 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup"
18 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup
19 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups"
20 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups
21 |
22 | if ! sudo test -d /mnt/backups/$efsid/$interval.$backupNum/; then
23 | echo "EFS Backup $efsid/$interval.$backupNum does not exist!"
24 | exit 1
25 | fi
26 |
27 | echo "sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum/ /backup/"
28 | sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum/ /backup/
29 | rsyncStatus=$?
30 | echo "sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-`date +%Y%m%d-%H%M`.log"
31 | sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-`date +%Y%m%d-%H%M`.log
32 | exit $rsyncStatus
33 |
--------------------------------------------------------------------------------
/samples/ExampleTemplate/README.md:
--------------------------------------------------------------------------------
1 | # {{Example Name}}
2 |
3 | {{Description of activities performed in the example}}
4 |
5 | ## Parameters
6 |
7 | Parameter | Required | Description
8 | ----------|----------|------------
9 | {{Parameter Name}} | {{yes/no}} | {{Description}} {{Example or Default}}
10 |
11 | ## Setup (Optional)
12 |
13 | You can use the setup script in the sample directory to create {{resources}} to use in this example.
14 | You can skip this step if you have {{resources}} that you want to use. The script will take a minute
15 | to complete, and when it's finished it will print the resource identifier of the
16 | {{resources}} that it created.
17 |
18 | ```sh
19 | $> python setup.py
20 | ```
21 |
22 | If the script fails with an ImportError, you may need to [set up your virtualenv](https://github.com/awslabs/data-pipeline-samples#setup).
23 |
24 | ## Running this sample
25 |
26 | Create a new pipeline. Throughout this section we assume that the {{Example Directory}} sample directory is
27 | your current working directory.
28 |
29 | ```sh
30 | $> aws datapipeline create-pipeline --name {{example_name}} --unique-id {{example_name}}
31 | # {
32 | # "pipelineId": "df-03971252U4AVY60545T7"
33 | # }
34 | ```
35 |
36 | Upload the [pipeline definition](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html). Use the `pipelineId` that was returned by the `create-pipeline`
37 | command. Specify the name of an S3 bucket where the output from pipeline activities will be stored.
38 | This will either be the bucket name that was printed by the setup script or another bucket that
39 | you've created. You can also specify any optional parameters for this example here.
40 |
41 |
42 | ```sh
43 | $> aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file://{{pipeline_definition.json}} {{--parameter-values values}}
44 | # {
45 | # "errored": false,
46 | # "validationWarnings": [],
47 | # "validationErrors": []
48 | # }
49 | ```
50 |
51 | Activate the pipeline. Use the `pipelineId` that was returned by the `create-pipeline` command.
52 |
53 | ```sh
54 | $> aws datapipeline activate-pipeline --pipeline-id
55 | ```
56 |
57 | Optionally, check the status of your running pipeline. Use the `pipelineId` that was returned by the
58 | `create-pipeline` command. When the pipeline has completed, the Status Ended column in the output
59 | from this command will show FINISHED for all pipeline nodes.
60 |
61 | ```sh
62 |
63 | >$ aws datapipeline list-runs --pipeline-id
64 | # {{example output}}
65 |
66 | ```
67 |
68 | {{what happens when the pipeline is finished}}
69 |
70 | ## Next steps
71 |
72 | {{things to try next}}
73 |
74 | Once the pipeline has completed, you can delete it with the following command. If you try to run the
75 | sample again without deleting it first, you may see errors or unexpected behavior.
76 |
77 | ```sh
78 | $> aws datapipeline delete-pipeline --pipeline-id
79 | ```
80 |
81 | The resources used by this example will incur normal charges. If you provisioned resources using the
82 | setup script, you can free them by running the following command in the sample directory.
83 |
84 | ```sh
85 | $> python setup.py --teardown
86 | ```
87 |
88 | ## Disclaimer
89 |
90 | The samples in this repository are meant to help users get started with Data Pipeline. They may not
91 | be sufficient for production environments. Users should carefully inspect samples before running
92 | them.
93 |
94 | *Use at your own risk.*
95 |
96 | Licensed under the MIT-0 License.
97 |
--------------------------------------------------------------------------------
/samples/HadoopTerasort/process-jhist.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo "Number of arguments: $#"
4 | echo "Arguments: $@"
5 | S3_Target=$1
6 | echo "S3 Target output path: $S3_Target"
7 |
8 | # --------------------------------------------------------------
9 | # TeraSort Benchmark JHIST Publish Script
10 | # This script is a reference script.
11 | # TeraSortHadoopBenchmark pipeline uses the script hosted at: s3://datapipeline-us-east-1/sample-scripts/HadoopTeraSort/process-jhist.sh
12 | # --------------------------------------------------------------
13 |
14 | # --------------------------------------------------------------
15 | # Any code, applications, scripts, templates, proofs of concept, documentation and other items provided by AWS under this SOW are AWS Content, as defined in the Agreement, and are provided for illustration purposes only. All such AWS Content is provided solely at the option of AWS, and is subject to the terms of the Addendum and the Agreement. Customer is solely responsible for using, deploying, testing, and supporting any code and applications provided by AWS under the current SOW.
16 | # --------------------------------------------------------------
17 |
18 | # --------------------------------------------------------------
19 | # CHANGE LOG:
20 | # --------------------------------------------------------------
21 | # 2015-04-28 RG v0.1 - Initial script
22 | # 2015-04-28 RG v0.2 - Added TeraSort & TeraValidate JHIST Processing Activities
23 | # 2015-09-01 AR v0.3 - Output to S3 target path
24 | # 2015-11-19 JT v0.4 - Update file name parsing and use mapred command
25 | # --------------------------------------------------------------
26 |
27 | # --------------------------------------------------------------
28 | # Define Variables
29 | # --------------------------------------------------------------
30 |
31 |
32 |
33 |
34 | # --------------------------------------------------------------
35 | # Process JHIST File
36 | # --------------------------------------------------------------
37 |
38 | path_to_jhist() {
39 | # perl incantation to extract the path from the ls command
40 | # via: http://stackoverflow.com/questions/21569172/how-to-list-only-file-name-in-hdfs
41 | hdfs dfs -ls -R / | grep $1 | perl -wlne 'print +(split " ",$_,8)[7]'
42 | }
43 |
44 | TeraGen=$(path_to_jhist TeraGen)
45 | mapred job -history all $TeraGen > TeraGen-results.txt
46 |
47 | TeraSort=$(path_to_jhist TeraSort)
48 | mapred job -history all $TeraSort > TeraSort-results.txt
49 |
50 | TeraValidate=$(path_to_jhist TeraValidate)
51 | mapred job -history all ${TeraValidate} > TeraValidate-results.txt
52 |
53 | # --------------------------------------------------------------
54 | # Copy to S3
55 | # --------------------------------------------------------------
56 |
57 | gensecondline=`sed -n '2{p;q}' TeraGen-results.txt`;
58 | genjob=${gensecondline:12}
59 | date=$(date +"%m-%d-%y")
60 | aws s3 cp TeraGen-results.txt $S3_Target/$date-$genjob/results/
61 | aws s3 cp TeraSort-results.txt $S3_Target/$date-$genjob/results/
62 | aws s3 cp TeraValidate-results.txt $S3_Target/$date-$genjob/results/
63 | aws s3 cp /home/hadoop/conf $S3_Target/$date-$genjob/conf/ --recursive
64 |
65 | exit 0
66 |
--------------------------------------------------------------------------------
/samples/HadoopTerasort/setup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("../../setup")
3 |
4 | from stacker import Stacker
5 |
6 | s = Stacker(
7 | "dpl-samples-hadoop-terasort",
8 | {
9 | "Resources": {
10 | "S3Bucket": {
11 | "Type": "AWS::S3::Bucket",
12 | "DeletionPolicy": "Delete"
13 | }
14 | }
15 | })
16 |
17 | s.run(sys.argv)
18 |
--------------------------------------------------------------------------------
/samples/InvokeLambda/README.md:
--------------------------------------------------------------------------------
1 | # Data Pipeline InvokeLambda Sample
2 |
3 | ## Overview
4 |
5 | This sample shows how to build a Shell Command Activity pipeline that invokes an AWS Lambda function.
6 |
7 | ## Prerequisites
8 |
9 | You must have the AWS CLI and default IAM roles set up in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) of the base repository for instructions on how to do this.
10 |
11 | ## Run this sample pipeline using the AWS CLI
12 |
13 | ```sh
14 | $> aws datapipeline create-pipeline --name invoke_lambda_pipeline --unique-id invoke_lambda_pipeline
15 | ```
16 |
17 | You receive a pipelineId like this.
18 | ```sh
19 | # -----------------------------------------
20 | # | CreatePipeline |
21 | # +-------------+--------------------------+
22 | # | pipelineId | |
23 | # +-------------+--------------------------+
24 | ```
25 |
26 | ```sh
27 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://invokelambda.json --parameter-values myLambdaFunction= myS3LogsPath=s3:///path --pipeline-id
28 | ```
29 |
30 | You receive validation messages like this
31 | ```sh
32 | # -----------------------
33 | # |PutPipelineDefinition|
34 | # +-----------+---------+
35 | # | errored | False |
36 | # +-----------+---------+
37 | ```
38 |
39 | Now activate the pipeline
40 | ```sh
41 | $> aws datapipeline activate-pipeline --pipeline-id
42 | ```
43 |
44 | Check the status of your pipeline
45 | ```
46 | >$ aws datapipeline list-runs --pipeline-id
47 | ```
48 |
49 | You will receive status information on the pipeline.
50 | ```sh
51 | Name Scheduled Start Status
52 | ID Started Ended
53 | ---------------------------------------------------------------------------------------------------
54 | 1. Invoke_Lambda_Activity 2016-03-23T18:40:31 WAITING_FOR_RUNNER
55 | @Invoke_Lambda_Activity_2016-03-23T18:40:31 2016-03-23T18:40:35
56 |
57 | 2. New_EC2Instance 2016-03-23T18:40:31 CREATING
58 | @New_EC2Instance_2016-03-23T18:40:31 2016-03-23T18:40:36
59 |
60 | ```
61 |
62 |
63 | ## Disclaimer
64 |
65 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them.
66 |
67 | Use at your own risk.
68 |
69 | Licensed under the MIT-0 License.
70 |
--------------------------------------------------------------------------------
/samples/InvokeLambda/invokelambda.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "myComment": "This object is used to set default configuration for objects in the pipeline",
5 |
6 | "id": "Default",
7 | "failureAndRerunMode": "CASCADE",
8 | "resourceRole": "DataPipelineDefaultResourceRole",
9 | "role": "DataPipelineDefaultRole",
10 | "pipelineLogUri": "#{myS3LogsPath}",
11 | "scheduleType": "ONDEMAND"
12 | },
13 | {
14 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.",
15 |
16 | "type": "Ec2Resource",
17 | "id": "New_EC2Instance",
18 | "name": "New_EC2Instance",
19 | "terminateAfter": "1 Hour",
20 | "imageId": "#{myImageId}",
21 | "region": "#{myRegion}",
22 | "instanceType": "#{myInstanceType}",
23 | "resourceRole": "DataPipelineDefaultResourceRole",
24 | "role": "DataPipelineDefaultRole"
25 | },
26 | {
27 | "myComment": "This object is a ShellCommandActivity. It is used to specify the command linux shell command that will be invoked. In this case, it invokes Lambda Function.",
28 |
29 | "id": "Invoke_Lambda_Activity",
30 | "name": "Invoke_Lambda_Activity",
31 | "type": "ShellCommandActivity",
32 | "runsOn": {
33 | "ref": "New_EC2Instance"
34 | },
35 | "command": "aws lambda --region #{myRegion} invoke --function-name #{myLambdaFunction} outfile.txt"
36 | }
37 | ],
38 | "parameters": [
39 | {
40 | "myComment": "This Parameter specifies the S3 logging path for the pipeline. It is used by the 'Default' object to set the 'pipelineLogUri' value. Using Parameters helps users avoid hard coding variables in pipeline definitions. Users can instead supply these parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline-definition'.",
41 |
42 | "id" : "myS3LogsPath",
43 | "type" : "AWS::S3::ObjectKey",
44 | "description" : "S3 path for pipeline logs."
45 | },
46 | {
47 | "myComment": "This Parameter specifies the Lambda function name.",
48 |
49 | "id" : "myLambdaFunction",
50 | "type" : "String",
51 | "description" : "Lambda Function name"
52 | },
53 | {
54 | "myComment": "This Parameter specifies region",
55 |
56 | "id" : "myRegion",
57 | "type" : "String",
58 | "default" : "us-east-1",
59 | "description" : "Region"
60 | },
61 | {
62 | "myComment": "This Parameter specifies image id",
63 |
64 | "id" : "myImageId",
65 | "type" : "String",
66 | "default" : "ami-8fcee4e5",
67 | "description" : "Image Id"
68 | },
69 | {
70 | "myComment": "This Parameter specifies instance type",
71 |
72 | "id" : "myInstanceType",
73 | "type" : "String",
74 | "default" : "m3.medium",
75 | "description" : "Instance Type"
76 | }
77 | ]
78 | }
79 |
--------------------------------------------------------------------------------
/samples/LoadTsvFilesInS3ToRedshift/LoadTsvFilesInS3ToRedshift.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "failureAndRerunMode": "CASCADE",
5 | "schedule": {
6 | "ref": "DefaultSchedule"
7 | },
8 | "resourceRole": "DataPipelineDefaultResourceRole",
9 | "role": "DataPipelineDefaultRole",
10 | "pipelineLogUri": "s3://insert-here-log-location-for-DPL",
11 | "scheduleType": "cron",
12 | "name": "Default",
13 | "id": "Default"
14 | },
15 | {
16 | "occurrences": "1",
17 | "period": "1 Day",
18 | "name": "RunOnce",
19 | "id": "DefaultSchedule",
20 | "type": "Schedule",
21 | "startAt": "FIRST_ACTIVATION_DATE_TIME"
22 | },
23 | {
24 | "output": {
25 | "ref": "DestRedshiftTable"
26 | },
27 | "input": {
28 | "ref": "Input_S3_Tsv_Files"
29 | },
30 | "name": "ExportS3ToRedshift",
31 | "runsOn": {
32 | "ref": "Ec2Instance"
33 | },
34 | "id": "RedshiftLoadActivity",
35 | "type": "RedshiftCopyActivity",
36 | "insertMode": "OVERWRITE_EXISTING"
37 | },
38 | {
39 | "connectionString": "#{myRedshiftJdbcConnectStr}",
40 | "databaseName": "#{myRedshiftDbName}",
41 | "*password": "#{myRedshiftPassword}",
42 | "name": "RedshiftCluster",
43 | "id": "RedshiftCluster",
44 | "type": "RedshiftDatabase",
45 | "username": "#{myRedshiftUsername}"
46 | },
47 | {
48 | "filePath": "#{myInputTsvFilesS3Loc}",
49 | "name": "Input_S3_Tsv_Files",
50 | "id": "Input_S3_Tsv_Files",
51 | "dataFormat": {
52 | "ref": "DataFormatId_tsv"
53 | },
54 | "type": "S3DataNode"
55 | },
56 | {
57 | "securityGroupIds": "#{myRedshiftSecurityGrpIds}",
58 | "instanceType": "m3.xlarge",
59 | "name": "Ec2Instance",
60 | "associatePublicIpAddress": "true",
61 | "id": "Ec2Instance",
62 | "type": "Ec2Resource",
63 | "region": "us-east-1",
64 | "terminateAfter": "10 Hours",
65 | "availabilityZone": "us-east-1a"
66 | },
67 | {
68 | "database": {
69 | "ref": "RedshiftCluster"
70 | },
71 | "name": "DestRedshiftTable",
72 | "id": "DestRedshiftTable",
73 | "schemaName": "schemaNameInRedshift",
74 | "type": "RedshiftDataNode",
75 | "tableName": "DestRedshiftTableName"
76 | },
77 | {
78 | "name": "S3TRDataFormat",
79 | "id": "DataFormatId_tsv",
80 | "type": "TSV"
81 | }
82 | ],
83 | "parameters": [
84 | {
85 | "description": "Redshift password",
86 | "id": "*myRedshiftPassword",
87 | "type": "String"
88 | },
89 | {
90 | "description": "Redshift database name",
91 | "id": "myRedshiftDbName",
92 | "type": "String"
93 | },
94 | {
95 | "watermark": "security group id. E.g.,",
96 | "helpText": "The names of one or more security groups that are assigned to the Redshift cluster.",
97 | "description": "Security group Id(s)",
98 | "isArray": "true",
99 | "id": "myRedshiftSecurityGrpIds",
100 | "type": "String"
101 | },
102 | {
103 | "description": "Redshift username",
104 | "id": "myRedshiftUsername",
105 | "type": "String"
106 | },
107 | {
108 | "allowedValues": "OVERWRITE_EXISTING",
109 | "default": "OVERWRITE_EXISTING",
110 | "helpLink": "https://docs.aws.amazon.com/console/datapipeline/redshiftcopyactivity",
111 | "helpText": "Determines how to handle pre-existing data in the target table that overlaps with rows in the data to be loaded.",
112 | "description": "Table insert mode",
113 | "id": "myInsertMode",
114 | "type": "String"
115 | },
116 | {
117 | "helpText": "The name of an existing table or a new table that will be created based on the create table SQL query parameter below.",
118 | "description": "Redshift table name",
119 | "id": "myRedshiftTableName",
120 | "type": "String"
121 | },
122 | {
123 | "helpText": "The S3 folder where one or more tsv input files are located.",
124 | "description": "Input S3 folder",
125 | "id": "myInputTsvFilesS3Loc",
126 | "type": "AWS::S3::ObjectKey",
127 | "watermark" : "s3://tsv-files-insert-loc/2015-10-27-01-00-29"
128 | },
129 | {
130 | "watermark": "jdbc:postgresql://endpoint:port/database?tcpKeepAlive=true",
131 | "description": "Redshift JDBC connection string",
132 | "id": "myRedshiftJdbcConnectStr",
133 | "type": "String"
134 | }
135 | ]
136 | }
137 |
--------------------------------------------------------------------------------
/samples/LoadTsvFilesInS3ToRedshift/README.md:
--------------------------------------------------------------------------------
1 | # Data Pipeline Load Tab Separated Files in S3 to Redshift
2 |
3 | ## About the sample
4 | When imported, this pipeline definition instructs Redshift to load the TSV files under the specified S3 path into the specified Redshift table. The table insert mode is OVERWRITE_EXISTING.
5 |
6 | ## Running this sample
7 | The pipeline requires the following user inputs:
8 |
9 | 1. The S3 folder where the input TSV files are located.
10 | 2. Redshift connection info along with the target table name.
11 | 3. Redshift Cluster security group id(s).
12 |
13 |
14 | ## Prerequisites
15 |
16 | You must have the AWS CLI and default IAM roles set up in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) of the base repository for instructions on how to do this.
17 | TSV files under an S3 folder path are the input for this pipeline. The Redshift cluster and table must already exist.
18 |
19 |
20 |
21 | ## Run this sample pipeline using the AWS CLI
22 |
23 | ```sh
24 | $> aws datapipeline create-pipeline --name copy_tsv_to_redshift_pipeline --unique-id copy_tsv_to_redshift_pipeline
25 | ```
26 |
27 | You receive a pipelineId like this.
28 | ```sh
29 | # -----------------------------------------
30 | # | CreatePipeline |
31 | # +-------------+--------------------------+
32 | # | pipelineId | |
33 | # +-------------+--------------------------+
34 | ```
35 |
36 | ```sh
37 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://LoadTsvFilesInS3ToRedshift.json --parameter-values \
38 | myInputTsvFilesS3Loc= myRedshiftJdbcConnectStr= myRedshiftDbName= myRedshiftUsername= '*myRedshiftPassword=' \
39 | myRedshiftTableName= myRedshiftSecurityGrpIds= --pipeline-id
40 | ```
41 |
42 | You receive validation messages like this
43 | ```sh
44 | # -----------------------
45 | # |PutPipelineDefinition|
46 | # +-----------+---------+
47 | # | errored | False |
48 | # +-----------+---------+
49 | ```
50 |
51 | Now activate the pipeline
52 | ```sh
53 | $> aws datapipeline activate-pipeline --pipeline-id
54 | ```
55 |
56 | Check the status of your pipeline
57 | ```sh
58 | >$ aws datapipeline list-runs --pipeline-id
59 | ```
60 |
61 | You will receive status information on the pipeline.
62 |
63 |
64 | ## Disclaimer
65 |
66 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them.
67 |
68 | Use at your own risk.
69 |
70 | Licensed under the MIT-0 License.
71 |
--------------------------------------------------------------------------------
/samples/OnDemandWithLamdaFunctions/lambda_function.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import json
4 | import urllib
5 | import boto3
6 |
7 | print('Loading function')
8 |
9 | client = boto3.client('datapipeline')
10 | pipeline_id = 'df-123456789'
11 |
12 | def lambda_handler(event, context):
13 | try:
14 | response = client.activate_pipeline(pipelineId=pipeline_id)
15 | return response
16 | except Exception as e:
17 | print(e)
18 | raise e
19 |
--------------------------------------------------------------------------------
/samples/OnDemandWithLamdaFunctions/ondemand.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "myComment": "This object is used to set default configuration for objects in the pipeline",
5 | 
6 | "id": "Default",
7 | "failureAndRerunMode":"cascade",
8 | "resourceRole": "DataPipelineDefaultResourceRole",
9 | "role": "DataPipelineDefaultRole",
10 | "pipelineLogUri": "#{myS3LogsPath}",
11 | "scheduleType": "ondemand"
12 | },
13 | {
14 | "myComment": "This object is used to create the Amazon EC2 Instance that activities in the pipeline will be executed on.",
15 | 
16 | "id": "A_Fresh_NewEC2Instance",
17 | "type": "Ec2Resource",
18 | "terminateAfter": "1 Hour"
19 | },
20 | {
21 | "myComment": "This object is a ShellCommandActivity. It is used to specify the Linux shell command that will be invoked. In this case it simply runs the 'echo' command, but it can be used to run any command that is accessible in the command-line shell of the Instance that it runs on.",
22 | 
23 | "id": "ShellCommandActivity_HelloWorld",
24 | "runsOn": {
25 | "ref": "A_Fresh_NewEC2Instance"
26 | },
27 | "type": "ShellCommandActivity",
28 | "command": "echo 'Hello World!'"
29 | }
30 | ],
31 | "parameters": [
32 | {
33 | "myComment": "Pipeline Parameters are placeholders for variables that a user can specify when uploading or activating the pipeline. In this example, we create a Parameter called 'myS3LogsPath' which is used to provide an S3 location for output logs. It is referenced above in the 'Default' object to set the 'pipelineLogUri' value. Parameters help users avoid hard coding variables in pipeline definitions. Users can supply these parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline'.",
34 | 
35 | "id" : "myS3LogsPath",
36 | "type" : "AWS::S3::ObjectKey",
37 | "description" : "S3 path for pipeline logs."
38 | }
39 | ]
40 | }
--------------------------------------------------------------------------------
/samples/OnDemandWithLamdaFunctions/readme.md:
--------------------------------------------------------------------------------
1 | #### This sample shows how to create a Lambda function that responds to S3 create object events on an S3 bucket and/or a CloudWatch Scheduled Event.
2 |
3 | The following Python code defines an AWS Lambda function that runs an on-demand pipeline. This code is in a file called lambda_function.py. You simply need to set the ``pipeline_id`` variable to the id of your on-demand pipeline.
4 |
5 | ```python
6 | from __future__ import print_function
7 |
8 | import json
9 | import urllib
10 | import boto3
11 |
12 | print('Loading function')
13 |
14 | client = boto3.client('datapipeline')
15 | pipeline_id = 'df-123456789'
16 |
17 | def lambda_handler(event, context):
18 | try:
19 | response = client.activate_pipeline(pipelineId=pipeline_id)
20 | return response
21 | except Exception as e:
22 | print(e)
23 | raise e
24 | ```
25 | ### Step 1: Create the on-demand pipeline
26 | *Make sure the pipeline is created in a region that supports Lambda.*
27 |
28 | Create the pipeline:
29 |
30 | ```sh
31 | $> aws datapipeline create-pipeline --name on_demand_lamda --unique-id on_demand_lamda
32 | ```
33 |
34 | Upload the pipeline definition:
35 |
36 | ```sh
37 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://ondemand.json \
38 | --parameter-values myS3LogsPath= --pipeline-id
39 | ```
40 |
41 | Activate the pipeline to make sure it runs successfully:
42 |
43 | ```sh
44 | $> aws datapipeline activate-pipeline --pipeline-id
45 | ```
46 |
47 | Check the status of your pipeline:
48 | ```
49 | >$ aws datapipeline list-runs --pipeline-id
50 | ```
51 |
52 | ### Step 2: Create the Lambda function
53 |
54 |
55 | ```sh
56 | >$ aws lambda create-function --function-name --runtime python2.7 \
57 | --role --handler lambda_function.lambda_handler \
58 | --zip-file file:///zip-with-lamda-fn-code.zip --publish --timeout 10
59 | ```
60 |
61 | See this link for reference on the Lambda create-function command:
62 | http://docs.aws.amazon.com/cli/latest/reference/lambda/create-function.html
63 |
64 | ### Step 3: Set up an event source for the Lambda function
65 |
66 | ##### Set up an S3 bucket to call the Lambda function when objects are created
67 |
68 | Create the s3 bucket:
69 |
70 | ```sh
71 | $> aws s3 mb
72 | ```
73 |
74 | Run the following Lambda add-permission command to grant the Amazon S3 service principal permission to perform the lambda:InvokeFunction action:
75 |
76 | ```sh
77 | $> aws lambda add-permission --function-name \
78 | --region --statement-id --action "lambda:InvokeFunction" \
79 | --principal s3.amazonaws.com --source-arn \
80 | --source-account --profile adminuser
81 | ```
82 |
83 | See this link for reference on the Lambda add-permission command:
84 | http://docs.aws.amazon.com/cli/latest/reference/lambda/add-permission.html
85 |
86 | Add the notification configuration on the S3 bucket and have it call the Lambda function (an example configuration is shown at the end of this readme):
87 |
88 | \*Make sure your notification configuration contains ``s3:ObjectCreated:*`` events
89 |
90 | ```sh
91 | $> aws s3api put-bucket-notification-configuration --bucket --notification-configuration
92 | ```
93 |
94 | See this link for reference on the s3api put-bucket-notification-configuration command:
95 | http://docs.aws.amazon.com/cli/latest/reference/s3api/put-bucket-notification-configuration.html
96 |
97 | Upload a file to the S3 bucket and validate that the Lambda function activated your pipeline:
98 |
99 | ```sh
100 | $> aws s3 cp
101 | $> aws datapipeline list-runs --pipeline-id
102 | ```
103 |
104 | ##### OR: Add a CRON schedule using CloudWatch Scheduled Events
105 |
106 | This is only possible in the Lambda console. Instructions are here: http://docs.aws.amazon.com/lambda/latest/dg/with-scheduled-events.html
107 |
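108 | ##### Example S3 notification configuration
109 | 
110 | As a reference for the S3 event source step above, here is a minimal sketch of a notification configuration. The bucket name, account id, and Lambda function ARN below are hypothetical placeholders; substitute the values for the function you created in Step 2.
111 | 
112 | ```sh
113 | # Hypothetical example only: replace the bucket name and Lambda function ARN with your own.
114 | cat > notification.json <<'EOF'
115 | {
116 |   "LambdaFunctionConfigurations": [
117 |     {
118 |       "LambdaFunctionArn": "arn:aws:lambda:us-east-1:123456789012:function:activate-on-demand-pipeline",
119 |       "Events": ["s3:ObjectCreated:*"]
120 |     }
121 |   ]
122 | }
123 | EOF
124 | 
125 | aws s3api put-bucket-notification-configuration --bucket my-example-bucket \
126 |   --notification-configuration file://notification.json
127 | ```
128 | 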
--------------------------------------------------------------------------------
/samples/RDStoRedshiftSqoop/RDStoRedshiftSqoop.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "myComment": "This object is used to set default configuration for objects in the pipeline.",
5 |
6 | "id": "Default",
7 | "name": "Default",
8 | "failureAndRerunMode": "CASCADE",
9 | "schedule": {
10 | "ref": "DefaultSchedule"
11 | },
12 | "resourceRole": "DataPipelineDefaultResourceRole",
13 | "role": "DataPipelineDefaultRole",
14 | "scheduleType": "cron",
15 | "pipelineLogUri": "#{myS3LogsPath}"
16 | },
17 | {
18 | "myComment": "This object is used to run the Sqoop activity that extracts data from RDS.",
19 |
20 | "name": "Sqoop",
21 | "id": "ActivityId_wQhxe",
22 | "runsOn": {
23 | "ref": "ResourceId_z9RNH"
24 | },
25 | "type": "ShellCommandActivity",
26 | "command": "/usr/bin/sqoop import --connect jdbc:mysql://#{myRdsEndpoint}/millionsongs --table songs --target-dir #{myS3StagingPath} --username dplcustomer --password Dplcustomer1"
27 | },
28 | {
29 | "myComment": "This object is used to specify the copy activity for moving data from S3 to Redshift.",
30 |
31 | "output": {
32 | "ref": "DataNodeId_7EqZ7"
33 | },
34 | "input": {
35 | "ref": "DataNodeId_ImmS9"
36 | },
37 | "dependsOn": {
38 | "ref": "ActivityId_wQhxe"
39 | },
40 | "name": "CopyToRedshift",
41 | "id": "ActivityId_6OGtu",
42 | "runsOn": {
43 | "ref": "ResourceId_z9RNH"
44 | },
45 | "type": "RedshiftCopyActivity",
46 | "insertMode": "TRUNCATE"
47 | },
48 | {
49 | "myComment": "This object is used to control the task schedule.",
50 |
51 | "occurrences": "1",
52 | "period": "1 Day",
53 | "name": "RunOnce",
54 | "id": "DefaultSchedule",
55 | "type": "Schedule",
56 | "startAt": "FIRST_ACTIVATION_DATE_TIME"
57 | },
58 | {
59 | "myComment": "This object provides connection information for the Redshift cluster.",
60 |
61 | "connectionString": "jdbc:postgresql://#{myRedshiftEndpoint}:5439/dev",
62 | "*password": "Dplcustomer1",
63 | "name": "DefaultRedshiftDatabase1",
64 | "id": "RedshiftDatabaseId_S34X5",
65 | "type": "RedshiftDatabase",
66 | "username": "dplcustomer"
67 | },
68 | {
69 | "myComment": "This object is used to provide information for the EMR cluster bootstrap",
70 |
71 | "bootstrapAction": "s3://data-pipeline-samples/sqoop-activity/install_sqoop_ba.sh",
72 | "name": "HadoopCluster",
73 | "id": "ResourceId_z9RNH",
74 | "amiVersion": "3.8.0",
75 | "type": "EmrCluster",
76 | "terminateAfter": "1 Hour"
77 | },
78 | {
79 | "myComment": "This object provides information on the S3 staging data.",
80 |
81 | "directoryPath": "#{myS3StagingPath}",
82 | "name": "S3Input",
83 | "id": "DataNodeId_ImmS9",
84 | "type": "S3DataNode"
85 | },
86 | {
87 | "myComment": "This object contains information about the Redshift database.",
88 |
89 | "createTableSql": "create table IF NOT EXISTS songs (track_id varchar(2048) not null distkey sortkey, title varchar(2048), song_id varchar(2048), release_name varchar(2048), artist_id varchar(2048), artist_mbid varchar(2048), artist_name varchar(2048), duration float, artist_familiarity float, artist_hotness float, year int);",
90 | "database": {
91 | "ref": "RedshiftDatabaseId_S34X5"
92 | },
93 | "primaryKeys": "track_id",
94 | "name": "Redshift",
95 | "id": "DataNodeId_7EqZ7",
96 | "type": "RedshiftDataNode",
97 | "tableName": "songs"
98 | }
99 | ],
100 | "parameters": []
101 | }
102 |
--------------------------------------------------------------------------------
/samples/RDStoRedshiftSqoop/setup/Setup.py:
--------------------------------------------------------------------------------
1 | from RdsToRedshiftSqoopSample import RDStoRedshiftSqoopSample
2 | from Utilities import check_working_directory
3 |
4 | import argparse
5 | import sys
6 |
7 |
8 | if __name__ == '__main__':
9 | check_working_directory()
10 | parser = argparse.ArgumentParser(description='Setup for RDS to Redshift Sqoop pipeline sample')
11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path")
12 | args = parser.parse_args()
13 | s3_bucket_path = args.s3_bucket_path
14 |
15 | sample = RDStoRedshiftSqoopSample()
16 |
17 | if s3_bucket_path is None:
18 | sample.create_s3_bucket()
19 | elif not sample.validate_s3_bucket_path(s3_bucket_path):
20 | sys.exit(0)
21 |
22 | sample.create_rds_instance()
23 | sample.create_redshift_cluster()
24 | sample.run_setup_datapipeline()
25 | sample.print_setup_results()
26 |
--------------------------------------------------------------------------------
/samples/RDStoRedshiftSqoop/setup/SetupPipelineDefinition.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | class SetupPipelineDefinitionHelper(object):
5 |
6 | def __init__(self):
7 | with open("setup.json", "r") as setup:
8 | pipeline_string = setup.read().replace('\n', '')
9 | self.pipeline_definition = json.loads(pipeline_string)
10 |
11 | def get_setup_pipeline_objects(self):
12 | return self.pipeline_definition['objects']
13 |
14 | def get_setup_pipeline_parameters(self):
15 | return self.pipeline_definition['parameters']
16 |
17 | def get_setup_pipeline_parameter_values(self):
18 | return self.pipeline_definition['parameterValues']
19 |
--------------------------------------------------------------------------------
/samples/RDStoRedshiftSqoop/setup/Teardown.py:
--------------------------------------------------------------------------------
1 | from RdsToRedshiftSqoopSample import RDStoRedshiftSqoopSample
2 | from Utilities import check_working_directory
3 |
4 | import argparse
5 |
6 |
7 | if __name__ == '__main__':
8 | check_working_directory()
9 |
10 | parser = argparse.ArgumentParser(description='Teardown for RDS to Redshift Sqoop pipeline sample')
11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path")
12 | parser.add_argument('--rds-instance-id', action="store", dest="rds_instance_id")
13 | parser.add_argument('--redshift-cluster-id', action="store", dest="redshift_cluster_id")
14 | args = parser.parse_args()
15 |
16 | sample = RDStoRedshiftSqoopSample()
17 |
18 | if args.rds_instance_id is not None:
19 | sample.destroy_rds(args.rds_instance_id)
20 |
21 | if args.redshift_cluster_id is not None:
22 | sample.destroy_redshift(args.redshift_cluster_id)
23 |
24 | if args.s3_bucket_path is not None:
25 | sample.destroy_s3_bucket(args.s3_bucket_path)
26 |
--------------------------------------------------------------------------------
/samples/RDStoRedshiftSqoop/setup/Utilities.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 |
5 | def check_working_directory():
6 | current_folder_path, current_folder_name = os.path.split(os.getcwd())
7 | if current_folder_name == 'RDStoRedshiftSqoop':
8 | os.chdir('setup')
9 | elif current_folder_name != 'setup':
10 | print 'ERROR: please run the setup script from data-pipeline-samples/samples/RDStoRedshiftSqoop/setup'
11 | sys.exit(0)
12 |
--------------------------------------------------------------------------------
/samples/RDStoS3/RDStoS3Pipeline.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "myComment": "This object is used to set default configuration for objects in the pipeline",
5 |
6 | "id": "Default",
7 | "failureAndRerunMode": "CASCADE",
8 | "resourceRole": "DataPipelineDefaultResourceRole",
9 | "role": "DataPipelineDefaultRole",
10 | "pipelineLogUri": "#{myS3LogsPath}",
11 | "scheduleType": "cron",
12 | "schedule": {
13 | "ref": "DefaultSchedule"
14 | }
15 | },
16 | {
17 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it. For this example, we are using it to specify that our pipeline will run immediately upon activation. Also, we are using the 'occurrences' option specify that the pipeline should only be run once. You can have multiple schedules defined in a pipeline.",
18 |
19 | "type": "Schedule",
20 | "id": "DefaultSchedule",
21 | "occurrences": "1",
22 | "period": "1 Day",
23 | "startAt": "FIRST_ACTIVATION_DATE_TIME"
24 | },
25 | {
26 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.",
27 | "name": "Ec2Instance",
28 | "type": "Ec2Resource",
29 | "id": "Ec2Instance",
30 | "instanceType": "t1.micro",
31 | "securityGroups": "#{myEc2RdsSecurityGrps}",
32 | "terminateAfter": "2 Hours",
33 | "resourceRole": "DataPipelineDefaultResourceRole",
34 | "role": "DataPipelineDefaultRole"
35 | },
36 | {
37 | "myComment": "This object is a CopyActivity. It is used to define the work that will be done to copy the data from the database to S3.",
38 | "name": "RDStoS3CopyActivity",
39 | "type": "CopyActivity",
40 | "id": "RDStoS3CopyActivity",
41 | "output": {
42 | "ref": "S3OutputLocation"
43 | },
44 | "input": {
45 | "ref": "SourceRDSTable"
46 | },
47 | "runsOn": {
48 | "ref": "Ec2Instance"
49 | }
50 | },
51 | {
52 | "myComment": "This object is a mysql datanode. It is used to represent the database which is the input datanode.",
53 | "name": "SourceRDSTable",
54 | "type": "MySqlDataNode",
55 | "id": "SourceRDSTable",
56 | "connectionString": "#{myRDSConnectStr}",
57 | "*password": "#{*myRDSPassword}",
58 | "table": "#{myRDSTableName}",
59 | "selectQuery": "select * from #{table}",
60 | "username": "#{myRDSUsername}"
61 | },
62 | {
63 | "myComment": "This object is a S3 datanode. It is used to represent the S3 directory where the data will be stored.",
64 | "name": "S3OutputLocation",
65 | "type": "S3DataNode",
66 | "id": "S3OutputLocation",
67 | "directoryPath": "#{myOutputS3Path}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}"
68 | }
69 | ],
70 | "parameters": [
71 | {
72 | "id": "*myRDSPassword",
73 | "type": "String",
74 | "description": "RDS MySQL password"
75 | },
76 | {
77 | "watermark" : "s3://mybucket",
78 | "id": "myS3LogsPath",
79 | "type": "AWS::S3::ObjectKey",
80 | "description": "S3 folder for logs"
81 | },
82 | {
83 | "watermark" : "s3://mybucket",
84 | "id": "myOutputS3Path",
85 | "type": "AWS::S3::ObjectKey",
86 | "description": "Output S3 folder"
87 | },
88 | {
89 | "watermark" : "sg-12345",
90 | "id": "myEc2RdsSecurityGrps",
91 | "type": "String",
92 | "description": "RDS MySQL security group(s)",
93 | "isArray": "true",
94 | "optional": "true"
95 | },
96 | {
97 | "id": "myRDSUsername",
98 | "type": "String",
99 | "description": "RDS MySQL username"
100 | },
101 | {
102 | "id": "myRDSTableName",
103 | "type": "String",
104 | "description": "RDS MySQL table name"
105 | },
106 | {
107 | "watermark" : "jdbc:mysql://connectionstring:3306/dbname",
108 | "id": "myRDSConnectStr",
109 | "type": "String",
110 | "description": "RDS MySQL connection string"
111 | }
112 | ]
113 | }
114 |
--------------------------------------------------------------------------------
/samples/RDStoS3/README.md:
--------------------------------------------------------------------------------
1 | # Data Pipeline RDStoS3 Sample
2 |
3 | ## Overview
4 |
5 | This sample shows how to build a pipeline that outputs a MySQL table in csv format from a RDS database to an S3 bucket.
6 |
7 | The project provides scripts for setting up the resources for the pipeline, installing the [data set](http://aws.amazon.com/datasets/6468931156960467), and destroying the resources. The project also provides the [pipeline definition file](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html) which is used to create the pipeline and the AWS CLI commands for creating and executing the pipeline. See the instructions below to get started.
8 |
9 | *Note: Normal AWS charges apply for the resources created by the script. Make sure to run the teardown script as soon as you are done with the sample.*
10 |
11 | ## Prerequisites
12 |
13 | You must have the AWS CLI and default IAM roles set up in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) of the base repository for instructions on how to do this.
14 |
15 | You must also provide the S3Path of a S3 bucket with write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket.
16 |
17 | ## Step 1: Priming this sample
18 |
19 | Run the following commands to run the setup script. The AWS resources that will be created are an RDS MySQL database and, optionally, an S3 bucket.
20 |
21 | The script takes an *optional* parameter for an S3 path where the output data will be written. If you choose to provide your own S3 path, the bucket must be in the same region as what is set for your AWS CLI configuration. Finally, please make sure the S3 bucket has a policy that allows data writes to it.
22 |
23 | If the path is not provided, the script will create the S3 bucket for you.
24 |
25 | *Setup and teardown scripts are located in the setup directory under the RDStoS3 directory in the samples directory.*
26 | ```
27 | $> cd /data-pipeline-samples/samples/RDStoS3
28 | $> python setup/Setup.py --s3-path [s3://optional/path/to/s3/location]
29 | ```
30 |
31 | ## Step 2: Run this sample pipeline using the AWS CLI
32 |
33 | ```sh
34 | $> aws datapipeline create-pipeline --name rds_to_s3_pipeline --unique-id rds_to_s3_pipeline
35 | ```
36 |
37 | You receive a pipelineId like this.
38 | ```sh
39 | # -----------------------------------------
40 | # | CreatePipeline |
41 | # +-------------+--------------------------+
42 | # | pipelineId | |
43 | # +-------------+--------------------------+
44 | ```
45 |
46 | ```sh
47 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://RDStoS3Pipeline.json --parameter-values myOutputS3Path= myS3LogsPath= '*myRDSPassword=' myRDSUsername= myRDSTableName= myRDSConnectStr= --pipeline-id
48 | ```
49 |
50 | You receive validation messages like this
51 | ```sh
52 | # -----------------------
53 | # |PutPipelineDefinition|
54 | # +-----------+---------+
55 | # | errored | False |
56 | # +-----------+---------+
57 | ```
58 |
59 | Now activate the pipeline
60 | ```sh
61 | $> aws datapipeline activate-pipeline --pipeline-id
62 | ```
63 |
64 | Check the status of your pipeline
65 | ```
66 | >$ aws datapipeline list-runs --pipeline-id
67 | ```
68 |
69 | You will receive status information on the pipeline.
70 | ```sh
71 | # Name Scheduled Start Status
72 | # ID Started Ended
73 | #---------------------------------------------------------------------------------------------------
74 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES
75 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20
76 | #
77 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING
78 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20
79 | #
80 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20
81 | ```
82 |
83 | Let the pipeline complete, then check the output S3 bucket for the output csv file.
84 |
85 | ## Step 3: IMPORTANT! Tear down this sample
86 |
87 | *Note: The setup script will provide the teardown command with parameters at the end of its execution.*
88 |
89 | ```
90 | $> python setup/Teardown.py --rds-instance-id --s3-path [s3://optional/path/to/s3/bucket/created/by/setup]
91 | ```
92 |
93 | ## Disclaimer
94 |
95 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them.
96 |
97 | Use at your own risk.
98 |
99 | Licensed under the MIT-0 License.
100 |
--------------------------------------------------------------------------------
/samples/RDStoS3/setup/Setup.py:
--------------------------------------------------------------------------------
1 | from RDStoS3Sample import RDStoS3Sample
2 | from Utilities import check_working_directory
3 |
4 | import argparse
5 | import sys
6 |
7 |
8 | if __name__ == '__main__':
9 | check_working_directory()
10 | parser = argparse.ArgumentParser(description='Setup for RDS to S3 pipeline sample')
11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path")
12 | args = parser.parse_args()
13 | s3_bucket_path = args.s3_bucket_path
14 |
15 | sample = RDStoS3Sample()
16 |
17 | if s3_bucket_path is None:
18 | sample.create_s3_bucket()
19 | elif not sample.validate_s3_bucket_path(s3_bucket_path):
20 | sys.exit(0)
21 |
22 | sample.create_rds_instance()
23 | sample.run_setup_datapipeline()
24 | sample.print_setup_results()
25 |
--------------------------------------------------------------------------------
/samples/RDStoS3/setup/SetupPipelineDefinition.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | class SetupPipelineDefinitionHelper(object):
5 |
6 | def __init__(self):
7 | with open("setup.json", "r") as setup:
8 | pipeline_string = setup.read().replace('\n', '')
9 | self.pipeline_definition = json.loads(pipeline_string)
10 |
11 | def get_setup_pipeline_objects(self):
12 | return self.pipeline_definition['objects']
13 |
14 | def get_setup_pipeline_parameters(self):
15 | return self.pipeline_definition['parameters']
16 |
17 | def get_setup_pipeline_parameter_values(self):
18 | return self.pipeline_definition['parameterValues']
19 |
--------------------------------------------------------------------------------
/samples/RDStoS3/setup/Teardown.py:
--------------------------------------------------------------------------------
1 | from RDStoS3Sample import RDStoS3Sample
2 | from Utilities import check_working_directory
3 |
4 | import argparse
5 |
6 |
7 | if __name__ == '__main__':
8 | check_working_directory()
9 |
10 | parser = argparse.ArgumentParser(description='Teardown for RDS to S3 pipeline sample')
11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path")
12 | parser.add_argument('--rds-instance-id', action="store", dest="rds_instance_id")
13 | parser.add_argument('--redshift-cluster-id', action="store", dest="redshift_cluster_id")
14 | args = parser.parse_args()
15 |
16 | sample = RDStoS3Sample()
17 |
18 | if args.rds_instance_id is not None:
19 | sample.destroy_rds(args.rds_instance_id)
20 |
21 |
22 | if args.s3_bucket_path is not None:
23 | sample.destroy_s3_bucket(args.s3_bucket_path)
24 |
--------------------------------------------------------------------------------
/samples/RDStoS3/setup/Utilities.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 |
5 | def check_working_directory():
6 | current_folder_path, current_folder_name = os.path.split(os.getcwd())
7 | if current_folder_name == 'RDStoS3':
8 | os.chdir('setup')
9 | elif current_folder_name != 'setup':
10 | print 'ERROR: please run the setup script from data-pipeline-samples/samples/RDStoS3/setup'
11 | sys.exit(0)
12 |
--------------------------------------------------------------------------------
/samples/RedshiftCopyActivityFromDynamoDBTable/RedshiftCopyActivityFromDynamoDBTable.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "myComment": "This object is used to set default configuration for objects in the pipeline.",
5 |
6 | "name": "Default",
7 | "id": "Default",
8 | "failureAndRerunMode": "CASCADE",
9 | "schedule": {
10 | "ref": "DefaultSchedule"
11 | },
12 | "resourceRole": "DataPipelineDefaultResourceRole",
13 | "role": "DataPipelineDefaultRole",
14 | "pipelineLogUri": "#{myLogUri}",
15 | "scheduleType": "cron"
16 | },
17 | {
18 | "myComment": "This object provides connection information for the Redshift cluster.",
19 |
20 | "name": "DefaultDatabase1",
21 | "id": "DatabaseId_Kw7C9",
22 | "connectionString": "#{myConnectionString}",
23 | "databaseName": "#{myRedshiftDatabase}",
24 | "*password": "#{myRedshiftPassword}",
25 | "type": "RedshiftDatabase",
26 | "username": "#{myRedshiftUsername}"
27 | },
28 | {
29 | "myComment": "This object is used to provide the resource where the copy job is invoked.",
30 |
31 | "name": "DefaultResource1",
32 | "id": "ResourceId_idL0Y",
33 | "resourceRole": "DataPipelineDefaultResourceRole",
34 | "role": "DataPipelineDefaultRole",
35 | "type": "Ec2Resource",
36 | "terminateAfter": "1 Hour"
37 | },
38 | {
39 | "myComment": "This object is used to specify the copy activity for moving data from DynamoDB to Redshift.",
40 |
41 | "name": "CopyFromDDBToRedshift",
42 | "id": "ActivityId_vmVn4",
43 | "database": {
44 | "ref": "DatabaseId_Kw7C9"
45 | },
46 | "runsOn": {
47 | "ref": "ResourceId_idL0Y"
48 | },
49 | "type": "SqlActivity",
50 | "script": "#{myScript}"
51 | },
52 | {
53 | "myComment": "This object is used to control the task schedule.",
54 |
55 | "name": "RunOnce",
56 | "id": "DefaultSchedule",
57 | "occurrences": "1",
58 | "period": "1 Day",
59 | "type": "Schedule",
60 | "startAt": "FIRST_ACTIVATION_DATE_TIME"
61 | }
62 | ],
63 | "parameters": []
64 | }
65 |
--------------------------------------------------------------------------------
/samples/RedshiftCopyActivityFromDynamoDBTable/readme.md:
--------------------------------------------------------------------------------
1 | # RedshiftCopyActivityFromDynamoDBTable Sample
2 |
3 | This sample demonstrates how you can use Data Pipeline's RedshiftCopyActivity to copy data from a DynamoDB table to a Redshift table. This sample was motivated by a use case that requires the user to provide AWS credentials to access the DynamoDB table. It is assumed that the owner of the DynamoDB table has granted the user read access to the table. To make this sample work, you must ensure you have the following:
4 |
5 | * Connection string for the destination Redshift cluster, e.g. jdbc:redshift://_hostname_:5439/_database_
6 | * Redshift database name
7 | * Redshift username and password. This user must have write access to the table where data will be copied to.
8 | * DynamoDB table name. Note that both the table name and column names must match on both sides of the copy.
9 | * AWS credentials, i.e the access key and the secret key, to access the DynamoDB table.
10 | * DynamoDB table read ratio.
11 | * S3 location to direct log messages generated by Data Pipeline.
12 |
13 | You will need to provide the above information in the "put-pipeline-definition" command below.
14 |
15 | ## Running this sample
16 |
17 | ```sh
18 | $> aws datapipeline create-pipeline --name redshift_copy_from_dynamodb_pipeline --unique-id redshift_copy_from_dynamodb_pipeline
19 |
20 | # You receive a pipelineId like this.
21 | # -----------------------------------------
22 | # | CreatePipeline |
23 | # +-------------+--------------------------+
24 | # | pipelineId | df-0554887H4KXKTY59MRJ |
25 | # +-------------+--------------------------+
26 |
27 | #now upload the pipeline definition
28 |
29 | $> aws datapipeline put-pipeline-definition --pipeline-id df-0554887H4KXKTY59MRJ \
30 | --pipeline-definition file://samples/RedshiftCopyActivityFromDynamoDBTable/RedshiftCopyActivityFromDynamoDBTable.json \
31 | --parameter-values myConnectionString= myRedshiftDatabase= \
32 | myRedshiftUsername= myRedshiftPassword= \
33 | myScript="copy from 'dynamodb://' credentials 'aws_access_key_id=;aws_secret_access_key=' readratio ;" \
34 | myLogUri=""
35 |
36 | # You receive validation messages like this
37 |
38 | # -----------------------
39 | # |PutPipelineDefinition|
40 | # +-----------+---------+
41 | # | errored | False |
42 | # +-----------+---------+
43 |
44 | #now activate the pipeline
45 | $> aws datapipeline activate-pipeline --pipeline-id df-0554887H4KXKTY59MRJ
46 |
47 |
48 | #check the status of your pipeline
49 |
50 | >$ aws datapipeline list-runs --pipeline-id df-0554887H4KXKTY59MRJ
51 | # Name Scheduled Start Status
52 | # ID Started Ended
53 | #---------------------------------------------------------------------------------------------------
54 | # 1. ActivityId_vmVn4 2015-11-06T23:52:04 WAITING_FOR_RUNNER
55 | # @ActivityId_vmVn4_2015-11-06T23:52:04 2015-11-06T23:52:11
56 | #
57 | # 2. ResourceId_idL0Y 2015-11-06T23:52:04 CREATING
58 | # @ResourceId_idL0Y_2015-11-06T23:52:04 2015-11-06T23:52:11
59 | ```
60 |
61 | ## Related documentation
62 | https://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-redshiftcopyactivity.html
63 |
64 |
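65 | ## Example myScript value
66 | 
67 | For illustration only, the `myScript` parameter above typically holds a Redshift COPY-from-DynamoDB statement along the lines of the sketch below. The table names, read ratio, and credential placeholders are hypothetical; replace them with your own values.
68 | 
69 | ```sh
70 | # Hypothetical example only -- substitute your own table names, read ratio, and credentials.
71 | myScript="copy my_redshift_table from 'dynamodb://MyDynamoDBTable' credentials 'aws_access_key_id=<access-key-id>;aws_secret_access_key=<secret-access-key>' readratio 50;"
72 | ```
73 | 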
--------------------------------------------------------------------------------
/samples/S3ToRdsSqoop/README.md:
--------------------------------------------------------------------------------
1 | # Loading a CSV file stored in S3 into an RDS MySQL instance
2 |
3 | This sample uses [sqoop](http://sqoop.apache.org/) to load a CSV file stored in [S3](https://aws.amazon.com/s3/) into a [MySQL](https://www.mysql.com/) or [MariaDB](https://mariadb.org/) database instance managed by [RDS](https://aws.amazon.com/rds/). Sqoop is a specialized tool that uses [hadoop](http://hadoop.apache.org/) to transfer bulk data in and out of relational databases. It completes this task more quickly than Data Pipeline's built-in CopyActivity, but it is also more resource intensive. The sample takes advantage of built-in support for sqoop in [EMR](https://aws.amazon.com/emr/) 5.0.
4 |
5 | ## Parameters
6 |
7 | Parameter | Required | Description
8 | ----------|----------|------------
9 | myEmrMasterInstanceType | no | The EC2 instance type to use for the master node in the EMR cluster. Default: m2.xlarge
10 | myEmrCoreInstanceType | no | The EC2 instance type to use for the core nodes in the EMR cluster. Default: m2.xlarge
11 | myEmrCoreInstanceCount | no | The number of core nodes to launch in the EMR cluster. Default: 2
12 | myRdsEndpoint | yes | DNS endpoint for target RDS instance. The value should include the port number. Example: test.xyzw.us-east-1.rds.amazonaws.com:3306
13 | myRdsDatabaseName | yes | Name of the target MySQL or MariaDB database.
14 | myRdsTableName | yes | Name of the database table that the CSV will be imported into.
15 | myRdsUsername | yes | User name to use to connect to RDS.
16 | \*myRdsPassword | yes | Password to use to connect to RDS.
17 | myS3InputDataLocation | yes | S3 path to folder where the CSV data is stored. Example: s3://example-s3-path/folder-containing-csv-data/
18 | myPipelineLogUri | yes | S3 folder where log data generated by this pipeline will be written. Example: s3://example-s3-path/folder-to-contain-log-files/
19 |
20 | ## Prerequisites
21 |
22 | This template assumes that you have already created an RDS instance running MySQL or MariaDB. Inside the instance you will need a database and a table where the records will be inserted. You will need to know the database name, the table name, the database user name and password, and the DNS endpoint of the RDS instance. You can use the RDS console to view the DNS endpoint and the master user name and to modify the master password as needed. You will need to use the MySQL command-line tool or a graphical client like [MySQL Workbench](https://www.mysql.com/products/workbench/) to create the target database and table. See [here](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_ConnectToInstance.html) for more information on connecting to MySQL on RDS. Note that the schema of the table you will be importing records into should match the schema of the CSV file (i.e., it should have the same number of columns and appropriate column types).
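
As a sketch only (the endpoint, user, database, table, and column definitions below are placeholders you must replace to match your own CSV), the target table could be created with the MySQL command-line client like this:

```sh
# Hypothetical example: create a target table whose columns line up with the
# columns in your CSV file. Replace every placeholder value before running.
# The client prompts for the password on the terminal.
mysql --host=example-endpoint.us-east-1.rds.amazonaws.com --port=3306 \
      --user=example_user --password example_database <<'SQL'
CREATE TABLE example_table (
    id         INT,
    name       VARCHAR(255),
    created_at DATETIME
);
SQL
```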
23 |
24 | ## Running this sample
25 |
26 | Create a new pipeline. Throughout this section we assume that the S3ToRdsSqoop sample directory is
27 | your current working directory.
28 |
29 | ```sh
30 | $> aws datapipeline create-pipeline --name s3-to-rds-sqoop --unique-id s3-to-rds-sqoop
31 | # {
32 | # "pipelineId": "df-03971252U4AVY60545T7"
33 | # }
34 | ```
35 |
36 | Upload the [pipeline definition](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html). Use the `pipelineId` that was returned by the `create-pipeline` command. Specify the required parameters.
37 |
38 | ```sh
39 | $> aws datapipeline put-pipeline-definition --pipeline-id \
40 | --pipeline-definition file://sqoop_activity.json \
41 | --parameter-values myRdsEndpoint= myRdsDatabaseName= myRdsTableName= \
42 | myRdsUsername= '*myRdsPassword=' myS3InputDataLocation= myPipelineLogUri=
43 | # {
44 | # "errored": false,
45 | # "validationWarnings": [],
46 | # "validationErrors": []
47 | # }
48 | ```
49 |
50 | Activate the pipeline. Use the `pipelineId` that was returned by the `create-pipeline` command.
51 |
52 | ```sh
53 | $> aws datapipeline activate-pipeline --pipeline-id
54 | ```
55 |
56 | Optionally, check the status of your running pipeline. Use the `pipelineId` that was returned by the
57 | `create-pipeline` command. When the pipeline has completed, the Status column in the output
58 | from this command will show FINISHED for all pipeline nodes.
59 |
60 | ```sh
61 |
62 | $> aws datapipeline list-runs --pipeline-id
63 |
64 | ```
65 |
66 | Once the pipeline has completed, you should be able to see the imported records in MySQL by running a SELECT query using the MySQL command-line tool or a graphical client.
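
For example (the endpoint, user, database, and table names are placeholders; use the values you supplied to the pipeline):

```sh
# Hypothetical check that rows were imported; replace the placeholder values.
# The client prompts for the password on the terminal.
mysql --host=example-endpoint.us-east-1.rds.amazonaws.com --port=3306 \
      --user=example_user --password example_database \
      -e 'SELECT COUNT(*) FROM example_table;'
```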
67 |
68 | ## Next steps
69 |
70 | In addition to the required parameters, there are optional parameters to set the EC2 instance types launched by the EMR cluster as well as the number of core nodes to launch. Changing these parameters may improve the performance of the import job.
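
For example, a run that asks for a somewhat larger cluster might pass the optional parameters like this (the instance types and node count shown are illustrative only; the other parameter values are elided as in the commands above):

```sh
# Sketch: optional EMR sizing parameters alongside the required ones.
$> aws datapipeline put-pipeline-definition --pipeline-id \
   --pipeline-definition file://sqoop_activity.json \
   --parameter-values myEmrMasterInstanceType=m3.xlarge myEmrCoreInstanceType=m3.xlarge \
   myEmrCoreInstanceCount=4 myRdsEndpoint= myRdsDatabaseName= myRdsTableName= \
   myRdsUsername= '*myRdsPassword=' myS3InputDataLocation= myPipelineLogUri=
```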
71 |
72 | Once the pipeline has completed, you can delete it with the following command.
73 |
74 | ```sh
75 | $> aws datapipeline delete-pipeline --pipeline-id
76 | ```
77 |
78 | The resources used by this example will incur normal charges. If you created any resources specifically to test this pipeline, you may wish to delete them now.
79 |
80 | ## Disclaimer
81 |
82 | The samples in this repository are meant to help users get started with Data Pipeline. They may not
83 | be sufficient for production environments. Users should carefully inspect samples before running
84 | them.
85 |
86 | *Use at your own risk.*
87 |
88 | Licensed under the MIT-0 License.
89 |
--------------------------------------------------------------------------------
/samples/S3ToRdsSqoop/sqoop_activity.json:
--------------------------------------------------------------------------------
1 | {
2 | "objects": [
3 | {
4 | "myComment": "The default object sets global properties for the pipeline.",
5 | "id": "Default",
6 | "name": "Default",
7 | "failureAndRerunMode": "CASCADE",
8 | "resourceRole": "DataPipelineDefaultResourceRole",
9 | "role": "DataPipelineDefaultRole",
10 | "pipelineLogUri": "#{myPipelineLogUri}",
11 | "scheduleType": "ONDEMAND"
12 | },
13 | {
14 | "myComment": "An EMR cluster where the Sqoop job will be run. These parameters can be edited to create a more powerful cluster.",
15 | "id": "MyEmrCluster",
16 | "name": "MyEmrCluster",
17 | "type": "EmrCluster",
18 | "masterInstanceType": "#{myEmrMasterInstanceType}",
19 | "coreInstanceType": "#{myEmrCoreInstanceType}",
20 | "coreInstanceCount": "#{myEmrCoreInstanceCount}",
21 | "releaseLabel": "emr-5.0.0",
22 | "applications": ["sqoop"],
23 | "terminateAfter": "12 hours"
24 | },
25 | {
26 | "myComment": "S3 folder where the input CSV is stored.",
27 | "id": "S3InputDataLocation",
28 | "name": "S3InputDataLocation",
29 | "directoryPath": "#{myS3InputDataLocation}",
30 | "type": "S3DataNode"
31 | },
32 | {
33 | "myComment": "The shell command to invoke sqoop to copy the CSV into RDS. This template assumes that the target database is either MySQL or MariaDB and that the target table has already been created.",
34 | "id": "SqoopActivity",
35 | "name": "SqoopActivity",
36 | "runsOn": {
37 | "ref": "MyEmrCluster"
38 | },
39 | "input": {
40 | "ref": "S3InputDataLocation"
41 | },
42 | "type": "ShellCommandActivity",
43 | "command": "sqoop export --connect jdbc:mariadb://#{myRdsEndpoint}/#{myRdsDatabaseName} --driver org.mariadb.jdbc.Driver --table #{myRdsTableName} --username #{myRdsUsername} --password #{*myRdsPassword} --export-dir #{myS3InputDataLocation}"
44 | }
45 | ],
46 | "parameters": [
47 | {
48 | "id": "myEmrMasterInstanceType",
49 | "type": "String",
50 | "default": "m2.xlarge",
51 | "description": "The EC2 instance type to use for the master node in the EMR cluster"
52 | },
53 | {
54 | "id": "myEmrCoreInstanceType",
55 | "type": "String",
56 | "default": "m2.xlarge",
57 | "description": "The EC2 instance type to use for the core nodes in the EMR cluster"
58 | },
59 | {
60 | "id": "myEmrCoreInstanceCount",
61 | "type": "String",
62 | "default": "2",
63 | "description": "The number of core nodes to launch in the EMR cluster"
64 | },
65 | {
66 | "id": "myRdsEndpoint",
67 | "type": "String",
68 | "description": "DNS endpoint for target RDS instance. The value should include the port number. Example: test.xyzw.us-east-1.rds.amazonaws.com:3306"
69 | },
70 | {
71 | "id": "myRdsDatabaseName",
72 | "type": "String",
73 | "description": "Name of the target MySQL or MariaDB database"
74 | },
75 | {
76 | "id": "myRdsTableName",
77 | "type": "String",
78 | "description": "Name of the database table that the CSV will be imported into"
79 | },
80 | {
81 | "id": "myRdsUsername",
82 | "type": "String",
83 | "description": "User name to use to connect to RDS"
84 | },
85 | {
86 | "id": "*myRdsPassword",
87 | "type": "String",
88 | "description": "Password to use to connect to RDS"
89 | },
90 | {
91 | "id": "myS3InputDataLocation",
92 | "type": "AWS::S3::ObjectKey",
93 | "description": "S3 path to folder where the CSV data is stored"
94 | },
95 | {
96 | "id": "myPipelineLogUri",
97 | "type": "AWS::S3::ObjectKey",
98 | "description": "S3 folder where log data generated by this pipeline will be written"
99 | }
100 | ]
101 | }
102 |
--------------------------------------------------------------------------------
/samples/S3TsvFilesToRedshiftTablesIfReady/readme.md:
--------------------------------------------------------------------------------
1 | # Data Pipeline: Load Tab-Separated Files in S3 to Redshift if the Input Files Exist
2 |
3 | ## About the sample
4 | When imported, this pipeline definition instructs Redshift to load two TSV files from two given S3 locations into two different Redshift tables. The two copy activities are independent; each starts once its input S3 file exists. The table insert mode is OVERWRITE_EXISTING.
5 |
6 | ## Running this sample
7 | The pipeline requires the following user inputs:
8 |
9 | 1. Redshift connection info.
10 | 2. The S3 locations of the input TSV files.
11 | 3. The Redshift target table name for each S3 file.
12 | 4. The Redshift cluster security group id(s).
13 |
14 |
15 | ## Prerequisites
16 |
17 | You must have the AWS CLI and default IAM roles set up in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) of the base repository for instructions on how to do this.
18 | The Redshift cluster and target tables must already exist.
19 | The S3 TSV file locations are inputs to this pipeline; each RedshiftCopy activity will start only when its input S3 file exists.
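
If you want to confirm up front that the copy activities will be able to start, you can check that the input files are already present; a minimal check with the AWS CLI (the bucket and key below are placeholders) might look like:

```sh
# Placeholder path: substitute your own bucket and TSV key.
$> aws s3 ls s3://example-bucket/path/to/input-file-1.tsv
```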
20 |
21 |
22 | ## Run this sample pipeline using the AWS CLI
23 |
24 | ```sh
25 | $> aws datapipeline create-pipeline --name s3_if_ready_to_redshift --unique-id s3_if_ready_to_redshift
26 | ```
27 |
28 | You receive a pipelineId like this.
29 | ```sh
30 | # -----------------------------------------
31 | # | CreatePipeline |
32 | # +-------------+--------------------------+
33 | # | pipelineId | |
34 | # +-------------+--------------------------+
35 | ```
36 |
37 | ```sh
38 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://S3TsvFilesToRedshiftTablesIfReady.json --pipeline-id \
39 | --parameter-values myRedshiftUsername= \*myRedshiftPassword= \
40 | myRedshiftDbName= \
41 | myRedshiftSecurityGrpIds= \
42 | myRedshiftJdbcConnectStr= \
43 | myInputTsvFilesS3Loc_1= \
44 | myDestRedshiftTable_1=