├── .gitignore ├── .project ├── LICENSE.txt ├── README.md ├── requirements.txt ├── samples ├── DynamoDBExport │ ├── DynamoDBTableExport.json │ ├── DynamoDBtoCSV.json │ └── readme.md ├── DynamoDBExportJava │ ├── .gitignore │ ├── pom.xml │ ├── readme.md │ └── src │ │ └── main │ │ ├── java │ │ └── com │ │ │ └── amazonaws │ │ │ └── datapipelinesamples │ │ │ └── ddbexport │ │ │ ├── CommandLineArgParser.java │ │ │ ├── DDBExportPipelineCreator.java │ │ │ ├── DDBExportPipelineObjectCreator.java │ │ │ ├── Main.java │ │ │ └── PipelineMonitor.java │ │ └── resources │ │ └── log4j2.xml ├── DynamoDBImport │ ├── XMLtoDynamoDBImport.json │ └── readme.md ├── DynamoDBImportCSV │ ├── CSVtoDynamoDB.json │ └── README.md ├── DynamoDBToRedshiftConvertDataUsingHive │ ├── DynamoDBtoRedshiftHiveCSV.json │ └── README.md ├── EFSBackup │ ├── 1-Node-EFSBackupPipeline.json │ ├── 1-Node-EFSRestorePipeline.json │ ├── 2-Node-EFSBackupPipeline.json │ ├── 2-Node-EFSRestorePipeline.json │ ├── 3-Node-EFSBackupPipeline.json │ ├── README.md │ ├── efs-backup-end.sh │ ├── efs-backup-init.sh │ ├── efs-backup-rsync.sh │ ├── efs-backup.sh │ ├── efs-restore-rsync.sh │ └── efs-restore.sh ├── ExampleTemplate │ └── README.md ├── HadoopTerasort │ ├── README.md │ ├── TeraSortHadoopBenchmark.json │ ├── process-jhist.sh │ └── setup.py ├── InvokeLambda │ ├── README.md │ └── invokelambda.json ├── LoadTsvFilesInS3ToRedshift │ ├── LoadTsvFilesInS3ToRedshift.json │ └── README.md ├── OnDemandWithLamdaFunctions │ ├── lambda_function.py │ ├── ondemand.json │ └── readme.md ├── RDStoRedshiftSqoop │ ├── RDStoRedshiftSqoop.json │ ├── README.md │ └── setup │ │ ├── RdsToRedshiftSqoopSample.py │ │ ├── Setup.py │ │ ├── SetupPipelineDefinition.py │ │ ├── Teardown.py │ │ ├── Utilities.py │ │ └── setup.json ├── RDStoS3 │ ├── RDStoS3Pipeline.json │ ├── README.md │ └── setup │ │ ├── RDStoS3Sample.py │ │ ├── Setup.py │ │ ├── SetupPipelineDefinition.py │ │ ├── Teardown.py │ │ └── Utilities.py ├── RedshiftCopyActivityFromDynamoDBTable │ ├── RedshiftCopyActivityFromDynamoDBTable.json │ └── readme.md ├── RedshiftToRDS │ ├── RedshiftToRDS_WithoutRDSCreate.json │ ├── RedshiftToRDS_withTableCreate.json │ └── readme.md ├── S3ToRdsSqoop │ ├── README.md │ └── sqoop_activity.json ├── S3TsvFilesToRedshiftTablesIfReady │ ├── S3TsvFilesToRedshiftTablesIfReady.json │ └── readme.md ├── SQLActivityWithTimeout │ ├── README.md │ ├── pipeline.json │ └── setup │ │ ├── SQLActivitySample.py │ │ ├── Setup.py │ │ ├── SetupPipelineDefinition.py │ │ ├── Teardown.py │ │ ├── Utilities.py │ │ └── setup.json ├── ShellCommandWithFTP │ ├── README.md │ ├── data │ ├── ftpcommands │ └── pipeline.json ├── ShellCommandWithS3StagingDirectory │ ├── README.md │ ├── bashscript.sh │ └── shellcommandwiths3stagingdir.json ├── SimplePigActivity │ ├── pig_activity_sample.json │ └── readme.md ├── SparkPiMaximizeResourceAllocation │ ├── SparkPi-maximizeResource.json │ └── readme.md ├── billing │ ├── readme.md │ └── template.json ├── diagnose │ ├── README.md │ └── diagnose_pipeline.json ├── dynamo-db-export-as-csv │ ├── ddb-to-csv.json │ └── readme.md ├── dynamo-db-export │ ├── DynamoDB-export.json │ ├── example-parameters.json │ └── readme.md ├── dynamo-db-to-redshift │ ├── dynamo-db-to-redshift.json │ └── readme.md ├── dynamodb-to-dynamodb-crossregion │ ├── README.md │ └── pipeline.json ├── dynamodb-to-dynamodb │ ├── README.md │ └── pipeline.json ├── hadoop-activity │ ├── README.md │ └── hadoop-activity-world-count-fair.json ├── helloworld │ ├── README.md │ ├── helloworld.json │ └── setup.py ├── 
json-to-dynamodb │ ├── README.md │ ├── customers.json │ ├── definition.json │ └── json_to_ddb.q ├── kinesis │ ├── README.md │ ├── hive-scripts │ │ ├── create-table-from-kinesis-stream.q │ │ ├── script-runner.sh │ │ └── write-kinesis-to-s3.q │ ├── kinesis-to-s3.json │ └── setup │ │ ├── append-to-stream.sh │ │ └── setup-script.sh ├── oracle-backup │ ├── README.md │ ├── definition.json │ ├── parameters.json │ └── values.json └── rds-to-rds-copy │ └── readme.md └── setup ├── logo └── datapipelinelogo.jpeg ├── stacker.py └── stacker_tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | data-pipeline-samples 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2011-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | awscli==1.8.12 2 | boto3==1.1.4 3 | botocore==1.2.10 4 | colorama==0.3.3 5 | docutils==0.12 6 | futures==2.2.0 7 | jmespath==0.9.0 8 | pyasn1==0.1.9 9 | python-dateutil==2.4.2 10 | rsa==3.2 11 | six==1.10.0 12 | wheel==0.24.0 13 | -------------------------------------------------------------------------------- /samples/DynamoDBExport/DynamoDBTableExport.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "occurrences": "1", 5 | "period": "1 Day", 6 | "name": "RunOnce", 7 | "id": "DefaultSchedule", 8 | "type": "Schedule", 9 | "startAt": "FIRST_ACTIVATION_DATE_TIME", 10 | "maxActiveInstances" : "1" 11 | }, 12 | { 13 | "failureAndRerunMode": "CASCADE", 14 | "schedule": { 15 | "ref": "DefaultSchedule" 16 | }, 17 | "resourceRole": "DataPipelineDefaultResourceRole", 18 | "role": "DataPipelineDefaultRole", 19 | "pipelineLogUri": "s3://", 20 | "scheduleType": "cron", 21 | "name": "Default", 22 | "id": "Default" 23 | }, 24 | { 25 | "maximumRetries": "2", 26 | "name": "TableBackupActivity", 27 | "step": "s3://dynamodb-emr-us-east-1/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')},#{myDDBTableName},#{myDDBReadThroughputRatio}", 28 | "id": "TableBackupActivity", 29 | "runsOn": { 30 | "ref": "EmrClusterForBackup" 31 | }, 32 | "type": "EmrActivity" 33 | }, 34 | { 35 | "bootstrapAction": "s3://elasticmapreduce/bootstrap-actions/configure-hadoop, --yarn-key-value, yarn.nodemanager.resource.memory-mb=12800,--yarn-key-value,yarn.scheduler.minimum-allocation-mb=256,--mapred-key-value,mapreduce.map.memory.mb=500,--mapred-key-value,mapreduce.map.java.opts=-Xmx400M,--mapred-key-value,mapreduce.job.reduce.slowstart.completedmaps=1,--mapred-key-value,mapreduce.map.speculative=false", 36 | "name": "EmrClusterForBackup", 37 | "amiVersion": "3.8.0", 38 | "id": "EmrClusterForBackup", 39 | "type": "EmrCluster", 40 | "masterInstanceType": "m1.medium", 41 | "coreInstanceType": "#{myInstanceType}", 42 | "coreInstanceCount": "#{myInstanceCount}", 43 | "terminateAfter" : "12 hours" 44 | } 45 | ], 46 | "parameters": [ 47 | { 48 | "description": "OutputS3folder", 49 | "id": "myOutputS3Loc", 50 | "type": "AWS::S3::ObjectKey" 51 | }, 52 | { 53 | "default": "0.2", 54 | "watermark": "Valuebetween0.1-1.0", 55 | "description": "DynamoDB Read Throughput Ratio", 56 | "id": "myDDBReadThroughputRatio", 57 | "type": "Double" 58 | }, 59 | { 60 | "description": "DynamoDB Table Name", 61 | "id": "myDDBTableName", 62 | "type": "String" 63 | }, 64 | { 65 | "description": "Instance Type", 66 | "id": "myInstanceType", 67 | "watermark" : "Use m1.medium if Read Capacity Units for the job <= 900. Else use m3.xlarge", 68 | "type": "String", 69 | "default": "m3.xlarge" 70 | }, 71 | { 72 | "description": "Instance Count", 73 | "watermark" : "(Read Capacity Units / 300) for m1.medium if RCU <= 900. 
Else (RCU / 1500) for m3.xlarge", 74 | "id": "myInstanceCount", 75 | "type": "Integer", 76 | "default": "1" 77 | }, 78 | { 79 | "description" : "Burst IOPs", 80 | "watermark" : "Add IOPS to the DDB table by this percent for the duration of the export job", 81 | "id" : "myBurstIOPS", 82 | "type" : "Double", 83 | "default" : "0.0" 84 | } 85 | ] 86 | } 87 | -------------------------------------------------------------------------------- /samples/DynamoDBExport/DynamoDBtoCSV.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment" : "Activity used to run the hive script to export data to CSV", 5 | "output": { 6 | "ref": "DataNodeId_cnlSW" 7 | }, 8 | "input": { 9 | "ref": "DataNodeId_1ERqq" 10 | }, 11 | "name": "TableBackupActivity", 12 | "hiveScript": "DROP TABLE IF EXISTS tempHiveTable;\n\nDROP TABLE IF EXISTS s3TempTable;\n\nCREATE EXTERNAL TABLE tempHiveTable (#{myS3ColMapping})\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"#{myDDBTableName}\", \"dynamodb.column.mapping\" = \"#{myDDBTableColMapping}\");\n \nCREATE EXTERNAL TABLE s3TempTable (#{myS3ColMapping})\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'\nLOCATION '#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}';\n \nINSERT OVERWRITE TABLE s3TempTable SELECT * FROM tempHiveTable;", 13 | "runsOn": { "ref" : "EmrClusterForBackup" }, 14 | "id": "TableBackupActivity", 15 | "type": "HiveActivity" 16 | }, 17 | { 18 | "period": "1 days", 19 | "name": "Every 1 day", 20 | "id": "DefaultSchedule", 21 | "type": "Schedule", 22 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 23 | }, 24 | { 25 | "myComment" : "The DynamoDB table from which we need to export data from", 26 | "dataFormat": { 27 | "ref": "DDBExportFormat" 28 | }, 29 | "name": "DynamoDB", 30 | "id": "DataNodeId_1ERqq", 31 | "type": "DynamoDBDataNode", 32 | "tableName": "#{myDDBTableName}" 33 | }, 34 | { 35 | "failureAndRerunMode": "CASCADE", 36 | "schedule": { 37 | "ref": "DefaultSchedule" 38 | }, 39 | "resourceRole": "DataPipelineDefaultResourceRole", 40 | "role": "DataPipelineDefaultRole", 41 | "pipelineLogUri": "#{myLogUri}", 42 | "scheduleType": "cron", 43 | "name": "Default", 44 | "id": "Default" 45 | }, 46 | { 47 | "name": "EmrClusterForBackup", 48 | "coreInstanceType": "m1.medium", 49 | "coreInstanceCount": "1", 50 | "masterInstanceType": "m1.medium", 51 | "amiVersion": "3.3.2", 52 | "id": "EmrClusterForBackup", 53 | "type": "EmrCluster", 54 | "terminateAfter": "2 Hours" 55 | }, 56 | { 57 | "myComment" : "The S3 path to which we export data to", 58 | "directoryPath": "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}/", 59 | "dataFormat": { 60 | "ref": "DataFormatId_xqWRk" 61 | }, 62 | "name": "S3DataNode", 63 | "id": "DataNodeId_cnlSW", 64 | "type": "S3DataNode" 65 | }, 66 | { 67 | "myComment" : "Format for the S3 Path", 68 | "name": "DefaultDataFormat1", 69 | "column": "not_used STRING", 70 | "id": "DataFormatId_xqWRk", 71 | "type": "CSV" 72 | }, 73 | { 74 | "myComment" : "Format for the DynamoDB table", 75 | "name": "DDBExportFormat", 76 | "id": "DDBExportFormat", 77 | "column": "not_used STRING", 78 | "type": "DynamoDBExportDataFormat" 79 | } 80 | ], 81 | "parameters": [ 82 | { 83 | "description": "Output S3 folder", 84 | "id": "myOutputS3Loc", 85 | "type": "AWS::S3::ObjectKey" 86 | }, 87 | { 88 | "description": "DynamoDB table name", 89 | "id": "myDDBTableName", 90 | "type": 
"String" 91 | }, 92 | { 93 | "description": "S3 to DynamoDB Column Mapping", 94 | "id": "myDDBTableColMapping", 95 | "type": "String" 96 | }, 97 | { 98 | "description": "S3 Column Mappings", 99 | "id": "myS3ColMapping", 100 | "type": "String" 101 | }, 102 | { 103 | "description": "DataPipeline Log Uri", 104 | "id": "myLogUri", 105 | "type": "String" 106 | } 107 | ] 108 | } 109 | -------------------------------------------------------------------------------- /samples/DynamoDBExport/readme.md: -------------------------------------------------------------------------------- 1 | #DynamoDB to CSV export 2 | 3 | ##About the sample 4 | The pipeline definition is used for exporting DynamoDB data to a CSV format. 5 | 6 | ##Running the pipeline 7 | 8 | Example DynamoDB table with keys: customer_id, income, demographics, financial 9 | 10 | User needs to provide: 11 | 12 | 1. Output S3 folder: The s3 folder prefix to which the CSV data is to be exported. 13 | 2. DynamoDB read throughput ratio: The throughput to be used for the export operation. 14 | 3. DynamoDB table name: The table name from which we need to export the data. 15 | 4. S3 Column Mappings: A comma seperated column definitions. For example, customer_id string, income string, demographics string, financial string 16 | 5. S3 to DynamoDB Column Mapping: A comma separated mapping of S3 to DynamoDB for e.g. customer_id:customer_id,income:income,demographics:demographics,financial:financial. Please take care of not using spaces in between the commas. 17 | 6. Log Uri: S3 log path to capture the pipeline logs. 18 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | *.iml 3 | dependency-reduced-pom.xml 4 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | DynamoDBExportSample 8 | DynamoDBExportSample 9 | 0.1 10 | 11 | 12 | 13 | com.amazonaws 14 | aws-java-sdk 15 | 1.10.33 16 | 17 | 18 | com.google.guava 19 | guava 20 | 19.0-rc2 21 | 22 | 23 | commons-cli 24 | commons-cli 25 | 1.3.1 26 | 27 | 28 | org.apache.logging.log4j 29 | log4j-api 30 | 2.4.1 31 | 32 | 33 | org.apache.logging.log4j 34 | log4j-core 35 | 2.4.1 36 | 37 | 38 | 39 | 40 | 41 | 42 | org.apache.maven.plugins 43 | maven-shade-plugin 44 | 2.3 45 | 46 | 47 | package 48 | 49 | shade 50 | 51 | 52 | 53 | 54 | com.amazonaws.datapipelinesamples.ddbexport.Main 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | org.apache.maven.plugins 63 | maven-compiler-plugin 64 | 3.3 65 | 66 | 1.8 67 | 1.8 68 | 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/readme.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline DynamoDB Export Java Sample 2 | 3 | ## Overview 4 | 5 | This sample makes it easy to create a pipeline that uses the latest DynamoDB export template EMR activity. You provide 6 | parameters and the tool will create the pipeline and run and monitor it once so you can verify that it is healthy. 7 | 8 | This sample also provides an example application using the AWS Data Pipeline Java SDK. It demonstrates how to 9 | create, run and monitor a pipeline. 
10 | 11 | ## Prerequisites 12 | 13 | You must have the AWS CLI and the default IAM roles set up in order to run the sample. Please see the 14 | [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions on how to do this. 15 | 16 | 17 | ## Getting started 18 | 19 | Build: mvn clean package

20 | View parameters description: java -jar path/to/DynamoDBExportSample-0.1.jar help

21 | Run: java -jar path/to/DynamoDBExportSample-0.1.jar <-yourParam foo> 22 | 23 | ## Example 24 | 25 | Create and run on a pipeline that runs once per day: 26 | 27 | java -jar /Users/foobar/DynamoDBExportJava/target/DynamoDBExportSample-0.1.jar -credentialsFile 28 | /Users/foobar/.aws/credentials -myDDBTableName footable -myOutputS3Location s3://foobar/ddb-exports -schedule daily 29 | -myLogsS3Location s3://foobar/logs -myDDBRegion us-east-1 30 | 31 | Create and run on a pipeline that runs once: 32 | 33 | java -jar /Users/foobar/DynamoDBExportJava/target/DynamoDBExportSample-0.1.jar -credentialsFile 34 | /Users/foobar/.aws/credentials -myDDBTableName footable -myOutputS3Location s3://foobar/ddb-exports -schedule once 35 | -myLogsS3Location s3://foobar/logs -myDDBRegion us-east-1 36 | 37 | ## Disclaimer 38 | 39 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for 40 | production environments. Users should carefully inspect code samples before running them. 41 | 42 | Use at your own risk. 43 | 44 | Licensed under the MIT-0 License. 45 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/src/main/java/com/amazonaws/datapipelinesamples/ddbexport/CommandLineArgParser.java: -------------------------------------------------------------------------------- 1 | package com.amazonaws.datapipelinesamples.ddbexport; 2 | 3 | import org.apache.commons.cli.CommandLine; 4 | import org.apache.commons.cli.CommandLineParser; 5 | import org.apache.commons.cli.DefaultParser; 6 | import org.apache.commons.cli.HelpFormatter; 7 | import org.apache.commons.cli.Options; 8 | import org.apache.commons.cli.ParseException; 9 | import org.apache.logging.log4j.LogManager; 10 | import org.apache.logging.log4j.Logger; 11 | 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | 15 | public class CommandLineArgParser { 16 | private static final Logger logger = LogManager.getLogger(CommandLineArgParser.class); 17 | 18 | public static Map parseParameters(final String[] args) { 19 | Options params = new Options(); 20 | params.addOption("myDDBTableName", true, "Dynamo DB source table that will be exported (REQUIRED)"); 21 | params.addOption("myOutputS3Location", true, "S3 bucket where the export will be stored (REQUIRED)"); 22 | params.addOption("myLogsS3Location", true, "S3 bucket where the logs will be stored (REQUIRED)"); 23 | params.addOption("schedule", true, "Schedule to run pipeline on. Options are: once or daily (REQUIRED)"); 24 | params.addOption("credentialsFile", true, "Path to AWS credentials file. ex: /Users/foo/.aws/credentials " + 25 | "(REQUIRED)"); 26 | params.addOption("myDDBRegion", true, "Region to run pipeline in. 
Default: us-east-1 (Optional)"); 27 | 28 | return getParamsMap(args, params); 29 | } 30 | 31 | private static Map getParamsMap(final String[] args, final Options params) { 32 | CommandLineParser parser = new DefaultParser(); 33 | CommandLine cmd; 34 | Map paramsMap = new HashMap<>(); 35 | 36 | try { 37 | cmd = parser.parse(params, args); 38 | addToMapIfPreset(cmd, "credentialsFile", true, paramsMap); 39 | addToMapIfPreset(cmd, "myDDBTableName", true, paramsMap); 40 | addToMapIfPreset(cmd, "myOutputS3Location", true, paramsMap); 41 | addToMapIfPreset(cmd, "myLogsS3Location", true, paramsMap); 42 | addToMapIfPreset(cmd, "schedule", true, paramsMap); 43 | addToMapIfPreset(cmd, "myDDBRegion", false, paramsMap); 44 | } catch (ParseException | RuntimeException e) { 45 | logger.error(e.getMessage()); 46 | printHelp(params); 47 | throw new RuntimeException(); 48 | } 49 | 50 | return paramsMap; 51 | } 52 | 53 | private static void printHelp(final Options params) { 54 | HelpFormatter formatter = new HelpFormatter(); 55 | formatter.printHelp("maven", params); 56 | } 57 | 58 | private static void addToMapIfPreset(final CommandLine cmd, final String paramName, final boolean required, 59 | final Map paramsMap) { 60 | if(cmd.hasOption(paramName)) { 61 | paramsMap.put(paramName, cmd.getOptionValue(paramName)); 62 | } else if (required) { 63 | logger.error("Unable to find required parameter: " + paramName); 64 | throw new RuntimeException(); 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/src/main/java/com/amazonaws/datapipelinesamples/ddbexport/Main.java: -------------------------------------------------------------------------------- 1 | package com.amazonaws.datapipelinesamples.ddbexport; 2 | 3 | import com.amazonaws.auth.AWSCredentials; 4 | import com.amazonaws.auth.profile.ProfileCredentialsProvider; 5 | import com.amazonaws.services.datapipeline.DataPipelineClient; 6 | 7 | import java.util.Map; 8 | 9 | public class Main { 10 | 11 | private static DataPipelineClient dataPipelineClient; 12 | 13 | public static void main(String args[]) { 14 | Map params = CommandLineArgParser.parseParameters(args); 15 | 16 | dataPipelineClient = getClient(params.get("credentialsFile")); 17 | 18 | String pipelineId = DDBExportPipelineCreator.createPipeline(dataPipelineClient); 19 | 20 | DDBExportPipelineCreator.putPipelineDefinition(dataPipelineClient, pipelineId, params); 21 | 22 | DDBExportPipelineCreator.activatePipeline(dataPipelineClient, pipelineId); 23 | 24 | PipelineMonitor.monitorPipelineUntilCompleted(dataPipelineClient, pipelineId, "TableBackupActivity"); 25 | } 26 | 27 | private static DataPipelineClient getClient(final String profileName) { 28 | AWSCredentials credentials = new ProfileCredentialsProvider(profileName, "default").getCredentials(); 29 | return new DataPipelineClient(credentials); 30 | } 31 | } -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/src/main/java/com/amazonaws/datapipelinesamples/ddbexport/PipelineMonitor.java: -------------------------------------------------------------------------------- 1 | package com.amazonaws.datapipelinesamples.ddbexport; 2 | 3 | import com.amazonaws.services.datapipeline.DataPipelineClient; 4 | import com.amazonaws.services.datapipeline.model.DescribeObjectsRequest; 5 | import com.amazonaws.services.datapipeline.model.DescribeObjectsResult; 6 | import com.amazonaws.services.datapipeline.model.Field; 7 | import 
com.amazonaws.services.datapipeline.model.QueryObjectsRequest; 8 | import com.amazonaws.services.datapipeline.model.QueryObjectsResult; 9 | import org.apache.logging.log4j.LogManager; 10 | import org.apache.logging.log4j.Logger; 11 | 12 | import java.util.Timer; 13 | import java.util.TimerTask; 14 | import java.util.stream.Collectors; 15 | 16 | public class PipelineMonitor { 17 | 18 | private static final Logger logger = LogManager.getLogger(DDBExportPipelineCreator.class); 19 | 20 | public static void monitorPipelineUntilCompleted(final DataPipelineClient dataPipelineClient, 21 | final String pipelineId, final String activityName) { 22 | Timer timer = new Timer(); 23 | int thirtySeconds = 30 * 1000; 24 | timer.schedule(new TimerTask() { 25 | @Override 26 | public void run() { 27 | QueryObjectsRequest queryObjectsRequest = new QueryObjectsRequest().withPipelineId(pipelineId) 28 | .withSphere("INSTANCE"); 29 | QueryObjectsResult result = dataPipelineClient.queryObjects(queryObjectsRequest); 30 | 31 | if(result.getIds().size() <= 0) { 32 | logger.info("Creating pipeline object execution graph"); 33 | return; 34 | } 35 | 36 | String emrActivityId = result.getIds().stream().filter(r -> r.contains(activityName)) 37 | .collect(Collectors.joining("\n")); 38 | DescribeObjectsResult describeObjectsResult = dataPipelineClient 39 | .describeObjects(new DescribeObjectsRequest().withObjectIds(emrActivityId) 40 | .withPipelineId(pipelineId)); 41 | 42 | String status = ""; 43 | for(Field field : describeObjectsResult.getPipelineObjects().get(0).getFields()) { 44 | if (field.getKey().equals("@status")) { 45 | logger.info(field.getKey() + "=" + field.getStringValue()); 46 | status = field.getStringValue(); 47 | } 48 | } 49 | 50 | if (status.equals("CANCELED") || status.equals("FINISHED") || status.equals("FAILED")) { 51 | this.cancel(); 52 | timer.cancel(); 53 | } 54 | } 55 | }, 0, thirtySeconds); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /samples/DynamoDBExportJava/src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /samples/DynamoDBImport/XMLtoDynamoDBImport.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "name": "EmrClusterForBackup", 5 | "coreInstanceType": "m1.medium", 6 | "coreInstanceCount": "1", 7 | "masterInstanceType": "m1.medium", 8 | "amiVersion": "3.3.2", 9 | "id": "EmrClusterForBackup", 10 | "type": "EmrCluster", 11 | "terminateAfter": "2 Hours" 12 | }, 13 | { 14 | "period": "1 days", 15 | "name": "Every 1 day", 16 | "id": "DefaultSchedule", 17 | "type": "Schedule", 18 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 19 | }, 20 | { 21 | "name": "DefaultDataFormat1", 22 | "column": "not_used STRING", 23 | "id": "DataFormatId_xqWRk", 24 | "myComment": "Format for the S3 Path", 25 | "type": "CSV" 26 | }, 27 | { 28 | "failureAndRerunMode": "CASCADE", 29 | "schedule": { 30 | "ref": "DefaultSchedule" 31 | }, 32 | "resourceRole": "DataPipelineDefaultResourceRole", 33 | "role": "DataPipelineDefaultRole", 34 | "pipelineLogUri": "#{myLogUri}", 35 | "scheduleType": "cron", 36 | "name": "Default", 37 | "id": "Default" 38 | }, 39 | { 40 | "name": "ShellCommandActivityCp", 41 | "runsOn": { "ref" : "EmrClusterForBackup" }, 42 | "id": "ActivityId_zrRQz", 43 | "type": "ShellCommandActivity", 44 
| "command": "aws s3 cp s3://data-pipeline-samples/dynamodbxml/input/serde.xml /home/hadoop/serde-#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}.xml" 45 | }, 46 | { 47 | "dataFormat": { 48 | "ref": "DDBExportFormat" 49 | }, 50 | "name": "DynamoDB", 51 | "id": "DataNodeId_1ERqq", 52 | "type": "DynamoDBDataNode", 53 | "myComment": "The DynamoDB table from which we need to export data from", 54 | "tableName": "customers" 55 | }, 56 | { 57 | "column": "not_used STRING", 58 | "name": "DDBExportFormat", 59 | "id": "DDBExportFormat", 60 | "type": "DynamoDBExportDataFormat", 61 | "myComment": "Format for the DynamoDB table" 62 | }, 63 | { 64 | "directoryPath": "s3://data-pipeline-samples/dynamodbxml/input", 65 | "dataFormat": { 66 | "ref": "DataFormatId_xqWRk" 67 | }, 68 | "name": "S3DataNode", 69 | "id": "DataNodeId_cnlSW", 70 | "type": "S3DataNode", 71 | "myComment": "The S3 path to which we export data to" 72 | }, 73 | { 74 | "output": { 75 | "ref": "DataNodeId_1ERqq" 76 | }, 77 | "input": { 78 | "ref": "DataNodeId_cnlSW" 79 | }, 80 | "dependsOn": { 81 | "ref": "ActivityId_zrRQz" 82 | }, 83 | "name": "TableBackupActivity", 84 | "hiveScript": "add jar s3://data-pipeline-samples/dynamodbxml/hivexmlserde-1.0.5.3.jar;\nDROP TABLE IF EXISTS xml_bank;\nCREATE EXTERNAL TABLE xml_bank(customer_id STRING, income string, demographics string, financial string)\nROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'\nWITH SERDEPROPERTIES (\n\"column.xpath.customer_id\"=\"/record/@customer_id\",\n\"column.xpath.income\"=\"/record/income/text()\",\n\"column.xpath.demographics\"=\"/record/demographics/*\",\n\"column.xpath.financial\"=\"/record/financial/*\"\n)\nSTORED AS\nINPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat'\nTBLPROPERTIES (\n\"xmlinput.start\"=\"\"\n);\nLOAD DATA LOCAL inpath '/home/hadoop/serde-#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}.xml' overwrite into table xml_bank;\nDROP TABLE IF EXISTS hiveTableName;\nCREATE EXTERNAL TABLE hiveTableName (col1 string, col2 string, col3 string, col4 string)\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"customers\", \n\"dynamodb.column.mapping\" = \"col1:customer_id,col2:income,col3:demographics,col4:financial\"); \nINSERT OVERWRITE TABLE hiveTableName SELECT * FROM xml_bank;", 85 | "runsOn": { "ref" : "EmrClusterForBackup" }, 86 | "id": "TableBackupActivity", 87 | "type": "HiveActivity", 88 | "myComment": "Activity used to run the hive script to export data to CSV" 89 | } 90 | ] 91 | } 92 | -------------------------------------------------------------------------------- /samples/DynamoDBImport/readme.md: -------------------------------------------------------------------------------- 1 | #XML to DynamoDB Import 2 | 3 | ##Running the sample pipeline 4 | The json format could be either directly imported in the Console -> Create Pipeline or used in the aws datapipeline cli.
5 | The pipeline definition copies an example XML file from s3://data-pipeline-samples/dynamodbxml/input/serde.xml to the local filesystem. This step is required for creating a temporary XML table using Hive. The Hive script is configured to run against a DynamoDB table with the keys "customer_id, financial, income, demographics". Finally, it imports the data from the temporary XML table into DynamoDB.
6 | The data from the XML file is parsed using the Hive XML SerDe; each output column is mapped to an XPath expression, as shown in the excerpt below.
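For readability, this is the core of the Hive DDL that is embedded (JSON-escaped) in the hiveScript field of the pipeline definition above; each column.xpath property is an XPath expression evaluated against a record element of the input XML (excerpt only, not the full script):

```sql
-- Excerpt of the hiveScript in XMLtoDynamoDBImport.json, reformatted for readability.
CREATE EXTERNAL TABLE xml_bank(customer_id STRING, income string, demographics string, financial string)
ROW FORMAT SERDE 'com.ibm.spss.hive.serde2.xml.XmlSerDe'
WITH SERDEPROPERTIES (
  "column.xpath.customer_id"="/record/@customer_id",
  "column.xpath.income"="/record/income/text()",
  "column.xpath.demographics"="/record/demographics/*",
  "column.xpath.financial"="/record/financial/*"
)
STORED AS
  INPUTFORMAT 'com.ibm.spss.hive.serde2.xml.XmlInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat';
```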
7 | The end result is that the data is available in the DynamoDB table.
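As a concrete starting point, the commands below are a minimal sketch (not part of the original sample) of registering and activating this definition with the AWS CLI. They assume the definition is saved locally as XMLtoDynamoDBImport.json and that the default Data Pipeline IAM roles exist; the pipeline name and S3 log path are placeholders, and if the #{myLogUri} reference is not resolved via --parameter-values in your setup, replace it with a literal S3 path in the JSON before uploading.

```sh
# Hypothetical names and paths; adjust to your account before running.
aws datapipeline create-pipeline --name xml-to-dynamodb-import --unique-id xml-to-dynamodb-import
aws datapipeline put-pipeline-definition \
    --pipeline-id <pipeline-id-from-create-pipeline> \
    --pipeline-definition file://XMLtoDynamoDBImport.json \
    --parameter-values myLogUri=s3://your-bucket/datapipeline-logs/
aws datapipeline activate-pipeline --pipeline-id <pipeline-id-from-create-pipeline>
```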
8 | 9 | 10 | -------------------------------------------------------------------------------- /samples/DynamoDBImportCSV/CSVtoDynamoDB.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment" : "Activity used to run the hive script to import CSV data", 5 | "output": { 6 | "ref": "DataNodeId_cnlSW" 7 | }, 8 | "input": { 9 | "ref": "DataNodeId_1ERqq" 10 | }, 11 | "name": "TableRestoreActivity", 12 | "hiveScript": "DROP TABLE IF EXISTS tempHiveTable;\n\nDROP TABLE IF EXISTS s3TempTable;\n\nCREATE EXTERNAL TABLE tempHiveTable (#{myDDBColDefn})\nSTORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' \nTBLPROPERTIES (\"dynamodb.table.name\" = \"#{myDDBTableName}\", \"dynamodb.column.mapping\" = \"#{myDDBTableColMapping}\");\n \nCREATE EXTERNAL TABLE s3TempTable (#{myS3ColMapping})\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\\n' LOCATION '#{myInputS3Loc}';\n \nINSERT OVERWRITE TABLE tempHiveTable SELECT * FROM s3TempTable;", 13 | "id": "TableRestoreActivity", 14 | "runsOn": { "ref" : "EmrClusterForRestore" }, 15 | "stage": "false", 16 | "type": "HiveActivity" 17 | }, 18 | { 19 | "myComment" : "The DynamoDB table from which we need to import data from", 20 | "dataFormat": { 21 | "ref": "DDBExportFormat" 22 | }, 23 | "name": "DynamoDB", 24 | "id": "DataNodeId_1ERqq", 25 | "type": "DynamoDBDataNode", 26 | "tableName": "#{myDDBTableName}" 27 | }, 28 | { 29 | "failureAndRerunMode": "CASCADE", 30 | "resourceRole": "DataPipelineDefaultResourceRole", 31 | "role": "DataPipelineDefaultRole", 32 | "pipelineLogUri": "#{myLogUri}", 33 | "scheduleType": "ONDEMAND", 34 | "name": "Default", 35 | "id": "Default" 36 | }, 37 | { 38 | "name": "EmrClusterForRestore", 39 | "coreInstanceType": "m1.medium", 40 | "coreInstanceCount": "1", 41 | "masterInstanceType": "m1.medium", 42 | "releaseLabel": "emr-4.4.0", 43 | "id": "EmrClusterForRestore", 44 | "type": "EmrCluster", 45 | "terminateAfter": "2 Hours" 46 | }, 47 | { 48 | "myComment" : "The S3 path from which we import data from", 49 | "directoryPath": "#{myInputS3Loc}", 50 | "dataFormat": { 51 | "ref": "DataFormatId_xqWRk" 52 | }, 53 | "name": "S3DataNode", 54 | "id": "DataNodeId_cnlSW", 55 | "type": "S3DataNode" 56 | }, 57 | { 58 | "myComment" : "Format for the S3 Path", 59 | "name": "DefaultDataFormat1", 60 | "column": "not_used STRING", 61 | "id": "DataFormatId_xqWRk", 62 | "type": "CSV" 63 | }, 64 | { 65 | "myComment" : "Format for the DynamoDB table", 66 | "name": "DDBExportFormat", 67 | "id": "DDBExportFormat", 68 | "column": "not_used STRING", 69 | "type": "DynamoDBExportDataFormat" 70 | } 71 | ], 72 | "parameters": [ 73 | { 74 | "description": "Input S3 folder", 75 | "id": "myInputS3Loc", 76 | "default": "s3://datapipeline-sample-csv/", 77 | "type": "AWS::S3::ObjectKey" 78 | }, 79 | { 80 | "description": "DynamoDB table name", 81 | "id": "myDDBTableName", 82 | "type": "String" 83 | }, 84 | { 85 | "description": "S3 to DynamoDB Column Mapping", 86 | "id": "myDDBTableColMapping", 87 | "default" : "id:id,age:age,job:job,marital:marital,education:education,default:default,housing:housing,loan:loan,contact:contact,month:month,day_of_week:day_of_week,duration:duration,campaign:campaign,pdays:pdays,previous:previous,poutcome:poutcome,emp_var_rate:emp_var_rate,cons_price_idx:cons_price_idx,cons_conf_idx:cons_conf_idx,euribor3m:euribor3m,nr_employed:nr_employed,y:y", 88 | "type": "String" 89 | }, 90 | { 91 | "description": "S3 Column Mappings", 92 | "id": 
"myS3ColMapping", 93 | "default" : "id string,age int,job string,marital string,education string,default string,housing string,loan string,contact string,month string,day_of_week string,duration int,campaign int,pdays int,previous int,poutcome string,emp_var_rate double,cons_price_idx double,cons_conf_idx double,euribor3m double,nr_employed double,y int", 94 | "type": "String" 95 | }, 96 | { 97 | "description": "DynamoDB Column Mappings", 98 | "id": "myDDBColDefn", 99 | "default" : "id string,age bigint,job string,marital string,education string,default string,housing string,loan string,contact string,month string,day_of_week string,duration bigint,campaign bigint,pdays bigint,previous bigint,poutcome string,emp_var_rate double,cons_price_idx double,cons_conf_idx double,euribor3m double,nr_employed double,y bigint", 100 | "type": "String" 101 | }, 102 | { 103 | "description": "DataPipeline Log Uri", 104 | "id": "myLogUri", 105 | "type": "AWS::S3::ObjectKey" 106 | } 107 | ] 108 | } 109 | -------------------------------------------------------------------------------- /samples/DynamoDBImportCSV/README.md: -------------------------------------------------------------------------------- 1 | #DynamoDB to CSV import 2 | 3 | ##About the sample 4 | The pipeline definition is used to import DynamoDB data to a CSV format. 5 | 6 | ##Running the pipeline 7 | 8 | Example DynamoDB table with keys: id 9 | 10 | User needs to provide: 11 | 12 | 1. Input S3 folder: The s3 folder prefix from which the CSV data is to be imported. 13 | 2. DynamoDB read throughput ratio: The throughput to be used for the import operation. 14 | 3. DynamoDB table name: The table name from which we need to import the data. 15 | 4. S3 Column Mappings: A comma seperated column definitions. For example, customer_id string, income string, demographics string, financial string 16 | 4. Dynamodb Column Mappings: A comma seperated column definitions. For example, customer_id string, income string, demographics string, financial string 17 | 5. S3 to DynamoDB Column Mapping: A comma separated mapping of S3 to DynamoDB for e.g. customer_id:customer_id,income:income,demographics:demographics,financial:financial. Please take care of not using spaces in between the commas. 18 | 6. Log Uri: S3 log path to capture the pipeline logs. 
19 | -------------------------------------------------------------------------------- /samples/EFSBackup/1-Node-EFSBackupPipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects" : [ 3 | { 4 | "id" : "Default", 5 | "scheduleType" : "cron", 6 | "failureAndRerunMode" : "CASCADE", 7 | "schedule" : { 8 | "ref" : "DefaultSchedule" 9 | }, 10 | "name" : "Default", 11 | "role" : "DataPipelineDefaultRole", 12 | "resourceRole" : "DataPipelineDefaultResourceRole" 13 | }, 14 | { 15 | "id" : "EC2ResourceObj", 16 | "terminateAfter" : "70 Minutes", 17 | "instanceType" : "#{myInstanceType}", 18 | "name" : "EC2ResourceObj", 19 | "type" : "Ec2Resource", 20 | "securityGroupIds" : [ 21 | "#{mySrcSecGroupID}", 22 | "#{myBackupSecGroupID}" 23 | ], 24 | "subnetId" : "#{mySubnetID}", 25 | "associatePublicIpAddress" : "true", 26 | "imageId" : "#{myImageID}" 27 | }, 28 | { 29 | "id" : "DefaultSchedule", 30 | "name" : "Every Day", 31 | "startAt" : "FIRST_ACTIVATION_DATE_TIME", 32 | "type" : "Schedule", 33 | "period" : "1 Days" 34 | }, 35 | { 36 | "id" : "ShellCommandActivityObj", 37 | "name" : "ShellCommandActivityObj", 38 | "runsOn" : { 39 | "ref" : "EC2ResourceObj" 40 | }, 41 | "command" : "#{myShellCmd}", 42 | "scriptArgument" : [ 43 | "#{myEfsSource}", 44 | "#{myEfsBackup}", 45 | "#{myInterval}", 46 | "#{myRetainedBackups}", 47 | "#{myEfsID}" 48 | ], 49 | "type" : "ShellCommandActivity", 50 | "stage" : "true" 51 | } 52 | ], 53 | "parameters" : [ 54 | { 55 | "id" : "myShellCmd", 56 | "default" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-backup.sh\nchmod a+x efs-backup.sh\n./efs-backup.sh $1 $2 $3 $4 $5", 57 | "description" : "Shell command to run.", 58 | "type" : "String" 59 | }, 60 | { 61 | "id" : "myInstanceType", 62 | "default" : "m3.medium", 63 | "description" : "Instance type for creating backups.", 64 | "allowedValues" : [ 65 | "t1.micro", 66 | "m3.medium", 67 | "m3.large", 68 | "m3.xlarge", 69 | "m3.2xlarge", 70 | "c3.large", 71 | "c3.xlarge", 72 | "c3.2xlarge", 73 | "c3.4xlarge", 74 | "c3.8xlarge" 75 | ], 76 | "type" : "String" 77 | }, 78 | { 79 | "id" : "mySubnetID", 80 | "default" : "subnet-1234abcd", 81 | "description" : "VPC subnet for your backup EC2 instance (ideally the same subnet used for the production EFS mount point).", 82 | "type" : "String" 83 | }, 84 | { 85 | "id" : "mySrcSecGroupID", 86 | "default" : "sg-1111111b", 87 | "description" : "Security group that can connect to the Production EFS mount point.", 88 | "type" : "String" 89 | }, 90 | { 91 | "id" : "myBackupSecGroupID", 92 | "default" : "sg-9999999b", 93 | "description" : "Security group that can connect to the Backup EFS mount point.", 94 | "type" : "String" 95 | }, 96 | { 97 | "id" : "myInterval", 98 | "default" : "daily", 99 | "description" : "Interval for backups.", 100 | "allowedValues" : [ 101 | "hourly", 102 | "daily", 103 | "weekly", 104 | "monthly" 105 | ], 106 | "type" : "String" 107 | }, 108 | { 109 | "id" : "myRetainedBackups", 110 | "default" : "7", 111 | "description" : "Number of backups to retain.", 112 | "type" : "Integer" 113 | }, 114 | { 115 | "id" : "myEfsID", 116 | "default" : "backup-fs-12345678", 117 | "description" : "Name for the directory that will contain your backups.", 118 | "type" : "String" 119 | }, 120 | { 121 | "id" : "myEfsSource", 122 | "default" : "10.0.1.32:/", 123 | "description" : "Production EFS mount target IP address.", 124 | "type" : "String" 125 | }, 126 | { 127 | "id" : 
"myEfsBackup", 128 | "default" : "10.0.1.75:/", 129 | "description" : "Backup EFS mount target IP address.", 130 | "type" : "String" 131 | }, 132 | { 133 | "id" : "myImageID", 134 | "default" : "ami-12345678", 135 | "description" : "AMI ID for the EC2 instance.", 136 | "type" : "String" 137 | } 138 | ] 139 | } -------------------------------------------------------------------------------- /samples/EFSBackup/1-Node-EFSRestorePipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects" : [ 3 | { 4 | "id" : "Default", 5 | "scheduleType" : "cron", 6 | "failureAndRerunMode" : "CASCADE", 7 | "schedule" : { 8 | "ref" : "DefaultSchedule" 9 | }, 10 | "name" : "Default", 11 | "role" : "DataPipelineDefaultRole", 12 | "resourceRole" : "DataPipelineDefaultResourceRole" 13 | }, 14 | { 15 | "id" : "EC2ResourceObj", 16 | "terminateAfter" : "70 Minutes", 17 | "instanceType" : "#{myInstanceType}", 18 | "name" : "EC2ResourceObj", 19 | "type" : "Ec2Resource", 20 | "securityGroupIds" : [ 21 | "#{mySrcSecGroupID}", 22 | "#{myBackupSecGroupID}" 23 | ], 24 | "subnetId" : "#{mySubnetID}", 25 | "associatePublicIpAddress" : "true", 26 | "imageId" : "#{myImageID}" 27 | }, 28 | { 29 | "id" : "DefaultSchedule", 30 | "name" : "Every Day", 31 | "startAt" : "FIRST_ACTIVATION_DATE_TIME", 32 | "type" : "Schedule", 33 | "occurrences" : "1", 34 | "period" : "1 Days" 35 | }, 36 | { 37 | "id" : "ShellCommandActivityObj", 38 | "name" : "ShellCommandActivityObj", 39 | "runsOn" : { 40 | "ref" : "EC2ResourceObj" 41 | }, 42 | "command" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-restore.sh\nchmod a+x efs-restore.sh\n./efs-restore.sh $1 $2 $3 $4 $5", 43 | "scriptArgument" : [ 44 | "#{myEfsSource}", 45 | "#{myEfsBackup}", 46 | "#{myInterval}", 47 | "#{myBackup}", 48 | "#{myEfsID}" 49 | ], 50 | "type" : "ShellCommandActivity", 51 | "stage" : "true" 52 | } 53 | ], 54 | "parameters" : [ 55 | { 56 | "id" : "myInstanceType", 57 | "default" : "m3.large", 58 | "description" : "Instance type for performing the restore.", 59 | "allowedValues" : [ 60 | "t1.micro", 61 | "m3.medium", 62 | "m3.large", 63 | "m3.xlarge", 64 | "m3.2xlarge", 65 | "c3.large", 66 | "c3.xlarge", 67 | "c3.2xlarge", 68 | "c3.4xlarge", 69 | "c3.8xlarge" 70 | ], 71 | "type" : "String" 72 | }, 73 | { 74 | "id" : "mySubnetID", 75 | "default" : "subnet-1234abcd", 76 | "description" : "VPC subnet for your restoration EC2 instance (ideally the same subnet used for the backup EFS mount point).", 77 | "type" : "String" 78 | }, 79 | { 80 | "id" : "mySrcSecGroupID", 81 | "default" : "sg-1111111b", 82 | "description" : "Security group that can connect to the Production EFS mount point.", 83 | "type" : "String" 84 | }, 85 | { 86 | "id" : "myBackupSecGroupID", 87 | "default" : "sg-9999999b", 88 | "description" : "Security group that can connect to the Backup EFS mount point.", 89 | "type" : "String" 90 | }, 91 | { 92 | "id" : "myInterval", 93 | "default" : "daily", 94 | "description" : "Interval that you chose for the backup your going to restore.", 95 | "allowedValues" : [ 96 | "hourly", 97 | "daily", 98 | "weekly", 99 | "monthly" 100 | ], 101 | "type" : "String" 102 | }, 103 | { 104 | "id" : "myBackup", 105 | "default" : "0", 106 | "description" : "Backup number to restore (0 = the most recent backup).", 107 | "type" : "Integer" 108 | }, 109 | { 110 | "id" : "myEfsID", 111 | "default" : "backup-fs-12345678", 112 | "description" : "Name for the directory that already 
contains your backups.", 113 | "type" : "String" 114 | }, 115 | { 116 | "id" : "myEfsSource", 117 | "default" : "10.0.1.32:/", 118 | "description" : "Production EFS mount target IP address.", 119 | "type" : "String" 120 | }, 121 | { 122 | "id" : "myEfsBackup", 123 | "default" : "10.0.1.75:/", 124 | "description" : "Backup EFS mount target IP address.", 125 | "type" : "String" 126 | }, 127 | { 128 | "id" : "myImageID", 129 | "default" : "ami-12345678", 130 | "description" : "AMI ID for the EC2 instance.", 131 | "type" : "String" 132 | } 133 | ] 134 | } -------------------------------------------------------------------------------- /samples/EFSBackup/2-Node-EFSRestorePipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects" : [ 3 | { 4 | "id" : "Default", 5 | "scheduleType" : "cron", 6 | "failureAndRerunMode" : "CASCADE", 7 | "schedule" : { 8 | "ref" : "DefaultSchedule" 9 | }, 10 | "name" : "Default", 11 | "role" : "DataPipelineDefaultRole", 12 | "resourceRole" : "DataPipelineDefaultResourceRole" 13 | }, 14 | { 15 | "id" : "EC2Resource1", 16 | "terminateAfter" : "70 Minutes", 17 | "instanceType" : "#{myInstanceType}", 18 | "name" : "EC2Resource1", 19 | "type" : "Ec2Resource", 20 | "securityGroupIds" : [ 21 | "#{mySrcSecGroupID}", 22 | "#{myBackupSecGroupID}" 23 | ], 24 | "subnetId" : "#{mySubnetID}", 25 | "associatePublicIpAddress" : "true", 26 | "imageId" : "#{myImageID}" 27 | }, 28 | { 29 | "id" : "EC2Resource2", 30 | "terminateAfter" : "70 Minutes", 31 | "instanceType" : "#{myInstanceType}", 32 | "name" : "EC2Resource2", 33 | "type" : "Ec2Resource", 34 | "securityGroupIds" : [ 35 | "#{mySrcSecGroupID}", 36 | "#{myBackupSecGroupID}" 37 | ], 38 | "subnetId" : "#{mySubnetID}", 39 | "associatePublicIpAddress" : "true", 40 | "imageId" : "#{myImageID}" 41 | }, 42 | { 43 | "id" : "DefaultSchedule", 44 | "name" : "RunOnce", 45 | "startAt" : "FIRST_ACTIVATION_DATE_TIME", 46 | "type" : "Schedule", 47 | "occurrences" : "1", 48 | "period" : "1 Days" 49 | }, 50 | { 51 | "id" : "RestorePart1", 52 | "name" : "RestorePart1", 53 | "runsOn" : { 54 | "ref" : "EC2Resource1" 55 | }, 56 | "command" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-restore-rsync.sh\nchmod a+x efs-restore-rsync.sh\n./efs-restore-rsync.sh $1 $2 $3 $4 $5 $6 $7", 57 | "scriptArgument" : [ 58 | "#{myEfsSource}", 59 | "#{myEfsBackup}", 60 | "#{myInterval}", 61 | "#{myBackup}", 62 | "#{myEfsID}", 63 | "1", 64 | "2" 65 | ], 66 | "type" : "ShellCommandActivity", 67 | "stage" : "true" 68 | }, 69 | { 70 | "id" : "RestorePart2", 71 | "name" : "RestorePart2", 72 | "runsOn" : { 73 | "ref" : "EC2Resource2" 74 | }, 75 | "command" : "wget https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/samples/EFSBackup/efs-restore-rsync.sh\nchmod a+x efs-restore-rsync.sh\n./efs-restore-rsync.sh $1 $2 $3 $4 $5 $6 $7", 76 | "scriptArgument" : [ 77 | "#{myEfsSource}", 78 | "#{myEfsBackup}", 79 | "#{myInterval}", 80 | "#{myBackup}", 81 | "#{myEfsID}", 82 | "0", 83 | "2" 84 | ], 85 | "type" : "ShellCommandActivity", 86 | "stage" : "true" 87 | } 88 | ], 89 | "parameters" : [ 90 | { 91 | "id" : "myInstanceType", 92 | "default" : "m3.large", 93 | "description" : "Instance type for performing the restore.", 94 | "allowedValues" : [ 95 | "t1.micro", 96 | "m3.medium", 97 | "m3.large", 98 | "m3.xlarge", 99 | "m3.2xlarge", 100 | "c3.large", 101 | "c3.xlarge", 102 | "c3.2xlarge", 103 | "c3.4xlarge", 104 | "c3.8xlarge" 105 | ], 106 | "type" : 
"String" 107 | }, 108 | { 109 | "id" : "mySubnetID", 110 | "default" : "subnet-1234abcd", 111 | "description" : "VPC subnet for your restoration EC2 instance (ideally the same subnet used for the backup EFS mount point).", 112 | "type" : "String" 113 | }, 114 | { 115 | "id" : "mySrcSecGroupID", 116 | "default" : "sg-1111111b", 117 | "description" : "Security group that can connect to the Production EFS mount point.", 118 | "type" : "String" 119 | }, 120 | { 121 | "id" : "myBackupSecGroupID", 122 | "default" : "sg-9999999b", 123 | "description" : "Security group that can connect to the Backup EFS mount point.", 124 | "type" : "String" 125 | }, 126 | { 127 | "id" : "myInterval", 128 | "default" : "daily", 129 | "description" : "Interval for backups.", 130 | "allowedValues" : [ 131 | "hourly", 132 | "daily", 133 | "weekly", 134 | "monthly" 135 | ], 136 | "type" : "String" 137 | }, 138 | { 139 | "id" : "myBackup", 140 | "default" : "0", 141 | "description" : "Backup number to restore (0 = the most recent backup).", 142 | "type" : "Integer" 143 | }, 144 | { 145 | "id" : "myEfsID", 146 | "default" : "backup-fs-12345678", 147 | "description" : "Name for the directory that already contains your backups", 148 | "type" : "String" 149 | }, 150 | { 151 | "id" : "myEfsSource", 152 | "default" : "10.0.1.32:/", 153 | "description" : "Production EFS mount target IP address.", 154 | "type" : "String" 155 | }, 156 | { 157 | "id" : "myEfsBackup", 158 | "default" : "10.0.1.75:/", 159 | "description" : "Backup EFS mount target IP address.", 160 | "type" : "String" 161 | }, 162 | { 163 | "id" : "myImageID", 164 | "default" : "ami-12345678", 165 | "description" : "AMI ID for the EC2 instance.", 166 | "type" : "String" 167 | } 168 | ] 169 | } -------------------------------------------------------------------------------- /samples/EFSBackup/efs-backup-end.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Input arguments 4 | interval=$1 5 | efsid=$2 6 | 7 | echo "sudo touch /mnt/backups/$efsid/$interval.0/" 8 | sudo touch /mnt/backups/$efsid/$interval.0/ 9 | echo "$interval: completed successfully" 10 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-backup-init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Initialization of EFS backup 3 | 4 | # Input arguments 5 | source=$1 6 | destination=$2 7 | interval=$3 8 | retain=$4 9 | efsid=$5 10 | 11 | # Prepare system for rsync 12 | echo 'sudo yum -y install nfs-utils' 13 | sudo yum -y install nfs-utils 14 | if [ ! -d /backup ]; then 15 | echo 'sudo mkdir /backup' 16 | sudo mkdir /backup 17 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 18 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 19 | fi 20 | if [ ! 
-d /mnt/backups ]; then 21 | echo 'sudo mkdir /mnt/backups' 22 | sudo mkdir /mnt/backups 23 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 24 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 25 | fi 26 | 27 | # we need to decrement retain because we start counting with 0 and we need to remove the oldest backup 28 | let "retain=$retain-1" 29 | if sudo test -d /mnt/backups/$efsid/$interval.$retain; then 30 | echo "sudo rm -rf /mnt/backups/$efsid/$interval.$retain" 31 | sudo rm -rf /mnt/backups/$efsid/$interval.$retain 32 | fi 33 | 34 | # Rotate all previous backups (except the first one), up one level 35 | for x in `seq $retain -1 2`; do 36 | if sudo test -d /mnt/backups/$efsid/$interval.$[$x-1]; then 37 | echo "sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x" 38 | sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x 39 | fi 40 | done 41 | 42 | # Copy first backup with hard links, then replace first backup with new backup 43 | if sudo test -d /mnt/backups/$efsid/$interval.0 ; then 44 | echo "sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1" 45 | sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1 46 | fi 47 | if [ ! -d /mnt/backups/$efsid ]; then 48 | echo "sudo mkdir -p /mnt/backups/$efsid" 49 | sudo mkdir -p /mnt/backups/$efsid 50 | echo "sudo chmod 700 /mnt/backups/$efsid" 51 | sudo chmod 700 /mnt/backups/$efsid 52 | fi 53 | if [ ! -d /mnt/backups/efsbackup-logs ]; then 54 | echo "sudo mkdir -p /mnt/backups/efsbackup-logs" 55 | sudo mkdir -p /mnt/backups/efsbackup-logs 56 | echo "sudo chmod 700 /mnt/backups/efsbackup-logs" 57 | sudo chmod 700 /mnt/backups/efsbackup-logs 58 | fi 59 | if [ -f /tmp/efs-backup.log ]; then 60 | echo "sudo rm /tmp/efs-backup.log" 61 | sudo rm /tmp/efs-backup.log 62 | fi 63 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-backup-rsync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Input arguments 4 | source=$1 5 | destination=$2 6 | interval=$3 7 | retain=$4 8 | efsid=$5 9 | clientNum=$6 10 | numClients=$7 11 | 12 | 13 | # Prepare system for rsync 14 | echo 'sudo yum -y install nfs-utils' 15 | sudo yum -y install nfs-utils 16 | if [ ! -d /backup ]; then 17 | echo 'sudo mkdir /backup' 18 | sudo mkdir /backup 19 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 20 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 21 | fi 22 | if [ ! -d /mnt/backups ]; then 23 | echo 'sudo mkdir /mnt/backups' 24 | sudo mkdir /mnt/backups 25 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 26 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 27 | fi 28 | 29 | if [ -f /tmp/efs-backup.log ]; then 30 | echo "sudo rm /tmp/efs-backup.log" 31 | sudo rm /tmp/efs-backup.log 32 | fi 33 | 34 | #Copy all content this node is responsible for 35 | for myContent in `sudo ls -a --ignore . --ignore .. 
/backup/ | awk 'NR%'$numClients==$clientNum`; do 36 | echo "sudo rsync -s -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/$myContent /mnt/backups/$efsid/$interval.0/" 37 | sudo rsync -s -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/"$myContent" /mnt/backups/$efsid/$interval.0/ 38 | rsyncStatus=$? 39 | done 40 | 41 | if [ -f /tmp/efs-backup.log ]; then 42 | echo "sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log" 43 | sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log 44 | fi 45 | exit $rsyncStatus 46 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Example would be to run this script as follows: 3 | # Every 6 hours; retain last 4 backups 4 | # efs-backup.sh $src $dst hourly 4 efs-12345 5 | # Once a day; retain last 31 days 6 | # efs-backup.sh $src $dst daily 31 efs-12345 7 | # Once a week; retain 4 weeks of backup 8 | # efs-backup.sh $src $dst weekly 7 efs-12345 9 | # Once a month; retain 3 months of backups 10 | # efs-backup.sh $src $dst monthly 3 efs-12345 11 | # 12 | # Snapshots will look like: 13 | # $dst/$efsid/hourly.0-3; daily.0-30; weekly.0-3; monthly.0-2 14 | 15 | 16 | # Input arguments 17 | source=$1 18 | destination=$2 19 | interval=$3 20 | retain=$4 21 | efsid=$5 22 | 23 | # Prepare system for rsync 24 | #echo 'sudo yum -y update' 25 | #sudo yum -y update 26 | echo 'sudo yum -y install nfs-utils' 27 | sudo yum -y install nfs-utils 28 | echo 'sudo mkdir /backup' 29 | sudo mkdir /backup 30 | echo 'sudo mkdir /mnt/backups' 31 | sudo mkdir /mnt/backups 32 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 33 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 34 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 35 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 36 | 37 | # we need to decrement retain because we start counting with 0 and we need to remove the oldest backup 38 | let "retain=$retain-1" 39 | if sudo test -d /mnt/backups/$efsid/$interval.$retain; then 40 | echo "sudo rm -rf /mnt/backups/$efsid/$interval.$retain" 41 | sudo rm -rf /mnt/backups/$efsid/$interval.$retain 42 | fi 43 | 44 | 45 | # Rotate all previous backups (except the first one), up one level 46 | for x in `seq $retain -1 2`; do 47 | if sudo test -d /mnt/backups/$efsid/$interval.$[$x-1]; then 48 | echo "sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x" 49 | sudo mv /mnt/backups/$efsid/$interval.$[$x-1] /mnt/backups/$efsid/$interval.$x 50 | fi 51 | done 52 | 53 | # Copy first backup with hard links, then replace first backup with new backup 54 | if sudo test -d /mnt/backups/$efsid/$interval.0 ; then 55 | echo "sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1" 56 | sudo cp -al /mnt/backups/$efsid/$interval.0 /mnt/backups/$efsid/$interval.1 57 | fi 58 | if [ ! 
-d /mnt/backups/$efsid ]; then 59 | echo "sudo mkdir -p /mnt/backups/$efsid" 60 | sudo mkdir -p /mnt/backups/$efsid 61 | echo "sudo chmod 700 /mnt/backups/$efsid" 62 | sudo chmod 700 /mnt/backups/$efsid 63 | fi 64 | if [ ! -d /mnt/backups/efsbackup-logs ]; then 65 | echo "sudo mkdir -p /mnt/backups/efsbackup-logs" 66 | sudo mkdir -p /mnt/backups/efsbackup-logs 67 | echo "sudo chmod 700 /mnt/backups/efsbackup-logs" 68 | sudo chmod 700 /mnt/backups/efsbackup-logs 69 | fi 70 | echo "sudo rm /tmp/efs-backup.log" 71 | sudo rm /tmp/efs-backup.log 72 | echo "sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/ /mnt/backups/$efsid/$interval.0/" 73 | sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-backup.log /backup/ /mnt/backups/$efsid/$interval.0/ 74 | rsyncStatus=$? 75 | echo "sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-`date +%Y%m%d-%H%M`.log" 76 | sudo cp /tmp/efs-backup.log /mnt/backups/efsbackup-logs/$efsid-`date +%Y%m%d-%H%M`.log 77 | echo "sudo touch /mnt/backups/$efsid/$interval.0/" 78 | sudo touch /mnt/backups/$efsid/$interval.0/ 79 | exit $rsyncStatus 80 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-restore-rsync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Input arguments 4 | source=$1 5 | destination=$2 6 | interval=$3 7 | backupNum=$4 8 | efsid=$5 9 | clientNum=$6 10 | numClients=$7 11 | 12 | 13 | # Prepare system for rsync 14 | echo 'sudo yum -y install nfs-utils' 15 | sudo yum -y install nfs-utils 16 | 17 | if [ ! -d /backup ]; then 18 | echo 'sudo mkdir /backup' 19 | sudo mkdir /backup 20 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 21 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 22 | fi 23 | if [ ! -d /mnt/backups ]; then 24 | echo 'sudo mkdir /mnt/backups' 25 | sudo mkdir /mnt/backups 26 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 27 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 28 | fi 29 | 30 | if [ -f /tmp/efs-restore.log ]; then 31 | echo "sudo rm /tmp/efs-restore.log" 32 | sudo rm /tmp/efs-restore.log 33 | fi 34 | 35 | #Copy all content this node is responsible for 36 | for myContent in `sudo ls -a --ignore . --ignore .. /mnt/backups/$efsid/$interval.$backupNum | awk 'NR%'$numClients==$clientNum`; do 37 | echo "sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum /backup/" 38 | sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum/$myContent /backup/ 39 | rsyncStatus=$? 
40 | done 41 | 42 | if [ -f /tmp/efs-restore.log ]; then 43 | echo "sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log" 44 | sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-$clientNum.$numClients-`date +%Y%m%d-%H%M`.log 45 | fi 46 | exit $rsyncStatus 47 | -------------------------------------------------------------------------------- /samples/EFSBackup/efs-restore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Input arguments 4 | source=$1 5 | destination=$2 6 | interval=$3 7 | backupNum=$4 8 | efsid=$5 9 | 10 | # Prepare system for rsync 11 | echo 'sudo yum -y install nfs-utils' 12 | sudo yum -y install nfs-utils 13 | echo 'sudo mkdir /backup' 14 | sudo mkdir /backup 15 | echo 'sudo mkdir /mnt/backups' 16 | sudo mkdir /mnt/backups 17 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup" 18 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $source /backup 19 | echo "sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups" 20 | sudo mount -t nfs -o nfsvers=4.1 -o rsize=1048576 -o wsize=1048576 -o timeo=600 -o retrans=2 -o hard $destination /mnt/backups 21 | 22 | if ! sudo test -d /mnt/backups/$efsid/$interval.$backupNum/; then 23 | echo "EFS Backup $efsid/$interval.$backupNum does not exist!" 24 | exit 1 25 | fi 26 | 27 | echo "sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum/ /backup/" 28 | sudo rsync -ah --stats --delete --numeric-ids --log-file=/tmp/efs-restore.log /mnt/backups/$efsid/$interval.$backupNum/ /backup/ 29 | rsyncStatus=$? 30 | echo "sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-`date +%Y%m%d-%H%M`.log" 31 | sudo cp /tmp/efs-restore.log /mnt/backups/efsbackup-logs/$efsid-$interval.$backupNum-restore-`date +%Y%m%d-%H%M`.log 32 | exit $rsyncStatus 33 | -------------------------------------------------------------------------------- /samples/ExampleTemplate/README.md: -------------------------------------------------------------------------------- 1 | # {{Example Name}} 2 | 3 | {{Description of activities performed in the example}} 4 | 5 | ## Parameters 6 | 7 | Parameter | Required | Description 8 | ----------|----------|------------ 9 | {{Parameter Name}} | {{yes/no}} | {{Description}} {{Example or Default}} 10 | 11 | ## Setup (Optional) 12 | 13 | You can use the setup script in the sample directory to create {{resources}} to use in this example. 14 | You can skip this step if you have {{resources}} that you want to use. The script will take a minute 15 | to complete, and when it's finished it will print the resource identifier of the 16 | {{resources}} that it created. 17 | 18 | ```sh 19 | $> python setup.py 20 | ``` 21 | 22 | If the script fails with an ImportError, you may need to [set up your virtualenv](https://github.com/awslabs/data-pipeline-samples#setup). 23 | 24 | ## Running this sample 25 | 26 | Create a new pipeline. Throughout this section we assume that the {{Example Directory}} sample directory is 27 | your current working directory.
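Before creating the pipeline, you may want to confirm that the AWS CLI is configured for the account and region you intend to use. The commands below are only a minimal sanity check (they assume the AWS CLI is already installed and configured):

```sh
$> aws sts get-caller-identity   # shows the account and IAM identity the CLI will use
$> aws configure get region      # shows the default region for CLI commands
```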
28 | 29 | ```sh 30 | $> aws datapipeline create-pipeline --name {{example_name}} --unique-id {{example_name}} 31 | # { 32 | # "pipelineId": "df-03971252U4AVY60545T7" 33 | # } 34 | ``` 35 | 36 | Upload the [pipeline definition](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html). Use the `pipelineId` that was returned by the `create-pipeline` 37 | command. Specify the name of an S3 bucket where the output from pipline activites will be stored. 38 | This will either be the bucket name that was printed by the setup script or another bucket that 39 | you've created. You can also specify any optional parameters for this example here. 40 | 41 | 42 | ```sh 43 | $> aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file://TeraSortHadoopBenchmark.json {{--parameter-values values}} 44 | # { 45 | # "errored": false, 46 | # "validationWarnings": [], 47 | # "validationErrors": [] 48 | # } 49 | ``` 50 | 51 | Activate the pipeline. Use the `pipelineId` that was returned by the `create-pipeline` command. 52 | 53 | ```sh 54 | $> aws datapipeline activate-pipeline --pipeline-id 55 | ``` 56 | 57 | Optionally, check the status of your running pipeline. Use the `pipelineId` that was returned by the 58 | `create-pipeline` command. When the pipeline has completed, the Status Ended column in the output 59 | from this command will show FINISHED for all pipeine nodes. 60 | 61 | ```sh 62 | 63 | >$ aws datapipeline list-runs --pipeline-id 64 | # {{example output}} 65 | 66 | ``` 67 | 68 | {{what happens when the pipeline is finished}} 69 | 70 | ## Next steps 71 | 72 | {{things to try next}} 73 | 74 | Once the pipeline is completed, you can delete it with the following command. If you try to run the 75 | sample again without deleting, you may receive errors or unexpected behavior. 76 | 77 | ```sh 78 | $> aws datapipeline delete-pipeline --pipeline-id 79 | ``` 80 | 81 | The resources used by this example will incur normal charges. If you provisioned resources using the 82 | setup script, you can free them by running the following command in the sample directory. 83 | 84 | ```sh 85 | $> python setup.py --teardown 86 | ``` 87 | 88 | ## Disclaimer 89 | 90 | The samples in this repository are meant to help users get started with Data Pipeline. They may not 91 | be sufficient for production environments. Users should carefully inspect samples before running 92 | them. 93 | 94 | *Use at your own risk.* 95 | 96 | Licensed under the MIT-0 License. 97 | -------------------------------------------------------------------------------- /samples/HadoopTerasort/process-jhist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Number of arguments: $#" 4 | echo "Arguments: $@" 5 | S3_Target=$1 6 | echo "S3 Target output path: $S3_Target" 7 | 8 | # -------------------------------------------------------------- 9 | # TeraSort Benchmark JHIST Publish Script 10 | # This script is a reference script. 11 | # TeraSortHadoopBenchmark pipeline uses the script hosted at: s3://datapipeline-us-east-1/sample-scripts/HadoopTeraSort/process-jhist.sh 12 | # -------------------------------------------------------------- 13 | 14 | # -------------------------------------------------------------- 15 | # Any code, applications, scripts, templates, proofs of concept, documentation and other items provided by AWS under this SOW are AWS Content, as defined in the Agreement, and are provided for illustration purposes only. 
All such AWS Content is provided solely at the option of AWS, and is subject to the terms of the Addendum and the Agreement. Customer is solely responsible for using, deploying, testing, and supporting any code and applications provided by AWS under the current SOW. 16 | # -------------------------------------------------------------- 17 | 18 | # -------------------------------------------------------------- 19 | # CHANGE LOG: 20 | # -------------------------------------------------------------- 21 | # 2015-04-28 RG v0.1 - Initial script 22 | # 2015-04-28 RG v0.2 - Added TeraSort & TeraValidate JHIST Processing Activities 23 | # 2015-09-01 AR v0.3 - Output to S3 target path 24 | # 2015-11-19 JT v0.4 - Update file name parsing and use mapred command 25 | # -------------------------------------------------------------- 26 | 27 | # -------------------------------------------------------------- 28 | # Define Variables 29 | # -------------------------------------------------------------- 30 | 31 | 32 | 33 | 34 | # -------------------------------------------------------------- 35 | # Process JHIST File 36 | # -------------------------------------------------------------- 37 | 38 | path_to_jhist() { 39 | # perl incantation to extract the path from the ls command 40 | # via: http://stackoverflow.com/questions/21569172/how-to-list-only-file-name-in-hdfs 41 | hdfs dfs -ls -R / | grep $1 | perl -wlne 'print +(split " ",$_,8)[7]' 42 | } 43 | 44 | TeraGen=$(path_to_jhist TeraGen) 45 | mapred job -history all $TeraGen > TeraGen-results.txt 46 | 47 | TeraSort=$(path_to_jhist TeraSort) 48 | mapred job -history all $TeraSort > TeraSort-results.txt 49 | 50 | TeraValidate=$(path_to_jhist TeraValidate) 51 | mapred job -history all ${TeraValidate} > TeraValidate-results.txt 52 | 53 | # -------------------------------------------------------------- 54 | # Copy to S3 55 | # -------------------------------------------------------------- 56 | 57 | gensecondline=`sed -n '2{p;q}' TeraGen-results.txt`; 58 | genjob=${gensecondline:12} 59 | date=$(date +"%m-%d-%y") 60 | aws s3 cp TeraGen-results.txt $S3_Target/$date-$genjob/results/ 61 | aws s3 cp TeraSort-results.txt $S3_Target/$date-$genjob/results/ 62 | aws s3 cp TeraValidate-results.txt $S3_Target/$date-$genjob/results/ 63 | aws s3 cp /home/hadoop/conf $S3_Target/$date-$genjob/conf/ --recursive 64 | 65 | exit 0 66 | -------------------------------------------------------------------------------- /samples/HadoopTerasort/setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("../../setup") 3 | 4 | from stacker import Stacker 5 | 6 | s = Stacker( 7 | "dpl-samples-hadoop-terasort", 8 | { 9 | "Resources": { 10 | "S3Bucket": { 11 | "Type": "AWS::S3::Bucket", 12 | "DeletionPolicy": "Delete" 13 | } 14 | } 15 | }) 16 | 17 | s.run(sys.argv) 18 | -------------------------------------------------------------------------------- /samples/InvokeLambda/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline InvokeLambda Sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a Shell Command Activity pipeline that invokes AWS Lambda function. 6 | 7 | ## Prerequisites 8 | 9 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 
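You will also need an existing Lambda function for the pipeline to invoke. The pipeline's ShellCommandActivity simply runs `aws lambda invoke` against the function name you supply, so one way to confirm your setup ahead of time is to invoke the function once from your own shell. This is just a sanity check; replace the placeholder function name and region with your own values:

```sh
$> aws lambda get-function --function-name <your-function-name> --region us-east-1
$> aws lambda invoke --function-name <your-function-name> --region us-east-1 outfile.txt
```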
10 | 11 | ## Run this sample pipeline using the AWS CLI 12 | 13 | ```sh 14 | $> aws datapipeline create-pipeline --name invoke_lambda_pipeline --unique-id invoke_lambda_pipeline 15 | ``` 16 | 17 | You receive a pipelineId like this. 18 | ```sh 19 | # ----------------------------------------- 20 | # | CreatePipeline | 21 | # +-------------+--------------------------+ 22 | # | pipelineId | | 23 | # +-------------+--------------------------+ 24 | ``` 25 | 26 | ```sh 27 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://invokelambda.json --parameter-values myLambdaFunction= myS3LogsPath=s3:///path --pipeline-id 28 | ``` 29 | 30 | You receive a validation messages like this 31 | ```sh 32 | # ----------------------- 33 | # |PutPipelineDefinition| 34 | # +-----------+---------+ 35 | # | errored | False | 36 | # +-----------+---------+ 37 | ``` 38 | 39 | Now activate the pipeline 40 | ```sh 41 | $> aws datapipeline activate-pipeline --pipeline-id 42 | ``` 43 | 44 | Check the status of your pipeline 45 | ``` 46 | >$ aws datapipeline list-runs --pipeline-id 47 | ``` 48 | 49 | You will receive status information on the pipeline. 50 | ```sh 51 | Name Scheduled Start Status 52 | ID Started Ended 53 | --------------------------------------------------------------------------------------------------- 54 | 1. Invoke_Lambda_Activity 2016-03-23T18:40:31 WAITING_FOR_RUNNER 55 | @Invoke_Lambda_Activity_2016-03-23T18:40:31 2016-03-23T18:40:35 56 | 57 | 2. New_EC2Instance 2016-03-23T18:40:31 CREATING 58 | @New_EC2Instance_2016-03-23T18:40:31 2016-03-23T18:40:36 59 | 60 | ``` 61 | 62 | 63 | ## Disclaimer 64 | 65 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 66 | 67 | Use at your own risk. 68 | 69 | Licensed under the MIT-0 License. 70 | -------------------------------------------------------------------------------- /samples/InvokeLambda/invokelambda.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "scheduleType": "ONDEMAND" 12 | }, 13 | { 14 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 15 | 16 | "type": "Ec2Resource", 17 | "id": "New_EC2Instance", 18 | "name": "New_EC2Instance", 19 | "terminateAfter": "1 Hour", 20 | "imageId": "#{myImageId}", 21 | "region": "#{myRegion}", 22 | "instanceType": "#{myInstanceType}", 23 | "resourceRole": "DataPipelineDefaultResourceRole", 24 | "role": "DataPipelineDefaultRole" 25 | }, 26 | { 27 | "myComment": "This object is a ShellCommandActivity. It is used to specify the command linux shell command that will be invoked. In this case, it invokes Lambda Function.", 28 | 29 | "id": "Invoke_Lambda_Activity", 30 | "name": "Invoke_Lambda_Activity", 31 | "type": "ShellCommandActivity", 32 | "runsOn": { 33 | "ref": "New_EC2Instance" 34 | }, 35 | "command": "aws lambda --region #{myRegion} invoke --function-name #{myLambdaFunction} outfile.txt" 36 | } 37 | ], 38 | "parameters": [ 39 | { 40 | "myComment": "This Parameter specifies the S3 logging path for the pipeline. 
It is used by the 'Default' object to set the 'pipelineLogUri' value. Using Parameters helps users avoid hard coding variables in pipeline definitions. Users can instead supply these parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline-definition'.", 41 | 42 | "id" : "myS3LogsPath", 43 | "type" : "AWS::S3::ObjectKey", 44 | "description" : "S3 path for pipeline logs." 45 | }, 46 | { 47 | "myComment": "This Parameter specifies the Lambda function name.", 48 | 49 | "id" : "myLambdaFunction", 50 | "type" : "String", 51 | "description" : "Lambda Function name" 52 | }, 53 | { 54 | "myComment": "This Parameter specifies region", 55 | 56 | "id" : "myRegion", 57 | "type" : "String", 58 | "default" : "us-east-1", 59 | "description" : "Region" 60 | }, 61 | { 62 | "myComment": "This Parameter specifies image id", 63 | 64 | "id" : "myImageId", 65 | "type" : "String", 66 | "default" : "ami-8fcee4e5", 67 | "description" : "Image Id" 68 | }, 69 | { 70 | "myComment": "This Parameter specifies instance type", 71 | 72 | "id" : "myInstanceType", 73 | "type" : "String", 74 | "default" : "m3.medium", 75 | "description" : "Instance Type" 76 | } 77 | ] 78 | } 79 | -------------------------------------------------------------------------------- /samples/LoadTsvFilesInS3ToRedshift/LoadTsvFilesInS3ToRedshift.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "failureAndRerunMode": "CASCADE", 5 | "schedule": { 6 | "ref": "DefaultSchedule" 7 | }, 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "s3://insert-here-log-location-for-DPL", 11 | "scheduleType": "cron", 12 | "name": "Default", 13 | "id": "Default" 14 | }, 15 | { 16 | "occurrences": "1", 17 | "period": "1 Day", 18 | "name": "RunOnce", 19 | "id": "DefaultSchedule", 20 | "type": "Schedule", 21 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 22 | }, 23 | { 24 | "output": { 25 | "ref": "DestRedshiftTable" 26 | }, 27 | "input": { 28 | "ref": "Input_S3_Tsv_Files" 29 | }, 30 | "name": "ExportS3ToRedshift", 31 | "runsOn": { 32 | "ref": "Ec2Instance" 33 | }, 34 | "id": "RedshiftLoadActivity", 35 | "type": "RedshiftCopyActivity", 36 | "insertMode": "OVERWRITE_EXISTING" 37 | }, 38 | { 39 | "connectionString": "#{myRedshiftJdbcConnectStr}", 40 | "databaseName": "#{myRedshiftDbName}", 41 | "*password": "#{myRedshiftPassword}", 42 | "name": "RedshiftCluster", 43 | "id": "RedshiftCluster", 44 | "type": "RedshiftDatabase", 45 | "username": "#{myRedshiftUsername}" 46 | }, 47 | { 48 | "filePath": "#{myInputTsvFilesS3Loc}", 49 | "name": "Input_S3_Tsv_Files", 50 | "id": "Input_S3_Tsv_Files", 51 | "dataFormat": { 52 | "ref": "DataFormatId_tsv" 53 | }, 54 | "type": "S3DataNode" 55 | }, 56 | { 57 | "securityGroupIds": "#{myRedshiftSecurityGrpIds}", 58 | "instanceType": "m3.xlarge", 59 | "name": "Ec2Instance", 60 | "associatePublicIpAddress": "true", 61 | "id": "Ec2Instance", 62 | "type": "Ec2Resource", 63 | "region": "us-east-1", 64 | "terminateAfter": "10 Hours", 65 | "availabilityZone": "us-east-1a" 66 | }, 67 | { 68 | "database": { 69 | "ref": "RedshiftCluster" 70 | }, 71 | "name": "DestRedshiftTable", 72 | "id": "DestRedshiftTable", 73 | "schemaName": "schemaNameInRedshift", 74 | "type": "RedshiftDataNode", 75 | "tableName": "DestRedshiftTableName" 76 | }, 77 | { 78 | "name": "S3TRDataFormat", 79 | "id": "DataFormatId_tsv", 80 | "type": "TSV" 81 | } 82 | ], 83 | "parameters": [ 84 | { 85 | 
"description": "Redshift password", 86 | "id": "*myRedshiftPassword", 87 | "type": "String" 88 | }, 89 | { 90 | "description": "Redshift database name", 91 | "id": "myRedshiftDbName", 92 | "type": "String" 93 | }, 94 | { 95 | "watermark": "security group id. E.g.,", 96 | "helpText": "The names of one or more security groups that are assigned to the Redshift cluster.", 97 | "description": "Security group Id(s)", 98 | "isArray": "true", 99 | "id": "myRedshiftSecurityGrpIds", 100 | "type": "String" 101 | }, 102 | { 103 | "description": "Redshift username", 104 | "id": "myRedshiftUsername", 105 | "type": "String" 106 | }, 107 | { 108 | "allowedValues": "OVERWRITE_EXISTING", 109 | "default": "OVERWRITE_EXISTING", 110 | "helpLink": "https://docs.aws.amazon.com/console/datapipeline/redshiftcopyactivity", 111 | "helpText": "Determines how to handle pre-existing data in the target table that overlaps with rows in the data to be loaded.", 112 | "description": "Table insert mode", 113 | "id": "myInsertMode", 114 | "type": "String" 115 | }, 116 | { 117 | "helpText": "The name of an existing table or a new table that will be created based on the create table SQL query parameter below.", 118 | "description": "Redshift table name", 119 | "id": "myRedshiftTableName", 120 | "type": "String" 121 | }, 122 | { 123 | "helpText": "The S3 folder where one or more tsv input files are located.", 124 | "description": "Input S3 folder", 125 | "id": "myInputTsvFilesS3Loc", 126 | "type": "AWS::S3::ObjectKey", 127 | "watermark" : "s3://tsv-files-insert-loc/2015-10-27-01-00-29" 128 | }, 129 | { 130 | "watermark": "jdbc:postgresql://endpoint:port/database?tcpKeepAlive=true", 131 | "description": "Redshift JDBC connection string", 132 | "id": "myRedshiftJdbcConnectStr", 133 | "type": "String" 134 | } 135 | ] 136 | } 137 | -------------------------------------------------------------------------------- /samples/LoadTsvFilesInS3ToRedshift/README.md: -------------------------------------------------------------------------------- 1 | #Data Pipeline Load Tab Separated Files in S3 to Redshift 2 | 3 | ##About the sample 4 | This pipeline definition when imported would instruct Redshift to load TSV files under the specified S3 Path into a specified Redshift Table. Table insert mode is OVERWRITE_EXISTING. 5 | 6 | ##Running this sample 7 | The pipeline requires the following user input point: 8 | 9 | 1. The S3 folder where the input TSV files are located. 10 | 2. Redshift connection info along with the target table name. 11 | 3. Redshift Cluster security group id(s). 12 | 13 | 14 | ## Prerequisites 15 | 16 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 17 | TSV files under a S3 folder path is the input for this pipeline. Redshift Cluster and Table must already exist. 18 | 19 | 20 | 21 | ## Run this sample pipeline using the AWS CLI 22 | 23 | ```sh 24 | $> aws datapipeline create-pipeline --name copy_tsv_to_redshift_pipeline --unique-id copy_tsv_to_redshift_pipeline 25 | ``` 26 | 27 | You receive a pipelineId like this. 
28 | ```sh 29 | # ----------------------------------------- 30 | # | CreatePipeline | 31 | # +-------------+--------------------------+ 32 | # | pipelineId | | 33 | # +-------------+--------------------------+ 34 | ``` 35 | 36 | ```sh 37 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json --parameter-values 38 | myInputTsvFilesS3Loc= myRedshiftJdbcConnectStr= myRedshiftUsername= myRedshiftPassword= 39 | myRedshiftTableName= myRedshiftSecurityGrpIds= --pipeline-id 40 | ``` 41 | 42 | You receive a validation messages like this 43 | ```sh 44 | # ----------------------- 45 | # |PutPipelineDefinition| 46 | # +-----------+---------+ 47 | # | errored | False | 48 | # +-----------+---------+ 49 | ``` 50 | 51 | Now activate the pipeline 52 | ```sh 53 | $> aws datapipeline activate-pipeline --pipeline-id 54 | ``` 55 | 56 | Check the status of your pipeline 57 | ```sh 58 | >$ aws datapipeline list-runs --pipeline-id 59 | ``` 60 | 61 | You will receive status information on the pipeline. 62 | 63 | 64 | ## Disclaimer 65 | 66 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 67 | 68 | Use at your own risk. 69 | 70 | Licensed under the MIT-0 License. 71 | -------------------------------------------------------------------------------- /samples/OnDemandWithLamdaFunctions/lambda_function.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import json 4 | import urllib 5 | import boto3 6 | 7 | print('Loading function') 8 | 9 | client = boto3.client('datapipeline') 10 | pipeline_id = 'df-123456789' 11 | 12 | def lambda_handler(event, context): 13 | try: 14 | response = client.activate_pipeline(pipelineId=pipeline_id) 15 | return response 16 | except Exception as e: 17 | print(e) 18 | raise e 19 | -------------------------------------------------------------------------------- /samples/OnDemandWithLamdaFunctions/ondemand.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects 5 | in the pipeline", 6 | 7 | "id": "Default", 8 | "failureAndRerunMode":"cascade", 9 | "resourceRole": "DataPipelineDefaultResourceRole", 10 | "role": "DataPipelineDefaultRole", 11 | "pipelineLogUri": "#{myS3LogsPath}", 12 | "scheduleType": "ondemand", 13 | }, 14 | { 15 | "myComment": "This object is used to create the Amazon EC2 Instance that activities 16 | in the pipeline will be executed on.", 17 | 18 | "id": "A_Fresh_NewEC2Instance", 19 | "type": "Ec2Resource", 20 | "terminateAfter": "1 Hour" 21 | }, 22 | { 23 | "myComment": "This object is a ShellCommandActivity. It is used to specify the linux 24 | shell command that will be invoked. In this case it is simply running the 'echo' command, 25 | but it can be used to run any command that is accessible on in the commandline shell of the 26 | Instance that runs on.", 27 | 28 | "id": "ShellCommandActivity_HelloWorld", 29 | "runsOn": { 30 | "ref": "A_Fresh_NewEC2Instance" 31 | }, 32 | "type": "ShellCommandActivity", 33 | "command": "echo 'Hello World!'" 34 | } 35 | ], 36 | "parameters": [ 37 | { 38 | "myComment": "Pipeline Parameters are placeholders for variables that a user can specify 39 | when uploading or activating the pipeline. 
In this example, we create a Parameter 40 | called 'myS3LogsPath' which is used to provide an S3 location for output logs. It is 41 | referenced above in the 'Default' object to set the 'pipelineLogUri' value. Parameters 42 | help users avoid hard coding variables in pipeline definitions. Users can supply these 43 | parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline 44 | activate-pipeline'.", 45 | 46 | "id" : "myS3LogsPath", 47 | "type" : "AWS::S3::ObjectKey", 48 | "description" : "S3 path for pipeline logs." 49 | } 50 | ] 51 | } -------------------------------------------------------------------------------- /samples/OnDemandWithLamdaFunctions/readme.md: -------------------------------------------------------------------------------- 1 | #### This sample shows how to create a Lambda function that responds to S3 create object events on an S3 bucket and/or a CloudWatch Scheduled Event. 2 | 3 | The following Python code defines an AWS Lambda function to run an on-demand pipeline. This code is in a file called lambda_function.py. You simply need to set the ``pipeline_id`` variable to the id of your on-demand pipeline. 4 | 5 | ```python 6 | from __future__ import print_function 7 | 8 | import json 9 | import urllib 10 | import boto3 11 | 12 | print('Loading function') 13 | 14 | client = boto3.client('datapipeline') 15 | pipeline_id = 'df-123456789' 16 | 17 | def lambda_handler(event, context): 18 | try: 19 | response = client.activate_pipeline(pipelineId=pipeline_id) 20 | return response 21 | except Exception as e: 22 | print(e) 23 | raise e 24 | ``` 25 | ### Step 1: Create the on-demand pipeline 26 | *Make sure the pipeline is created in a region that supports Lambda.* 27 | 28 | Create the pipeline: 29 | 30 | ```sh 31 | $> aws datapipeline create-pipeline --name on_demand_lamda --unique-id on_demand_lamda 32 | ``` 33 | 34 | Upload the pipeline definition: 35 | 36 | ```sh 37 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://ondemand.json \ 38 | --parameter-values myS3LogsPath= --pipeline-id 39 | ``` 40 | 41 | Activate the pipeline to make sure it runs successfully: 42 | 43 | ```sh 44 | $> aws datapipeline activate-pipeline --pipeline-id 45 | ``` 46 | 47 | Check the status of your pipeline: 48 | ``` 49 | >$ aws datapipeline list-runs --pipeline-id 50 | ``` 51 | 52 | ### Step 2: Create the Lambda function 53 | 54 | 55 | ```sh 56 | >$ aws lambda create-function --function-name --runtime python2.7 \ 57 | --role --handler lambda_function.lambda_handler \ 58 | --zip-file file:///zip-with-lambda-fn-code.zip --publish --timeout 10 59 | ``` 60 | 61 | See this link for reference on the Lambda create-function command: 62 | http://docs.aws.amazon.com/cli/latest/reference/lambda/create-function.html 63 | 64 | ### Step 3: Set up an event source for the Lambda function 65 | 66 | ##### Set up an S3 bucket to call the Lambda function when objects are created 67 | 68 | Create the S3 bucket: 69 | 70 | ```sh 71 | $> aws s3 mb 72 | ``` 73 | 74 | Run the following Lambda add-permission command to grant the Amazon S3 service principal permission to perform the lambda:InvokeFunction action: 75 | 76 | ```sh 77 | $> aws lambda add-permission --function-name \ 78 | --region --statement-id --action "lambda:InvokeFunction" \ 79 | --principal s3.amazonaws.com --source-arn \ 80 | --source-account --profile adminuser 81 | ``` 82 | 83 | See this link for reference on the lambda add-permission command: 84 | 
http://docs.aws.amazon.com/cli/latest/reference/lambda/add-permission.html 85 | 86 | Add the notification on S3 and have it call the Lamda function: 87 | 88 | \*Make sure your notification configuration contains ``s3:ObjectCreated:*`` events 89 | 90 | ```sh 91 | $> aws s3api put-bucket-notification-configuration --bucket --notification-configuration 92 | ``` 93 | 94 | See this link for reference on the s3api put-bucket-notification-configuration command: 95 | http://docs.aws.amazon.com/cli/latest/reference/s3api/put-bucket-notification-configuration.html 96 | 97 | Upload a file to the S3 bucket and make validate the lamda function activated your pipeline: 98 | 99 | ```sh 100 | $> aws s3 cp 101 | $> aws datapipeline list-runs --pipeline-id 102 | ``` 103 | 104 | ##### OR Add a CRON schedule using Cloudwatch Scheduled Events 105 | 106 | This is only possible in the Lamda console. Instructions here: http://docs.aws.amazon.com/lambda/latest/dg/with-scheduled-events.html 107 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/RDStoRedshiftSqoop.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline.", 5 | 6 | "id": "Default", 7 | "name": "Default", 8 | "failureAndRerunMode": "CASCADE", 9 | "schedule": { 10 | "ref": "DefaultSchedule" 11 | }, 12 | "resourceRole": "DataPipelineDefaultResourceRole", 13 | "role": "DataPipelineDefaultRole", 14 | "scheduleType": "cron", 15 | "pipelineLogUri": "#{myS3LogsPath}" 16 | }, 17 | { 18 | "myComment": "This object is used to run the Sqoop activity that extracts data from RDS.", 19 | 20 | "name": "Sqoop", 21 | "id": "ActivityId_wQhxe", 22 | "runsOn": { 23 | "ref": "ResourceId_z9RNH" 24 | }, 25 | "type": "ShellCommandActivity", 26 | "command": "/usr/bin/sqoop import --connect jdbc:mysql://#{myRdsEndpoint}/millionsongs --table songs --target-dir #{myS3StagingPath} --username dplcustomer --password Dplcustomer1" 27 | }, 28 | { 29 | "myComment": "This object is used to specify the copy activity for moving data from S3 to Redshift.", 30 | 31 | "output": { 32 | "ref": "DataNodeId_7EqZ7" 33 | }, 34 | "input": { 35 | "ref": "DataNodeId_ImmS9" 36 | }, 37 | "dependsOn": { 38 | "ref": "ActivityId_wQhxe" 39 | }, 40 | "name": "CopyToRedshift", 41 | "id": "ActivityId_6OGtu", 42 | "runsOn": { 43 | "ref": "ResourceId_z9RNH" 44 | }, 45 | "type": "RedshiftCopyActivity", 46 | "insertMode": "TRUNCATE" 47 | }, 48 | { 49 | "myComment": "This object is used to control the task schedule.", 50 | 51 | "occurrences": "1", 52 | "period": "1 Day", 53 | "name": "RunOnce", 54 | "id": "DefaultSchedule", 55 | "type": "Schedule", 56 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 57 | }, 58 | { 59 | "myComment": "This object provides connection information for the Redshift cluster.", 60 | 61 | "connectionString": "jdbc:postgresql://#{myRedshiftEndpoint}:5439/dev", 62 | "*password": "Dplcustomer1", 63 | "name": "DefaultRedshiftDatabase1", 64 | "id": "RedshiftDatabaseId_S34X5", 65 | "type": "RedshiftDatabase", 66 | "username": "dplcustomer" 67 | }, 68 | { 69 | "myComment": "This object is used to provide information for the EMR cluster bootstrap", 70 | 71 | "bootstrapAction": "s3://data-pipeline-samples/sqoop-activity/install_sqoop_ba.sh", 72 | "name": "HadoopCluster", 73 | "id": "ResourceId_z9RNH", 74 | "amiVersion": "3.8.0", 75 | "type": "EmrCluster", 76 | "terminateAfter": "1 Hour" 77 | }, 
78 | { 79 | "myComment": "This object provides information on the S3 staging data.", 80 | 81 | "directoryPath": "#{myS3StagingPath}", 82 | "name": "S3Input", 83 | "id": "DataNodeId_ImmS9", 84 | "type": "S3DataNode" 85 | }, 86 | { 87 | "myComment": "This object contains information about the Redshift database.", 88 | 89 | "createTableSql": "create table IF NOT EXISTS songs (track_id varchar(2048) not null distkey sortkey, title varchar(2048), song_id varchar(2048), release_name varchar(2048), artist_id varchar(2048), artist_mbid varchar(2048), artist_name varchar(2048), duration float, artist_familiarity float, artist_hotness float, year int);", 90 | "database": { 91 | "ref": "RedshiftDatabaseId_S34X5" 92 | }, 93 | "primaryKeys": "track_id", 94 | "name": "Redshift", 95 | "id": "DataNodeId_7EqZ7", 96 | "type": "RedshiftDataNode", 97 | "tableName": "songs" 98 | } 99 | ], 100 | "parameters": [] 101 | } 102 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/setup/Setup.py: -------------------------------------------------------------------------------- 1 | from RdsToRedshiftSqoopSample import RDStoRedshiftSqoopSample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | import sys 6 | 7 | 8 | if __name__ == '__main__': 9 | check_working_directory() 10 | parser = argparse.ArgumentParser(description='Setup for RDS to Redshift Sqoop pipeline sample') 11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path") 12 | args = parser.parse_args() 13 | s3_bucket_path = args.s3_bucket_path 14 | 15 | sample = RDStoRedshiftSqoopSample() 16 | 17 | if s3_bucket_path is None: 18 | sample.create_s3_bucket() 19 | elif not sample.validate_s3_bucket_path(s3_bucket_path): 20 | sys.exit(0) 21 | 22 | sample.create_rds_instance() 23 | sample.create_redshift_cluster() 24 | sample.run_setup_datapipeline() 25 | sample.print_setup_results() 26 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/setup/SetupPipelineDefinition.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class SetupPipelineDefinitionHelper(object): 5 | 6 | def __init__(self): 7 | with open("setup.json", "r") as setup: 8 | pipeline_string = setup.read().replace('\n', '') 9 | self.pipeline_definition = json.loads(pipeline_string) 10 | 11 | def get_setup_pipeline_objects(self): 12 | return self.pipeline_definition['objects'] 13 | 14 | def get_setup_pipeline_parameters(self): 15 | return self.pipeline_definition['parameters'] 16 | 17 | def get_setup_pipeline_parameter_values(self): 18 | return self.pipeline_definition['parameterValues'] 19 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/setup/Teardown.py: -------------------------------------------------------------------------------- 1 | from RdsToRedshiftSqoopSample import RDStoRedshiftSqoopSample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | 6 | 7 | if __name__ == '__main__': 8 | check_working_directory() 9 | 10 | parser = argparse.ArgumentParser(description='Teardown for RDS to Redshift Sqoop pipeline sample') 11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path") 12 | parser.add_argument('--rds-instance-id', action="store", dest="rds_instance_id") 13 | parser.add_argument('--redshift-cluster-id', action="store", dest="redshift_cluster_id") 14 | args = 
parser.parse_args() 15 | 16 | sample = RDStoRedshiftSqoopSample() 17 | 18 | if args.rds_instance_id is not None: 19 | sample.destroy_rds(args.rds_instance_id) 20 | 21 | if args.redshift_cluster_id is not None: 22 | sample.destroy_redshift(args.redshift_cluster_id) 23 | 24 | if args.s3_bucket_path is not None: 25 | sample.destroy_s3_bucket(args.s3_bucket_path) 26 | -------------------------------------------------------------------------------- /samples/RDStoRedshiftSqoop/setup/Utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def check_working_directory(): 6 | current_folder_path, current_folder_name = os.path.split(os.getcwd()) 7 | if current_folder_name == 'RDStoRedshiftSqoop': 8 | os.chdir('setup') 9 | elif current_folder_name != 'setup': 10 | print 'ERROR: please run the setup script from data-pipeline-samples/samples/RDStoRedshiftSqoop/setup' 11 | sys.exit(0) 12 | -------------------------------------------------------------------------------- /samples/RDStoS3/RDStoS3Pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "scheduleType": "cron", 12 | "schedule": { 13 | "ref": "DefaultSchedule" 14 | } 15 | }, 16 | { 17 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it. For this example, we are using it to specify that our pipeline will run immediately upon activation. Also, we are using the 'occurrences' option specify that the pipeline should only be run once. You can have multiple schedules defined in a pipeline.", 18 | 19 | "type": "Schedule", 20 | "id": "DefaultSchedule", 21 | "occurrences": "1", 22 | "period": "1 Day", 23 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 24 | }, 25 | { 26 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 27 | "name": "Ec2Instance", 28 | "type": "Ec2Resource", 29 | "id": "Ec2Instance", 30 | "instanceType": "t1.micro", 31 | "securityGroups": "#{myEc2RdsSecurityGrps}", 32 | "terminateAfter": "2 Hours", 33 | "resourceRole": "DataPipelineDefaultResourceRole", 34 | "role": "DataPipelineDefaultRole" 35 | }, 36 | { 37 | "myComment": "This object is a CopyActivity. It is used to define the work that will be done to copy the data from the database to S3.", 38 | "name": "RDStoS3CopyActivity", 39 | "type": "CopyActivity", 40 | "id": "RDStoS3CopyActivity", 41 | "output": { 42 | "ref": "S3OutputLocation" 43 | }, 44 | "input": { 45 | "ref": "SourceRDSTable" 46 | }, 47 | "runsOn": { 48 | "ref": "Ec2Instance" 49 | } 50 | }, 51 | { 52 | "myComment": "This object is a mysql datanode. 
It is used to represent the database which is the input datanode.", 53 | "name": "SourceRDSTable", 54 | "type": "MySqlDataNode", 55 | "id": "SourceRDSTable", 56 | "connectionString": "#{myRDSConnectStr}", 57 | "*password": "#{*myRDSPassword}", 58 | "table": "#{myRDSTableName}", 59 | "selectQuery": "select * from #{table}", 60 | "username": "#{myRDSUsername}" 61 | }, 62 | { 63 | "myComment": "This object is a S3 datanode. It is used to represent the S3 directory where the data will be stored.", 64 | "name": "S3OutputLocation", 65 | "type": "S3DataNode", 66 | "id": "S3OutputLocation", 67 | "directoryPath": "#{myOutputS3Path}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}" 68 | } 69 | ], 70 | "parameters": [ 71 | { 72 | "id": "*myRDSPassword", 73 | "type": "String", 74 | "description": "RDS MySQL password" 75 | }, 76 | { 77 | "watermark" : "s3://mybucket", 78 | "id": "myS3LogsPath", 79 | "type": "AWS::S3::ObjectKey", 80 | "description": "S3 folder for logs" 81 | }, 82 | { 83 | "watermark" : "s3://mybucket", 84 | "id": "myOutputS3Path", 85 | "type": "AWS::S3::ObjectKey", 86 | "description": "Output S3 folder" 87 | }, 88 | { 89 | "watermark" : "sg-12345", 90 | "id": "myEc2RdsSecurityGrps", 91 | "type": "String", 92 | "description": "RDS MySQL security group(s)", 93 | "isArray": "true", 94 | "optional": "true" 95 | }, 96 | { 97 | "id": "myRDSUsername", 98 | "type": "String", 99 | "description": "RDS MySQL username" 100 | }, 101 | { 102 | "id": "myRDSTableName", 103 | "type": "String", 104 | "description": "RDS MySQL table name" 105 | }, 106 | { 107 | "watermark" : "jdbc:mysql://connectionstring:3306/dbname", 108 | "id": "myRDSConnectStr", 109 | "type": "String", 110 | "description": "RDS MySQL connection string" 111 | } 112 | ] 113 | } 114 | -------------------------------------------------------------------------------- /samples/RDStoS3/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline RDStoS3 Sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a pipeline that outputs a MySQL table in csv format from a RDS database to an S3 bucket. 6 | 7 | The project provides scripts for setting up the resources for the pipeline, installing the [data set](http://aws.amazon.com/datasets/6468931156960467), and destroying the resources. The project also provides the [pipeline definition file](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html) which is used to create the pipeline and the AWS CLI commands for creating and executing the pipeline. See the instructions below to get started. 8 | 9 | *Note: Normal AWS charges apply for the resources created by the script. Make sure to run the teardown script as soon as you are done with the sample.* 10 | 11 | ## Prerequisites 12 | 13 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 14 | 15 | You must also provide the S3Path of a S3 bucket with write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 16 | 17 | ## Step 1: Priming this sample 18 | 19 | Run the following commands to run the setup script. The AWS resources that will be created are a RDS MySQL database and optionally an S3 bucket. 20 | 21 | The script takes an *optional* parameter for an S3 path for outputting the data from S3. 
If you choose to provide your own S3 path, the bucket must be in the same region as the one configured for your AWS CLI. Finally, please make sure the S3 bucket has a policy that allows writes to it. 22 | 23 | If the path is not provided, the script will create the S3 bucket for you. 24 | 25 | *Setup and teardown scripts are located in the setup directory under the RDStoS3 directory in the samples directory.* 26 | ```sh 27 | $> cd /data-pipeline-samples/samples/RDStoS3 28 | $> python setup/Setup.py --s3-path [s3://optional/path/to/s3/location] 29 | ``` 30 | 31 | ## Step 2: Run this sample pipeline using the AWS CLI 32 | 33 | ```sh 34 | $> aws datapipeline create-pipeline --name rds_to_s3_pipeline --unique-id rds_to_s3_pipeline 35 | ``` 36 | 37 | You receive a pipelineId like this. 38 | ```sh 39 | # ----------------------------------------- 40 | # | CreatePipeline | 41 | # +-------------+--------------------------+ 42 | # | pipelineId | | 43 | # +-------------+--------------------------+ 44 | ``` 45 | 46 | ```sh 47 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://RDStoS3Pipeline.json --parameter-values myOutputS3Path= myS3LogsPath= '*myRDSPassword=' myRDSUsername= myRDSTableName= myRDSConnectStr= --pipeline-id 48 | ``` 49 | 50 | You receive validation messages like this 51 | ```sh 52 | # ----------------------- 53 | # |PutPipelineDefinition| 54 | # +-----------+---------+ 55 | # | errored | False | 56 | # +-----------+---------+ 57 | ``` 58 | 59 | Now activate the pipeline 60 | ```sh 61 | $> aws datapipeline activate-pipeline --pipeline-id 62 | ``` 63 | 64 | Check the status of your pipeline 65 | ```sh 66 | >$ aws datapipeline list-runs --pipeline-id 67 | ``` 68 | 69 | You will receive status information on the pipeline. 70 | ```sh 71 | # Name Scheduled Start Status 72 | # ID Started Ended 73 | #--------------------------------------------------------------------------------------------------- 74 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 75 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 76 | # 77 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 78 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 79 | # 80 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 81 | ``` 82 | 83 | Let the pipeline complete, then check the output S3 bucket for the output CSV file. 84 | 85 | ## Step 3: IMPORTANT! Tear down this sample 86 | 87 | *Note: The setup script will provide the teardown command with parameters at the end of the execution.* 88 | 89 | ```sh 90 | $> python setup/Teardown.py --rds-instance-id --s3-path [s3://optional/path/to/s3/bucket/created/by/setup] 91 | ``` 92 | 93 | ## Disclaimer 94 | 95 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 96 | 97 | Use at your own risk. 98 | 99 | Licensed under the MIT-0 License.
100 | -------------------------------------------------------------------------------- /samples/RDStoS3/setup/Setup.py: -------------------------------------------------------------------------------- 1 | from RDStoS3Sample import RDStoS3Sample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | import sys 6 | 7 | 8 | if __name__ == '__main__': 9 | check_working_directory() 10 | parser = argparse.ArgumentParser(description='Setup for RDS to S3 pipeline sample') 11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path") 12 | args = parser.parse_args() 13 | s3_bucket_path = args.s3_bucket_path 14 | 15 | sample = RDStoS3Sample() 16 | 17 | if s3_bucket_path is None: 18 | sample.create_s3_bucket() 19 | elif not sample.validate_s3_bucket_path(s3_bucket_path): 20 | sys.exit(0) 21 | 22 | sample.create_rds_instance() 23 | sample.run_setup_datapipeline() 24 | sample.print_setup_results() 25 | -------------------------------------------------------------------------------- /samples/RDStoS3/setup/SetupPipelineDefinition.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class SetupPipelineDefinitionHelper(object): 5 | 6 | def __init__(self): 7 | with open("setup.json", "r") as setup: 8 | pipeline_string = setup.read().replace('\n', '') 9 | self.pipeline_definition = json.loads(pipeline_string) 10 | 11 | def get_setup_pipeline_objects(self): 12 | return self.pipeline_definition['objects'] 13 | 14 | def get_setup_pipeline_parameters(self): 15 | return self.pipeline_definition['parameters'] 16 | 17 | def get_setup_pipeline_parameter_values(self): 18 | return self.pipeline_definition['parameterValues'] 19 | -------------------------------------------------------------------------------- /samples/RDStoS3/setup/Teardown.py: -------------------------------------------------------------------------------- 1 | from RDStoS3Sample import RDStoS3Sample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | 6 | 7 | if __name__ == '__main__': 8 | check_working_directory() 9 | 10 | parser = argparse.ArgumentParser(description='Teardown for RDS to S3 pipeline sample') 11 | parser.add_argument('--s3-path', action="store", dest="s3_bucket_path") 12 | parser.add_argument('--rds-instance-id', action="store", dest="rds_instance_id") 13 | parser.add_argument('--redshift-cluster-id', action="store", dest="redshift_cluster_id") 14 | args = parser.parse_args() 15 | 16 | sample = RDStoS3Sample() 17 | 18 | if args.rds_instance_id is not None: 19 | sample.destroy_rds(args.rds_instance_id) 20 | 21 | 22 | if args.s3_bucket_path is not None: 23 | sample.destroy_s3_bucket(args.s3_bucket_path) 24 | -------------------------------------------------------------------------------- /samples/RDStoS3/setup/Utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def check_working_directory(): 6 | current_folder_path, current_folder_name = os.path.split(os.getcwd()) 7 | if current_folder_name == 'RDStoS3': 8 | os.chdir('setup') 9 | elif current_folder_name != 'setup': 10 | print 'ERROR: please run the setup script from data-pipeline-samples/samples/RDStoS3/setup' 11 | sys.exit(0) 12 | -------------------------------------------------------------------------------- /samples/RedshiftCopyActivityFromDynamoDBTable/RedshiftCopyActivityFromDynamoDBTable.json: -------------------------------------------------------------------------------- 1 | 
{ 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline.", 5 | 6 | "name": "Default", 7 | "id": "Default", 8 | "failureAndRerunMode": "CASCADE", 9 | "schedule": { 10 | "ref": "DefaultSchedule" 11 | }, 12 | "resourceRole": "DataPipelineDefaultResourceRole", 13 | "role": "DataPipelineDefaultRole", 14 | "pipelineLogUri": "#{myLogUri}", 15 | "scheduleType": "cron" 16 | }, 17 | { 18 | "myComment": "This object provides connection information for the Redshift cluster.", 19 | 20 | "name": "DefaultDatabase1", 21 | "id": "DatabaseId_Kw7C9", 22 | "connectionString": "#{myConnectionString}", 23 | "databaseName": "#{myRedshiftDatabase}", 24 | "*password": "#{myRedshiftPassword}", 25 | "type": "RedshiftDatabase", 26 | "username": "#{myRedshiftUsername}" 27 | }, 28 | { 29 | "myComment": "This object is used to provide the resource where the copy job is invoked.", 30 | 31 | "name": "DefaultResource1", 32 | "id": "ResourceId_idL0Y", 33 | "resourceRole": "DataPipelineDefaultResourceRole", 34 | "role": "DataPipelineDefaultRole", 35 | "type": "Ec2Resource", 36 | "terminateAfter": "1 Hour" 37 | }, 38 | { 39 | "myComment": "This object is used to specify the copy activity for moving data from DynamoDB to Redshift.", 40 | 41 | "name": "CopyFromDDBToRedshift", 42 | "id": "ActivityId_vmVn4", 43 | "database": { 44 | "ref": "DatabaseId_Kw7C9" 45 | }, 46 | "runsOn": { 47 | "ref": "ResourceId_idL0Y" 48 | }, 49 | "type": "SqlActivity", 50 | "script": "#{myScript}" 51 | }, 52 | { 53 | "myComment": "This object is used to control the task schedule.", 54 | 55 | "name": "RunOnce", 56 | "id": "DefaultSchedule", 57 | "occurrences": "1", 58 | "period": "1 Day", 59 | "type": "Schedule", 60 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 61 | } 62 | ], 63 | "parameters": [] 64 | } 65 | -------------------------------------------------------------------------------- /samples/RedshiftCopyActivityFromDynamoDBTable/readme.md: -------------------------------------------------------------------------------- 1 | #RedshiftCopyActivityFromDynamoDBTable Sample 2 | 3 | This sample demonstrates how you can use Data Pipeline's RedshiftCopyActivity to copy data from a DynamoDB table to a Redshift table. This sample was motivated by a use case that requires the user to provide AWS credentials to access the DynamoDB table. It is assumed that the owner of the DynamoDB table has granted the user read access to the table. To make this sample to work, you must ensure you have the following: 4 | 5 | * Connection string for the destination Redshift cluster, e.g. jdbc:redshift://_hostname_:5439/_database_ 6 | * Redshift database name 7 | * Redshift username and password. This user must have write access to the table where data will be copied to. 8 | * DynamoDB table name. Note that both the table name and column names must match on both sides of the copy. 9 | * AWS credentials, i.e the access key and the secret key, to access the DynamoDB table. 10 | * DynamoDB table read ratio. 11 | * S3 location to direct log messages generated by Data Pipeline. 12 | 13 | You will need to provide the above information in the "put-pipeline-definition" command below. 14 | 15 | ##Running this sample 16 | 17 | ```sh 18 | $> aws datapipeline create-pipeline --name redshift_copy_from_dynamodb_pipeline --unique-id redshift_copy_from_dynamodb_pipeline 19 | 20 | # You receive a pipeline activity like this. 
21 | # ----------------------------------------- 22 | # | CreatePipeline | 23 | # +-------------+--------------------------+ 24 | # | pipelineId | df-0554887H4KXKTY59MRJ | 25 | # +-------------+--------------------------+ 26 | 27 | #now upload the pipeline definition 28 | 29 | $> aws datapipeline put-pipeline-definition --pipeline-id df-0554887H4KXKTY59MRJ \ 30 | --pipeline-definition file://samples/RedshiftCopyActivitySample/RedshiftCopyActivitySample.json \ 31 | --parameter-values myConnectionString= myRedshiftDatabase= \ 32 | myRedshiftUsername= myRedshiftPassword= \ 33 | myScript="copy from 'dynamodb://' credentials 'aws_access_key_id=;aws_secret_access_key=' readratio ;" \ 34 | myLogUri="" 35 | 36 | # You receive a validation messages like this 37 | 38 | # ----------------------- 39 | # |PutPipelineDefinition| 40 | # +-----------+---------+ 41 | # | errored | False | 42 | # +-----------+---------+ 43 | 44 | #now activate the pipeline 45 | $> aws datapipeline activate-pipeline --pipeline-id df-0554887H4KXKTY59MRJ 46 | 47 | 48 | #check the status of your pipeline 49 | 50 | >$ aws datapipeline list-runs --pipeline-id df-0554887H4KXKTY59MRJ 51 | # Name Scheduled Start Status 52 | # ID Started Ended 53 | #--------------------------------------------------------------------------------------------------- 54 | # 1. ActivityId_vmVn4 2015-11-06T23:52:04 WAITING_FOR_RUNNER 55 | # @ActivityId_vmVn4_2015-11-06T23:52:04 2015-11-06T23:52:11 56 | # 57 | # 2. ResourceId_idL0Y 2015-11-06T23:52:04 CREATING 58 | # @ResourceId_idL0Y_2015-11-06T23:52:04 2015-11-06T23:52:11 59 | ``` 60 | 61 | ##Related documentation 62 | https://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-redshiftcopyactivity.html 63 | 64 | -------------------------------------------------------------------------------- /samples/S3ToRdsSqoop/README.md: -------------------------------------------------------------------------------- 1 | # Loading a CSV file stored in S3 into an RDS MySQL instance 2 | 3 | This sample uses [sqoop](http://sqoop.apache.org/) to load a CSV filed stored in [S3](https://aws.amazon.com/s3/) into a [MySQL](https://www.mysql.com/) or [MariaDB](https://mariadb.org/) database instance managed by [RDS](https://aws.amazon.com/rds/). Sqoop is a specialized tool that uses [hadoop](http://hadoop.apache.org/) to transfer bulk data in and out of relational databases. It completes this task more quickly than Data Pipeline's built-in CopyActivity, but it is also more resource intensive. The sample takes advantage of built-in support for sqoop in [EMR](https://aws.amazon.com/emr/) 5.0. 4 | 5 | ## Parameters 6 | 7 | Parameter | Required | Description 8 | ----------|----------|------------ 9 | myEmrMasterInstanceType | no | The EC2 instance type to use for the master node in the EMR cluster. Default: m2.xlarge 10 | myEmrCoreInstanceType | no | The EC2 instance type to use for the core nodes in the EMR cluster. Default: m2.xlarge 11 | myEmrCoreInstanceCount | no | The number of core nodes to launch in the EMR cluster. Default: 2 12 | myRdsEndpoint | yes | DNS endpoint for target RDS instance. The value should include the port number. Example: test.xyzw.us-east-1.rds.amazonaws.com:3306 13 | myRdsDatabaseName | yes | Name of the target MySQL or MariaDB database. 14 | myRdsTableName | yes | Name of the database table that the CSV will be imported into. 15 | myRdsUsername | yes | User name to use to connect to RDS. 16 | \*myRdsPassword | yes | Password to use to connect to RDS. 
17 | myS3InputDataLocation | yes | S3 path to folder where the CSV data is stored. Example: s3://example-s3-path/folder-containing-csv-data/ 18 | myPipelineLogUri | yes | S3 folder where log data generated by this pipeline will be written. Example: s3://example-s3-path/folder-to-contain-log-files/ 19 | 20 | ## Prerequisites 21 | 22 | This template assumes that you have already created an RDS instance running MySQL or MariaDB. Inside the instance you will need a database and table where the records will be inserted. You will need to know the database name, the table name, the database user name and password, and the DNS endpoint of the RDS instance. You can use the RDS console to view the DNS endpoint and the master user name and to modify the master password as needed. You will need to use the MySQL command-line tool or a graphical client like [MySQL Workbench](https://www.mysql.com/products/workbench/) to create the target database and table. See [here](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_ConnectToInstance.html) for more information on connecting to MySQL on RDS. Note that the schema of the table where you will be importing records should match the schema in the CSV file (i.e., it should have the same number of columns and appropriate column types). 23 | 24 | ## Running this sample 25 | 26 | Create a new pipeline. Throughout this section we assume that the S3ToRdsSqoop sample directory is 27 | your current working directory. 28 | 29 | ```sh 30 | $> aws datapipeline create-pipeline --name s3-to-rds-sqoop --unique-id s3-to-rds-sqoop 31 | # { 32 | # "pipelineId": "df-03971252U4AVY60545T7" 33 | # } 34 | ``` 35 | 36 | Upload the [pipeline definition](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html). Use the `pipelineId` that was returned by the `create-pipeline` command. Specify the required parameters. 37 | 38 | ```sh 39 | $> aws datapipeline put-pipeline-definition --pipeline-id \ 40 | --pipeline-definition file://sqoop_activity.json \ 41 | --parameter-values myRdsEndpoint= myRdsDatabaseName= myRdsTableName= \ 42 | myRdsUsername= '*myRdsPassword=' myS3InputDataLocation= myPipelineLogUri= 43 | # { 44 | # "errored": false, 45 | # "validationWarnings": [], 46 | # "validationErrors": [] 47 | # } 48 | ``` 49 | 50 | Activate the pipeline. Use the `pipelineId` that was returned by the `create-pipeline` command. 51 | 52 | ```sh 53 | $> aws datapipeline activate-pipeline --pipeline-id 54 | ``` 55 | 56 | Optionally, check the status of your running pipeline. Use the `pipelineId` that was returned by the 57 | `create-pipeline` command. When the pipeline has completed, the Status Ended column in the output 58 | from this command will show FINISHED for all pipeline nodes. 59 | 60 | ```sh 61 | 62 | >$ aws datapipeline list-runs --pipeline-id 63 | 64 | ``` 65 | 66 | Once the pipeline has completed, you should be able to see the imported records in MySQL by running a SELECT query using the MySQL command-line tool or a graphical client. 67 | 68 | ## Next steps 69 | 70 | In addition to the required parameters, there are optional parameters to set the EC2 instance types launched by the EMR cluster as well as the number of core nodes to launch. Changing these parameters may improve the performance of the import job. 71 | 72 | Once the pipeline has completed, you can delete it with the following command.
73 | 74 | ```sh 75 | $> aws datapipeline delete-pipeline --pipeline-id 76 | ``` 77 | 78 | The resources used by this example will incur normal charges. If you created any resources specifically to test this pipeline, you may wish to delete them now. 79 | 80 | ## Disclaimer 81 | 82 | The samples in this repository are meant to help users get started with Data Pipeline. They may not 83 | be sufficient for production environments. Users should carefully inspect samples before running 84 | them. 85 | 86 | *Use at your own risk.* 87 | 88 | Licensed under the MIT-0 License. 89 | -------------------------------------------------------------------------------- /samples/S3ToRdsSqoop/sqoop_activity.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "The default object sets global properties for the pipeline.", 5 | "id": "Default", 6 | "name": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myPipelineLogUri}", 11 | "scheduleType": "ONDEMAND" 12 | }, 13 | { 14 | "myComment": "An EMR cluster where the Sqoop job will be run. These parameters can be edited to create a more powerful cluster.", 15 | "id": "MyEmrCluster", 16 | "name": "MyEmrCluster", 17 | "type": "EmrCluster", 18 | "masterInstanceType": "#{myEmrMasterInstanceType}", 19 | "coreInstanceType": "#{myEmrCoreInstanceType}", 20 | "coreInstanceCount": "#{myEmrCoreInstanceCount}", 21 | "releaseLabel": "emr-5.0.0", 22 | "applications": ["sqoop"], 23 | "terminateAfter": "12 hours" 24 | }, 25 | { 26 | "myComment": "S3 folder where the input CSV is stored.", 27 | "id": "S3InputDataLocation", 28 | "name": "S3InputDataLocation", 29 | "directoryPath": "#{myS3InputDataLocation}", 30 | "type": "S3DataNode" 31 | }, 32 | { 33 | "myComment": "The shell command to invoke sqoop to copy the CSV into RDS. This template assumes that the target database is either MySQL or MariaDB and that the target table has already been created.", 34 | "id": "SqoopActivity", 35 | "name": "SqoopActivity", 36 | "runsOn": { 37 | "ref": "MyEmrCluster" 38 | }, 39 | "input": { 40 | "ref": "S3InputDataLocation" 41 | }, 42 | "type": "ShellCommandActivity", 43 | "command": "sqoop export --connect jdbc:mariadb://#{myRdsEndpoint}/#{myRdsDatabaseName} --driver org.mariadb.jdbc.Driver --table #{myRdsTableName} --username #{myRdsUsername} --password #{*myRdsPassword} --export-dir #{myS3InputDataLocation}" 44 | } 45 | ], 46 | "parameters": [ 47 | { 48 | "id": "myEmrMasterInstanceType", 49 | "type": "String", 50 | "default": "m2.xlarge", 51 | "description": "The EC2 instance type to use for the master node in the EMR cluster" 52 | }, 53 | { 54 | "id": "myEmrCoreInstanceType", 55 | "type": "String", 56 | "default": "m2.xlarge", 57 | "description": "The EC2 instance type to use for the core nodes in the EMR cluster" 58 | }, 59 | { 60 | "id": "myEmrCoreInstanceCount", 61 | "type": "String", 62 | "default": "2", 63 | "description": "The number of core nodes to launch in the EMR cluster" 64 | }, 65 | { 66 | "id": "myRdsEndpoint", 67 | "type": "String", 68 | "description": "DNS endpoint for target RDS instance. The value should include the port number. 
Example: test.xyzw.us-east-1.rds.amazonaws.com:3306" 69 | }, 70 | { 71 | "id": "myRdsDatabaseName", 72 | "type": "String", 73 | "description": "Name of the target MySQL or MariaDB database" 74 | }, 75 | { 76 | "id": "myRdsTableName", 77 | "type": "String", 78 | "description": "Name of the database table that the CSV will be imported into" 79 | }, 80 | { 81 | "id": "myRdsUsername", 82 | "type": "String", 83 | "description": "User name to use to connect to RDS" 84 | }, 85 | { 86 | "id": "*myRdsPassword", 87 | "type": "String", 88 | "description": "Password to use to connect to RDS" 89 | }, 90 | { 91 | "id": "myS3InputDataLocation", 92 | "type": "AWS::S3::ObjectKey", 93 | "description": "S3 path to folder where the CSV data is stored" 94 | }, 95 | { 96 | "id": "myPipelineLogUri", 97 | "type": "AWS::S3::ObjectKey", 98 | "description": "S3 folder where log data generated by this pipeline will be written" 99 | } 100 | ] 101 | } 102 | -------------------------------------------------------------------------------- /samples/S3TsvFilesToRedshiftTablesIfReady/readme.md: -------------------------------------------------------------------------------- 1 | #Data Pipeline Load Tab Separated Files in S3 to Redshift if file exists 2 | 3 | ##About the sample 4 | This pipeline definition when imported would instruct Redshift to load two TSV files from given two S3 location, into two different Redshift Table. Two copy activities are independent, each will start once the input s3 file exists. Table insert mode is OVERWRITE_EXISTING. 5 | 6 | ##Running this sample 7 | The pipeline requires the following user input point: 8 | 9 | 1. Redshift connection info 10 | 2. The S3 file locations where the input TSV files are located. 11 | 2. Redshift target table names of each S3 file to copy to. 12 | 3. Redshift Cluster security group id(s). 13 | 14 | 15 | ## Prerequisites 16 | 17 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 18 | Redshift Cluster and Table must already exist. 19 | S3 tsv file locations are input for this pipeline, RedshiftCopy activity will start only when input S3 file exists. 20 | 21 | 22 | ## Run this sample pipeline using the AWS CLI 23 | 24 | ```sh 25 | $> aws datapipeline create-pipeline --name s3_if_ready_to_redshift --unique-id s3_if_ready_to_redshift 26 | ``` 27 | 28 | You receive a pipelineId like this. 29 | ```sh 30 | # ----------------------------------------- 31 | # | CreatePipeline | 32 | # +-------------+--------------------------+ 33 | # | pipelineId | | 34 | # +-------------+--------------------------+ 35 | ``` 36 | 37 | ```sh 38 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://S3TsvFilesToRedshiftTablesIfReady.json --pipeline-id \ 39 | --parameter-values myRedshiftUsername= \*myRedshiftPassword= \ 40 | myRedshiftDbName= \ 41 | myRedshiftSecurityGrpIds= \ 42 | myRedshiftJdbcConnectStr=\ 43 | myInputTsvFilesS3Loc_1=\ 44 | myDestRedshiftTable_1=
\ 45 | myInputTsvFilesS3Loc_2=<s3://myInputTsvFilesS3Loc_2.csv>\ 46 | myDestRedshiftTable_2=
\ 47 | myLogUri= 48 | 49 | ``` 50 | 51 | You receive a validation message like this 52 | ```sh 53 | # ----------------------- 54 | # |PutPipelineDefinition| 55 | # +-----------+---------+ 56 | # | errored | False | 57 | # +-----------+---------+ 58 | ``` 59 | 60 | Now activate the pipeline 61 | ```sh 62 | $> aws datapipeline activate-pipeline --pipeline-id 63 | ``` 64 | 65 | Check the status of your pipeline 66 | ```sh 67 | >$ aws datapipeline list-runs --pipeline-id 68 | ``` 69 | 70 | You will receive status information on the pipeline. 71 | 72 | 73 | ## Disclaimer 74 | 75 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 76 | 77 | Use at your own risk. 78 | 79 | Licensed under the MIT-0 License. 80 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline SQL Activity with timeout sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a pipeline that uses the SQL activity to execute queries defined in a .sql script file 6 | that is stored on S3. The SQL queries are executed against an RDS MySQL database instance. 7 | 8 | The sample also demonstrates setting an explicit timeout on the attempt of the SQL activity (attemptTimeout: "1 hour") in the pipeline definition json file. This field can be set appropriately based on the expected run time of the activity attempt. 9 | 10 | The project provides scripts for setting up the RDS database for the sample, importing a [data set](http://aws.amazon.com/datasets/6468931156960467) (pipeline.json), and destroying the RDS database. The project also provides the [pipeline definition file](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-writing-pipeline-definition.html) which is used to create the pipeline and the AWS CLI commands for creating and executing the pipeline. See the instructions below to get started. 11 | 12 | *Note: Normal AWS charges apply for the resources created by the script. Make sure to run the teardown script as soon as you are done with the sample.* 13 | 14 | ## Prerequisites 15 | 16 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 17 | 18 | ## Step 1: Priming this sample 19 | 20 | Run the following commands to run the setup script. 21 | 22 | *Setup and teardown scripts are located in the setup directory.* 23 | ```sh 24 | $> cd /data-pipeline-samples/samples/SQLActivityWithTimeout 25 | $> python setup/Setup.py 26 | ``` 27 | 28 | ## Step 2: Run this sample pipeline using the AWS CLI 29 | 30 | ```sh 31 | $> aws datapipeline create-pipeline --name sql_activity_pipeline --unique-id sql_activity_pipeline 32 | ``` 33 | 34 | You receive a pipelineId like this.
35 | ```sh 36 | # ----------------------------------------- 37 | # | CreatePipeline | 38 | # +-------------+--------------------------+ 39 | # | pipelineId | | 40 | # +-------------+--------------------------+ 41 | ``` 42 | 43 | ```sh 44 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json --parameter-values 45 | myS3LogsPath= myRDSUsername= myRDSPassword= 46 | myRDSId= --pipeline-id 47 | ``` 48 | 49 | You receive a validation messages like this 50 | ```sh 51 | # ----------------------- 52 | # |PutPipelineDefinition| 53 | # +-----------+---------+ 54 | # | errored | False | 55 | # +-----------+---------+ 56 | ``` 57 | 58 | Now activate the pipeline 59 | ```sh 60 | $> aws datapipeline activate-pipeline --pipeline-id 61 | ``` 62 | 63 | Check the status of your pipeline 64 | ```sh 65 | >$ aws datapipeline list-runs --pipeline-id 66 | ``` 67 | 68 | You will receive status information on the pipeline. For example... 69 | ```sh 70 | # Name Scheduled Start Status 71 | # ID Started Ended 72 | #--------------------------------------------------------------------------------------------------- 73 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 74 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 75 | # 76 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 77 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 78 | # 79 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 80 | ``` 81 | 82 | 83 | ## Step 3: IMPORTANT! Tear down this sample 84 | 85 | *Note: The setup script will provide the teardown command with parameters at end of the execution.* 86 | 87 | ```sh 88 | $> python setup/Teardown.py --rds-instance-id 89 | ``` 90 | 91 | ## Disclaimer 92 | 93 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 94 | 95 | Use at your own risk. 96 | 97 | Licensed under the MIT-0 License. 98 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "scheduleType": "cron", 12 | "schedule": { 13 | "ref": "DefaultSchedule" 14 | } 15 | }, 16 | { 17 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. 
In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it.", 18 | 19 | "type": "Schedule", 20 | "id": "DefaultSchedule", 21 | "occurrences": "1", 22 | "period": "1 Day", 23 | "name": "RunOnce", 24 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 25 | }, 26 | { 27 | "myComment": "This object is a SqlActivity which can be used to query data from a relational database.", 28 | 29 | "type": "SqlActivity", 30 | "id": "ActivityId_wRpKi", 31 | "name": "DefaultActivity1", 32 | "database": { 33 | "ref": "DatabaseId_rAZmM" 34 | }, 35 | "reportProgressTimeout": "1 Hour", 36 | "attemptTimeout": "1 Hour", 37 | "scriptUri": "s3://data-pipeline-samples/sqlactivity/script.sql", 38 | "runsOn": { 39 | "ref": "ResourceId_nEzqN" 40 | } 41 | }, 42 | { 43 | "myComment": "This object defines the RDS Datbase resource that the SQL query will run on.", 44 | 45 | "type": "RdsDatabase", 46 | "id": "DatabaseId_rAZmM", 47 | "rdsInstanceId" : "#{myRDSId}", 48 | "name": "DefaultDatabase1", 49 | "username": "#{myRDSUsername}", 50 | "*password": "#{myRDSPassword}" 51 | }, 52 | { 53 | "myComment": "This object defines the EC2 resource ", 54 | 55 | "type": "Ec2Resource", 56 | "id": "ResourceId_nEzqN", 57 | "name": "DefaultResource1", 58 | "resourceRole": "DataPipelineDefaultResourceRole", 59 | "role": "DataPipelineDefaultRole", 60 | "terminateAfter": "2 Hours" 61 | } 62 | ], 63 | "parameters": [ 64 | { 65 | "type": "String", 66 | "id": "myRDSId", 67 | "description": "RDS instance id" 68 | }, 69 | { 70 | "type": "String", 71 | "id": "myRDSUsername", 72 | "description": "RDS MySQL username" 73 | }, 74 | { 75 | "type": "String", 76 | "id": "myRDSPassword", 77 | "description": "RDS MySQL password" 78 | }, 79 | { 80 | "type": "AWS::S3::ObjectKey", 81 | "id": "myS3LogsPath", 82 | "description": "S3 folder for logs" 83 | } 84 | ] 85 | } -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/setup/Setup.py: -------------------------------------------------------------------------------- 1 | from SQLActivitySample import SQLActivitySample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | import sys 6 | 7 | 8 | if __name__ == '__main__': 9 | check_working_directory() 10 | parser = argparse.ArgumentParser(description='Setup for SQLActivity pipeline sample') 11 | args = parser.parse_args() 12 | 13 | sample = SQLActivitySample() 14 | sample.create_rds_instance() 15 | sample.run_setup_datapipeline() 16 | sample.print_setup_results() 17 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/setup/SetupPipelineDefinition.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class SetupPipelineDefinitionHelper(object): 5 | 6 | def __init__(self): 7 | with open("setup.json", "r") as setup: 8 | pipeline_string = setup.read().replace('\n', '') 9 | self.pipeline_definition = json.loads(pipeline_string) 10 | 11 | def get_setup_pipeline_objects(self): 12 | return self.pipeline_definition['objects'] 13 | 14 | def get_setup_pipeline_parameters(self): 15 | return self.pipeline_definition['parameters'] 16 | 17 | def get_setup_pipeline_parameter_values(self): 18 | return self.pipeline_definition['parameterValues'] 19 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/setup/Teardown.py: 
-------------------------------------------------------------------------------- 1 | from SQLActivitySample import SQLActivitySample 2 | from Utilities import check_working_directory 3 | 4 | import argparse 5 | 6 | 7 | if __name__ == '__main__': 8 | check_working_directory() 9 | 10 | parser = argparse.ArgumentParser(description='Teardown for SQLActivityWithTimeout pipeline sample') 11 | parser.add_argument('--rds-instance-id', action="store", dest="rds_instance_id") 12 | args = parser.parse_args() 13 | 14 | sample = SQLActivitySample() 15 | 16 | if args.rds_instance_id is not None: 17 | sample.destroy_rds(args.rds_instance_id) 18 | -------------------------------------------------------------------------------- /samples/SQLActivityWithTimeout/setup/Utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 5 | def check_working_directory(): 6 | current_folder_path, current_folder_name = os.path.split(os.getcwd()) 7 | if current_folder_name == 'SQLActivityWithTimeout': 8 | os.chdir('setup') 9 | elif current_folder_name != 'setup': 10 | print 'ERROR: please run the setup script from data-pipeline-samples/samples/SQLActivityWithTimeout/setup' 11 | sys.exit(0) 12 | -------------------------------------------------------------------------------- /samples/ShellCommandWithFTP/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline ShellCommandWith (S)FTP Sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a Shell Command Activity pipeline that uses a (s)ftp server to get files. The sample relies 6 | on having public key authentication configured to access the SFTP server. The sample also uses an input and output s3 7 | bucket for storing input scripts and output results of the shell command. 8 | 9 | The sample includes the pipeline definition, a script of ftp commands and a data file. 10 | 11 | ## Prerequisites 12 | 13 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 14 | 15 | You must also provide the S3Path of a S3 bucket with write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 16 | 17 | You must also upload *ftpcommands* to your s3 input bucket and the *data* file from this directory to your ftp server. 18 | 19 | ## Run this sample pipeline using the AWS CLI 20 | 21 | ```sh 22 | $> aws datapipeline create-pipeline --name shell_command_ftp_pipeline --unique-id shell_command_ftp_pipeline 23 | ``` 24 | 25 | You receive a pipelineId like this.
26 | ```sh 27 | # ----------------------------------------- 28 | # | CreatePipeline | 29 | # +-------------+--------------------------+ 30 | # | pipelineId | | 31 | # +-------------+--------------------------+ 32 | ``` 33 | 34 | ```sh 35 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json --parameter-values myS3InputLoc= myS3OutputLoc= myS3LogsPath= myFTPUser= myFTPHost= --pipeline-id 36 | ``` 37 | 38 | You receive a validation messages like this 39 | ```sh 40 | # ----------------------- 41 | # |PutPipelineDefinition| 42 | # +-----------+---------+ 43 | # | errored | False | 44 | # +-----------+---------+ 45 | ``` 46 | 47 | Now activate the pipeline 48 | ```sh 49 | $> aws datapipeline activate-pipeline --pipeline-id 50 | ``` 51 | 52 | Check the status of your pipeline 53 | ``` 54 | >$ aws datapipeline list-runs --pipeline-id 55 | ``` 56 | 57 | You will receive status information on the pipeline. 58 | ```sh 59 | # Name Scheduled Start Status 60 | # ID Started Ended 61 | #--------------------------------------------------------------------------------------------------- 62 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 63 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 64 | # 65 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 66 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 67 | # 68 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 69 | ``` 70 | 71 | 72 | ## Disclaimer 73 | 74 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 75 | 76 | Use at your own risk. 77 | 78 | Licensed under the MIT-0 License. 79 | -------------------------------------------------------------------------------- /samples/ShellCommandWithFTP/data: -------------------------------------------------------------------------------- 1 | test 2 | test 3 | test 4 | test 5 | test 6 | test 7 | test 8 | test 9 | test 10 | test 11 | test 12 | test 13 | test 14 | test 15 | test 16 | test 17 | test 18 | test 19 | test 20 | test 21 | test 22 | test 23 | test 24 | test 25 | test 26 | test 27 | test 28 | test 29 | -------------------------------------------------------------------------------- /samples/ShellCommandWithFTP/ftpcommands: -------------------------------------------------------------------------------- 1 | cd /var/tmp 2 | get data 3 | exit -------------------------------------------------------------------------------- /samples/ShellCommandWithFTP/pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "name": "Default", 12 | "scheduleType": "cron", 13 | "schedule": { 14 | "ref": "DefaultSchedule" 15 | } 16 | }, 17 | { 18 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. 19 | In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it. 
20 | For this example, we are using it to specify that our pipeline will run immediately upon activation. Also, we are using the 'occurrences' 21 | option to specify that the pipeline should only be run once. You can have multiple schedules defined in a pipeline.", 22 | 23 | "type": "Schedule", 24 | "id": "DefaultSchedule", 25 | "occurrences": "1", 26 | "period": "1 Day", 27 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 28 | }, 29 | { 30 | "myComment": "Pipeline object that represents the S3 bucket node that is used as the input staging directory in this pipeline.", 31 | 32 | "type": "S3DataNode", 33 | "id": "S3InputLocation", 34 | "directoryPath": "#{myS3InputLoc}" 35 | }, 36 | { 37 | "myComment": "Pipeline object that represents the S3 bucket node that is used as the output staging directory in this pipeline.", 38 | 39 | "type": "S3DataNode", 40 | "id": "S3OutputLocation", 41 | "directoryPath": "#{myS3OutputLoc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}" 42 | }, 43 | { 44 | "myComment": "This object is a ShellCommandActivity. It is used to specify the Linux shell command that will be invoked. 45 | ${INPUT1_STAGING_DIR} is the identifier used to refer to the input staging directory and ${OUTPUT1_STAGING_DIR} is used to refer 46 | to the output staging directory.", 47 | 48 | "type": "ShellCommandActivity", 49 | "id": "ShellCommandActivityObj", 50 | "stage": "true", 51 | "input": { 52 | "ref": "S3InputLocation" 53 | }, 54 | "output": { 55 | "ref": "S3OutputLocation" 56 | }, 57 | "runsOn": { 58 | "ref": "EC2ResourceObj" 59 | }, 60 | "command": "sftp -b ${INPUT1_STAGING_DIR}/ftpcommands #{user}@#{host}; wc -l data > ${OUTPUT1_STAGING_DIR}/linecount.txt;" 61 | }, 62 | { 63 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 64 | 65 | "instanceType": "t1.micro", 66 | "name": "EC2ResourceObj", 67 | "id": "EC2ResourceObj", 68 | "type": "Ec2Resource", 69 | "terminateAfter": "20 Minutes" 70 | } 71 | ], 72 | "parameters": [ 73 | { 74 | "myComment": "This Parameter specifies the S3 logging path for the pipeline. 75 | It is used by the 'Default' object to set the 'pipelineLogUri' value. Using Parameters helps users 76 | avoid hard coding variables in pipeline definitions. Users can instead supply these parameters when calling ' 77 | aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline'.", 78 | 79 | "id" : "myS3LogsPath", 80 | "type" : "AWS::S3::ObjectKey", 81 | "description" : "S3 path for pipeline logs."
82 | }, 83 | { 84 | "myComment": "This Parameter specifies the S3 input location for the pipeline.", 85 | 86 | "id": "myS3InputLoc", 87 | "type": "AWS::S3::ObjectKey" 88 | }, 89 | { 90 | "myComment": "This Parameter specifies the S3 output location for the pipeline.", 91 | 92 | "id": "myS3OutputLoc", 93 | "type": "AWS::S3::ObjectKey" 94 | }, 95 | { 96 | "myComment": "This Parameter specifies user for the ftp server", 97 | 98 | "id": "user", 99 | "type": "String" 100 | }, 101 | { 102 | "myComment": "This Parameter specifies the ftp server host", 103 | 104 | "id": "host", 105 | "type": "String" 106 | } 107 | ] 108 | } 109 | -------------------------------------------------------------------------------- /samples/ShellCommandWithS3StagingDirectory/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline ShellCommandWithS3StagingDirectory Sample 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a Shell Command Activity pipeline that uses a S3 directory for staging. Specifically, this sample runs a script that is located in a s3 bucket and takes an argument string. The script simply prints out the argument to stdout. 6 | 7 | ## Prerequisites 8 | 9 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 10 | 11 | You must also provide the S3Path of a S3 bucket with write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 12 | 13 | You must also upload *bashscript.sh* to your s3 bucket. 14 | 15 | ## Run this sample pipeline using the AWS CLI 16 | 17 | ```sh 18 | $> aws datapipeline create-pipeline --name shell_command_staging_pipeline --unique-id shell_command_staging_pipeline 19 | ``` 20 | 21 | You receive a pipelineId like this. 22 | ```sh 23 | # ----------------------------------------- 24 | # | CreatePipeline | 25 | # +-------------+--------------------------+ 26 | # | pipelineId | | 27 | # +-------------+--------------------------+ 28 | ``` 29 | 30 | ```sh 31 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://shellcommandwiths3stagingdir.json --parameter-values myS3StagingPath= myS3LogsPath= --pipeline-id 32 | ``` 33 | 34 | You receive a validation messages like this 35 | ```sh 36 | # ----------------------- 37 | # |PutPipelineDefinition| 38 | # +-----------+---------+ 39 | # | errored | False | 40 | # +-----------+---------+ 41 | ``` 42 | 43 | Now activate the pipeline 44 | ```sh 45 | $> aws datapipeline activate-pipeline --pipeline-id 46 | ``` 47 | 48 | Check the status of your pipeline 49 | ``` 50 | >$ aws datapipeline list-runs --pipeline-id 51 | ``` 52 | 53 | You will receive status information on the pipeline. 54 | ```sh 55 | # Name Scheduled Start Status 56 | # ID Started Ended 57 | #--------------------------------------------------------------------------------------------------- 58 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 59 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 60 | # 61 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 62 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 63 | # 64 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 65 | ``` 66 | 67 | 68 | ## Disclaimer 69 | 70 | The samples in this repository are meant to help users get started with Data Pipeline. 
They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 71 | 72 | Use at your own risk. 73 | 74 | Licensed under the MIT-0 License. 75 | -------------------------------------------------------------------------------- /samples/ShellCommandWithS3StagingDirectory/bashscript.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo $1 4 | -------------------------------------------------------------------------------- /samples/ShellCommandWithS3StagingDirectory/shellcommandwiths3stagingdir.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode": "CASCADE", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}", 11 | "scheduleType": "cron", 12 | "schedule": { 13 | "ref": "DefaultSchedule" 14 | } 15 | }, 16 | { 17 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it. For this example, we are using it to specify that our pipeline will run immediately upon activation. Also, we are using the 'occurrences' option specify that the pipeline should only be run once. You can have multiple schedules defined in a pipeline.", 18 | 19 | "type": "Schedule", 20 | "id": "DefaultSchedule", 21 | "occurrences": "1", 22 | "period": "1 Day", 23 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 24 | }, 25 | { 26 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 27 | 28 | "type": "Ec2Resource", 29 | "id": "New_EC2Instance", 30 | "terminateAfter": "1 Hour", 31 | "resourceRole": "DataPipelineDefaultResourceRole", 32 | "role": "DataPipelineDefaultRole" 33 | }, 34 | { 35 | "myComment": "Pipeline object that represents the S3 bucket node that is used as the staging directory in this pipeline.", 36 | 37 | "type": "S3DataNode", 38 | "id": "New_S3_Datanode", 39 | "directoryPath": "#{myS3StagingPath}" 40 | }, 41 | { 42 | "myComment": "This object is a ShellCommandActivity. It is used to specify the command linux shell command that will be invoked. In this case, it runs a simple bash script which echos the path of the input paramter. ${INPUT1_STAGING_DIR} is the identifier used to refer to the input staging directory.", 43 | 44 | "id": "Shell_Command_Staging_Activity", 45 | "type": "ShellCommandActivity", 46 | "stage": "true", 47 | "input": { 48 | "ref": "New_S3_Datanode" 49 | }, 50 | "runsOn": { 51 | "ref": "New_EC2Instance" 52 | }, 53 | "scriptArgument": [ 54 | "hello world" 55 | ], 56 | "command": "bash -x ${INPUT1_STAGING_DIR}/bashscript.sh $1" 57 | } 58 | ], 59 | "parameters": [ 60 | { 61 | "myComment": "This Parameter specifies the S3 logging path for the pipeline. It is used by the 'Default' object to set the 'pipelineLogUri' value. Using Parameters helps users avoid hard coding variables in pipeline definitions. 
Users can instead supply these parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline-definition'.", 62 | 63 | "id" : "myS3LogsPath", 64 | "type" : "AWS::S3::ObjectKey", 65 | "description" : "S3 path for pipeline logs." 66 | }, 67 | { 68 | "myComment": "This Parameter specifies the S3 path for the input staging directory. This path is represented in the pipeline definition as ${INPUT1_STAGING_DIR}.", 69 | 70 | "id" : "myS3StagingPath", 71 | "type" : "AWS::S3::ObjectKey", 72 | "description" : "S3 path for staging directory." 73 | } 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /samples/SimplePigActivity/pig_activity_sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "id": "Default", 5 | "failureAndRerunMode": "CASCADE", 6 | "schedule": { 7 | "ref": "DefaultSchedule" 8 | }, 9 | "resourceRole": "DataPipelineDefaultResourceRole", 10 | "role": "DataPipelineDefaultRole", 11 | "pipelineLogUri": "#{myLogUri}", 12 | "scheduleType": "cron" 13 | }, 14 | { 15 | "id": "PigActivity1", 16 | "input": { 17 | "ref": "InputDataNode1" 18 | }, 19 | "output": { 20 | "ref": "OutputDataNode1" 21 | }, 22 | "stage": "true", 23 | "scriptUri": "#{myPigScriptUri}", 24 | "name": "DefaultActivity1", 25 | "runsOn": { 26 | "ref": "EmrCluster1" 27 | }, 28 | "type": "PigActivity", 29 | "scriptVariable": [ 30 | "column1=First", 31 | "column2=Second", 32 | "three=3" 33 | ], 34 | "generatedScriptsPath": "#{myGeneratedScriptsPath}" 35 | }, 36 | { 37 | "id": "DefaultSchedule", 38 | "occurrences": "1", 39 | "period": "1 Day", 40 | "name": "RunOnce", 41 | "type": "Schedule", 42 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 43 | }, 44 | { 45 | "id": "InputDataNode1", 46 | "directoryPath": "#{myS3Input}", 47 | "dataFormat": { 48 | "ref": "InputDataFormat1" 49 | }, 50 | "type": "S3DataNode" 51 | }, 52 | { 53 | "id": "InputDataFormat1", 54 | "inputRegEx": "^(\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+) (\\\\S+)", 55 | "column": [ 56 | "First STRING", 57 | "Second STRING", 58 | "Third STRING", 59 | "Fourth STRING", 60 | "Fifth STRING", 61 | "Sixth STRING", 62 | "Seventh STRING", 63 | "Eighth STRING", 64 | "Nineth STRING", 65 | "Tenth STRING" 66 | ], 67 | "type": "RegEx" 68 | }, 69 | { 70 | "id": "OutputDataNode1", 71 | "directoryPath": "#{myS3Output}", 72 | "dataFormat": { 73 | "ref": "OutputDataFormat1" 74 | }, 75 | "type": "S3DataNode" 76 | }, 77 | { 78 | "id": "OutputDataFormat1", 79 | "column": [ 80 | "First STRING", 81 | "Second STRING", 82 | "Third STRING", 83 | "Third STRING", 84 | "Fourth STRING", 85 | "Fifth STRING", 86 | "Sixth STRING", 87 | "Seventh STRING", 88 | "Eighth STRING" 89 | ], 90 | "columnSeparator": "*", 91 | "type": "Custom" 92 | }, 93 | { 94 | "id": "EmrCluster1", 95 | "releaseLabel": "emr-4.2.0", 96 | "type": "EmrCluster", 97 | "terminateAfter": "1 Day" 98 | } 99 | ] 100 | } 101 | -------------------------------------------------------------------------------- /samples/SparkPiMaximizeResourceAllocation/SparkPi-maximizeResource.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "name": "DefaultProperty1", 5 | "id": "PropertyId_jVPFN", 6 | "type": "Property", 7 | "value": "true", 8 | "key": "maximizeResourceAllocation" 9 | }, 10 | { 11 | "name": "DefaultEmrActivity1", 12 | "step": 
"command-runner.jar,spark-submit,--class,org.apache.spark.examples.SparkPi,/usr/lib/spark/lib/spark-examples.jar,100", 13 | "runsOn": { 14 | "ref": "EmrClusterId_mD6dg" 15 | }, 16 | "id": "EmrActivityId_Bo5Zd", 17 | "type": "EmrActivity" 18 | }, 19 | { 20 | "configuration": { 21 | "ref": "EmrConfigurationId_uXera" 22 | }, 23 | "name": "DefaultEmrCluster1", 24 | "coreInstanceCount": "1", 25 | "coreInstanceType": "m3.xlarge", 26 | "releaseLabel": "emr-4.6.0", 27 | "masterInstanceType": "m3.xlarge", 28 | "id": "EmrClusterId_mD6dg", 29 | "type": "EmrCluster", 30 | "terminateAfter": "45 Minutes", 31 | "applications": "spark" 32 | }, 33 | { 34 | "name": "DefaultEmrConfiguration1", 35 | "property": { 36 | "ref": "PropertyId_jVPFN" 37 | }, 38 | "id": "EmrConfigurationId_uXera", 39 | "type": "EmrConfiguration", 40 | "classification": "spark" 41 | }, 42 | { 43 | "failureAndRerunMode": "CASCADE", 44 | "resourceRole": "DataPipelineDefaultResourceRole", 45 | "role": "DataPipelineDefaultRole", 46 | "pipelineLogUri": "#{myPipelineLogUri}", 47 | "scheduleType": "ONDEMAND", 48 | "name": "Default", 49 | "id": "Default" 50 | } 51 | ], 52 | "parameters": [ 53 | { 54 | "id" : "myPipelineLogUri", 55 | "type" : "AWS::S3::ObjectKey", 56 | "description" : "Please specify the logs location" 57 | }] 58 | } -------------------------------------------------------------------------------- /samples/SparkPiMaximizeResourceAllocation/readme.md: -------------------------------------------------------------------------------- 1 | #EMRActivity SparkPi example with maximizeResourceAllocation 2 | 3 | ##About the sample 4 | This Pipeline definition launches an EmrCluster (emr-4.x.x) with [maximizeResourceAllocation](http://docs.aws.amazon.com/ElasticMapReduce/latest/ReleaseGuide/emr-spark-configure.html#d0e17386) with simple [SparkPi](https://github.com/apache/spark/blob/master/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala) example in yarn-client mode. Also, it runs on [ONDEMAND](https://aws.amazon.com/about-aws/whats-new/2016/02/now-run-your-aws-data-pipeline-on-demand/) schedule. 5 | 6 | ##Running this sample 7 | The pipeline requires one input point from the customer: 8 | 1. The log folder for the pipeline. 9 | 10 | ##Result 11 | You can view the output (stdout) under 'Emr Step Logs' under EmrActivity. 12 | Pi is roughly 3.141716 -------------------------------------------------------------------------------- /samples/billing/readme.md: -------------------------------------------------------------------------------- 1 | ![Data Pipeline Logo](https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/setup/logo/datapipelinelogo.jpeg) 2 | 3 | Load Detailed AWS Billing logs into a Redshift table 4 | ===================== 5 | 6 | The Load AWS Detailed Billing Report Into Redshift template loads the AWS detailed billing report for the current month stored in an Amazon S3 folder to a Redshift table. If you would like to process files from previous months please pick a schedule that starts in the past, so the scheduled start time can be the timestamp of the CSVs for the period of interest. The input file must be of the .csv.zip format. Existing entries in the Redshift table are updated with data from Amazon S3 and new entries from Amazon S3 data are added to the Redshift table. If the table does not exist, it will be automatically created with the same schema as the AWS detailed billing report. 
The input report file is unzipped and converted to a GZIP file which is stored in the Amazon S3 staging folder before loading to Redshift. 7 | 8 | ## Billing logs format 9 | 10 | This sample specifically targets detailed billing reports for customers who have opted into consolidated billing and have other linked accounts. Their billing logs should have the following fields: 11 | 12 | ```invoice_id 13 | payer_account_id 14 | linked_account_id 15 | record_type 16 | product_name 17 | rate_id 18 | subscription_id 19 | pricing_plan_id 20 | usage_type 21 | operation 22 | availability_zone 23 | reserved_instance 24 | item_description 25 | usage_start_date 26 | usage_end_date 27 | usage_quantity 28 | blended_rate 29 | blended_cost 30 | unblended_rate 31 | unblended_cost 32 | ``` 33 | 34 | ## How it works 35 | 36 | The pipeline will download the billing logs .gzips from the S3 bucket matching the pipeline's scheduled start time into a newly created EC2 instance. A shell script will then uncompress these into a staging bucket in S3. The RedshiftCopyActivity then creates a table in Redshift with columns as listed above and then loads in the staged CSV files. A final cleanup script deletes the temporary staged files in S3. 37 | 38 | ## Different billing formats 39 | 40 | Logs for accounts without consolidated billing or linked accounts will replace 4 fields [blended_rate, blended_cost, unblended_rate, unblended_cost] with 2 fields [rate, cost]. To load these logs into Redshift you must modify the schema of the Redshift table to look similar to the following: 41 | 42 | ```invoice_id varchar(255), payer_account_id varchar(255), linked_account_id varchar(255), record_type varchar(255), product_name varchar(255), rate_id varchar(255), subscription_id varchar(255), pricing_plan_id varchar(255), usage_type varchar(255), operation varchar(255), availability_zone varchar(255), reserved_instance varchar(255), item_description varchar(255), usage_start_date varchar(255), usage_end_date varchar(255), usage_quantity FLOAT, rate FLOAT, cost FLOAT``` 43 | 44 | ## Parameters 45 | 46 | Specifying these parameters is sufficient to get this pipeline to work: 47 | 48 | ``` 49 | "parameters": [ 50 | { 51 | "id": "myS3BillingLogLoc", 52 | "type": "AWS::S3::ObjectKey", 53 | "description": "Input S3 folder for billing report", 54 | "helpText": "S3 folder that has the monthly AWS detailed billing report files with a .csv.zip format." 55 | }, 56 | { 57 | "id": "myS3StagingLoc", 58 | "type": "AWS::S3::ObjectKey", 59 | "description": "S3 staging folder", 60 | "helpText": "Folder to store the unzipped CSV file before loading to Redshift. The S3 folder must be in the same region as the Redshift cluster." 
61 | }, 62 | { 63 | "id": "myRedshiftJdbcConnectStr", 64 | "type": "String", 65 | "description": "Redshift JDBC connection string", 66 | "watermark": "jdbc:postgresql://endpoint:port/database?tcpKeepAlive=true" 67 | }, 68 | { 69 | "id": "myRedshiftUsername", 70 | "type": "String", 71 | "description": "Redshift username" 72 | }, 73 | { 74 | "id": "*myRedshiftPassword", 75 | "type": "String", 76 | "description": "Redshift password" 77 | }, 78 | { 79 | "id": "myRedshiftSecurityGrps", 80 | "type": "String", 81 | "isArray": "true", 82 | "description": "Redshift security group(s)", 83 | "default":"default", 84 | "helpText": "The names of one or more security groups that are assigned to the Redshift cluster.", 85 | "watermark": "security group name" 86 | }, 87 | { 88 | "id": "myRedshiftDbName", 89 | "type": "String", 90 | "description": "Redshift database name" 91 | }, 92 | { 93 | "id": "myRedshiftTableName", 94 | "type": "String", 95 | "description": "Redshift table name", 96 | "helpText": "The name of an existing table or a new table that will be created with the same schema as the AWS detailed billing report." 97 | } 98 | ] 99 | ``` 100 | -------------------------------------------------------------------------------- /samples/diagnose/README.md: -------------------------------------------------------------------------------- 1 | # Diagnosis Tool 2 | The diagnosis tool can be used to do a quick check to test whether your connectivity is fine. It checks for the following: 3 | - Connectivity to different regions 4 | - Connections to S3, DynamoDB, Redshift and RDS. 5 | 6 | ## Instructions 7 | It can be done in two different ways: 8 | 1. Using the terminal 9 | 10 | 2. Using the AWS Data Pipeline Console 11 | 12 | 13 | ###Using the terminal 14 | 1. Download the diagnostics jar file: https://s3.amazonaws.com/data-pipeline-samples/diagnose-sample/Diagnose.jar 15 | 16 | 2. Run the following command (The config option takes in the path and file name of your credentials.json file) 17 | `$> java -jar /Diagnose.jar --config /credentials.json` 18 | 19 | NOTE: If you are running it from an AWS CLI that has been configured with your credentials, you can run just the following command: 20 | `$> java -jar /Diagnose.jar` 21 | 22 | 23 | ###Using the AWS Data Pipeline Console 24 | 1. Download the pipeline definition json file:https://s3.amazonaws.com/data-pipeline-samples/diagnose-sample/diagnose_pipeline.json. 25 | 26 | 3. Use the AWS Data Pipeline console to create a new pipeline and import the definition from the downloaded json file. 27 | 28 | 4. Activate the pipeline and wait for it to finish. 29 | 30 | 5. Once it's finished, open the stdout logs and ensure that all the connectivity checks have been completed successfully. 
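
For reference, the terminal-based check from the first section can be run end to end as shown below. This is only an illustrative sketch: the local paths (/tmp/Diagnose.jar, /path/to/credentials.json) are placeholders, not locations required by the tool.

```sh
# Download the diagnostics jar (URL from the instructions above) and run it
# against a credentials file. Adjust the local paths to your environment.
$> wget https://s3.amazonaws.com/data-pipeline-samples/diagnose-sample/Diagnose.jar -O /tmp/Diagnose.jar
$> java -jar /tmp/Diagnose.jar --config /path/to/credentials.json

# Or, if the AWS CLI on this machine is already configured with credentials:
$> java -jar /tmp/Diagnose.jar
```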
31 | 32 | 33 | -------------------------------------------------------------------------------- /samples/diagnose/diagnose_pipeline.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "occurrences": "1", 5 | "period": "1 Day", 6 | "name": "RunOnce", 7 | "id": "DefaultSchedule", 8 | "type": "Schedule", 9 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 10 | }, 11 | { 12 | "failureAndRerunMode": "CASCADE", 13 | "schedule": { 14 | "ref": "DefaultSchedule" 15 | }, 16 | "resourceRole": "DataPipelineDefaultResourceRole", 17 | "role": "DataPipelineDefaultRole", 18 | "pipelineLogUri": "s3://edptestdiagnose/", 19 | "scheduleType": "cron", 20 | "name": "Default", 21 | "id": "Default" 22 | }, 23 | { 24 | "schedule": { 25 | "ref": "DefaultSchedule" 26 | }, 27 | "resourceRole": "DataPipelineDefaultResourceRole", 28 | "role": "DataPipelineDefaultRole", 29 | "name": "DefaultResource1", 30 | "id": "ResourceId_1", 31 | "type": "Ec2Resource" 32 | }, 33 | { 34 | "schedule": { 35 | "ref": "DefaultSchedule" 36 | }, 37 | "name": "DefaultActivity1", 38 | "id": "ActivityId_1", 39 | "runsOn": { 40 | "ref": "ResourceId_1" 41 | }, 42 | "type": "ShellCommandActivity", 43 | "command": "wget https://s3.amazonaws.com/data-pipeline-samples/diagnose-sample/Diagnose.jar;java -jar Diagnose.jar" 44 | } 45 | ], 46 | "parameters": [] 47 | } -------------------------------------------------------------------------------- /samples/dynamo-db-export-as-csv/ddb-to-csv.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "schedule": { 5 | "ref": "ScheduleId_scTIc" 6 | }, 7 | "name": "MyDynamoData", 8 | "id": "DynamoDBDataNodeId_PK5Iq", 9 | "type": "DynamoDBDataNode", 10 | "tableName": "ddbtable" 11 | }, 12 | { 13 | "output": { 14 | "ref": "DataNodeId_KQofW" 15 | }, 16 | "input": { 17 | "ref": "S3DataNodeId_3cbrR" 18 | }, 19 | "schedule": { 20 | "ref": "ScheduleId_scTIc" 21 | }, 22 | "dependsOn": { 23 | "ref": "EmrActivityId_bxl6C" 24 | }, 25 | "stage": "false", 26 | "name": "DDBExporttoCSV", 27 | "hiveScript": "drop table if exists raw_data;\n\nCREATE EXTERNAL TABLE raw_data(accountId string, name string, description string)\nROW FORMAT SERDE 'org.apache.hadoop.hive.dynamodb.DynamoDBExportSerDe'\nLOCATION \"#{input.directoryPath}/#{format(@scheduledStartTime,'YYYY-MM-dd_hh.mm')}\"\nTBLPROPERTIES (\"dynamodb.column.mapping\"=\"accountId:accountId,name:name,description:description\");\n\ndrop table if exists csv_data;\ncreate table csv_data (accountId string, name string, description string)\nrow format delimited\nfields terminated by ',' lines terminated by '\\n'\nlocation '#{output.directoryPath}/';\n\ninsert overwrite table csv_data select * from raw_data;", 28 | "runsOn": { 29 | "ref": "EmrClusterId_auxJq" 30 | }, 31 | "id": "ActivityId_IUO66", 32 | "type": "HiveActivity" 33 | }, 34 | { 35 | "occurrences": "1", 36 | "period": "1 Day", 37 | "name": "ExportSchedule", 38 | "id": "ScheduleId_scTIc", 39 | "type": "Schedule", 40 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 41 | }, 42 | { 43 | "schedule": { 44 | "ref": "ScheduleId_scTIc" 45 | }, 46 | "directoryPath": "s3://bucket/data_as_csv", 47 | "name": "CSVData", 48 | "id": "DataNodeId_KQofW", 49 | "type": "S3DataNode" 50 | }, 51 | { 52 | "resizeClusterMaxInstances": "50", 53 | "maximumRetries": "0", 54 | "runsOn": { 55 | "ref": "EmrClusterId_auxJq" 56 | }, 57 | "type": "EmrActivity", 58 | "output": { 59 | "ref": "S3DataNodeId_3cbrR" 60 | }, 61 | "schedule": { 
62 | "ref": "ScheduleId_scTIc" 63 | }, 64 | "input": { 65 | "ref": "DynamoDBDataNodeId_PK5Iq" 66 | }, 67 | "attemptTimeout": "24 Hours", 68 | "myDynamoDBReadThroughputRatio": "1.0", 69 | "name": "MyExportJob", 70 | "step": "s3://elasticmapreduce/libs/script-runner/script-runner.jar,s3://elasticmapreduce/libs/hive/hive-script,--run-hive-script,--hive-versions,latest,--args,-f,s3://elasticmapreduce/libs/hive/dynamodb/exportDynamoDBTableToS3,-d,DYNAMODB_INPUT_TABLE=#{input.tableName},-d,S3_OUTPUT_BUCKET=#{output.directoryPath}/#{format(@scheduledStartTime,'YYYY-MM-dd_hh.mm')},-d,DYNAMODB_READ_PERCENT=#{myDynamoDBReadThroughputRatio},-d,DYNAMODB_ENDPOINT=dynamodb.us-east-1.amazonaws.com", 71 | "id": "EmrActivityId_bxl6C", 72 | "resizeClusterBeforeRunning": "true" 73 | }, 74 | { 75 | "emrLogUri": "s3://bucket/data_pipeline_logs/export/us-east-1/ddbtable/#{format(@scheduledStartTime,'YYYY-MM-dd_hh.mm')}", 76 | "schedule": { 77 | "ref": "ScheduleId_scTIc" 78 | }, 79 | "installHive": "latest", 80 | "enableDebugging": "true", 81 | "name": "ExportCluster", 82 | "coreInstanceType": "m1.medium", 83 | "coreInstanceCount": "1", 84 | "id": "EmrClusterId_auxJq", 85 | "masterInstanceType": "m1.medium", 86 | "amiVersion": "3.3.2", 87 | "type": "EmrCluster" 88 | }, 89 | { 90 | "failureAndRerunMode": "CASCADE", 91 | "resourceRole": "DataPipelineDefaultResourceRole", 92 | "role": "DataPipelineDefaultRole", 93 | "scheduleType": "CRON", 94 | "name": "Default", 95 | "id": "Default" 96 | }, 97 | { 98 | "schedule": { 99 | "ref": "ScheduleId_scTIc" 100 | }, 101 | "directoryPath": "s3://bucket/backup", 102 | "name": "MyS3Data", 103 | "id": "S3DataNodeId_3cbrR", 104 | "type": "S3DataNode" 105 | } 106 | ], 107 | "parameters": [] 108 | } 109 | -------------------------------------------------------------------------------- /samples/dynamo-db-export-as-csv/readme.md: -------------------------------------------------------------------------------- 1 | This pipeline demonstrates how to export data in dynamoDB as csv data in S3. 2 | 3 | Steps to run the pipeline using the cli. 
4 | 5 | 1) aws datapipeline create-pipeline --name ddb-backup --unique-id some-unique-id 6 | => Returns a pipeline-id 7 | 8 | 2) aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file:///home/user/ddb-to-csv.json 9 | 10 | 3) aws datapipeline activate-pipeline --pipeline-id 11 | -------------------------------------------------------------------------------- /samples/dynamo-db-export/DynamoDB-export.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "occurrences": "1", 5 | "period": "1 Day", 6 | "name": "RunOnce", 7 | "id": "DefaultSchedule", 8 | "type": "Schedule", 9 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 10 | }, 11 | { 12 | "failureAndRerunMode": "CASCADE", 13 | "schedule": { 14 | "ref": "DefaultSchedule" 15 | }, 16 | "resourceRole": "DataPipelineDefaultResourceRole", 17 | "role": "DataPipelineDefaultRole", 18 | "scheduleType": "cron", 19 | "pipelineLogUri" : "#{myOutputS3Loc}/logs", 20 | "name": "Default", 21 | "id": "Default", 22 | "maxActiveInstances" : "1" 23 | }, 24 | { 25 | "maximumRetries": "2", 26 | "name": "TableBackupActivity", 27 | "step": "s3://dynamodb-emr-us-east-1/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')},#{myDDBTableName},#{myDDBReadThroughputRatio}", 28 | "id": "TableBackupActivity", 29 | "runsOn": { 30 | "ref": "EmrClusterForBackup" 31 | }, 32 | "type": "EmrActivity" 33 | }, 34 | { 35 | "bootstrapAction": "s3://elasticmapreduce/bootstrap-actions/configure-hadoop, --yarn-key-value, yarn.nodemanager.resource.memory-mb=12800,--yarn-key-value,yarn.scheduler.minimum-allocation-mb=256,--mapred-key-value,mapreduce.map.memory.mb=500,--mapred-key-value,mapreduce.map.java.opts=-Xmx400M,--mapred-key-value,mapreduce.job.reduce.slowstart.completedmaps=1,--mapred-key-value,mapreduce.map.speculative=false", 36 | "name": "EmrClusterForBackup", 37 | "amiVersion": "3.8.0", 38 | "id": "EmrClusterForBackup", 39 | "type": "EmrCluster", 40 | "masterInstanceType": "m1.medium", 41 | "coreInstanceType": "#{myInstanceType}", 42 | "coreInstanceCount": "#{myInstanceCount}", 43 | "region" : "#{myRegion}", 44 | "terminateAfter" : "12 hours", 45 | "keyPair" : "ramsug-test-desktop" 46 | } 47 | ], 48 | "parameters": [ 49 | { 50 | "description": "OutputS3folder", 51 | "id": "myOutputS3Loc", 52 | "type": "AWS::S3::ObjectKey" 53 | }, 54 | { 55 | "default": "0.2", 56 | "watermark": "Enter value between 0.1 - 1.0", 57 | "description": "DynamoDB Read Throughput Ratio", 58 | "id": "myDDBReadThroughputRatio", 59 | "type": "Double" 60 | }, 61 | { 62 | "description": "DynamoDB Table Name", 63 | "id": "myDDBTableName", 64 | "type": "String" 65 | }, 66 | { 67 | "description": "Instance Type", 68 | "id": "myInstanceType", 69 | "watermark" : "m1.medium if IOPS <= 900. 
Else use m3.xlarge", 70 | "type": "String" 71 | }, 72 | { 73 | "description": "Instance Count", 74 | "watermark" : " (IOPS / 300) for m1.medium.(IOPS / 1500) for m3.xlarge", 75 | "id": "myInstanceCount", 76 | "type": "Integer" 77 | }, 78 | { 79 | "description" : "Region", 80 | "watermark" : "Region of DynamoDB Table/EMR cluster", 81 | "id" : "myRegion", 82 | "type" : "String" 83 | } 84 | ] 85 | } 86 | -------------------------------------------------------------------------------- /samples/dynamo-db-export/example-parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "values": 3 | { 4 | "myOutputS3Loc" : "s3://bucket/directory/", 5 | "myDDBReadThroughputRatio" : "0.5", 6 | "myDDBTableName" : "dynamo-table-name", 7 | "myInstanceType" : "m1.medium", 8 | "myInstanceCount" : "1", 9 | "myRegion" : "eu-west-1" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /samples/dynamo-db-export/readme.md: -------------------------------------------------------------------------------- 1 | This pipeline exports data from a Dynamo DB Table to a S3 location using an EMR Cluster. 2 | 3 | Steps to run the pipeline using the cli. 4 | 5 | 1) aws datapipeline create-pipeline --name ddb-backup --unique-id some-unique-id 6 | => Returns a pipeline-id 7 | 8 | 2) aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file:///home/user/DynamoDB-export.json --parameter-values-uri file:///home/user/example-parameters.json 9 | 10 | 3) aws datapipeline activate-pipeline --pipeline-id 11 | -------------------------------------------------------------------------------- /samples/dynamo-db-to-redshift/dynamo-db-to-redshift.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "id": "DefaultSchedule1", 5 | "startAt": "FIRST_ACTIVATION_DATE_TIME", 6 | "name": "DefaultSchedule1", 7 | "type": "Schedule", 8 | "occurrences" : "1", 9 | "period": "1 day" 10 | }, 11 | { 12 | "id": "DefaultRedshiftDatabase1", 13 | "region": "eu-west-1", 14 | "databaseName": "database_name", 15 | "username": "%Redshift DB Username%", 16 | "name": "DefaultRedshiftDatabase1", 17 | "*password": "%Redshift DB Password%", 18 | "type": "RedshiftDatabase", 19 | "clusterId": "%Redshift DB Cluster ID%" 20 | }, 21 | { 22 | "id": "DefaultDynamoDBDataNode1", 23 | "region": "us-east-1", 24 | "schedule": { 25 | "ref": "DefaultSchedule1" 26 | }, 27 | "tableName": "%Dynamo DB Table Name%", 28 | "name": "DefaultDynamoDBDataNode1", 29 | "type": "DynamoDBDataNode" 30 | }, 31 | { 32 | "id": "DefaultRedshiftCopyActivity1", 33 | "input": { 34 | "ref": "DefaultDynamoDBDataNode1" 35 | }, 36 | "schedule": { 37 | "ref": "DefaultSchedule1" 38 | }, 39 | "insertMode": "KEEP_EXISTING", 40 | "name": "DefaultRedshiftCopyActivity1", 41 | "runsOn": { 42 | "ref": "DefaultEc2Resource1" 43 | }, 44 | "output": { 45 | "ref": "DefaultRedshiftDataNode1" 46 | }, 47 | "type": "RedshiftCopyActivity" 48 | }, 49 | { 50 | "id": "DefaultRedshiftDataNode1", 51 | "schedule": { 52 | "ref": "DefaultSchedule1" 53 | }, 54 | "tableName": "%Redshift DB Table Name%", 55 | "name": "DefaultRedshiftDataNode1", 56 | "type": "RedshiftDataNode", 57 | "database": { 58 | "ref": "DefaultRedshiftDatabase1" 59 | } 60 | }, 61 | { 62 | "id": "Default", 63 | "scheduleType": "CRON", 64 | "failureAndRerunMode": "CASCADE", 65 | "name": "Default", 66 | "role": "DataPipelineDefaultRole", 67 | "resourceRole": 
"DataPipelineDefaultResourceRole" 68 | }, 69 | { 70 | "id" : "DefaultEc2Resource1", 71 | "name": "DefaultEc2Resource1", 72 | "type": "Ec2Resource", 73 | "terminateAfter": "45 minutes", 74 | "actionOnTaskFailure": "terminate", 75 | "actionOnResourceFailure": "retrynone", 76 | "maximumRetries": "0", 77 | "schedule": { "ref": "DefaultSchedule1" }, 78 | "logUri": "s3://logbucket/" 79 | } 80 | ] 81 | } 82 | -------------------------------------------------------------------------------- /samples/dynamo-db-to-redshift/readme.md: -------------------------------------------------------------------------------- 1 | This pipeline demonstrates how to copy data from DynamoDB to Redshift using datapipeline's RedshiftCopyActivity. 2 | Steps to run the pipeline using the cli. 3 | 4 | 1) aws datapipeline create-pipeline --name ddb-backup --unique-id some-unique-id 5 | => Returns a pipeline-id 6 | 7 | 2) aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file:///home/user/dynamo-db-to-redshift.json 8 | 9 | 3) aws datapipeline activate-pipeline --pipeline-id 10 | -------------------------------------------------------------------------------- /samples/dynamodb-to-dynamodb-crossregion/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline DynamoDB to DynamoDB Copy Sample [both src and target tables are in different region] 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a DynamoDB Copy Activity pipeline that uses a S3 directory for temporary backup. Specifically, this sample shows copying data across two dynamodb tables in the different regions [any aws region except eu-central-1]. Temporary S3 folder will be cleared after the copy activity completes. 6 | 7 | ## Prerequisites 8 | 9 | You must have the AWS CLI and default IAM roles setup in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) for the base repository for instructions how to do this. 10 | 11 | You must also provide the S3Path of a S3 bucket with write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 12 | 13 | ## Run this sample pipeline using the AWS CLI 14 | 15 | ```sh 16 | $> aws datapipeline create-pipeline --name dynamodb_copy_pipeline --unique-id dynamodb_copy_pipeline 17 | ``` 18 | 19 | You receive a pipelineId like this. 20 | ```sh 21 | # ----------------------------------------- 22 | # | CreatePipeline | 23 | # +-------------+--------------------------+ 24 | # | pipelineId | | 25 | # +-------------+--------------------------+ 26 | ``` 27 | 28 | ```sh 29 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json \ 30 | --parameter-values myTempS3Folder= myDDBSourceTableName= \ 31 | myDDBDestinationTableName= myDDBSourceRegion= \ 32 | myDDBDestinationRegion= myS3LogsPath= --pipeline-id 33 | ``` 34 | 35 | You receive a validation messages like this 36 | ```sh 37 | # ----------------------- 38 | # |PutPipelineDefinition| 39 | # +-----------+---------+ 40 | # | errored | False | 41 | # +-----------+---------+ 42 | ``` 43 | 44 | Now activate the pipeline 45 | ```sh 46 | $> aws datapipeline activate-pipeline --pipeline-id 47 | ``` 48 | 49 | Check the status of your pipeline 50 | ``` 51 | >$ aws datapipeline list-runs --pipeline-id 52 | ``` 53 | 54 | You will receive status information on the pipeline. 
55 | ```sh 56 | # Name Scheduled Start Status 57 | # ID Started Ended 58 | #--------------------------------------------------------------------------------------------------- 59 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 60 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 61 | # 62 | # 2. ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 63 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 64 | # 65 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 66 | ``` 67 | 68 | 69 | ## Disclaimer 70 | 71 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 72 | 73 | Use at your own risk. 74 | 75 | Licensed under the MIT-0 License. 76 | -------------------------------------------------------------------------------- /samples/dynamodb-to-dynamodb/README.md: -------------------------------------------------------------------------------- 1 | # Data Pipeline DynamoDB to DynamoDB Copy Sample [source and target tables are in the same region] 2 | 3 | ## Overview 4 | 5 | This sample shows how to build a DynamoDB Copy Activity pipeline that uses an S3 directory for temporary backup. Specifically, this sample shows copying data between two DynamoDB tables in the same region. 6 | 7 | ## Prerequisites 8 | 9 | You must have the AWS CLI and default IAM roles set up in order to run the sample. Please see the [readme](https://github.com/awslabs/data-pipeline-samples) of the base repository for instructions on how to do this. 10 | 11 | You must also provide the S3 path of an S3 bucket to which you have write permissions. See [here](http://docs.aws.amazon.com/AmazonS3/latest/UG/CreatingaBucket.html) for instructions on how to create an S3 bucket. 12 | 13 | ## Run this sample pipeline using the AWS CLI 14 | 15 | ```sh 16 | $> aws datapipeline create-pipeline --name dynamodb_copy_pipeline --unique-id dynamodb_copy_pipeline 17 | ``` 18 | 19 | You will receive a pipelineId like this. 20 | ```sh 21 | # ----------------------------------------- 22 | # | CreatePipeline | 23 | # +-------------+--------------------------+ 24 | # | pipelineId | | 25 | # +-------------+--------------------------+ 26 | ``` 27 | 28 | ```sh 29 | $> aws datapipeline put-pipeline-definition --pipeline-definition file://pipeline.json \ 30 | --parameter-values myDDBRegion=<region> myTempS3Folder=<s3://your-bucket/temp/> \ 31 | myDDBSourceTableName=<source-table> myDDBDestinationTableName=<destination-table> myS3LogsPath=<s3://your-bucket/logs/> \ 32 | --pipeline-id <pipeline-id> 33 | ``` 34 | 35 | You will receive a validation message like this 36 | ```sh 37 | # ----------------------- 38 | # |PutPipelineDefinition| 39 | # +-----------+---------+ 40 | # | errored | False | 41 | # +-----------+---------+ 42 | ``` 43 | 44 | Now activate the pipeline 45 | ```sh 46 | $> aws datapipeline activate-pipeline --pipeline-id <pipeline-id> 47 | ``` 48 | 49 | Check the status of your pipeline 50 | ```sh 51 | $> aws datapipeline list-runs --pipeline-id <pipeline-id> 52 | ``` 53 | 54 | You will receive status information on the pipeline. 55 | ```sh 56 | # Name Scheduled Start Status 57 | # ID Started Ended 58 | #--------------------------------------------------------------------------------------------------- 59 | # 1. ActivityId_6OGtu 2015-07-29T01:06:17 WAITING_ON_DEPENDENCIES 60 | # @ActivityId_6OGtu_2015-07-29T01:06:17 2015-07-29T01:06:20 61 | # 62 | # 2.
ResourceId_z9RNH 2015-07-29T01:06:17 CREATING 63 | # @ResourceId_z9RNH_2015-07-29T01:06:17 2015-07-29T01:06:20 64 | # 65 | # @ActivityId_wQhxe_2015-07-29T01:06:17 2015-07-29T01:06:20 66 | ``` 67 | 68 | 69 | ## Disclaimer 70 | 71 | The samples in this repository are meant to help users get started with Data Pipeline. They may not be sufficient for production environments. Users should carefully inspect code samples before running them. 72 | 73 | Use at your own risk. 74 | 75 | Licensed under the MIT-0 License. 76 | -------------------------------------------------------------------------------- /samples/hadoop-activity/README.md: -------------------------------------------------------------------------------- 1 | # Hadoop Activity word count example with Fair Scheduler queues 2 | 3 | ## About the sample 4 | When imported, this pipeline definition runs a word count splitter program (s3://elasticmapreduce/samples/wordcount/wordSplitter.py) on the public data set s3://elasticmapreduce/samples/wordcount/input/. The definition contains two Hadoop Activities, each of which runs the splitter program and writes its output to a different S3 folder of the form <s3Prefix>/scheduledStartTime/queue_(1|2). Each activity runs its Hadoop job in its own queue of the Hadoop Fair Scheduler, which is configured with two queues. 5 | 6 | ## Running this sample 7 | The pipeline requires three inputs from the customer (a boto3-based sketch of the deployment steps follows this list): 8 | 9 | 1. The S3 prefix folder where the output of the word splitter will be stored. 10 | 2. The queue configuration (allocations) file for the Fair Scheduler. A sample allocations file can be found at s3://data-pipeline-samples/hadoop-activity/allocations.xml. More details: http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/FairScheduler.html 11 | 3. The log folder for the pipeline.
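The other samples in this repository drive these steps with the AWS CLI. As a rough sketch only, the same flow can be expressed in Python with boto3: the helper below imitates the object-to-field translation that `aws datapipeline put-pipeline-definition` performs on a definition file, and the two `s3://my-bucket/...` values are placeholder locations you would replace with your own. None of this code is part of the sample itself.

```python
import json

import boto3


def to_api_objects(objects):
    """Translate definition-file objects into the pipelineObjects format of the Data Pipeline API."""
    api_objects = []
    for obj in objects:
        fields = []
        for key, value in obj.items():
            if key in ("id", "name"):
                continue
            for v in value if isinstance(value, list) else [value]:
                if isinstance(v, dict) and "ref" in v:
                    fields.append({"key": key, "refValue": v["ref"]})
                else:
                    fields.append({"key": key, "stringValue": str(v)})
        api_objects.append({"id": obj["id"], "name": obj.get("name", obj["id"]), "fields": fields})
    return api_objects


def to_api_parameters(parameters):
    """Translate definition-file parameters into the parameterObjects format of the API."""
    return [{"id": p["id"],
             "attributes": [{"key": k, "stringValue": str(v)} for k, v in p.items() if k != "id"]}
            for p in parameters]


client = boto3.client("datapipeline")
pipeline_id = client.create_pipeline(name="hadoop-activity-fair-scheduler",
                                     uniqueId="hadoop-activity-fair-scheduler")["pipelineId"]

with open("hadoop-activity-world-count-fair.json") as f:
    definition = json.load(f)

client.put_pipeline_definition(
    pipelineId=pipeline_id,
    pipelineObjects=to_api_objects(definition["objects"]),
    parameterObjects=to_api_parameters(definition.get("parameters", [])),
    parameterValues=[
        # Placeholder buckets -- replace with your own output and log locations.
        {"id": "myOutputFolder", "stringValue": "s3://my-bucket/wordcount-output"},
        {"id": "myFairConfig", "stringValue": "s3://data-pipeline-samples/hadoop-activity/allocations.xml"},
        {"id": "myPipelineLogUri", "stringValue": "s3://my-bucket/logs"},
    ])

client.activate_pipeline(pipelineId=pipeline_id)
```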
12 | -------------------------------------------------------------------------------- /samples/hadoop-activity/hadoop-activity-world-count-fair.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "argument": [ 5 | "-files", 6 | "s3://elasticmapreduce/samples/wordcount/wordSplitter.py", 7 | "-mapper", 8 | "wordSplitter.py", 9 | "-reducer", 10 | "aggregate", 11 | "-input", 12 | "s3://elasticmapreduce/samples/wordcount/input/", 13 | "-output", 14 | "#{myOutputFolder}/#{@scheduledStartTime}/queue_1/" 15 | ], 16 | "maximumRetries": "0", 17 | "name": "HadoopActivity_1", 18 | "id": "HadoopActivity_1", 19 | "runsOn": { 20 | "ref": "EmrClusterId_pmtY0" 21 | }, 22 | "jarUri": "/home/hadoop/contrib/streaming/hadoop-streaming.jar", 23 | "type": "HadoopActivity", 24 | "hadoopQueue" : "queue_1" 25 | }, 26 | { 27 | "argument": [ 28 | "-files", 29 | "s3://elasticmapreduce/samples/wordcount/wordSplitter.py", 30 | "-mapper", 31 | "wordSplitter.py", 32 | "-reducer", 33 | "aggregate", 34 | "-input", 35 | "s3://elasticmapreduce/samples/wordcount/input/", 36 | "-output", 37 | "#{myOutputFolder}/#{@scheduledStartTime}/queue_2/" 38 | ], 39 | "maximumRetries": "0", 40 | "name": "HadoopActivity_2", 41 | "id": "HadoopActivity_2", 42 | "runsOn": { 43 | "ref": "EmrClusterId_pmtY0" 44 | }, 45 | "jarUri": "/home/hadoop/contrib/streaming/hadoop-streaming.jar", 46 | "type": "HadoopActivity", 47 | "hadoopQueue" : "queue_2" 48 | }, 49 | { 50 | "bootstrapAction": "s3://datapipeline-us-east-1/us-east-1/bootstrap-actions/latest/TaskRunner/configure-hadoop,--fair-config-copy-file,#{myFairConfig}", 51 | "hadoopSchedulerType": "PARALLEL_FAIR_SCHEDULING", 52 | "name": "DefaultEmrCluster1", 53 | "coreInstanceCount": "2", 54 | "coreInstanceType": "m3.xlarge", 55 | "amiVersion": "3.8.0", 56 | "masterInstanceType": "m3.xlarge", 57 | "id": "EmrClusterId_pmtY0", 58 | "type": "EmrCluster" 59 | }, 60 | { 61 | "occurrences": "1", 62 | "period": "1 Day", 63 | "name": "RunOnce", 64 | "id": "DefaultSchedule", 65 | "type": "Schedule", 66 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 67 | }, 68 | { 69 | "failureAndRerunMode": "CASCADE", 70 | "schedule": { 71 | "ref": "DefaultSchedule" 72 | }, 73 | "resourceRole": "DataPipelineDefaultResourceRole", 74 | "role": "DataPipelineDefaultRole", 75 | "pipelineLogUri": "#{myPipelineLogUri}", 76 | "scheduleType": "cron", 77 | "name": "Default", 78 | "id": "Default" 79 | } 80 | ], 81 | "parameters": [ 82 | { 83 | "id" : "myFairConfig", 84 | "type" : "AWS::S3::ObjectKey", 85 | "description" : "Please choose the fair scheduler configuration" 86 | }, 87 | { 88 | "id" : "myPipelineLogUri", 89 | "type" : "AWS::S3::ObjectKey", 90 | "description" : "Please specify the logs location" 91 | }, 92 | { 93 | "id" : "myOutputFolder", 94 | "type" : "AWS::S3::ObjectKey", 95 | "description" : "Please specify the word count output location" 96 | } 97 | ] 98 | } 99 | -------------------------------------------------------------------------------- /samples/helloworld/README.md: -------------------------------------------------------------------------------- 1 | # Hello World 2 | 3 | This sample defines a [shell command activity](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-shellcommandactivity.html) to echo the text "hello world". The output, along with 4 | the acitivity log, is saved to an [S3](https://aws.amazon.com/s3/) bucket. 5 | 6 | ## Parameters 7 | 8 |
| Parameter | Description |
| --------- | ----------- |
| myS3LogsPath | (Required) An S3 key where the shell output and activity log will be stored. Example: "s3://data-pipeline-samples-12345" |
17 | 18 | ## Setup (Optional) 19 | 20 | You can use the setup script in the HelloWorld sample directory to create an S3 bucket to use in 21 | this example. You can skip this step if you have another S3 bucket that you want to use. The script 22 | will take a minute to complete, and when it's finished it will print the resource identifier of the 23 | S3 bucket that it created. 24 | 25 | ```sh 26 | $> python setup.py 27 | # Creating resources for stack [dpl-samples-hello-world]... 28 | # AWS::S3::Bucket: dpl-samples-hello-world-s3bucket-2bbt69s1j29c 29 | ``` 30 | 31 | ## Running this sample 32 | 33 | Create a new pipeline. Throughout this section we assume that the HelloWorld sample directory is 34 | your current working directory. 35 | 36 | ```sh 37 | $> aws datapipeline create-pipeline --name hello_world_pipeline --unique-id hello_world_pipeline 38 | # { 39 | # "pipelineId": "df-074257336JDKJ6QWQCT4" 40 | # } 41 | ``` 42 | 43 | Upload the pipeline definition. Use the `pipelineId` that was returned by the `create-pipeline` 44 | command. Specify the name of an S3 bucket where the output and activity log will be stored. This 45 | will either be the bucket name that was printed by the setup script or another bucket that you've 46 | created. 47 | 48 | 49 | ```sh 50 | $> aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file://helloworld.json --parameter-values myS3LogsPath="s3://" 51 | # { 52 | # "validationErrors": [], 53 | # "validationWarnings": [], 54 | # "errored": false 55 | # } 56 | ``` 57 | 58 | Activate the pipeline. Use the `pipelineId` that was returned by the `create-pipeline` command. 59 | 60 | ```sh 61 | $> aws datapipeline activate-pipeline --pipeline-id 62 | ``` 63 | 64 | Optionally, check the status of your running pipeline. Use the `pipelineId` that was returned by the 65 | `create-pipeline` command. When the pipeline has completed, the Status Ended column in the output 66 | from this command will show FINISHED for all pipeine nodes. Note that it may take a minute after the 67 | `activate-pipeline` command has completed before the `list-runs` command shows any output. 68 | 69 | ```sh 70 | 71 | >$ aws datapipeline list-runs --pipeline-id 72 | # Name Scheduled Start Status 73 | # ID Started Ended 74 | # --------------------------------------------------------------------------------------------------- 75 | # 1. EC2Resource_HelloWorld 2015-10-14T16:51:56 RUNNING 76 | # @EC2Resource_HelloWorld_2015-10-14T16:51:56 2015-10-14T16:51:59 77 | # 78 | # 2. ShellCommandActivity_HelloWorld 2015-10-14T16:51:56 WAITING_FOR_RUNNER 79 | # @ShellCommandActivity_HelloWorld_2015-10-14T16:51: 2015-10-14T16:51:59 80 | 81 | ``` 82 | 83 | After the pipeline is completed, the output and activity log from the pipeline will be saved to the S3 bucket that you 84 | specified under the following prefix. To view or download these files, navigate to this prefix in 85 | the S3 section of the [AWS Management Console](https://aws.amazon.com/console/). 86 | 87 | s3:///HelloWorld//// 88 | 89 | ## Next steps 90 | 91 | Once the pipeline is completed, you can delete it with the following command. If you try to run the 92 | sample again without deleting, you may receive errors or unexpected behavior. 93 | 94 | ```sh 95 | $> aws datapipeline delete-pipeline --pipeline-id 96 | ``` 97 | 98 | The resources used by this example will incur normal charges. 
If you provisioned resources using the 99 | setup script, you can free them by running the following command in the HelloWorld sample directory. 100 | 101 | ```sh 102 | $> python setup.py --teardown 103 | # Request to delete stack [dpl-samples-hello-world] has been sent 104 | ``` 105 | 106 | 107 | ## Disclaimer 108 | 109 | The samples in this repository are meant to help users get started with Data Pipeline. They may not 110 | be sufficient for production environments. Users should carefully inspect samples before running 111 | them. 112 | 113 | *Use at your own risk.* 114 | 115 | Licensed under the MIT-0 License. 116 | -------------------------------------------------------------------------------- /samples/helloworld/helloworld.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "myComment": "This object is used to set default configuration for objects in the pipeline", 5 | 6 | "id": "Default", 7 | "failureAndRerunMode":"cascade", 8 | "resourceRole": "DataPipelineDefaultResourceRole", 9 | "role": "DataPipelineDefaultRole", 10 | "pipelineLogUri": "#{myS3LogsPath}/HelloWorld", 11 | "scheduleType": "cron", 12 | "schedule": { 13 | "ref": "DefaultSchedule" 14 | } 15 | }, 16 | { 17 | "myComment": "This object is used to specify the time-based trigger for executing Activities and for provisioning Resources of the pipeline. In this case it is used by the 'Default' object so it will cascade down to all other objects in the pipeline if they do not override it. For this example, we are using it to specify that our pipeline will run immediately upon activation. Also, we are using the 'occurrences' option specify that the pipeline should only be run once. You can have multiple schedules defined in a pipeline.", 18 | 19 | "type": "Schedule", 20 | "id": "DefaultSchedule", 21 | "occurrences": "1", 22 | "period": "1 Day", 23 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 24 | }, 25 | 26 | 27 | { 28 | "myComment": "This object is used to create an Amazon EC2 Instance that activities in the pipeline can run on.", 29 | 30 | "id": "EC2Resource_HelloWorld", 31 | "type": "Ec2Resource", 32 | "terminateAfter": "1 Hour" 33 | }, 34 | { 35 | "myComment": "This object is a ShellCommandActivity. It is used to specify the command linux shell command that will be invoked. In this case it is simply running the 'echo' command, but it can be used to run any command that is accessible on the in the shell of the Resource that runs on.", 36 | 37 | "id": "ShellCommandActivity_HelloWorld", 38 | "runsOn": { 39 | "ref": "EC2Resource_HelloWorld" 40 | }, 41 | "type": "ShellCommandActivity", 42 | "command": "echo 'Hello World!'" 43 | }, 44 | { 45 | "id": "OutputData", 46 | "name": "OutputData", 47 | "type": "S3DataNode", 48 | "filePath": "s3://joshtok-test/abc" 49 | }, 50 | { 51 | "id": "OutputData2", 52 | "name": "OutputData", 53 | "type": "S3DataNode", 54 | "filePath": "s3://joshtok-test/def" 55 | } 56 | ], 57 | "parameters": [ 58 | { 59 | "myComment": "This Parameter specifies the S3 logging path for the pipeline. It is used by the 'Default' object to set the 'pipelineLogUri' value. Using Parameters helps users avoid hard coding variables in pipeline definitions. Users can instead supply these parameters when calling 'aws datapipeline put-pipeline-definition' or 'aws datapipeline activate-pipeline-definition'.", 60 | 61 | "id" : "myS3LogsPath", 62 | "type" : "AWS::S3::ObjectKey", 63 | "description" : "S3 path for pipeline logs." 
64 | } 65 | ] 66 | } 67 | -------------------------------------------------------------------------------- /samples/helloworld/setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("../../setup") 3 | 4 | from stacker import Stacker 5 | 6 | s = Stacker( 7 | "dpl-samples-hello-world", 8 | { 9 | "Resources": { 10 | "S3Bucket": { 11 | "Type": "AWS::S3::Bucket", 12 | "DeletionPolicy": "Delete" 13 | } 14 | } 15 | }) 16 | 17 | s.run(sys.argv) 18 | -------------------------------------------------------------------------------- /samples/json-to-dynamodb/README.md: -------------------------------------------------------------------------------- 1 | # json-to-dynamodb 2 | Example that loads a json stored in an S3 location into a DynamoDB table 3 | 4 | The pipeline definition reads a customer json file stored in an S3 location and loads the data to a DynamoDB table called customers. 5 | 6 | The load to DynamoDb is done via a hive script [json_to_ddb.q](json_to_ddb.q) that reads the json from the S3 location into an external table and then leverages the `org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler` to move the data from the Hive external table to a DynamoDb table called 'customers'. 7 | 8 | 9 | 10 | ## Disclaimer 11 | 12 | The samples in this repository are meant to help users get started with Data Pipeline. They may not 13 | be sufficient for production environments. Users should carefully inspect samples before running 14 | them. 15 | 16 | *Use at your own risk.* 17 | 18 | Licensed under the MIT-0 License. 19 | -------------------------------------------------------------------------------- /samples/json-to-dynamodb/customers.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "customer_id": 1, 3 | "customer_income": 200 4 | }, 5 | { 6 | "customer_id": 2, 7 | "customer_income": 100 8 | }] 9 | -------------------------------------------------------------------------------- /samples/json-to-dynamodb/definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "name": "EmrClusterForBackup", 5 | "coreInstanceType": "m1.medium", 6 | "coreInstanceCount": "1", 7 | "masterInstanceType": "m1.medium", 8 | "amiVersion": "3.3.2", 9 | "id": "EmrClusterForBackup", 10 | "type": "EmrCluster", 11 | "terminateAfter": "2 Hours" 12 | }, 13 | { 14 | "failureAndRerunMode": "CASCADE", 15 | "schedule": { 16 | "ref": "DefaultSchedule" 17 | }, 18 | "resourceRole": "DataPipelineDefaultResourceRole", 19 | "role": "DataPipelineDefaultRole", 20 | "scheduleType": "ondemand", 21 | "name": "Default", 22 | "id": "Default" 23 | }, 24 | { 25 | "name": "TableBackupActivity", 26 | "scriptUri":"s3://datapipeline-samples/JsonToDynamoDb/json_to_ddb.q", 27 | "runsOn": { 28 | "ref" : "EmrClusterForBackup" 29 | }, 30 | "id": "TableBackupActivity", 31 | "type": "HiveActivity", 32 | "stage":"false", 33 | "myComment": "Activity used to run the hive script to export data" 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /samples/json-to-dynamodb/json_to_ddb.q: -------------------------------------------------------------------------------- 1 | ADD JAR s3://elasticmapreduce/samples/hive-ads/libs/jsonserde.jar; 2 | 3 | DROP TABLE IF EXISTS customer_json; 4 | 5 | CREATE EXTERNAL TABLE customer_json (id STRING, 6 | income STRING) 7 | ROW FORMAT SERDE 'com.amazon.elasticmapreduce.JsonSerde' 8 | WITH 
SERDEPROPERTIES ('paths'='customer_id,customer_income') 9 | LOCATION 's3://datapipeline-samples/JsonToDynamoDb/customers.json'; 10 | 11 | DROP TABLE IF EXISTS customer_hive; 12 | 13 | CREATE EXTERNAL TABLE customer_hive (id STRING, 14 | income STRING) 15 | STORED BY 'org.apache.hadoop.hive.dynamodb.DynamoDBStorageHandler' 16 | TBLPROPERTIES ("dynamodb.table.name" = "customers", 17 | "dynamodb.column.mapping" = "id:id,income:income"); 18 | 19 | INSERT OVERWRITE TABLE customer_hive SELECT * FROM customer_json; 20 | -------------------------------------------------------------------------------- /samples/kinesis/README.md: -------------------------------------------------------------------------------- 1 | ![Data Pipeline Logo](https://raw.githubusercontent.com/awslabs/data-pipeline-samples/master/setup/logo/datapipelinelogo.jpeg) 2 | 3 | Process a Kinesis stream of Apache access logs using EMR 4 | ===================== 5 | This sample sets up a Data Pipeline to run an analysis on a kinesis stream every 15 minutes and store the result in S3. This requires the setup from the EMR [documentation](http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-kinesis.html). 6 | 7 | # Running the sample 8 | 9 | ##Setting up your resources 10 | 11 | The setup script will: 12 | - create a Kinesis stream named AccessLogStream 13 | - create a DynamoDb table called MyEMRKinesisTable 14 | - create a DynamoDb table called MyEMRKinesisTableIteration 15 | - download a kinesis stream appender for sample apache access logs 16 | 17 | ```sh 18 | $> setup/setup-script.sh 19 | ``` 20 | ##Populating your stream 21 | 22 | You can push sample data to your stream by running 23 | 24 | ```sh 25 | $> setup/append-to-stream.sh 26 | ``` 27 | 28 | ##Setting up the pipeline 29 | 30 | The instructions at https://github.com/awslabs/data-pipeline-samples tell you how to create, setup, and activate a pipeline. 
31 | 32 | ```sh 33 | $> aws datapipeline create-pipeline --name kinesis_apache_access_logs --unique-id kinesis_apache_access_logs 34 | $> aws datapipeline put-pipeline-definition --pipeline-id df-0554887H4KXKTY59MRJ --pipeline-definition file://samples/kinesis/kinesis-to-s3.json --parameter-values myS3LogsPath="" myS3Output="" 35 | $> aws datapipeline activate-pipeline --pipeline-id df-0554887H4KXKTY59MRJ 36 | ``` 37 | -------------------------------------------------------------------------------- /samples/kinesis/hive-scripts/create-table-from-kinesis-stream.q: -------------------------------------------------------------------------------- 1 | DROP TABLE apachelog; 2 | 3 | CREATE TABLE apachelog ( 4 | host STRING, 5 | IDENTITY STRING, 6 | USER STRING, 7 | TIME STRING, 8 | request STRING, 9 | STATUS STRING, 10 | SIZE STRING, 11 | referrer STRING, 12 | agent STRING 13 | ) 14 | ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe' 15 | WITH SERDEPROPERTIES ( 16 | "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") ([0-9]*) ([0-9]*) ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\")" 17 | ) 18 | STORED BY 19 | 'com.amazon.emr.kinesis.hive.KinesisStorageHandler' 20 | TBLPROPERTIES("kinesis.stream.name"="AccessLogStream"); 21 | 22 | CREATE TABLE IF NOT EXISTS apachelog_s3 ( 23 | host STRING, 24 | IDENTITY STRING, 25 | USER STRING, 26 | TIME STRING, 27 | request STRING, 28 | STATUS STRING, 29 | SIZE STRING, 30 | referrer STRING, 31 | agent STRING 32 | ) 33 | PARTITIONED BY(iteration_no int) 34 | LOCATION '${s3Location}'; 35 | -------------------------------------------------------------------------------- /samples/kinesis/hive-scripts/script-runner.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | aws s3 cp s3://data-pipeline-samples/kinesis-apache-access-logs/create-table-from-kinesis-stream.q . 4 | aws s3 cp s3://data-pipeline-samples/kinesis-apache-access-logs//write-kinesis-to-s3.q . 
5 | 6 | S3_LOCATION=$1 7 | 8 | #Read iteration count from DynamoDb if exists 9 | result=$(aws dynamodb get-item --table-name MyEMRKinesisTableIteration --key '{"Hash":{"S": "IterationCount"}}' --attributes-to-get "Count") 10 | if [ -z "$result" ]; 11 | then 12 | ITERATION_COUNT=0 13 | else 14 | ITERATION_COUNT=$(echo $result | grep "S" | sed 's/[^0-9]//g' ) 15 | fi 16 | 17 | echo "Processing with iteration count $ITERATION_COUNT" 18 | 19 | #Run hive scripts 20 | hive -hivevar s3Location=$S3_LOCATION -f create-table-from-kinesis-stream.q 21 | 22 | echo "Completed table creation" 23 | 24 | hive -hivevar iterationNo=$ITERATION_COUNT -f write-kinesis-to-s3.q 25 | 26 | ITERATION_COUNT=$((ITERATION_COUNT+1)) 27 | 28 | echo "Writing iteration count as $ITERATION_COUNT" 29 | 30 | #Write iteration count to DynamoDb 31 | aws dynamodb put-item --table-name MyEMRKinesisTableIteration --item {\"Hash\":{\"S\":\"IterationCount\"}\,\"Count\":{\"S\":\"$ITERATION_COUNT\"}} 32 | -------------------------------------------------------------------------------- /samples/kinesis/hive-scripts/write-kinesis-to-s3.q: -------------------------------------------------------------------------------- 1 | set kinesis.checkpoint.enabled=true; 2 | set kinesis.checkpoint.metastore.table.name=MyEMRKinesisTable; 3 | set kinesis.checkpoint.metastore.hash.key.name=HashKey; 4 | set kinesis.checkpoint.metastore.range.key.name=RangeKey; 5 | set kinesis.checkpoint.logical.name=TestLogicalName; 6 | set kinesis.checkpoint.iteration.no=${iterationNo}; 7 | 8 | INSERT OVERWRITE TABLE apachelog_s3 partition (iteration_no=${hiveconf:kinesis.checkpoint.iteration.no}) SELECT * FROM apachelog; 9 | -------------------------------------------------------------------------------- /samples/kinesis/kinesis-to-s3.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "id": "Default", 5 | "failureAndRerunMode": "cascade", 6 | "resourceRole": "DataPipelineDefaultResourceRole", 7 | "role": "DataPipelineDefaultRole", 8 | "pipelineLogUri": "#{myS3LogsPath}", 9 | "scheduleType": "cron", 10 | "schedule": { 11 | "ref": "DefaultSchedule" 12 | } 13 | }, 14 | { 15 | "type": "Schedule", 16 | "id": "DefaultSchedule", 17 | "period": "15 Minutes", 18 | "startAt": "FIRST_ACTIVATION_DATE_TIME" 19 | }, 20 | { 21 | "schedule": { 22 | "ref": "DefaultSchedule" 23 | }, 24 | "name": "A_Fresh_NewEMRInstance", 25 | "amiVersion": "3.3", 26 | "id": "A_Fresh_NewEMRInstance", 27 | "type": "EmrCluster" 28 | }, 29 | { 30 | "id": "ShellCommandActivity_HelloWorld", 31 | "runsOn": { 32 | "ref": "A_Fresh_NewEMRInstance" 33 | }, 34 | "type": "ShellCommandActivity", 35 | "scriptUri": "s3://data-pipeline-samples/kinesis-apache-access-logs/script-runner.sh", 36 | "scriptArgument": "#{myS3Output}" 37 | } 38 | ], 39 | "parameters": [ 40 | { 41 | "id": "myS3LogsPath", 42 | "type": "AWS::S3::ObjectKey", 43 | "description": "S3 path for pipeline logs." 44 | }, 45 | { 46 | "id": "myS3Output", 47 | "type": "AWS::S3::ObjectKey", 48 | "description": "S3 output path for the processed data, e.g.
s3://mybucket/" 49 | } 50 | ] 51 | } 52 | -------------------------------------------------------------------------------- /samples/kinesis/setup/append-to-stream.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Make first append to Kinesis stream 4 | java -cp .:kinesis-log4j-appender-1.0.0.jar com.amazonaws.services.kinesis.log4j.FilePublisher access_log_1 5 | -------------------------------------------------------------------------------- /samples/kinesis/setup/setup-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create Kinesis stream for sample 4 | aws kinesis create-stream --stream-name AccessLogStream --shard-count 2 5 | 6 | # Create DynamoDb table required by EMR to process Kinesis 7 | aws dynamodb create-table --table-name MyEMRKinesisTable --attribute-definitions AttributeName=HashKey,AttributeType=S AttributeName=RangeKey,AttributeType=S --key-schema AttributeName=HashKey,KeyType=HASH AttributeName=RangeKey,KeyType=RANGE --provisioned-throughput ReadCapacityUnits=10,WriteCapacityUnits=10 8 | 9 | #Create DynamoDb table to maintain iterations on Kinesis processing by EMR 10 | aws dynamodb create-table --table-name MyEMRKinesisTableIteration --attribute-definitions AttributeName=Hash,AttributeType=S --key-schema AttributeName=Hash,KeyType=HASH --provisioned-throughput ReadCapacityUnits=1,WriteCapacityUnits=1 11 | 12 | # Download sample kinesis stream appender 13 | wget http://emr-kinesis.s3.amazonaws.com/publisher/kinesis-log4j-appender-1.0.0.jar 14 | 15 | # Download sample access logs 16 | wget http://elasticmapreduce.s3.amazonaws.com/samples/pig-apache/input/access_log_1 17 | -------------------------------------------------------------------------------- /samples/oracle-backup/README.md: -------------------------------------------------------------------------------- 1 | # Oracle-Backup 2 | 3 | This sample pipeline does a daily backup of an Oracle database to S3 in CSV format, under an S3 prefix using the date of the backup. 4 | 5 | It features usage of parameters and expressions for easy pipeline definition re-use, construction of a JDBC connection string for the [`JdbcDatabase`](http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-object-jdbcdatabase.html) object, and to store backups on AWS under the date the pipeline was started (instead of the full timestamp). 6 | 7 | ## Instructions 8 | 9 | 1. The Oracle JDBC driver is not available by default on instances launched using Data Pipeline. In order to use it, you will need to [download the driver](http://www.oracle.com/technetwork/database/features/jdbc/index-091264.html) from Oracle. 10 | 11 | 2. Upload the driver JAR to an S3 bucket. 12 | 13 | 3. Install the [AWS CLI](http://aws.amazon.com/cli/). This is available by default on Amazon Linux instances. 14 | 15 | 4. Configure the credentials with `aws configure`. If using role credentials, then you can skip all fields except for the default region. 16 | 17 | 5. Fill out `values.json` with the appropriate values; there are descriptions of the parameters in `parameters.json` as well as below. 18 | 19 | 6. Create a pipeline either with the AWS Console, or through the CLI. Through the CLI, this can be done with 20 | 21 | `aws datapipeline create-pipeline --name --unique-id ` 22 | 23 | 7. Using the pipeline-id (`df-XXXXXX`), submit the pipeline definition with parameters and values. 
24 | 25 | `aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file://definition.json --parameter-objects file://parameters.json --parameter-values-uri file://values.json` 26 | 27 | 8. Activate the pipeline 28 | `aws datapipeline activate-pipeline --pipeline-id ` 29 | 30 | 31 | ##Parameters 32 | 33 | myBackupLocation: S3 backup location (i.e. `s3://mybucket/backups/oracle`) 34 | 35 | myOracleDriverLocation: S3 location to fetch Oracle JDBC driver from (i.e. `s3://mybucket/ojdbc6.jar`) 36 | 37 | myOracleHost: Oracle host address (i.e. `abc.xyz.us-east-1.rds.amazonaws.com`) 38 | 39 | myOraclePort: Oracle port (i.e. `1521`) 40 | 41 | myOracleDatabase: Oracle SID/database (.i.e. `ORCL`) 42 | 43 | myOracleUser: Oracle user 44 | 45 | myOraclePassword: Password to use 46 | 47 | myOracleTable: Name of the Oracle table to back up 48 | 49 | myTerminateAfter: Terminate instance after a certain amount of time (i.e. `2 Hours`) 50 | 51 | myPipelineLogUri: Log pipeline execution details to an S3 location (i.e. `s3://mybucket/pipelinelogs`) 52 | -------------------------------------------------------------------------------- /samples/oracle-backup/definition.json: -------------------------------------------------------------------------------- 1 | { "objects": 2 | [ { "id": "S3" 3 | , "name": "Backup Location" 4 | , "type": "S3DataNode" 5 | , "directoryPath": "#{myBackupLocation}/#{day(@scheduledStartTime)}/" 6 | , "schedule": 7 | { "ref": "DefaultSchedule" } 8 | } 9 | , { "id": "DefaultSchedule" 10 | , "name": "Every 1 day" 11 | , "type": "Schedule" 12 | , "period": "1 days" 13 | , "startAt": "FIRST_ACTIVATION_DATE_TIME" 14 | } 15 | , { "id": "Instance" 16 | , "name": "Instance" 17 | , "type": "Ec2Resource" 18 | , "role": "DataPipelineDefaultRole" 19 | , "resourceRole": "DataPipelineDefaultResourceRole" 20 | , "terminateAfter": "#{myTerminateAfter}" 21 | , "schedule": 22 | { "ref": "DefaultSchedule" } 23 | } 24 | , { "id": "Default" 25 | , "name": "Default" 26 | , "role": "DataPipelineDefaultRole" 27 | , "resourceRole": "DataPipelineDefaultResourceRole" 28 | , "failureAndRerunMode": "CASCADE" 29 | , "pipelineLogUri": "#{myPipelineLogUri}" 30 | , "scheduleType": "cron" 31 | , "schedule": 32 | { "ref": "DefaultSchedule" } 33 | } 34 | , { "id": "OracleDatabase" 35 | , "name": "Oracle Database" 36 | , "type": "JdbcDatabase" 37 | , "jdbcDriverJarUri": "#{myOracleDriverLocation}" 38 | , "jdbcDriverClass": "oracle.jdbc.OracleDriver" 39 | , "connectionString": "jdbc:oracle:thin:@#{myOracleHost}:#{myOraclePort}:#{myOracleDatabase}" 40 | , "username": "#{myOracleUser}" 41 | , "*password": "#{myOraclePassword}" 42 | } 43 | , { "id": "BackupTable" 44 | , "name": "Back up Oracle table" 45 | , "type": "CopyActivity" 46 | , "input": 47 | { "ref": "Oracle" } 48 | , "output": 49 | { "ref": "S3" } 50 | , "runsOn": 51 | { "ref": "Instance" } 52 | , "schedule": 53 | { "ref": "DefaultSchedule" } 54 | } 55 | , { "id": "Oracle" 56 | , "name": "Oracle" 57 | , "type": "SqlDataNode" 58 | , "table": "#{myOracleTable}" 59 | , "selectQuery": "SELECT * FROM mytable" 60 | , "database": 61 | { "ref": "OracleDatabase" } 62 | , "runsOn": 63 | { "ref": "Instance" } 64 | , "schedule": 65 | { "ref": "DefaultSchedule" } 66 | } 67 | ], 68 | "parameters": [] 69 | } 70 | -------------------------------------------------------------------------------- /samples/oracle-backup/parameters.json: -------------------------------------------------------------------------------- 1 | { "parameters": 2 | [ { "id": 
"myBackupLocation" 3 | , "description": "S3 backup location" 4 | , "type": "AWS::S3::ObjectKey" 5 | } 6 | , { "id": "myOracleDriverLocation" 7 | , "description": "S3 location to fetch Oracle JDBC driver from" 8 | , "type": "AWS::S3::ObjectKey" 9 | } 10 | , { "id": "myOracleHost" 11 | , "description": "Oracle host address" 12 | , "type": "String" 13 | } 14 | , { "id": "myOraclePort" 15 | , "description": "Oracle port" 16 | , "type": "Integer" 17 | } 18 | , { "id": "myOracleDatabase" 19 | , "description": "Oracle SID/database" 20 | , "type": "String" 21 | } 22 | , { "id": "myOracleUser" 23 | , "description": "Oracle user" 24 | , "type": "String" 25 | } 26 | , { "id": "myOraclePassword" 27 | , "description": "Oracle password" 28 | , "type": "String" 29 | } 30 | , { "id": "myOracleTable" 31 | , "description": "Name of the Oracle table to back up" 32 | , "type": "String" 33 | } 34 | , { "id": "myTerminateAfter" 35 | , "description": "Terminate instance after a certain amount of time" 36 | , "type": "String" 37 | } 38 | , { "id": "myPipelineLogUri" 39 | , "description": "Log pipeline execution details to an S3 location" 40 | , "type": "AWS::S3::ObjectKey" 41 | } 42 | ] 43 | } 44 | -------------------------------------------------------------------------------- /samples/oracle-backup/values.json: -------------------------------------------------------------------------------- 1 | { "values": 2 | { "myBackupLocation": "" 3 | , "myOracleDriverLocation": "" 4 | , "myOracleHost": "" 5 | , "myOraclePort": "" 6 | , "myOracleDatabase": "" 7 | , "myOracleUser": "" 8 | , "myOraclePassword": "" 9 | , "myOracleTable": "" 10 | , "myTerminateAfter": "" 11 | , "myPipelineLogUri": "" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /samples/rds-to-rds-copy/readme.md: -------------------------------------------------------------------------------- 1 | This pipeline demonstrates how to copy data from S3 to RDS instances and between RDS instances using datapipeline. Following is the data flow 2 | 3 | S3 -> Mysql -> Oracle -> PostGres -> SqlServer -> S3 4 | 5 | Steps to run the pipeline using the cli. 
6 | 7 | 1) aws datapipeline create-pipeline --name ddb-backup --unique-id some-unique-id 8 | => Returns a pipeline-id 9 | 10 | 2) aws datapipeline put-pipeline-definition --pipeline-id --pipeline-definition file:///home/user/rds-to-rds-copy.json 11 | 12 | 3) aws datapipeline activate-pipeline --pipeline-id 13 | -------------------------------------------------------------------------------- /setup/logo/datapipelinelogo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-archives/data-pipeline-samples/3be77bd7ddb0021fc36074718da225f84aac21fd/setup/logo/datapipelinelogo.jpeg -------------------------------------------------------------------------------- /setup/stacker.py: -------------------------------------------------------------------------------- 1 | import botocore, boto3, json, sys, time 2 | 3 | 4 | def print_resources(stack): 5 | resources = [] 6 | for summary in stack.resource_summaries.all(): 7 | resources.append((summary.resource_type, summary.physical_resource_id)) 8 | 9 | if len(resources) == 0: 10 | print("No resources") 11 | return 12 | 13 | max_type_length = max(len(res[0]) for res in resources) 14 | format_string = " {{:>{}}}: {{}}".format(max_type_length) 15 | 16 | for res in resources: 17 | print(format_string.format(*res)) 18 | 19 | 20 | def wait_for_status_change(stack, initial_status="CREATE_IN_PROGRESS"): 21 | while stack.stack_status == initial_status: 22 | time.sleep(0.2) 23 | stack.reload() 24 | 25 | 26 | class UnexpectedStateError(Exception): 27 | 28 | def __init__(self, state): 29 | message = "Stack reached unexpected state: {}".format(state) 30 | super(UnexpectedStateError, self).__init__(message) 31 | 32 | 33 | class Stacker(object): 34 | 35 | def __init__(self, stack_name, stack_template, timeout_in_minutes=10, cloudformation=None): 36 | self.stack_name = stack_name 37 | self.stack_template = stack_template 38 | self.timeout_in_minutes = timeout_in_minutes 39 | 40 | if cloudformation: 41 | self.cloudformation = cloudformation 42 | else: 43 | self.cloudformation = boto3.resource("cloudformation") 44 | 45 | 46 | def setup(self, on_complete=None): 47 | print("Creating resources for stack [{}]...".format(self.stack_name)) 48 | 49 | try: 50 | 51 | stack = self.cloudformation.create_stack( 52 | StackName=self.stack_name, 53 | TemplateBody=json.dumps(self.stack_template), 54 | TimeoutInMinutes=self.timeout_in_minutes) 55 | 56 | wait_for_status_change(stack) 57 | 58 | if stack.stack_status == "CREATE_COMPLETE": 59 | print_resources(stack) 60 | 61 | if on_complete: 62 | on_complete() 63 | 64 | return True 65 | else: 66 | raise UnexpectedStateError(stack.stack_status) 67 | 68 | except (UnexpectedStateError, botocore.exceptions.ClientError) as e: 69 | print("ERROR: {}".format(e)) 70 | return False 71 | 72 | 73 | def teardown(self): 74 | stacks = self.cloudformation.stacks.filter(StackName=self.stack_name) 75 | s3 = None 76 | 77 | for s in stacks: 78 | for r in s.resource_summaries.all(): 79 | if r.resource_type == "AWS::S3::Bucket": 80 | if not s3: 81 | s3 = boto3.resource("s3") 82 | 83 | bucket = s3.Bucket(r.physical_resource_id) 84 | for key in bucket.objects.all(): 85 | key.delete() 86 | 87 | s.delete() 88 | 89 | print("Request to delete stack [{}] has been sent".format(self.stack_name)) 90 | 91 | 92 | def run(self, args): 93 | if "--teardown" in args: 94 | self.teardown() 95 | else: 96 | self.setup() 97 | -------------------------------------------------------------------------------- 
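stacker.py is the shared CloudFormation helper that the per-sample setup scripts (for example, samples/helloworld/setup.py) build on. As a hedged illustration of how a new sample's setup script might use it -- the stack name and the SQS queue resource below are hypothetical and do not correspond to any existing sample -- such a script could look like this:

```python
# Hypothetical setup script for a new sample; run from a samples/<name>/ directory.
import sys
sys.path.append("../../setup")

from stacker import Stacker

s = Stacker(
    "dpl-samples-my-sample",            # hypothetical stack name
    {
        "Resources": {
            "InputQueue": {             # illustrative resource; any CloudFormation type works
                "Type": "AWS::SQS::Queue",
                "DeletionPolicy": "Delete"
            }
        }
    },
    timeout_in_minutes=5)


def report():
    # Invoked by Stacker.setup only after the stack reaches CREATE_COMPLETE.
    print("Stack ready; pass the printed resource ids to put-pipeline-definition.")


if "--teardown" in sys.argv:
    s.teardown()                        # empties any S3 buckets in the stack, then deletes it
else:
    s.setup(on_complete=report)
```

Calling `s.run(sys.argv)` instead, as the existing setup scripts do, gives the same setup/teardown dispatch without the completion callback.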
/setup/stacker_tests.py: -------------------------------------------------------------------------------- 1 | import botocore, time, unittest 2 | import stacker 3 | 4 | from unittest.mock import Mock 5 | from threading import Thread 6 | 7 | 8 | 9 | class StatusChanger(object): 10 | 11 | def __init__(self, end_status, change_after_seconds, start_status="CREATE_IN_PROGRESS"): 12 | self.creation_time = time.time() 13 | self.stack_status = start_status 14 | self.end_status = end_status 15 | self.change_after_seconds = change_after_seconds 16 | 17 | self.resource_summaries = Mock() 18 | self.resource_summaries.all = Mock(return_value=[]) 19 | 20 | def reload(self): 21 | call_time = time.time() 22 | if call_time - self.creation_time > self.change_after_seconds: 23 | self.stack_status = self.end_status 24 | 25 | 26 | class TestStacker(unittest.TestCase): 27 | 28 | def setUp(self): 29 | self.cloudformation = Mock() 30 | 31 | def test_stack_status_change(self): 32 | stack = StatusChanger("CREATE_COMPLETE", 1) 33 | stacker.wait_for_status_change(stack) 34 | self.assertEqual(stack.stack_status, "CREATE_COMPLETE") 35 | 36 | def test_unexpected_status(self): 37 | self.cloudformation.create_stack = Mock(return_value=StatusChanger("UNEXPECTED", 1)) 38 | stkr = stacker.Stacker("example", {}, cloudformation=self.cloudformation) 39 | self.assertFalse(stkr.setup()) 40 | 41 | def test_client_error(self): 42 | error_response = { 43 | "Error": { 44 | "Code": "ExampleClientError", 45 | "Message": "Something happened" 46 | } 47 | } 48 | self.cloudformation.create_stack = Mock(side_effect=botocore.exceptions.ClientError(error_response, "CreateStack")) 49 | stkr = stacker.Stacker("example", {}, cloudformation=self.cloudformation) 50 | self.assertFalse(stkr.setup()) 51 | 52 | def test_stack_on_complete_callback(self): 53 | self.cloudformation.create_stack = Mock(return_value=StatusChanger("CREATE_COMPLETE", 0.1)) 54 | stkr = stacker.Stacker("example", {}, cloudformation=self.cloudformation) 55 | 56 | mem = {"called": False} 57 | def callback(): 58 | mem["called"] = True 59 | 60 | stkr.setup(on_complete=callback) 61 | self.assertTrue(mem["called"]) 62 | 63 | 64 | 65 | if __name__ == "__main__": 66 | unittest.main() 67 | --------------------------------------------------------------------------------
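stacker_tests.py exercises the setup path of the helper. As a purely hypothetical companion -- not part of the repository -- a test for the teardown path could be written in the same Mock-based style:

```python
import unittest
from unittest.mock import Mock

import stacker


class TestTeardown(unittest.TestCase):

    def test_teardown_deletes_each_stack(self):
        # A stack with no resources, so the S3-emptying branch is skipped.
        stack = Mock()
        stack.resource_summaries.all = Mock(return_value=[])

        cloudformation = Mock()
        cloudformation.stacks.filter = Mock(return_value=[stack])

        stkr = stacker.Stacker("example", {}, cloudformation=cloudformation)
        stkr.teardown()

        cloudformation.stacks.filter.assert_called_once_with(StackName="example")
        stack.delete.assert_called_once_with()


if __name__ == "__main__":
    unittest.main()
```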