├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── app
│   ├── common-json
│   │   ├── configurations.json
│   │   ├── ec2-attributes.json
│   │   └── instance-groups.json
│   ├── invokeEMR.sh
│   ├── produce-steps-json.py
│   ├── requirements.txt
│   ├── restoreEMR.sh
│   └── update-throughput.sh
└── config-samples
    ├── crontab.sample
    └── dynamodb_emr_backup_restore.IAMPOLICY.json

/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:2.7-alpine
2 | 
3 | RUN apk -Uuv add coreutils bash groff py-pip ca-certificates && \
4 |     pip install awscli && \
5 |     apk --purge -v del py-pip && \
6 |     rm /var/cache/apk/*
7 | 
8 | RUN mkdir -p /app
9 | 
10 | WORKDIR /app
11 | COPY app/ ./
12 | 
13 | RUN chmod a+x *
14 | RUN pip install -r requirements.txt
15 | 
16 | ENTRYPOINT ["./invokeEMR.sh"]
17 | CMD [""]
18 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT license
2 | 
3 | The MIT License (MIT)
4 | 
5 | Copyright (c) 2015 Signiant Inc.
6 | 
7 | Permission is hereby granted, free of charge, to any person obtaining a copy of
8 | this software and associated documentation files (the "Software"), to deal in
9 | the Software without restriction, including without limitation the rights to
10 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
11 | the Software, and to permit persons to whom the Software is furnished to do so,
12 | subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
19 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
20 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
21 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DynamoDB EMR Exporter
2 | Uses EMR clusters to export and import DynamoDB tables to/from S3. This uses the same routines as Data Pipeline BUT it runs everything through a single cluster for all tables rather than a cluster per table.
3 | 
4 | ## Export Usage
5 | 
6 | The tool is packaged into a Docker container with all the prerequisites required.
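If you want to build the image yourself rather than pull it, a minimal sketch run from the repository root (the tag shown is just an example) is:

```bash
# Build the image locally from the Dockerfile shown above
docker build -t signiant/dynamodb-emr-exporter .

# Or pull the published image (the name used in the run examples below)
docker pull signiant/dynamodb-emr-exporter
```

Note that the scripts inside the container call the AWS CLI, so the container needs AWS credentials at runtime (for example via the standard AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY environment variables or an instance role on the host).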
To run this:
7 | 
8 | * Create a new IAM role
9 | ** Must be named _**dynamodb_emr_backup_restore**_
10 | ** Use the IAM policy contained in _**config-samples/dynamodb_emr_backup_restore.IAMPOLICY.json**_
11 | 
12 | * Create a new EMR Security Configuration in any region you will run a backup or restore in
13 | ** Must be named _**dynamodb-backups**_
14 | 
15 | * Run the docker container as follows:
16 | 
17 | ```bash
18 | docker run \
19 | signiant/dynamodb-emr-exporter \
20 | app_name \
21 | emr_cluster_name \
22 | table_filter \
23 | read_throughput_percentage \
24 | s3_location \
25 | export_region \
26 | spiked_throughput \
27 | number_of_clusters
28 | ```
29 | 
30 | Where
31 | 
32 | * _**app_name**_ is a 'friendly name' for the DynamoDB table set you wish to export
33 | * _**emr_cluster_name**_ is a name to give to the EMR cluster
34 | * _**table_filter**_ is a filter for which table names to export (ie. MYAPP_PROD will export ALL tables whose names contain MYAPP_PROD)
35 | * _**read_throughput_percentage**_ is the percentage of provisioned read throughput to use (e.g. 0.45 will use 45% of the provisioned read throughput)
36 | * _**S3_location**_ is a base S3 location to store the exports and all logs (ie. s3://mybucket/myfolder)
37 | * _**export_region**_ is the AWS region where the tables to export exist
38 | * _**spiked_throughput**_ is an optional provisioned read throughput value to spike the read throughput to on the table being backed up
39 | * _**number_of_clusters**_ is an optional value to specify how many clusters to use (default 1)
40 | 
41 | An optional environment variable _**DEBUG_OUTPUT**_ can also be passed to the container to run the underlying script with debug output enabled
42 | 
43 | ## Excluding tables
44 | You can place an optional file called _**excludes**_ into the S3 location (ie. whatever you have specified for _**S3_location**_) to exclude tables. The format is one table name per line and it must be the full table name (no wildcards are supported here). Any tables which match the _**table_filter**_ BUT also match an entry in the _**excludes**_ file will NOT be exported.
45 | 
46 | ## Import Usage
47 | 
48 | When the export runs, it also generates the configuration needed to execute an import. You can find the configuration file for importing within the S3 location you specified (importSteps.json).
49 | 
50 | ### Running the import
51 | 
52 | The import can be run from Docker but you'll need a shell inside the container to run it.
53 | 
54 | ```bash
55 | docker run \
56 | --entrypoint bash \
57 | signiant/dynamodb-emr-exporter
58 | ```
59 | Before running the import, you need to perform 2 tasks:
60 | 
61 | 1. The tables you are importing data into MUST already exist with the same key structure in the region you wish to import into
62 | 2. 
Copy the importSteps.json file from the S3 bucket which contains the exports into the /app/common-json folder inside the Docker container
63 | 
64 | Once these are done, you can invoke the restore like so:
65 | ```
66 | ./restoreEMR.sh app_name emr_cluster_name local_json_files_path s3_path_for_logs cluster_region
67 | ```
68 | 
69 | Where
70 | 
71 | * _**app_name**_ is a 'friendly name' for the DynamoDB table set you wish to import
72 | * _**emr_cluster_name**_ is a name to give to the EMR cluster
73 | * _**local_json_files_path**_ is the folder containing the JSON files produced by the export (generally, this will be /app/common-json)
74 | * _**s3_path_for_logs**_ is a base S3 location to store logs from EMR related to the import
75 | * _**cluster_region**_ is the AWS region in which to start the EMR cluster. This does not have to be the same region as the tables are being imported into
76 | 
77 | _**NOTE**_
78 | The write throughput to use for the DynamoDB tables is actually defined in the script that runs at export time, because it is written into the generated importSteps.json file. If you wish to increase it, edit the generated importSteps.json file.
79 | 
80 | ## Workings
81 | 
82 | The basic mechanics of the process are as follows:
83 | 
84 | ### Export
85 | 
86 | 1. Check and see if there are any EMR clusters already running for 'this' app. If so, exit. Otherwise, carry on
87 | 2. Set up the common configuration for the cluster
88 | 3. Call the Python script to generate the steps (tasks) for EMR for each table. This essentially lists all the tables in the region, applies the provided filter and then generates the JSON that can be passed to EMR to export the tables
89 | 4. Once the steps JSON is present, create a new cluster with the AWS CLI. We have to handle cluster setup failure here, so retries are used for failures
90 | 5. Submit the tasks to the cluster and poll the cluster until it is complete. Any step errors will result in a failure being logged
91 | 6. Once we know everything was successful, write the export and import steps files to S3 in case this machine has issues. We also write flag files to S3 indicating the progress of the export (in progress, complete, error, etc.) so that any other process which needs to ingest this data can poll on these status files
92 | 
93 | ### Import
94 | 
95 | 1. Create a new EMR cluster with the import steps file as the tasks to perform
96 | 2. 
Poll the cluster to ensure success 97 | -------------------------------------------------------------------------------- /app/common-json/configurations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification":"mapred-site", 4 | "Properties":{ 5 | "mapreduce.map.java.opts":"-Xmx2048m", 6 | "mapreduce.reduce.java.opts":"-Xmx2048m", 7 | "mapreduce.job.reuse.jvm.num.tasks":"1", 8 | "mapreduce.map.memory.mb":"2560", 9 | "mapreduce.reduce.memory.mb":"2560", 10 | "mapreduce.map.speculative":"false" 11 | } 12 | } 13 | ] 14 | -------------------------------------------------------------------------------- /app/common-json/ec2-attributes.json: -------------------------------------------------------------------------------- 1 | { 2 | "InstanceProfile": "dynamodb_emr_backup_restore", 3 | "AvailabilityZone": "us-east-1a" 4 | } 5 | -------------------------------------------------------------------------------- /app/common-json/instance-groups.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "InstanceCount": 1, 4 | "Name": "Master", 5 | "InstanceGroupType": "MASTER", 6 | "InstanceType": "m3.xlarge", 7 | "BidPrice" : "0.270" 8 | }, 9 | { 10 | "InstanceCount": 1, 11 | "Name": "Workers", 12 | "InstanceGroupType": "CORE", 13 | "InstanceType": "m3.xlarge", 14 | "BidPrice" : "0.270" 15 | } 16 | ] 17 | -------------------------------------------------------------------------------- /app/invokeEMR.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$DEBUG_OUTPUT" ]; then 4 | echo "DEBUG Output selected" 5 | set -x 6 | fi 7 | 8 | # Inputs 9 | APPNAME=$1 10 | CLUSTER_NAME=$2 11 | TABLE_FILTER=$3 12 | READ_TPUT=$4 13 | S3LOCATION=$5 14 | REGION=$6 15 | SPIKED_THROUGHPUT=$7 16 | 17 | NUMBER_OF_CLUSTERS=1 18 | if [ $# -gt 7 ]; then 19 | NUMBER_OF_CLUSTERS=$8 20 | fi 21 | 22 | WRITE_TPUT=0.8 # Used when we generate the Import steps 23 | RETRY_DELAY=60 24 | RUNNING_CHECK_DELAY=30 25 | 26 | # Just vars 27 | INSTALL_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 28 | COMMON_JSON=${INSTALL_DIR}/common-json 29 | STEP_PRODUCER=${INSTALL_DIR}/produce-steps-json.py 30 | THROUGHPUT_SCRIPT=${INSTALL_DIR}/update-throughput.sh 31 | JSON_OUTPUT_DIR=${INSTALL_DIR}/${TABLE_FILTER} 32 | NEXTPHASE=0 33 | RETCODE=0 34 | 35 | # Lock files (delivered to S3 at different phases) 36 | BACKUP_RUNNING_LOCK_NAME=BACKUP_RUNNING 37 | BACKUP_COMPLETE_SUCCESS_LOCK_NAME=BACKUP_COMPLETE_SUCCESS 38 | BACKUP_COMPLETE_FAILED_LOCK_NAME=BACKUP_COMPLETE_FAILED 39 | 40 | BACKUP_RUNNING_LOCK_LOCAL_FILE=${INSTALL_DIR}/${BACKUP_RUNNING_LOCK_NAME} 41 | BACKUP_COMPLETE_SUCCESS_LOCK_LOCAL_FILE=${INSTALL_DIR}/${BACKUP_COMPLETE_SUCCESS_LOCK_NAME} 42 | BACKUP_COMPLETE_FAILED_LOCK_LOCAL_FILE=${INSTALL_DIR}/${BACKUP_COMPLETE_FAILED_LOCK_NAME} 43 | 44 | logMsg() 45 | { 46 | PROGNAME=invokeEMR 47 | PID=$$ 48 | logger -t ${PROGNAME}[$PID] $1 49 | echo $1 50 | } 51 | 52 | usage() 53 | { 54 | echo "Usage: invokeEMR app_name emr_cluster_name table_filter read_throughput_percentage json_output_directory S3_location export_region [spiked_throughput] [number_of_clusters]" 55 | } 56 | 57 | pollClusters() 58 | { 59 | CLUSTER_IDS=$1 60 | CLUSTERS=$2 61 | 62 | CLUSTERS_COMPLETE=() 63 | for cluster in "${CLUSTERS[@]}" 64 | do 65 | CLUSTERS_COMPLETE+=(0) 66 | done 67 | ALL_COMPLETE=0 68 | ERRORS=0 69 | 70 | while [ $ALL_COMPLETE -ne 1 ] 71 | do 72 | cluster_number=0 73 | for cluster in 
"${CLUSTERS[@]}" 74 | do 75 | if [ ${CLUSTERS_COMPLETE[$cluster_number]} -ne 1 ]; then 76 | # If the cluster is not yet complete 77 | logMsg "polling cluster NAME:${cluster} ID ${CLUSTER_IDS[$cluster_number]} for status in region ${REGION}" 78 | CLUSTER_STATUS=$(aws emr describe-cluster --cluster-id ${CLUSTER_IDS[$cluster_number]} --region $REGION --output text --query 'Cluster.Status.State') 79 | 80 | if [ "${CLUSTER_STATUS}" == "TERMINATED" ]; then 81 | # We now need to check if there were step errors 82 | STEPS_STATUS=$(aws emr describe-cluster --cluster-id ${CLUSTER_IDS[$cluster_number]} --region $REGION --output text --query 'Cluster.Status.StateChangeReason.Message') 83 | 84 | if [ "${STEPS_STATUS}" == "Steps completed with errors" ]; then 85 | EXPORT_FAILS=$(aws emr list-steps --step-states FAILED --cluster-id ${CLUSTER_IDS[$cluster_number]} --region $REGION --output text --query 'Steps[?starts_with(Name, `Export Table:`) == `true`]|[].Name') 86 | if [ ! -z "${EXPORT_FAILS}" ]; then 87 | ERRORS=1 88 | logMsg "Cluster ERROR:task failure NAME:${cluster} ID:${CLUSTER_IDS[$cluster_number]}" 89 | else 90 | ERRORS=0 91 | fi 92 | else 93 | ERRORS=0 94 | logMsg "Cluster SUCCESS NAME:${cluster} ID:${CLUSTER_IDS[$cluster_number]}" 95 | fi 96 | 97 | CLUSTERS_COMPLETE[$cluster_number]=1 98 | elif [ "${CLUSTER_STATUS}" == "TERMINATED_WITH_ERRORS" ]; then 99 | ERRORS=1 100 | CLUSTERS_COMPLETE[$cluster_number]=1 101 | fi 102 | fi 103 | cluster_number=$((cluster_number+1)) 104 | done 105 | 106 | # Parse the cluster complete values 107 | ALL_COMPLETE=1 108 | for complete in "${CLUSTERS_COMPLETE[@]}" 109 | do 110 | if [ $complete -eq 0 ]; then 111 | ALL_COMPLETE=0 112 | break 113 | fi 114 | done 115 | 116 | if [ $ALL_COMPLETE -eq 0 ]; then 117 | sleep 10 118 | fi 119 | done 120 | 121 | return $ERRORS 122 | } 123 | 124 | if [ $# != 6 ] && [ $# != 7 ] && [ $# != 8 ]; then 125 | usage 126 | exit 1 127 | fi 128 | logMsg "Starting up" 129 | 130 | CLUSTERS=() 131 | if [ $NUMBER_OF_CLUSTERS -gt 1 ]; then 132 | # CLUSTER_NAMES will be an arry of cluster names using CLUSTER_NAME_01, 02, etc... 133 | logMsg "Asked to create multiple clusters for this backup" 134 | #for num in `seq 1 $NUMBER_OF_CLUSTERS`; 135 | num=1 136 | while [ $num -le $NUMBER_OF_CLUSTERS ]; 137 | do 138 | zero_pad_num=`printf "%02d\n" $num;` 139 | #logMsg "Adding ${CLUSTER_NAME}_${zero_pad_num}" 140 | CLUSTERS+=("${CLUSTER_NAME}_${zero_pad_num}") 141 | num=$(($num+1)) 142 | done 143 | else 144 | # single cluster 145 | CLUSTERS+=("${CLUSTER_NAME}") 146 | fi 147 | 148 | ###### 149 | ## PHASE 1 - See if there are any clusters already runing with our name. If there are, exit 150 | ###### 151 | EXISTING_CLUSTERS=0 152 | # Get a list of running clusters 153 | running_clusters=$(aws emr list-clusters --active --region us-east-1 --query 'Clusters[].[Name]' --output text) 154 | for cluster in "${CLUSTERS[@]}" 155 | do 156 | echo $running_clusters | grep -q ${cluster} 157 | STATUS=$? 158 | if [ $STATUS -eq 0 ]; then 159 | # There is already a running cluster with this name - bail 160 | logMsg "Cluster ERROR: existing cluster ${cluster} running" 161 | EXISTING_CLUSTERS=1 162 | NEXTPHASE=0 163 | RETCODE=2 164 | break 165 | fi 166 | done 167 | 168 | if [ $EXISTING_CLUSTERS -eq 0 ]; then 169 | logMsg "No existing conflicting EMR clusters running. Creating" 170 | NEXTPHASE=1 171 | fi 172 | 173 | ###### 174 | ## PHASE 2 - Copy in the common JSON files 175 | ###### 176 | if [ $NEXTPHASE -eq 1 ]; then 177 | if [ ! 
-d "${COMMON_JSON}" ]; then 178 | logMsg "The common-json folder is missing - unable to continue" 179 | NEXTPHASE=0 180 | RETCODE=2 181 | else 182 | mkdir -p ${JSON_OUTPUT_DIR} 183 | 184 | logMsg "Copying common json files to ${JSON_OUTPUT_DIR}" 185 | cp -f ${COMMON_JSON}/ec2-attributes.json ${JSON_OUTPUT_DIR}/ec2-attributes.json 186 | cp -f ${COMMON_JSON}/instance-groups.json ${JSON_OUTPUT_DIR}/instance-groups.json 187 | cp -f ${COMMON_JSON}/configurations.json ${JSON_OUTPUT_DIR}/configurations.json 188 | 189 | if [ ! -e "${JSON_OUTPUT_DIR}/ec2-attributes.json" ] || 190 | [ ! -e "${JSON_OUTPUT_DIR}/configurations.json" ] || 191 | [ ! -e "${JSON_OUTPUT_DIR}/instance-groups.json" ]; then 192 | logMsg "Error copying common json files to ${JSON_OUTPUT_DIR}" 193 | NEXTPHASE=0 194 | RETCODE=2 195 | fi 196 | fi 197 | fi 198 | 199 | ###### 200 | ## PHASE 3 - Upload the update-throughput script 201 | ###### 202 | if [ $NEXTPHASE -eq 1 ]; then 203 | if [ ! -e $THROUGHPUT_SCRIPT ]; then 204 | logMsg "The update-throughput.sh script is missing - unable to continue" 205 | NEXTPHASE=0 206 | RETCODE=2 207 | else 208 | aws s3 cp $THROUGHPUT_SCRIPT ${S3LOCATION}/scripts/update-throughput.sh 209 | if [ $? -ne 0 ]; then 210 | logMsg "ERROR: Unable to upload the update-throughput script to s3, unable to continue" 211 | RETCODE=2 212 | NEXTPHASE=0 213 | fi 214 | fi 215 | fi 216 | 217 | 218 | ###### 219 | ## PHASE 4 - See if we have an excludes file in the S3 bucket and download if so 220 | ###### 221 | if [ $NEXTPHASE -eq 1 ]; then 222 | aws s3 cp ${S3LOCATION}/excludes ./excludes 223 | 224 | if [ $? -eq 0 ]; then 225 | logMsg "Excludes file found in S3 - downloading ${S3LOCATION}/excludes" 226 | EXCLUDE_ARG="-x ./excludes" 227 | fi 228 | fi 229 | 230 | ###### 231 | ## PHASE 5 - Generate the steps files 232 | ###### 233 | if [ $NEXTPHASE -eq 1 ]; then 234 | # PHASE 2 - Get the EMR steps file for the tables to backup 235 | logMsg "Generating JSON files (R:${REGION} READ:${READ_TPUT} WRITE:${WRITE_TPUT} FILT:${TABLE_FILTER} CCOUNT:${NUMBER_OF_CLUSTERS} JDIR:${JSON_OUTPUT_DIR} S3DIR:${S3LOCATION}" 236 | 237 | if [ -n "${SPIKED_THROUGHPUT}" ]; then 238 | SPIKE_ARG="-s ${SPIKED_THROUGHPUT}" 239 | fi 240 | ${STEP_PRODUCER} -a ${APPNAME} -r ${REGION} -e ${READ_TPUT} -w ${WRITE_TPUT} -f ${TABLE_FILTER} -c ${NUMBER_OF_CLUSTERS} ${SPIKE_ARG} ${EXCLUDE_ARG} ${JSON_OUTPUT_DIR} ${S3LOCATION} 241 | RESULT=$? 242 | if [ $RESULT -eq 0 ]; then 243 | NEXTPHASE=1 244 | else 245 | logMsg "Cluster ERROR: Unable to generate the EMR steps files" 246 | RETCODE=3 247 | NEXTPHASE=0 248 | fi 249 | 250 | # Get the location of where 'this' backup will be placed in S3 251 | S3_BACKUP_BASE=$(cat ${JSON_OUTPUT_DIR}/s3path.info) 252 | logMsg "The S3 base path for this backup is ${S3_BACKUP_BASE}" 253 | 254 | if [ "${S3_BACKUP_BASE}" == "" ]; then 255 | logMsg "ERROR: No S3 base location for this backup - unable to continue" 256 | RETCODE=3 257 | NEXTPHASE=0 258 | fi 259 | fi 260 | 261 | ###### 262 | ## PHASE 6 - Create the EMR cluster(s) (with retries) 263 | ###### 264 | if [ $NEXTPHASE -eq 1 ]; then 265 | RETRIES=5 266 | CHECK_RETRIES=60 267 | 268 | # we need some status files which are delivered to S3 if the job is running or if it fails. 269 | # This just creates them - we deliver them to S3 at later steps 270 | 271 | if [ ! -e "${BACKUP_RUNNING_LOCK_LOCAL_FILE}" ]; then 272 | touch "${BACKUP_RUNNING_LOCK_LOCAL_FILE}" 273 | fi 274 | 275 | if [ ! 
-e "${BACKUP_COMPLETE_SUCCESS_LOCK_LOCAL_FILE}" ]; then 276 | touch "${BACKUP_COMPLETE_SUCCESS_LOCK_LOCAL_FILE}" 277 | fi 278 | 279 | if [ ! -e "${BACKUP_COMPLETE_FAILED_LOCK_LOCAL_FILE}" ]; then 280 | touch "${BACKUP_COMPLETE_FAILED_LOCK_LOCAL_FILE}" 281 | fi 282 | 283 | # Create and Wait for Clusters to start 284 | 285 | # Initialize statuses 286 | CLUSTER_ATTEMPT=() 287 | CLUSTER_CREATED=() 288 | CLUSTER_RUNNING=() 289 | CLUSTER_RUNNING_CHECK=() 290 | for cluster in "${CLUSTERS[@]}" 291 | do 292 | CLUSTER_ATTEMPT+=(1) 293 | CLUSTER_CREATED+=(0) 294 | CLUSTER_RUNNING+=(0) 295 | CLUSTER_RUNNING_CHECK+=(0) 296 | done 297 | 298 | CLUSTER_IDS=() 299 | TOO_MANY_RETRIES=0 300 | ALL_STARTED=0 301 | while [ $ALL_STARTED -ne 1 ] && [ $TOO_MANY_RETRIES -eq 0 ] 302 | do 303 | # Create all the clusters 304 | ALL_CREATED=0 305 | while [ $ALL_CREATED -ne 1 ] 306 | do 307 | cluster_number=0 308 | for cluster in "${CLUSTERS[@]}" 309 | do 310 | zero_pad_cluster_num=`printf "%02d\n" $((cluster_number+1));` 311 | if [ ${CLUSTER_ATTEMPT[$cluster_number]} -le $RETRIES ]; then 312 | if [ ${CLUSTER_CREATED[$cluster_number]} -ne 1 ]; then 313 | #double check that cluster isn't really running with one more check 314 | running_clusters=$(aws emr list-clusters --cluster-states STARTING BOOTSTRAPPING RUNNING WAITING --region us-east-1 --query 'Clusters[].[Name]' --output text) 315 | echo $running_clusters | grep -q ${cluster} 316 | STATUS=$? 317 | if [ $STATUS -eq 0 ]; then 318 | # We already have a cluster running - bail 319 | logMsg "Cluster ERROR: existing cluster ${cluster} running" 320 | CLUSTER_CREATED[$cluster_number]=0 321 | #set current attemps greater than while condition 322 | CLUSTER_ATTEMPT[$cluster_number]=$[$RETRIES+1] 323 | break 324 | else 325 | logMsg "No existing EMR cluster with name ${cluster} running. 
Creating" 326 | # Invoke the aws CLI to create the cluster 327 | logMsg "Creating new EMR Cluster NAME:${cluster} Attempt ${CLUSTER_ATTEMPT[$cluster_number]} of ${RETRIES}" 328 | 329 | CLUSTERID=$(aws emr create-cluster --name "${cluster}" \ 330 | --release-label "emr-5.28.0" \ 331 | --service-role "EMR_DefaultRole" \ 332 | --security-configuration "dynamodb-backups" \ 333 | --tags Name=${CLUSTER_NAME} signiant:product=devops signiant:email=devops@signiant.com \ 334 | --enable-debugging \ 335 | --log-uri ${S3LOCATION}/emr-logs \ 336 | --configurations file://${JSON_OUTPUT_DIR}/configurations.json \ 337 | --instance-groups file://${JSON_OUTPUT_DIR}/instance-groups.json \ 338 | --ec2-attributes file://${JSON_OUTPUT_DIR}/ec2-attributes.json \ 339 | --steps file://${JSON_OUTPUT_DIR}/exportSteps_${zero_pad_cluster_num}.json \ 340 | --auto-terminate \ 341 | --visible-to-all-users \ 342 | --output text \ 343 | --region ${REGION}) 344 | 345 | logMsg "CLUSTERID for ${cluster} is $CLUSTERID" 346 | CLUSTER_IDS[$cluster_number]=$CLUSTERID 347 | if [ "$CLUSTERID" != "" ]; then 348 | CLUSTER_CREATED[$cluster_number]=1 349 | else 350 | logMsg "Cluster ERROR: no cluster ID returned NAME:${cluster}" 351 | CLUSTER_CREATED[$cluster_number]=0 352 | CLUSTER_ATTEMPT[$cluster_number]=$((CLUSTER_ATTEMPT[$cluster_number]+1)) 353 | fi 354 | fi 355 | fi 356 | fi 357 | cluster_number=$((cluster_number+1)) 358 | done 359 | 360 | # Check the retry count for each cluster 361 | for attempts in "${CLUSTER_ATTEMPT[@]}" 362 | do 363 | if [ $attempts -gt $RETRIES ]; then 364 | TOO_MANY_RETRIES=1 365 | break 366 | fi 367 | done 368 | 369 | # Check to see if we've got a cluster ID for all clusters 370 | ALL_CREATED=1 371 | for cluster_id in "${CLUSTER_IDS[@]}" 372 | do 373 | if [ "$cluster_id" == "" ]; then 374 | ALL_CREATED=0 375 | fi 376 | done 377 | 378 | if [ $ALL_CREATED -eq 0 ]; then 379 | # Wait before trying to create again 380 | sleep ${RETRY_DELAY} 381 | fi 382 | done # All Clusters Created 383 | 384 | if [ $TOO_MANY_RETRIES -eq 0 ]; then 385 | # Wait for all clusters to start 386 | cluster_number=0 387 | for cluster in "${CLUSTERS[@]}" 388 | do 389 | if [ ${CLUSTER_CREATED[$cluster_number]} -eq 1 ]; then 390 | # Cluster created - check if it's running 391 | if [ ${CLUSTER_RUNNING[$cluster_number]} -ne 1 ]; then 392 | # Cluster isn't yet running 393 | if [ ${CLUSTER_RUNNING_CHECK[$cluster_number]} -le $CHECK_RETRIES ]; then 394 | # We haven't exceeded our check retries 395 | logMsg "Waiting for cluster NAME:${cluster} ID:${CLUSTER_IDS[$cluster_number]} to start...." 
396 | CLUSTER_STATE=$(aws emr describe-cluster --cluster-id ${CLUSTER_IDS[$cluster_number]} --query 'Cluster.Status.State' --output text --region ${REGION}) 397 | 398 | if [ "$CLUSTER_STATE" == "RUNNING" ]; then 399 | logMsg "Cluster NAME:${cluster} ID:${CLUSTER_IDS[$cluster_number]} launched successfully" 400 | CLUSTER_RUNNING[$cluster_number]=1 401 | else 402 | # Not Running yet - increment CLUSTER_RUNNING_CHECK 403 | CLUSTER_RUNNING_CHECK[$cluster_number]=$((CLUSTER_RUNNING_CHECK[$cluster_number]+1)) 404 | if [[ "$CLUSTER_STATE" == *"TERMINATED"* ]]; then 405 | logMsg "Cluster ERROR: launch failure NAME:${cluster} ID:${CLUSTER_IDS[$cluster_number]} Attempt ${CLUSTER_ATTEMPT[$cluster_number]} of ${RETRIES}" 406 | CLUSTER_CREATED[$cluster_number]=0 407 | CLUSTER_ATTEMPT[$cluster_number]=$((CLUSTER_ATTEMPT[$cluster_number]+1)) 408 | CLUSTER_RUNNING[$cluster_number]=0 409 | CLUSTER_RUNNING_CHECK[$cluster_number]=0 410 | fi 411 | fi 412 | fi 413 | fi 414 | fi 415 | cluster_number=$((cluster_number+1)) 416 | done 417 | 418 | # Check the retry count for each cluster 419 | for attempts in "${CLUSTER_ATTEMPT[@]}" 420 | do 421 | if [ $attempts -gt $RETRIES ]; then 422 | TOO_MANY_RETRIES=1 423 | break 424 | fi 425 | done 426 | 427 | # Check the running check retry count for each cluster 428 | for attempts in "${CLUSTER_RUNNING_CHECK[@]}" 429 | do 430 | if [ $attempts -gt $CHECK_RETRIES ]; then 431 | TOO_MANY_RETRIES=1 432 | break 433 | fi 434 | done 435 | 436 | # Check the status for each cluster 437 | ALL_STARTED=1 438 | for status in "${CLUSTER_RUNNING[@]}" 439 | do 440 | if [ $status -eq 0 ]; then 441 | ALL_STARTED=0 442 | break 443 | fi 444 | done 445 | 446 | if [ $ALL_STARTED -eq 0 ]; then 447 | logMsg "Delaying ${RUNNING_CHECK_DELAY} seconds before checking cluster(s) status..." 448 | sleep ${RUNNING_CHECK_DELAY} 449 | fi 450 | fi 451 | done # All Clusters Started 452 | 453 | if [ $ALL_STARTED -eq 1 ]; then 454 | # All cluster(s) provisioned...now we can poll their tasks 455 | # First tag the backup as in progress so any downstream processes know not to copy this 456 | logMsg "Writing BACKUP_RUNNING_LOCK file for this backup" 457 | aws s3 cp ${BACKUP_RUNNING_LOCK_LOCAL_FILE} ${S3_BACKUP_BASE}/${BACKUP_RUNNING_LOCK_NAME} 458 | 459 | pollClusters $CLUSTER_IDS $CLUSTERS 460 | STATUS=$? 
461 | 462 | if [ $STATUS -eq 0 ]; then 463 | logMsg "All Clusters SUCCESS" 464 | 465 | # Copy the steps json files to S3 so we have a copy for 'this' job 466 | if [ "${S3_BACKUP_BASE}" != "" ]; then 467 | logMsg "Copying steps files to S3" 468 | for filename in ${JSON_OUTPUT_DIR}/exportSteps*; do 469 | aws s3 cp $filename ${S3_BACKUP_BASE}/ 470 | done 471 | for filename in ${JSON_OUTPUT_DIR}/importSteps*; do 472 | aws s3 cp $filename ${S3_BACKUP_BASE}/ 473 | done 474 | 475 | logMsg "Removing the BACKUP_RUNNING_LOCK file for this backup" 476 | aws s3 rm ${S3_BACKUP_BASE}/${BACKUP_RUNNING_LOCK_NAME} 477 | 478 | logMsg "Writing the BACKUP_COMPLETE_SUCCESS file for this backup" 479 | aws s3 cp ${BACKUP_COMPLETE_SUCCESS_LOCK_LOCAL_FILE} ${S3_BACKUP_BASE}/${BACKUP_COMPLETE_SUCCESS_LOCK_NAME} 480 | else 481 | logMsg "No S3 base location for this backup specified - unable to copy steps files to S3" 482 | fi 483 | 484 | logMsg "DynamoDB Export SUCCESSFUL for $APPNAME" 485 | 486 | RETCODE=0 487 | else 488 | logMsg "Cluster ERROR" 489 | 490 | logMsg "Removing the BACKUP_RUNNING_LOCK file for this backup" 491 | aws s3 rm ${S3_BACKUP_BASE}/${BACKUP_RUNNING_LOCK_NAME} 492 | 493 | logMsg "Writing the BACKUP_COMPLETE_FAILED file for this backup" 494 | aws s3 cp ${BACKUP_COMPLETE_FAILED_LOCK_LOCAL_FILE} ${S3_BACKUP_BASE}/${BACKUP_COMPLETE_FAILED_LOCK_NAME} 495 | 496 | logMsg "DynamoDB Export FAILED for $APPNAME" 497 | 498 | RETCODE=4 499 | fi 500 | else 501 | # TODO: Check status of clusters and terminate any that ARE running 502 | logMsg "Unable to provision new cluster(s) after ${RETRIES} attempts" 503 | RETCODE=6 504 | fi 505 | fi 506 | 507 | exit ${RETCODE} 508 | -------------------------------------------------------------------------------- /app/produce-steps-json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import datetime 4 | import argparse 5 | import syslog 6 | import contextlib 7 | import os 8 | import os.path 9 | import sys 10 | import boto3 11 | import json 12 | 13 | parser = argparse.ArgumentParser( 14 | prog="produce-steps-json", 15 | description="""EMR JSON steps producer for DynamoDB table extraction""" 16 | ) 17 | 18 | parser.add_argument( 19 | '-a', 20 | '--appname', 21 | type=str, 22 | default="MYAPP", 23 | help="Name of the application we are exporting tables for. Used in the S3 path where the dumps finally end up." 24 | ) 25 | 26 | parser.add_argument( 27 | '-r', 28 | '--region', 29 | type=str, 30 | default="us-east-1", 31 | help="The region to connect to for exporting." 32 | ) 33 | 34 | parser.add_argument( 35 | '-e', 36 | '--readtput', 37 | type=str, 38 | default="0.25", 39 | help="The percentage of read throughput to utilize when exporting (def: 0.25)." 40 | ) 41 | 42 | parser.add_argument( 43 | '-w', 44 | '--writetput', 45 | type=str, 46 | default="0.5", 47 | help="The percentage of write throughput to utilize when importing data (def: 0.5)." 48 | ) 49 | 50 | parser.add_argument( 51 | '-s', 52 | '--spikedread', 53 | type=str, 54 | help="The value to spike read throughput to before table export" 55 | ) 56 | 57 | parser.add_argument( 58 | '-x', 59 | '--excludes', 60 | type=str, 61 | help="A file containing a list of tables to exclude" 62 | ) 63 | 64 | parser.add_argument( 65 | '-f', 66 | '--filter', 67 | type=str, 68 | default="", 69 | help="Only export tables with this filter criteria in the table name." 
70 | ) 71 | 72 | parser.add_argument( 73 | '-c', 74 | '--clusters', 75 | type=str, 76 | default="", 77 | help="Number of Clusters to create. (tables will be split evenly among them)" 78 | ) 79 | 80 | parser.add_argument( 81 | 'destination', 82 | type=str, 83 | help="where to place the EMR export and import steps files" 84 | ) 85 | 86 | parser.add_argument( 87 | 's3location', 88 | type=str, 89 | help="The S3 FOLDER path to place export files in and read import files from." 90 | ) 91 | 92 | 93 | def myLog(message): 94 | procName = __file__ 95 | currDTS = datetime.datetime.now() 96 | dateTimeStr = currDTS.strftime('%Y/%B/%d/ %H:%M:%S') 97 | 98 | syslogMsg = procName + ": " + message 99 | syslog.syslog(syslogMsg) 100 | print('%s %s' % (dateTimeStr, message)) 101 | 102 | 103 | def main(region, filter, destination, writetput, readtput, 104 | spikedread, s3location, appname, excludes, clusters): 105 | 106 | dateStr = datetime.datetime.now().strftime("%Y/%m/%d/%H_%M.%S") 107 | 108 | conn = boto3.client('dynamodb', region_name=region) 109 | 110 | # Have we been given an excludes file? If so, read it. Any tables in here 111 | # will not have export steps generated for them 112 | if excludes: 113 | myLog("excludes specified - reading " + excludes) 114 | 115 | if os.path.exists(excludes): 116 | exclude_table_list = [line.rstrip('\n') for line in open(excludes)] 117 | else: 118 | myLog("Unable to open " + excludes + " for reading") 119 | 120 | if conn: 121 | myLog("Connected to dynamodb (region: %s)" % region) 122 | 123 | # Get the path we will use for 'this' backup 124 | s3ExportPath = generateS3Path(s3location, region, dateStr, appname) 125 | 126 | # Get the path to the update-throughput script 127 | s3ScriptPath = s3location.rstrip('/') + "/scripts/update-throughput.sh" 128 | 129 | S3PathFilename = destination + "/s3path.info" 130 | writeFile(s3ExportPath, S3PathFilename) 131 | 132 | # get a list of all tables in the region 133 | table_list = listTables(conn) 134 | # print('Table list:\n%s' % json.dumps(table_list, indent=4)) 135 | 136 | myLog("Exporting all tables where table name contains %s " % filter) 137 | 138 | filtered_list = [x for x in table_list if filter in x] 139 | # print("Filtered list:\n" + json.dumps(filtered_list, indent=4)) 140 | 141 | myLog('Excluding any tables in the following list:\n%s' % json.dumps(exclude_table_list, indent=4)) 142 | 143 | filtered_excluded_list = [x for x in filtered_list if x not in exclude_table_list] 144 | # print("Filtered and Excluded list:\n" + json.dumps(filtered_excluded_list, indent=4)) 145 | 146 | myLog('Asked to break list of tables up into %s clusters' % str(clusters)) 147 | table_list_list = chunkIt(filtered_excluded_list, clusters) 148 | 149 | myLog('Broke list of tables up into the following clusters:\n%s' % json.dumps(table_list_list, indent=4)) 150 | 151 | cluster_number = 1 152 | for chunk in table_list_list: 153 | exportSteps = [] 154 | importSteps = [] 155 | 156 | # Process this chunk of tables 157 | myLog("Creating steps for Cluster %02d" % int(cluster_number)) 158 | 159 | table_desc_list = describeTables(conn, chunk) 160 | 161 | for table in table_desc_list: 162 | myLog( 163 | "Generating EMR export JSON for table: [%s]" % 164 | table['name']) 165 | 166 | autoscale_min_spike_read_capacity = None # Assume no autoscaling 167 | autoscale_min_reset_read_capacity = None 168 | tableS3Path = s3ExportPath + "/" + table['name'] 169 | 170 | # Check if table is set to On Demand Capacity 171 | if int(table['read']) > 0: 172 | myLog( 173 | "Table 
uses provisioned capacity - need to add throughput spike and reset steps") 174 | # Does this table have autoscaling enabled? 175 | scalable_target_info = scalable_target_exists( 176 | region, "table/" + table['name'], "dynamodb:table:ReadCapacityUnits") 177 | if scalable_target_info is not None: 178 | myLog("Table " + table['name'] + 179 | " has autoscaling enabled") 180 | autoscale_min_spike_read_capacity = scalable_target_info[0]['MinCapacity'] + int( 181 | spikedread) 182 | autoscale_min_reset_read_capacity = scalable_target_info[0]['MinCapacity'] 183 | myLog( 184 | "Table " + 185 | table['name'] + 186 | " has a current AS min capacity of " + 187 | str(autoscale_min_reset_read_capacity)) 188 | 189 | if spikedread is not None: 190 | tputSpikeStep = generateThroughputUpdateStep( 191 | table['name'], 192 | "Spike", 193 | s3ScriptPath, 194 | autoscale_min_spike_read_capacity, 195 | autoscale_min_spike_read_capacity, 196 | table['write'], 197 | region) 198 | exportSteps.append(tputSpikeStep) 199 | else: 200 | myLog( 201 | "Table uses on-demand capacity - no need for spike and reset throughput steps") 202 | 203 | tableExportStep = generateTableExportStep( 204 | table['name'], tableS3Path, readtput) 205 | exportSteps.append(tableExportStep) 206 | 207 | if int(table['read']) > 0: 208 | if spikedread is not None: 209 | tputResetStep = generateThroughputUpdateStep( 210 | table['name'], 211 | "Reset", 212 | s3ScriptPath, 213 | table['read'], 214 | autoscale_min_reset_read_capacity, 215 | table['write'], 216 | region) 217 | exportSteps.append(tputResetStep) 218 | 219 | tableImportStep = generateTableImportStep( 220 | table['name'], tableS3Path, writetput) 221 | importSteps.append(tableImportStep) 222 | 223 | # Now we can write out the import and export steps files 224 | exportJSON = json.dumps(exportSteps, indent=4) 225 | exportJSONFilename = "%s/exportSteps_%02d.json" % (destination, cluster_number) 226 | writeFile(exportJSON, exportJSONFilename) 227 | 228 | importJSON = json.dumps(importSteps, indent=4) 229 | importJSONFilename = "%s/importSteps_%02d.json" % (destination, cluster_number) 230 | writeFile(importJSON, importJSONFilename) 231 | cluster_number += 1 232 | 233 | 234 | ########### 235 | # Add a JSON entry for a single table throughput update step 236 | ########### 237 | def generateThroughputUpdateStep( 238 | tableName, 239 | stepName, 240 | s3Path, 241 | readtput, 242 | autoscale_min_throughput, 243 | writetput, 244 | region): 245 | myLog("addThroughputUpdateStep (%s) %s" % (stepName, tableName)) 246 | 247 | tputUpdateDict = {} 248 | 249 | if autoscale_min_throughput: 250 | tputUpdateDict = { 251 | "Name": stepName + 252 | " Throughput: " + 253 | tableName, 254 | "ActionOnFailure": "CONTINUE", 255 | "Type": "CUSTOM_JAR", 256 | "Jar": "s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar", 257 | "Args": [ 258 | s3Path, 259 | region, 260 | tableName, 261 | str(readtput), 262 | str(writetput), 263 | str(autoscale_min_throughput)]} 264 | else: 265 | tputUpdateDict = { 266 | "Name": stepName + 267 | " Throughput: " + 268 | tableName, 269 | "ActionOnFailure": "CONTINUE", 270 | "Type": "CUSTOM_JAR", 271 | "Jar": "s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar", 272 | "Args": [ 273 | s3Path, 274 | region, 275 | tableName, 276 | str(readtput), 277 | str(writetput)]} 278 | 279 | return tputUpdateDict 280 | 281 | 282 | ########### 283 | # Add a JSON entry for a single table export step 284 | ########### 285 | def generateTableExportStep( 286 | tableName, 
s3Path, readtput, jarPath=None, classPath=None): 287 | myLog("addTableExportStep %s" % tableName) 288 | 289 | if not jarPath: 290 | # Default JAR 291 | jarPath = "s3://dynamodb-emr-us-east-1/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar" 292 | 293 | if not classPath: 294 | # Default ClassPath 295 | classPath = "org.apache.hadoop.dynamodb.tools.DynamoDbExport" 296 | 297 | tableExportDict = {"Name": "Export Table:" + tableName, 298 | "ActionOnFailure": "CONTINUE", 299 | "Type": "CUSTOM_JAR", 300 | "Jar": jarPath, 301 | "Args": [classPath, 302 | s3Path, 303 | tableName, 304 | readtput, 305 | ] 306 | } 307 | 308 | return tableExportDict 309 | 310 | 311 | ########### 312 | # Add a JSON entry for a single table import step 313 | ########### 314 | def generateTableImportStep( 315 | tableName, s3Path, writetput, jarPath=None, classPath=None): 316 | myLog("addTableImportStep %s" % tableName) 317 | 318 | if not jarPath: 319 | # Default JAR 320 | jarPath = "s3://dynamodb-emr-us-east-1/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar" 321 | 322 | if not classPath: 323 | # Default ClassPath 324 | classPath = "org.apache.hadoop.dynamodb.tools.DynamoDbImport" 325 | 326 | tableImportDict = {"Name": "Import Table:" + tableName, 327 | "ActionOnFailure": "CONTINUE", 328 | "Type": "CUSTOM_JAR", 329 | "Jar": jarPath, 330 | "Args": [classPath, 331 | s3Path, 332 | tableName, 333 | writetput 334 | ] 335 | } 336 | 337 | return tableImportDict 338 | 339 | 340 | ########### 341 | # Generate a formatted S3 path which is used in the export and import steps file 342 | ########### 343 | def generateS3Path(basePath, region, dateStr, appname): 344 | myLog("generateS3Path BASE:%s" % basePath) 345 | basePath = basePath.rstrip('/') 346 | 347 | s3Path = basePath + "/" + region + "/" + appname + "/" + dateStr 348 | myLog("S3 path generated is %s" % s3Path) 349 | 350 | return s3Path 351 | 352 | 353 | def describeTables(conn, table_list): 354 | table_list_return = [] 355 | 356 | for table in table_list: 357 | table_desc = conn.describe_table(TableName=table) 358 | table_return = dict() 359 | table_return['name'] = table 360 | table_return['read'] = str( 361 | table_desc['Table']['ProvisionedThroughput']['ReadCapacityUnits']) 362 | table_return['write'] = str( 363 | table_desc['Table']['ProvisionedThroughput']['WriteCapacityUnits']) 364 | table_list_return.append(table_return) 365 | 366 | return table_list_return 367 | 368 | 369 | ########### 370 | # Obtain a list of dynamoDB tables from the current region 371 | ########### 372 | def listTables(conn): 373 | 374 | table_list_return = [] 375 | 376 | # Get the inital list of tables. boto only returns the first 100 tho.... 377 | table_list = conn.list_tables() 378 | 379 | moreTables = True 380 | while moreTables: 381 | if 'LastEvaluatedTableName' in table_list: 382 | LastEvaluatedTableName = table_list['LastEvaluatedTableName'] 383 | moreTables = True 384 | else: 385 | LastEvaluatedTableName = '' 386 | moreTables = False 387 | 388 | for table_name in table_list['TableNames']: 389 | table_list_return.append(table_name) 390 | 391 | if LastEvaluatedTableName != '': 392 | table_list = conn.list_tables( 393 | ExclusiveStartTableName=LastEvaluatedTableName, Limit=100) 394 | 395 | myLog("Read %d tables from dynamodb" % len(table_list_return)) 396 | 397 | return table_list_return 398 | 399 | 400 | # Checks if a dynamo table has a scalable target (ie. is autoscale enabled?) 
401 | def scalable_target_exists(region, resource_id, scalable_dimension): 402 | response = None 403 | retval = None 404 | 405 | myLog("Checking if scalable target exists for " + 406 | resource_id + " for dimension " + scalable_dimension) 407 | client = boto3.client('application-autoscaling', region_name=region) 408 | 409 | try: 410 | response = client.describe_scalable_targets( 411 | ServiceNamespace='dynamodb', 412 | ResourceIds=[ 413 | resource_id, 414 | ], 415 | ScalableDimension=scalable_dimension 416 | ) 417 | except Exception as e: 418 | myLog("Failed to describe scalable targets " + str(e)) 419 | 420 | if response: 421 | if response['ScalableTargets']: 422 | retval = response['ScalableTargets'] 423 | 424 | return retval 425 | 426 | 427 | def chunkIt(seq, num): 428 | avg = len(seq) / float(num) 429 | out = [] 430 | last = 0.0 431 | 432 | while last < len(seq): 433 | out.append(seq[int(last):int(last + avg)]) 434 | last += avg 435 | 436 | return out 437 | 438 | 439 | def writeFile(content, filename): 440 | myLog("writeFile %s" % filename) 441 | 442 | text_file = open(filename, "w") 443 | text_file.write(content) 444 | text_file.close() 445 | 446 | 447 | @contextlib.contextmanager 448 | def preserve_cwd(): 449 | cwd = os.getcwd() 450 | try: 451 | yield 452 | finally: 453 | os.chdir(cwd) 454 | 455 | 456 | if __name__ == '__main__': 457 | kwargs = dict(parser.parse_args(sys.argv[1:])._get_kwargs()) 458 | main(**kwargs) 459 | -------------------------------------------------------------------------------- /app/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | -------------------------------------------------------------------------------- /app/restoreEMR.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$DEBUG_OUTPUT" ]; then 4 | echo "DEBUG Output selected" 5 | set -x 6 | fi 7 | 8 | # Inputs 9 | APPNAME=$1 10 | CLUSTER_NAME=$2 11 | JSON_INPUT_DIR=$3 12 | S3LOCATION=$4 13 | CLUSTER_REGION=$5 14 | 15 | # Hard-codes (but can be changed here) 16 | RETRY_DELAY=10 17 | 18 | # Just vars 19 | INSTALL_DIR=/usr/local/dynamodb-emr 20 | NEXTPHASE=0 21 | RETCODE=0 22 | 23 | logMsg() 24 | { 25 | PROGNAME=restoreEMR 26 | PID=$$ 27 | logger -t ${PROGNAME}[$PID] $1 28 | echo $1 29 | } 30 | 31 | usage() 32 | { 33 | echo "Usage: restoreEMR app_name emr_cluster_name json_input_directory S3_location_for_logs cluster_region" 34 | } 35 | 36 | pollCluster() 37 | { 38 | CLUSTERID=$1 39 | CLUSTERNAME=$2 40 | 41 | COMPLETE=0 42 | ERRORS=0 43 | 44 | logMsg "polling cluster NAME:${CLUSTERNAME} ID ${CLUSTERID} for status in region ${CLUSTER_REGION}" 45 | 46 | while [ $COMPLETE -ne 1 ] 47 | do 48 | CLUSTER_STATUS=$(aws emr describe-cluster --cluster-id $CLUSTERID --region $CLUSTER_REGION |jq -r '.["Cluster"]["Status"]["State"]') 49 | #echo "STATUS IS $CLUSTER_STATUS" 50 | 51 | if [ "${CLUSTER_STATUS}" == "TERMINATED" ]; then 52 | # We need to check if there were step errors 53 | STEPS_STATUS=$(aws emr describe-cluster --cluster-id $CLUSTERID --region $CLUSTER_REGION | jq -r '.["Cluster"]["Status"]["StateChangeReason"]["Message"]') 54 | 55 | if [ "${STEPS_STATUS}" == "Steps completed with errors" ]; then 56 | ERRORS=1 57 | else 58 | ERRORS=0 59 | fi 60 | 61 | COMPLETE=1 62 | elif [ "${CLUSTER_STATUS}" == "TERMINATED_WITH_ERRORS" ]; then 63 | ERRORS=1 64 | COMPLETE=1 65 | fi 66 | 67 | sleep 10 68 | done 69 | 70 | return $ERRORS 71 | } 72 | 73 | if [ $# != 5 ]; then 74 | usage 75 | exit 1 76 | 
fi 77 | logMsg "Starting up" 78 | 79 | ###### 80 | ## PHASE 1 - See if there are any clusters already runing with our name. If there are, exit 81 | ###### 82 | aws emr list-clusters --active --region ${CLUSTER_REGION} | grep -q ${CLUSTER_NAME} 83 | STATUS=$? 84 | 85 | if [ $STATUS == 0 ]; then 86 | # We already have a cluster running - bail 87 | logMsg "Cluster ERROR: existing cluster ${CLUSTER_NAME} running" 88 | NEXTPHASE=0 89 | RETCODE=2 90 | else 91 | logMsg "No existing EMR cluster with name ${CLUSTER_NAME} running. Creating" 92 | NEXTPHASE=1 93 | fi 94 | 95 | ###### 96 | ## PHASE 1 - Create the EMR cluster (with retries) 97 | ###### 98 | if [ $NEXTPHASE == 1 ]; then 99 | RETRIES=5 100 | CURR_ATTEMPT=1 101 | 102 | while [ $CURR_ATTEMPT -le $RETRIES ] 103 | do 104 | CLUSTERUP=0 105 | 106 | # Invoke the aws CLI to create the cluster 107 | logMsg "Creating new EMR Cluster NAME:${CLUSTER_NAME} Attempt ${CURR_ATTEMPT} of ${RETRIES}" 108 | 109 | CLUSTERID=$(aws emr create-cluster --name "${CLUSTER_NAME}" \ 110 | --release-label "emr-5.28.0" \ 111 | --service-role "EMR_DefaultRole" \ 112 | --security-configuration "dynamodb-backups" \ 113 | --tags Name=${CLUSTER_NAME} signiant:product=devops signiant:email=devops@signiant.com \ 114 | --enable-debugging \ 115 | --log-uri ${S3LOCATION}/emr-logs \ 116 | --configurations file://${JSON_INPUT_DIR}/configurations.json \ 117 | --instance-groups file://${JSON_INPUT_DIR}/instance-groups.json \ 118 | --ec2-attributes file://${JSON_INPUT_DIR}/ec2-attributes.json \ 119 | --steps file://${JSON_INPUT_DIR}/importSteps.json \ 120 | --auto-terminate \ 121 | --visible-to-all-users \ 122 | --output text \ 123 | --region ${CLUSTER_REGION} ) 124 | 125 | logMsg "CLUSTERID for ${CLUSTER_NAME} is $CLUSTERID" 126 | # Now use the waiter to make sure the cluster is launched successfully 127 | if [ "$CLUSTERID" != "" ]; then 128 | logMsg "Waiting for cluster NAME:${CLUSTER_NAME} ID:${CLUSTERID} to start...." 129 | aws emr wait cluster-running --cluster-id ${CLUSTERID} --region ${CLUSTER_REGION} 130 | STATUS=$? 131 | 132 | if [ $STATUS == 0 ]; then 133 | logMsg "Cluster NAME:${CLUSTER_NAME} ID:${CLUSTERID} launched successfully" 134 | CLUSTERUP=1 135 | break 136 | else 137 | logMsg "Cluster ERROR: launch failure NAME:${CLUSTER_NAME} ID:${CLUSTERID} Attempt ${CURR_ATTEMPT} of ${RETRIES} " 138 | CLUSTERUP=0 139 | # Fall into the next iteration of the loop to try and create the cluster again 140 | fi 141 | else 142 | logMsg "Cluster ERROR: no cluster ID returned NAME:${CLUSTER_NAME}" 143 | CLUSTERUP=0 144 | fi 145 | 146 | CURR_ATTEMPT=$[$CURR_ATTEMPT+1] 147 | logMsg "Delaying ${RETRY_DELAY} seconds before attempting to create cluster..." 148 | sleep ${RETRY_DELAY} 149 | done 150 | 151 | #### 152 | ## Phase 3.5 - poll the cluster for status so we know when it's done 153 | #### 154 | if [ $CLUSTERUP == 1 ]; then 155 | # We have a cluster provisioned...now we can poll it's tasks and make sure it completes ok 156 | 157 | pollCluster $CLUSTERID $CLUSTER_NAME 158 | STATUS=$? 
159 | 160 | if [ $STATUS == 0 ]; then 161 | logMsg "Cluster SUCCESS NAME:${CLUSTER_NAME} ID:${CLUSTERID}" 162 | RETCODE=0 163 | else 164 | logMsg "Cluster ERROR:task failure NAME:${CLUSTER_NAME} ID:${CLUSTERID}" 165 | RETCODE=4 166 | fi 167 | else 168 | logMsg "Unable to provision a new cluster after ${RETRIES} attempts" 169 | RETCODE=6 170 | fi 171 | 172 | fi 173 | 174 | exit ${RETCODE} 175 | -------------------------------------------------------------------------------- /app/update-throughput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SOURCE_REGION=$1 3 | TABLE_NAME=$2 4 | READ_CAPACITY=$3 5 | WRITE_CAPACITY=$4 6 | AS_READ_MIN_TPUT=$5 7 | 8 | MAX_ATTEMPTS=50 9 | ATTEMPTS=0 10 | SLEEP_SECONDS=20 11 | RET_CODE=0 12 | 13 | table_scaleable_read_dimension="dynamodb:table:ReadCapacityUnits" 14 | 15 | USAGE="$0 source_region table_name read_capacity write_capacity" 16 | 17 | if [ $# -lt 4 ]; then 18 | echo $USAGE 19 | exit 1 20 | fi 21 | 22 | # Does the table have an autoscaling scalable target for reads? 23 | # If so, return the RoleARN and the max capacity 24 | scalable_target_exists() 25 | { 26 | resource_id=$1 27 | scalable_dimension=$2 28 | region=$3 29 | 30 | scalable_target=$(aws application-autoscaling describe-scalable-targets \ 31 | --service-namespace dynamodb \ 32 | --resource-id "${resource_id}" \ 33 | --query "ScalableTargets[?contains(ScalableDimension,\`${scalable_dimension}\`) == \`true\`].[RoleARN,MaxCapacity]" \ 34 | --region ${region} \ 35 | --output text) 36 | 37 | if [ -z "${scalable_target}" ]; then 38 | echo "false" 39 | else 40 | echo "${scalable_target}" 41 | fi 42 | } 43 | 44 | # Add or replace a scalable target on a table or index 45 | register_scalable_target() 46 | { 47 | resource_id=$1 48 | scalable_dimension=$2 49 | role_arn=$3 50 | min_tput=$4 51 | max_tput=$5 52 | region=$6 53 | 54 | aws application-autoscaling register-scalable-target \ 55 | --service-namespace dynamodb \ 56 | --resource-id "${resource_id}" \ 57 | --scalable-dimension "${scalable_dimension}" \ 58 | --min-capacity ${min_tput} \ 59 | --max-capacity ${max_tput} \ 60 | --role-arn ${role_arn} \ 61 | --region ${region} 62 | 63 | status=$? 64 | 65 | if [ ${status} -eq 0 ]; then 66 | echo "true" 67 | else 68 | echo "false" 69 | fi 70 | } 71 | 72 | # Poll a table until it becomes ACTIVE 73 | wait_for_active() 74 | { 75 | table_name=$1 76 | region=$2 77 | 78 | # wait for the table to finish updating 79 | while [ $ATTEMPTS -le $MAX_ATTEMPTS ]; do 80 | TABLE_STATUS=$(aws dynamodb describe-table --region $region --table-name $TABLE_NAME --query 'Table.TableStatus' --output text) 81 | echo "Checking table status, attempt ${ATTEMPTS}" 1>&2 82 | if [ "$TABLE_STATUS" == "ACTIVE" ]; then 83 | echo "Table transition successful" 1>&2 84 | return 0 85 | fi 86 | echo "Table is $TABLE_STATUS, checking again in $SLEEP_SECONDS seconds" 1>&2 87 | (( ATTEMPTS++ )) 88 | sleep $SLEEP_SECONDS 89 | done 90 | 91 | # if we're here, the table did not become active in a reasonable time 92 | return 1 93 | } 94 | 95 | # 96 | # MAINLINE 97 | # 98 | 99 | table_resource_id="table/${TABLE_NAME}" 100 | scalable_target_exists=$(scalable_target_exists ${table_resource_id} ${table_scaleable_read_dimension} ${SOURCE_REGION}) 101 | 102 | # Check if we have autoscaling enabled. 
If so, we need to update 103 | # the minimum tput so that we don't keep autoscaling down 104 | if [ "${scalable_target_exists}" != "false" ]; then 105 | echo "Table ${TABLE_NAME} has an autoscaling policy - manipulating the min-tput" 106 | # get the role ARN and the max capacity currently set...the min capacity we are provided 107 | role_arn=$(echo ${scalable_target_exists}|cut -f1 -d" "); echo "role arn is ${role_arn}" 108 | max_tput=$(echo ${scalable_target_exists}|cut -f2 -d" "); echo "max_tput is ${max_tput}" 109 | 110 | if [[ "$(register_scalable_target ${table_resource_id} ${table_scaleable_read_dimension} ${role_arn} ${AS_READ_MIN_TPUT} ${max_tput} ${SOURCE_REGION})" == "true" ]]; then 111 | echo "Successfully registered new scalable target for ${table_resource_id} with minimum tput ${AS_READ_MIN_TPUT}" 112 | 113 | # Updating the min tput triggers autoscaling to update the table if there is read activity 114 | # so we need to wait for it to finish updating 115 | wait_for_active "${TABLE_NAME}" "${SOURCE_REGION}" 116 | table_status=$? 117 | 118 | if [ ${table_status} -eq 0 ]; then 119 | echo "Table has returned to ACTIVE state" 120 | else 121 | echo "FAILURE: Table never transitioned to active" 122 | RET_CODE=1 123 | fi 124 | else 125 | echo "ERROR registering new scalable target for ${table_resource_id}" 126 | fi 127 | fi 128 | 129 | echo "Updating the base table read throughput with update-table to ${READ_CAPACITY}" 130 | if [ ${RET_CODE} -eq 0 ]; then 131 | # Update the table throughput directly 132 | # This is needed in case there is no autoscaling enabled OR 133 | # if autoscaling is enabled and there is no read activity on the table 134 | # Since autoscaling will never scale back down by itself 135 | TABLE_STATUS=$(aws dynamodb update-table \ 136 | --region $SOURCE_REGION \ 137 | --table-name $TABLE_NAME \ 138 | --provisioned-throughput ReadCapacityUnits=${READ_CAPACITY},WriteCapacityUnits=${WRITE_CAPACITY} \ 139 | --query 'Table.TableStatus' \ 140 | --output text 2>&1) 141 | 142 | if [ $? -ne 0 ]; then 143 | ERROR_TYPE=$(echo $TABLE_STATUS | cut -d \( -f2 | cut -d \) -f1) 144 | if [ "$ERROR_TYPE" == "ValidationException" ]; then 145 | echo "Provisioned throughput already set, no action taken" 146 | RET_CODE=0 147 | else 148 | echo "Unable to spike throughput" 149 | echo ${TABLE_STATUS} 150 | RET_CODE=1 151 | fi 152 | fi 153 | 154 | # Check the table staus again from the update-table call 155 | wait_for_active ${TABLE_NAME} ${SOURCE_REGION} 156 | table_status=$? 
157 | 
158 | if [ ${table_status} -eq 0 ]; then
159 | RET_CODE=0
160 | else
161 | echo "FAILURE: Table never transitioned to active"
162 | RET_CODE=1
163 | fi
164 | else
165 | echo "FAILURE: Table never transitioned to active"
166 | RET_CODE=1
167 | fi
168 | 
169 | exit $RET_CODE
170 | 
--------------------------------------------------------------------------------
/config-samples/crontab.sample:
--------------------------------------------------------------------------------
1 | 
2 | # Backup the DynamoDB tables
3 | 20 17 * * * /usr/local/signiant/dynamodb-emr-exporter/invokeEMR.sh MYAPP DynamoDB_Exporter_MYAPP MY_TABLE_PREFIX 0.70 /usr/local/signiant/dynamodb-emr-exporter/myapp s3://my-bucket/dynamodb/emr-backups us-east-1 us-west-2 1000
4 | 
--------------------------------------------------------------------------------
/config-samples/dynamodb_emr_backup_restore.IAMPOLICY.json:
--------------------------------------------------------------------------------
1 | {
2 |   "Statement":[
3 |     {
4 |       "Action":[
5 |         "s3:List*",
6 |         "s3:Put*",
7 |         "s3:Get*",
8 |         "s3:DeleteObject",
9 |         "dynamodb:DescribeTable",
10 |         "dynamodb:Scan",
11 |         "dynamodb:Query",
12 |         "dynamodb:GetItem",
13 |         "dynamodb:BatchGetItem",
14 |         "dynamodb:UpdateTable",
15 |         "dynamodb:PutItem",
16 |         "dynamodb:BatchWriteItem",
17 |         "cloudwatch:PutMetricData",
18 |         "elasticmapreduce:Describe*",
19 |         "kms:GenerateDataKey",
20 |         "application-autoscaling:DescribeScalableTargets",
21 |         "application-autoscaling:RegisterScalableTarget",
22 |         "iam:PassRole"
23 |       ],
24 |       "Effect":"Allow",
25 |       "Resource":[
26 |         "*"
27 |       ]
28 |     }
29 |   ]
30 | }
31 | 
--------------------------------------------------------------------------------
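For reference, the sample policy above is meant to end up on the _**dynamodb_emr_backup_restore**_ role and instance profile that the README asks you to create. A hedged AWS CLI sketch of wiring that together is shown below; the trust-policy and security-configuration file names are assumptions (not part of this repo), while the role, profile, and security-configuration names are the ones the scripts expect.

```bash
# Trust policy letting EC2 (the EMR instances) assume the role.
# ec2-trust.json is an assumed local file containing a standard
# ec2.amazonaws.com sts:AssumeRole trust statement.
aws iam create-role \
  --role-name dynamodb_emr_backup_restore \
  --assume-role-policy-document file://ec2-trust.json

# Attach the sample policy from config-samples as an inline policy
aws iam put-role-policy \
  --role-name dynamodb_emr_backup_restore \
  --policy-name dynamodb_emr_backup_restore \
  --policy-document file://config-samples/dynamodb_emr_backup_restore.IAMPOLICY.json

# ec2-attributes.json references the same name as an InstanceProfile,
# so create a matching instance profile and add the role to it
aws iam create-instance-profile --instance-profile-name dynamodb_emr_backup_restore
aws iam add-role-to-instance-profile \
  --instance-profile-name dynamodb_emr_backup_restore \
  --role-name dynamodb_emr_backup_restore

# The clusters use EMR_DefaultRole as their service role; this creates it
# (along with EMR_EC2_DefaultRole) if it does not already exist
aws emr create-default-roles --region us-east-1

# Each region you back up or restore in also needs an EMR security
# configuration named dynamodb-backups; security-config.json is an assumed,
# site-specific file (e.g. encryption settings)
aws emr create-security-configuration \
  --name dynamodb-backups \
  --security-configuration file://security-config.json \
  --region us-east-1
```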