├── src ├── __init__.py └── app.py ├── .github └── CODEOWNERS ├── LICENSE ├── SampleEvent.json ├── .gitignore ├── README.md ├── deploy.sh └── template.yaml /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Each line is a file pattern followed by one or more owners. 2 | 3 | # These owners will be the default owners for everything in 4 | # the repo. Unless a later match takes precedence, members of 5 | # @rewindio/codeowners will be requested for review when someone 6 | # opens a pull request. 7 | * @dnorth98 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Rewind 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /SampleEvent.json: -------------------------------------------------------------------------------- 1 | { 2 | "Records": [ 3 | { 4 | "eventVersion": "2.0", 5 | "eventName": "ObjectCreated:Put", 6 | "eventTime": "1970-01-01T00:00:00.000Z", 7 | "userIdentity": { 8 | "principalId": "EXAMPLE" 9 | }, 10 | "eventSource": "aws:s3", 11 | "requestParameters": { 12 | "sourceIPAddress": "127.0.0.1" 13 | }, 14 | "s3": { 15 | "configurationId": "testConfigRule", 16 | "object": { 17 | "eTag": "1c43a0c9dcc31572b5e49c0b42f8b17f", 18 | "key": "INSERT FILENAME HERE (e.g. receipt.png)", 19 | "sequencer": "0A1B2C3D4E5F678901", 20 | "size": 1024 21 | }, 22 | "bucket": { 23 | "ownerIdentity": { 24 | "principalId": "EXAMPLE" 25 | }, 26 | "name": "INSERT BUCKET NAME HERE (e.g. aws-sam-ocr-sourceimagebucket-123122312)", 27 | "arn": "arn:aws:s3:::INSERT BUCKET NAME HERE (e.g. 
aws-sam-ocr-sourceimagebucket-123122312)"
28 |         },
29 |         "s3SchemaVersion": "1.0"
30 |       },
31 |       "responseElements": {
32 |         "x-amz-id-2": "EXAMPLE123/5678abcdefghijklambdaisawesome/mnopqrstuvwxyzABCDEFGH",
33 |         "x-amz-request-id": "EXAMPLE123456789"
34 |       },
35 |       "awsRegion": "us-east-1"
36 |     }
37 |   ]
38 | }
39 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # SAM
107 | packaged.yaml
108 | 
109 | # test events
110 | src/*.json
111 | 
112 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # aws-athena-partition-autoloader
2 | Automatically adds new partitions detected in S3 to an existing Athena table
3 | 
4 | # Purpose
5 | Athena is fantastic for querying data in S3 and works especially well when the data is partitioned. The problem is that when a table has a lot of partitions, refreshing them all with the `MSCK REPAIR TABLE` command can take a long time.
6 | 
7 | This solution subscribes to S3 events on a bucket, detects when a new partition is created, and loads only that partition into Athena. It uses a cache of the existing partitions to minimize the number of Athena calls needed to query the partition list.
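Instead of rescanning the whole table, the function registers just the partition that triggered the event. As a rough sketch of the difference (the table name, bucket and partition keys below are taken from the deploy example later in this README; the partition values are made up):

```
-- Rescan everything Athena doesn't know about yet (slow on large tables)
MSCK REPAIR TABLE api_audit_log;

-- What this function runs instead: add only the partition that just arrived
ALTER TABLE api_audit_log ADD PARTITION (destination_platform_id = 'shopify', date = '2019-01-01')
LOCATION 's3://stage-audit-log/destination_platform_id=shopify/date=2019-01-01';
```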
8 | 
9 | # Installing and Configuring
10 | 
11 | ## AWS Setup
12 | 
13 | ## Deploying to AWS
14 | Before starting, you will need:
15 | * The [AWS CLI](https://aws.amazon.com/cli/) installed and default credentials configured
16 | * The [AWS SAM CLI](https://github.com/awslabs/aws-sam-cli) installed
17 | * An existing S3 bucket where SAM can upload the packaged AWS Lambda code
18 | * An existing Athena table backed by content in S3 with at least one partition key
19 | * This repo cloned
20 | 
21 | 1. Run the *deploy.sh* script, passing the arguments in this order:
22 | 
23 | ```
24 | ./deploy.sh <stack_name> <stack_region> <athena_region> <operation> <deploy_bucket> <content_bucket> <athena_results_bucket> <athena_database> <athena_table> <partition_keys> <aws_profile>
25 | ```
26 | 
27 | For example:
28 | 
29 | ```
30 | ./deploy.sh athena_loader_mytable eu-west-1 us-east-1 ALL lambda-sam-staging stage-audit-log aws-athena-query-results-123456789-us-east-1 audit_log_db api_audit_log 'destination_platform_id,date' staging
31 | ```
32 | 
33 | The list of partition keys must exactly match the partition keys defined on the table.
34 | 
35 | deploy.sh uses AWS SAM to package the AWS Lambda function and then deploys it to AWS. Everything is deployed as a CloudFormation stack in the specified region.
36 | 
37 | | NOTE: If you don't have SAM installed, you can replace the SAM commands in the deploy script with `aws cloudformation package ...` and `aws cloudformation deploy ...` instead |
38 | | --- |
--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | STACK_NAME=$1
4 | STACK_REGION=$2
5 | ATHENA_REGION=$3
6 | OPERATION=$4
7 | DEPLOY_BUCKET=$5
8 | CONTENT_BUCKET=$6
9 | ATHENA_RESULTS_BUCKET=$7
10 | ATHENA_DATABASE=$8
11 | ATHENA_TABLE=$9
12 | PARTITION_KEYS=${10}
13 | PROFILE=${11}
14 | 
15 | # No longer hard-coded since we need different functions for different tables
16 | # STACK_NAME=aws-athena-partition-autoloader
17 | 
18 | if [ "${OPERATION}" == "ALL" ]; then
19 |   echo "Packaging using SAM....."
20 |   sam package \
21 |     --template-file template.yaml \
22 |     --output-template-file packaged.yaml \
23 |     --s3-bucket ${DEPLOY_BUCKET} \
24 |     --region ${STACK_REGION} \
25 |     --profile ${PROFILE}
26 | 
27 |   echo "Deploying using SAM...."
28 |   sam deploy \
29 |     --template-file packaged.yaml \
30 |     --stack-name ${STACK_NAME} \
31 |     --capabilities CAPABILITY_IAM \
32 |     --parameter-overrides S3Bucket=${CONTENT_BUCKET} AthenaRegion=${ATHENA_REGION} AthenaResultsBucket=${ATHENA_RESULTS_BUCKET} AthenaDatabase=${ATHENA_DATABASE} AthenaTable=${ATHENA_TABLE} PartitionKeys=${PARTITION_KEYS} \
33 |     --region ${STACK_REGION} \
34 |     --profile ${PROFILE}
35 | fi
36 | 
37 | # SAM only allows subscribing Lambdas to events for buckets created in the same template
38 | # Existing buckets cannot be used so we do this to subscribe an existing bucket to the new
39 | # functions.
See : https://github.com/awslabs/serverless-application-model/issues/124 40 | 41 | AWS_ACCOUNT_ID=$(aws sts get-caller-identity \ 42 | --query 'Account' \ 43 | --output text \ 44 | --region ${STACK_REGION} \ 45 | --profile ${PROFILE} 46 | ) 47 | echo "Our AWS account ID is ${AWS_ACCOUNT_ID}" 48 | 49 | FUNCTION_NAME=$(aws cloudformation describe-stacks \ 50 | --stack-name ${STACK_NAME} \ 51 | --query 'Stacks[].Outputs[].OutputValue' \ 52 | --output text \ 53 | --region ${STACK_REGION} \ 54 | --profile ${PROFILE} 55 | ) 56 | echo "The Lambda function name is ${FUNCTION_NAME}" 57 | 58 | FUNCTION_ARN=$(aws lambda get-function \ 59 | --function-name ${FUNCTION_NAME} \ 60 | --query 'Configuration.FunctionArn' \ 61 | --output text \ 62 | --region ${STACK_REGION} \ 63 | --profile ${PROFILE} 64 | ) 65 | echo "The Lambda function ARN is ${FUNCTION_ARN}" 66 | 67 | # Allow the lambda to receive events from S3 68 | echo "Adding Lambda invoke permissions..." 69 | 70 | aws lambda add-permission \ 71 | --function-name ${FUNCTION_NAME} \ 72 | --region ${STACK_REGION} \ 73 | --profile ${PROFILE} \ 74 | --statement-id "s3perms-${CONTENT_BUCKET}" \ 75 | --action "lambda:InvokeFunction" \ 76 | --principal s3.amazonaws.com \ 77 | --source-arn arn:aws:s3:::${CONTENT_BUCKET} \ 78 | --source-account ${AWS_ACCOUNT_ID} > /dev/null 2>&1 79 | 80 | # Subscribe the lambda to S3 events for our specific bucket 81 | S3_LAMBDA_EVENT_SUBSCRIPTION="{\"LambdaFunctionConfigurations\":[{\"LambdaFunctionArn\":\"${FUNCTION_ARN}\",\"Events\":[\"s3:ObjectCreated:*\"]}]}" 82 | 83 | echo "Adding AWS event subscription for bucket ${CONTENT_BUCKET}" 84 | aws s3api put-bucket-notification-configuration \ 85 | --bucket ${CONTENT_BUCKET} \ 86 | --notification-configuration ${S3_LAMBDA_EVENT_SUBSCRIPTION} \ 87 | --region ${STACK_REGION} \ 88 | --profile ${PROFILE} -------------------------------------------------------------------------------- /template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: 'AWS::Serverless-2016-10-31' 3 | 4 | Description: SAM app that automatically loads Athena partitions for new partitions in S3 5 | 6 | Parameters: 7 | S3Bucket: 8 | Type: String 9 | AthenaResultsBucket: 10 | Type: String 11 | AthenaRegion: 12 | Type: String 13 | AthenaDatabase: 14 | Type: String 15 | AthenaTable: 16 | Type: String 17 | PartitionKeys: 18 | Type: String 19 | 20 | Resources: 21 | LambdaRole: 22 | Type: AWS::IAM::Role 23 | Properties: 24 | Path: "/" 25 | AssumeRolePolicyDocument: 26 | Version: "2012-10-17" 27 | Statement: 28 | - 29 | Sid: "AllowLambdaServiceToAssumeRole" 30 | Effect: "Allow" 31 | Action: 32 | - "sts:AssumeRole" 33 | Principal: 34 | Service: 35 | - "lambda.amazonaws.com" 36 | ManagedPolicyArns: 37 | - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole 38 | Policies: 39 | - 40 | PolicyName: "AthenaPartitionLoader" 41 | PolicyDocument: 42 | Version: '2012-10-17' 43 | Statement: 44 | - Effect: Allow 45 | Action: 46 | - 's3:Get*' 47 | - 's3:List*' 48 | - 's3:Put*' 49 | Resource: 50 | - !Sub arn:aws:s3:::${AthenaResultsBucket} 51 | - !Sub arn:aws:s3:::${AthenaResultsBucket}/* 52 | 53 | - Effect: Allow 54 | Action: 55 | - 's3:Get*' 56 | - 's3:ListBucket' 57 | Resource: 58 | - !Sub arn:aws:s3:::${S3Bucket} 59 | - !Sub arn:aws:s3:::${S3Bucket}/* 60 | 61 | - Effect: Allow 62 | Action: 63 | - 'athena:RunQuery' 64 | - 'athena:StartQueryExecution' 65 | - 'athena:GetQueryResultsStream' 66 | - 'athena:StopQueryExecution' 
67 |                   - 'athena:GetQueryExecution'
68 |                   - 'athena:GetQueryResults'
69 |                   - 'athena:GetQueryExecutions'
70 |                   - 'athena:GetTables'
71 |                   - 'athena:GetTable'
72 |                   - 'glue:GetDatabase'
73 |                   - 'glue:GetTable'
74 |                   - 'glue:GetTables'
75 |                   - 'glue:GetPartition'
76 |                   - 'glue:GetPartitions'
77 |                   - 'glue:BatchCreatePartition'
78 |                 Resource:
79 |                   - '*'
80 | 
81 |   AthenaPartitionLoader:
82 |     Type: 'AWS::Serverless::Function'
83 |     Properties:
84 |       Handler: src/app.lambda_handler
85 |       Runtime: python3.6
86 |       CodeUri: .
87 |       Description: Automatically loads Athena partitions for new partitions in S3.
88 |       MemorySize: 128
89 |       Timeout: 900
90 |       Environment:
91 |         Variables:
92 |           ATHENA_REGION: !Ref AthenaRegion
93 |           ATHENA_DATABASE: !Ref AthenaDatabase
94 |           ATHENA_TABLE: !Ref AthenaTable
95 |           PARTITION_KEYS: !Ref PartitionKeys
96 |       Role: !GetAtt LambdaRole.Arn
97 | 
98 | Outputs:
99 |   InvalidatorFunctionName:
100 |     Description: The name of the lambda Function
101 |     Value: !Ref AthenaPartitionLoader
102 | 
--------------------------------------------------------------------------------
/src/app.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import boto3
3 | from botocore.exceptions import ClientError
4 | import json
5 | import urllib.parse
6 | import time
7 | import os
8 | import os.path
9 | from pprint import pprint
10 | 
11 | # Global holding the AWS account ID this is executing in
12 | account_id = 0
13 | 
14 | # The bucket could be in one region with the Athena DB being in a separate region
15 | athena_region = os.environ['ATHENA_REGION']
16 | 
17 | session = boto3.session.Session(region_name = athena_region)
18 | 
19 | #
20 | # Get the current AWS account ID
21 | #
22 | def get_aws_account_id(session):
23 |     global account_id
24 | 
25 |     if account_id == 0:
26 |         account_id = session.client('sts').get_caller_identity()['Account']
27 | 
28 |     return account_id
29 | 
30 | #
31 | # Submit a query to Athena; return the query ID
32 | #
33 | def submit_query(query, database, session):
34 |     output_location = 's3://aws-athena-query-results-' + str(get_aws_account_id(session)) + "-" + session.region_name
35 |     query_id = None
36 |     response = None
37 | 
38 |     client = session.client('athena')
39 | 
40 |     try:
41 |         response = client.start_query_execution(
42 |             QueryString=query,
43 |             QueryExecutionContext={
44 |                 'Database': database
45 |             },
46 |             ResultConfiguration={
47 |                 'OutputLocation': output_location,
48 |                 'EncryptionConfiguration': {
49 |                     'EncryptionOption': 'SSE_S3'
50 |                 }
51 |             }
52 |         )
53 |     except Exception as e:
54 |         print("Error submitting query to Athena " + query + " (" + str(e) + ")")
55 | 
56 |     if response:
57 |         if response['ResponseMetadata']['HTTPStatusCode'] == 200:
58 |             print("Athena Query submitted successfully")
59 |             query_id = response['QueryExecutionId']
60 |         else:
61 |             print("The response code was " + str(response['ResponseMetadata']['HTTPStatusCode']))
62 | 
63 |     return query_id
64 | 
65 | #
66 | # Poll an existing query already submitted to Athena
67 | #
68 | def wait_for_query_to_complete(query_id, session):
69 |     status = True
70 |     client = session.client('athena')
71 | 
72 |     is_query_still_running = True
73 |     while is_query_still_running:
74 |         response = None
75 |         try:
76 |             response = client.get_query_execution(
77 |                 QueryExecutionId=query_id
78 |             )
79 |         except Exception as e:
80 |             print("Error getting query execution for " + query_id + " (" + str(e) + ")")
81 |             status = is_query_still_running = False
82 | 
83 |         if response and status:
84 |             query_state = response['QueryExecution']['Status']['State']
85 | 
86 |             if query_state == 'FAILED':
87 |                 is_query_still_running = False
88 | 
89 |                 if 'AlreadyExistsException' in response['QueryExecution']['Status'].get('StateChangeReason', ''):
90 |                     print("Table partition already exists")
91 |                     status = True
92 |                 else:
93 |                     print("Athena query " + query_id + " failed")
94 |                     status = False
95 |             elif query_state == 'CANCELLED':
96 |                 print("Athena query " + query_id + " was cancelled")
97 |                 is_query_still_running = False
98 |                 status = False
99 |             elif query_state == 'SUCCEEDED':
100 |                 print("Athena query " + query_id + " completed successfully")
101 |                 is_query_still_running = False
102 |                 status = True
103 |             else:
104 |                 time.sleep(1)
105 | 
106 |     return status
107 | 
108 | #
109 | # Get the results from a query that has executed
110 | #
111 | def get_query_results(query_id, session, header_row=True):
112 |     status = True
113 |     results = []
114 |     skip_row_count = 1
115 | 
116 |     if not header_row:
117 |         skip_row_count = 0
118 | 
119 |     client = session.client('athena')
120 | 
121 |     try:
122 |         results_paginator = client.get_paginator('get_query_results')
123 |         results_iter = results_paginator.paginate(QueryExecutionId=query_id)
124 | 
125 |         data_list = []
126 | 
127 |         for results_page in results_iter:
128 |             for row in results_page['ResultSet']['Rows']:
129 |                 data_list.append(row['Data'])
130 | 
131 |         for datum in data_list[skip_row_count:]:
132 |             results.append([x['VarCharValue'] for x in datum])
133 | 
134 |     except ClientError as e:
135 |         print("Unexpected error getting query results: " + e.response['Error']['Code'])
136 | 
137 |     return [tuple(x) for x in results]
138 | 
139 | #
140 | # Query an Athena table to get the existing partitions
141 | #
142 | def get_existing_db_partitions(session, database, table_name):
143 |     print("Loading the existing table partitions")
144 |     query_results = []
145 |     partitions = []
146 | 
147 |     get_partitions_sql = "SHOW PARTITIONS " + table_name
148 | 
149 |     query_id = submit_query(get_partitions_sql, database, session)
150 | 
151 |     if query_id:
152 |         if wait_for_query_to_complete(query_id, session):
153 |             query_results = get_query_results(query_id, session, False)
154 |         else:
155 |             print("ERROR running query to get existing partitions")
156 | 
157 |     # Query results come back as tuples but we only care about the first value for partitions
158 |     for part_info in query_results:
159 |         partitions.append(part_info[0])
160 | 
161 |     return partitions
162 | 
163 | #
164 | # Add a new partition to an Athena table
165 | #
166 | def add_partition(session, database, table_name, partition, bucket):
167 |     current_key = 0
168 |     status = False
169 | 
170 |     sql = 'ALTER TABLE ' + table_name + ' ADD PARTITION ('
171 | 
172 |     partition_key_vals = partition.split('/')
173 |     # Filter out any prefix dirs from the key
174 |     partition_key_vals = [p for p in partition_key_vals if "=" in p]
175 |     partition_key_count = len(partition_key_vals)
176 | 
177 |     for part in partition_key_vals:
178 |         current_key += 1
179 |         key, val = part.split('=')
180 | 
181 |         sql += key + " = '" + val + "'"
182 | 
183 |         if current_key != partition_key_count:
184 |             sql += ", "
185 |         else:
186 |             sql += ") "
187 | 
188 |     sql += "LOCATION 's3://" + bucket + "/" + partition + "';"
189 | 
190 |     print("Running sql: " + sql)
191 | 
192 |     query_id = submit_query(sql, database, session)
193 | 
194 |     if query_id:
195 |         if wait_for_query_to_complete(query_id, session):
196 |             status = True
197 |         else:
198 |             print("ERROR running query to add new partition")
199 |             status = False
200 | 
201 |     return status
202 | 
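#
# Note: add_partition() above assumes Hive-style "key=value" folder names in the S3 key.
# For example (illustrative values only), an object landing at
#   destination_platform_id=shopify/date=2019-01-01/events-0001.json.gz
# produces the partition spec (destination_platform_id = 'shopify', date = '2019-01-01')
# with a LOCATION of s3://<bucket>/destination_platform_id=shopify/date=2019-01-01.
#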
203 | #
204 | # Write the list of table partitions to a cache file
205 | #
206 | def write_partition_cache(partitions, filename):
207 |     with open(filename, 'w') as outfile:
208 |         json.dump(partitions, outfile)
209 | 
210 | #
211 | # Load the list of partitions from the cache file
212 | #
213 | def load_partition_cache(filename):
214 |     data = dict()
215 | 
216 |     with open(filename) as json_file:
217 |         data = json.load(json_file)
218 | 
219 |     return data
220 | 
221 | #
222 | # Does an S3 key contain all the partition name keys?
223 | #
224 | def partition_name_in_key(key, partition_keys):
225 |     key_contains_all_partitions = False
226 |     keys_found = 0
227 |     partition_key_count = len(partition_keys)
228 | 
229 |     for part_name in partition_keys:
230 |         if part_name in key:
231 |             keys_found += 1
232 | 
233 |     if keys_found == partition_key_count:
234 |         key_contains_all_partitions = True
235 | 
236 |     return key_contains_all_partitions
237 | 
238 | 
239 | # --------------- Main handler ------------------
240 | def lambda_handler(event, context):
241 |     '''
242 |     Loads an Athena partition if it is not already loaded
243 |     '''
244 | 
245 |     partition_cache_file = '/tmp/partitions'
246 | 
247 |     database = os.environ['ATHENA_DATABASE']
248 |     table_name = os.environ['ATHENA_TABLE']
249 |     partition_keys = os.environ['PARTITION_KEYS'].split(',')
250 | 
251 |     # Log the received event locally.
252 |     #print("Received event: " + json.dumps(event, indent=2))
253 | 
254 |     # Get the object from the event.
255 |     bucket = event['Records'][0]['s3']['bucket']['name']
256 |     s3_key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key']).rstrip('/')
257 |     size = event['Records'][0]['s3']['object']['size']
258 | 
259 |     print("The S3 key is " + s3_key)
260 | 
261 |     # Do we have a cached partition list?
262 |     if not os.path.isfile(partition_cache_file):
263 |         print("No partition cache file exists - creating it")
264 |         existing_parts = get_existing_db_partitions(session, database, table_name)
265 |         write_partition_cache(existing_parts, partition_cache_file)
266 |     else:
267 |         print("Partition cache file exists - loading it")
268 |         existing_parts = load_partition_cache(partition_cache_file)
269 | 
270 |     # Now from the event, do we have this partition?
271 | 
272 |     # This will handle when we get data in the folder but ignore events for just the folder itself
273 |     dirname = os.path.dirname(s3_key)
274 | 
275 |     # If we've removed the filename and we still have all the keys
276 |     # then this is a valid partition
277 |     if partition_name_in_key(dirname, partition_keys):
278 |         print("Incoming event contains all of the partition keys")
279 | 
280 |         # is this in the cache?
281 |         if dirname in existing_parts:
282 |             print("A partition already exists for " + dirname)
283 |         else:
284 |             # We are ok if multiple lambdas try and add the same partition - it will fail
285 |             # and we catch it when we get the results.
286 |             add_partition(session, database, table_name, dirname, bucket)
287 | 
288 |             # Refresh the cache
289 |             existing_parts = get_existing_db_partitions(session, database, table_name)
290 |             write_partition_cache(existing_parts, partition_cache_file)
291 | 
292 |     return 'Success'
293 | 
--------------------------------------------------------------------------------