├── src ├── __init__.py └── app.py ├── .github └── CODEOWNERS ├── LICENSE ├── SampleEvent.json ├── .gitignore ├── README.md ├── deploy.sh └── template.yaml /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Each line is a file pattern followed by one or more owners. 2 | 3 | # These owners will be the default owners for everything in 4 | # the repo. Unless a later match takes precedence, members of 5 | # @rewindio/codeowners will be requested for review when someone 6 | # opens a pull request. 7 | * @dnorth98 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Rewind 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /SampleEvent.json: -------------------------------------------------------------------------------- 1 | { 2 | "Records": [ 3 | { 4 | "eventVersion": "2.0", 5 | "eventName": "ObjectCreated:Put", 6 | "eventTime": "1970-01-01T00:00:00.000Z", 7 | "userIdentity": { 8 | "principalId": "EXAMPLE" 9 | }, 10 | "eventSource": "aws:s3", 11 | "requestParameters": { 12 | "sourceIPAddress": "127.0.0.1" 13 | }, 14 | "s3": { 15 | "configurationId": "testConfigRule", 16 | "object": { 17 | "eTag": "1c43a0c9dcc31572b5e49c0b42f8b17f", 18 | "key": "INSERT FILENAME HERE (e.g. receipt.png)", 19 | "sequencer": "0A1B2C3D4E5F678901", 20 | "size": 1024 21 | }, 22 | "bucket": { 23 | "ownerIdentity": { 24 | "principalId": "EXAMPLE" 25 | }, 26 | "name": "INSERT BUCKET NAME HERE (e.g. aws-sam-ocr-sourceimagebucket-123122312)", 27 | "arn": "arn:aws:s3:::INSERT BUCKET NAME HERE (e.g. 
aws-sam-ocr-sourceimagebucket-123122312)"
28 |         },
29 |         "s3SchemaVersion": "1.0"
30 |       },
31 |       "responseElements": {
32 |         "x-amz-id-2": "EXAMPLE123/5678abcdefghijklambdaisawesome/mnopqrstuvwxyzABCDEFGH",
33 |         "x-amz-request-id": "EXAMPLE123456789"
34 |       },
35 |       "awsRegion": "us-east-1"
36 |     }
37 |   ]
38 | }
39 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # SAM
107 | packaged.yaml
108 | 
109 | # test events
110 | src/*.json
111 | 
112 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # aws-athena-partition-autoloader
2 | Automatically adds new partitions detected in S3 to an existing Athena table
3 | 
4 | # Purpose
5 | Athena is fantastic for querying data in S3 and works especially well when the data is partitioned. The problem is that when a table has a lot of partitions, refreshing them all with the `MSCK REPAIR TABLE` command can take a long time.
6 | 
7 | This solution subscribes to S3 events on a bucket, detects when a new partition is created, and loads only that partition into Athena. It uses a cache of the existing partitions to minimize the number of Athena calls needed to query the partition list.
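Instead of rescanning the whole table, the function registers just the partition that triggered the event. As a rough sketch of the difference (the table name, bucket and partition keys below are taken from the deploy example later in this README; the partition values are made up):

```
-- Rescan everything Athena doesn't know about yet (slow on large tables)
MSCK REPAIR TABLE api_audit_log;

-- What this function runs instead: add only the partition that just arrived
ALTER TABLE api_audit_log ADD PARTITION (destination_platform_id = 'shopify', date = '2019-01-01')
LOCATION 's3://stage-audit-log/destination_platform_id=shopify/date=2019-01-01';
```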
8 | 
9 | # Installing and Configuring
10 | 
11 | ## AWS Setup
12 | 
13 | ## Deploying to AWS
14 | Before starting, you will need:
15 | * The [AWS CLI](https://aws.amazon.com/cli/) installed and default credentials configured
16 | * The [AWS SAM CLI](https://github.com/awslabs/aws-sam-cli) installed
17 | * An existing S3 bucket where SAM can upload the packaged AWS Lambda code
18 | * An existing Athena table backed by content in S3 with at least one partition key
19 | * This repo cloned
20 | 
21 | 1. Run the *deploy.sh* script, passing the arguments in this order:
22 | 
23 | ```
24 | ./deploy.sh <stack_name> <stack_region> <athena_region> <operation> <deploy_bucket> <content_bucket> <athena_results_bucket> <athena_database> <athena_table> <partition_keys> <aws_profile>
25 | ```
26 | 
27 | For example:
28 | 
29 | ```
30 | ./deploy.sh athena_loader_mytable eu-west-1 us-east-1 ALL lambda-sam-staging stage-audit-log aws-athena-query-results-123456789-us-east-1 audit_log_db api_audit_log 'destination_platform_id,date' staging
31 | ```
32 | 
33 | The list of partition keys must exactly match the partition keys defined on the table.
34 | 
35 | deploy.sh uses AWS SAM to package the AWS Lambda function and then deploys it to AWS. Everything is deployed as a CloudFormation stack in the specified region.
36 | 
37 | | NOTE: If you don't have SAM installed, you can replace the SAM commands in the deploy script with `aws cloudformation package ...` and `aws cloudformation deploy ...` instead |
38 | | --- |
--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | STACK_NAME=$1
4 | STACK_REGION=$2
5 | ATHENA_REGION=$3
6 | OPERATION=$4
7 | DEPLOY_BUCKET=$5
8 | CONTENT_BUCKET=$6
9 | ATHENA_RESULTS_BUCKET=$7
10 | ATHENA_DATABASE=$8
11 | ATHENA_TABLE=$9
12 | PARTITION_KEYS=${10}
13 | PROFILE=${11}
14 | 
15 | # No longer hard-coded since we need different functions for different tables
16 | # STACK_NAME=aws-athena-partition-autoloader
17 | 
18 | if [ "${OPERATION}" == "ALL" ]; then
19 |   echo "Packaging using SAM....."
20 |   sam package \
21 |     --template-file template.yaml \
22 |     --output-template-file packaged.yaml \
23 |     --s3-bucket ${DEPLOY_BUCKET} \
24 |     --region ${STACK_REGION} \
25 |     --profile ${PROFILE}
26 | 
27 |   echo "Deploying using SAM...."
28 |   sam deploy \
29 |     --template-file packaged.yaml \
30 |     --stack-name ${STACK_NAME} \
31 |     --capabilities CAPABILITY_IAM \
32 |     --parameter-overrides S3Bucket=${CONTENT_BUCKET} AthenaRegion=${ATHENA_REGION} AthenaResultsBucket=${ATHENA_RESULTS_BUCKET} AthenaDatabase=${ATHENA_DATABASE} AthenaTable=${ATHENA_TABLE} PartitionKeys=${PARTITION_KEYS} \
33 |     --region ${STACK_REGION} \
34 |     --profile ${PROFILE}
35 | fi
36 | 
37 | # SAM only allows subscribing Lambdas to events for buckets created in the same template
38 | # Existing buckets cannot be used so we do this to subscribe an existing bucket to the new
39 | # functions.
See : https://github.com/awslabs/serverless-application-model/issues/124 40 | 41 | AWS_ACCOUNT_ID=$(aws sts get-caller-identity \ 42 | --query 'Account' \ 43 | --output text \ 44 | --region ${STACK_REGION} \ 45 | --profile ${PROFILE} 46 | ) 47 | echo "Our AWS account ID is ${AWS_ACCOUNT_ID}" 48 | 49 | FUNCTION_NAME=$(aws cloudformation describe-stacks \ 50 | --stack-name ${STACK_NAME} \ 51 | --query 'Stacks[].Outputs[].OutputValue' \ 52 | --output text \ 53 | --region ${STACK_REGION} \ 54 | --profile ${PROFILE} 55 | ) 56 | echo "The Lambda function name is ${FUNCTION_NAME}" 57 | 58 | FUNCTION_ARN=$(aws lambda get-function \ 59 | --function-name ${FUNCTION_NAME} \ 60 | --query 'Configuration.FunctionArn' \ 61 | --output text \ 62 | --region ${STACK_REGION} \ 63 | --profile ${PROFILE} 64 | ) 65 | echo "The Lambda function ARN is ${FUNCTION_ARN}" 66 | 67 | # Allow the lambda to receive events from S3 68 | echo "Adding Lambda invoke permissions..." 69 | 70 | aws lambda add-permission \ 71 | --function-name ${FUNCTION_NAME} \ 72 | --region ${STACK_REGION} \ 73 | --profile ${PROFILE} \ 74 | --statement-id "s3perms-${CONTENT_BUCKET}" \ 75 | --action "lambda:InvokeFunction" \ 76 | --principal s3.amazonaws.com \ 77 | --source-arn arn:aws:s3:::${CONTENT_BUCKET} \ 78 | --source-account ${AWS_ACCOUNT_ID} > /dev/null 2>&1 79 | 80 | # Subscribe the lambda to S3 events for our specific bucket 81 | S3_LAMBDA_EVENT_SUBSCRIPTION="{\"LambdaFunctionConfigurations\":[{\"LambdaFunctionArn\":\"${FUNCTION_ARN}\",\"Events\":[\"s3:ObjectCreated:*\"]}]}" 82 | 83 | echo "Adding AWS event subscription for bucket ${CONTENT_BUCKET}" 84 | aws s3api put-bucket-notification-configuration \ 85 | --bucket ${CONTENT_BUCKET} \ 86 | --notification-configuration ${S3_LAMBDA_EVENT_SUBSCRIPTION} \ 87 | --region ${STACK_REGION} \ 88 | --profile ${PROFILE} -------------------------------------------------------------------------------- /template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: 'AWS::Serverless-2016-10-31' 3 | 4 | Description: SAM app that automatically loads Athena partitions for new partitions in S3 5 | 6 | Parameters: 7 | S3Bucket: 8 | Type: String 9 | AthenaResultsBucket: 10 | Type: String 11 | AthenaRegion: 12 | Type: String 13 | AthenaDatabase: 14 | Type: String 15 | AthenaTable: 16 | Type: String 17 | PartitionKeys: 18 | Type: String 19 | 20 | Resources: 21 | LambdaRole: 22 | Type: AWS::IAM::Role 23 | Properties: 24 | Path: "/" 25 | AssumeRolePolicyDocument: 26 | Version: "2012-10-17" 27 | Statement: 28 | - 29 | Sid: "AllowLambdaServiceToAssumeRole" 30 | Effect: "Allow" 31 | Action: 32 | - "sts:AssumeRole" 33 | Principal: 34 | Service: 35 | - "lambda.amazonaws.com" 36 | ManagedPolicyArns: 37 | - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole 38 | Policies: 39 | - 40 | PolicyName: "AthenaPartitionLoader" 41 | PolicyDocument: 42 | Version: '2012-10-17' 43 | Statement: 44 | - Effect: Allow 45 | Action: 46 | - 's3:Get*' 47 | - 's3:List*' 48 | - 's3:Put*' 49 | Resource: 50 | - !Sub arn:aws:s3:::${AthenaResultsBucket} 51 | - !Sub arn:aws:s3:::${AthenaResultsBucket}/* 52 | 53 | - Effect: Allow 54 | Action: 55 | - 's3:Get*' 56 | - 's3:ListBucket' 57 | Resource: 58 | - !Sub arn:aws:s3:::${S3Bucket} 59 | - !Sub arn:aws:s3:::${S3Bucket}/* 60 | 61 | - Effect: Allow 62 | Action: 63 | - 'athena:RunQuery' 64 | - 'athena:StartQueryExecution' 65 | - 'athena:GetQueryResultsStream' 66 | - 'athena:StopQueryExecution' 
67 |                   - 'athena:GetQueryExecution'
68 |                   - 'athena:GetQueryResults'
69 |                   - 'athena:GetQueryExecutions'
70 |                   - 'athena:GetTables'
71 |                   - 'athena:GetTable'
72 |                   - 'glue:GetDatabase'
73 |                   - 'glue:GetTable'
74 |                   - 'glue:GetTables'
75 |                   - 'glue:GetPartition'
76 |                   - 'glue:GetPartitions'
77 |                   - 'glue:BatchCreatePartition'
78 |                 Resource:
79 |                   - '*'
80 | 
81 |   AthenaPartitionLoader:
82 |     Type: 'AWS::Serverless::Function'
83 |     Properties:
84 |       Handler: src/app.lambda_handler
85 |       Runtime: python3.6
86 |       CodeUri: .
87 |       Description: Automatically loads Athena partitions for new partitions in S3.
88 |       MemorySize: 128
89 |       Timeout: 900
90 |       Environment:
91 |         Variables:
92 |           ATHENA_REGION: !Ref AthenaRegion
93 |           ATHENA_DATABASE: !Ref AthenaDatabase
94 |           ATHENA_TABLE: !Ref AthenaTable
95 |           PARTITION_KEYS: !Ref PartitionKeys
96 |       Role: !GetAtt LambdaRole.Arn
97 | 
98 | Outputs:
99 |   InvalidatorFunctionName:
100 |     Description: The name of the lambda Function
101 |     Value: !Ref AthenaPartitionLoader
102 | 
--------------------------------------------------------------------------------
/src/app.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import boto3
3 | from botocore.exceptions import ClientError
4 | import json
5 | import urllib.parse
6 | import time
7 | import os
8 | import os.path
9 | from pprint import pprint
10 | 
11 | # Global holding the AWS account ID this is executing in
12 | account_id = 0
13 | 
14 | # The bucket could be in one region with the Athena DB being in a separate region
15 | athena_region = os.environ['ATHENA_REGION']
16 | 
17 | session = boto3.session.Session(region_name = athena_region)
18 | 
19 | #
20 | # Get the current AWS account ID
21 | #
22 | def get_aws_account_id(session):
23 |     global account_id
24 | 
25 |     if account_id == 0:
26 |         account_id = session.client('sts').get_caller_identity()['Account']
27 | 
28 |     return account_id
29 | 
30 | #
31 | # Submit a query to Athena; return the query ID
32 | #
33 | def submit_query(query, database, session):
34 |     output_location = 's3://aws-athena-query-results-' + str(get_aws_account_id(session)) + "-" + session.region_name
35 |     query_id = None
36 |     response = None
37 | 
38 |     client = session.client('athena')
39 | 
40 |     try:
41 |         response = client.start_query_execution(
42 |             QueryString=query,
43 |             QueryExecutionContext={
44 |                 'Database': database
45 |             },
46 |             ResultConfiguration={
47 |                 'OutputLocation': output_location,
48 |                 'EncryptionConfiguration': {
49 |                     'EncryptionOption': 'SSE_S3'
50 |                 }
51 |             }
52 |         )
53 |     except Exception as e:
54 |         print("Error submitting query to Athena " + query + " (" + str(e) + ")")
55 | 
56 |     if response:
57 |         if response['ResponseMetadata']['HTTPStatusCode'] == 200:
58 |             print("Athena Query submitted successfully")
59 |             query_id = response['QueryExecutionId']
60 |         else:
61 |             print("The response code was " + str(response['ResponseMetadata']['HTTPStatusCode']))
62 | 
63 |     return query_id
64 | 
65 | #
66 | # Poll an existing query already submitted to Athena
67 | #
68 | def wait_for_query_to_complete(query_id, session):
69 |     status = True
70 |     client = session.client('athena')
71 | 
72 |     is_query_still_running = True
73 |     while is_query_still_running:
74 |         response = None
75 |         try:
76 |             response = client.get_query_execution(
77 |                 QueryExecutionId=query_id
78 |             )
79 |         except Exception as e:
80 |             print("Error getting query execution for " + query_id + " (" + str(e) + ")")
81 |             status = is_query_still_running = False
82 | 
83 |         if response and status:
84 |             query_state = response['QueryExecution']['Status']['State']
85 | 
86 |             if query_state == 'FAILED':
87 |                 is_query_still_running = False
88 | 
89 |                 if 'AlreadyExistsException' in response['QueryExecution']['Status'].get('StateChangeReason', ''):
90 |                     print("Table partition already exists")
91 |                     status = True
92 |                 else:
93 |                     print("Athena query " + query_id + " failed")
94 |                     status = False
95 |             elif query_state == 'CANCELLED':
96 |                 print("Athena query " + query_id + " was cancelled")
97 |                 is_query_still_running = False
98 |                 status = False
99 |             elif query_state == 'SUCCEEDED':
100 |                 print("Athena query " + query_id + " completed successfully")
101 |                 is_query_still_running = False
102 |                 status = True
103 |             else:
104 |                 time.sleep(1)
105 | 
106 |     return status
107 | 
108 | #
109 | # Get the results from a query that has executed
110 | #
111 | def get_query_results(query_id, session, header_row=True):
112 |     status = True
113 |     results = []
114 |     skip_row_count = 1
115 | 
116 |     if not header_row:
117 |         skip_row_count = 0
118 | 
119 |     client = session.client('athena')
120 | 
121 |     try:
122 |         results_paginator = client.get_paginator('get_query_results')
123 |         results_iter = results_paginator.paginate(QueryExecutionId=query_id)
124 | 
125 |         data_list = []
126 | 
127 |         for results_page in results_iter:
128 |             for row in results_page['ResultSet']['Rows']:
129 |                 data_list.append(row['Data'])
130 | 
131 |         for datum in data_list[skip_row_count:]:
132 |             results.append([x['VarCharValue'] for x in datum])
133 | 
134 |     except ClientError as e:
135 |         print("Unexpected error getting query results: " + e.response['Error']['Code'])
136 | 
137 |     return [tuple(x) for x in results]
138 | 
139 | #
140 | # Query an Athena table to get the existing partitions
141 | #
142 | def get_existing_db_partitions(session, database, table_name):
143 |     print("Loading the existing table partitions")
144 |     query_results = []
145 |     partitions = []
146 | 
147 |     get_partitions_sql = "SHOW PARTITIONS " + table_name
148 | 
149 |     query_id = submit_query(get_partitions_sql, database, session)
150 | 
151 |     if query_id:
152 |         if wait_for_query_to_complete(query_id, session):
153 |             query_results = get_query_results(query_id, session, False)
154 |         else:
155 |             print("ERROR running query to get existing partitions")
156 | 
157 |     # Query results come back as tuples but we only care about the first value for partitions
158 |     for part_info in query_results:
159 |         partitions.append(part_info[0])
160 | 
161 |     return partitions
162 | 
163 | #
164 | # Add a new partition to an Athena table
165 | #
166 | def add_partition(session, database, table_name, partition, bucket):
167 |     current_key = 0
168 |     status = False
169 | 
170 |     sql = 'ALTER TABLE ' + table_name + ' ADD PARTITION ('
171 | 
172 |     partition_key_vals = partition.split('/')
173 |     # Filter out any prefix dirs from the key
174 |     partition_key_vals = [p for p in partition_key_vals if "=" in p]
175 |     partition_key_count = len(partition_key_vals)
176 | 
177 |     for part in partition_key_vals:
178 |         current_key += 1
179 |         key, val = part.split('=')
180 | 
181 |         sql += key + " = '" + val + "'"
182 | 
183 |         if current_key != partition_key_count:
184 |             sql += ", "
185 |         else:
186 |             sql += ") "
187 | 
188 |     sql += "LOCATION 's3://" + bucket + "/" + partition + "';"
189 | 
190 |     print("Running sql: " + sql)
191 | 
192 |     query_id = submit_query(sql, database, session)
193 | 
194 |     if query_id:
195 |         if wait_for_query_to_complete(query_id, session):
196 |             status = True
197 |         else:
198 |             print("ERROR running query to add new partition")
199 |             status = False
200 | 
201 |     return status
202 | 
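#
# Note: add_partition() above assumes Hive-style "key=value" folder names in the S3 key.
# For example (illustrative values only), an object landing at
#   destination_platform_id=shopify/date=2019-01-01/events-0001.json.gz
# produces the partition spec (destination_platform_id = 'shopify', date = '2019-01-01')
# with a LOCATION of s3://<bucket>/destination_platform_id=shopify/date=2019-01-01.
#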
203 | #
204 | # Write the list of table partitions to a cache file
205 | #
206 | def write_partition_cache(partitions, filename):
207 |     with open(filename, 'w') as outfile:
208 |         json.dump(partitions, outfile)
209 | 
210 | #
211 | # Load the list of partitions from the cache file
212 | #
213 | def load_partition_cache(filename):
214 |     data = dict()
215 | 
216 |     with open(filename) as json_file:
217 |         data = json.load(json_file)
218 | 
219 |     return data
220 | 
221 | #
222 | # Does an S3 key contain all the partition name keys?
223 | #
224 | def partition_name_in_key(key, partition_keys):
225 |     key_contains_all_partitions = False
226 |     keys_found = 0
227 |     partition_key_count = len(partition_keys)
228 | 
229 |     for part_name in partition_keys:
230 |         if part_name in key:
231 |             keys_found += 1
232 | 
233 |     if keys_found == partition_key_count:
234 |         key_contains_all_partitions = True
235 | 
236 |     return key_contains_all_partitions
237 | 
238 | 
239 | # --------------- Main handler ------------------
240 | def lambda_handler(event, context):
241 |     '''
242 |     Loads an Athena partition if it is not already loaded
243 |     '''
244 | 
245 |     partition_cache_file = '/tmp/partitions'
246 | 
247 |     database = os.environ['ATHENA_DATABASE']
248 |     table_name = os.environ['ATHENA_TABLE']
249 |     partition_keys = os.environ['PARTITION_KEYS'].split(',')
250 | 
251 |     # Log the received event locally.
252 |     #print("Received event: " + json.dumps(event, indent=2))
253 | 
254 |     # Get the object from the event.
255 |     bucket = event['Records'][0]['s3']['bucket']['name']
256 |     s3_key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key']).rstrip('/')
257 |     size = event['Records'][0]['s3']['object']['size']
258 | 
259 |     print("The S3 key is " + s3_key)
260 | 
261 |     # Do we have a cached partition list?
262 |     if not os.path.isfile(partition_cache_file):
263 |         print("No partition cache file exists - creating it")
264 |         existing_parts = get_existing_db_partitions(session, database, table_name)
265 |         write_partition_cache(existing_parts, partition_cache_file)
266 |     else:
267 |         print("Partition cache file exists - loading it")
268 |         existing_parts = load_partition_cache(partition_cache_file)
269 | 
270 |     # Now from the event, do we have this partition?
271 | 
272 |     # This will handle when we get data in the folder but ignore events for just the folder itself
273 |     dirname = os.path.dirname(s3_key)
274 | 
275 |     # If we've removed the filename and we still have all the keys
276 |     # then this is a valid partition
277 |     if partition_name_in_key(dirname, partition_keys):
278 |         print("Incoming event contains all of the partition keys")
279 | 
280 |         # is this in the cache?
281 |         if dirname in existing_parts:
282 |             print("A partition already exists for " + dirname)
283 |         else:
284 |             # We are ok if multiple lambdas try and add the same partition - it will fail
285 |             # and we catch it when we get the results.
286 |             add_partition(session, database, table_name, dirname, bucket)
287 | 
288 |             # Refresh the cache
289 |             existing_parts = get_existing_db_partitions(session, database, table_name)
290 |             write_partition_cache(existing_parts, partition_cache_file)
291 | 
292 |     return 'Success'
293 | 
--------------------------------------------------------------------------------