├── .gitignore
├── README.md
├── cloud_formation
│   └── canvas_data_aws.yaml
└── lambda
    ├── README.md
    ├── build.sh
    ├── fetch-canvas-data-file.py
    ├── requirements.txt
    └── sync-canvas-data-files.py

/.gitignore:
--------------------------------------------------------------------------------
1 | lambda/*.zip
2 | lambda/package
3 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Canvas Data on AWS
2 | 
3 | This repository contains the artifacts necessary to create a Canvas Data warehouse on AWS. See the [tutorial page](https://github.com/Harvard-University-iCommons/canvas-data-aws/wiki/Tutorial) for detailed instructions on how to set up your own environment.
4 | 
5 | ## CloudFormation template
6 | 
7 | A CloudFormation template, `cloud_formation/canvas_data_aws.yaml`, creates all of the AWS infrastructure components needed to build the warehouse (see the template for details).
8 | 
9 | ## Lambda functions
10 | 
11 | The Python code for two Lambda functions is included in the `lambda` directory. This code is used by the CloudFormation template.
12 | 
--------------------------------------------------------------------------------
/cloud_formation/canvas_data_aws.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | AWSTemplateFormatVersion: "2010-09-09"
3 | 
4 | Description:
5 |   This template creates AWS resources necessary to retrieve and store Canvas Data extracts. It also creates a Glue Data Catalog database.
6 | 
7 | Parameters:
8 |   EnvironmentParameter:
9 |     Type: String
10 |     Default: prod
11 |     Description: The environment name.
12 |     AllowedValues: [dev, qa, prod]
13 |   LambdaFunctionZipFileKeyParameter:
14 |     Type: String
15 |     Default: "canvas-data-aws/canvas-data-lambda-05272019-173758.zip"
16 |     Description: The S3 key (file path) where the Lambda function package zip file is stored. If you've packaged your own version of the functions, replace this with your own zip file.
17 |   LambdaFunctionBucketParameter:
18 |     Type: String
19 |     Default: "huit-at-public-build-artifacts"
20 |     Description: The S3 bucket where the Lambda function package is stored. If you've packaged your own version of the functions, replace this with your own bucket name.
21 |   ApiKeyParameter:
22 |     Type: String
23 |     Description: Your Canvas Data API Key
24 |   ApiSecretParameter:
25 |     Type: String
26 |     Description: Your Canvas Data API Secret
27 |   ApiSecretsManagerParameter:
28 |     Type: String
29 |     Description: An optional Secrets Manager secret that contains the api_key and api_secret (alternative to ApiKeyParameter and ApiSecretParameter).
30 |     Default: ""
31 |   EmailAddressParameter:
32 |     Type: String
33 |     Description: Your email address. This will be used to send you notifications about the success or failure of the data synchronization process.
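# Example: once the Lambda package is available in S3, a stack can be created from this
# template with the AWS CLI. This is only a sketch: the stack name, email address and
# credential values below are placeholders, and you would normally supply either the
# API key/secret pair or the Secrets Manager secret name rather than both.
#
#   aws cloudformation create-stack \
#     --stack-name canvas-data-aws \
#     --template-body file://cloud_formation/canvas_data_aws.yaml \
#     --capabilities CAPABILITY_NAMED_IAM \
#     --parameters ParameterKey=EnvironmentParameter,ParameterValue=prod \
#                  ParameterKey=ApiKeyParameter,ParameterValue=<your-api-key> \
#                  ParameterKey=ApiSecretParameter,ParameterValue=<your-api-secret> \
#                  ParameterKey=EmailAddressParameter,ParameterValue=you@example.edu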
34 | 
35 | Conditions:
36 |   HasApiSecretsManager: !And
37 |     - !Equals [ !Ref ApiKeyParameter, "" ]
38 |     - !Equals [ !Ref ApiSecretParameter, "" ]
39 |     - !Not [ !Equals [ !Ref ApiSecretsManagerParameter, "" ] ]
40 | 
41 | Resources:
42 | 
43 |   S3Bucket:
44 |     Type: AWS::S3::Bucket
45 |     Properties:
46 |       BucketName: !Join
47 |         - "-"
48 |         - - "canvas-data-warehouse"
49 |           - !Ref AWS::AccountId
50 |           - !Ref EnvironmentParameter
51 |       PublicAccessBlockConfiguration:
52 |         BlockPublicAcls: True
53 |         BlockPublicPolicy: True
54 |         IgnorePublicAcls: True
55 |         RestrictPublicBuckets: True
56 |       BucketEncryption:
57 |         ServerSideEncryptionConfiguration:
58 |           - ServerSideEncryptionByDefault:
59 |               SSEAlgorithm: AES256
60 | 
61 |   GlueDatabase:
62 |     Type: AWS::Glue::Database
63 |     Properties:
64 |       CatalogId: !Ref AWS::AccountId
65 |       DatabaseInput:
66 |         Name: !Join ["_", ["canvasdata", !Ref EnvironmentParameter]]
67 | 
68 |   SNSTopic:
69 |     Type: AWS::SNS::Topic
70 |     Properties:
71 |       DisplayName: !Join ["-", ["canvas-data-sync", !Ref EnvironmentParameter]]
72 | 
73 |   SNSSubscription:
74 |     Type: AWS::SNS::Subscription
75 |     Properties:
76 |       Endpoint: !Ref EmailAddressParameter
77 |       Protocol: email
78 |       TopicArn: !Ref SNSTopic
79 | 
80 |   FetchLambdaFunctionRole:
81 |     Type: AWS::IAM::Role
82 |     Properties:
83 |       RoleName: !Join ["-", ["canvas-data-fetch-lambda-role", !Ref EnvironmentParameter]]
84 |       AssumeRolePolicyDocument:
85 |         Version: "2012-10-17"
86 |         Statement:
87 |           -
88 |             Effect: "Allow"
89 |             Principal:
90 |               Service:
91 |                 - "lambda.amazonaws.com"
92 |             Action:
93 |               - "sts:AssumeRole"
94 |       ManagedPolicyArns:
95 |         - "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
96 |       Policies:
97 |         - PolicyName: !Join ["-", ["canvas-data-fetch-lambda-policy", !Ref EnvironmentParameter]]
98 |           PolicyDocument:
99 |             Version: "2012-10-17"
100 |             Statement:
101 |               -
102 |                 Effect: "Allow"
103 |                 Action:
104 |                   - "s3:ListBucket"
105 |                 Resource: !GetAtt S3Bucket.Arn
106 |               -
107 |                 Effect: "Allow"
108 |                 Action:
109 |                   - "s3:GetObject"
110 |                   - "s3:PutObject"
111 |                   - "s3:DeleteObject"
112 |                 Resource: !Join ["/", [!GetAtt S3Bucket.Arn, "*"]]
113 | 
114 |   FetchLambdaFunction:
115 |     Type: AWS::Lambda::Function
116 |     Properties:
117 |       Description: "Lambda function that fetches a file from a URL and stores it in an S3 bucket"
118 |       FunctionName: !Join ["-", ["canvas-data-fetch", !Ref EnvironmentParameter]]
119 |       Handler: "fetch-canvas-data-file.lambda_handler"
120 |       Role: !GetAtt FetchLambdaFunctionRole.Arn
121 |       Runtime: "python3.7"
122 |       Environment:
123 |         Variables:
124 |           ENV: !Ref EnvironmentParameter
125 |       MemorySize: "256"
126 |       Timeout: "600"
127 |       Code:
128 |         S3Bucket: !Ref LambdaFunctionBucketParameter
129 |         S3Key: !Ref LambdaFunctionZipFileKeyParameter
130 | 
131 |   SyncLambdaFunctionRole:
132 |     Type: AWS::IAM::Role
133 |     Properties:
134 |       RoleName: !Join ["-", ["canvas-data-sync-lambda-role", !Ref EnvironmentParameter]]
135 |       AssumeRolePolicyDocument:
136 |         Version: "2012-10-17"
137 |         Statement:
138 |           -
139 |             Effect: "Allow"
140 |             Principal:
141 |               Service:
142 |                 - "lambda.amazonaws.com"
143 |             Action:
144 |               - "sts:AssumeRole"
145 |       ManagedPolicyArns:
146 |         - "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
147 |       Policies:
148 |         - PolicyName: !Join ["-", ["canvas-data-sync-lambda-policy", !Ref EnvironmentParameter]]
149 |           PolicyDocument:
150 |             Version: "2012-10-17"
151 |             Statement:
152 |               -
153 |                 Effect: "Allow"
154 |                 Action: "glue:GetResourcePolicy"
155 |                 Resource: !Join
156 |                   - ":"
157 |                   - - "arn:aws:glue"
158 |                     - !Ref AWS::Region
159 |                     - !Ref AWS::AccountId
160 |                     - "catalog/*"
161 |               -
162 |                 Effect: "Allow"
163 |                 Action:
164 |                   - "glue:Get*"
165 |                   - "glue:CreateTable"
166 |                   - "glue:UpdateTable"
167 |                 Resource: "*"
168 |               -
169 |                 Effect: "Allow"
170 |                 Action:
171 |                   - "s3:ListBucket"
172 |                 Resource: !GetAtt S3Bucket.Arn
173 |               -
174 |                 Effect: "Allow"
175 |                 Action:
176 |                   - "s3:GetObject"
177 |                   - "s3:PutObject"
178 |                   - "s3:DeleteObject"
179 |                 Resource: !Join ["/", [!GetAtt S3Bucket.Arn, "*"]]
180 |               -
181 |                 Effect: "Allow"
182 |                 Action: "sns:Publish"
183 |                 Resource: !Ref SNSTopic
184 |               -
185 |                 Effect: "Allow"
186 |                 Action:
187 |                   - "lambda:InvokeFunction"
188 |                   - "lambda:InvokeAsync"
189 |                 Resource: !Join
190 |                   - ":"
191 |                   - - "arn:aws:lambda"
192 |                     - !Ref AWS::Region
193 |                     - !Ref AWS::AccountId
194 |                     - "function:canvas-data-*"
195 |   SyncLambdaFunctionSMPolicy:
196 |     Type: AWS::IAM::Policy
197 |     Condition: HasApiSecretsManager
198 |     Properties:
199 |       Roles:
200 |         - !Ref SyncLambdaFunctionRole
201 |       PolicyName: APISecretsManager
202 |       PolicyDocument:
203 |         Version: "2012-10-17"
204 |         Statement:
205 |           -
206 |             Effect: "Allow"
207 |             Action: "secretsmanager:GetSecretValue"
208 |             Resource: !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${ApiSecretsManagerParameter}-??????"
209 | 
210 |   SyncLambdaFunction:
211 |     Type: AWS::Lambda::Function
212 |     Properties:
213 |       Description: "Lambda function that synchronizes Canvas Data files with a local archive"
214 |       FunctionName: !Join ["-", ["canvas-data-sync", !Ref EnvironmentParameter]]
215 |       Handler: "sync-canvas-data-files.lambda_handler"
216 |       Role: !GetAtt SyncLambdaFunctionRole.Arn
217 |       Runtime: "python3.7"
218 |       Environment:
219 |         Variables:
220 |           ENV: !Ref EnvironmentParameter
221 |           api_key: !Ref ApiKeyParameter
222 |           api_secret: !Ref ApiSecretParameter
223 |           api_sm_id: !Ref ApiSecretsManagerParameter
224 |           fetch_function_name: !Ref FetchLambdaFunction
225 |           s3_bucket: !Ref S3Bucket
226 |           sns_topic: !Ref SNSTopic
227 |           database_name: !Ref GlueDatabase
228 |       MemorySize: "512"
229 |       Timeout: "900"
230 |       Code:
231 |         S3Bucket: !Ref LambdaFunctionBucketParameter
232 |         S3Key: !Ref LambdaFunctionZipFileKeyParameter
233 | 
234 |   CloudwatchEventRule:
235 |     Type: AWS::Events::Rule
236 |     Properties:
237 |       Description: runs every day at 10am UTC
238 |       Name: !Join ["-", ["canvas-data-sync-schedule", !Ref EnvironmentParameter]]
239 |       ScheduleExpression: "cron(0 10 * * ? *)"
240 |       Targets:
241 |         - Arn: !GetAtt SyncLambdaFunction.Arn
242 |           Id: !Join ["-", ["canvas-data-sync-sched-target", !Ref EnvironmentParameter]]
243 | 
244 |   SyncLambdaFunctionPermission:
245 |     Type: AWS::Lambda::Permission
246 |     Properties:
247 |       FunctionName: !GetAtt SyncLambdaFunction.Arn
248 |       Action: 'lambda:InvokeFunction'
249 |       Principal: "events.amazonaws.com"
250 |       SourceArn: !GetAtt CloudwatchEventRule.Arn
251 | 
--------------------------------------------------------------------------------
/lambda/README.md:
--------------------------------------------------------------------------------
1 | This folder contains the Python code for two Lambda functions, `sync-canvas-data-files` and `fetch-canvas-data-file`. For convenience, both of these function libraries and all of their dependencies are packaged into a single ZIP file for deployment on AWS.
2 | 
3 | To build the zip file, run `./build.sh` from this folder. Note that you will need `zip` and `pip3` installed in order to run the build script.
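Once you have built your own package, upload the resulting zip file to an S3 bucket in your own AWS account and point the stack's `LambdaFunctionBucketParameter` and `LambdaFunctionZipFileKeyParameter` parameters at it. A minimal sketch using the AWS CLI (the bucket name and key prefix are placeholders, and the zip filename is whatever `build.sh` just printed):

```sh
aws s3 cp canvas-data-lambda-<timestamp>.zip s3://your-artifact-bucket/canvas-data-aws/canvas-data-lambda-<timestamp>.zip
```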
4 | 
--------------------------------------------------------------------------------
/lambda/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | TS=$(date +"canvas-data-lambda-%m%d%Y-%H%M%S.zip")
4 | 
5 | echo "Building $TS"
6 | 
7 | mkdir package
8 | cd package
9 | printf '[install]\nprefix=\n' > setup.cfg
10 | pip3 install -q --no-warn-conflicts -r ../requirements.txt --target .
11 | zip -q -r9 ../$TS .
12 | cd ..
13 | rm -rf package
14 | zip -q -9 $TS *.py
15 | echo "Done."
16 | 
--------------------------------------------------------------------------------
/lambda/fetch-canvas-data-file.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | from io import BytesIO
4 | from pprint import pprint
5 | 
6 | import boto3
7 | import requests
8 | from smart_open import open
9 | 
10 | logger = logging.getLogger()
11 | logger.setLevel(logging.INFO)
12 | 
13 | 
14 | def lambda_handler(event, context):
15 | 
16 |     file_url = event['file_url']
17 |     s3_bucket = event['s3_bucket']
18 |     key = event['key']
19 | 
20 |     chunk_size = 1024*1024*8
21 | 
22 |     logger.info('fetching {} to {}'.format(file_url, key))
23 | 
24 |     s3 = boto3.client('s3')
25 |     obj_list = s3.list_objects_v2(Bucket=s3_bucket, Prefix=key)
26 | 
27 |     if obj_list.get('KeyCount', 0) > 0:
28 |         logger.warning('trying to download {} but it already exists -- skipping'.format(key))
29 |         return {
30 |             'message': 'key {} already exists - skipping'.format(key)
31 |         }
32 | 
33 |     with open('s3://{}/{}'.format(s3_bucket, key), 'wb', ignore_ext=True) as fout:
34 |         with requests.get(file_url, stream=True) as r:
35 |             r.raise_for_status()
36 |             for chunk in r.iter_content(chunk_size=chunk_size):
37 |                 if chunk:  # filter out keep-alive new chunks
38 |                     fout.write(chunk)
39 | 
40 |     return {
41 |         'statusCode': 200,
42 |     }
43 | 
--------------------------------------------------------------------------------
/lambda/requirements.txt:
--------------------------------------------------------------------------------
1 | smart-open==1.8.3
2 | canvas-data-sdk==0.1.4
--------------------------------------------------------------------------------
/lambda/sync-canvas-data-files.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | 
5 | import boto3
6 | from canvas_data.api import CanvasDataAPI
7 | 
8 | logger = logging.getLogger()
9 | logger.setLevel(os.environ.get('log_level', logging.INFO))
10 | 
11 | 
12 | def lambda_handler(event, context):
13 |     logger.debug('Starting Canvas Data sync')
14 | 
15 |     dry_run = os.environ.get('dry_run', '').lower() == 'true'
16 | 
17 |     sm = boto3.client('secretsmanager')
18 |     if os.environ.get('api_sm_id'):
19 |         res = sm.get_secret_value(SecretId=os.environ['api_sm_id'])
20 |         api_sm = json.loads(res['SecretString'])
21 |         api_key, api_secret = api_sm['api_key'], api_sm['api_secret']
22 |     else:
23 |         api_key = os.environ['api_key']
24 |         api_secret = os.environ['api_secret']
25 | 
26 |     fetch_function_name = os.environ.get('fetch_function_name')
27 | 
28 |     database = os.environ.get('database_name', 'canvasdata')
29 |     s3_prefix = 'raw_files/'
30 | 
31 |     s3_bucket = os.environ['s3_bucket']
32 |     sns_topic = os.environ['sns_topic']
33 | 
34 |     s3 = boto3.resource('s3')
35 |     lmb = boto3.client('lambda')
36 |     sns = boto3.client('sns')
37 | 
38 |     # get the list of the current objects in the s3 bucket
39 |     b = s3.Bucket(s3_bucket)
40 |     existing_keys = []
41 |     existing_objects = b.objects.filter(Prefix=s3_prefix)
42 |     for o in existing_objects:
43 |         existing_keys.append(o.key)
44 | 
45 |     # now get all of the current files from the api
46 |     cd = CanvasDataAPI(api_key=api_key, api_secret=api_secret)
47 |     result_data = cd.get_sync_file_urls()
48 |     files = result_data['files']
49 | 
50 |     numfiles = len(files)
51 | 
52 |     fetched_files = 0
53 |     skipped_files = 0
54 |     removed_files = 0
55 | 
56 |     reinvoke = False
57 | 
58 |     for f in files:
59 | 
60 |         key = '{}{}/{}'.format(s3_prefix, f['table'], f['filename'])
61 | 
62 |         if key in existing_keys:
63 |             # we don't need to get this one;
64 |             # remove it from existing_keys so we don't delete it later
65 |             existing_keys.remove(key)
66 |             logger.debug('skipping {}'.format(key))
67 |             skipped_files += 1
68 | 
69 |         else:
70 |             # we need to get it
71 |             # call the other lambda asynchronously to download it
72 |             payload = {
73 |                 'file_url': f['url'],
74 |                 's3_bucket': s3_bucket,
75 |                 'key': key
76 |             }
77 |             if not dry_run:
78 |                 status = lmb.invoke(
79 |                     FunctionName=fetch_function_name,
80 |                     InvocationType='Event',
81 |                     Payload=json.dumps(payload)
82 |                 )
83 |                 logger.info('fetching {} - status {}'.format(key, status['StatusCode']))
84 |                 fetched_files += 1
85 |             else:
86 |                 logger.info('would have fetched {}'.format(key))
87 | 
88 |         if context.get_remaining_time_in_millis() < 30000:
89 |             # stop here and call this function again
90 |             logger.info('this invocation has less than 30 seconds until timeout')
91 |             logger.info('invoking another instance and exiting this one')
92 |             reinvoke = True
93 |             status = lmb.invoke(
94 |                 FunctionName=context.function_name,
95 |                 InvocationType='Event',
96 |                 Payload=json.dumps(event),
97 |             )
98 |             break
99 | 
100 |     tables_created = 0
101 |     tables_updated = 0
102 | 
103 |     if not reinvoke:
104 |         # now we need to delete any keys that remain in existing_keys
105 |         s3c = boto3.client('s3')
106 |         for old_key in existing_keys:
107 |             if not dry_run:
108 |                 logger.info('removing old file {}'.format(old_key))
109 |                 s3c.delete_object(Bucket=s3_bucket, Key=old_key)
110 |                 removed_files += 1
111 |             else:
112 |                 logger.info('would have removed old file {}'.format(old_key))
113 | 
114 |         # now update the Glue data catalog
115 |         if not dry_run:
116 |             schema = cd.get_schema()
117 |             for tk in schema.keys():
118 |                 c_or_u = create_or_update_table(schema[tk], database, s3_bucket, s3_prefix)
119 |                 if c_or_u == 'created':
120 |                     tables_created += 1
121 |                 elif c_or_u == 'updated':
122 |                     tables_updated += 1
123 | 
124 | 
125 |     logger.info('total number of files in the sync: {}'.format(numfiles))
126 |     logger.info('fetched {} files'.format(fetched_files))
127 |     logger.info('skipped {} files'.format(skipped_files))
128 |     logger.info('removed {} old files'.format(removed_files))
129 |     logger.info('tables created/updated: {}/{}'.format(tables_created, tables_updated))
130 | 
131 |     summary = {
132 |         'total_files': numfiles,
133 |         'fetched_files': fetched_files,
134 |         'skipped_files': skipped_files,
135 |         'removed_files': removed_files,
136 |         'reinvoke': reinvoke,
137 |         'tables_created': tables_created,
138 |         'tables_updated': tables_updated,
139 |     }
140 | 
141 |     sns.publish(
142 |         TopicArn=sns_topic,
143 |         Subject='Canvas Data sync complete',
144 |         Message=json.dumps(summary, indent=4),
145 |     )
146 | 
147 |     return summary
148 | 
149 | 
150 | def get_column_type(column):
151 |     # converts the column types returned by the Canvas Data API to Athena-compatible types
152 |     raw_type = column['type']
153 |     if raw_type in ['text', 'enum', 'guid']:
154 |         return 'string'
155 |     elif raw_type in ['varchar']:
156 |         if column.get('length'):
157 |             return 'varchar({})'.format(column['length'])
158 |         else:
159 |             return 'string'
160 |     elif raw_type in ['double precision']:
161 |         return 'double'
162 |     elif raw_type in ['integer']:
163 |         return 'int'
164 |     elif raw_type in ['datetime']:
165 |         return 'timestamp'
166 |     else:
167 |         return raw_type
168 | 
169 | 
170 | def create_or_update_table(table_schema, database, s3_bucket, s3_prefix):
171 | 
172 |     table_desc = table_schema.get('description', '')[:254]
173 | 
174 |     table_input = {
175 |         'Name': table_schema['tableName'],
176 |         'Description': table_desc,
177 |         'Parameters': {
178 |             'compressionType': 'gzip',
179 |             'delimiter': '\t',
180 |             'classification': 'csv',
181 |             'typeOfData': 'file'
182 |         },
183 |         'TableType': 'EXTERNAL_TABLE',
184 |         'PartitionKeys': [],
185 |         'StorageDescriptor': {
186 |             'Location': 's3://{}/{}{}/'.format(s3_bucket, s3_prefix, table_schema['tableName']),
187 |             'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
188 |             'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
189 |             'Compressed': True,
190 |             'Parameters': {
191 |                 'compressionType': 'gzip',
192 |                 'delimiter': '\t',
193 |                 'classification': 'csv',
194 |                 'typeOfData': 'file'
195 |             },
196 |             'SerdeInfo': {
197 |                 'Name': 'LazySimpleSerDe',
198 |                 'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
199 |                 'Parameters': {
200 |                     'field.delim': '\t'
201 |                 }
202 |             }
203 |         },
204 |     }
205 | 
206 |     columns = []
207 |     for column in table_schema['columns']:
208 |         t = get_column_type(column)
209 |         desc = column.get('description', '')[:254]
210 | 
211 |         c = {
212 |             'Name': column['name'],
213 |             'Type': t,
214 |             'Comment': desc
215 |         }
216 |         columns.append(c)
217 | 
218 |     table_input['StorageDescriptor']['Columns'] = columns
219 | 
220 |     glue = boto3.client('glue')
221 | 
222 |     try:
223 |         glue.create_table(
224 |             DatabaseName=database,
225 |             TableInput=table_input
226 |         )
227 |         logger.info('created Glue table {}.{}'.format(database, table_schema['tableName']))
228 |         return 'created'
229 |     except glue.exceptions.AlreadyExistsException:
230 |         glue.update_table(
231 |             DatabaseName=database,
232 |             TableInput=table_input
233 |         )
234 |         logger.info('updated Glue table {}.{}'.format(database, table_schema['tableName']))
235 |         return 'updated'
236 | 
--------------------------------------------------------------------------------