├── .gitignore
├── README.md
├── cloud_formation
│   └── canvas_data_aws.yaml
└── lambda
    ├── README.md
    ├── build.sh
    ├── fetch-canvas-data-file.py
    ├── requirements.txt
    └── sync-canvas-data-files.py

/.gitignore:
--------------------------------------------------------------------------------
1 | lambda/*.zip
2 | lambda/package
3 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Canvas Data on AWS
2 | 
3 | This repository contains the artifacts necessary to create a Canvas Data warehouse on AWS. See the [tutorial page](https://github.com/Harvard-University-iCommons/canvas-data-aws/wiki/Tutorial) for detailed instructions on how to set up your own environment.
4 | 
5 | ## CloudFormation template
6 | 
7 | A CloudFormation template, `cloud_formation/canvas_data_aws.yaml`, creates all of the AWS infrastructure components needed to build the warehouse (see the template for details).
8 | 
9 | ## Lambda functions
10 | 
11 | The Python code for two Lambda functions is included in the `lambda` directory. This code is used by the CloudFormation template.
12 | 
--------------------------------------------------------------------------------
/cloud_formation/canvas_data_aws.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | AWSTemplateFormatVersion: "2010-09-09"
3 | 
4 | Description:
5 |   This template creates AWS resources necessary to retrieve and store Canvas Data extracts. It also creates a Glue Data Catalog database.
6 | 
7 | Parameters:
8 |   EnvironmentParameter:
9 |     Type: String
10 |     Default: prod
11 |     Description: The environment name.
12 |     AllowedValues: [dev, qa, prod]
13 |   LambdaFunctionZipFileKeyParameter:
14 |     Type: String
15 |     Default: "canvas-data-aws/canvas-data-lambda-05272019-173758.zip"
16 |     Description: The S3 key (file path) where the Lambda function package zip file is stored. If you've packaged your own version of the functions, replace this with your own zip file.
17 |   LambdaFunctionBucketParameter:
18 |     Type: String
19 |     Default: "huit-at-public-build-artifacts"
20 |     Description: The S3 bucket where the Lambda function package is stored. If you've packaged your own version of the functions, replace this with your own bucket name.
21 |   ApiKeyParameter:
22 |     Type: String
23 |     Description: Your Canvas Data API Key
24 |   ApiSecretParameter:
25 |     Type: String
26 |     Description: Your Canvas Data API Secret
27 |   ApiSecretsManagerParameter:
28 |     Type: String
29 |     Description: An optional Secrets Manager secret that contains the api_key and api_secret (alternative to ApiKeyParameter and ApiSecretParameter).
30 |     Default: ""
31 |   EmailAddressParameter:
32 |     Type: String
33 |     Description: Your email address. This will be used to send you notifications about the success or failure of the data synchronization process.
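# Example: once the Lambda package is available in S3, a stack can be created from this
# template with the AWS CLI. This is only a sketch: the stack name, email address and
# credential values below are placeholders, and you would normally supply either the
# API key/secret pair or the Secrets Manager secret name rather than both.
#
#   aws cloudformation create-stack \
#     --stack-name canvas-data-aws \
#     --template-body file://cloud_formation/canvas_data_aws.yaml \
#     --capabilities CAPABILITY_NAMED_IAM \
#     --parameters ParameterKey=EnvironmentParameter,ParameterValue=prod \
#                  ParameterKey=ApiKeyParameter,ParameterValue=<your-api-key> \
#                  ParameterKey=ApiSecretParameter,ParameterValue=<your-api-secret> \
#                  ParameterKey=EmailAddressParameter,ParameterValue=you@example.edu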
34 | 
35 | Conditions:
36 |   HasApiSecretsManager: !And
37 |     - !Equals [ !Ref ApiKeyParameter, "" ]
38 |     - !Equals [ !Ref ApiSecretParameter, "" ]
39 |     - !Not [ !Equals [ !Ref ApiSecretsManagerParameter, "" ] ]
40 | 
41 | Resources:
42 | 
43 |   S3Bucket:
44 |     Type: AWS::S3::Bucket
45 |     Properties:
46 |       BucketName: !Join
47 |         - "-"
48 |         - - "canvas-data-warehouse"
49 |           - !Ref AWS::AccountId
50 |           - !Ref EnvironmentParameter
51 |       PublicAccessBlockConfiguration:
52 |         BlockPublicAcls: True
53 |         BlockPublicPolicy: True
54 |         IgnorePublicAcls: True
55 |         RestrictPublicBuckets: True
56 |       BucketEncryption:
57 |         ServerSideEncryptionConfiguration:
58 |           - ServerSideEncryptionByDefault:
59 |               SSEAlgorithm: AES256
60 | 
61 |   GlueDatabase:
62 |     Type: AWS::Glue::Database
63 |     Properties:
64 |       CatalogId: !Ref AWS::AccountId
65 |       DatabaseInput:
66 |         Name: !Join ["_", ["canvasdata", !Ref EnvironmentParameter]]
67 | 
68 |   SNSTopic:
69 |     Type: AWS::SNS::Topic
70 |     Properties:
71 |       DisplayName: !Join ["-", ["canvas-data-sync", !Ref EnvironmentParameter]]
72 | 
73 |   SNSSubscription:
74 |     Type: AWS::SNS::Subscription
75 |     Properties:
76 |       Endpoint: !Ref EmailAddressParameter
77 |       Protocol: email
78 |       TopicArn: !Ref SNSTopic
79 | 
80 |   FetchLambdaFunctionRole:
81 |     Type: AWS::IAM::Role
82 |     Properties:
83 |       RoleName: !Join ["-", ["canvas-data-fetch-lambda-role", !Ref EnvironmentParameter]]
84 |       AssumeRolePolicyDocument:
85 |         Version: "2012-10-17"
86 |         Statement:
87 |           -
88 |             Effect: "Allow"
89 |             Principal:
90 |               Service:
91 |                 - "lambda.amazonaws.com"
92 |             Action:
93 |               - "sts:AssumeRole"
94 |       ManagedPolicyArns:
95 |         - "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
96 |       Policies:
97 |         - PolicyName: !Join ["-", ["canvas-data-fetch-lambda-policy", !Ref EnvironmentParameter]]
98 |           PolicyDocument:
99 |             Version: "2012-10-17"
100 |             Statement:
101 |               -
102 |                 Effect: "Allow"
103 |                 Action:
104 |                   - "s3:ListBucket"
105 |                 Resource: !GetAtt S3Bucket.Arn
106 |               -
107 |                 Effect: "Allow"
108 |                 Action:
109 |                   - "s3:GetObject"
110 |                   - "s3:PutObject"
111 |                   - "s3:DeleteObject"
112 |                 Resource: !Join ["/", [!GetAtt S3Bucket.Arn, "*"]]
113 | 
114 |   FetchLambdaFunction:
115 |     Type: AWS::Lambda::Function
116 |     Properties:
117 |       Description: "Lambda function that fetches a file from a URL and stores it in an S3 bucket"
118 |       FunctionName: !Join ["-", ["canvas-data-fetch", !Ref EnvironmentParameter]]
119 |       Handler: "fetch-canvas-data-file.lambda_handler"
120 |       Role: !GetAtt FetchLambdaFunctionRole.Arn
121 |       Runtime: "python3.7"
122 |       Environment:
123 |         Variables:
124 |           ENV: !Ref EnvironmentParameter
125 |       MemorySize: "256"
126 |       Timeout: "600"
127 |       Code:
128 |         S3Bucket: !Ref LambdaFunctionBucketParameter
129 |         S3Key: !Ref LambdaFunctionZipFileKeyParameter
130 | 
131 |   SyncLambdaFunctionRole:
132 |     Type: AWS::IAM::Role
133 |     Properties:
134 |       RoleName: !Join ["-", ["canvas-data-sync-lambda-role", !Ref EnvironmentParameter]]
135 |       AssumeRolePolicyDocument:
136 |         Version: "2012-10-17"
137 |         Statement:
138 |           -
139 |             Effect: "Allow"
140 |             Principal:
141 |               Service:
142 |                 - "lambda.amazonaws.com"
143 |             Action:
144 |               - "sts:AssumeRole"
145 |       ManagedPolicyArns:
146 |         - "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
147 |       Policies:
148 |         - PolicyName: !Join ["-", ["canvas-data-sync-lambda-policy", !Ref EnvironmentParameter]]
149 |           PolicyDocument:
150 |             Version: "2012-10-17"
151 |             Statement:
152 |               -
153 |                 Effect: "Allow"
154 |                 Action: "glue:GetResourcePolicy"
155 |                 Resource: !Join
156 |                   - ":"
157 |                   - - "arn:aws:glue"
158 |                     - !Ref AWS::Region
159 |                     - !Ref AWS::AccountId
160 |                     - "catalog/*"
161 |               -
162 |                 Effect: "Allow"
163 |                 Action:
164 |                   - "glue:Get*"
165 |                   - "glue:CreateTable"
166 |                   - "glue:UpdateTable"
167 |                 Resource: "*"
168 |               -
169 |                 Effect: "Allow"
170 |                 Action:
171 |                   - "s3:ListBucket"
172 |                 Resource: !GetAtt S3Bucket.Arn
173 |               -
174 |                 Effect: "Allow"
175 |                 Action:
176 |                   - "s3:GetObject"
177 |                   - "s3:PutObject"
178 |                   - "s3:DeleteObject"
179 |                 Resource: !Join ["/", [!GetAtt S3Bucket.Arn, "*"]]
180 |               -
181 |                 Effect: "Allow"
182 |                 Action: "sns:Publish"
183 |                 Resource: !Ref SNSTopic
184 |               -
185 |                 Effect: "Allow"
186 |                 Action:
187 |                   - "lambda:InvokeFunction"
188 |                   - "lambda:InvokeAsync"
189 |                 Resource: !Join
190 |                   - ":"
191 |                   - - "arn:aws:lambda"
192 |                     - !Ref AWS::Region
193 |                     - !Ref AWS::AccountId
194 |                     - "function:canvas-data-*"
195 |   SyncLambdaFunctionSMPolicy:
196 |     Type: AWS::IAM::Policy
197 |     Condition: HasApiSecretsManager
198 |     Properties:
199 |       Roles:
200 |         - !Ref SyncLambdaFunctionRole
201 |       PolicyName: APISecretsManager
202 |       PolicyDocument:
203 |         Version: "2012-10-17"
204 |         Statement:
205 |           -
206 |             Effect: "Allow"
207 |             Action: "secretsmanager:GetSecretValue"
208 |             Resource: !Sub "arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${ApiSecretsManagerParameter}-??????"
209 | 
210 |   SyncLambdaFunction:
211 |     Type: AWS::Lambda::Function
212 |     Properties:
213 |       Description: "Lambda function that synchronizes Canvas Data files with a local archive"
214 |       FunctionName: !Join ["-", ["canvas-data-sync", !Ref EnvironmentParameter]]
215 |       Handler: "sync-canvas-data-files.lambda_handler"
216 |       Role: !GetAtt SyncLambdaFunctionRole.Arn
217 |       Runtime: "python3.7"
218 |       Environment:
219 |         Variables:
220 |           ENV: !Ref EnvironmentParameter
221 |           api_key: !Ref ApiKeyParameter
222 |           api_secret: !Ref ApiSecretParameter
223 |           api_sm_id: !Ref ApiSecretsManagerParameter
224 |           fetch_function_name: !Ref FetchLambdaFunction
225 |           s3_bucket: !Ref S3Bucket
226 |           sns_topic: !Ref SNSTopic
227 |           database_name: !Ref GlueDatabase
228 |       MemorySize: "512"
229 |       Timeout: "900"
230 |       Code:
231 |         S3Bucket: !Ref LambdaFunctionBucketParameter
232 |         S3Key: !Ref LambdaFunctionZipFileKeyParameter
233 | 
234 |   CloudwatchEventRule:
235 |     Type: AWS::Events::Rule
236 |     Properties:
237 |       Description: runs every day at 10am UTC
238 |       Name: !Join ["-", ["canvas-data-sync-schedule", !Ref EnvironmentParameter]]
239 |       ScheduleExpression: "cron(0 10 * * ? *)"
240 |       Targets:
241 |         - Arn: !GetAtt SyncLambdaFunction.Arn
242 |           Id: !Join ["-", ["canvas-data-sync-sched-target", !Ref EnvironmentParameter]]
243 | 
244 |   SyncLambdaFunctionPermission:
245 |     Type: AWS::Lambda::Permission
246 |     Properties:
247 |       FunctionName: !GetAtt SyncLambdaFunction.Arn
248 |       Action: 'lambda:InvokeFunction'
249 |       Principal: "events.amazonaws.com"
250 |       SourceArn: !GetAtt CloudwatchEventRule.Arn
251 | 
--------------------------------------------------------------------------------
/lambda/README.md:
--------------------------------------------------------------------------------
1 | This folder contains the Python code for two Lambda functions, `sync-canvas-data-files` and `fetch-canvas-data-file`. For convenience, both of these function libraries and all of their dependencies are packaged into a single ZIP file for deployment on AWS.
2 | 
3 | To build the zip file, run `./build.sh` from this folder. Note that you will need `zip` and `pip3` installed in order to run the build script.
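Once you have built your own package, upload the resulting zip file to an S3 bucket in your own AWS account and point the stack's `LambdaFunctionBucketParameter` and `LambdaFunctionZipFileKeyParameter` parameters at it. A minimal sketch using the AWS CLI (the bucket name and key prefix are placeholders, and the zip filename is whatever `build.sh` just printed):

```sh
aws s3 cp canvas-data-lambda-<timestamp>.zip s3://your-artifact-bucket/canvas-data-aws/canvas-data-lambda-<timestamp>.zip
```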
4 | 
--------------------------------------------------------------------------------
/lambda/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | TS=$(date +"canvas-data-lambda-%m%d%Y-%H%M%S.zip")
4 | 
5 | echo "Building $TS"
6 | 
7 | mkdir package
8 | cd package
9 | printf '[install]\nprefix=\n' > setup.cfg
10 | pip3 install -q --no-warn-conflicts -r ../requirements.txt --target .
11 | zip -q -r9 ../$TS .
12 | cd ..
13 | rm -rf package
14 | zip -q -9 $TS *.py
15 | echo "Done."
16 | 
--------------------------------------------------------------------------------
/lambda/fetch-canvas-data-file.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | from io import BytesIO
4 | from pprint import pprint
5 | 
6 | import boto3
7 | import requests
8 | from smart_open import open
9 | 
10 | logger = logging.getLogger()
11 | logger.setLevel(logging.INFO)
12 | 
13 | 
14 | def lambda_handler(event, context):
15 | 
16 |     file_url = event['file_url']
17 |     s3_bucket = event['s3_bucket']
18 |     key = event['key']
19 | 
20 |     chunk_size = 1024*1024*8
21 | 
22 |     logger.info('fetching {} to {}'.format(file_url, key))
23 | 
24 |     s3 = boto3.client('s3')
25 |     obj_list = s3.list_objects_v2(Bucket=s3_bucket, Prefix=key)
26 | 
27 |     if obj_list.get('KeyCount', 0) > 0:
28 |         logger.warning('trying to download {} but it already exists -- skipping'.format(key))
29 |         return {
30 |             'message': 'key {} already exists - skipping'.format(key)
31 |         }
32 | 
33 |     with open('s3://{}/{}'.format(s3_bucket, key), 'wb', ignore_ext=True) as fout:
34 |         with requests.get(file_url, stream=True) as r:
35 |             r.raise_for_status()
36 |             for chunk in r.iter_content(chunk_size=chunk_size):
37 |                 if chunk:  # filter out keep-alive new chunks
38 |                     fout.write(chunk)
39 | 
40 |     return {
41 |         'statusCode': 200,
42 |     }
43 | 
--------------------------------------------------------------------------------
/lambda/requirements.txt:
--------------------------------------------------------------------------------
1 | smart-open==1.8.3
2 | canvas-data-sdk==0.1.4
--------------------------------------------------------------------------------
/lambda/sync-canvas-data-files.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | 
5 | import boto3
6 | from canvas_data.api import CanvasDataAPI
7 | 
8 | logger = logging.getLogger()
9 | logger.setLevel(os.environ.get('log_level', logging.INFO))
10 | 
11 | 
12 | def lambda_handler(event, context):
13 |     logger.debug('Starting Canvas Data sync')
14 | 
15 |     dry_run = os.environ.get('dry_run', '').lower() == 'true'
16 | 
17 |     sm = boto3.client('secretsmanager')
18 |     if os.environ.get('api_sm_id'):
19 |         res = sm.get_secret_value(SecretId=os.environ['api_sm_id'])
20 |         api_sm = json.loads(res['SecretString'])
21 |         api_key, api_secret = api_sm['api_key'], api_sm['api_secret']
22 |     else:
23 |         api_key = os.environ['api_key']
24 |         api_secret = os.environ['api_secret']
25 | 
26 |     fetch_function_name = os.environ.get('fetch_function_name')
27 | 
28 |     database = os.environ.get('database_name', 'canvasdata')
29 |     s3_prefix = 'raw_files/'
30 | 
31 |     s3_bucket = os.environ['s3_bucket']
32 |     sns_topic = os.environ['sns_topic']
33 | 
34 |     s3 = boto3.resource('s3')
35 |     lmb = boto3.client('lambda')
36 |     sns = boto3.client('sns')
37 | 
38 |     # get the list of the current objects in the s3 bucket
39 |     b = s3.Bucket(s3_bucket)
40 |     existing_keys = []
41 |     existing_objects = b.objects.filter(Prefix=s3_prefix)
42 |     for o in existing_objects:
43 |         existing_keys.append(o.key)
44 | 
45 |     # now get all of the current files from the api
46 |     cd = CanvasDataAPI(api_key=api_key, api_secret=api_secret)
47 |     result_data = cd.get_sync_file_urls()
48 |     files = result_data['files']
49 | 
50 |     numfiles = len(files)
51 | 
52 |     fetched_files = 0
53 |     skipped_files = 0
54 |     removed_files = 0
55 | 
56 |     reinvoke = False
57 | 
58 |     for f in files:
59 | 
60 |         key = '{}{}/{}'.format(s3_prefix, f['table'], f['filename'])
61 | 
62 |         if key in existing_keys:
63 |             # we don't need to get this one;
64 |             # remove it from existing_keys so we don't delete it later
65 |             existing_keys.remove(key)
66 |             logger.debug('skipping {}'.format(key))
67 |             skipped_files += 1
68 | 
69 |         else:
70 |             # we need to get it
71 |             # call the other lambda asynchronously to download it
72 |             payload = {
73 |                 'file_url': f['url'],
74 |                 's3_bucket': s3_bucket,
75 |                 'key': key
76 |             }
77 |             if not dry_run:
78 |                 status = lmb.invoke(
79 |                     FunctionName=fetch_function_name,
80 |                     InvocationType='Event',
81 |                     Payload=json.dumps(payload)
82 |                 )
83 |                 logger.info('fetching {} - status {}'.format(key, status['StatusCode']))
84 |                 fetched_files += 1
85 |             else:
86 |                 logger.info('would have fetched {}'.format(key))
87 | 
88 |         if context.get_remaining_time_in_millis() < 30000:
89 |             # stop here and call this function again
90 |             logger.info('this invocation has less than 30 seconds until timeout')
91 |             logger.info('invoking another instance and exiting this one')
92 |             reinvoke = True
93 |             status = lmb.invoke(
94 |                 FunctionName=context.function_name,
95 |                 InvocationType='Event',
96 |                 Payload=json.dumps(event),
97 |             )
98 |             break
99 | 
100 |     tables_created = 0
101 |     tables_updated = 0
102 | 
103 |     if not reinvoke:
104 |         # now we need to delete any keys that remain in existing_keys
105 |         s3c = boto3.client('s3')
106 |         for old_key in existing_keys:
107 |             if not dry_run:
108 |                 logger.info('removing old file {}'.format(old_key))
109 |                 s3c.delete_object(Bucket=s3_bucket, Key=old_key)
110 |                 removed_files += 1
111 |             else:
112 |                 logger.info('would have removed old file {}'.format(old_key))
113 | 
114 |         # now update the Glue data catalog
115 |         if not dry_run:
116 |             schema = cd.get_schema()
117 |             for tk in schema.keys():
118 |                 c_or_u = create_or_update_table(schema[tk], database, s3_bucket, s3_prefix)
119 |                 if c_or_u == 'created':
120 |                     tables_created += 1
121 |                 elif c_or_u == 'updated':
122 |                     tables_updated += 1
123 | 
124 | 
125 |     logger.info('total number of files in the sync: {}'.format(numfiles))
126 |     logger.info('fetched {} files'.format(fetched_files))
127 |     logger.info('skipped {} files'.format(skipped_files))
128 |     logger.info('removed {} old files'.format(removed_files))
129 |     logger.info('tables created/updated: {}/{}'.format(tables_created, tables_updated))
130 | 
131 |     summary = {
132 |         'total_files': numfiles,
133 |         'fetched_files': fetched_files,
134 |         'skipped_files': skipped_files,
135 |         'removed_files': removed_files,
136 |         'reinvoke': reinvoke,
137 |         'tables_created': tables_created,
138 |         'tables_updated': tables_updated,
139 |     }
140 | 
141 |     sns.publish(
142 |         TopicArn=sns_topic,
143 |         Subject='Canvas Data sync complete',
144 |         Message=json.dumps(summary, indent=4),
145 |     )
146 | 
147 |     return summary
148 | 
149 | 
150 | def get_column_type(column):
151 |     # converts the column types returned by the Canvas Data API to Athena-compatible types
152 |     raw_type = column['type']
153 |     if raw_type in ['text', 'enum', 'guid']:
154 |         return 'string'
155 |     elif raw_type in ['varchar']:
156 |         if column.get('length'):
157 |             return 'varchar({})'.format(column['length'])
158 |         else:
159 |             return 'string'
160 |     elif raw_type in ['double precision']:
161 |         return 'double'
162 |     elif raw_type in ['integer']:
163 |         return 'int'
164 |     elif raw_type in ['datetime']:
165 |         return 'timestamp'
166 |     else:
167 |         return raw_type
168 | 
169 | 
170 | def create_or_update_table(table_schema, database, s3_bucket, s3_prefix):
171 | 
172 |     table_desc = table_schema.get('description', '')[:254]
173 | 
174 |     table_input = {
175 |         'Name': table_schema['tableName'],
176 |         'Description': table_desc,
177 |         'Parameters': {
178 |             'compressionType': 'gzip',
179 |             'delimiter': '\t',
180 |             'classification': 'csv',
181 |             'typeOfData': 'file'
182 |         },
183 |         'TableType': 'EXTERNAL_TABLE',
184 |         'PartitionKeys': [],
185 |         'StorageDescriptor': {
186 |             'Location': 's3://{}/{}{}/'.format(s3_bucket, s3_prefix, table_schema['tableName']),
187 |             'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
188 |             'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
189 |             'Compressed': True,
190 |             'Parameters': {
191 |                 'compressionType': 'gzip',
192 |                 'delimiter': '\t',
193 |                 'classification': 'csv',
194 |                 'typeOfData': 'file'
195 |             },
196 |             'SerdeInfo': {
197 |                 'Name': 'LazySimpleSerDe',
198 |                 'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
199 |                 'Parameters': {
200 |                     'field.delim': '\t'
201 |                 }
202 |             }
203 |         },
204 |     }
205 | 
206 |     columns = []
207 |     for column in table_schema['columns']:
208 |         t = get_column_type(column)
209 |         desc = column.get('description', '')[:254]
210 | 
211 |         c = {
212 |             'Name': column['name'],
213 |             'Type': t,
214 |             'Comment': desc
215 |         }
216 |         columns.append(c)
217 | 
218 |     table_input['StorageDescriptor']['Columns'] = columns
219 | 
220 |     glue = boto3.client('glue')
221 | 
222 |     try:
223 |         glue.create_table(
224 |             DatabaseName=database,
225 |             TableInput=table_input
226 |         )
227 |         logger.info('created Glue table {}.{}'.format(database, table_schema['tableName']))
228 |         return 'created'
229 |     except glue.exceptions.AlreadyExistsException:
230 |         glue.update_table(
231 |             DatabaseName=database,
232 |             TableInput=table_input
233 |         )
234 |         logger.info('updated Glue table {}.{}'.format(database, table_schema['tableName']))
235 |         return 'updated'
236 | 
--------------------------------------------------------------------------------