├── macros
│   ├── __init__.py
│   └── redshift_auth.py
├── operators
│   ├── __init__.py
│   └── s3_to_redshift_operator.py
├── .gitignore
├── README.md
├── __init__.py
└── LICENSE

/macros/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/operators/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# S3 To Redshift Operator

An Airflow plugin providing an operator that loads data from S3 into a
Redshift table via the COPY command, creating or altering the destination
table as needed. Append, rebuild, truncate, and upsert load types are
supported; see the docstring in operators/s3_to_redshift_operator.py for
the full parameter list.

# License
Apache 2.0

--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
from airflow.plugins_manager import AirflowPlugin
from redshift_plugin.operators.s3_to_redshift_operator import S3ToRedshiftOperator
from redshift_plugin.macros.redshift_auth import redshift_auth


class S3ToRedshiftPlugin(AirflowPlugin):
    name = "S3ToRedshiftPlugin"
    operators = [S3ToRedshiftOperator]
    # Leave in for explicitness
    hooks = []
    executors = []
    macros = [redshift_auth]
    admin_views = []
    flask_blueprints = []
    menu_links = []

--------------------------------------------------------------------------------
/macros/redshift_auth.py:
--------------------------------------------------------------------------------
from airflow.utils.db import provide_session
from airflow.models import Connection


@provide_session
def get_conn(conn_id, session=None):
    # Look up an Airflow connection by id directly from the metadata db.
    conn = (
        session.query(Connection)
        .filter(Connection.conn_id == conn_id)
        .first())
    return conn


def redshift_auth(s3_conn_id):
    # Build the CREDENTIALS string for a Redshift COPY from the key pair
    # stored in the given S3 connection's extras.
    s3_conn = get_conn(s3_conn_id)
    aws_key = s3_conn.extra_dejson.get('aws_access_key_id')
    aws_secret = s3_conn.extra_dejson.get('aws_secret_access_key')
    return ("aws_access_key_id={0};aws_secret_access_key={1}"
            .format(aws_key, aws_secret))

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted.
      If You institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty.
      Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

--------------------------------------------------------------------------------
/operators/s3_to_redshift_operator.py:
--------------------------------------------------------------------------------
import json
import random
import string
import logging

from airflow.utils.db import provide_session
from airflow.models import Connection
from airflow.utils.decorators import apply_defaults

from airflow.models import BaseOperator
from airflow.hooks.S3_hook import S3Hook
from airflow.hooks.postgres_hook import PostgresHook


class S3ToRedshiftOperator(BaseOperator):
    """
    S3 To Redshift Operator

    :param redshift_conn_id: The destination redshift connection id.
    :type redshift_conn_id: string
    :param redshift_schema: The destination redshift schema.
    :type redshift_schema: string
    :param table: The destination redshift table.
    :type table: string
    :param s3_conn_id: The source s3 connection id.
    :type s3_conn_id: string
    :param s3_bucket: The source s3 bucket.
    :type s3_bucket: string
    :param s3_key: The source s3 key.
    :type s3_key: string
    :param copy_params: The parameters to be included when issuing
                        the copy statement in Redshift.
    :type copy_params: list
    :param origin_schema: The schema of the incoming data: either the
                          S3 key of a JSON file containing an array of
                          dictionaries specifying name and type
                          (e.g. {"name": "_id", "type": "int4"}),
                          or that array itself, depending on
                          schema_location.
    :type origin_schema: string or array of dictionaries
    :param schema_location: The location of the origin schema. This
                            can be set to 'S3' or 'Local'.
                            If 'S3', it will expect a valid S3 key. If
                            'Local', it will expect the array of column
                            dictionaries to be passed in directly. By
                            default the location is set to 's3'.
    :type schema_location: string
    :param load_type: The method of loading into Redshift that
                      should occur. Options:
                          - "append"
                          - "rebuild"
                          - "truncate"
                          - "upsert"
                      Defaults to "append".
    :type load_type: string
    :param primary_key: *(optional)* The primary key for the
                        destination table. Not enforced by Redshift
                        and only required if using a load_type of
                        "upsert".
    :type primary_key: string
    :param incremental_key: *(optional)* The incremental key to compare
                            new data against the destination table
                            with. Only required if using a load_type of
                            "upsert".
    :type incremental_key: string
    :param foreign_key: *(optional)* This specifies any foreign keys
                        in the table and which corresponding table
                        and key they reference. This may be either
                        a dictionary or list of dictionaries (for
                        multiple foreign keys). The fields that are
                        required in each dictionary are:
                            - column_name
                            - reftable
                            - ref_column
    :type foreign_key: dictionary or list of dictionaries
    :param distkey: *(optional)* The distribution key for the
                    table. Only one key may be specified.
    :type distkey: string
    :param sortkey: *(optional)* The sort keys for the table.
                    If more than one key is specified, set this
                    as a list.
    :type sortkey: string or list
    :param sort_type: *(optional)* The sort style for the table.
                      Possible values include:
                          - compound
                          - interleaved
                      Defaults to "compound".
    :type sort_type: string
    """

    template_fields = ('s3_key',
                       'origin_schema')

    @apply_defaults
    def __init__(self,
                 s3_conn_id,
                 s3_bucket,
                 s3_key,
                 redshift_conn_id,
                 redshift_schema,
                 table,
                 copy_params=None,
                 origin_schema=None,
                 schema_location='s3',
                 load_type='append',
                 primary_key=None,
                 incremental_key=None,
                 foreign_key=None,
                 distkey=None,
                 sortkey='',
                 sort_type='COMPOUND',
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.s3_conn_id = s3_conn_id
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.redshift_schema = redshift_schema.lower()
        self.table = table.lower()
        # Avoid mutable default arguments; fall back to empty containers.
        self.copy_params = copy_params if copy_params is not None else []
        self.origin_schema = origin_schema
        self.schema_location = schema_location
        # Normalize once so the comparisons in copy_data are case-insensitive.
        self.load_type = load_type.lower()
        self.primary_key = primary_key
        self.incremental_key = incremental_key
        self.foreign_key = foreign_key if foreign_key is not None else {}
        self.distkey = distkey
        self.sortkey = sortkey
        self.sort_type = sort_type

        if self.load_type not in ("append", "rebuild", "truncate", "upsert"):
            raise Exception('Please choose "append", "rebuild", "truncate", or "upsert".')

        if self.schema_location.lower() not in ('s3', 'local'):
            raise Exception('Valid Schema Locations are "s3" or "local".')

        if not isinstance(self.sortkey, (str, list)):
            raise Exception('Sort Keys must be specified as either a string or list.')

        if not isinstance(self.foreign_key, (dict, list)):
            raise Exception('Foreign Keys must be specified as either a dictionary or a list of dictionaries.')

        if self.distkey and (not isinstance(self.distkey, str) or ',' in self.distkey):
            raise Exception('Only one distribution key may be specified.')

        if self.sort_type.lower() not in ('compound', 'interleaved'):
            raise Exception('Please choose "compound" or "interleaved" for sort type.')

    def execute(self, context):
        # Append a random string to the end of the staging table to ensure
        # no conflicts if multiple processes are running concurrently.
        letters = string.ascii_lowercase
        random_string = ''.join(random.choice(letters) for _ in range(7))
        self.temp_suffix = '_tmp_{0}'.format(random_string)

        # Only build and reconcile the destination table when a schema was
        # provided; otherwise the destination table must already exist.
        schema = self.read_and_format() if self.origin_schema else None

        pg_hook = PostgresHook(self.redshift_conn_id)

        if schema:
            self.create_if_not_exists(schema, pg_hook)
            self.reconcile_schemas(schema, pg_hook)
        self.copy_data(pg_hook, schema)

    def read_and_format(self):
        if self.schema_location.lower() == 's3':
            hook = S3Hook(self.s3_conn_id)
            # NOTE: In retrieving the schema, it is assumed
            # that boto3 is being used. If using boto,
            # `.get()['Body'].read().decode('utf-8')`
            # should be changed to
            # `.get_contents_as_string(encoding='utf-8')`.
            schema = (hook.get_key(self.origin_schema,
                                   bucket_name=self.s3_bucket)
                      .get()['Body'].read().decode('utf-8'))
            schema = json.loads(schema.replace("'", '"'))
        else:
            schema = self.origin_schema

        return schema

    def reconcile_schemas(self, schema, pg_hook):
        # Compare the incoming schema against the destination table and add
        # any columns that do not exist there yet.
        pg_query = \
            """
            SELECT column_name, udt_name
            FROM information_schema.columns
            WHERE table_schema = '{0}' AND table_name = '{1}';
            """.format(self.redshift_schema, self.table)

        pg_schema = dict(pg_hook.get_records(pg_query))
        incoming_keys = [column['name'] for column in schema]
        diff = list(set(incoming_keys) - set(pg_schema.keys()))
        # Check the column differential to see if any new columns exist
        if diff:
            for i in diff:
                for e in schema:
                    if i == e['name']:
                        alter_query = \
                            """
                            ALTER TABLE "{0}"."{1}"
                            ADD COLUMN "{2}" {3}
                            """.format(self.redshift_schema,
                                       self.table,
                                       e['name'],
                                       e['type'])
                        pg_hook.run(alter_query)
            logging.info('The new columns were: ' + str(diff))
        else:
            logging.info('There were no new columns.')

    def copy_data(self, pg_hook, schema=None):
        @provide_session
        def get_conn(conn_id, session=None):
            conn = (
                session.query(Connection)
                .filter(Connection.conn_id == conn_id)
                .first())
            return conn

        def get_s3_creds():
            # Build the CREDENTIALS clause for the COPY statement from
            # either a stored key pair or an IAM role ARN.
            creds = ""
            s3_conn = get_conn(self.s3_conn_id)
            aws_key = s3_conn.extra_dejson.get('aws_access_key_id', None)
            aws_secret = s3_conn.extra_dejson.get('aws_secret_access_key', None)
            # support for cross account resource access
            aws_role_arn = s3_conn.extra_dejson.get('role_arn', None)

            if aws_key and aws_secret:
                creds = ("aws_access_key_id={0};aws_secret_access_key={1}"
                         .format(aws_key, aws_secret))
            elif aws_role_arn:
                creds = ("aws_iam_role={0}"
                         .format(aws_role_arn))

            return creds

        # Delete records from the destination table where the incremental_key
        # is greater than or equal to the incremental_key of the source table
        # and the primary key is the same.
        # (e.g. Source: {"id": 1, "updated_at": "2017-01-02 00:00:00"};
        # Destination: {"id": 1, "updated_at": "2017-01-01 00:00:00"})

        delete_sql = \
            '''
            DELETE FROM "{rs_schema}"."{rs_table}"
            USING "{rs_schema}"."{rs_table}{rs_suffix}"
            WHERE "{rs_schema}"."{rs_table}"."{rs_pk}" =
                "{rs_schema}"."{rs_table}{rs_suffix}"."{rs_pk}"
            AND "{rs_schema}"."{rs_table}{rs_suffix}"."{rs_ik}" >=
                "{rs_schema}"."{rs_table}"."{rs_ik}"
            '''.format(rs_schema=self.redshift_schema,
                       rs_table=self.table,
                       rs_pk=self.primary_key,
                       rs_suffix=self.temp_suffix,
                       rs_ik=self.incremental_key)

        # Delete records from the source table where the incremental_key
        # is greater than or equal to the incremental_key of the destination
        # table and the primary key is the same. This is done in the edge case
        # where data is pulled BEFORE it is altered in the source table but
        # AFTER a workflow containing an updated version of the record runs.
        # In this case, not running this will cause the older record to be
        # added as a duplicate of the newer record.
        # (e.g. Source: {"id": 1, "updated_at": "2017-01-01 00:00:00"};
        # Destination: {"id": 1, "updated_at": "2017-01-02 00:00:00"})

        delete_confirm_sql = \
            '''
            DELETE FROM "{rs_schema}"."{rs_table}{rs_suffix}"
            USING "{rs_schema}"."{rs_table}"
            WHERE "{rs_schema}"."{rs_table}{rs_suffix}"."{rs_pk}" =
                "{rs_schema}"."{rs_table}"."{rs_pk}"
            AND "{rs_schema}"."{rs_table}"."{rs_ik}" >=
                "{rs_schema}"."{rs_table}{rs_suffix}"."{rs_ik}"
            '''.format(rs_schema=self.redshift_schema,
                       rs_table=self.table,
                       rs_pk=self.primary_key,
                       rs_suffix=self.temp_suffix,
                       rs_ik=self.incremental_key)

        append_sql = \
            '''
            ALTER TABLE "{0}"."{1}"
            APPEND FROM "{0}"."{1}{2}"
            FILLTARGET
            '''.format(self.redshift_schema, self.table, self.temp_suffix)

        drop_sql = \
            '''
            DROP TABLE IF EXISTS "{0}"."{1}"
            '''.format(self.redshift_schema, self.table)

        drop_temp_sql = \
            '''
            DROP TABLE IF EXISTS "{0}"."{1}{2}"
            '''.format(self.redshift_schema, self.table, self.temp_suffix)

        truncate_sql = \
            '''
            TRUNCATE TABLE "{0}"."{1}"
            '''.format(self.redshift_schema, self.table)

        params = '\n'.join(self.copy_params)

        # Example params for loading JSON from the US-East-1 S3 region
        # params = ["COMPUPDATE OFF",
        #           "STATUPDATE OFF",
        #           "JSON 'auto'",
        #           "TIMEFORMAT 'auto'",
        #           "TRUNCATECOLUMNS",
        #           "region as 'us-east-1'"]

        base_sql = \
            """
            FROM 's3://{0}/{1}'
            CREDENTIALS '{2}'
            {3};
            """.format(self.s3_bucket,
                       self.s3_key,
                       get_s3_creds(),
                       params)

        load_sql = '''COPY "{0}"."{1}" {2}'''.format(self.redshift_schema,
                                                     self.table,
                                                     base_sql)
        if self.load_type == 'append':
            pg_hook.run(load_sql)
        elif self.load_type == 'rebuild':
            pg_hook.run(drop_sql)
            self.create_if_not_exists(schema, pg_hook)
            pg_hook.run(load_sql)
        elif self.load_type == 'truncate':
            pg_hook.run(truncate_sql)
            pg_hook.run(load_sql)
        elif self.load_type == 'upsert':
            # Load into a uniquely-suffixed staging table, reconcile it
            # against the destination, then append the remaining rows.
            self.create_if_not_exists(schema, pg_hook, temp=True)
            load_temp_sql = \
                '''COPY "{0}"."{1}{2}" {3}'''.format(self.redshift_schema,
                                                     self.table,
                                                     self.temp_suffix,
                                                     base_sql)
            pg_hook.run(load_temp_sql)
            pg_hook.run(delete_sql)
            pg_hook.run(delete_confirm_sql)
            pg_hook.run(append_sql, autocommit=True)
            pg_hook.run(drop_temp_sql)

    def create_if_not_exists(self, schema, pg_hook, temp=False):
        output = ''
        for item in schema:
            k = "{quote}{key}{quote}".format(quote='"', key=item['name'])
            field = ' '.join([k, item['type']])
            if isinstance(self.sortkey, str) and self.sortkey == item['name']:
                field += ' sortkey'
            output += field
            output += ', '
        # Remove last comma and space after schema items loop ends
        output = output[:-2]
        if temp:
            copy_table = '{0}{1}'.format(self.table, self.temp_suffix)
        else:
            copy_table = self.table
        create_schema_query = \
            '''
            CREATE SCHEMA IF NOT EXISTS "{0}";
            '''.format(self.redshift_schema)

        pk = ''
        fk = ''
        dk = ''
        sk = ''

        if self.primary_key:
            pk = ', primary key("{0}")'.format(self.primary_key)
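
        # NOTE: Redshift does not enforce primary or foreign key constraints;
        # as the docstring notes, they are informational only and are used
        # by the query planner.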
        if self.foreign_key:
            if isinstance(self.foreign_key, list):
                fk = ', '
                for i, e in enumerate(self.foreign_key):
                    fk += 'foreign key("{0}") references {1}("{2}")'.format(e['column_name'],
                                                                            e['reftable'],
                                                                            e['ref_column'])
                    if i != (len(self.foreign_key) - 1):
                        fk += ', '
            elif isinstance(self.foreign_key, dict):
                fk += ', '
                fk += 'foreign key("{0}") references {1}("{2}")'.format(self.foreign_key['column_name'],
                                                                        self.foreign_key['reftable'],
                                                                        self.foreign_key['ref_column'])
        if self.distkey:
            dk = 'distkey({})'.format(self.distkey)

        if self.sortkey and isinstance(self.sortkey, list):
            sk = '{0} sortkey({1})'.format(self.sort_type,
                                           ', '.join(map(str, self.sortkey)))

        create_table_query = \
            '''
            CREATE TABLE IF NOT EXISTS "{schema}"."{table}"
            ({fields}{primary_key}{foreign_key}) {distkey} {sortkey}
            '''.format(schema=self.redshift_schema,
                       table=copy_table,
                       fields=output,
                       primary_key=pk,
                       foreign_key=fk,
                       distkey=dk,
                       sortkey=sk)

        pg_hook.run([create_schema_query, create_table_query])

--------------------------------------------------------------------------------
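
For reference, a minimal sketch of how the operator might be wired into a
DAG, assuming the plugin directory is importable as `redshift_plugin` (the
import path used in the plugin's own __init__.py). The DAG id, connection
ids, bucket, keys, and column names below are hypothetical placeholders.

from datetime import datetime

from airflow import DAG
from redshift_plugin.operators.s3_to_redshift_operator import S3ToRedshiftOperator

dag = DAG('s3_to_redshift_example',            # hypothetical DAG id
          start_date=datetime(2017, 1, 1),
          schedule_interval='@daily')

copy_users = S3ToRedshiftOperator(
    task_id='copy_users',
    s3_conn_id='my_s3_conn',                   # hypothetical connection id
    s3_bucket='my-bucket',                     # hypothetical bucket
    s3_key='users/{{ ds }}/users.json',        # templated: s3_key is a template field
    origin_schema='users/schema.json',         # S3 key of the column schema file
    schema_location='s3',
    redshift_conn_id='my_redshift_conn',       # hypothetical connection id
    redshift_schema='public',
    table='users',
    copy_params=["JSON 'auto'", "TIMEFORMAT 'auto'", "TRUNCATECOLUMNS"],
    load_type='upsert',
    primary_key='id',
    incremental_key='updated_at',
    dag=dag)

With load_type='upsert', this exercises the staging-table path: the operator
COPYs into a uniquely-suffixed temp table, deletes stale rows on
primary_key/incremental_key in both directions, appends the remainder to the
destination, and drops the temp table.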