├── dataduct ├── utils │ ├── __init__.py │ ├── tests │ │ └── __init__.py │ ├── exceptions.py │ ├── decorators.py │ ├── constants.py │ └── hook.py ├── config │ ├── tests │ │ ├── __init__.py │ │ ├── test_config_actions.py │ │ ├── test_credentials.py │ │ └── test_config.py │ ├── __init__.py │ ├── constants.py │ ├── example_config │ ├── config_actions.py │ ├── logger_config.py │ └── credentials.py ├── etl │ ├── tests │ │ ├── __init__.py │ │ ├── test_etl_pipeline.py │ │ └── test_etl_actions.py │ ├── __init__.py │ └── utils.py ├── tests │ ├── __init__.py │ └── test_import.py ├── data_access │ ├── tests │ │ ├── __init__.py │ │ └── test_connection.py │ ├── __init__.py │ └── open_shell.py ├── database │ ├── tests │ │ ├── __init__.py │ │ ├── helpers.py │ │ └── test_table.py │ ├── parsers │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_create_view.py │ │ │ ├── test_create_table.py │ │ │ └── test_select_query.py │ │ ├── __init__.py │ │ ├── helpers.py │ │ ├── create_view.py │ │ ├── utils.py │ │ └── transform.py │ ├── sql │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_sql_utils.py │ │ │ └── test_sql_statement.py │ │ ├── __init__.py │ │ ├── transaction.py │ │ ├── utils.py │ │ └── sql_statement.py │ ├── __init__.py │ ├── select_statement.py │ ├── view.py │ ├── column.py │ └── relation.py ├── steps │ ├── executors │ │ ├── __init__.py │ │ ├── primary_key_check.py │ │ └── count_check.py │ ├── __init__.py │ ├── reload.py │ ├── delta_load.py │ ├── extract_local.py │ ├── primary_key_check.py │ ├── emr_job.py │ ├── extract_s3.py │ ├── upsert.py │ ├── qa_transform.py │ ├── create_load_redshift.py │ ├── extract_redshift.py │ ├── create_update_sql.py │ ├── load_postgres.py │ ├── sql_command.py │ ├── load_redshift.py │ └── pipeline_dependencies.py ├── __init__.py ├── s3 │ ├── __init__.py │ ├── s3_log_path.py │ └── s3_directory.py ├── qa │ ├── __init__.py │ ├── utils.py │ ├── primary_key_check.py │ ├── count_check.py │ └── column_check.py └── pipeline │ ├── __init__.py │ ├── precondition.py │ ├── postgres_node.py │ ├── default_object.py │ ├── postgres_database.py │ ├── redshift_node.py │ ├── sns_alarm.py │ ├── mysql_node.py │ ├── redshift_database.py │ ├── copy_activity.py │ ├── emr_activity.py │ ├── activity.py │ ├── sql_activity.py │ ├── ec2_resource.py │ ├── shell_command_activity.py │ └── redshift_copy_activity.py ├── examples ├── README.md ├── resources │ ├── data │ │ ├── test_table2.tsv │ │ └── test_table1.tsv │ ├── tables │ │ ├── dev.test_table.sql │ │ ├── dev.test_table_2.sql │ │ ├── shippers.sql │ │ ├── categories.sql │ │ ├── employees.sql │ │ ├── order_details.sql │ │ ├── orders.sql │ │ ├── customers.sql │ │ ├── products.sql │ │ └── suppliers.sql │ └── scripts │ │ ├── word_mapper.py │ │ └── word_reducer.py ├── example_failed_pipeline.yaml ├── example_sql_command.yaml ├── example_extract_redshift.yaml ├── example_extract_s3.yaml ├── example_extract_local.yaml ├── example_custom_extract_local.yaml ├── example_extract_postgres.yaml ├── example_load_redshift.yaml ├── example_create_and_load_redshift.yaml ├── example_load_postgres.yaml ├── example_count_check.yaml ├── example_primary_key_check.yaml ├── example_load_reload_pk.yaml ├── example_reload.yaml ├── example_upsert.yaml ├── example_column_check.yaml ├── example_pipeline_dependency.yaml ├── example_create_update_sql.yaml ├── example_extract_rds.yaml ├── example_bootstrap.yaml ├── example_emr_streaming.yaml ├── example_double_input.yaml ├── steps │ └── custom_extract_local.py ├── example_transform.yaml └── example_double_output.yaml ├── 
read_the_docs.txt ├── docs ├── modules.rst ├── README.md ├── dataduct.rst ├── dataduct.tests.rst ├── dataduct.data_access.rst ├── dataduct.config.tests.rst ├── dataduct.etl.tests.rst ├── dataduct.database.tests.rst ├── dataduct.etl.rst ├── dataduct.database.sql.tests.rst ├── dataduct.s3.rst ├── dataduct.qa.rst ├── dataduct.utils.rst ├── dataduct.database.sql.rst ├── dataduct.config.rst ├── dataduct.database.parsers.tests.rst ├── index.rst ├── dataduct.database.parsers.rst ├── introduction.rst ├── dataduct.database.rst ├── installation.rst ├── hooks.rst └── creating_an_etl.rst ├── MANIFEST.in ├── bin └── README.md ├── requirements.txt ├── .gitignore ├── LICENSE.md ├── .travis.yml ├── README.rst ├── MANIFEST ├── setup.py ├── CONTRIBUTING.md └── CHANGES.md /dataduct/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/config/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/etl/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dataduct/utils/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/data_access/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/database/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/steps/executors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | #### Examples 2 | -------------------------------------------------------------------------------- /read_the_docs.txt: -------------------------------------------------------------------------------- 1 | Sphinx>=1.3.1 2 | -------------------------------------------------------------------------------- /dataduct/database/parsers/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/database/sql/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/resources/data/test_table2.tsv: -------------------------------------------------------------------------------- 1 | 2 this is another row (with ID=2) 2 | -------------------------------------------------------------------------------- /examples/resources/data/test_table1.tsv: 
-------------------------------------------------------------------------------- 1 | 1 thisis a roooow 2 | 3 3 | 4 NULL 4 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | dataduct 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | dataduct 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.rst 4 | include *.py 5 | recursive-include bin * 6 | -------------------------------------------------------------------------------- /dataduct/database/sql/__init__.py: -------------------------------------------------------------------------------- 1 | from .sql_statement import SqlStatement 2 | from .sql_script import SqlScript 3 | -------------------------------------------------------------------------------- /bin/README.md: -------------------------------------------------------------------------------- 1 | #### Bin 2 | 3 | Folder contains scripts to be added to the path variable of the user for command line access. 4 | -------------------------------------------------------------------------------- /dataduct/__init__.py: -------------------------------------------------------------------------------- 1 | """Welcome to DataDuct 2 | """ 3 | __version__ = '0.5.0' 4 | __import__('pkg_resources').declare_namespace(__name__) 5 | -------------------------------------------------------------------------------- /examples/resources/tables/dev.test_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE dev.test_table( 2 | id INTEGER PRIMARY KEY, 3 | description VARCHAR(255) 4 | ); 5 | -------------------------------------------------------------------------------- /examples/resources/tables/dev.test_table_2.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE dev.test_table_2( 2 | id INTEGER PRIMARY KEY, 3 | description VARCHAR(255) 4 | ); 5 | -------------------------------------------------------------------------------- /dataduct/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | from .logger_config import logger_configuration 3 | from .credentials import get_aws_credentials 4 | -------------------------------------------------------------------------------- /dataduct/s3/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3_file import S3File 2 | from .s3_path import S3Path 3 | from .s3_directory import S3Directory 4 | from .s3_log_path import S3LogPath 5 | -------------------------------------------------------------------------------- /dataduct/qa/__init__.py: -------------------------------------------------------------------------------- 1 | from .check import Check 2 | from .count_check import CountCheck 3 | from .column_check import ColumnCheck 4 | from .primary_key_check import PrimaryKeyCheck 5 | -------------------------------------------------------------------------------- /examples/resources/tables/shippers.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE shippers ( 2 | shipper_id INTEGER DISTKEY PRIMARY KEY 3 | ,shipper_name VARCHAR(200) 4 | ,phone VARCHAR(20) 5 | ) 
SORTKEY(shipper_id); 6 | -------------------------------------------------------------------------------- /dataduct/config/constants.py: -------------------------------------------------------------------------------- 1 | """Constants shared across the config package 2 | """ 3 | 4 | CONFIG_STR = 'config' 5 | CONFIG_DIR = '.dataduct' 6 | CFG_FILE = 'dataduct.cfg' 7 | LOG_FILE = 'dataduct.log' 8 | -------------------------------------------------------------------------------- /dataduct/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | """Exceptions for dataduct 2 | """ 3 | 4 | class ETLInputError(Exception): pass 5 | 6 | class ETLConfigError(Exception): pass 7 | 8 | class DatabaseInputError(Exception): pass 9 | -------------------------------------------------------------------------------- /examples/resources/tables/categories.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE categories ( 2 | category_id INTEGER DISTKEY PRIMARY KEY 3 | ,category_name VARCHAR(100) 4 | ,description VARCHAR(2000) 5 | ) SORTKEY(category_id); 6 | -------------------------------------------------------------------------------- /examples/example_failed_pipeline.yaml: -------------------------------------------------------------------------------- 1 | name: example_failed_pipeline 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | steps: 6 | - step_type: transform 7 | name: failure_step 8 | command: this is going to fail 9 | -------------------------------------------------------------------------------- /examples/resources/tables/employees.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE employees ( 2 | employee_id INTEGER DISTKEY PRIMARY KEY 3 | ,last_name VARCHAR(100) 4 | ,first_name VARCHAR(100) 5 | ,birth_date DATE 6 | ,notes VARCHAR(2000) 7 | ) SORTKEY(employee_id); 8 | -------------------------------------------------------------------------------- /dataduct/etl/__init__.py: -------------------------------------------------------------------------------- 1 | from .etl_actions import activate_pipeline 2 | from .etl_actions import create_pipeline 3 | from .etl_actions import read_pipeline_definition 4 | from .etl_actions import validate_pipeline 5 | from .etl_actions import visualize_pipeline 6 | -------------------------------------------------------------------------------- /dataduct/qa/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared utility functions 3 | """ 4 | 5 | def render_output(data): 6 | """Print the formatted output for the list 7 | """ 8 | output = ['[Dataduct]: '] 9 | output.extend(data) 10 | return '\n'.join(output) 11 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | #### Documentation 2 | 3 | This is the base directory for all the docs. We use Sphinx and the sphinx 4 | napoleon extension for autogenerating docs for any library code. 5 | 6 | Running `make html` in the docs directory should create all the docs for you. 
7 | -------------------------------------------------------------------------------- /examples/example_sql_command.yaml: -------------------------------------------------------------------------------- 1 | name: example_sql_command 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the sql_command step 6 | 7 | steps: 8 | - step_type: sql-command 9 | command: SELECT * FROM dev.test_table; 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto>=2.34.0 2 | Sphinx>=1.2.3 3 | sphinx-rtd-theme>=0.1.6 4 | pandas>=0.14.1 5 | psycopg2==2.6.0 6 | MySQL-python 7 | PyYAML 8 | coverage 9 | pyparsing==1.5.6 10 | pygraphviz 11 | testfixtures>=4.1.1 12 | mock 13 | pytimeparse 14 | pyprind 15 | requests 16 | -------------------------------------------------------------------------------- /examples/resources/tables/order_details.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE order_details ( 2 | order_detail_id INTEGER DISTKEY PRIMARY KEY 3 | ,order_id INTEGER REFERENCES orders(order_id) 4 | ,product_id INTEGER REFERENCES products(product_id) 5 | ,quantity INTEGER 6 | ) SORTKEY(order_detail_id); 7 | -------------------------------------------------------------------------------- /dataduct/data_access/__init__.py: -------------------------------------------------------------------------------- 1 | from .connection import get_sql_config 2 | from .connection import rds_connection 3 | from .connection import get_redshift_config 4 | from .connection import redshift_connection 5 | from .connection import get_postgres_config 6 | from .connection import postgres_connection 7 | -------------------------------------------------------------------------------- /dataduct/database/__init__.py: -------------------------------------------------------------------------------- 1 | from .database import Database 2 | from .select_statement import SelectStatement 3 | from .sql import SqlScript 4 | from .sql import SqlStatement 5 | from .table import Table 6 | from .view import View 7 | from .history_table import HistoryTable 8 | from .column import Column 9 | -------------------------------------------------------------------------------- /examples/example_extract_redshift.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_redshift 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: This example extracts data out of redshift 6 | 7 | steps: 8 | - step_type: extract-redshift 9 | schema: dev 10 | table: categories 11 | -------------------------------------------------------------------------------- /examples/example_extract_s3.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_s3 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: This example creates an S3Node given a S3 Uri 6 | 7 | steps: 8 | - step_type: extract-s3 9 | file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py 10 | -------------------------------------------------------------------------------- /examples/example_extract_local.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_local 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: | 6 | 
This example uploads a local file to S3 with the extract-local step. 7 | 8 | steps: 9 | - step_type: extract-local 10 | path: data/test_table1.tsv 11 | -------------------------------------------------------------------------------- /examples/resources/tables/orders.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE orders ( 2 | order_id INTEGER DISTKEY PRIMARY KEY 3 | ,customer_id INTEGER REFERENCES customers(customer_id) 4 | ,employee_id INTEGER REFERENCES employees(employee_id) 5 | ,order_date DATE 6 | ,shipper_id INTEGER REFERENCES shippers(shipper_id) 7 | ) SORTKEY(order_id); 8 | -------------------------------------------------------------------------------- /examples/example_custom_extract_local.yaml: -------------------------------------------------------------------------------- 1 | name: example_custom_extract_local 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: | 6 | This example uploads a local file to S3 with the extract-local step. 7 | 8 | steps: 9 | - step_type: custom-extract-local 10 | path: data/test_table1.tsv 11 | -------------------------------------------------------------------------------- /examples/resources/tables/customers.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE customers ( 2 | customer_id INTEGER DISTKEY PRIMARY KEY 3 | ,customer_name VARCHAR(200) 4 | ,contact_name VARCHAR(200) 5 | ,address VARCHAR(200) 6 | ,city VARCHAR(100) 7 | ,postal_code VARCHAR(10) 8 | ,country VARCHAR(100) 9 | ) SORTKEY(customer_id); 10 | -------------------------------------------------------------------------------- /examples/resources/tables/products.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE products ( 2 | product_id INTEGER DISTKEY PRIMARY KEY 3 | ,product_name VARCHAR(200) 4 | ,supplier_id INTEGER REFERENCES suppliers(supplier_id) 5 | ,category_id INTEGER REFERENCES categories(category_id) 6 | ,unit VARCHAR(200) 7 | ,price REAL 8 | ) SORTKEY(product_id); 9 | -------------------------------------------------------------------------------- /examples/example_extract_postgres.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_postgres 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: This example extracts data out of postgres 6 | 7 | steps: 8 | - step_type: extract-postgres 9 | sql: "SELECT * from sometable" 10 | output_path: s3://somebucket/somedata.csv 11 | -------------------------------------------------------------------------------- /docs/dataduct.rst: -------------------------------------------------------------------------------- 1 | Code documentation 2 | ================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | dataduct.config 8 | dataduct.data_access 9 | dataduct.database 10 | dataduct.etl 11 | dataduct.pipeline 12 | dataduct.qa 13 | dataduct.s3 14 | dataduct.steps 15 | dataduct.tests 16 | dataduct.utils 17 | -------------------------------------------------------------------------------- /examples/example_load_redshift.yaml: -------------------------------------------------------------------------------- 1 | name: example_load_redshift 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the load_redshift step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: load-redshift 12 | schema: dev 13 | table: test_table 14 | -------------------------------------------------------------------------------- /examples/resources/tables/suppliers.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE suppliers ( 2 | supplier_id INTEGER DISTKEY PRIMARY KEY 3 | ,supplier_name VARCHAR(200) 4 | ,contact_name VARCHAR(200) 5 | ,address VARCHAR(200) 6 | ,city VARCHAR(100) 7 | ,postal_code VARCHAR(10) 8 | ,county VARCHAR(100) 9 | ,phone VARCHAR(20) 10 | ) SORTKEY(supplier_id); 11 | -------------------------------------------------------------------------------- /examples/example_create_and_load_redshift.yaml: -------------------------------------------------------------------------------- 1 | name: example_create_and_load_redshift 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the load_redshift step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: create-load-redshift 12 | table_definition: tables/dev.test_table.sql 13 | -------------------------------------------------------------------------------- /examples/example_load_postgres.yaml: -------------------------------------------------------------------------------- 1 | name: example_load_postgres 2 | frequency: one-time 3 | load_time: 00:01 # Hour:Min in UTC 4 | 5 | description: Example for the load_postgres step 6 | 7 | steps: 8 | - step_type: extract-s3 9 | file_uri: s3://somebucket/somedata.csv 10 | 11 | - step_type: load-postgres 12 | table: sometable 13 | insert_query: "INSERT INTO sometable (col1, col2, col3) VALUES (?,?,?);" 14 | -------------------------------------------------------------------------------- /examples/example_count_check.yaml: -------------------------------------------------------------------------------- 1 | name: example_count_check 2 | frequency: one-time 3 | load_time: 01:00 4 | 5 | description: Example for the count-check step 6 | 7 | steps: 8 | - step_type: count-check 9 | source_sql: "SELECT id, name FROM networks_network;" 10 | source_host: maestro 11 | destination_sql: "SELECT network_id, network_name FROM prod.networks" 12 | tolerance: 2.0 13 | log_to_s3: true 14 | -------------------------------------------------------------------------------- /examples/example_primary_key_check.yaml: -------------------------------------------------------------------------------- 1 | name: example_primary_key_check 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the primary-key-check step 6 | 7 | steps: 8 | - step_type: primary-key-check 9 | table_definition: tables/dev.test_table.sql 10 | log_to_s3: true 11 | script_arguments: 12 | - "--path_suffix=#{format(@scheduledStartTime, 'YYYY-MM-dd')}" 13 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Setuptools distribution folder. 5 | /dist/ 6 | 7 | # Docs build folder 8 | /docs/_build 9 | 10 | # Build directory 11 | /build/ 12 | 13 | # Python egg metadata, regenerated from source files by setuptools. 14 | /*.egg-info 15 | /*.egg 16 | 17 | # Images created should be checked in manually 18 | *.png 19 | 20 | .coverage 21 | 22 | # pycharm or intellij 23 | .idea/ 24 | 25 | .DS_Store 26 | -------------------------------------------------------------------------------- /examples/example_load_reload_pk.yaml: -------------------------------------------------------------------------------- 1 | name: example_load_reload_primary_key_check 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the load-reload-pk step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: load-reload-pk 12 | staging_table_definition: tables/dev.test_table.sql 13 | production_table_definition: tables/dev.test_table_2.sql 14 | -------------------------------------------------------------------------------- /examples/example_reload.yaml: -------------------------------------------------------------------------------- 1 | name: example_reload 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the reload step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: create-load-redshift 12 | table_definition: tables/dev.test_table.sql 13 | 14 | - step_type: reload 15 | source: tables/dev.test_table.sql 16 | destination: tables/dev.test_table_2.sql 17 | -------------------------------------------------------------------------------- /examples/example_upsert.yaml: -------------------------------------------------------------------------------- 1 | name: example_upsert 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the upsert step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: create-load-redshift 12 | table_definition: tables/dev.test_table.sql 13 | 14 | - step_type: upsert 15 | source: tables/dev.test_table.sql 16 | destination: tables/dev.test_table_2.sql 17 | -------------------------------------------------------------------------------- /docs/dataduct.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.tests package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.tests.test_import module 8 | --------------------------------- 9 | 10 | .. automodule:: dataduct.tests.test_import 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: dataduct.tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /examples/example_column_check.yaml: -------------------------------------------------------------------------------- 1 | name: example_column_check 2 | frequency: one-time 3 | load_time: 01:00 4 | 5 | description: Example for the column-check step 6 | 7 | steps: 8 | - step_type: column-check 9 | source_sql: "SELECT id, name FROM networks_network;" 10 | source_host: maestro 11 | destination_sql: "SELECT network_id, network_name FROM prod.networks" 12 | sql_tail_for_source: "ORDER BY RAND() LIMIT LIMIT_PLACEHOLDER" 13 | sample_size: 10 14 | log_to_s3: true 15 | -------------------------------------------------------------------------------- /docs/dataduct.data_access.rst: -------------------------------------------------------------------------------- 1 | dataduct.data_access package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.data_access.connection module 8 | -------------------------------------- 9 | 10 | .. automodule:: dataduct.data_access.connection 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: dataduct.data_access 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /examples/example_pipeline_dependency.yaml: -------------------------------------------------------------------------------- 1 | name: example_pipeline_dependency 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | steps: 6 | - step_type: pipeline-dependencies 7 | name: dependency_step 8 | refresh_rate: 60 9 | dependent_pipelines: 10 | - example_transform 11 | dependent_pipelines_ok_to_fail: 12 | - example_failed_pipeline 13 | 14 | - step_type: transform 15 | depends_on: dependency_step 16 | command: whoami >> $OUTPUT1_STAGING_DIR/output.txt 17 | -------------------------------------------------------------------------------- /examples/example_create_update_sql.yaml: -------------------------------------------------------------------------------- 1 | name: example_create_update_sql 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the create-update-sql step 6 | 7 | steps: 8 | - step_type: create-update-sql 9 | command: | 10 | DELETE FROM dev.test_table WHERE id < 0; 11 | INSERT INTO dev.test_table 12 | SELECT * FROM dev.test_table_2 13 | WHERE id < %s; 14 | table_definition: tables/dev.test_table.sql 15 | script_arguments: 16 | - 4 17 | -------------------------------------------------------------------------------- /examples/example_extract_rds.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_rds 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: | 6 | This example extracts data from mysql to S3 with the extract-rds step. 
7 | 8 | steps: 9 | - step_type: extract-rds 10 | host_name: maestro 11 | database: maestro 12 | table: specializations_specialization 13 | 14 | - step_type: extract-rds 15 | host_name: maestro 16 | database: maestro 17 | sql: | 18 | SELECT * 19 | FROM networks_network; 20 | -------------------------------------------------------------------------------- /docs/dataduct.config.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.config.tests package 2 | ============================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.config.tests.test_credentials module 8 | --------------------------------------------- 9 | 10 | .. automodule:: dataduct.config.tests.test_credentials 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: dataduct.config.tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /dataduct/database/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .transform import remove_comments 2 | from .transform import remove_empty_statements 3 | from .transform import remove_transactional 4 | from .transform import split_statements 5 | from .transform import remove_newlines 6 | 7 | from .select_query import parse_select_dependencies 8 | from .select_query import parse_select_columns 9 | from .select_query import parse_column_name 10 | 11 | from .create_table import parse_create_table 12 | from .create_table import create_exists_clone 13 | from .create_view import parse_create_view 14 | -------------------------------------------------------------------------------- /examples/example_bootstrap.yaml: -------------------------------------------------------------------------------- 1 | name: example_bootstrap 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the transform step 6 | 7 | bootstrap: 8 | ec2: 9 | - step_type: transform 10 | input_node: [] 11 | command: pip install git+https://github.com/coursera/dataduct.git >> ${OUTPUT1_STAGING_DIR}/output.txt 12 | name: bootstrap_override 13 | 14 | steps: 15 | - step_type: transform 16 | input_node: [] 17 | command: python -c "import dataduct" >> ${OUTPUT1_STAGING_DIR}/output.txt 18 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright [2014] [Coursera] 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /dataduct/utils/decorators.py: -------------------------------------------------------------------------------- 1 | """Common decorator utilities 2 | """ 3 | 4 | from datetime import datetime 5 | 6 | 7 | def timeit(method): 8 | """Timing decorator for measuring performance of functions 9 | """ 10 | 11 | def timed(*args, **kw): 12 | ts = datetime.now() 13 | print 'Starting time for Method %r is %s' % (method.__name__, ts) 14 | 15 | result = method(*args, **kw) 16 | te = datetime.now() 17 | print 'End time for Method %r is %s' % (method.__name__, te) 18 | 19 | print 'Method %r took %s time' % (method.__name__, te - ts) 20 | return result 21 | 22 | return timed 23 | -------------------------------------------------------------------------------- /dataduct/database/sql/transaction.py: -------------------------------------------------------------------------------- 1 | """SQL Statements used in transactions 2 | """ 3 | 4 | from .sql_statement import SqlStatement 5 | 6 | 7 | class BeginStatement(SqlStatement): 8 | """Class representing begin sql statement 9 | """ 10 | def __init__(self): 11 | """Constructor for begin class 12 | """ 13 | super(BeginStatement, self).__init__('BEGIN', True) 14 | 15 | 16 | class CommitStatement(SqlStatement): 17 | """Class representing Commit sql statement 18 | """ 19 | def __init__(self): 20 | """Constructor for Commit class 21 | """ 22 | super(CommitStatement, self).__init__('COMMIT', True) 23 | -------------------------------------------------------------------------------- /examples/example_emr_streaming.yaml: -------------------------------------------------------------------------------- 1 | name: example_emr_streaming 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | emr_cluster_config: 5 | num_instances: 1 6 | instance_size: m1.large 7 | ami_version: 3.3.1 8 | 9 | description: Example for the emr_streaming step 10 | 11 | steps: 12 | - step_type: extract-local 13 | path: data/word_data.txt 14 | 15 | - step_type: emr-streaming 16 | mapper: scripts/word_mapper.py 17 | reducer: scripts/word_reducer.py 18 | 19 | - step_type: transform 20 | script: scripts/s3_profiler.py 21 | script_arguments: 22 | - --input=INPUT1_STAGING_DIR 23 | - --output=OUTPUT1_STAGING_DIR 24 | - -f 25 | -------------------------------------------------------------------------------- /examples/example_double_input.yaml: -------------------------------------------------------------------------------- 1 | name: example_double_input 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the transform step with multiple inputs 6 | 7 | steps: 8 | - step_type: extract-local 9 | name: step1 10 | path: data/test_table1.tsv 11 | 12 | - step_type: extract-local 13 | name: step2 14 | path: data/test_table2.tsv 15 | 16 | - step_type: transform 17 | script: scripts/s3_profiler.py 18 | input_node: 19 | step1: script 20 | step2: directory 21 | script_arguments: 22 | - --input=INPUT1_STAGING_DIR 23 | - --output=OUTPUT1_STAGING_DIR 24 | - script/ 25 | - directory/ 26 | -------------------------------------------------------------------------------- /examples/steps/custom_extract_local.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for creating an S3 node for input from local files 3 | """ 4 | from dataduct.steps import ExtractLocalStep 5 | import logging 6 | logger = logging.getLogger(__name__) 7 | 8 
| 9 | class CustomExtractLocalStep(ExtractLocalStep): 10 | """CustomExtractLocal Step class that helps get data from a local file 11 | """ 12 | 13 | def __init__(self, **kwargs): 14 | """Constructor for the CustomExtractLocal class 15 | 16 | Args: 17 | **kwargs(optional): Keyword arguments directly passed to base class 18 | """ 19 | logger.info('Using the Custom Extract Local Step') 20 | super(CustomExtractLocalStep, self).__init__(**kwargs) 21 | -------------------------------------------------------------------------------- /docs/dataduct.etl.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.etl.tests package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.etl.tests.test_etl_actions module 8 | ------------------------------------------ 9 | 10 | .. automodule:: dataduct.etl.tests.test_etl_actions 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.etl.tests.test_etl_pipeline module 16 | ------------------------------------------- 17 | 18 | .. automodule:: dataduct.etl.tests.test_etl_pipeline 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: dataduct.etl.tests 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /dataduct/database/tests/helpers.py: -------------------------------------------------------------------------------- 1 | """Helpers for Database Tests 2 | """ 3 | from nose.tools import eq_ 4 | 5 | from ..table import Table 6 | from ..view import View 7 | from ..sql import SqlScript 8 | 9 | 10 | def create_table(sql): 11 | """Creates a table object from a SQL string 12 | """ 13 | return Table(SqlScript(sql)) 14 | 15 | 16 | def create_view(sql): 17 | """Creates a view object from a SQL string 18 | """ 19 | return View(SqlScript(sql)) 20 | 21 | 22 | def compare_scripts(actual_script, expected_script): 23 | """Validates a SqlScript chain 24 | """ 25 | assert len(actual_script) == len(expected_script) 26 | for actual, expected in zip(actual_script, expected_script): 27 | eq_(actual.sql(), expected) 28 | -------------------------------------------------------------------------------- /docs/dataduct.database.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.tests package 2 | =============================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.database.tests.test_database module 8 | -------------------------------------------- 9 | 10 | .. automodule:: dataduct.database.tests.test_database 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.database.tests.test_history_table module 16 | ------------------------------------------------- 17 | 18 | .. automodule:: dataduct.database.tests.test_history_table 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. 
automodule:: dataduct.database.tests 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /dataduct/database/parsers/tests/test_create_view.py: -------------------------------------------------------------------------------- 1 | """Tests for create view parser 2 | """ 3 | 4 | from unittest import TestCase 5 | from nose.tools import eq_ 6 | from ..create_view import parse_create_view 7 | 8 | 9 | class TestCreateViewStatement(TestCase): 10 | """Tests for create view 11 | """ 12 | @staticmethod 13 | def test_basic(): 14 | """Basic test for create view 15 | """ 16 | query = 'CREATE VIEW orders AS (' + \ 17 | 'SELECT x, y, z from xyz_table)' 18 | 19 | full_name = 'orders' 20 | replace = False 21 | 22 | output = parse_create_view(query) 23 | 24 | eq_(output['view_name'], full_name) 25 | eq_(output['replace'], replace) 26 | eq_(output['select_statement'], 'SELECT x, y, z from xyz_table') 27 | -------------------------------------------------------------------------------- /examples/example_transform.yaml: -------------------------------------------------------------------------------- 1 | name: example_transform 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | ec2_resource_config: 5 | instance_type: m1.small 6 | 7 | description: | 8 | Example for the transform step, uses an m1.small instance instead of 9 | the default 10 | 11 | steps: 12 | - step_type: extract-local 13 | name: extract-node 14 | path: data/test_table1.tsv 15 | 16 | - step_type: transform 17 | input_node: extract-node 18 | script: scripts/s3_profiler.py 19 | script_arguments: 20 | - --input=INPUT1_STAGING_DIR 21 | - --output=OUTPUT1_STAGING_DIR 22 | 23 | - step_type: transform 24 | input_node: extract-node 25 | script_directory: scripts/ 26 | script_name: s3_profiler.py 27 | script_arguments: 28 | - --input=INPUT1_STAGING_DIR 29 | - --output=OUTPUT1_STAGING_DIR 30 | -------------------------------------------------------------------------------- /dataduct/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .activity import Activity 2 | from .copy_activity import CopyActivity 3 | from .data_pipeline import DataPipeline 4 | from .default_object import DefaultObject 5 | from .ec2_resource import Ec2Resource 6 | from .emr_resource import EmrResource 7 | from .emr_activity import EmrActivity 8 | from .mysql_node import MysqlNode 9 | from .postgres_node import PostgresNode 10 | from .postgres_database import PostgresDatabase 11 | from .pipeline_object import PipelineObject 12 | from .precondition import Precondition 13 | from .redshift_copy_activity import RedshiftCopyActivity 14 | from .redshift_node import RedshiftNode 15 | from .redshift_database import RedshiftDatabase 16 | from .s3_node import S3Node 17 | from .schedule import Schedule 18 | from .shell_command_activity import ShellCommandActivity 19 | from .sns_alarm import SNSAlarm 20 | from .sql_activity import SqlActivity 21 | -------------------------------------------------------------------------------- /dataduct/config/tests/test_config_actions.py: -------------------------------------------------------------------------------- 1 | """Tests that the config actions are working properly 2 | """ 3 | from unittest import TestCase 4 | from nose.tools import eq_ 5 | 6 | from .. 
import config_actions 7 | from ..config import Config 8 | 9 | 10 | class TestConfigActions(TestCase): 11 | """Tests for config actions 12 | """ 13 | @staticmethod 14 | def test_s3_config_path(): 15 | """Tests that s3_config_path correctly returns the S3 base path 16 | """ 17 | config = Config() 18 | config.etl['S3_BASE_PATH'] = 'test/path' 19 | config.etl['S3_ETL_BUCKET'] = 'test_bucket' 20 | config_actions.CONFIG_STR = 'test_config_str' 21 | config_actions.CFG_FILE = 'test_cfg_file.cfg' 22 | result = config_actions.s3_config_path() 23 | eq_(result.bucket, 'test_bucket') 24 | eq_(result.key, 'test/path/test_config_str/test_cfg_file.cfg') 25 | -------------------------------------------------------------------------------- /docs/dataduct.etl.rst: -------------------------------------------------------------------------------- 1 | dataduct.etl package 2 | ==================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.etl.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | dataduct.etl.etl_actions module 15 | ------------------------------- 16 | 17 | .. automodule:: dataduct.etl.etl_actions 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | dataduct.etl.etl_pipeline module 23 | -------------------------------- 24 | 25 | .. automodule:: dataduct.etl.etl_pipeline 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | dataduct.etl.utils module 31 | ------------------------- 32 | 33 | .. automodule:: dataduct.etl.utils 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: dataduct.etl 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /dataduct/config/example_config: -------------------------------------------------------------------------------- 1 | # Constants that are used across the dataduct library 2 | 3 | ec2: 4 | INSTANCE_TYPE: m1.large 5 | ETL_AMI: ami-05355a6c # Default AMI used by data pipeline 6 | SECURITY_GROUP: FILL_ME_IN 7 | 8 | emr: 9 | MASTER_INSTANCE_TYPE: m1.large 10 | NUM_CORE_INSTANCES: 1 11 | CORE_INSTANCE_TYPE: m1.large 12 | CLUSTER_AMI: 3.7.0 13 | 14 | etl: 15 | S3_ETL_BUCKET: FILL_ME_IN 16 | ROLE: FILL_ME_IN 17 | RESOURCE_ROLE: FILL_ME_IN 18 | 19 | postgres: 20 | DATABASE_NAME: FILL_ME_IN 21 | RDS_INSTANCE_ID: FILL_ME_IN 22 | USERNAME: FILL_ME_IN 23 | PASSWORD: FILL_ME_IN 24 | REGION: FILL_ME_IN 25 | 26 | mysql: 27 | DATABASE: 28 | HOST: FILL_ME_IN 29 | PASSWORD: FILL_ME_IN 30 | USERNAME: FILL_ME_IN 31 | 32 | redshift: 33 | CLUSTER_ID: FILL_ME_IN 34 | DATABASE_NAME: FILL_ME_IN 35 | HOST: FILL_ME_IN 36 | PASSWORD: FILL_ME_IN 37 | USERNAME: FILL_ME_IN 38 | PORT: 5439 39 | -------------------------------------------------------------------------------- /examples/resources/scripts/word_mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple mapper for word count example""" 3 | 4 | import sys 5 | 6 | def read_input(file): 7 | """Reads the stdin line by line 8 | """ 9 | for line in file: 10 | # split the line into words 11 | yield line.split() 12 | 13 | def main(separator='\t'): 14 | """Read the data and split the lines and emit the words 15 | Args: 16 | separator(str): Separator to be used between key and value 17 | """ 18 | # input comes from STDIN (standard input) 19 | data = read_input(sys.stdin) 20 | for words in data: 21 | # write the results to STDOUT 
(standard output); 22 | # what we output here will be the input for the 23 | # Reduce step, i.e. the input for reducer.py 24 | # 25 | # tab-delimited; the trivial word count is 1 26 | for word in words: 27 | print '%s%s%d' % (word, separator, 1) 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /docs/dataduct.database.sql.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.sql.tests package 2 | =================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.database.sql.tests.test_sql_script module 8 | -------------------------------------------------- 9 | 10 | .. automodule:: dataduct.database.sql.tests.test_sql_script 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.database.sql.tests.test_sql_statement module 16 | ----------------------------------------------------- 17 | 18 | .. automodule:: dataduct.database.sql.tests.test_sql_statement 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.database.sql.tests.test_sql_utils module 24 | ------------------------------------------------- 25 | 26 | .. automodule:: dataduct.database.sql.tests.test_sql_utils 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: dataduct.database.sql.tests 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /dataduct/steps/__init__.py: -------------------------------------------------------------------------------- 1 | from .column_check import ColumnCheckStep 2 | from .count_check import CountCheckStep 3 | from .create_load_redshift import CreateAndLoadStep 4 | from .create_update_sql import CreateUpdateSqlStep 5 | from .delta_load import DeltaLoadStep 6 | from .emr_job import EMRJobStep 7 | from .emr_streaming import EMRStreamingStep 8 | from .etl_step import ETLStep 9 | from .extract_local import ExtractLocalStep 10 | from .extract_rds import ExtractRdsStep 11 | from .extract_redshift import ExtractRedshiftStep 12 | from .extract_postgres import ExtractPostgresStep 13 | from .extract_s3 import ExtractS3Step 14 | from .load_redshift import LoadRedshiftStep 15 | from .load_postgres import LoadPostgresStep 16 | from .load_reload_pk import LoadReloadAndPrimaryKeyStep 17 | from .pipeline_dependencies import PipelineDependenciesStep 18 | from .primary_key_check import PrimaryKeyCheckStep 19 | from .qa_transform import QATransformStep 20 | from .reload import ReloadStep 21 | from .sql_command import SqlCommandStep 22 | from .transform import TransformStep 23 | from .upsert import UpsertStep 24 | -------------------------------------------------------------------------------- /dataduct/config/config_actions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script that has action functions for config 3 | """ 4 | from .config import Config 5 | from ..s3 import S3Path 6 | from ..s3 import S3File 7 | 8 | from .constants import CONFIG_STR 9 | from .constants import CFG_FILE 10 | 11 | 12 | config = Config() 13 | 14 | def s3_config_path(): 15 | """S3 uri for the config files 16 | """ 17 | key = [config.etl.get('S3_BASE_PATH', ''), CONFIG_STR, CFG_FILE] 18 | return S3Path(bucket=config.etl['S3_ETL_BUCKET'], key=key) 19 | 20 | 21 | def sync_to_s3(): 22 | """Upload the config file 
to an S3 location 23 | """ 24 | s3_file = S3File(text=config.raw_config(), s3_path=s3_config_path()) 25 | s3_file.upload_to_s3() 26 | 27 | 28 | def sync_from_s3(filename): 29 | """Read the config file from S3 30 | """ 31 | s3_file = S3File(s3_path=s3_config_path()) 32 | text = s3_file.text 33 | 34 | if filename is None: 35 | raise ValueError('Filename for config sync must be provided') 36 | else: 37 | with open(filename, 'w') as op_file: 38 | op_file.write(text) 39 | -------------------------------------------------------------------------------- /dataduct/pipeline/precondition.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for the precondition step 3 | """ 4 | 5 | from .pipeline_object import PipelineObject 6 | 7 | 8 | class Precondition(PipelineObject): 9 | """Precondition object added to all pipelines 10 | """ 11 | 12 | def __init__(self, 13 | id, 14 | is_directory=True, 15 | **kwargs): 16 | """Constructor for the Precondition class 17 | 18 | Args: 19 | id(str): id of the precondition object 20 | is_directory(bool): if s3 path is a directory or not 21 | **kwargs(optional): Keyword arguments directly passed to base class 22 | """ 23 | 24 | if is_directory: 25 | super(Precondition, self).__init__( 26 | id=id, 27 | type='S3PrefixNotEmpty', 28 | s3Prefix="#{node.directoryPath}", 29 | ) 30 | else: 31 | super(Precondition, self).__init__( 32 | id=id, 33 | type='S3KeyExists', 34 | s3Prefix="#{node.filePath}", 35 | ) 36 | -------------------------------------------------------------------------------- /dataduct/steps/reload.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for Reload SQL script 2 | """ 3 | from .upsert import UpsertStep 4 | 5 | 6 | class ReloadStep(UpsertStep): 7 | """Reload Step class that helps run a step on the emr cluster 8 | """ 9 | 10 | def __init__(self, **kwargs): 11 | """Constructor for the ReloadStep class 12 | 13 | Args: 14 | **kwargs(optional): Keyword arguments directly passed to base class 15 | """ 16 | 17 | # Enforce PK by default. 
18 | if 'enforce_primary_key' not in kwargs: 19 | kwargs['enforce_primary_key'] = True 20 | super(ReloadStep, self).__init__(**kwargs) 21 | 22 | @classmethod 23 | def arguments_processor(cls, etl, input_args): 24 | """Parse the step arguments according to the ETL pipeline 25 | 26 | Args: 27 | etl(ETLPipeline): Pipeline object containing resources and steps 28 | step_args(dict): Dictionary of the step arguments for the class 29 | """ 30 | input_args['delete_existing'] = True 31 | return super(ReloadStep, cls).arguments_processor(etl, input_args) 32 | -------------------------------------------------------------------------------- /examples/example_double_output.yaml: -------------------------------------------------------------------------------- 1 | name: example_double_output 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the transform step with multiple outputs 6 | 7 | steps: 8 | - step_type: extract-local 9 | name: step1_a 10 | path: data/test_table1.tsv 11 | 12 | - step_type: extract-local 13 | name: step1_b 14 | path: data/test_table2.tsv 15 | 16 | - step_type: transform 17 | command: cp -r $INPUT1_STAGING_DIR/* $OUTPUT1_STAGING_DIR 18 | input_node: 19 | step1_a: step2_a 20 | step1_b: step2_b 21 | output_node: 22 | - step2_a 23 | - step2_b 24 | 25 | - step_type: transform 26 | name: profiler_1 27 | script: scripts/s3_profiler.py 28 | input_node: step2_a 29 | script_arguments: 30 | - --input=INPUT1_STAGING_DIR 31 | - --output=OUTPUT1_STAGING_DIR 32 | - -f 33 | 34 | - step_type: transform 35 | name: profiler_2 36 | script: scripts/s3_profiler.py 37 | input_node: step2_b 38 | script_arguments: 39 | - --input=INPUT1_STAGING_DIR 40 | - --output=OUTPUT1_STAGING_DIR 41 | - -f 42 | -------------------------------------------------------------------------------- /dataduct/database/select_statement.py: -------------------------------------------------------------------------------- 1 | """Script containing the SelectStatement object 2 | """ 3 | 4 | from .sql import SqlStatement 5 | from .column import Column 6 | from .parsers import parse_select_dependencies 7 | from .parsers import parse_select_columns 8 | from .parsers import parse_column_name 9 | 10 | 11 | class SelectStatement(SqlStatement): 12 | """Class representing SelectStatement from a sql_statement 13 | """ 14 | def __init__(self, sql): 15 | """Constructor for SelectStatement class 16 | """ 17 | super(SelectStatement, self).__init__(sql) 18 | 19 | self._dependencies = parse_select_dependencies(self.sql()) 20 | self._raw_columns = parse_select_columns(self.sql()) 21 | self._columns = [ 22 | Column(parse_column_name(c), None) for c in self._raw_columns] 23 | 24 | @property 25 | def dependencies(self): 26 | """Table dependencies of the select statement 27 | """ 28 | return self._dependencies 29 | 30 | def columns(self): 31 | """Table columns of the select statement 32 | """ 33 | return self._columns 34 | -------------------------------------------------------------------------------- /dataduct/qa/primary_key_check.py: -------------------------------------------------------------------------------- 1 | """QA test for checking duplicate primary keys inside redshift 2 | """ 3 | 4 | from .check import Check 5 | from .utils import render_output 6 | 7 | 8 | class PrimaryKeyCheck(Check): 9 | """QA test for checking duplicate primary keys inside redshift 10 | """ 11 | def __init__(self, duplicate_count=0, **kwargs): 12 | """Constructor for Primary Key Check 13 | 14 | Args: 15 | 
duplicate_count(int): Number of duplicates 16 | """ 17 | super(PrimaryKeyCheck, self).__init__(**kwargs) 18 | self.duplicate_count = duplicate_count 19 | 20 | @property 21 | def error_rate(self): 22 | """The error rate for the QA test 23 | """ 24 | return self.duplicate_count 25 | 26 | @property 27 | def summary(self): 28 | """Summary of the test results for the SNS message 29 | """ 30 | return render_output( 31 | [ 32 | 'Test Name: %s' % self.name, 33 | 'Success: %s' % self.success, 34 | 'Tolerance: %d' % self.tolerance, 35 | 'Error Rate: %d' % self.error_rate, 36 | ] 37 | ) 38 | -------------------------------------------------------------------------------- /docs/dataduct.s3.rst: -------------------------------------------------------------------------------- 1 | dataduct.s3 package 2 | =================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.s3.s3_directory module 8 | ------------------------------- 9 | 10 | .. automodule:: dataduct.s3.s3_directory 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.s3.s3_file module 16 | -------------------------- 17 | 18 | .. automodule:: dataduct.s3.s3_file 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.s3.s3_log_path module 24 | ------------------------------ 25 | 26 | .. automodule:: dataduct.s3.s3_log_path 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | dataduct.s3.s3_path module 32 | -------------------------- 33 | 34 | .. automodule:: dataduct.s3.s3_path 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | dataduct.s3.utils module 40 | ------------------------ 41 | 42 | .. automodule:: dataduct.s3.utils 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | 48 | Module contents 49 | --------------- 50 | 51 | .. automodule:: dataduct.s3 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | -------------------------------------------------------------------------------- /dataduct/database/sql/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared utility functions 3 | """ 4 | from ..parsers import remove_comments 5 | from ..parsers import remove_empty_statements 6 | from ..parsers import split_statements 7 | from ..parsers import remove_transactional 8 | from ..parsers import remove_newlines 9 | 10 | 11 | def balanced_parenthesis(statement): 12 | """Check if the SQL statement is balanced 13 | """ 14 | counter = 0 15 | for character in statement: 16 | if character == '(': 17 | counter += 1 18 | if character == ')': 19 | counter -= 1 20 | if counter < 0: 21 | return False 22 | return counter == 0 23 | 24 | 25 | def sanitize_sql(sql, keep_transaction=False): 26 | """Sanitize the sql string 27 | """ 28 | # remove comments 29 | string = remove_comments(sql) 30 | 31 | # remove transactionals 32 | if not keep_transaction: 33 | string = remove_transactional(string) 34 | 35 | # remove new lines 36 | string = remove_newlines(string) 37 | 38 | # remove empty statements 39 | string = remove_empty_statements(string) 40 | 41 | # split into multiple statements 42 | return split_statements(string) 43 | -------------------------------------------------------------------------------- /docs/dataduct.qa.rst: -------------------------------------------------------------------------------- 1 | dataduct.qa package 2 | =================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.qa.check module 8 | ------------------------ 9 | 10 | .. 
automodule:: dataduct.qa.check 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.qa.column_check module 16 | ------------------------------- 17 | 18 | .. automodule:: dataduct.qa.column_check 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.qa.count_check module 24 | ------------------------------ 25 | 26 | .. automodule:: dataduct.qa.count_check 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | dataduct.qa.primary_key_check module 32 | ------------------------------------ 33 | 34 | .. automodule:: dataduct.qa.primary_key_check 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | dataduct.qa.utils module 40 | ------------------------ 41 | 42 | .. automodule:: dataduct.qa.utils 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | 48 | Module contents 49 | --------------- 50 | 51 | .. automodule:: dataduct.qa 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 |
-------------------------------------------------------------------------------- /dataduct/database/sql/tests/test_sql_utils.py: --------------------------------------------------------------------------------
1 | """Tests the utils functions 2 | """ 3 | from unittest import TestCase 4 | from nose.tools import eq_ 5 | 6 | from ..utils import balanced_parenthesis 7 | from ..utils import sanitize_sql 8 | 9 | 10 | class TestSqlUtils(TestCase): 11 | """Tests for sql utils functions 12 | """ 13 | @staticmethod 14 | def test_balanced_parenthesis(): 15 | """Test for balanced_parenthesis 16 | """ 17 | eq_(balanced_parenthesis('SELECT 1;'), True) 18 | eq_(balanced_parenthesis('SELECT 1(;'), False) 19 | eq_(balanced_parenthesis('SELECT 1();'), True) 20 | eq_(balanced_parenthesis('SELECT 1(abcd);'), True) 21 | eq_(balanced_parenthesis('SELECT 1(ab[cd);'), True) 22 | eq_(balanced_parenthesis('SELECT 1(ab[cd));'), False) 23 | eq_(balanced_parenthesis('SELECT 1);'), False) 24 | eq_(balanced_parenthesis('SELECT 1(ab)(ab);'), True) 25 | eq_(balanced_parenthesis('SELECT 1(a(ab)b);'), True) 26 | 27 | @staticmethod 28 | def test_sanitize_sql(): 29 | """Test for sanitize_sql 30 | """ 31 | sql = "SELECT 1 if x='x;y'; SELECT 1 ;" 32 | eq_(sanitize_sql(sql), ["SELECT 1 if x='x;y'", 'SELECT 1']) 33 |
-------------------------------------------------------------------------------- /dataduct/s3/s3_log_path.py: --------------------------------------------------------------------------------
1 | """ 2 | Class for storing a S3 Log Path 3 | """ 4 | 5 | from os.path import join 6 | from .s3_path import S3Path 7 | 8 | 9 | class S3LogPath(S3Path): 10 | """S3 Log path for data pipeline 11 | S3LogPath only exists to correct the use of S3 URIs by Data 12 | Pipeline. In most cases, one should use a trailing slash to disambiguate 13 | prefixes. For instance, the former prefix includes the latter 14 | unless there is a trailing slash: 15 | 16 | :: 17 | s3://coursera-bucket/dev 18 | s3://coursera-bucket/dev_log_dir 19 | 20 | However, if one adds a trailing slash to the log s3 URI, Data Pipeline 21 | will add another slash before adding subdirectories. These 22 | double slashes break boto.
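For illustration only (this assumes the inherited S3Path constructor accepts ``bucket`` and ``key`` keyword arguments; see s3_path.py for the actual signature), the ``uri`` property below strips the trailing slash before the path is handed to Data Pipeline:

::
    S3LogPath(bucket='coursera-bucket', key='dev_log_dir/').uri
    # hypothetical call; would return 's3://coursera-bucket/dev_log_dir'
    # the trailing '/' is removed so Data Pipeline cannot produce '//'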
23 | """ 24 | def __init(self, **kwargs): 25 | """Constructor for S3LogPath 26 | """ 27 | super(S3LogPath, self).__init__(**kwargs) 28 | 29 | @property 30 | def uri(self): 31 | """Get the log directory path 32 | 33 | Returns: 34 | s3_uri(str): s3_log path without the trailing '/' 35 | """ 36 | if self.key is None: 37 | return None 38 | return join('s3://', self.bucket, self.key).rstrip('/') 39 | -------------------------------------------------------------------------------- /docs/dataduct.utils.rst: -------------------------------------------------------------------------------- 1 | dataduct.utils package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.utils.cli module 8 | ------------------------- 9 | 10 | .. automodule:: dataduct.utils.cli 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.utils.constants module 16 | ------------------------------- 17 | 18 | .. automodule:: dataduct.utils.constants 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.utils.exceptions module 24 | -------------------------------- 25 | 26 | .. automodule:: dataduct.utils.exceptions 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | dataduct.utils.helpers module 32 | ----------------------------- 33 | 34 | .. automodule:: dataduct.utils.helpers 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | dataduct.utils.slack_hook module 40 | -------------------------------- 41 | 42 | .. automodule:: dataduct.utils.slack_hook 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | 48 | Module contents 49 | --------------- 50 | 51 | .. automodule:: dataduct.utils 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | 5 | sudo: false 6 | 7 | addons: 8 | apt_packages: 9 | - graphviz 10 | # command to install dependencies 11 | install: 12 | - pip install coveralls 13 | - pip install -r requirements.txt 14 | 15 | # Setup config file 16 | before_script: 17 | - mkdir ~/.dataduct 18 | - |+ 19 | echo " 20 | etl: 21 | ROLE: DataPipelineDefaultRole 22 | RESOURCE_ROLE: DataPipelineDefaultResourceRole 23 | S3_ETL_BUCKET: FILL_ME_IN 24 | 25 | ec2: 26 | CORE_INSTANCE_TYPE: m1.large 27 | 28 | emr: 29 | CLUSTER_AMI: 2.4.7 30 | 31 | redshift: 32 | DATABASE_NAME: FILL_ME_IN 33 | CLUSTER_ID: FILL_ME_IN 34 | USERNAME: FILL_ME_IN 35 | PASSWORD: FILL_ME_IN 36 | 37 | postgres: 38 | DATABASE_NAME: FILL_ME_IN 39 | RDS_INSTANCE_ID: FILL_ME_IN 40 | USERNAME: FILL_ME_IN 41 | PASSWORD: FILL_ME_IN 42 | REGION: FILL_ME_IN 43 | 44 | mysql: 45 | DATABASE_KEY: 46 | HOST: FILL_ME_IN 47 | USERNAME: FILL_ME_IN 48 | PASSWORD: FILL_ME_IN" > ~/.dataduct/dataduct.cfg 49 | 50 | # Run tests 51 | script: nosetests --with-coverage --cover-package=. --cover-erase 52 | after_success: 53 | coveralls 54 | -------------------------------------------------------------------------------- /docs/dataduct.database.sql.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.sql package 2 | ============================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.database.sql.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | dataduct.database.sql.sql_script module 15 | --------------------------------------- 16 | 17 | .. 
automodule:: dataduct.database.sql.sql_script 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | dataduct.database.sql.sql_statement module 23 | ------------------------------------------ 24 | 25 | .. automodule:: dataduct.database.sql.sql_statement 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | dataduct.database.sql.transaction module 31 | ---------------------------------------- 32 | 33 | .. automodule:: dataduct.database.sql.transaction 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | dataduct.database.sql.utils module 39 | ---------------------------------- 40 | 41 | .. automodule:: dataduct.database.sql.utils 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: dataduct.database.sql 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /examples/resources/scripts/word_reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple reducer for the word count example""" 3 | 4 | from itertools import groupby 5 | from operator import itemgetter 6 | import sys 7 | 8 | def read_mapper_output(file, separator='\t'): 9 | """Reads the stdin line by line 10 | """ 11 | for line in file: 12 | yield line.rstrip().split(separator, 1) 13 | 14 | def main(separator='\t'): 15 | """Read the key value pairs and count the number of words 16 | Args: 17 | separator(str): Separator to be used between key and value 18 | """ 19 | 20 | # input comes from STDIN (standard input) 21 | data = read_mapper_output(sys.stdin, separator=separator) 22 | # groupby groups multiple word-count pairs by word, 23 | # and creates an iterator that returns consecutive keys and their group: 24 | # current_word - string containing a word (the key) 25 | for current_word, group in groupby(data, itemgetter(0)): 26 | try: 27 | total_count = sum(int(count) for current_word, count in group) 28 | print "%s%s%d" % (current_word, separator, total_count) 29 | except ValueError: 30 | # count was not a number, so silently discard this item 31 | pass 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /dataduct/steps/executors/primary_key_check.py: -------------------------------------------------------------------------------- 1 | """Script that checks for primary key violations on the input table 2 | """ 3 | 4 | import argparse 5 | import pandas.io.sql as pdsql 6 | from dataduct.data_access import redshift_connection 7 | from dataduct.database import SqlScript 8 | from dataduct.database import Table 9 | from dataduct.qa import PrimaryKeyCheck 10 | 11 | 12 | def primary_key_check(): 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument('--table', dest='table', required=True) 16 | parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) 17 | parser.add_argument('--test_name', dest='test_name', 18 | default="Check Primary Key") 19 | parser.add_argument('--log_to_s3', action='store_true', default=False) 20 | parser.add_argument('--path_suffix', dest='path_suffix', default=None) 21 | 22 | args = parser.parse_args() 23 | 24 | connection = redshift_connection() 25 | table = Table(SqlScript(args.table)) 26 | result = pdsql.read_sql(table.select_duplicates_script().sql(), connection) 27 | check = PrimaryKeyCheck(len(result), 
name=args.test_name, 28 | sns_topic_arn=args.sns_topic_arn) 29 | check.publish(args.log_to_s3, table=table.full_name, 30 | path_suffix=args.path_suffix) 31 | connection.close() 32 | -------------------------------------------------------------------------------- /docs/dataduct.config.rst: -------------------------------------------------------------------------------- 1 | dataduct.config package 2 | ======================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.config.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | dataduct.config.config module 15 | ----------------------------- 16 | 17 | .. automodule:: dataduct.config.config 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | dataduct.config.config_actions module 23 | ------------------------------------- 24 | 25 | .. automodule:: dataduct.config.config_actions 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | dataduct.config.constants module 31 | -------------------------------- 32 | 33 | .. automodule:: dataduct.config.constants 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | dataduct.config.credentials module 39 | ---------------------------------- 40 | 41 | .. automodule:: dataduct.config.credentials 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | dataduct.config.logger_config module 47 | ------------------------------------ 48 | 49 | .. automodule:: dataduct.config.logger_config 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | 55 | Module contents 56 | --------------- 57 | 58 | .. automodule:: dataduct.config 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | -------------------------------------------------------------------------------- /dataduct/data_access/open_shell.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ..utils.hook import hook 3 | 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | @hook('connect_to_redshift') 8 | def open_psql_shell(redshift_creds, **kwargs): 9 | command = [ 10 | "psql", 11 | "-h", redshift_creds["HOST"], 12 | "-p", str(redshift_creds["PORT"]), 13 | "-U", redshift_creds["USERNAME"], 14 | "-d", redshift_creds["DATABASE_NAME"], 15 | "-vPROMPT1=%[%033[0m%]" + redshift_creds["CLUSTER_ID"] + "%R%[%033[0m%]%# ", 16 | "-vPROMPT2=%[%033[0m%]" + redshift_creds["CLUSTER_ID"] + "%R%[%033[0m%]%# ", 17 | ] 18 | env = dict(os.environ) 19 | env['PGPASSWORD'] = redshift_creds["PASSWORD"] 20 | logger.info("Running command: {}".format(' '.join(command))) 21 | os.execvpe(command[0], command, env=env) 22 | 23 | 24 | @hook('connect_to_mysql') 25 | def open_mysql_shell(sql_creds, **kwargs): 26 | command = [ 27 | "mysql", 28 | "-h", sql_creds["HOST"], 29 | "-u", sql_creds["USERNAME"], 30 | "--default-character-set=utf8" 31 | ] 32 | if sql_creds.get("DATABASE"): 33 | command.extend(["-D", sql_creds["DATABASE"]]) 34 | 35 | env = dict(os.environ) 36 | env['MYSQL_PWD'] = sql_creds["PASSWORD"] 37 | logger.info("Running command: {}".format(' '.join(command))) 38 | os.execvpe(command[0], command, env=env) 39 | -------------------------------------------------------------------------------- /docs/dataduct.database.parsers.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.parsers.tests package 2 | ======================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.database.parsers.tests.test_create_table module 8 | 
-------------------------------------------------------- 9 | 10 | .. automodule:: dataduct.database.parsers.tests.test_create_table 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.database.parsers.tests.test_create_view module 16 | ------------------------------------------------------- 17 | 18 | .. automodule:: dataduct.database.parsers.tests.test_create_view 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.database.parsers.tests.test_select_query module 24 | -------------------------------------------------------- 25 | 26 | .. automodule:: dataduct.database.parsers.tests.test_select_query 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | dataduct.database.parsers.tests.test_transfrom module 32 | ----------------------------------------------------- 33 | 34 | .. automodule:: dataduct.database.parsers.tests.test_transfrom 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | 40 | Module contents 41 | --------------- 42 | 43 | .. automodule:: dataduct.database.parsers.tests 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | -------------------------------------------------------------------------------- /dataduct/database/parsers/helpers.py: -------------------------------------------------------------------------------- 1 | """SQL parser helpers 2 | """ 3 | from pyparsing import delimitedList 4 | from pyparsing import Optional 5 | from pyparsing import ParseResults 6 | 7 | from .utils import _db_name 8 | from .utils import _temp 9 | from .utils import _temporary 10 | from .utils import _if_not_exists 11 | from .utils import _or_replace 12 | 13 | # Functions 14 | isNotEmpty = lambda x: len(x) > 0 15 | 16 | temporary_check = Optional(_temp | _temporary).setParseAction(isNotEmpty) 17 | 18 | replace_check = Optional(_or_replace).setParseAction(isNotEmpty) 19 | 20 | existance_check = Optional(_if_not_exists).setParseAction(isNotEmpty) 21 | 22 | 23 | def paranthesis_list(output_name, input_var=_db_name): 24 | """Parser for a delimiedList enclosed in paranthesis 25 | """ 26 | return '(' + delimitedList(input_var).setResultsName(output_name) + ')' 27 | 28 | 29 | def exists(parser, output_name): 30 | """Get a parser that returns boolean on existance 31 | """ 32 | return parser.setParseAction(isNotEmpty).setResultsName(output_name) 33 | 34 | 35 | def to_dict(input): 36 | """Purge the ParseResults from output dictionary 37 | """ 38 | output = dict() 39 | for key, value in input.asDict().iteritems(): 40 | if isinstance(value, ParseResults): 41 | output[key] = value.asList() 42 | else: 43 | output[key] = value 44 | 45 | return output 46 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Dataduct |build-status| |coverage-status| 2 | ----------------------------------------- 3 | Dataduct is a wrapper built on top of AWS Datapipeline which makes it easy to 4 | create ETL jobs. All jobs can be specified as a series of steps in a YAML file 5 | and would automatically be translated into datapipeline with appropriate 6 | pipeline objects. 7 | 8 | **Documentation and Details** 9 | 10 | Documentation and more details can be found at http://dataduct.readthedocs.org/en/latest/ 11 | 12 | **License** 13 | 14 | Copyright [2014] [Coursera] 15 | 16 | Licensed under the Apache License, Version 2.0 (the "License"); 17 | you may not use this file except in compliance with the License. 
18 | You may obtain a copy of the License at 19 | 20 | http://www.apache.org/licenses/LICENSE-2.0 21 | 22 | Unless required by applicable law or agreed to in writing, software 23 | distributed under the License is distributed on an "AS IS" BASIS, 24 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 | See the License for the specific language governing permissions and 26 | limitations under the License. 27 | 28 | .. |build-status| 29 | image:: https://travis-ci.org/coursera/dataduct.svg?branch=develop 30 | :target: https://travis-ci.org/coursera/dataduct 31 | 32 | .. |coverage-status| 33 | image:: https://coveralls.io/repos/coursera/dataduct/badge.svg?branch=develop 34 | :target: https://coveralls.io/r/coursera/dataduct?branch=develop 35 | -------------------------------------------------------------------------------- /dataduct/pipeline/postgres_node.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for SqlNode 3 | """ 4 | 5 | from ..utils.exceptions import ETLInputError 6 | from .pipeline_object import PipelineObject 7 | from .schedule import Schedule 8 | 9 | 10 | class PostgresNode(PipelineObject): 11 | """SQL Data Node class 12 | """ 13 | 14 | def __init__(self, id, schedule, host, database, username, password, 15 | select_query, insert_query, table, depends_on=None): 16 | """Constructor for the SqlNode class 17 | 18 | Args: 19 | id(str): id of the object 20 | schedule(Schedule): pipeline schedule 21 | database(str): database name on the RDS host 22 | sql(str): sql to be executed 23 | table(str): table to be read 24 | """ 25 | 26 | # Validate inputs 27 | if not isinstance(schedule, Schedule): 28 | raise ETLInputError( 29 | 'Input schedule must be of the type Schedule') 30 | 31 | if not depends_on: 32 | depends_on = list() 33 | 34 | kwargs = { 35 | 'id': id, 36 | 'type': 'SqlDataNode', 37 | 'schedule': schedule, 38 | 'database': database, 39 | 'selectQuery': select_query, 40 | 'insertQuery': insert_query, 41 | 'table': table, 42 | 'dependsOn': depends_on, 43 | } 44 | super(PostgresNode, self).__init__(**kwargs) 45 | -------------------------------------------------------------------------------- /dataduct/database/parsers/create_view.py: -------------------------------------------------------------------------------- 1 | """Create SQL parser 2 | """ 3 | from pyparsing import Group 4 | from pyparsing import Optional 5 | from pyparsing import StringEnd 6 | from pyparsing import Word 7 | from pyparsing import ZeroOrMore 8 | from pyparsing import printables 9 | 10 | from .utils import _as 11 | from .utils import _create 12 | from .utils import _db_name 13 | from .utils import _view 14 | 15 | from .helpers import replace_check 16 | from .helpers import to_dict 17 | 18 | 19 | merge = lambda x: ' '.join(x[0]) 20 | 21 | 22 | def rreplace(s, old, new): 23 | li = s.rsplit(old, 1) 24 | return new.join(li) 25 | 26 | 27 | def parse_create_view(string): 28 | """Parse the create view sql query and return metadata 29 | 30 | Args: 31 | string(str): Input sql string that should be parsed 32 | 33 | Returns: 34 | view_data(dict): view_data dictionary for instantiating a view object 35 | """ 36 | 37 | string = rreplace(string, ')', ' )') 38 | 39 | end = Optional(')') + StringEnd() 40 | select = Group(ZeroOrMore(~end + Word(printables))) 41 | 42 | parser = _create + replace_check.setResultsName('replace') + _view 43 | parser += _db_name.setResultsName('view_name') + _as + Optional('(') 44 | parser += 
select.setParseAction(merge).setResultsName('select_statement') 45 | parser += end 46 | 47 | # Parse the base table definitions 48 | view_data = to_dict(parser.parseString(string)) 49 | 50 | return view_data 51 | -------------------------------------------------------------------------------- /dataduct/utils/constants.py: -------------------------------------------------------------------------------- 1 | """Constants shared across dataduct 2 | """ 3 | 4 | # Constants 5 | ZERO = 0 6 | ONE = 1 7 | NONE = None 8 | EMPTY_STR = '' 9 | NULL_STR = 'NULL' 10 | DEFAULT_DELAY = '10 Minutes' 11 | DEFAULT_TIMEOUT = '6 Hours' 12 | 13 | # ETL Constants 14 | EMR_CLUSTER_STR = 'emr' 15 | EC2_RESOURCE_STR = 'ec2' 16 | M1_LARGE = 'm1.large' 17 | 18 | LOG_STR = 'logs' 19 | DATA_STR = 'data' 20 | SRC_STR = 'src' 21 | QA_STR = 'qa' 22 | 23 | # Commands 24 | COMMAND_TEMPLATE = 'python -c "from {file} import {func}; {func}()" "$@"' 25 | 26 | COUNT_CHECK_COMMAND = COMMAND_TEMPLATE.format( 27 | file='dataduct.steps.executors.count_check', 28 | func='count_check') 29 | 30 | COLUMN_CHECK_COMMAND = COMMAND_TEMPLATE.format( 31 | file='dataduct.steps.executors.column_check', 32 | func='column_check') 33 | 34 | LOAD_COMMAND = COMMAND_TEMPLATE.format( 35 | file='dataduct.steps.executors.create_load_redshift', 36 | func='create_load_redshift_runner') 37 | 38 | PK_CHECK_COMMAND = COMMAND_TEMPLATE.format( 39 | file='dataduct.steps.executors.primary_key_check', 40 | func='primary_key_check') 41 | 42 | DEPENDENCY_COMMAND = COMMAND_TEMPLATE.format( 43 | file='dataduct.steps.executors.dependency_check', 44 | func='dependency_check') 45 | 46 | SCRIPT_RUNNER_COMMAND = COMMAND_TEMPLATE.format( 47 | file='dataduct.steps.executors.runner', func='script_runner') 48 | 49 | SQL_RUNNER_COMMAND = COMMAND_TEMPLATE.format( 50 | file='dataduct.steps.executors.runner', func='sql_runner') 51 | -------------------------------------------------------------------------------- /dataduct/steps/delta_load.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for delta loading a table based on a date column 2 | """ 3 | from ..database import SqlScript 4 | from ..database import Table 5 | from ..utils.helpers import parse_path 6 | from .upsert import UpsertStep 7 | 8 | 9 | class DeltaLoadStep(UpsertStep): 10 | """DeltaLoadStep Step class that creates the table if needed and loads data 11 | """ 12 | 13 | def __init__(self, destination, date_column, window=0, **kwargs): 14 | """Constructor for the DeltaLoadStep class 15 | 16 | Args: 17 | date_column(string): name of column (of type date) to use as the 18 | delta value (i.e., only load the last X days) 19 | window(int): number of days before last loaded day to update 20 | **kwargs(optional): Keyword arguments directly passed to base class 21 | """ 22 | dest = Table(SqlScript(filename=parse_path(destination))) 23 | delta_clause = """ 24 | WHERE {date_column} >= 25 | COALESCE( 26 | (SELECT MAX({date_column}) FROM {destination}), 27 | '1800-01-01'::DATE 28 | ) - {window} 29 | """.format(date_column=date_column, 30 | destination=dest.full_name, 31 | window=window) 32 | super(DeltaLoadStep, self).__init__(destination=destination, 33 | filter_clause=delta_clause, 34 | **kwargs) 35 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
dataduct documentation master file, created by 2 | sphinx-quickstart on Mon Nov 10 17:50:14 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Dataduct 7 | ======== 8 | 9 | Dataduct - DataPipeline for humans 10 | 11 | `Dataduct `__ is a wrapper built 12 | on top of `AWS 13 | Datapipeline `__ 14 | which makes it easy to create ETL jobs. All jobs can be specified as a 15 | series of steps in a YAML file and would automatically be translated 16 | into datapipeline with appropriate pipeline objects. 17 | 18 | Features include: 19 | 20 | - Visualizing pipeline activities 21 | - Extracting data from different sources such as RDS, S3, local files 22 | - Transforming data using EC2 and EMR 23 | - Loading data into redshift 24 | - Transforming data inside redshift 25 | - QA data between the source system and warehouse 26 | 27 | It is easy to create custom steps to augment the DSL as per the 28 | requirements. As well as running a backfill with the command line 29 | interface. 30 | 31 | 32 | Contents: 33 | 34 | .. toctree:: 35 | :maxdepth: 2 36 | 37 | introduction 38 | installation 39 | commands 40 | config 41 | creating_an_etl 42 | steps 43 | input_output 44 | hooks 45 | dataduct 46 | 47 | Indices and tables 48 | ================== 49 | 50 | * :ref:`genindex` 51 | * :ref:`modindex` 52 | * :ref:`search` 53 | 54 | -------------------------------------------------------------------------------- /dataduct/pipeline/default_object.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for default metadata 3 | """ 4 | 5 | from ..config import Config 6 | from ..utils import constants as const 7 | from .pipeline_object import PipelineObject 8 | 9 | config = Config() 10 | ROLE = config.etl['ROLE'] 11 | RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] 12 | MAX_ACTIVE_INSTANCES = config.etl.get('MAX_ACTIVE_INSTANCES', const.ONE) 13 | 14 | 15 | class DefaultObject(PipelineObject): 16 | """Default object added to all pipelines 17 | """ 18 | 19 | def __init__(self, id, pipeline_log_uri, sns=None, scheduleType='cron', 20 | failureAndRerunMode='CASCADE', **kwargs): 21 | """Constructor for the DefaultObject class 22 | 23 | Args: 24 | id(str): must be 'Default' for this class 25 | sns(sns): notify on failure 26 | scheduleType(str): frequency type for the pipeline 27 | failureAndRerunMode(str): aws input argument for failure mode 28 | **kwargs(optional): Keyword arguments directly passed to base class 29 | 30 | Note: 31 | id must be Default for this object 32 | """ 33 | 34 | super(DefaultObject, self).__init__( 35 | id='Default', # This should always have the default id 36 | scheduleType=scheduleType, 37 | failureAndRerunMode=failureAndRerunMode, 38 | role=ROLE, 39 | resourceRole=RESOURCE_ROLE, 40 | maxActiveInstances=MAX_ACTIVE_INSTANCES, 41 | pipelineLogUri=pipeline_log_uri, 42 | onFail=sns 43 | ) 44 | -------------------------------------------------------------------------------- /dataduct/steps/extract_local.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for creating an S3 node for input from local files 3 | """ 4 | from ..s3 import S3File 5 | from ..utils.exceptions import ETLInputError 6 | from .etl_step import ETLStep 7 | 8 | 9 | class ExtractLocalStep(ETLStep): 10 | """ExtractLocal Step class that helps get data from a local file 11 | """ 12 | 13 | def __init__(self, path, output_path=None, **kwargs): 14 
| """Constructor for the ExtractLocalStep class 15 | 16 | Args: 17 | path(str): local path for data 18 | **kwargs(optional): Keyword arguments directly passed to base class 19 | """ 20 | super(ExtractLocalStep, self).__init__(**kwargs) 21 | self._output = self.create_s3_data_node( 22 | S3File(path=path, s3_path=self.get_output_s3_path(output_path))) 23 | 24 | @classmethod 25 | def arguments_processor(cls, etl, input_args): 26 | """Parse the step arguments according to the ETL pipeline 27 | 28 | Args: 29 | etl(ETLPipeline): Pipeline object containing resources and steps 30 | step_args(dict): Dictionary of the step arguments for the class 31 | """ 32 | input_args = cls.pop_inputs(input_args) 33 | step_args = cls.base_arguments_processor(etl, input_args) 34 | 35 | step_args.pop('resource', None) 36 | step_args.pop('worker_group', None) 37 | if etl.frequency != 'one-time': 38 | raise ETLInputError( 39 | 'Extract Local can be used for one-time pipelines only') 40 | 41 | return step_args 42 | -------------------------------------------------------------------------------- /dataduct/pipeline/postgres_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for Rds database 3 | """ 4 | 5 | from ..config import Config 6 | from .pipeline_object import PipelineObject 7 | from ..utils.exceptions import ETLConfigError 8 | 9 | config = Config() 10 | 11 | if not hasattr(config, 'postgres'): 12 | raise ETLConfigError('Postgres credentials missing from config') 13 | 14 | REGION = config.postgres['REGION'] 15 | RDS_INSTANCE_ID = config.postgres['RDS_INSTANCE_ID'] 16 | USERNAME = config.postgres['USERNAME'] 17 | PASSWORD = config.postgres['PASSWORD'] 18 | 19 | 20 | class PostgresDatabase(PipelineObject): 21 | """Postgres resource class 22 | """ 23 | 24 | def __init__(self, 25 | id, 26 | region=REGION, 27 | rds_instance_id=RDS_INSTANCE_ID, 28 | username=USERNAME, 29 | password=PASSWORD): 30 | """Constructor for the Postgres class 31 | 32 | Args: 33 | id(str): id of the object 34 | region(str): code for the region where the database exists 35 | rds_instance_id(str): identifier of the DB instance 36 | username(str): username for the database 37 | password(str): password for the database 38 | """ 39 | 40 | kwargs = { 41 | 'id': id, 42 | 'type': 'RdsDatabase', 43 | 'region': region, 44 | 'rdsInstanceId': rds_instance_id, 45 | 'username': username, 46 | '*password': password, 47 | } 48 | super(PostgresDatabase, self).__init__(**kwargs) 49 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | CHANGES.md 3 | CONTRIBUTING.md 4 | LICENSE.md 5 | README.rst 6 | setup.py 7 | bin/README.md 8 | dataduct/__init__.py 9 | dataduct/definition_parser.py 10 | dataduct/etl_pipeline.py 11 | dataduct/pipeline/__init__.py 12 | dataduct/pipeline/activity.py 13 | dataduct/pipeline/copy_activity.py 14 | dataduct/pipeline/data_pipeline.py 15 | dataduct/pipeline/default_object.py 16 | dataduct/pipeline/ec2_resource.py 17 | dataduct/pipeline/emr_activity.py 18 | dataduct/pipeline/emr_resource.py 19 | dataduct/pipeline/mysql_node.py 20 | dataduct/pipeline/pipeline_object.py 21 | dataduct/pipeline/precondition.py 22 | dataduct/pipeline/redshift_copy_activity.py 23 | dataduct/pipeline/redshift_database.py 24 | dataduct/pipeline/redshift_node.py 25 | dataduct/pipeline/s3_node.py 26 | 
dataduct/pipeline/schedule.py 27 | dataduct/pipeline/shell_command_activity.py 28 | dataduct/pipeline/sns_alarm.py 29 | dataduct/pipeline/sql_activity.py 30 | dataduct/pipeline/utils.py 31 | dataduct/s3/__init__.py 32 | dataduct/s3/s3_directory.py 33 | dataduct/s3/s3_file.py 34 | dataduct/s3/s3_log_path.py 35 | dataduct/s3/s3_path.py 36 | dataduct/s3/utils.py 37 | dataduct/steps/__init__.py 38 | dataduct/steps/emr_streaming.py 39 | dataduct/steps/etl_step.py 40 | dataduct/steps/extract_local.py 41 | dataduct/steps/extract_rds.py 42 | dataduct/steps/extract_redshift.py 43 | dataduct/steps/extract_s3.py 44 | dataduct/steps/load_redshift.py 45 | dataduct/steps/sql_command.py 46 | dataduct/steps/transform.py 47 | dataduct/utils/__init__.py 48 | dataduct/utils/exceptions.py 49 | dataduct/utils/helpers.py 50 | scripts/README.md 51 | -------------------------------------------------------------------------------- /dataduct/s3/s3_directory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for storing a S3 File 3 | """ 4 | from .s3_path import S3Path 5 | from .utils import upload_dir_to_s3 6 | from ..utils.helpers import parse_path 7 | from ..utils.exceptions import ETLInputError 8 | 9 | 10 | class S3Directory(object): 11 | """S3 Directory object helps operate with a directory on S3 12 | 13 | The S3Directory acts much like the S3File. 14 | It represents a directory. Tries to unify the concept of a directory 15 | stored locally with one stored in S3. 16 | 17 | """ 18 | def __init__(self, path=None, s3_path=None): 19 | """Constructor for the S3 File object 20 | 21 | Args: 22 | path (str): Local path to file 23 | s3_path (S3Path, optional): s3_path of the file 24 | 25 | """ 26 | self.path = parse_path(path) 27 | self._s3_path = s3_path 28 | 29 | @property 30 | def s3_path(self): 31 | """Outputs the s3_path 32 | """ 33 | return self._s3_path 34 | 35 | @s3_path.setter 36 | def s3_path(self, value): 37 | """Set the S3 path for the file 38 | 39 | Args: 40 | value(S3Path): s3path of the directory 41 | """ 42 | if not isinstance(value, S3Path): 43 | raise ETLInputError('Input path should be of type S3Path') 44 | 45 | if not value.is_directory: 46 | raise ETLInputError('S3 path must be directory') 47 | self._s3_path = value 48 | 49 | def upload_to_s3(self): 50 | """Uploads the directory to the s3 directory 51 | """ 52 | upload_dir_to_s3(self._s3_path, self.path) 53 | -------------------------------------------------------------------------------- /dataduct/steps/primary_key_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for PK check step can be executed on Ec2 resource 3 | """ 4 | from ..config import Config 5 | from ..database import SqlStatement 6 | from ..database import Table 7 | from ..utils import constants as const 8 | from ..utils.helpers import parse_path 9 | from .qa_transform import QATransformStep 10 | 11 | config = Config() 12 | 13 | 14 | class PrimaryKeyCheckStep(QATransformStep): 15 | """PrimaryKeyCheckStep class that checks a table for PK violations 16 | """ 17 | 18 | def __init__(self, id, table_definition, script_arguments=None, 19 | log_to_s3=False, command=None, script=None, **kwargs): 20 | """Constructor for the PrimaryKeyCheckStep class 21 | 22 | Args: 23 | table_definition(file): table definition for the table to check 24 | **kwargs(optional): Keyword arguments directly passed to base class 25 | """ 26 | with open(parse_path(table_definition)) as f: 27 | 
table_def_string = f.read() 28 | 29 | if script_arguments is None: 30 | script_arguments = list() 31 | 32 | # We initialize the table object to check valid strings 33 | script_arguments.append( 34 | '--table=%s' % Table(SqlStatement(table_def_string)).sql()) 35 | 36 | if log_to_s3: 37 | script_arguments.append('--log_to_s3') 38 | 39 | if script is None and command is None: 40 | command = const.PK_CHECK_COMMAND 41 | 42 | super(PrimaryKeyCheckStep, self).__init__( 43 | id=id, command=command, script=script, 44 | script_arguments=script_arguments, **kwargs) 45 | -------------------------------------------------------------------------------- /dataduct/config/tests/test_credentials.py: -------------------------------------------------------------------------------- 1 | """Tests for credentials file 2 | """ 3 | from mock import patch 4 | from nose.tools import eq_ 5 | import json 6 | 7 | from ..credentials import get_aws_credentials_from_iam 8 | 9 | @patch('requests.get') 10 | def test_get_aws_credentials_from_iam(patched_requests_get): 11 | """Test for get credentials from IAM 12 | """ 13 | class MockedReturn: 14 | """Mock request response 15 | """ 16 | def __init__(self, content): 17 | self.content = content 18 | self.ok = True 19 | 20 | def json(self): 21 | """Returns a json for the content 22 | """ 23 | return json.loads(self.content) 24 | 25 | def server_response(url): 26 | """Mocked server responses 27 | """ 28 | if url == 'http://169.254.169.254/latest/meta-data/iam/security-credentials/': # NOQA 29 | return MockedReturn("role") 30 | if url == 'http://169.254.169.254/latest/meta-data/iam/security-credentials/role': # NOQA 31 | return MockedReturn(""" 32 | { 33 | "Code" : "Success", 34 | "LastUpdated" : "2012-04-26T16:39:16Z", 35 | "Type" : "AWS-HMAC", 36 | "AccessKeyId" : "access_id", 37 | "SecretAccessKey" : "secret_key", 38 | "Token" : "token", 39 | "Expiration" : "2012-04-27T22:39:16Z" 40 | } 41 | """) 42 | 43 | patched_requests_get.side_effect = server_response 44 | access_id, secret_key, token = get_aws_credentials_from_iam() 45 | eq_(access_id, 'access_id') 46 | eq_(secret_key, 'secret_key') 47 | eq_(token, 'token') 48 | -------------------------------------------------------------------------------- /docs/dataduct.database.parsers.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.parsers package 2 | ================================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.database.parsers.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | dataduct.database.parsers.create_table module 15 | --------------------------------------------- 16 | 17 | .. automodule:: dataduct.database.parsers.create_table 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | dataduct.database.parsers.create_view module 23 | -------------------------------------------- 24 | 25 | .. automodule:: dataduct.database.parsers.create_view 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | dataduct.database.parsers.helpers module 31 | ---------------------------------------- 32 | 33 | .. automodule:: dataduct.database.parsers.helpers 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | dataduct.database.parsers.select_query module 39 | --------------------------------------------- 40 | 41 | .. 
automodule:: dataduct.database.parsers.select_query 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | dataduct.database.parsers.transform module 47 | ------------------------------------------ 48 | 49 | .. automodule:: dataduct.database.parsers.transform 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | dataduct.database.parsers.utils module 55 | -------------------------------------- 56 | 57 | .. automodule:: dataduct.database.parsers.utils 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | 63 | Module contents 64 | --------------- 65 | 66 | .. automodule:: dataduct.database.parsers 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /dataduct/steps/emr_job.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for EmrActivity can be executed on EMR Cluster 3 | """ 4 | from .etl_step import ETLStep 5 | from ..pipeline import EmrActivity 6 | from ..utils import constants as const 7 | 8 | 9 | class EMRJobStep(ETLStep): 10 | """EMR Step class that helps run a step on the emr cluster 11 | """ 12 | 13 | def __init__(self, 14 | step_string, 15 | **kwargs): 16 | """Constructor for the EMRJobStep class 17 | 18 | Args: 19 | step_string(str): Step string for the emr job to be executed 20 | **kwargs(optional): Keyword arguments directly passed to base class 21 | 22 | Note: 23 | In the step_string all comma within arguments should be escaped 24 | using 4 backslashes 25 | """ 26 | super(EMRJobStep, self).__init__(**kwargs) 27 | 28 | self.activity = self.create_pipeline_object( 29 | object_class=EmrActivity, 30 | resource=self.resource, 31 | worker_group=self.worker_group, 32 | input_node=self.input, 33 | schedule=self.schedule, 34 | emr_step_string=step_string, 35 | output_node=self.output, 36 | depends_on=self.depends_on, 37 | max_retries=self.max_retries 38 | ) 39 | 40 | @classmethod 41 | def arguments_processor(cls, etl, input_args): 42 | """Parse the step arguments according to the ETL pipeline 43 | 44 | Args: 45 | etl(ETLPipeline): Pipeline object containing resources and steps 46 | step_args(dict): Dictionary of the step arguments for the class 47 | """ 48 | step_args = cls.base_arguments_processor( 49 | etl, input_args, resource_type=const.EMR_CLUSTER_STR) 50 | 51 | return step_args 52 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============= 3 | 4 | `Dataduct `__ is a wrapper built 5 | on top of `AWS 6 | Datapipeline `__ 7 | which makes it easy to create ETL jobs. All jobs can be specified as a 8 | series of steps in a YAML file and would automatically be translated 9 | into datapipeline with appropriate pipeline objects. 10 | 11 | Features include: 12 | 13 | - Visualizing pipeline activities 14 | - Extracting data from different sources such as RDS, S3, local files 15 | - Transforming data using EC2 and EMR 16 | - Loading data into redshift 17 | - Transforming data inside redshift 18 | - QA data between the source system and warehouse 19 | It is easy to create custom steps to augment the DSL as per the 20 | requirements. As well as running a backfill with the command line 21 | interface. 22 | 23 | An example ETL from RDS would look like: 24 | 25 | .. 
code:: YAML 26 | 27 | name: example_upsert 28 | frequency: daily 29 | load_time: 01:00 # Hour:Min in UTC 30 | 31 | steps: 32 | - step_type: extract-rds 33 | host_name: test_host 34 | database: test_database 35 | sql: | 36 | SELECT * 37 | FROM test_table; 38 | 39 | - step_type: create-load-redshift 40 | table_definition: tables/dev.test_table.sql 41 | 42 | - step_type: upsert 43 | source: tables/dev.test_table.sql 44 | destination: tables/dev.test_table_2.sql 45 | 46 | This would first perform an extraction from the RDS database with the 47 | ``extract-rds`` step using the ``COPY ACTIVITY``. Then load the data 48 | into the ``dev.test_table`` in redshift with the 49 | ``create-load-redshift``. Then perform an ``upsert`` with the data into 50 | the ``test_table_2``. 51 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup file for installation of the dataduct code 3 | """ 4 | from setuptools import find_packages 5 | from setuptools import setup 6 | 7 | from dataduct import __version__ as version 8 | 9 | setup( 10 | name='dataduct', 11 | version=version, 12 | author='Coursera Inc.', 13 | packages=find_packages( 14 | exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 15 | namespace_packages=['dataduct'], 16 | include_package_data=True, 17 | url='https://github.com/coursera/dataduct', 18 | long_description=open('README.rst').read(), 19 | author_email='data-infra@coursera.org', 20 | license='Apache License 2.0', 21 | description='DataPipeline for Humans', 22 | install_requires=[ 23 | 'boto>=2.38', 24 | 'MySQL-python>=1.2.3', 25 | 'matplotlib==1.5.3', 26 | 'pandas==0.18.1', 27 | 'psycopg2==2.6.0', 28 | 'pyparsing>=1.5.6', 29 | 'pytimeparse>=1.1.4', 30 | 'PyYAML>=3.11', 31 | 'testfixtures>=4.1.2', 32 | 'pyprind' 33 | ], 34 | scripts=['bin/dataduct'], 35 | classifiers=[ 36 | 'Development Status :: 5 - Production/Stable', 37 | 'Intended Audience :: Developers', 38 | 'License :: OSI Approved :: Apache Software License', 39 | 'Natural Language :: English', 40 | 'Operating System :: MacOS', 41 | 'Operating System :: MacOS :: MacOS 9', 42 | 'Operating System :: MacOS :: MacOS X', 43 | 'Operating System :: Unix', 44 | 'Programming Language :: Python :: 2.7', 45 | 'Programming Language :: Unix Shell', 46 | 'Topic :: Database', 47 | 'Topic :: Scientific/Engineering', 48 | 'Topic :: Scientific/Engineering :: Information Analysis', 49 | 'Topic :: Scientific/Engineering :: Visualization', 50 | 'Topic :: Utilities', 51 | ], 52 | ) 53 | -------------------------------------------------------------------------------- /docs/dataduct.database.rst: -------------------------------------------------------------------------------- 1 | dataduct.database package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.database.parsers 10 | dataduct.database.sql 11 | dataduct.database.tests 12 | 13 | Submodules 14 | ---------- 15 | 16 | dataduct.database.column module 17 | ------------------------------- 18 | 19 | .. automodule:: dataduct.database.column 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | dataduct.database.database module 25 | --------------------------------- 26 | 27 | .. automodule:: dataduct.database.database 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | dataduct.database.history_table module 33 | -------------------------------------- 34 | 35 | .. 
automodule:: dataduct.database.history_table 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | 40 | dataduct.database.relation module 41 | --------------------------------- 42 | 43 | .. automodule:: dataduct.database.relation 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | 48 | dataduct.database.select_statement module 49 | ----------------------------------------- 50 | 51 | .. automodule:: dataduct.database.select_statement 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | 56 | dataduct.database.table module 57 | ------------------------------ 58 | 59 | .. automodule:: dataduct.database.table 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | 64 | dataduct.database.view module 65 | ----------------------------- 66 | 67 | .. automodule:: dataduct.database.view 68 | :members: 69 | :undoc-members: 70 | :show-inheritance: 71 | 72 | 73 | Module contents 74 | --------------- 75 | 76 | .. automodule:: dataduct.database 77 | :members: 78 | :undoc-members: 79 | :show-inheritance: 80 | -------------------------------------------------------------------------------- /dataduct/pipeline/redshift_node.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for RedshiftNode 3 | """ 4 | 5 | from .pipeline_object import PipelineObject 6 | from .schedule import Schedule 7 | from ..utils.exceptions import ETLInputError 8 | 9 | 10 | class RedshiftNode(PipelineObject): 11 | """Redshift Data Node class 12 | """ 13 | 14 | def __init__(self, 15 | id, 16 | schedule, 17 | redshift_database, 18 | schema_name, 19 | table_name): 20 | """Constructor for the RedshiftNode class 21 | 22 | Args: 23 | id(str): id of the object 24 | schedule(Schedule): pipeline schedule 25 | redshift_database(RedshiftDatabase): database for the node 26 | schema_name(str): schema for node to extract or load data 27 | table_name(str): table for node to extract or load data 28 | """ 29 | 30 | # Validate inputs 31 | if not isinstance(schedule, Schedule): 32 | raise ETLInputError( 33 | 'Input schedule must be of the type Schedule') 34 | 35 | super(RedshiftNode, self).__init__( 36 | id=id, 37 | type='RedshiftDataNode', 38 | schedule=schedule, 39 | database=redshift_database, 40 | schemaName=schema_name, 41 | tableName=table_name, 42 | ) 43 | 44 | @property 45 | def schema(self): 46 | """Get the schema name for the redshift node 47 | 48 | Returns: 49 | result(str): schema name for this redshift node 50 | """ 51 | return self['schemaName'] 52 | 53 | @property 54 | def table(self): 55 | """Get the table name for the redshift node 56 | 57 | Returns: 58 | result(str): table name for this redshift node 59 | """ 60 | return self['tableName'] 61 | -------------------------------------------------------------------------------- /dataduct/pipeline/sns_alarm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for sns 3 | """ 4 | 5 | from ..config import Config 6 | from ..utils import constants as const 7 | from .pipeline_object import PipelineObject 8 | 9 | config = Config() 10 | SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) 11 | ROLE = config.etl['ROLE'] 12 | 13 | 14 | class SNSAlarm(PipelineObject): 15 | """SNS object added to all pipelines 16 | """ 17 | 18 | def __init__(self, 19 | id, 20 | pipeline_name=None, 21 | failure_message=None, 22 | topic_arn=None, 23 | **kwargs): 24 | """Constructor for the SNSAlarm class 25 | 26 | Args: 27 | 
id(str): id of the object 28 | pipeline_name(str): name of the pipeline, used in the alert subject and message 29 | failure_message(str): Message used in SNS on pipeline failures 30 | topic_arn(str): arn of the SNS topic to notify, defaults to SNS_TOPIC_ARN_FAILURE from the config 31 | **kwargs(optional): Keyword arguments directly passed to base class 32 | """ 33 | 34 | if not pipeline_name: 35 | pipeline_name = "None" 36 | 37 | if not failure_message: 38 | failure_message = '\n'.join([ 39 | 'Identifier: ' + pipeline_name, 40 | 'Object: #{node.name}', 41 | 'Object Scheduled Start Time: #{node.@scheduledStartTime}', 42 | 'Error Message: #{node.errorMessage}', 43 | 'Error Stack Trace: #{node.errorStackTrace}' 44 | ]) 45 | 46 | subject = 'Data Pipeline %s failed' % pipeline_name 47 | 48 | if topic_arn is None: 49 | topic_arn = SNS_TOPIC_ARN_FAILURE 50 | 51 | super(SNSAlarm, self).__init__( 52 | id=id, 53 | type='SnsAlarm', 54 | topicArn=topic_arn, 55 | role=ROLE, 56 | subject=subject, 57 | message=failure_message, 58 | ) 59 |
-------------------------------------------------------------------------------- /dataduct/config/logger_config.py: --------------------------------------------------------------------------------
1 | """Script that has the base logger configurations 2 | """ 3 | import os 4 | import logging 5 | from logging.handlers import RotatingFileHandler 6 | 7 | from .config import Config 8 | from .constants import CONFIG_DIR 9 | from .constants import LOG_FILE 10 | 11 | FILE_FORMAT_STR = '%(asctime)s [%(levelname)s]: %(message)s ' + \ 12 | '[in %(name)s:%(lineno)d in %(funcName)s]' 13 | CONSOLE_FORMAT_STR = '[%(levelname)s]: %(message)s' 14 | 15 | 16 | def logger_configuration(): 17 | """Set the logger configurations for dataduct 18 | """ 19 | config = Config() 20 | 21 | if not hasattr(config, 'logging'): 22 | raise Exception('logging section is missing in config') 23 | 24 | log_directory = os.path.expanduser(config.logging.get( 25 | 'LOG_DIR', os.path.join('~', CONFIG_DIR))) 26 | file_name = config.logging.get( 27 | 'LOG_FILE', LOG_FILE) 28 | 29 | console_level = config.logging.get( 30 | 'CONSOLE_DEBUG_LEVEL', logging.INFO) 31 | file_level = config.logging.get( 32 | 'FILE_DEBUG_LEVEL', logging.DEBUG) 33 | 34 | if not os.path.exists(log_directory): 35 | os.mkdir(log_directory) 36 | 37 | logger = logging.getLogger() 38 | logger.setLevel(logging.DEBUG) 39 | 40 | file_handler = RotatingFileHandler(os.path.join(log_directory, file_name), 41 | maxBytes=200000, 42 | backupCount=10) 43 | file_handler.setLevel(file_level) 44 | file_handler.setFormatter(logging.Formatter(FILE_FORMAT_STR, 45 | datefmt='%Y-%m-%d %H:%M')) 46 | 47 | console_handler = logging.StreamHandler() 48 | console_handler.setLevel(console_level) 49 | console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT_STR)) 50 | 51 | logger.addHandler(console_handler) 52 | logger.addHandler(file_handler) 53 |
-------------------------------------------------------------------------------- /dataduct/steps/extract_s3.py: --------------------------------------------------------------------------------
1 | """ 2 | ETL step wrapper for creating an S3 node for input 3 | """ 4 | from ..s3 import S3Path 5 | from ..utils.exceptions import ETLInputError 6 | from ..utils.helpers import exactly_one 7 | from ..utils.helpers import get_modified_s3_path 8 | from .etl_step import ETLStep 9 | 10 | 11 | class ExtractS3Step(ETLStep): 12 | """ExtractS3 Step class that helps get data from S3 13 | """ 14 | 15 | def __init__(self, directory_uri=None, file_uri=None, **kwargs): 16 | """Constructor for the ExtractS3Step class 17 | 18 | Args: 19 | directory_uri(str): s3 path for s3
data directory 20 | file_uri(str): s3 path for s3 data file 21 | **kwargs(optional): Keyword arguments directly passed to base class 22 | """ 23 | if not exactly_one(directory_uri, file_uri): 24 | raise ETLInputError('One of file_uri or directory_uri needed') 25 | 26 | super(ExtractS3Step, self).__init__(**kwargs) 27 | 28 | if directory_uri: 29 | directory_uri = get_modified_s3_path(directory_uri) 30 | s3_path = S3Path(uri=directory_uri, is_directory=True) 31 | else: 32 | file_uri = get_modified_s3_path(file_uri) 33 | s3_path = S3Path(uri=file_uri) 34 | self._output = self.create_s3_data_node(s3_path) 35 | 36 | @classmethod 37 | def arguments_processor(cls, etl, input_args): 38 | """Parse the step arguments according to the ETL pipeline 39 | 40 | Args: 41 | etl(ETLPipeline): Pipeline object containing resources and steps 42 | step_args(dict): Dictionary of the step arguments for the class 43 | """ 44 | input_args = cls.pop_inputs(input_args) 45 | step_args = cls.base_arguments_processor(etl, input_args) 46 | step_args.pop('resource', None) 47 | step_args.pop('worker_group', None) 48 | 49 | return step_args 50 | -------------------------------------------------------------------------------- /dataduct/steps/upsert.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for Upsert SQL script 2 | """ 3 | from ..database import HistoryTable 4 | from ..database import SelectStatement 5 | from ..database import SqlScript 6 | from ..database import Table 7 | from ..utils.helpers import exactly_one 8 | from ..utils.helpers import parse_path 9 | from .create_update_sql import CreateUpdateSqlStep 10 | 11 | 12 | class UpsertStep(CreateUpdateSqlStep): 13 | """Upsert Step class that helps run a step on the emr cluster 14 | """ 15 | 16 | def __init__(self, destination, sql=None, script=None, source=None, 17 | enforce_primary_key=True, delete_existing=False, history=None, 18 | analyze_table=True, filter_clause=None, **kwargs): 19 | """Constructor for the UpsertStep class 20 | 21 | Args: 22 | **kwargs(optional): Keyword arguments directly passed to base class 23 | """ 24 | self.s3_source_dir = kwargs['s3_source_dir'] 25 | assert exactly_one(sql, source, script), 'One of sql/source/script' 26 | 27 | # Input formatting 28 | dest = Table(SqlScript(filename=parse_path(destination))) 29 | 30 | if source is not None: 31 | source_relation = Table(SqlScript(filename=parse_path(source))) 32 | else: 33 | source_relation = SelectStatement( 34 | SqlScript(sql=sql, filename=parse_path(script)).sql()) 35 | 36 | # Create the destination table if doesn't exist 37 | sql_script = dest.upsert_script(source_relation, enforce_primary_key, 38 | delete_existing, filter_clause) 39 | 40 | if history: 41 | hist = HistoryTable(SqlScript( 42 | filename=parse_path(history))) 43 | sql_script.append(hist.update_history_script(dest)) 44 | 45 | super(UpsertStep, self).__init__( 46 | table_definition=destination, command=sql_script.sql(), 47 | analyze_table=analyze_table, **kwargs) 48 | -------------------------------------------------------------------------------- /dataduct/steps/qa_transform.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for QA step can be executed on Ec2 resource 3 | """ 4 | from .transform import TransformStep 5 | from ..config import Config 6 | 7 | config = Config() 8 | 9 | 10 | class QATransformStep(TransformStep): 11 | """QATransform Step class that helps run scripts on resouces for QA checks 12 
| """ 13 | 14 | def __init__(self, 15 | id, 16 | pipeline_name, 17 | script_arguments=None, 18 | sns_topic_arn=None, 19 | **kwargs): 20 | """Constructor for the QATransformStep class 21 | 22 | Args: 23 | sns_arn(str): sns topic arn for QA steps 24 | script_arguments(list of str): list of arguments to the script 25 | **kwargs(optional): Keyword arguments directly passed to base class 26 | """ 27 | 28 | if sns_topic_arn is None: 29 | sns_topic_arn = config.etl.get('SNS_TOPIC_ARN_WARNING', None) 30 | 31 | if script_arguments is None: 32 | script_arguments = list() 33 | 34 | script_arguments.append('--test_name=%s' % (pipeline_name + "." + id)) 35 | if sns_topic_arn: 36 | script_arguments.append('--sns_topic_arn=%s' % sns_topic_arn) 37 | 38 | super(QATransformStep, self).__init__( 39 | id=id, 40 | script_arguments=script_arguments, 41 | no_output=True, 42 | **kwargs) 43 | 44 | @classmethod 45 | def arguments_processor(cls, etl, input_args): 46 | """Parse the step arguments according to the ETL pipeline 47 | 48 | Args: 49 | etl(ETLPipeline): Pipeline object containing resources and steps 50 | step_args(dict): Dictionary of the step arguments for the class 51 | """ 52 | input_args = cls.pop_inputs(input_args) 53 | step_args = cls.base_arguments_processor(etl, input_args) 54 | step_args['pipeline_name'] = etl.name 55 | 56 | return step_args 57 | -------------------------------------------------------------------------------- /dataduct/database/view.py: -------------------------------------------------------------------------------- 1 | """Script containing the view class object 2 | """ 3 | from .parsers import parse_create_view 4 | from .sql import SqlScript 5 | from .select_statement import SelectStatement 6 | from .relation import Relation 7 | 8 | 9 | class View(Relation): 10 | """Class representing view in the database 11 | """ 12 | def __init__(self, sql): 13 | """Constructor for view class 14 | """ 15 | super(View, self).__init__() 16 | 17 | if isinstance(sql, SqlScript): 18 | # Take the first statement and ignore the rest 19 | sql = sql.statements[0] 20 | 21 | parameters = parse_create_view(sql.sql()) 22 | 23 | self.sql_statement = sql 24 | self.parameters = parameters 25 | 26 | self.full_name = parameters.get('view_name') 27 | self.replace_flag = parameters.get('replace', False) 28 | 29 | self.select_statement = SelectStatement(parameters.get('select_statement')) 30 | 31 | self.schema_name, self.view_name = self.initialize_name() 32 | 33 | @property 34 | def dependencies(self): 35 | """List of relations which this view references. 
36 | """ 37 | return self.select_statement.dependencies 38 | 39 | @property 40 | def columns(self): 41 | """List of columns in the view's select statement 42 | """ 43 | return self.select_statement.columns 44 | 45 | def drop_script(self): 46 | """Sql script to drop the view 47 | """ 48 | return SqlScript('DROP VIEW IF EXISTS %s CASCADE' % self.full_name) 49 | 50 | def check_not_exists_script(self): 51 | """Sql script to create statement if the table exists or not 52 | """ 53 | return SqlScript(""" 54 | SELECT NOT EXISTS( 55 | SELECT 1 56 | FROM information_schema.views 57 | WHERE table_schema = '%s' 58 | AND table_name = '%s' 59 | ) 60 | """ % (self.schema_name, self.view_name)) 61 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | We really appreciate any help we can get in making dataduct a successful project. 4 | There are a few guidelines that we need contributors to follow so that we can 5 | have a chance of keeping on top of things. 6 | 7 | ## Getting Started 8 | 9 | * Make sure you have a [GitHub account](https://github.com/signup/free) 10 | * Create an issue for the bug, assuming one does not already exist. 11 | * Clearly describe the issue including steps to reproduce when it is a bug. 12 | * Make sure you fill in the earliest version that you know has the issue. 13 | * Fork the repository on GitHub 14 | 15 | ## Making Changes 16 | 17 | * Create a topic branch from where you want to base your work. 18 | * This is usually the master branch. 19 | * Only target release branches if you are certain your fix must be on that 20 | branch. 21 | * To quickly create a topic branch based on master; `git checkout -b 22 | fix/master/my_contribution master`. Please avoid working directly on the 23 | `master` branch. 24 | * Make commits of logical units. 25 | * Check for unnecessary whitespace with `git diff --check` before committing. 26 | * Make sure your commit messages are in the proper format. 27 | * Make sure you have added the necessary tests for your changes. 28 | * Run _all_ the tests to assure nothing else was accidentally broken. 29 | * Make sure all the code follows PEP8 30 | 31 | ## Making Trivial Changes 32 | 33 | ### Documentation 34 | 35 | For changes of a trivial nature to comments and documentation, it is not 36 | always necessary to create a new issue. In this case, it is 37 | appropriate to start the first line of a commit with '(doc)' instead of 38 | a ticket number. 39 | 40 | ## Submitting Changes 41 | 42 | * Push your changes to a topic branch in your fork of the repository. 43 | * Submit a pull request to the repository in the coursera organization. 44 | * Reference the issue you created in the pull requrest 45 | 46 | # Additional Resources 47 | 48 | * [General GitHub documentation](http://help.github.com/) 49 | * [GitHub pull request documentation](http://help.github.com/send-pull-requests/) 50 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Installation using pip 5 | ---------------------- 6 | 7 | Dataduct can easily be installed using pip with the following commands. 
8 | 9 | :: 10 | 11 | pip install dataduct 12 | 13 | The major dependencies of dataduct are: 14 | 15 | - ``boto`` greater than version 2.34; older versions are missing some 16 | of the EMR functionality 17 | - ``PyYAML`` 18 | - ``pandas`` 19 | - ``psycopg2`` 20 | - ``pytimeparse`` 21 | - ``MySQL-python`` 22 | - ``pyparsing`` 23 | - ``testfixtures`` 24 | 25 | Ensure that a boto config file 26 | containing proper AWS credentials is present. 27 | 28 | The visualizations are created using: 29 | 30 | - ``graphviz`` 31 | - ``pygraphviz`` 32 | 33 | Autocomplete for the CLI is supported using: 34 | 35 | - ``argcomplete`` 36 | 37 | The documentation is created using: 38 | 39 | - ``sphinx`` 40 | - ``sphinx-napoleon`` 41 | - ``sphinx_rtd_theme`` 42 | 43 | Installing in the developer environment 44 | --------------------------------------- 45 | 46 | 1. Clone the Repo 47 | ^^^^^^^^^^^^^^^^^ 48 | 49 | :: 50 | 51 | git clone https://github.com/coursera/dataduct.git 52 | 53 | 2. Update PATH and PYTHONPATH 54 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 55 | 56 | Add these lines into your ``.bash_profile`` or ``.zshrc``, etc., based on 57 | your shell type. 58 | 59 | :: 60 | 61 | export PYTHONPATH=~/dataduct:$PYTHONPATH 62 | export PATH=~/dataduct/bin:$PATH 63 | 64 | 3. Config 65 | ^^^^^^^^^ 66 | 67 | Create a config file. Instructions for this are provided in the config 68 | section. 69 | 70 | Setup Autocomplete 71 | ------------------ 72 | 73 | Install argcomplete with ``pip install argcomplete``. 74 | 75 | If you're using ``bash`` then add the following to your 76 | ``.bash_profile``: 77 | 78 | :: 79 | 80 | eval "$(register-python-argcomplete dataduct)" 81 | 82 | If you're using ``zsh`` then add the following line to your ``.zshrc``: 83 | 84 | :: 85 | 86 | autoload bashcompinit 87 | bashcompinit 88 | eval "$(register-python-argcomplete dataduct)" 89 | -------------------------------------------------------------------------------- /dataduct/data_access/tests/test_connection.py: -------------------------------------------------------------------------------- 1 | """Tests for the connection file 2 | """ 3 | from unittest import TestCase 4 | from nose.tools import eq_ 5 | from nose.tools import raises 6 | 7 | from ...config import Config 8 | from ...utils.exceptions import ETLConfigError 9 | from ..
import connection 10 | 11 | 12 | class TestConnection(TestCase): 13 | """Tests for the connection file 14 | """ 15 | @staticmethod 16 | def test_get_redshift_config_correctly_returns(): 17 | """Tests that get_redshift_config can successfully retrieve the 18 | redshift config 19 | """ 20 | config = Config() 21 | config.redshift = 'test' 22 | eq_(connection.get_redshift_config(), 'test') 23 | 24 | @staticmethod 25 | @raises(ETLConfigError) 26 | def test_get_redshift_config_no_config_raises(): 27 | """Tests that get_redshift_config raises an exception if the redshift 28 | config cannot be found 29 | """ 30 | config = Config() 31 | del config.redshift 32 | connection.get_redshift_config() 33 | 34 | @staticmethod 35 | @raises(ETLConfigError) 36 | def test_sql_config_no_config_raises(): 37 | """Tests that get_sql_config raises an exception if the config cannot 38 | be found 39 | """ 40 | config = Config() 41 | del config.mysql 42 | connection.get_sql_config('test') 43 | 44 | @staticmethod 45 | @raises(ETLConfigError) 46 | def test_sql_config_cannot_find_hostname_raises(): 47 | """Tests that get_sql_config raises an exception if the hostname is not 48 | in the config 49 | """ 50 | config = Config() 51 | config.mysql = {'test': {}} 52 | connection.get_sql_config('test1') 53 | 54 | @staticmethod 55 | def test_sql_config_correctly_returns(): 56 | """Tests that get_sql_config can correctly retrieve the config 57 | """ 58 | config = Config() 59 | config.mysql = {'test': {'cred': 'data'}} 60 | result = connection.get_sql_config('test') 61 | eq_(result['DATABASE'], 'test') 62 | eq_(result['cred'], 'data') 63 | -------------------------------------------------------------------------------- /dataduct/database/sql/sql_statement.py: -------------------------------------------------------------------------------- 1 | """Script that contains the sql statement class 2 | """ 3 | from copy import deepcopy 4 | from .utils import sanitize_sql 5 | from ..parsers import parse_create_table 6 | from ..parsers import parse_create_view 7 | 8 | 9 | class SqlStatement(object): 10 | """Class representing a single SQL statement 11 | """ 12 | def __init__(self, sql=None, transactional=False): 13 | """Constructor for the SqlStatement class 14 | """ 15 | if sql is None: 16 | sql = '' 17 | self._raw_sql = sql 18 | self.transactional = transactional 19 | self._raw_statement = self._sanitize_sql() 20 | 21 | def __str__(self): 22 | """Print a SqlStatement object 23 | """ 24 | return self.sql() 25 | 26 | def copy(self): 27 | """Create a copy of the relation object 28 | """ 29 | return deepcopy(self) 30 | 31 | def sql(self): 32 | """Returns the raw_sql for the SqlStatement 33 | """ 34 | return self._raw_statement 35 | 36 | def _sanitize_sql(self): 37 | """Clean the SQL, remove comments and empty statements 38 | """ 39 | if self._raw_sql is None: 40 | return '' 41 | 42 | raw_statements = sanitize_sql(self._raw_sql, self.transactional) 43 | 44 | if len(raw_statements) > 1: 45 | raise ValueError('SQL Statement can not contain more than 1 query') 46 | elif len(raw_statements) == 1: 47 | return raw_statements[0] 48 | else: 49 | return '' 50 | 51 | def _validate_parser(self, func): 52 | """Check if a parser satisfies the sql statement 53 | """ 54 | try: 55 | func(self.sql()) 56 | except Exception: 57 | return False 58 | return True 59 | 60 | def creates_table(self): 61 | """SQL statement creates a table. 62 | """ 63 | return self._validate_parser(parse_create_table) 64 | 65 | def creates_view(self): 66 | """SQL statement creates a view. 
67 | """ 68 | return self._validate_parser(parse_create_view) 69 | -------------------------------------------------------------------------------- /dataduct/database/parsers/tests/test_create_table.py: -------------------------------------------------------------------------------- 1 | """Tests for create table parser 2 | """ 3 | 4 | from unittest import TestCase 5 | from nose.tools import eq_ 6 | from nose.tools import raises 7 | from pyparsing import ParseException 8 | 9 | from ..create_table import parse_create_table 10 | from ..create_table import create_exists_clone 11 | 12 | 13 | class TestCreateTableStatement(TestCase): 14 | """Tests for create table 15 | """ 16 | @staticmethod 17 | def test_basic(): 18 | """Basic test for create table 19 | """ 20 | query = ('CREATE TABLE orders (' 21 | 'customer_id INTEGER DISTKEY PRIMARY KEY,' 22 | 'customer_name VARCHAR(200))') 23 | 24 | output = parse_create_table(query) 25 | 26 | eq_(output['full_name'], 'orders') 27 | eq_(output['temporary'], False) 28 | eq_(output['exists_checks'], False) 29 | eq_(len(output['constraints']), 0) 30 | eq_(len(output['columns']), 2) 31 | 32 | @staticmethod 33 | def test_exists_clone(): 34 | """Basic test for create table clone with exists condition 35 | """ 36 | query = ('CREATE TABLE orders (' 37 | 'customer_id INTEGER DISTKEY PRIMARY KEY,' 38 | 'customer_name VARCHAR(200))') 39 | 40 | exists_clone = create_exists_clone(query) 41 | output = parse_create_table(exists_clone) 42 | eq_(output['full_name'], 'orders') 43 | eq_(output['temporary'], False) 44 | eq_(output['exists_checks'], True) 45 | 46 | @staticmethod 47 | @raises(ParseException) 48 | def test_bad_input(): 49 | """Feeding malformed input into create table 50 | """ 51 | query = 'CREATE TABLE orders (' +\ 52 | 'customer_id INTEGER DISTKEY PRIMARY KEY' 53 | parse_create_table(query) 54 | 55 | @staticmethod 56 | @raises(ParseException) 57 | def test_bad_input_in_columns(): 58 | """Feeding malformed input into create table 59 | """ 60 | query = 'CREATE TABLE orders (' +\ 61 | 'customer_id NEGATIVE DISTKEY PRIMARY KEY)' 62 | parse_create_table(query) 63 | -------------------------------------------------------------------------------- /dataduct/qa/count_check.py: -------------------------------------------------------------------------------- 1 | """QA test for comparing number of rows in the source system with the Warehouse 2 | """ 3 | 4 | from .check import Check 5 | from .utils import render_output 6 | 7 | 8 | class CountCheck(Check): 9 | """QA test for comparing number of rows across the ETL 10 | """ 11 | def __init__(self, source_count, destination_count, **kwargs): 12 | """Constructor for the Count based QA 13 | 14 | Args: 15 | source_count(int): Count of objects in the source system 16 | destination_count(int): Count of objects in the warehouse 17 | """ 18 | super(CountCheck, self).__init__(**kwargs) 19 | self.source_count = source_count 20 | self.destination_count = destination_count 21 | 22 | @property 23 | def error_rate(self): 24 | """The error rate. 25 | If there are no values in the source or destination, the error is 0. 
26 | If there are no values in the source but some in the destination, 27 | the error is None 28 | """ 29 | return self.calculate_error_rate(self.source_count, 30 | self.destination_count) 31 | 32 | @staticmethod 33 | def calculate_error_rate(source_count, destination_count): 34 | """Calculate the error rate based on the source and destination counts 35 | """ 36 | if source_count > 0: 37 | error_difference = float(source_count - destination_count) 38 | return abs(error_difference * 100) / source_count 39 | elif destination_count == 0: 40 | return 0 41 | else: 42 | return None 43 | 44 | @property 45 | def summary(self): 46 | """Summary of the test results for the SNS message 47 | """ 48 | return render_output( 49 | [ 50 | 'Test Name: %s' % self.name, 51 | 'Success: %s' % self.success, 52 | 'Tolerance: %0.4f%%' % self.tolerance, 53 | 'Error Rate: %0.4f%%' % self.error_rate, 54 | 'Source Count: %d' % self.source_count, 55 | 'Destination Count: %d' % self.destination_count, 56 | ] 57 | ) 58 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # Changes in dataduct 2 | 3 | ### 0.5.0 4 | - Cleanup commands being passed in QA steps 5 | - Add support for postgres 6 | - Status bar for uploading large files 7 | - Minor bug fixes 8 | 9 | ### 0.4.0 10 | - Support for starting database shell from dataduct CLI 11 | - Fix bug in logger configuration 12 | - More performance tuning for analyze and vacuum 13 | - Improved subject line for SNS messages 14 | - More informed logging for load errors 15 | - Improvements to decorators 16 | - PK enforcement changes 17 | - New load-reload-pk step 18 | - Support for worker groups 19 | - Steps to move away from scripts to all code being contained in the library 20 | 21 | ### 0.3.0 22 | - More documentation 23 | - Bug fixes in SQL parser 24 | - Hooks framework 25 | - Default bootstrap 26 | - Teardown 27 | - Frequency fixes 28 | 29 | ### 0.2.0 30 | - Travis integration for continuous builds 31 | - QA steps and logging to S3 32 | - Visualizing pipeline 33 | - Dataduct CLI updated as a single entry point 34 | - RDS connections for scripts 35 | - Bootstrap step for pipelines 36 | - Backfill or delay activation 37 | - Output path and input path options 38 | - Script directory for transform step 39 | - SQL sanitization for DBA actions 40 | - SQL parser for select and create table statements 41 | - Logging across the library 42 | - Support for custom steps 43 | - Pipeline dependency step 44 | - Reduce verbosity of imports 45 | - Step parsing is isolated in steps 46 | - More examples for steps 47 | - Sync config with S3 48 | - Config overrides with modes 49 | - Rename keywords and safe config failure handling 50 | - EMR Streaming support with hadoop 2 51 | - Exceptions cleanup 52 | - Read the docs support 53 | - Creating tables automatically for various steps 54 | - History table support 55 | - EC2 and EMR config control from YAML 56 | - Slack integration 57 | - Support for Regions in DP 58 | 59 | ### 0.1.0 60 | - Initial version of the dataduct library released 61 | - Support for the following steps: 62 | - emr_streaming 63 | - extract-local 64 | - extract-s3 65 | - extract-rds 66 | - extract-redshift 67 | - load-redshift 68 | - sql-command 69 | - transform 70 | - Examples and documentation added for all the steps 71 | -------------------------------------------------------------------------------- /dataduct/tests/test_import.py:
-------------------------------------------------------------------------------- 1 | """Tests for dependencies 2 | """ 3 | from unittest import TestCase 4 | 5 | 6 | class TestImports(TestCase): 7 | """Tests for dependencies 8 | """ 9 | @staticmethod 10 | def test_boto(): 11 | """Testing boto 12 | """ 13 | print 'Trying to import boto' 14 | import boto 15 | 16 | @staticmethod 17 | def test_mysqldb(): 18 | """Testing MySQLdb 19 | """ 20 | print 'Trying to import MySQLdb' 21 | import MySQLdb 22 | 23 | @staticmethod 24 | def test_pandas(): 25 | """Testing pandas 26 | """ 27 | print 'Trying to import pandas' 28 | import pandas 29 | print pandas.io.sql 30 | 31 | @staticmethod 32 | def test_psycopg2(): 33 | """Testing psycopg2 34 | """ 35 | print 'Trying to import psycopg2' 36 | import psycopg2 37 | 38 | @staticmethod 39 | def test_pygraphviz(): 40 | """Testing pygraphviz 41 | """ 42 | print 'Trying to import pygraphviz' 43 | import pygraphviz 44 | 45 | @staticmethod 46 | def test_pyparsing(): 47 | """Testing pyparsing 48 | """ 49 | print 'Trying to import pyparsing' 50 | import pyparsing 51 | 52 | @staticmethod 53 | def test_pyyaml(): 54 | """Testing PyYAML 55 | """ 56 | print 'Trying to import pyyaml' 57 | import yaml 58 | 59 | @staticmethod 60 | def test_setuptools(): 61 | """Testing setuptools 62 | """ 63 | print 'Trying to import setuptools' 64 | import setuptools 65 | 66 | @staticmethod 67 | def test_sphinx_rtd_theme(): 68 | """Testing sphinx_rtd_theme 69 | """ 70 | print 'Trying to import sphinx_rtd_theme' 71 | import sphinx_rtd_theme 72 | 73 | @staticmethod 74 | def test_testfixtures(): 75 | """Testing testfixtures 76 | """ 77 | print 'Trying to import testfixtures' 78 | import testfixtures 79 | 80 | @staticmethod 81 | def test_pytimeparse(): 82 | """Testing pytimeparse 83 | """ 84 | print 'Trying to import pytimeparse' 85 | import pytimeparse 86 | -------------------------------------------------------------------------------- /dataduct/utils/hook.py: -------------------------------------------------------------------------------- 1 | """Hook framework in dataduct. 2 | 3 | To make a function hookable, add the hook decorator like so: 4 | 5 | @hook('hook_name') 6 | def function(): 7 | ... 8 | """ 9 | import os 10 | import imp 11 | import sys 12 | 13 | from .helpers import parse_path 14 | 15 | 16 | def default_before_hook(*args, **kwargs): 17 | """The default before hook, will act like it's not even there 18 | """ 19 | return args, kwargs 20 | 21 | 22 | def default_after_hook(result): 23 | """The default after hook, will act like it's not even there 24 | """ 25 | return result 26 | 27 | 28 | def get_hooks(hook_name): 29 | """Returns the before hook and after hook (in a tuple) for a particular 30 | hook name 31 | """ 32 | from dataduct.config import Config 33 | config = Config() 34 | 35 | if 'HOOKS_BASE_PATH' not in config.etl: 36 | return default_before_hook, default_after_hook 37 | 38 | hook_file = parse_path(hook_name + '.py', 'HOOKS_BASE_PATH') 39 | if not os.path.isfile(hook_file): 40 | return default_before_hook, default_after_hook 41 | 42 | # Delete the previous custom hook, so the imports are not merged. 
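    # Python 2's imp.load_source() re-initialises an already-imported module
    # in place without clearing its namespace, so functions defined by a
    # previously loaded hook file (e.g. its before_hook) could otherwise leak
    # into the next hook loaded under the same 'custom_hook' module name.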
43 | if 'custom_hook' in sys.modules: 44 | del sys.modules['custom_hook'] 45 | 46 | # Get the hook functions, falling back to the default hooks 47 | custom_hook = imp.load_source('custom_hook', hook_file) 48 | before_hook = getattr(custom_hook, 'before_hook', default_before_hook) 49 | after_hook = getattr(custom_hook, 'after_hook', default_after_hook) 50 | 51 | return before_hook, after_hook 52 | 53 | 54 | def hook(hook_name): 55 | """The hook decorator creator 56 | """ 57 | before_hook, after_hook = get_hooks(hook_name) 58 | 59 | def hook_decorator(func): 60 | """The hook decorator 61 | """ 62 | def function_wrapper(*args, **kwargs): 63 | """The hook wrapper for the function 64 | """ 65 | new_args, new_kwargs = before_hook(*args, **kwargs) 66 | result = func(*new_args, **new_kwargs) 67 | new_result = after_hook(result) 68 | return new_result 69 | 70 | return function_wrapper 71 | 72 | return hook_decorator 73 | -------------------------------------------------------------------------------- /dataduct/pipeline/mysql_node.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for MysqlNode 3 | """ 4 | 5 | from .pipeline_object import PipelineObject 6 | from .schedule import Schedule 7 | from ..utils.exceptions import ETLInputError 8 | 9 | 10 | class MysqlNode(PipelineObject): 11 | """MySQL Data Node class 12 | """ 13 | 14 | def __init__(self, id, schedule, host, database, username, password, sql, 15 | table, depends_on=None): 16 | """Constructor for the MysqlNode class 17 | 18 | Args: 19 | id(str): id of the object 20 | schedule(Schedule): pipeline schedule 21 | host(str): hostname for the mysql database 22 | database(str): database name on the RDS host 23 | user(str): username for the database 24 | password(str): password for the database 25 | sql(str): sql to be executed 26 | table(str): table to be read 27 | """ 28 | 29 | # Validate inputs 30 | if not isinstance(schedule, Schedule): 31 | raise ETLInputError( 32 | 'Input schedule must be of the type Schedule') 33 | 34 | if not depends_on: 35 | depends_on = list() 36 | 37 | connection_string = "jdbc:mysql://" + host + ":3306/" + database 38 | 39 | kwargs = { 40 | 'id': id, 41 | 'type': 'SqlDataNode', 42 | 'schedule': schedule, 43 | 'connectionString': connection_string, 44 | 'username': username, 45 | '*password': password, 46 | 'selectQuery': sql, 47 | 'table': table, 48 | 'dependsOn': depends_on, 49 | } 50 | super(MysqlNode, self).__init__(**kwargs) 51 | 52 | @property 53 | def database(self): 54 | """Get the database name for the MySQL node 55 | 56 | Returns: 57 | result(str): database name for this MySQL node 58 | """ 59 | return self['connectionString'].split("/").pop() 60 | 61 | @property 62 | def table(self): 63 | """Get the table name for the MySQL node 64 | 65 | Returns: 66 | result(str): table name for this MySQL node 67 | """ 68 | return self['tableName'] 69 | -------------------------------------------------------------------------------- /dataduct/steps/create_load_redshift.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for loading into redshift with the COPY command 2 | """ 3 | from ..config import Config 4 | from ..database import SqlStatement 5 | from ..database import Table 6 | from ..utils import constants as const 7 | from ..utils.helpers import parse_path 8 | from .transform import TransformStep 9 | 10 | config = Config() 11 | 12 | 13 | class CreateAndLoadStep(TransformStep): 14 | 
"""CreateAndLoad Step class that creates table if needed and loads data 15 | """ 16 | 17 | def __init__(self, id, table_definition, input_node, 18 | script_arguments=None, **kwargs): 19 | """Constructor for the CreateAndLoadStep class 20 | 21 | Args: 22 | table_definition(filepath): schema file for the table to be loaded 23 | script_arguments(list of str): list of arguments to the script 24 | **kwargs(optional): Keyword arguments directly passed to base class 25 | """ 26 | with open(parse_path(table_definition)) as f: 27 | table_def_string = f.read() 28 | 29 | table = Table(SqlStatement(table_def_string)) 30 | 31 | if isinstance(input_node, dict): 32 | input_paths = [i.path().uri for i in input_node.values()] 33 | else: 34 | input_paths = [input_node.path().uri] 35 | 36 | if script_arguments is None: 37 | script_arguments = list() 38 | 39 | script_arguments.extend([ 40 | '--table_definition=%s' % table.sql().sql(), 41 | '--s3_input_paths'] + input_paths) 42 | 43 | super(CreateAndLoadStep, self).__init__( 44 | id=id, command=const.LOAD_COMMAND, 45 | script_arguments=script_arguments, no_input=True, no_output=True, 46 | **kwargs) 47 | 48 | @classmethod 49 | def arguments_processor(cls, etl, input_args): 50 | """Parse the step arguments according to the ETL pipeline 51 | 52 | Args: 53 | etl(ETLPipeline): Pipeline object containing resources and steps 54 | step_args(dict): Dictionary of the step arguments for the class 55 | """ 56 | step_args = cls.base_arguments_processor(etl, input_args) 57 | 58 | return step_args 59 | -------------------------------------------------------------------------------- /dataduct/etl/tests/test_etl_pipeline.py: -------------------------------------------------------------------------------- 1 | """Tests for the ETL Pipeline object 2 | """ 3 | import unittest 4 | from nose.tools import raises 5 | from nose.tools import eq_ 6 | 7 | from datetime import timedelta 8 | from ..etl_pipeline import ETLPipeline 9 | from ...utils.exceptions import ETLInputError 10 | 11 | 12 | class EtlPipelineTests(unittest.TestCase): 13 | """Tests for the ETL Pipeline object 14 | """ 15 | 16 | def setUp(self): 17 | """Setup text fixtures 18 | """ 19 | self.default_pipeline = ETLPipeline('test_pipeline') 20 | 21 | @staticmethod 22 | def test_construct_etl_pipeline(): 23 | """Test if the constructor for EtlPipeline is correct 24 | """ 25 | result = ETLPipeline( 26 | 'test_pipeline', 27 | frequency='one-time', 28 | ec2_resource_config={'terminate_after':'2 Hours'}, 29 | time_delta=timedelta(seconds=3600), 30 | emr_cluster_config={'cfg1': 'value'}, 31 | load_time='12:34', 32 | topic_arn='sns:topic-arn:test-case', 33 | max_retries=5, 34 | bootstrap={'cfg1': 'value'}, 35 | ) 36 | assert result.name.endswith('test_pipeline') 37 | eq_(result.frequency, 'one-time') 38 | eq_(result.ec2_resource_config, {'terminate_after':'2 Hours'}) 39 | eq_(result.load_hour, 12) 40 | eq_(result.load_min, 34) 41 | eq_(result.time_delta, timedelta(seconds=3600)) 42 | eq_(result.max_retries, 5) 43 | eq_(result.topic_arn, 'sns:topic-arn:test-case') 44 | eq_(result.bootstrap_definitions, {'cfg1': 'value'}) 45 | eq_(result.emr_cluster_config, {'cfg1': 'value'}) 46 | 47 | @staticmethod 48 | def test_no_load_time_default_none(): 49 | """Test if the load_hour and load_min get set to None 50 | if load_time is None 51 | """ 52 | result = ETLPipeline('no_load_time_pipeline', load_time=None) 53 | eq_(result.load_hour, None) 54 | eq_(result.load_min, None) 55 | 56 | @raises(ETLInputError) 57 | def 
test_bad_data_type_throws(self): 58 | """Test that exception is thrown if the data_type parameter for 59 | _s3_uri is bad 60 | """ 61 | self.default_pipeline._s3_uri('TEST_DATA_TYPE') 62 | -------------------------------------------------------------------------------- /dataduct/pipeline/redshift_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for redshift database 3 | """ 4 | 5 | from ..config import Config 6 | from .pipeline_object import PipelineObject 7 | from ..utils.exceptions import ETLConfigError 8 | 9 | config = Config() 10 | 11 | if not hasattr(config, 'redshift'): 12 | raise ETLConfigError('Redshift credentials missing from config') 13 | 14 | CLUSTER_ID = None 15 | CONNECTION_STRING = None 16 | DATABASE_NAME = config.redshift['DATABASE_NAME'] 17 | USERNAME = config.redshift['USERNAME'] 18 | PASSWORD = config.redshift['PASSWORD'] 19 | 20 | 21 | if 'CLUSTER_ID' in config.redshift and 'CONNECTION_STRING' in config.redshift: 22 | raise ETLConfigError('Redshift credentials - specify only one of CLUSTER_ID or CONNECTION_STRING in config') 23 | elif 'CLUSTER_ID' in config.redshift: 24 | CLUSTER_ID = config.redshift['CLUSTER_ID'] 25 | elif 'CONNECTION_STRING' in config.redshift: 26 | CONNECTION_STRING = config.redshift['CONNECTION_STRING'] 27 | 28 | class RedshiftDatabase(PipelineObject): 29 | """Redshift database class 30 | """ 31 | 32 | def __init__(self, 33 | id, 34 | database_name=DATABASE_NAME, 35 | cluster_id=CLUSTER_ID, 36 | connection_string=CONNECTION_STRING, 37 | username=USERNAME, 38 | password=PASSWORD): 39 | """Constructor for the RedshiftDatabase class 40 | 41 | Args: 42 | id(str): id of the object 43 | database_name(str): name of the database 44 | cluster_id(str): identifier for the redshift cluster in AWS 45 | connection_string(str): JDBC connection string for the Redshift cluster.
46 | username(str): username for the database 47 | password(str): password for the database 48 | """ 49 | 50 | kwargs = { 51 | 'id': id, 52 | 'type': 'RedshiftDatabase', 53 | 'databaseName': database_name, 54 | 'username': username, 55 | '*password': password 56 | } 57 | 58 | if CLUSTER_ID: 59 | kwargs['clusterId'] = CLUSTER_ID 60 | else: 61 | kwargs['connectionString'] = CONNECTION_STRING 62 | 63 | super(RedshiftDatabase, self).__init__(**kwargs) 64 | -------------------------------------------------------------------------------- /dataduct/database/sql/tests/test_sql_statement.py: -------------------------------------------------------------------------------- 1 | """Tests for the SqlStatement class 2 | """ 3 | from nose.tools import assert_not_equal 4 | from nose.tools import eq_ 5 | from nose.tools import raises 6 | from unittest import TestCase 7 | 8 | from ..sql_statement import SqlStatement 9 | 10 | 11 | class TestSqlStatement(TestCase): 12 | """Tests for sql statement function 13 | """ 14 | @staticmethod 15 | def test_basic(): 16 | """Basic test for statement declaration 17 | """ 18 | query = 'select \n 1;' 19 | result = 'select 1' 20 | 21 | eq_(SqlStatement(query).sql(), result) 22 | 23 | @staticmethod 24 | def test_sanatization(): 25 | """Sanatization of comments 26 | """ 27 | query = 'select 1 -- test connect \n;' 28 | result = 'select 1' 29 | 30 | eq_(SqlStatement(query).sql(), result) 31 | 32 | @staticmethod 33 | def test_sanatization_multiline_comment(): 34 | """Sanatization of comments 35 | """ 36 | query = '/* Comment */\n select 1;' 37 | result = 'select 1' 38 | 39 | eq_(SqlStatement(query).sql(), result) 40 | 41 | @staticmethod 42 | def test_sanatization_multiline_comment_nesting(): 43 | """Sanatization of comments 44 | """ 45 | query = '/* Comment /* nest */ */\n select 1;' 46 | result = 'select 1' 47 | 48 | eq_(SqlStatement(query).sql(), result) 49 | 50 | @staticmethod 51 | def test_sanatization_multiline_comment_partial_nesting(): 52 | """Sanatization of comments 53 | This is a test to highlight issue #134 which was marked as won't fix 54 | """ 55 | query = '/* Comment /* nest */\n select 1;' 56 | result = 'select 1' 57 | parsed_output = '/* Comment select 1' 58 | 59 | eq_(SqlStatement(query).sql(), parsed_output) 60 | assert_not_equal(SqlStatement(query).sql(), result) 61 | 62 | @staticmethod 63 | @raises(ValueError) 64 | def test_error(): 65 | """Raise error if multiple queries are passed 66 | """ 67 | query = 'select 1; select 2;' 68 | SqlStatement(query) 69 | 70 | @staticmethod 71 | def test_empty_declaration(): 72 | """Empty if no sql query is passed 73 | """ 74 | eq_(SqlStatement().sql(), '') 75 | -------------------------------------------------------------------------------- /docs/hooks.rst: -------------------------------------------------------------------------------- 1 | Hooks 2 | ===== 3 | 4 | Dataduct has some endpoints you can use to execute python scripts before and 5 | after certain events when using the CLI and library locally. 6 | 7 | Available Hooks 8 | ~~~~~~~~~~~~~~~ 9 | 10 | - ``activate_pipeline``, which hooks onto the ``activate_pipeline`` function in 11 | ``dataduct.etl.etl_actions``. 12 | - ``connect_to_redshift``, which hooks onto the ``redshift_connection`` function in 13 | ``dataduct.data_access``. 14 | 15 | Creating a hook 16 | ~~~~~~~~~~~~~~~ 17 | 18 | Dataduct tries to find available hooks by searching in the directory specified 19 | by the ``HOOKS_BASE_PATH`` config variable in the ``etl`` section, matching them 20 | by their filename. 
For example, a hook for the ``activate_pipeline`` 21 | endpoint would saved as ``activate_pipeline.py`` in that directory. 22 | 23 | Each hook has two endpoints: ``before_hook`` and ``after_hook``. To implement 24 | one of these endpoints, you declare them as functions inside the hook. You may 25 | implement only one or both endpoints per hook. 26 | 27 | ``before_hook`` is called before the hooked function is executed. The parameters 28 | passed into the hooked function will also be passed to the ``before_hook``. 29 | The ``before_hook`` is designed to allow you to manipulate the arguments of 30 | the hooked function before it is called. At the end of the ``before_hook``, 31 | return the ``args`` and ``kwargs`` of the hooked function as a tuple. 32 | 33 | Example ``before_hook``: 34 | 35 | .. code:: python 36 | 37 | # hooked function signature: 38 | # def example(arg_one, arg_two, arg_three='foo') 39 | 40 | def before_hook(arg_one, arg_two, arg_three='foo'): 41 | return [arg_one + 1, 'hello world'], {'arg_three': 'bar'} 42 | 43 | ``after_hook`` is called after the hooked function is executed. The result of the 44 | hooked function is passed into ``after_hook`` as a single parameter. 45 | The ``after_hook`` is designed to allow you to access or manipulate the result of 46 | the hooked function. At the end of the ``after_hook``, return the (modified) 47 | result of the hooked function. 48 | 49 | Example ``after_hook``: 50 | 51 | .. code:: python 52 | 53 | # hooked function result: {'foo': 1, 'bar': 'two'} 54 | 55 | def after_hook(result): 56 | result['foo'] = 2 57 | result['bar'] = result['bar'] + ' three' 58 | return result 59 | -------------------------------------------------------------------------------- /dataduct/pipeline/copy_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for CopyActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from .schedule import Schedule 7 | 8 | from ..config import Config 9 | from ..utils import constants as const 10 | from ..utils.exceptions import ETLInputError 11 | 12 | config = Config() 13 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 14 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 15 | 16 | 17 | class CopyActivity(Activity): 18 | """EC2 Resource class 19 | """ 20 | 21 | def __init__(self, 22 | id, 23 | input_node, 24 | output_node, 25 | schedule, 26 | resource=None, 27 | worker_group=None, 28 | max_retries=None, 29 | depends_on=None, 30 | **kwargs): 31 | """Constructor for the CopyActivity class 32 | 33 | Args: 34 | id(str): id of the object 35 | input_node(S3Node / list of S3Nodes): input nodes for the activity 36 | output_node(S3Node / list of S3Nodes): output nodes for activity 37 | schedule(Schedule): schedule of the pipeline 38 | resource(Ec2Resource / EmrResource): resource to run the activity on 39 | worker_group(str): the worker group to run the activity on 40 | max_retries(int): number of retries for the activity 41 | depends_on(list of activities): dependendent pipelines steps 42 | **kwargs(optional): Keyword arguments directly passed to base class 43 | """ 44 | 45 | # Validate inputs 46 | if not isinstance(schedule, Schedule): 47 | raise ETLInputError( 48 | 'Input schedule must be of the type Schedule') 49 | 50 | # Set default values 51 | if depends_on is None: 52 | depends_on = [] 53 | if max_retries is None: 54 | max_retries = MAX_RETRIES 55 | 56 | super(CopyActivity, self).__init__( 57 | id=id, 58 | 
retryDelay=RETRY_DELAY, 59 | type='CopyActivity', 60 | maximumRetries=max_retries, 61 | dependsOn=depends_on, 62 | input=input_node, 63 | output=output_node, 64 | runsOn=resource, 65 | workerGroup=worker_group, 66 | schedule=schedule, 67 | ) 68 | -------------------------------------------------------------------------------- /dataduct/pipeline/emr_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for EmrActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from ..config import Config 7 | from .schedule import Schedule 8 | from ..utils import constants as const 9 | from ..utils.exceptions import ETLInputError 10 | 11 | config = Config() 12 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 13 | 14 | 15 | class EmrActivity(Activity): 16 | """EMR Activity class 17 | """ 18 | 19 | def __init__(self, 20 | id, 21 | schedule, 22 | input_node, 23 | emr_step_string, 24 | resource=None, 25 | worker_group=None, 26 | output_node=None, 27 | additional_files=None, 28 | max_retries=None, 29 | depends_on=None): 30 | """Constructor for the EmrActivity class 31 | 32 | Args: 33 | id(str): id of the object 34 | schedule(Schedule): schedule of the pipeline 35 | emr_step_string(list of str): command string to be executed 36 | resource(Ec2Resource / EMRResource): resource to run the activity on 37 | worker_group(str): the worker group to run the activity on 38 | output_node(S3Node): output_node for the emr job 39 | additional_files(list of S3File): Additional files required for emr 40 | max_retries(int): number of retries for the activity 41 | depends_on(list of activities): dependendent pipelines steps 42 | """ 43 | 44 | # Validate inputs 45 | if not isinstance(schedule, Schedule): 46 | raise ETLInputError( 47 | 'Input schedule must be of the type Schedule') 48 | 49 | # Set default values 50 | if depends_on is None: 51 | depends_on = [] 52 | if max_retries is None: 53 | max_retries = MAX_RETRIES 54 | 55 | super(EmrActivity, self).__init__( 56 | id=id, 57 | type='EmrActivity', 58 | maximumRetries=max_retries, 59 | dependsOn=depends_on, 60 | runsOn=resource, 61 | workerGroup=worker_group, 62 | schedule=schedule, 63 | step=emr_step_string, 64 | output=output_node, 65 | input=input_node, 66 | ) 67 | 68 | self.add_additional_files(additional_files) 69 | -------------------------------------------------------------------------------- /dataduct/pipeline/activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for data pipeline instance 3 | """ 4 | 5 | from ..utils.exceptions import ETLInputError 6 | from ..utils.helpers import exactly_one 7 | from .pipeline_object import PipelineObject 8 | 9 | 10 | class Activity(PipelineObject): 11 | """Base class for pipeline activities 12 | """ 13 | 14 | def __init__(self, dependsOn, maximumRetries, runsOn, 15 | workerGroup, **kwargs): 16 | """Constructor for the activity class 17 | 18 | Args: 19 | dependsOn(list): list of dependent activities 20 | maximumRetries(int): maximum number of retries 21 | **kwargs(optional): Keyword arguments directly passed to base class 22 | 23 | Note: 24 | dependsOn and maximum retries are required fields for any activity 25 | """ 26 | if not exactly_one(runsOn, workerGroup): 27 | raise ETLInputError( 28 | 'Exactly one of runsOn or workerGroup allowed!') 29 | 30 | if runsOn: 31 | kwargs['runsOn'] = runsOn 32 | else: 33 | kwargs['workerGroup'] = workerGroup 34 | super(Activity, self).__init__( 35 | 
dependsOn=dependsOn, 36 | maximumRetries=maximumRetries, 37 | **kwargs 38 | ) 39 | 40 | def __str__(self): 41 | try: 42 | return "%s with id %s" % tuple(self.id.split(".", 1)[::-1]) 43 | except: 44 | return self.id 45 | 46 | @property 47 | def input(self): 48 | """Get the input node for the activity 49 | 50 | Returns: 51 | result: Input node for this activity 52 | """ 53 | return self['input'] 54 | 55 | @property 56 | def output(self): 57 | """Get the output node for the activity 58 | 59 | Returns: 60 | result: output node for this activity 61 | """ 62 | return self['output'] 63 | 64 | @property 65 | def depends_on(self): 66 | """Get the dependent activities for the activity 67 | 68 | Returns: 69 | result: dependent activities for this activity 70 | """ 71 | return self['dependsOn'] 72 | 73 | @property 74 | def maximum_retries(self): 75 | """Get the maximum retries for the activity 76 | 77 | Returns: 78 | result: maximum retries for this activity 79 | """ 80 | return self['maximumRetries'] 81 | -------------------------------------------------------------------------------- /dataduct/database/tests/test_table.py: -------------------------------------------------------------------------------- 1 | """Tests for Table 2 | """ 3 | from unittest import TestCase 4 | 5 | from .helpers import create_table 6 | from .helpers import compare_scripts 7 | 8 | 9 | class TestTable(TestCase): 10 | """Tests for tables 11 | """ 12 | 13 | def setUp(self): 14 | """Setup test fixtures for the table tests 15 | """ 16 | self.basic_table = create_table( 17 | 'CREATE TABLE test_table (id INTEGER);') 18 | 19 | def test_unload_script(self): 20 | """Tests if the unload script generates successfully 21 | """ 22 | result = [ 23 | ("UNLOAD ('SELECT * FROM test_table;') TO 's3://test/' " 24 | "CREDENTIALS 'aws_access_key_id=a;aws_secret_access_key=b' " 25 | "DELIMITER '\t' ESCAPE NULL AS 'NULL'") 26 | ] 27 | compare_scripts( 28 | self.basic_table.unload_script('s3://test/', 'a', 'b'), 29 | result) 30 | 31 | def test_unload_script_with_token(self): 32 | """Tests if the unload script generates successfully 33 | """ 34 | result = [ 35 | ("UNLOAD ('SELECT * FROM test_table;') TO 's3://test/' " 36 | "CREDENTIALS " 37 | "'aws_access_key_id=a;aws_secret_access_key=b;token=c' " 38 | "DELIMITER '\t' ESCAPE NULL AS 'NULL'") 39 | ] 40 | compare_scripts( 41 | self.basic_table.unload_script('s3://test/', 'a', 'b', 'c'), 42 | result) 43 | 44 | def test_load_script(self): 45 | """Tests if the unload script generates successfully 46 | """ 47 | result = [ 48 | ("COPY test_table FROM 's3://test/' " 49 | "CREDENTIALS 'aws_access_key_id=a;aws_secret_access_key=b' " 50 | "DELIMITER '\t' ESCAPE NULL AS 'NULL'") 51 | ] 52 | compare_scripts( 53 | self.basic_table.load_script('s3://test/', 'a', 'b'), 54 | result) 55 | 56 | def test_load_script_with_token(self): 57 | """Tests if the unload script generates successfully 58 | """ 59 | result = [ 60 | ("COPY test_table FROM 's3://test/' " 61 | "CREDENTIALS " 62 | "'aws_access_key_id=a;aws_secret_access_key=b;token=c' " 63 | "DELIMITER '\t' ESCAPE NULL AS 'NULL'") 64 | ] 65 | compare_scripts( 66 | self.basic_table.load_script('s3://test/', 'a', 'b', 'c'), 67 | result) 68 | -------------------------------------------------------------------------------- /dataduct/etl/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for processing etl steps 2 | """ 3 | import imp 4 | from ..config import Config 5 | from ..steps import * # noqa 6 | from 
..utils.helpers import parse_path 7 | from ..utils.exceptions import ETLInputError 8 | 9 | STEP_CLASSES = { 10 | 'column-check': ColumnCheckStep, 11 | 'count-check': CountCheckStep, 12 | 'create-load-redshift': CreateAndLoadStep, 13 | 'create-update-sql': CreateUpdateSqlStep, 14 | 'delta-load': DeltaLoadStep, 15 | 'emr-step': EMRJobStep, 16 | 'emr-streaming': EMRStreamingStep, 17 | 'extract-local': ExtractLocalStep, 18 | 'extract-rds': ExtractRdsStep, 19 | 'extract-redshift': ExtractRedshiftStep, 20 | 'extract-postgres': ExtractPostgresStep, 21 | 'extract-s3': ExtractS3Step, 22 | 'load-redshift': LoadRedshiftStep, 23 | 'load-postgres': LoadPostgresStep, 24 | 'load-reload-pk': LoadReloadAndPrimaryKeyStep, 25 | 'pipeline-dependencies': PipelineDependenciesStep, 26 | 'primary-key-check': PrimaryKeyCheckStep, 27 | 'qa-transform': QATransformStep, 28 | 'reload': ReloadStep, 29 | 'sql-command': SqlCommandStep, 30 | 'transform': TransformStep, 31 | 'upsert': UpsertStep, 32 | } 33 | 34 | 35 | def get_custom_steps(): 36 | """Fetch the custom steps specified in config 37 | """ 38 | config = Config() 39 | custom_steps = dict() 40 | 41 | for step_def in getattr(config, 'custom_steps', list()): 42 | step_type = step_def['step_type'] 43 | path = parse_path(step_def['file_path'], 'CUSTOM_STEPS_PATH') 44 | 45 | # Load source from the file path provided 46 | step_mod = imp.load_source(step_type, path) 47 | 48 | # Get the step class based on class_name provided 49 | step_class = getattr(step_mod, step_def['class_name']) 50 | 51 | # Check if step_class is of type ETLStep 52 | if not issubclass(step_class, ETLStep): 53 | raise ETLInputError('Step type %s is not of type ETLStep' % 54 | step_class.__name__) 55 | 56 | custom_steps[step_type] = step_class 57 | return custom_steps 58 | 59 | 60 | STEP_CONFIG = STEP_CLASSES.copy() 61 | STEP_CONFIG.update(get_custom_steps()) 62 | 63 | 64 | def process_steps(steps_params): 65 | """Format the step parameters by changing step type to step class 66 | """ 67 | steps = [] 68 | for step_param in steps_params: 69 | params = step_param.copy() 70 | step_type = params.pop('step_type') 71 | params['step_class'] = STEP_CONFIG[step_type] 72 | steps.append(params) 73 | return steps 74 | -------------------------------------------------------------------------------- /dataduct/pipeline/sql_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for SqlActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from ..config import Config 7 | from .schedule import Schedule 8 | from ..s3 import S3File 9 | from ..utils import constants as const 10 | from ..utils.exceptions import ETLInputError 11 | 12 | config = Config() 13 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 14 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 15 | 16 | 17 | class SqlActivity(Activity): 18 | """Sql Activity class 19 | """ 20 | 21 | def __init__(self, 22 | id, 23 | schedule, 24 | script, 25 | database, 26 | resource=None, 27 | worker_group=None, 28 | script_arguments=None, 29 | queue=None, 30 | max_retries=None, 31 | depends_on=None): 32 | """Constructor for the SqlActivity class 33 | 34 | Args: 35 | id(str): id of the object 36 | schedule(Schedule): schedule of the pipeline 37 | script(S3File): s3 uri of the script 38 | database(RedshiftDatabase): database to execute commands on 39 | resource(Ec2Resource / EMRResource): resource to run the activity on 40 | worker_group(str): the worker group to run the 
activity on 41 | queue(str): queue in which the query should be executed 42 | max_retries(int): number of retries for the activity 43 | depends_on(list of activities): dependendent pipelines steps 44 | """ 45 | 46 | # Validate inputs 47 | if not isinstance(schedule, Schedule): 48 | raise ETLInputError( 49 | 'Input schedule must be of the type Schedule') 50 | 51 | if not isinstance(script, S3File): 52 | raise ETLInputError('script must be an S3File') 53 | 54 | # Set default values 55 | if depends_on is None: 56 | depends_on = [] 57 | if max_retries is None: 58 | max_retries = MAX_RETRIES 59 | 60 | super(SqlActivity, self).__init__( 61 | id=id, 62 | retryDelay=RETRY_DELAY, 63 | type='SqlActivity', 64 | maximumRetries=max_retries, 65 | dependsOn=depends_on, 66 | runsOn=resource, 67 | workerGroup=worker_group, 68 | schedule=schedule, 69 | scriptUri=script, 70 | scriptArgument=script_arguments, 71 | database=database, 72 | queue=queue 73 | ) 74 | -------------------------------------------------------------------------------- /dataduct/steps/extract_redshift.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for RedshiftCopyActivity to extract data to S3 3 | """ 4 | from .etl_step import ETLStep 5 | from ..pipeline import RedshiftNode 6 | from ..pipeline import RedshiftCopyActivity 7 | 8 | 9 | class ExtractRedshiftStep(ETLStep): 10 | """Extract Redshift Step class that helps get data out of redshift 11 | """ 12 | 13 | def __init__(self, 14 | schema, 15 | table, 16 | redshift_database, 17 | insert_mode="TRUNCATE", 18 | output_path=None, 19 | **kwargs): 20 | """Constructor for the ExtractRedshiftStep class 21 | 22 | Args: 23 | schema(str): schema from which table should be extracted 24 | table(path): table name for extract 25 | insert_mode(str): insert mode for redshift copy activity 26 | redshift_database(RedshiftDatabase): database to excute the query 27 | **kwargs(optional): Keyword arguments directly passed to base class 28 | """ 29 | super(ExtractRedshiftStep, self).__init__(**kwargs) 30 | 31 | # Create input node 32 | self._input_node = self.create_pipeline_object( 33 | object_class=RedshiftNode, 34 | schedule=self.schedule, 35 | redshift_database=redshift_database, 36 | schema_name=schema, 37 | table_name=table, 38 | ) 39 | 40 | self._output = self.create_s3_data_node( 41 | self.get_output_s3_path(output_path)) 42 | 43 | self.create_pipeline_object( 44 | object_class=RedshiftCopyActivity, 45 | max_retries=self.max_retries, 46 | input_node=self.input, 47 | output_node=self.output, 48 | insert_mode=insert_mode, 49 | resource=self.resource, 50 | worker_group=self.worker_group, 51 | schedule=self.schedule, 52 | depends_on=self.depends_on, 53 | command_options=["DELIMITER '\t' ESCAPE"], 54 | ) 55 | 56 | @classmethod 57 | def arguments_processor(cls, etl, input_args): 58 | """Parse the step arguments according to the ETL pipeline 59 | 60 | Args: 61 | etl(ETLPipeline): Pipeline object containing resources and steps 62 | step_args(dict): Dictionary of the step arguments for the class 63 | """ 64 | input_args = cls.pop_inputs(input_args) 65 | step_args = cls.base_arguments_processor(etl, input_args) 66 | step_args['redshift_database'] = etl.redshift_database 67 | 68 | return step_args 69 | -------------------------------------------------------------------------------- /dataduct/database/parsers/tests/test_select_query.py: -------------------------------------------------------------------------------- 1 | """Tests for select statement 
parser 2 | """ 3 | 4 | from nose.tools import eq_ 5 | from nose.tools import raises 6 | from pyparsing import ParseException 7 | from unittest import TestCase 8 | 9 | from ..select_query import parse_column_name 10 | from ..select_query import parse_select_columns 11 | from ..select_query import parse_select_dependencies 12 | 13 | 14 | class TestCreateTableStatement(TestCase): 15 | """Tests for create table 16 | """ 17 | @staticmethod 18 | def test_basic(): 19 | """Basic test for select statement 20 | """ 21 | query = ('SELECT x, y, z AS t FROM abc JOIN pqr USING(y) WHERE x=1') 22 | 23 | dependencies = parse_select_dependencies(query) 24 | eq_(dependencies, ['abc', 'pqr']) 25 | 26 | columns = parse_select_columns(query) 27 | eq_(columns, ['x', 'y', 'z AS t']) 28 | 29 | column_name = parse_column_name(columns[0]) 30 | eq_(column_name, 'x') 31 | 32 | column_name = parse_column_name(columns[2]) 33 | eq_(column_name, 't') 34 | 35 | @staticmethod 36 | @raises(ParseException) 37 | def test_bad_input(): 38 | """Feeding malformed input into create table 39 | """ 40 | query = 'SELECT x, y, z' 41 | parse_select_dependencies(query) 42 | 43 | @staticmethod 44 | def test_columns(): 45 | """Basic test for select statement 46 | """ 47 | query = ('SELECT x' 48 | ',CASE WHEN y=10 THEN 5 ELSE z AS a' 49 | ',CASE WHEN x THEN COUNT(MIN(x,y)) ELSE MIN(x) END AS b' 50 | ',COUNT(1) AS c' 51 | ",CASE WHEN course_platform = 'spark' THEN 'v1-' " 52 | "|| topic_id::VARCHAR ELSE course_id END AS course_id " 53 | 'FROM abc') 54 | 55 | result = [ 56 | 'x', 57 | 'CASE WHEN y=10 THEN 5 ELSE z AS a', 58 | 'CASE WHEN x THEN COUNT(MIN(x,y)) ELSE MIN(x) END AS b', 59 | 'COUNT(1) AS c', 60 | "CASE WHEN course_platform = 'spark' THEN 'v1-' " + 61 | "|| topic_id::VARCHAR ELSE course_id END AS course_id" 62 | ] 63 | 64 | result_names = ['x', 'a', 'b', 'c', 'course_id'] 65 | 66 | columns = parse_select_columns(query) 67 | eq_(columns, result) 68 | 69 | column_names = [parse_column_name(c) for c in columns] 70 | eq_(column_names, result_names) 71 | 72 | @staticmethod 73 | def test_with_query(): 74 | """Basic test for select statement with the with query 75 | """ 76 | query = ('WITH data AS (SELECT x, y FROM xy) SELECT x,y FROM data') 77 | 78 | columns = parse_select_columns(query) 79 | eq_(columns, ['x', 'y']) 80 | -------------------------------------------------------------------------------- /dataduct/database/column.py: -------------------------------------------------------------------------------- 1 | """Script containing the column class object 2 | """ 3 | 4 | 5 | class Column(object): 6 | """Class representing columns in a table 7 | """ 8 | def __init__(self, column_name, column_type, encoding=None, 9 | fk_reference=None, fk_table=None, is_distkey=False, 10 | is_sortkey=False, is_primarykey=False, is_null=False, 11 | is_not_null=False, position=None): 12 | """Constructor for Column class 13 | 14 | Args: 15 | column_name(str): The name of the column 16 | column_type(str): The type of the column 17 | encoding(str): The encoding type of the column 18 | fk_reference(str): The column that this key is referring to 19 | fk_table(str): The table that this key is referring to 20 | is_distkey(bool): Whether or not this column is the DISTKEY 21 | is_sortkey(bool): Whether or not this column is a SORTKEY 22 | is_primarykey(bool): Whether or not this column is a primary key 23 | is_null(bool): Whether or not is column is defaults to null 24 | is_not_null(bool): Whether or not is column is not nullable 25 | position(int): The position 
of the column 26 | """ 27 | 28 | self.column_name = column_name 29 | self.column_type = column_type 30 | self.encoding = encoding 31 | self.fk_reference = fk_reference 32 | self.fk_table = fk_table 33 | self.is_distkey = is_distkey 34 | self.is_sortkey = is_sortkey 35 | self.is_primarykey = is_primarykey 36 | self.is_null = is_null 37 | self.is_not_null = is_not_null 38 | self.position = position 39 | 40 | if is_null and is_not_null: 41 | raise ValueError('Column cannot be both NULL and NOT NULL together') # noqa 42 | 43 | if self.is_primarykey: 44 | self.is_not_null = True 45 | self.is_null = False 46 | 47 | def __str__(self): 48 | """String output for the columns 49 | """ 50 | if self.column_type is not None: 51 | return '%s %s' % (self.column_name, self.column_type) 52 | return self.column_name 53 | 54 | @property 55 | def primary(self): 56 | """Property for the column being part of primary key 57 | """ 58 | return self.is_primarykey 59 | 60 | @primary.setter 61 | def primary(self, value=True): 62 | """Set the primary flag for the column 63 | """ 64 | self.is_primarykey = value 65 | 66 | # Force not null for primary key columns 67 | if self.is_primarykey: 68 | self.is_not_null = True 69 | self.is_null = False 70 | 71 | @property 72 | def name(self): 73 | """Get the name of the column 74 | """ 75 | return self.column_name 76 | -------------------------------------------------------------------------------- /dataduct/qa/column_check.py: -------------------------------------------------------------------------------- 1 | """QA test for comparing columns in the source system with the Warehouse 2 | """ 3 | from .check import Check 4 | from .utils import render_output 5 | 6 | 7 | class ColumnCheck(Check): 8 | """QA test for comparing columns across the ETL 9 | """ 10 | def __init__(self, source_data, destination_data, **kwargs): 11 | """Constructor for the Count based QA 12 | 13 | Args: 14 | source_data(DataFrame): Sample of source data 15 | destination_data(DataFrame): Sample of destination data 16 | """ 17 | super(ColumnCheck, self).__init__(**kwargs) 18 | self.source_data = source_data 19 | self.destination_data = destination_data 20 | self.errors = [] 21 | self.observed = 0 22 | 23 | # Identify errors 24 | for key in source_data.index: 25 | if key not in destination_data.index: 26 | continue 27 | 28 | source_value = ColumnCheck.column_value(self.source_data, key) 29 | dest_value = ColumnCheck.column_value(self.destination_data, key) 30 | 31 | if source_value != dest_value: 32 | self.errors.append((key, source_value, dest_value)) 33 | self.observed += 1 34 | 35 | @property 36 | def error_rate(self): 37 | """The error rate for the column comparisons 38 | 39 | Note: 40 | The error is only calculated for keys that exist in both dataframes. 41 | Thus, we presume that issues dealing with row counts are addressed 42 | in a separate QA test. 
43 | """ 44 | if self.observed == 0: 45 | return None 46 | 47 | return float(len(self.errors) * 100) / self.observed 48 | 49 | @staticmethod 50 | def column_value(data, key): 51 | """Fetch the value for a key in the dataframe 52 | 53 | Args: 54 | data(DataFrame): Single column dataframe 55 | key(str): Key to lookup in the dataframe 56 | 57 | Returns: 58 | value(str): Value for the key, unicode values are encoded as utf-8 59 | """ 60 | value = data.loc[key].values[0] 61 | if isinstance(value, unicode): 62 | return value.encode('utf-8') 63 | return value 64 | 65 | @property 66 | def summary(self): 67 | """Summary of the test results for the SNS message 68 | """ 69 | return render_output( 70 | [ 71 | 'Test Name: %s' % self.name, 72 | 'Success: %s' % self.success, 73 | 'Tolerance: %0.4f%%' % self.tolerance, 74 | 'Error Rate: %0.4f%%' % self.error_rate, 75 | 'Observed: %d' % self.observed, 76 | ] 77 | ) 78 | 79 | @property 80 | def results(self): 81 | """Results from the the comparison of the errors 82 | """ 83 | return render_output([str(a) for a in self.errors]) 84 | -------------------------------------------------------------------------------- /dataduct/pipeline/ec2_resource.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for ec2 resource 3 | """ 4 | 5 | from ..config import Config 6 | from .pipeline_object import PipelineObject 7 | from ..s3 import S3LogPath 8 | from .schedule import Schedule 9 | from ..utils import constants as const 10 | from ..utils.exceptions import ETLInputError 11 | 12 | config = Config() 13 | ROLE = config.etl['ROLE'] 14 | RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] 15 | 16 | INSTANCE_TYPE = config.ec2.get('INSTANCE_TYPE', const.M1_LARGE) 17 | ETL_AMI = config.ec2.get('ETL_AMI', const.NONE) 18 | SECURITY_GROUP = config.ec2.get('SECURITY_GROUP', const.NONE) 19 | SECURITY_GROUP_ID = config.ec2.get('SECURITY_GROUP_ID', const.NONE) 20 | SUBNET_ID = config.ec2.get('SUBNET_ID', const.NONE) 21 | KEY_PAIR = config.etl.get('KEY_PAIR', const.NONE) 22 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 23 | 24 | 25 | class Ec2Resource(PipelineObject): 26 | """EC2 Resource class 27 | """ 28 | 29 | def __init__(self, 30 | id, 31 | s3_log_dir=None, 32 | schedule=None, 33 | terminate_after='6 Hours', 34 | instance_type=INSTANCE_TYPE, 35 | ami=ETL_AMI, 36 | security_group=SECURITY_GROUP, 37 | security_group_id=SECURITY_GROUP_ID, 38 | subnet_id=SUBNET_ID, 39 | **kwargs): 40 | """Constructor for the Ec2Resource class 41 | 42 | Args: 43 | id(str): id of the object 44 | s3_log_dir(S3Directory): s3 directory for pipeline logs 45 | schedule(Schedule): pipeline schedule used for the machine 46 | terminate_after(str): time to terminate the ec2resource after 47 | instance_type(str): machine type to be used eg. 
m1.large 48 | ami(str): ami id for the ec2 resource 49 | retry_delay(str): time delay between step retries 50 | **kwargs(optional): Keyword arguments directly passed to base class 51 | """ 52 | 53 | # Validate inputs 54 | if not isinstance(schedule, Schedule): 55 | raise ETLInputError( 56 | 'Input schedule must be of the type Schedule') 57 | if not isinstance(s3_log_dir, S3LogPath): 58 | raise ETLInputError( 59 | 's3 log directory must be of type S3LogPath') 60 | 61 | super(Ec2Resource, self).__init__( 62 | id=id, 63 | type='Ec2Resource', 64 | terminateAfter=terminate_after, 65 | logUri=s3_log_dir, 66 | schedule=schedule, 67 | imageId=ami, 68 | instanceType=instance_type, 69 | role=ROLE, 70 | resourceRole=RESOURCE_ROLE, 71 | keyPair=KEY_PAIR, 72 | retryDelay=RETRY_DELAY, 73 | securityGroups=security_group, 74 | securityGroupIds=security_group_id, 75 | subnetId=subnet_id 76 | ) 77 | -------------------------------------------------------------------------------- /dataduct/steps/create_update_sql.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for sql command for inserting into tables 2 | """ 3 | from ..database import SqlScript 4 | from ..database import Table 5 | from ..s3 import S3File 6 | from ..utils import constants as const 7 | from ..utils.exceptions import ETLInputError 8 | from ..utils.helpers import exactly_one 9 | from ..utils.helpers import parse_path 10 | from .transform import TransformStep 11 | 12 | 13 | class CreateUpdateSqlStep(TransformStep): 14 | """Create and Insert step that creates a table and then uses the query to 15 | update the table data with any sql query provided 16 | """ 17 | 18 | def __init__(self, 19 | table_definition, 20 | script=None, 21 | command=None, 22 | analyze_table=True, 23 | script_arguments=None, 24 | non_transactional=False, 25 | **kwargs): 26 | """Constructor for the CreateUpdateStep class 27 | 28 | Args: 29 | **kwargs(optional): Keyword arguments directly passed to base class 30 | """ 31 | if not exactly_one(command, script): 32 | raise ETLInputError('Both command and script found') 33 | 34 | # Create S3File with script / command provided 35 | if script: 36 | update_script = SqlScript(filename=parse_path(script)) 37 | else: 38 | update_script = SqlScript(command) 39 | self.s3_source_dir = kwargs['s3_source_dir'] 40 | sql_script = self.create_script(S3File(text=update_script.sql())) 41 | sql_script.upload_to_s3() 42 | 43 | dest = Table(SqlScript(filename=parse_path(table_definition))) 44 | 45 | arguments = [ 46 | '--table_definition=%s' % dest.sql().sql(), 47 | '--sql=%s' % sql_script.s3_path.uri 48 | ] 49 | 50 | if analyze_table: 51 | arguments.append('--analyze') 52 | 53 | if non_transactional: 54 | arguments.append('--non_transactional') 55 | 56 | if script_arguments is not None: 57 | if not isinstance(script_arguments, list): 58 | raise ETLInputError( 59 | 'Script arguments for SQL steps should be a list') 60 | arguments.extend(script_arguments) 61 | 62 | super(CreateUpdateSqlStep, self).__init__( 63 | command=const.SQL_RUNNER_COMMAND, script_arguments=arguments, 64 | no_output=True, **kwargs) 65 | 66 | @classmethod 67 | def arguments_processor(cls, etl, input_args): 68 | """Parse the step arguments according to the ETL pipeline 69 | 70 | Args: 71 | etl(ETLPipeline): Pipeline object containing resources and steps 72 | step_args(dict): Dictionary of the step arguments for the class 73 | """ 74 | step_args = cls.base_arguments_processor(etl, input_args) 75 | cls.pop_inputs(step_args) 
76 | 77 | return step_args 78 | -------------------------------------------------------------------------------- /dataduct/config/credentials.py: -------------------------------------------------------------------------------- 1 | """Credentials utility functions for connecting to various services 2 | """ 3 | import os 4 | import requests 5 | import sys 6 | from ConfigParser import SafeConfigParser 7 | 8 | 9 | def get_aws_credentials_from_iam(): 10 | """Get aws credentials using the IAM api 11 | Note: this script only runs on an EC2 instance with the appropriate 12 | resource roles. For more information, see the following: 13 | http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/\ 14 | AESDG-chapter-instancedata.html 15 | 16 | Returns: 17 | access_key(str): AWS access key 18 | secret_key(str): AWS secret key 19 | token(str): Connection token 20 | """ 21 | url = 'http://169.254.169.254/latest/meta-data/iam/security-credentials/' 22 | 23 | # Get role name 24 | r = requests.get(url) 25 | 26 | if not r.ok: 27 | raise Exception('Request failed for url %s.' % url) 28 | 29 | # Add role name to url 30 | url += r.content 31 | 32 | # Get access keys 33 | r = requests.get(url) 34 | if not r.ok: 35 | raise Exception('Request failed for url %s.' % url) 36 | 37 | json_result = r.json() 38 | return (json_result['AccessKeyId'], 39 | json_result['SecretAccessKey'], 40 | json_result['Token']) 41 | 42 | 43 | def get_aws_credentials_from_file(filename=None): 44 | """Get the AWS credentials from boto config files 45 | 46 | Tries to load from the specified filename, if applicable, else follows what 47 | boto does by following the order specified at 48 | http://boto.cloudhackers.com/en/latest/boto_config_tut.html#details 49 | """ 50 | config = SafeConfigParser() 51 | cred_file = None 52 | if filename is not None and os.path.isfile(filename): 53 | cred_file = filename 54 | elif os.path.isfile('/etc/boto.cfg'): 55 | cred_file = '/etc/boto.cfg' 56 | elif os.path.isfile(os.path.expanduser('~/.aws/credentials')): 57 | cred_file = os.path.expanduser('~/.aws/credentials') 58 | elif os.path.isfile(os.path.expanduser('~/.boto')): 59 | cred_file = os.path.expanduser('~/.boto') 60 | else: 61 | raise Exception("Cannot find a credentials file") 62 | 63 | config.read(cred_file) 64 | aws_access_key_id = config.get('Credentials', 65 | 'aws_access_key_id') 66 | aws_secret_access_key = config.get('Credentials', 67 | 'aws_secret_access_key') 68 | return (aws_access_key_id, aws_secret_access_key, None) 69 | 70 | 71 | def get_aws_credentials(filename=None): 72 | """Get the aws credentials from IAM or files 73 | """ 74 | try: 75 | aws_key, aws_secret, token = get_aws_credentials_from_iam() 76 | except Exception, error: 77 | sys.stderr.write('Failed to get creds from IAM: %s \n' % error.message) 78 | aws_key, aws_secret, token = get_aws_credentials_from_file(filename) 79 | 80 | return aws_key, aws_secret, token 81 | -------------------------------------------------------------------------------- /dataduct/steps/executors/count_check.py: -------------------------------------------------------------------------------- 1 | """Script that compares the number of rows in the source select script with the 2 | number of rows in the destination table 3 | """ 4 | 5 | import argparse 6 | import pandas.io.sql as pdsql 7 | from dataduct.data_access import redshift_connection 8 | from dataduct.data_access import rds_connection 9 | from dataduct.qa import CountCheck 10 | 11 | 12 | def _get_source_data(sql, hostname): 13 | """Gets the DataFrame 
containing all the rows of the table 14 | The DataFrame will be indexed by the table's primary key(s) 15 | 16 | Args: 17 | sql(str): The table definition representing the table to query 18 | connection(Connection): A connection to the database 19 | 20 | Returns: 21 | DataFrame: The rows of the table 22 | """ 23 | connection = rds_connection(hostname) 24 | data = pdsql.read_sql(sql, connection) 25 | connection.close() 26 | return data.iloc[0][0] 27 | 28 | 29 | def _get_destination_data(sql): 30 | """Gets the DataFrame containing all the rows of the table 31 | The DataFrame will be indexed by the table's primary key(s) 32 | 33 | Args: 34 | sql(str): The table definition representing the table to query 35 | connection(Connection): A connection to the database 36 | 37 | Returns: 38 | DataFrame: The rows of the table 39 | """ 40 | connection = redshift_connection() 41 | data = pdsql.read_sql(sql, connection) 42 | connection.close() 43 | # All columns apart from last are PK columns 44 | return data.iloc[0][0] 45 | 46 | 47 | def count_check(): 48 | """Args (taken in through argparse): 49 | source_sql: SQL script of the source data 50 | destination_sql: SQL script of the destination data 51 | """ 52 | parser = argparse.ArgumentParser() 53 | 54 | parser.add_argument('--source_sql', dest='source_sql', required=True) 55 | parser.add_argument('--source_host', dest='source_host', required=True) 56 | parser.add_argument('--destination_sql', dest='destination_sql', 57 | required=True) 58 | parser.add_argument('--tolerance', type=float, dest='tolerance', 59 | default=1.0) 60 | parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) 61 | parser.add_argument('--test_name', dest='test_name', 62 | default='Check Count') 63 | parser.add_argument('--log_to_s3', action='store_true', default=False) 64 | parser.add_argument('--path_suffix', dest='path_suffix', default=None) 65 | 66 | args = parser.parse_args() 67 | 68 | source_count = _get_source_data(args.source_sql, args.source_host) 69 | destination_count = _get_destination_data(args.destination_sql) 70 | 71 | check = CountCheck(source_count, destination_count, 72 | name=args.test_name, 73 | sns_topic_arn=args.sns_topic_arn, 74 | tolerance=args.tolerance) 75 | 76 | check.publish(args.log_to_s3, dest_sql=args.destination_sql, 77 | path_suffix=args.path_suffix) 78 | -------------------------------------------------------------------------------- /dataduct/database/relation.py: -------------------------------------------------------------------------------- 1 | """Script containing the relation class object 2 | """ 3 | from copy import deepcopy 4 | from .sql import SqlScript 5 | from ..config import Config 6 | from ..utils.helpers import atleast_one 7 | 8 | 9 | class Relation(object): 10 | """Class representing a relation in the database 11 | """ 12 | 13 | def __str__(self): 14 | """Output for the print statement of the relation 15 | """ 16 | return self.sql_statement.sql() 17 | 18 | def sql(self): 19 | """SqlStatement for the table object 20 | """ 21 | return self.sql_statement 22 | 23 | def copy(self): 24 | """Create a copy of the relation object 25 | """ 26 | return deepcopy(self) 27 | 28 | def initialize_name(self): 29 | """Parse the full name to declare the schema and relation name 30 | """ 31 | split_name = self.full_name.split('.') 32 | if len(split_name) == 2: 33 | schema_name = split_name[0] 34 | relation_name = split_name[1] 35 | else: 36 | schema_name = None 37 | relation_name = self.full_name 38 | 39 | return schema_name, 
relation_name 40 | 41 | def _grant_sql_builder(self, permission, user=None, group=None): 42 | """Return the sql string for granting permissions 43 | """ 44 | if not atleast_one(user, group): 45 | raise ValueError('Atleast one of user / group needed') 46 | 47 | result = list() 48 | option_string = 'WITH GRANT OPTION' 49 | base = 'GRANT %s ON %s TO {user} {option}' % ( 50 | permission, self.full_name) 51 | 52 | if user is not None: 53 | result.append(base.format(user=user, option=option_string)) 54 | 55 | if group is not None: 56 | result.append(base.format(user='GROUP %s' % group, option='')) 57 | return result 58 | 59 | def grant_script(self): 60 | """Grant the permissions based on the config 61 | """ 62 | config = Config() 63 | if not hasattr(config, 'database'): 64 | return 65 | 66 | permissions = config.database.get('permissions', list()) 67 | 68 | sql = list() 69 | for permission in permissions: 70 | sql.extend(self._grant_sql_builder(**permission)) 71 | 72 | return SqlScript('; '.join(sql)) 73 | 74 | def select_script(self): 75 | """Select everything from the relation 76 | """ 77 | return SqlScript('SELECT * FROM %s' % self.full_name) 78 | 79 | def create_script(self, grant_permissions=True): 80 | """Create script for the table object 81 | """ 82 | script = SqlScript(statements=[self.sql_statement.copy()]) 83 | if grant_permissions: 84 | script.append(self.grant_script()) 85 | return script 86 | 87 | def recreate_script(self, grant_permissions=True): 88 | """Sql script to recreate the view 89 | """ 90 | script = self.drop_script() 91 | script.append(self.create_script(grant_permissions)) 92 | return script 93 | -------------------------------------------------------------------------------- /dataduct/config/tests/test_config.py: -------------------------------------------------------------------------------- 1 | """Tests that the config singleton is working properly 2 | """ 3 | from os.path import expanduser 4 | from os.path import join 5 | 6 | from unittest import TestCase 7 | from mock import patch 8 | from testfixtures import TempDirectory 9 | from nose.tools import eq_ 10 | from nose.tools import raises 11 | 12 | from ..config import get_config_files 13 | from ..config import load_yaml 14 | 15 | 16 | class TestConfig(TestCase): 17 | """Tests for config singleton 18 | """ 19 | def setUp(self): 20 | self.test_yaml_file = '\n'.join([ 21 | 'test:', 22 | ' test_sub:', 23 | ' - test_sub1: foo', 24 | ' test_sub1_other: bar', 25 | ' - test_sub2: foobar', 26 | ]) 27 | self.test_config_dict = { 28 | 'test': { 29 | 'test_sub': [ 30 | { 31 | 'test_sub1': 'foo', 32 | 'test_sub1_other': 'bar', 33 | }, 34 | { 35 | 'test_sub2': 'foobar', 36 | } 37 | ] 38 | } 39 | } 40 | 41 | @staticmethod 42 | @patch.dict('os.environ', {}, clear=True) 43 | def test_get_config_files_no_enviroment_variable(): 44 | """Tests that correct config file paths are returned when there's no 45 | enviroment variable 46 | """ 47 | expected = [ 48 | '/etc/dataduct.cfg', 49 | expanduser('~/.dataduct/dataduct.cfg'), 50 | ] 51 | result = get_config_files() 52 | eq_(result, expected) 53 | 54 | @staticmethod 55 | @patch.dict('os.environ', {'DATADUCT_CONFIG_PATH': '/test/test.cfg'}) 56 | def test_get_config_files_with_enviroment_variable(): 57 | """Tests that correct config file paths are returned when there is 58 | an enviroment variable 59 | """ 60 | expected = [ 61 | '/etc/dataduct.cfg', 62 | expanduser('~/.dataduct/dataduct.cfg'), 63 | '/test/test.cfg', 64 | ] 65 | result = get_config_files() 66 | eq_(result, expected) 67 | 68 | 
def test_load_yaml_works_correctly(self): 69 | """Tests that the yaml file can be loaded correctly 70 | """ 71 | with TempDirectory() as d: 72 | d.write('test.yaml', self.test_yaml_file) 73 | result = load_yaml([join(d.path, 'test.yaml')]) 74 | eq_(result, self.test_config_dict) 75 | 76 | @staticmethod 77 | @raises(IOError) 78 | def test_no_config_file_raises(): 79 | """Tests that an exception is raised if no yaml file path is passed in 80 | """ 81 | load_yaml([]) 82 | 83 | @staticmethod 84 | @raises(IOError) 85 | def test_cannot_find_config_file_raises(): 86 | """Tests that an exception is raised if it cannot find any yaml files 87 | """ 88 | with TempDirectory() as d: 89 | with TempDirectory() as d2: 90 | load_yaml([join(d.path, 'test.cfg'), 91 | join(d2.path, 'test.cfg')]) 92 | -------------------------------------------------------------------------------- /dataduct/steps/load_postgres.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for SQLActivity to load data into Postgres 3 | """ 4 | from ..config import Config 5 | from .etl_step import ETLStep 6 | from ..pipeline import PostgresNode 7 | from ..pipeline import PostgresDatabase 8 | from ..pipeline import PipelineObject 9 | from ..pipeline import CopyActivity 10 | 11 | config = Config() 12 | if not hasattr(config, 'postgres'): 13 | raise ETLInputError('Postgres config not specified in ETL') 14 | POSTGRES_CONFIG = config.postgres 15 | 16 | 17 | class LoadPostgresStep(ETLStep): 18 | """Load Postgres Step class that helps load data into postgres 19 | """ 20 | 21 | def __init__(self, 22 | table, 23 | postgres_database, 24 | insert_query, 25 | max_errors=None, 26 | replace_invalid_char=None, 27 | **kwargs): 28 | """Constructor for the LoadPostgresStep class 29 | 30 | Args: 31 | table(path): table name for load 32 | sql(str): sql query to be executed 33 | postgres_database(PostgresDatabase): database to excute the query 34 | output_path(str): s3 path where sql output should be saved 35 | **kwargs(optional): Keyword arguments directly passed to base class 36 | """ 37 | super(LoadPostgresStep, self).__init__(**kwargs) 38 | 39 | region = POSTGRES_CONFIG['REGION'] 40 | rds_instance_id = POSTGRES_CONFIG['RDS_INSTANCE_ID'] 41 | user = POSTGRES_CONFIG['USERNAME'] 42 | password = POSTGRES_CONFIG['PASSWORD'] 43 | database_node = self.create_pipeline_object( 44 | object_class=PostgresDatabase, 45 | region=region, 46 | rds_instance_id=rds_instance_id, 47 | username=user, 48 | password=password, 49 | ) 50 | 51 | # Create output node 52 | self._output = self.create_pipeline_object( 53 | object_class=PostgresNode, 54 | schedule=self.schedule, 55 | database=database_node, 56 | table=table, 57 | username=user, 58 | password=password, 59 | select_query=None, 60 | insert_query=insert_query, 61 | host=rds_instance_id, 62 | ) 63 | 64 | self.create_pipeline_object( 65 | object_class=CopyActivity, 66 | schedule=self.schedule, 67 | resource=self.resource, 68 | input_node=self.input, 69 | output_node=self.output, 70 | depends_on=self.depends_on, 71 | max_retries=self.max_retries, 72 | ) 73 | 74 | @classmethod 75 | def arguments_processor(cls, etl, input_args): 76 | """Parse the step arguments according to the ETL pipeline 77 | 78 | Args: 79 | etl(ETLPipeline): Pipeline object containing resources and steps 80 | step_args(dict): Dictionary of the step arguments for the class 81 | """ 82 | step_args = cls.base_arguments_processor(etl, input_args) 83 | step_args['postgres_database'] = 
etl.postgres_database 84 | 85 | return step_args 86 | -------------------------------------------------------------------------------- /dataduct/etl/tests/test_etl_actions.py: -------------------------------------------------------------------------------- 1 | """Tests for the ETL actions 2 | """ 3 | import os 4 | 5 | import unittest 6 | from testfixtures import TempDirectory 7 | from nose.tools import raises 8 | from nose.tools import eq_ 9 | 10 | from ..etl_actions import read_pipeline_definition 11 | from ..etl_actions import create_pipeline 12 | from ...utils.exceptions import ETLInputError 13 | 14 | 15 | class EtlActionsTests(unittest.TestCase): 16 | """Tests for the ETL actions 17 | """ 18 | 19 | def setUp(self): 20 | """Setup text fixtures 21 | """ 22 | self.load_hour = '01' 23 | self.load_min = '23' 24 | load_time = self.load_hour + ':' + self.load_min 25 | self.test_yaml = '\n'.join([ 26 | 'name: example_load_redshift', 27 | 'frequency: one-time', 28 | 'load_time: ' + load_time, 29 | 'max_retries: 5', 30 | 'description: Example for the load_redshift step', 31 | 'steps:', 32 | '- step_type: extract-local', 33 | ' path: data/test_table1.tsv', 34 | '- step_type: load-redshift', 35 | ' schema: dev', 36 | ' table: test_table', 37 | ]) 38 | # Definition has no description field 39 | self.test_definition = { 40 | 'name': 'example_load_redshift', 41 | 'frequency': 'one-time', 42 | 'description': 'Example for the load_redshift step', 43 | 'load_time': load_time, 44 | 'max_retries': 5, 45 | 'steps': [{ 46 | 'step_type': 'extract-local', 47 | 'path': 'data/test_table1.tsv', 48 | }, { 49 | 'step_type': 'load-redshift', 50 | 'schema': 'dev', 51 | 'table': 'test_table', 52 | }], 53 | } 54 | 55 | @staticmethod 56 | @raises(ETLInputError) 57 | def test_yaml_extension(): 58 | """Test if the yaml extension check works correctly 59 | for read_pipeline_definition 60 | """ 61 | read_pipeline_definition("name.txt") 62 | 63 | def test_read_pipeline_definition(self): 64 | """Test if the pipeline definition is parsed correctly 65 | """ 66 | with TempDirectory() as directory: 67 | directory.write('test_definition.yaml', self.test_yaml) 68 | result = read_pipeline_definition( 69 | os.path.join(directory.path, 'test_definition.yaml')) 70 | eq_(result, self.test_definition) 71 | 72 | def test_create_pipeline(self): 73 | """Test if simple pipeline creation is correct 74 | """ 75 | result = create_pipeline(self.test_definition) 76 | # Check that pipeline properties are accurate 77 | assert result.name.endswith(self.test_definition['name']) 78 | eq_(result.frequency, self.test_definition['frequency']) 79 | eq_(result.load_hour, int(self.load_hour)) 80 | eq_(result.load_min, int(self.load_min)) 81 | eq_(result.max_retries, self.test_definition['max_retries']) 82 | # Check that vital steps are created 83 | steps = result.steps 84 | assert 'ExtractLocalStep0' in steps 85 | assert 'LoadRedshiftStep0' in steps 86 | -------------------------------------------------------------------------------- /dataduct/steps/sql_command.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for SqlActivity can be executed on Ec2 3 | """ 4 | from .etl_step import ETLStep 5 | from ..pipeline import SqlActivity 6 | from ..database import SqlScript 7 | from ..s3 import S3File 8 | from ..utils.helpers import exactly_one 9 | from ..utils.helpers import parse_path 10 | from ..utils.exceptions import ETLInputError 11 | 12 | import logging 13 | logger = logging.getLogger(__name__) 
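# A hedged illustration of how this step is usually declared in a pipeline
# definition. The step_type value is inferred from the ExtractLocalStep /
# LoadRedshiftStep naming in test_etl_actions.py above and from the file name
# examples/example_sql_command.yaml, so treat it as an assumption:
#
#   steps:
#   - step_type: sql-command
#     command: |
#       DELETE FROM dev.test_table WHERE id < 0;
#
# Exactly one of command, script or sql_script may be supplied; the constructor
# below raises ETLInputError otherwise.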
14 | 15 | 16 | class SqlCommandStep(ETLStep): 17 | """SQL Command Step class that helps run scripts on resouces 18 | """ 19 | 20 | def __init__(self, 21 | redshift_database, 22 | script=None, 23 | script_arguments=None, 24 | queue=None, 25 | sql_script=None, 26 | command=None, 27 | wrap_transaction=True, 28 | **kwargs): 29 | """Constructor for the SqlCommandStep class 30 | 31 | Args: 32 | command(str): command to be executed directly 33 | script(path): local path to the script that should executed 34 | queue(str): query queue that should be used 35 | script_arguments(list of str): arguments to the SQL command 36 | redshift_database(RedshiftDatabase): database to excute the query 37 | **kwargs(optional): Keyword arguments directly passed to base class 38 | """ 39 | if not exactly_one(command, script, sql_script): 40 | raise ETLInputError('Both command and script found') 41 | 42 | if sql_script is not None and not isinstance(sql_script, SqlScript): 43 | raise ETLInputError('sql_script should be of the type SqlScript') 44 | 45 | super(SqlCommandStep, self).__init__(**kwargs) 46 | 47 | # Create S3File with script / command provided 48 | if script: 49 | sql_script = SqlScript(filename=parse_path(script)) 50 | elif command: 51 | sql_script = SqlScript(command) 52 | 53 | if wrap_transaction: 54 | sql_script = sql_script.wrap_transaction() 55 | 56 | script = self.create_script(S3File(text=sql_script.sql())) 57 | 58 | logger.debug('Sql Query:') 59 | logger.debug(sql_script) 60 | 61 | self.create_pipeline_object( 62 | object_class=SqlActivity, 63 | max_retries=self.max_retries, 64 | resource=self.resource, 65 | worker_group=self.worker_group, 66 | schedule=self.schedule, 67 | database=redshift_database, 68 | script_arguments=script_arguments, 69 | depends_on=self.depends_on, 70 | script=script, 71 | queue=queue, 72 | ) 73 | 74 | @classmethod 75 | def arguments_processor(cls, etl, input_args): 76 | """Parse the step arguments according to the ETL pipeline 77 | 78 | Args: 79 | etl(ETLPipeline): Pipeline object containing resources and steps 80 | step_args(dict): Dictionary of the step arguments for the class 81 | """ 82 | input_args = cls.pop_inputs(input_args) 83 | step_args = cls.base_arguments_processor(etl, input_args) 84 | step_args['redshift_database'] = etl.redshift_database 85 | 86 | return step_args 87 | -------------------------------------------------------------------------------- /dataduct/steps/load_redshift.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for RedshiftCopyActivity to load data into Redshift 3 | """ 4 | from .etl_step import ETLStep 5 | from ..pipeline import RedshiftNode 6 | from ..pipeline import RedshiftCopyActivity 7 | 8 | 9 | class LoadRedshiftStep(ETLStep): 10 | """Load Redshift Step class that helps load data into redshift 11 | """ 12 | 13 | def __init__(self, 14 | schema, 15 | table, 16 | redshift_database, 17 | insert_mode="TRUNCATE", 18 | max_errors=None, 19 | replace_invalid_char=None, 20 | compression=None, 21 | **kwargs): 22 | """Constructor for the LoadRedshiftStep class 23 | 24 | Args: 25 | schema(str): schema from which table should be extracted 26 | table(path): table name for extract 27 | insert_mode(str): insert mode for redshift copy activity 28 | redshift_database(RedshiftDatabase): database to excute the query 29 | max_errors(int): Maximum number of errors to be ignored during load 30 | replace_invalid_char(char): char to replace not utf-8 with 31 | **kwargs(optional): Keyword arguments 
directly passed to base class 32 | """ 33 | super(LoadRedshiftStep, self).__init__(**kwargs) 34 | 35 | # Create output node 36 | self._output = self.create_pipeline_object( 37 | object_class=RedshiftNode, 38 | schedule=self.schedule, 39 | redshift_database=redshift_database, 40 | schema_name=schema, 41 | table_name=table, 42 | ) 43 | 44 | command_options = ["DELIMITER '\t' ESCAPE TRUNCATECOLUMNS"] 45 | command_options.append("NULL AS 'NULL' ") 46 | 47 | if compression == "gzip": 48 | command_options.append("GZIP") 49 | elif compression == "bzip2": 50 | command_options.append("BZIP2") 51 | elif compression == "lzo": 52 | command_options.append("lzop") 53 | if max_errors: 54 | command_options.append('MAXERROR %d' % int(max_errors)) 55 | if replace_invalid_char: 56 | command_options.append( 57 | "ACCEPTINVCHARS AS '%s'" %replace_invalid_char) 58 | 59 | self.create_pipeline_object( 60 | object_class=RedshiftCopyActivity, 61 | max_retries=self.max_retries, 62 | input_node=self.input, 63 | output_node=self.output, 64 | insert_mode=insert_mode, 65 | resource=self.resource, 66 | worker_group=self.worker_group, 67 | schedule=self.schedule, 68 | depends_on=self.depends_on, 69 | command_options=command_options, 70 | ) 71 | 72 | @classmethod 73 | def arguments_processor(cls, etl, input_args): 74 | """Parse the step arguments according to the ETL pipeline 75 | 76 | Args: 77 | etl(ETLPipeline): Pipeline object containing resources and steps 78 | step_args(dict): Dictionary of the step arguments for the class 79 | """ 80 | step_args = cls.base_arguments_processor(etl, input_args) 81 | step_args['redshift_database'] = etl.redshift_database 82 | 83 | return step_args 84 | -------------------------------------------------------------------------------- /dataduct/pipeline/shell_command_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for ShellCommandActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from ..config import Config 7 | from .schedule import Schedule 8 | from ..utils import constants as const 9 | from ..utils.exceptions import ETLInputError 10 | 11 | config = Config() 12 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 13 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 14 | 15 | 16 | class ShellCommandActivity(Activity): 17 | """ShellCommandActivity class 18 | """ 19 | 20 | def __init__(self, 21 | id, 22 | input_node, 23 | output_node, 24 | schedule, 25 | resource=None, 26 | worker_group=None, 27 | script_uri=None, 28 | script_arguments=None, 29 | command=None, 30 | max_retries=None, 31 | depends_on=None, 32 | additional_s3_files=None): 33 | """Constructor for the ShellCommandActivity class 34 | 35 | Args: 36 | id(str): id of the object 37 | input_node(S3Node / list of S3Nodes): input nodes for the activity 38 | output_node(S3Node / list of S3Nodes): output nodes for activity 39 | schedule(Schedule): schedule of the pipeline 40 | resource(Ec2Resource / EMRResource): resource to run the activity on 41 | worker_group(str): the worker group to run the activity on 42 | script_uri(S3File): s3 uri of the script 43 | script_arguments(list of str): command line arguments to the script 44 | command(str): command to be run as shell activity 45 | max_retries(int): number of retries for the activity 46 | depends_on(list of activities): dependendent pipelines steps 47 | additional_s3_files(list of s3File): additional files for activity 48 | """ 49 | 50 | # Validate inputs 51 | if not 
isinstance(schedule, Schedule): 52 | raise ETLInputError( 53 | 'Input schedule must be of the type Schedule') 54 | 55 | if command is not None and script_uri is not None: 56 | raise ETLInputError('command and script both can not be provided') 57 | 58 | # Set default values 59 | if depends_on is None: 60 | depends_on = [] 61 | if max_retries is None: 62 | max_retries = MAX_RETRIES 63 | # Set stage to true if we use either input or output node 64 | stage = 'true' if input_node or output_node else 'false' 65 | 66 | super(ShellCommandActivity, self).__init__( 67 | id=id, 68 | retryDelay=RETRY_DELAY, 69 | type='ShellCommandActivity', 70 | maximumRetries=max_retries, 71 | dependsOn=depends_on, 72 | stage=stage, 73 | input=input_node, 74 | output=output_node, 75 | runsOn=resource, 76 | workerGroup=worker_group, 77 | schedule=schedule, 78 | scriptUri=script_uri, 79 | scriptArgument=script_arguments, 80 | command=command 81 | ) 82 | 83 | # Add the additional s3 files 84 | self.add_additional_files(additional_s3_files) 85 | -------------------------------------------------------------------------------- /dataduct/pipeline/redshift_copy_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for RedshiftCopyActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from ..config import Config 7 | from .redshift_node import RedshiftNode 8 | from .schedule import Schedule 9 | from ..utils import constants as const 10 | from ..utils.exceptions import ETLInputError 11 | 12 | config = Config() 13 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 14 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 15 | 16 | 17 | class RedshiftCopyActivity(Activity): 18 | """EMR Activity class 19 | """ 20 | 21 | def __init__(self, 22 | id, 23 | schedule, 24 | input_node, 25 | output_node, 26 | insert_mode, 27 | resource=None, 28 | worker_group=None, 29 | command_options=None, 30 | max_retries=None, 31 | depends_on=None): 32 | """Constructor for the RedshiftCopyActivity class 33 | 34 | Args: 35 | id(str): id of the object 36 | schedule(Schedule): schedule of the pipeline 37 | input_node(S3Node / RedshiftNode): input data node 38 | output_node(S3Node / RedshiftNode): output data node 39 | resource(Ec2Resource / EMRResource): resource to run the activity on 40 | worker_group(str): the worker group to run the activity on 41 | command_options(list of str): command options for the activity 42 | max_retries(int): number of retries for the activity 43 | depends_on(list of activities): dependendent pipelines steps 44 | """ 45 | 46 | # Validate inputs 47 | if not isinstance(schedule, Schedule): 48 | raise ETLInputError( 49 | 'Input schedule must be of the type Schedule') 50 | 51 | # Set default values 52 | if depends_on is None: 53 | depends_on = [] 54 | if max_retries is None: 55 | max_retries = MAX_RETRIES 56 | 57 | kwargs = { 58 | 'id': id, 59 | 'retryDelay': RETRY_DELAY, 60 | 'type': 'RedshiftCopyActivity', 61 | 'maximumRetries': max_retries, 62 | 'input': input_node, 63 | 'output': output_node, 64 | 'runsOn': resource, 65 | 'workerGroup': worker_group, 66 | 'insertMode': insert_mode, 67 | 'schedule': schedule, 68 | 'dependsOn': depends_on, 69 | 'commandOptions': command_options 70 | } 71 | 72 | if isinstance(input_node, RedshiftNode): 73 | # AWS BUG: AWS expects fully qualified name when extracting from 74 | # Redshift, but not when loading into redshift. 
Here, we enforce 75 | # a convention of providing schemaName and tableName separately. 76 | assert "." not in input_node['tableName'], \ 77 | "Using convention that table name is not fully qualified. " + \ 78 | "Provide the schema name separately from the table name." 79 | table_name = input_node['tableName'] 80 | del input_node['tableName'] 81 | input_node['tableName'] = "%s.%s" % (input_node['schemaName'], 82 | table_name) 83 | super(RedshiftCopyActivity, self).__init__(**kwargs) 84 | -------------------------------------------------------------------------------- /dataduct/database/parsers/utils.py: -------------------------------------------------------------------------------- 1 | """SQL parser utils and constants 2 | """ 3 | 4 | from pyparsing import CaselessKeyword 5 | from pyparsing import Combine 6 | from pyparsing import Forward 7 | from pyparsing import OneOrMore 8 | from pyparsing import Word 9 | from pyparsing import ZeroOrMore 10 | from pyparsing import alphanums 11 | from pyparsing import nums 12 | 13 | # Intermediate parsers 14 | _varchar_names = (CaselessKeyword('VARCHAR') | CaselessKeyword('TEXT')) 15 | _varchar_names |= CaselessKeyword('NVARCHAR') 16 | 17 | # Data types 18 | _smallint = (CaselessKeyword('SMALLINT') | CaselessKeyword('INT2')) 19 | _integer = CaselessKeyword('INTEGER') 20 | _integer |= CaselessKeyword('INT') | CaselessKeyword('INT4') 21 | _bigint = (CaselessKeyword('BIGINT') | CaselessKeyword('INT8')) 22 | _decimal = Combine((CaselessKeyword('DECIMAL') | CaselessKeyword('NUMERIC')) + '(' + Word(nums + ' ,') + ')') # noqa 23 | _real = (CaselessKeyword('REAL') | CaselessKeyword('FLOAT4')) 24 | _double = (CaselessKeyword('DOUBLE PRECISION') | CaselessKeyword('FLOAT') | CaselessKeyword('FLOAT8') | CaselessKeyword('DOUBLE')) # noqa 25 | _boolean = CaselessKeyword('BOOLEAN') 26 | _char = (CaselessKeyword('CHAR') | CaselessKeyword('CHARACTER')) 27 | _char |= (CaselessKeyword('NCHAR') | CaselessKeyword('BPCHAR')) 28 | _varchar = Combine(_varchar_names + '(' + Word(alphanums) + ')') 29 | _date = CaselessKeyword('DATE') 30 | _text = CaselessKeyword('TEXT') 31 | _timestamp = CaselessKeyword('TIMESTAMP') 32 | 33 | # Create SQL keywords 34 | _create = CaselessKeyword('CREATE') 35 | _table = CaselessKeyword('TABLE') 36 | _view = CaselessKeyword('VIEW') 37 | _temp = CaselessKeyword('TEMP') 38 | _temporary = CaselessKeyword('TEMPORARY') 39 | _if_not_exists = CaselessKeyword('IF NOT EXISTS') 40 | _or_replace = CaselessKeyword('OR REPLACE') 41 | _primary_key = CaselessKeyword('PRIMARY KEY') 42 | _foreign_key = CaselessKeyword('FOREIGN KEY') 43 | _references = CaselessKeyword('REFERENCES') 44 | _unique = CaselessKeyword('UNIQUE') 45 | _null = CaselessKeyword('NULL') 46 | _not_null = CaselessKeyword('NOT NULL') 47 | _distkey = CaselessKeyword('DISTKEY') 48 | _diststyle = CaselessKeyword('DISTSTYLE') 49 | _sortkey = CaselessKeyword('SORTKEY') 50 | _encode = CaselessKeyword('ENCODE') 51 | _all = CaselessKeyword('ALL') 52 | _even = CaselessKeyword('EVEN') 53 | _key = CaselessKeyword('KEY') 54 | 55 | # Select SQL Keywords 56 | _select = CaselessKeyword('SELECT') 57 | _with = CaselessKeyword('WITH') 58 | _from = CaselessKeyword('FROM') 59 | _as = CaselessKeyword('AS') 60 | _join = CaselessKeyword('JOIN') 61 | 62 | # Parsers 63 | _db_name = Word(alphanums+"_-.`") 64 | pk_check = (_primary_key | _unique) 65 | 66 | # Column types 67 | column_types = _smallint | _integer | _bigint | _decimal | _real | _double 68 | column_types |= _boolean | _char | _varchar | _date | _timestamp | 
_text 69 | 70 | # Define a field parser for create table fields or select query fields 71 | field_parser = Forward() 72 | subquery = Forward() 73 | 74 | # List of characters allowed in the query statements 75 | special_character = "\\_-. @*`>`__. The structure of a 14 | Dataduct YAML file can be broken down into 3 parts: 15 | 16 | - Header information 17 | - Description 18 | - Pipeline steps 19 | 20 | Example: 21 | 22 | .. code:: yaml 23 | 24 | # HEADER INFORMATION 25 | name : example_emr_streaming 26 | frequency : one-time 27 | load_time: 01:00 # Hour:Min in UTC 28 | topic_arn: 'arn:aws:sns:example_arn' 29 | emr_cluster_config: 30 | num_instances: 1 31 | instance_size: m1.xlarge 32 | bootstrap: 33 | string: "s3://elasticmapreduce/bootstrap-actions/configure-hadoop,--yarn-key-value, yarn.scheduler.maximum-allocation-mb=9500" 34 | 35 | # DESCRIPTION 36 | description : Example for the emr_streaming step 37 | 38 | # PIPELINE STEPS 39 | steps: 40 | - step_type: extract-local 41 | path: data/word_data.txt 42 | 43 | - step_type: emr-streaming 44 | mapper: scripts/word_mapper.py 45 | reducer: scripts/word_reducer.py 46 | 47 | - step_type: transform 48 | script: scripts/s3_profiler.py 49 | script_arguments: 50 | - --input=INPUT1_STAGING_DIR 51 | - --output=OUTPUT1_STAGING_DIR 52 | - -f 53 | 54 | 55 | Header Information 56 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 57 | 58 | The header includes configuration information for Data Pipeline and the 59 | Elastic MapReduce resource. 60 | 61 | The name field sets the overall pipeline name: 62 | 63 | .. code:: yaml 64 | 65 | name : example_emr_streaming 66 | 67 | The frequency represents how often the pipeline is run on a schedule 68 | basis. Currently supported intervals are *hourly, daily, one-time*: 69 | 70 | .. code:: yaml 71 | 72 | frequency : one-time 73 | 74 | The load time is what time of day (in UTC) the pipeline is scheduled to 75 | run. It is in the format of HH:MM so 01:00 would set the pipeline to run 76 | at 1AM UTC: 77 | 78 | .. code:: yaml 79 | 80 | load_time: 01:00 # Hour:Min in UTC 81 | 82 | In your config file, you have the option of specifying a default Amazon 83 | Resource Name that will be messaged if the pipeline fails, if you would wish to 84 | override this default ARN, you may use the topic_arn property: 85 | 86 | .. code:: yaml 87 | 88 | topic_arn: 'arn:aws:sns:example_arn' 89 | 90 | If the pipeline includes an EMR-streaming step, the EMR instance can be 91 | configured. For example, you can configure the bootstrap, number of core 92 | instances, and instance types: 93 | 94 | .. code:: yaml 95 | 96 | emr_cluster_config: 97 | num_instances: 1 98 | instance_size: m1.xlarge 99 | bootstrap: 100 | string: "s3://elasticmapreduce/bootstrap-actions/configure-hadoop,--yarn-key-value, yarn.scheduler.maximum-allocation-mb=9500" 101 | 102 | *Note: Arguments in the bootstrap step are delimited by commas, not spaces.* 103 | 104 | Description 105 | ^^^^^^^^^^^ 106 | 107 | The description allows the creator of the YAML file to clearly explain 108 | the purpose of the pipeline. 
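Once a definition like the one above is written, dataduct parses it into a
pipeline object. A minimal sketch of that flow, based on the helpers exercised
in ``dataduct/etl/tests/test_etl_actions.py`` (the module path and the presence
of a valid dataduct config are assumptions here):

.. code:: python

    from dataduct.etl.etl_actions import read_pipeline_definition
    from dataduct.etl.etl_actions import create_pipeline

    # The path must point at a .yaml file; other extensions raise ETLInputError.
    definition = read_pipeline_definition('example_emr_streaming.yaml')

    # Returns an ETL pipeline whose .steps dict is keyed by step names such as
    # 'ExtractLocalStep0', as the test shown earlier demonstrates.
    pipeline = create_pipeline(definition)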
109 | -------------------------------------------------------------------------------- /dataduct/steps/pipeline_dependencies.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step for pipeline dependencies using transform step 3 | """ 4 | from ..config import Config 5 | from ..utils import constants as const 6 | from .transform import TransformStep 7 | 8 | config = Config() 9 | NAME_PREFIX = config.etl.get('NAME_PREFIX', '') 10 | DEPENDENCY_OVERRIDE = config.etl.get('DEPENDENCY_OVERRIDE', False) 11 | SNS_TOPIC_ARN = config.etl.get('SNS_TOPIC_ARN_FAILURE', None) 12 | 13 | 14 | class PipelineDependenciesStep(TransformStep): 15 | """PipelineDependencies Step class that helps wait for other pipelines 16 | to finish 17 | """ 18 | 19 | def __init__(self, 20 | id, 21 | pipeline_name, 22 | dependent_pipelines=None, 23 | dependent_pipelines_ok_to_fail=None, 24 | refresh_rate=300, 25 | start_date=None, 26 | script_arguments=None, 27 | **kwargs): 28 | """Constructor for the QATransformStep class 29 | 30 | Args: 31 | sns_arn(str): sns topic arn for QA steps 32 | script_arguments(list of str): list of arguments to the script 33 | **kwargs(optional): Keyword arguments directly passed to base class 34 | """ 35 | 36 | if script_arguments is None: 37 | script_arguments = list() 38 | 39 | if (dependent_pipelines is None and 40 | dependent_pipelines_ok_to_fail is None): 41 | raise ValueError('Must have some dependencies for dependency step') 42 | 43 | prefix_func = lambda p: p if not NAME_PREFIX else NAME_PREFIX + '_' + p 44 | argument_func = lambda x: [prefix_func(p) for p in x] 45 | 46 | if DEPENDENCY_OVERRIDE: 47 | command = 'ls' 48 | script_arguments = None 49 | else: 50 | command = const.DEPENDENCY_COMMAND 51 | if start_date is None: 52 | start_date = "#{format(@scheduledStartTime,'YYYY-MM-dd')}" 53 | 54 | script_arguments.extend( 55 | [ 56 | '--pipeline_name=%s' % pipeline_name, 57 | '--start_date=%s' % start_date, 58 | '--refresh_rate=%s' % str(refresh_rate), 59 | '--sns_topic_arn=%s' % SNS_TOPIC_ARN, 60 | ] 61 | ) 62 | 63 | if dependent_pipelines: 64 | script_arguments.append('--dependencies') 65 | script_arguments.extend(argument_func(dependent_pipelines)) 66 | 67 | if dependent_pipelines_ok_to_fail: 68 | script_arguments.append('--dependencies_ok_to_fail') 69 | script_arguments.extend( 70 | argument_func(dependent_pipelines_ok_to_fail)) 71 | 72 | super(PipelineDependenciesStep, self).__init__( 73 | id=id, 74 | command=command, 75 | script_arguments=script_arguments, 76 | no_output=True, 77 | **kwargs) 78 | 79 | self._output = None 80 | 81 | @classmethod 82 | def arguments_processor(cls, etl, input_args): 83 | """Parse the step arguments according to the ETL pipeline 84 | 85 | Args: 86 | etl(ETLPipeline): Pipeline object containing resources and steps 87 | step_args(dict): Dictionary of the step arguments for the class 88 | """ 89 | input_args = cls.pop_inputs(input_args) 90 | step_args = cls.base_arguments_processor(etl, input_args) 91 | step_args['pipeline_name'] = etl.name 92 | 93 | return step_args 94 | --------------------------------------------------------------------------------
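As a quick illustration of the step above: with NAME_PREFIX set to 'prod',
DEPENDENCY_OVERRIDE left unset, and dependent_pipelines=['upstream_etl'], the
constructor assembles script arguments along these lines (the concrete values
are hypothetical; the start_date default is the scheduled-start-time expression
from the code above):

    [
        '--pipeline_name=my_pipeline',
        "--start_date=#{format(@scheduledStartTime,'YYYY-MM-dd')}",
        '--refresh_rate=300',
        '--sns_topic_arn=arn:aws:sns:example_arn',
        '--dependencies',
        'prod_upstream_etl',   # NAME_PREFIX is prepended to each dependency name
    ]

If DEPENDENCY_OVERRIDE is set in the config, the step instead runs a no-op 'ls'
command and drops the script arguments entirely.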