├── dataduct ├── utils │ ├── __init__.py │ ├── tests │ │ └── __init__.py │ ├── exceptions.py │ ├── decorators.py │ ├── constants.py │ └── hook.py ├── config │ ├── tests │ │ ├── __init__.py │ │ ├── test_config_actions.py │ │ ├── test_credentials.py │ │ └── test_config.py │ ├── __init__.py │ ├── constants.py │ ├── example_config │ ├── config_actions.py │ ├── logger_config.py │ └── credentials.py ├── etl │ ├── tests │ │ ├── __init__.py │ │ ├── test_etl_pipeline.py │ │ └── test_etl_actions.py │ ├── __init__.py │ └── utils.py ├── tests │ ├── __init__.py │ └── test_import.py ├── data_access │ ├── tests │ │ ├── __init__.py │ │ └── test_connection.py │ ├── __init__.py │ └── open_shell.py ├── database │ ├── tests │ │ ├── __init__.py │ │ ├── helpers.py │ │ └── test_table.py │ ├── parsers │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_create_view.py │ │ │ ├── test_create_table.py │ │ │ └── test_select_query.py │ │ ├── __init__.py │ │ ├── helpers.py │ │ ├── create_view.py │ │ ├── utils.py │ │ └── transform.py │ ├── sql │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_sql_utils.py │ │ │ └── test_sql_statement.py │ │ ├── __init__.py │ │ ├── transaction.py │ │ ├── utils.py │ │ └── sql_statement.py │ ├── __init__.py │ ├── select_statement.py │ ├── view.py │ ├── column.py │ └── relation.py ├── steps │ ├── executors │ │ ├── __init__.py │ │ ├── primary_key_check.py │ │ └── count_check.py │ ├── __init__.py │ ├── reload.py │ ├── delta_load.py │ ├── extract_local.py │ ├── primary_key_check.py │ ├── emr_job.py │ ├── extract_s3.py │ ├── upsert.py │ ├── qa_transform.py │ ├── create_load_redshift.py │ ├── extract_redshift.py │ ├── create_update_sql.py │ ├── load_postgres.py │ ├── sql_command.py │ ├── load_redshift.py │ └── pipeline_dependencies.py ├── __init__.py ├── s3 │ ├── __init__.py │ ├── s3_log_path.py │ └── s3_directory.py ├── qa │ ├── __init__.py │ ├── utils.py │ ├── primary_key_check.py │ ├── count_check.py │ └── column_check.py └── pipeline │ ├── __init__.py │ ├── precondition.py │ ├── postgres_node.py │ ├── default_object.py │ ├── postgres_database.py │ ├── redshift_node.py │ ├── sns_alarm.py │ ├── mysql_node.py │ ├── redshift_database.py │ ├── copy_activity.py │ ├── emr_activity.py │ ├── activity.py │ ├── sql_activity.py │ ├── ec2_resource.py │ ├── shell_command_activity.py │ └── redshift_copy_activity.py ├── examples ├── README.md ├── resources │ ├── data │ │ ├── test_table2.tsv │ │ └── test_table1.tsv │ ├── tables │ │ ├── dev.test_table.sql │ │ ├── dev.test_table_2.sql │ │ ├── shippers.sql │ │ ├── categories.sql │ │ ├── employees.sql │ │ ├── order_details.sql │ │ ├── orders.sql │ │ ├── customers.sql │ │ ├── products.sql │ │ └── suppliers.sql │ └── scripts │ │ ├── word_mapper.py │ │ └── word_reducer.py ├── example_failed_pipeline.yaml ├── example_sql_command.yaml ├── example_extract_redshift.yaml ├── example_extract_s3.yaml ├── example_extract_local.yaml ├── example_custom_extract_local.yaml ├── example_extract_postgres.yaml ├── example_load_redshift.yaml ├── example_create_and_load_redshift.yaml ├── example_load_postgres.yaml ├── example_count_check.yaml ├── example_primary_key_check.yaml ├── example_load_reload_pk.yaml ├── example_reload.yaml ├── example_upsert.yaml ├── example_column_check.yaml ├── example_pipeline_dependency.yaml ├── example_create_update_sql.yaml ├── example_extract_rds.yaml ├── example_bootstrap.yaml ├── example_emr_streaming.yaml ├── example_double_input.yaml ├── steps │ └── custom_extract_local.py ├── example_transform.yaml └── example_double_output.yaml ├── 
read_the_docs.txt ├── docs ├── modules.rst ├── README.md ├── dataduct.rst ├── dataduct.tests.rst ├── dataduct.data_access.rst ├── dataduct.config.tests.rst ├── dataduct.etl.tests.rst ├── dataduct.database.tests.rst ├── dataduct.etl.rst ├── dataduct.database.sql.tests.rst ├── dataduct.s3.rst ├── dataduct.qa.rst ├── dataduct.utils.rst ├── dataduct.database.sql.rst ├── dataduct.config.rst ├── dataduct.database.parsers.tests.rst ├── index.rst ├── dataduct.database.parsers.rst ├── introduction.rst ├── dataduct.database.rst ├── installation.rst ├── hooks.rst └── creating_an_etl.rst ├── MANIFEST.in ├── bin └── README.md ├── requirements.txt ├── .gitignore ├── LICENSE.md ├── .travis.yml ├── README.rst ├── MANIFEST ├── setup.py ├── CONTRIBUTING.md └── CHANGES.md /dataduct/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/config/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/etl/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dataduct/utils/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/data_access/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/database/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/steps/executors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | #### Examples 2 | -------------------------------------------------------------------------------- /read_the_docs.txt: -------------------------------------------------------------------------------- 1 | Sphinx>=1.3.1 2 | -------------------------------------------------------------------------------- /dataduct/database/parsers/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataduct/database/sql/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/resources/data/test_table2.tsv: -------------------------------------------------------------------------------- 1 | 2 this is another row (with ID=2) 2 | -------------------------------------------------------------------------------- /examples/resources/data/test_table1.tsv: 
-------------------------------------------------------------------------------- 1 | 1 thisis a roooow 2 | 3 3 | 4 NULL 4 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | dataduct 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | dataduct 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.rst 4 | include *.py 5 | recursive-include bin * 6 | -------------------------------------------------------------------------------- /dataduct/database/sql/__init__.py: -------------------------------------------------------------------------------- 1 | from .sql_statement import SqlStatement 2 | from .sql_script import SqlScript 3 | -------------------------------------------------------------------------------- /bin/README.md: -------------------------------------------------------------------------------- 1 | #### Bin 2 | 3 | Folder contains scripts to be added to the path variable of the user for command line access. 4 | -------------------------------------------------------------------------------- /dataduct/__init__.py: -------------------------------------------------------------------------------- 1 | """Welcome to DataDuct 2 | """ 3 | __version__ = '0.5.0' 4 | __import__('pkg_resources').declare_namespace(__name__) 5 | -------------------------------------------------------------------------------- /examples/resources/tables/dev.test_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE dev.test_table( 2 | id INTEGER PRIMARY KEY, 3 | description VARCHAR(255) 4 | ); 5 | -------------------------------------------------------------------------------- /examples/resources/tables/dev.test_table_2.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE dev.test_table_2( 2 | id INTEGER PRIMARY KEY, 3 | description VARCHAR(255) 4 | ); 5 | -------------------------------------------------------------------------------- /dataduct/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import Config 2 | from .logger_config import logger_configuration 3 | from .credentials import get_aws_credentials 4 | -------------------------------------------------------------------------------- /dataduct/s3/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3_file import S3File 2 | from .s3_path import S3Path 3 | from .s3_directory import S3Directory 4 | from .s3_log_path import S3LogPath 5 | -------------------------------------------------------------------------------- /dataduct/qa/__init__.py: -------------------------------------------------------------------------------- 1 | from .check import Check 2 | from .count_check import CountCheck 3 | from .column_check import ColumnCheck 4 | from .primary_key_check import PrimaryKeyCheck 5 | -------------------------------------------------------------------------------- /examples/resources/tables/shippers.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE shippers ( 2 | shipper_id INTEGER DISTKEY PRIMARY KEY 3 | ,shipper_name VARCHAR(200) 4 | ,phone VARCHAR(20) 5 | ) 
SORTKEY(shipper_id); 6 | -------------------------------------------------------------------------------- /dataduct/config/constants.py: -------------------------------------------------------------------------------- 1 | """Constants shared across the config package 2 | """ 3 | 4 | CONFIG_STR = 'config' 5 | CONFIG_DIR = '.dataduct' 6 | CFG_FILE = 'dataduct.cfg' 7 | LOG_FILE = 'dataduct.log' 8 | -------------------------------------------------------------------------------- /dataduct/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | """Exceptions for dataduct 2 | """ 3 | 4 | class ETLInputError(Exception): pass 5 | 6 | class ETLConfigError(Exception): pass 7 | 8 | class DatabaseInputError(Exception): pass 9 | -------------------------------------------------------------------------------- /examples/resources/tables/categories.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE categories ( 2 | category_id INTEGER DISTKEY PRIMARY KEY 3 | ,category_name VARCHAR(100) 4 | ,description VARCHAR(2000) 5 | ) SORTKEY(category_id); 6 | -------------------------------------------------------------------------------- /examples/example_failed_pipeline.yaml: -------------------------------------------------------------------------------- 1 | name: example_failed_pipeline 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | steps: 6 | - step_type: transform 7 | name: failure_step 8 | command: this is going to fail 9 | -------------------------------------------------------------------------------- /examples/resources/tables/employees.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE employees ( 2 | employee_id INTEGER DISTKEY PRIMARY KEY 3 | ,last_name VARCHAR(100) 4 | ,first_name VARCHAR(100) 5 | ,birth_date DATE 6 | ,notes VARCHAR(2000) 7 | ) SORTKEY(employee_id); 8 | -------------------------------------------------------------------------------- /dataduct/etl/__init__.py: -------------------------------------------------------------------------------- 1 | from .etl_actions import activate_pipeline 2 | from .etl_actions import create_pipeline 3 | from .etl_actions import read_pipeline_definition 4 | from .etl_actions import validate_pipeline 5 | from .etl_actions import visualize_pipeline 6 | -------------------------------------------------------------------------------- /dataduct/qa/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared utility functions 3 | """ 4 | 5 | def render_output(data): 6 | """Print the formatted output for the list 7 | """ 8 | output = ['[Dataduct]: '] 9 | output.extend(data) 10 | return '\n'.join(output) 11 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | #### Documentation 2 | 3 | This is the base directory for all the docs. We use Sphinx and the sphinx 4 | napoleon extension for autogenerating docs for any library code. 5 | 6 | Running `make html` in the docs directory should create all the docs for you. 
7 | -------------------------------------------------------------------------------- /examples/example_sql_command.yaml: -------------------------------------------------------------------------------- 1 | name: example_sql_command 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the sql_command step 6 | 7 | steps: 8 | - step_type: sql-command 9 | command: SELECT * FROM dev.test_table; 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto>=2.34.0 2 | Sphinx>=1.2.3 3 | sphinx-rtd-theme>=0.1.6 4 | pandas>=0.14.1 5 | psycopg2==2.6.0 6 | MySQL-python 7 | PyYAML 8 | coverage 9 | pyparsing==1.5.6 10 | pygraphviz 11 | testfixtures>=4.1.1 12 | mock 13 | pytimeparse 14 | pyprind 15 | requests 16 | -------------------------------------------------------------------------------- /examples/resources/tables/order_details.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE order_details ( 2 | order_detail_id INTEGER DISTKEY PRIMARY KEY 3 | ,order_id INTEGER REFERENCES orders(order_id) 4 | ,product_id INTEGER REFERENCES products(product_id) 5 | ,quantity INTEGER 6 | ) SORTKEY(order_detail_id); 7 | -------------------------------------------------------------------------------- /dataduct/data_access/__init__.py: -------------------------------------------------------------------------------- 1 | from .connection import get_sql_config 2 | from .connection import rds_connection 3 | from .connection import get_redshift_config 4 | from .connection import redshift_connection 5 | from .connection import get_postgres_config 6 | from .connection import postgres_connection 7 | -------------------------------------------------------------------------------- /dataduct/database/__init__.py: -------------------------------------------------------------------------------- 1 | from .database import Database 2 | from .select_statement import SelectStatement 3 | from .sql import SqlScript 4 | from .sql import SqlStatement 5 | from .table import Table 6 | from .view import View 7 | from .history_table import HistoryTable 8 | from .column import Column 9 | -------------------------------------------------------------------------------- /examples/example_extract_redshift.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_redshift 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: This example extracts data out of redshift 6 | 7 | steps: 8 | - step_type: extract-redshift 9 | schema: dev 10 | table: categories 11 | -------------------------------------------------------------------------------- /examples/example_extract_s3.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_s3 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: This example creates an S3Node given a S3 Uri 6 | 7 | steps: 8 | - step_type: extract-s3 9 | file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py 10 | -------------------------------------------------------------------------------- /examples/example_extract_local.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_local 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: | 6 | 
This example uploads a local file to S3 with the extract-local step. 7 | 8 | steps: 9 | - step_type: extract-local 10 | path: data/test_table1.tsv 11 | -------------------------------------------------------------------------------- /examples/resources/tables/orders.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE orders ( 2 | order_id INTEGER DISTKEY PRIMARY KEY 3 | ,customer_id INTEGER REFERENCES customers(customer_id) 4 | ,employee_id INTEGER REFERENCES employees(employee_id) 5 | ,order_date DATE 6 | ,shipper_id INTEGER REFERENCES shippers(shipper_id) 7 | ) SORTKEY(order_id); 8 | -------------------------------------------------------------------------------- /examples/example_custom_extract_local.yaml: -------------------------------------------------------------------------------- 1 | name: example_custom_extract_local 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: | 6 | This example uploads a local file to S3 with the extract-local step. 7 | 8 | steps: 9 | - step_type: custom-extract-local 10 | path: data/test_table1.tsv 11 | -------------------------------------------------------------------------------- /examples/resources/tables/customers.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE customers ( 2 | customer_id INTEGER DISTKEY PRIMARY KEY 3 | ,customer_name VARCHAR(200) 4 | ,contact_name VARCHAR(200) 5 | ,address VARCHAR(200) 6 | ,city VARCHAR(100) 7 | ,postal_code VARCHAR(10) 8 | ,country VARCHAR(100) 9 | ) SORTKEY(customer_id); 10 | -------------------------------------------------------------------------------- /examples/resources/tables/products.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE products ( 2 | product_id INTEGER DISTKEY PRIMARY KEY 3 | ,product_name VARCHAR(200) 4 | ,supplier_id INTEGER REFERENCES suppliers(supplier_id) 5 | ,category_id INTEGER REFERENCES categories(category_id) 6 | ,unit VARCHAR(200) 7 | ,price REAL 8 | ) SORTKEY(product_id); 9 | -------------------------------------------------------------------------------- /examples/example_extract_postgres.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_postgres 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: This example extracts data out of postgres 6 | 7 | steps: 8 | - step_type: extract-postgres 9 | sql: "SELECT * from sometable" 10 | output_path: s3://somebucket/somedata.csv 11 | -------------------------------------------------------------------------------- /docs/dataduct.rst: -------------------------------------------------------------------------------- 1 | Code documentation 2 | ================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | dataduct.config 8 | dataduct.data_access 9 | dataduct.database 10 | dataduct.etl 11 | dataduct.pipeline 12 | dataduct.qa 13 | dataduct.s3 14 | dataduct.steps 15 | dataduct.tests 16 | dataduct.utils 17 | -------------------------------------------------------------------------------- /examples/example_load_redshift.yaml: -------------------------------------------------------------------------------- 1 | name: example_load_redshift 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the load_redshift step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: load-redshift 12 | schema: dev 13 | table: test_table 14 | -------------------------------------------------------------------------------- /examples/resources/tables/suppliers.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE suppliers ( 2 | supplier_id INTEGER DISTKEY PRIMARY KEY 3 | ,supplier_name VARCHAR(200) 4 | ,contact_name VARCHAR(200) 5 | ,address VARCHAR(200) 6 | ,city VARCHAR(100) 7 | ,postal_code VARCHAR(10) 8 | ,county VARCHAR(100) 9 | ,phone VARCHAR(20) 10 | ) SORTKEY(supplier_id); 11 | -------------------------------------------------------------------------------- /examples/example_create_and_load_redshift.yaml: -------------------------------------------------------------------------------- 1 | name: example_create_and_load_redshift 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the load_redshift step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: create-load-redshift 12 | table_definition: tables/dev.test_table.sql 13 | -------------------------------------------------------------------------------- /examples/example_load_postgres.yaml: -------------------------------------------------------------------------------- 1 | name: example_load_postgres 2 | frequency: one-time 3 | load_time: 00:01 # Hour:Min in UTC 4 | 5 | description: Example for the load_postgres step 6 | 7 | steps: 8 | - step_type: extract-s3 9 | file_uri: s3://somebucket/somedata.csv 10 | 11 | - step_type: load-postgres 12 | table: sometable 13 | insert_query: "INSERT INTO sometable (col1, col2, col3) VALUES (?,?,?);" 14 | -------------------------------------------------------------------------------- /examples/example_count_check.yaml: -------------------------------------------------------------------------------- 1 | name: example_count_check 2 | frequency: one-time 3 | load_time: 01:00 4 | 5 | description: Example for the count-check step 6 | 7 | steps: 8 | - step_type: count-check 9 | source_sql: "SELECT id, name FROM networks_network;" 10 | source_host: maestro 11 | destination_sql: "SELECT network_id, network_name FROM prod.networks" 12 | tolerance: 2.0 13 | log_to_s3: true 14 | -------------------------------------------------------------------------------- /examples/example_primary_key_check.yaml: -------------------------------------------------------------------------------- 1 | name: example_primary_key_check 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the primary-key-check step 6 | 7 | steps: 8 | - step_type: primary-key-check 9 | table_definition: tables/dev.test_table.sql 10 | log_to_s3: true 11 | script_arguments: 12 | - "--path_suffix=#{format(@scheduledStartTime, 'YYYY-MM-dd')}" 13 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Setuptools distribution folder. 5 | /dist/ 6 | 7 | # Docs build folder 8 | /docs/_build 9 | 10 | # Build directory 11 | /build/ 12 | 13 | # Python egg metadata, regenerated from source files by setuptools. 14 | /*.egg-info 15 | /*.egg 16 | 17 | # Images created should be checked in manually 18 | *.png 19 | 20 | .coverage 21 | 22 | # pycharm or intellij 23 | .idea/ 24 | 25 | .DS_Store 26 | -------------------------------------------------------------------------------- /examples/example_load_reload_pk.yaml: -------------------------------------------------------------------------------- 1 | name: example_load_reload_primary_key_check 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the load-reload-pk step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: load-reload-pk 12 | staging_table_definition: tables/dev.test_table.sql 13 | production_table_definition: tables/dev.test_table_2.sql 14 | -------------------------------------------------------------------------------- /examples/example_reload.yaml: -------------------------------------------------------------------------------- 1 | name: example_reload 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the reload step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: create-load-redshift 12 | table_definition: tables/dev.test_table.sql 13 | 14 | - step_type: reload 15 | source: tables/dev.test_table.sql 16 | destination: tables/dev.test_table_2.sql 17 | -------------------------------------------------------------------------------- /examples/example_upsert.yaml: -------------------------------------------------------------------------------- 1 | name: example_upsert 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the upsert step 6 | 7 | steps: 8 | - step_type: extract-local 9 | path: data/test_table1.tsv 10 | 11 | - step_type: create-load-redshift 12 | table_definition: tables/dev.test_table.sql 13 | 14 | - step_type: upsert 15 | source: tables/dev.test_table.sql 16 | destination: tables/dev.test_table_2.sql 17 | -------------------------------------------------------------------------------- /docs/dataduct.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.tests package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.tests.test_import module 8 | --------------------------------- 9 | 10 | .. automodule:: dataduct.tests.test_import 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. 
automodule:: dataduct.tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /examples/example_column_check.yaml: -------------------------------------------------------------------------------- 1 | name: example_column_check 2 | frequency: one-time 3 | load_time: 01:00 4 | 5 | description: Example for the column-check step 6 | 7 | steps: 8 | - step_type: column-check 9 | source_sql: "SELECT id, name FROM networks_network;" 10 | source_host: maestro 11 | destination_sql: "SELECT network_id, network_name FROM prod.networks" 12 | sql_tail_for_source: "ORDER BY RAND() LIMIT LIMIT_PLACEHOLDER" 13 | sample_size: 10 14 | log_to_s3: true 15 | -------------------------------------------------------------------------------- /docs/dataduct.data_access.rst: -------------------------------------------------------------------------------- 1 | dataduct.data_access package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.data_access.connection module 8 | -------------------------------------- 9 | 10 | .. automodule:: dataduct.data_access.connection 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: dataduct.data_access 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /examples/example_pipeline_dependency.yaml: -------------------------------------------------------------------------------- 1 | name: example_pipeline_dependency 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | steps: 6 | - step_type: pipeline-dependencies 7 | name: dependency_step 8 | refresh_rate: 60 9 | dependent_pipelines: 10 | - example_transform 11 | dependent_pipelines_ok_to_fail: 12 | - example_failed_pipeline 13 | 14 | - step_type: transform 15 | depends_on: dependency_step 16 | command: whoami >> $OUTPUT1_STAGING_DIR/output.txt 17 | -------------------------------------------------------------------------------- /examples/example_create_update_sql.yaml: -------------------------------------------------------------------------------- 1 | name: example_create_update_sql 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the create-update-sql step 6 | 7 | steps: 8 | - step_type: create-update-sql 9 | command: | 10 | DELETE FROM dev.test_table WHERE id < 0; 11 | INSERT INTO dev.test_table 12 | SELECT * FROM dev.test_table_2 13 | WHERE id < %s; 14 | table_definition: tables/dev.test_table.sql 15 | script_arguments: 16 | - 4 17 | -------------------------------------------------------------------------------- /examples/example_extract_rds.yaml: -------------------------------------------------------------------------------- 1 | name: example_extract_rds 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: | 6 | This example extracts data from mysql to S3 with the extract-rds step. 
7 | 8 | steps: 9 | - step_type: extract-rds 10 | host_name: maestro 11 | database: maestro 12 | table: specializations_specialization 13 | 14 | - step_type: extract-rds 15 | host_name: maestro 16 | database: maestro 17 | sql: | 18 | SELECT * 19 | FROM networks_network; 20 | -------------------------------------------------------------------------------- /docs/dataduct.config.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.config.tests package 2 | ============================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.config.tests.test_credentials module 8 | --------------------------------------------- 9 | 10 | .. automodule:: dataduct.config.tests.test_credentials 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: dataduct.config.tests 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /dataduct/database/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .transform import remove_comments 2 | from .transform import remove_empty_statements 3 | from .transform import remove_transactional 4 | from .transform import split_statements 5 | from .transform import remove_newlines 6 | 7 | from .select_query import parse_select_dependencies 8 | from .select_query import parse_select_columns 9 | from .select_query import parse_column_name 10 | 11 | from .create_table import parse_create_table 12 | from .create_table import create_exists_clone 13 | from .create_view import parse_create_view 14 | -------------------------------------------------------------------------------- /examples/example_bootstrap.yaml: -------------------------------------------------------------------------------- 1 | name: example_bootstrap 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the transform step 6 | 7 | bootstrap: 8 | ec2: 9 | - step_type: transform 10 | input_node: [] 11 | command: pip install git+https://github.com/coursera/dataduct.git >> ${OUTPUT1_STAGING_DIR}/output.txt 12 | name: bootstrap_override 13 | 14 | steps: 15 | - step_type: transform 16 | input_node: [] 17 | command: python -c "import dataduct" >> ${OUTPUT1_STAGING_DIR}/output.txt 18 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright [2014] [Coursera] 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /dataduct/utils/decorators.py: -------------------------------------------------------------------------------- 1 | """Common decorator utilities 2 | """ 3 | 4 | from datetime import datetime 5 | 6 | 7 | def timeit(method): 8 | """Timing decorator for measuring performance of functions 9 | """ 10 | 11 | def timed(*args, **kw): 12 | ts = datetime.now() 13 | print 'Starting time for Method %r is %s' % (method.__name__, ts) 14 | 15 | result = method(*args, **kw) 16 | te = datetime.now() 17 | print 'End time for Method %r is %s' % (method.__name__, te) 18 | 19 | print 'Method %r took %s time' % (method.__name__, te - ts) 20 | return result 21 | 22 | return timed 23 | -------------------------------------------------------------------------------- /dataduct/database/sql/transaction.py: -------------------------------------------------------------------------------- 1 | """SQL Statements used in transactions 2 | """ 3 | 4 | from .sql_statement import SqlStatement 5 | 6 | 7 | class BeginStatement(SqlStatement): 8 | """Class representing begin sql statement 9 | """ 10 | def __init__(self): 11 | """Constructor for begin class 12 | """ 13 | super(BeginStatement, self).__init__('BEGIN', True) 14 | 15 | 16 | class CommitStatement(SqlStatement): 17 | """Class representing Commit sql statement 18 | """ 19 | def __init__(self): 20 | """Constructor for Commit class 21 | """ 22 | super(CommitStatement, self).__init__('COMMIT', True) 23 | -------------------------------------------------------------------------------- /examples/example_emr_streaming.yaml: -------------------------------------------------------------------------------- 1 | name: example_emr_streaming 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | emr_cluster_config: 5 | num_instances: 1 6 | instance_size: m1.large 7 | ami_version: 3.3.1 8 | 9 | description: Example for the emr_streaming step 10 | 11 | steps: 12 | - step_type: extract-local 13 | path: data/word_data.txt 14 | 15 | - step_type: emr-streaming 16 | mapper: scripts/word_mapper.py 17 | reducer: scripts/word_reducer.py 18 | 19 | - step_type: transform 20 | script: scripts/s3_profiler.py 21 | script_arguments: 22 | - --input=INPUT1_STAGING_DIR 23 | - --output=OUTPUT1_STAGING_DIR 24 | - -f 25 | -------------------------------------------------------------------------------- /examples/example_double_input.yaml: -------------------------------------------------------------------------------- 1 | name: example_double_input 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the transform step with multiple inputs 6 | 7 | steps: 8 | - step_type: extract-local 9 | name: step1 10 | path: data/test_table1.tsv 11 | 12 | - step_type: extract-local 13 | name: step2 14 | path: data/test_table2.tsv 15 | 16 | - step_type: transform 17 | script: scripts/s3_profiler.py 18 | input_node: 19 | step1: script 20 | step2: directory 21 | script_arguments: 22 | - --input=INPUT1_STAGING_DIR 23 | - --output=OUTPUT1_STAGING_DIR 24 | - script/ 25 | - directory/ 26 | -------------------------------------------------------------------------------- /examples/steps/custom_extract_local.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for creating an S3 node for input from local files 3 | """ 4 | from dataduct.steps import ExtractLocalStep 5 | import logging 6 | logger = logging.getLogger(__name__) 7 | 8 
| 9 | class CustomExtractLocalStep(ExtractLocalStep): 10 | """CustomExtractLocal Step class that helps get data from a local file 11 | """ 12 | 13 | def __init__(self, **kwargs): 14 | """Constructor for the CustomExtractLocal class 15 | 16 | Args: 17 | **kwargs(optional): Keyword arguments directly passed to base class 18 | """ 19 | logger.info('Using the Custom Extract Local Step') 20 | super(CustomExtractLocalStep, self).__init__(**kwargs) 21 | -------------------------------------------------------------------------------- /docs/dataduct.etl.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.etl.tests package 2 | ========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.etl.tests.test_etl_actions module 8 | ------------------------------------------ 9 | 10 | .. automodule:: dataduct.etl.tests.test_etl_actions 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.etl.tests.test_etl_pipeline module 16 | ------------------------------------------- 17 | 18 | .. automodule:: dataduct.etl.tests.test_etl_pipeline 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: dataduct.etl.tests 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /dataduct/database/tests/helpers.py: -------------------------------------------------------------------------------- 1 | """Helpers for Database Tests 2 | """ 3 | from nose.tools import eq_ 4 | 5 | from ..table import Table 6 | from ..view import View 7 | from ..sql import SqlScript 8 | 9 | 10 | def create_table(sql): 11 | """Creates a table object from a SQL string 12 | """ 13 | return Table(SqlScript(sql)) 14 | 15 | 16 | def create_view(sql): 17 | """Creates a view object from a SQL string 18 | """ 19 | return View(SqlScript(sql)) 20 | 21 | 22 | def compare_scripts(actual_script, expected_script): 23 | """Validates a SqlScript chain 24 | """ 25 | assert len(actual_script) == len(expected_script) 26 | for actual, expected in zip(actual_script, expected_script): 27 | eq_(actual.sql(), expected) 28 | -------------------------------------------------------------------------------- /docs/dataduct.database.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.tests package 2 | =============================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.database.tests.test_database module 8 | -------------------------------------------- 9 | 10 | .. automodule:: dataduct.database.tests.test_database 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.database.tests.test_history_table module 16 | ------------------------------------------------- 17 | 18 | .. automodule:: dataduct.database.tests.test_history_table 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. 
automodule:: dataduct.database.tests 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /dataduct/database/parsers/tests/test_create_view.py: -------------------------------------------------------------------------------- 1 | """Tests for create view parser 2 | """ 3 | 4 | from unittest import TestCase 5 | from nose.tools import eq_ 6 | from ..create_view import parse_create_view 7 | 8 | 9 | class TestCreateViewStatement(TestCase): 10 | """Tests for create view 11 | """ 12 | @staticmethod 13 | def test_basic(): 14 | """Basic test for create view 15 | """ 16 | query = 'CREATE VIEW orders AS (' + \ 17 | 'SELECT x, y, z from xyz_table)' 18 | 19 | full_name = 'orders' 20 | replace = False 21 | 22 | output = parse_create_view(query) 23 | 24 | eq_(output['view_name'], full_name) 25 | eq_(output['replace'], replace) 26 | eq_(output['select_statement'], 'SELECT x, y, z from xyz_table') 27 | -------------------------------------------------------------------------------- /examples/example_transform.yaml: -------------------------------------------------------------------------------- 1 | name: example_transform 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | ec2_resource_config: 5 | instance_type: m1.small 6 | 7 | description: | 8 | Example for the transform step, uses an m1.small instance instead of 9 | the default 10 | 11 | steps: 12 | - step_type: extract-local 13 | name: extract-node 14 | path: data/test_table1.tsv 15 | 16 | - step_type: transform 17 | input_node: extract-node 18 | script: scripts/s3_profiler.py 19 | script_arguments: 20 | - --input=INPUT1_STAGING_DIR 21 | - --output=OUTPUT1_STAGING_DIR 22 | 23 | - step_type: transform 24 | input_node: extract-node 25 | script_directory: scripts/ 26 | script_name: s3_profiler.py 27 | script_arguments: 28 | - --input=INPUT1_STAGING_DIR 29 | - --output=OUTPUT1_STAGING_DIR 30 | -------------------------------------------------------------------------------- /dataduct/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .activity import Activity 2 | from .copy_activity import CopyActivity 3 | from .data_pipeline import DataPipeline 4 | from .default_object import DefaultObject 5 | from .ec2_resource import Ec2Resource 6 | from .emr_resource import EmrResource 7 | from .emr_activity import EmrActivity 8 | from .mysql_node import MysqlNode 9 | from .postgres_node import PostgresNode 10 | from .postgres_database import PostgresDatabase 11 | from .pipeline_object import PipelineObject 12 | from .precondition import Precondition 13 | from .redshift_copy_activity import RedshiftCopyActivity 14 | from .redshift_node import RedshiftNode 15 | from .redshift_database import RedshiftDatabase 16 | from .s3_node import S3Node 17 | from .schedule import Schedule 18 | from .shell_command_activity import ShellCommandActivity 19 | from .sns_alarm import SNSAlarm 20 | from .sql_activity import SqlActivity 21 | -------------------------------------------------------------------------------- /dataduct/config/tests/test_config_actions.py: -------------------------------------------------------------------------------- 1 | """Tests that the config actions are working properly 2 | """ 3 | from unittest import TestCase 4 | from nose.tools import eq_ 5 | 6 | from .. 
import config_actions 7 | from ..config import Config 8 | 9 | 10 | class TestConfigActions(TestCase): 11 | """Tests for config actions 12 | """ 13 | @staticmethod 14 | def test_s3_config_path(): 15 | """Tests that s3_config_path correctly returns the S3 base path 16 | """ 17 | config = Config() 18 | config.etl['S3_BASE_PATH'] = 'test/path' 19 | config.etl['S3_ETL_BUCKET'] = 'test_bucket' 20 | config_actions.CONFIG_STR = 'test_config_str' 21 | config_actions.CFG_FILE = 'test_cfg_file.cfg' 22 | result = config_actions.s3_config_path() 23 | eq_(result.bucket, 'test_bucket') 24 | eq_(result.key, 'test/path/test_config_str/test_cfg_file.cfg') 25 | -------------------------------------------------------------------------------- /docs/dataduct.etl.rst: -------------------------------------------------------------------------------- 1 | dataduct.etl package 2 | ==================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.etl.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | dataduct.etl.etl_actions module 15 | ------------------------------- 16 | 17 | .. automodule:: dataduct.etl.etl_actions 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | dataduct.etl.etl_pipeline module 23 | -------------------------------- 24 | 25 | .. automodule:: dataduct.etl.etl_pipeline 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | dataduct.etl.utils module 31 | ------------------------- 32 | 33 | .. automodule:: dataduct.etl.utils 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: dataduct.etl 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /dataduct/config/example_config: -------------------------------------------------------------------------------- 1 | # Constants that are used across the dataduct library 2 | 3 | ec2: 4 | INSTANCE_TYPE: m1.large 5 | ETL_AMI: ami-05355a6c # Default AMI used by data pipeline 6 | SECURITY_GROUP: FILL_ME_IN 7 | 8 | emr: 9 | MASTER_INSTANCE_TYPE: m1.large 10 | NUM_CORE_INSTANCES: 1 11 | CORE_INSTANCE_TYPE: m1.large 12 | CLUSTER_AMI: 3.7.0 13 | 14 | etl: 15 | S3_ETL_BUCKET: FILL_ME_IN 16 | ROLE: FILL_ME_IN 17 | RESOURCE_ROLE: FILL_ME_IN 18 | 19 | postgres: 20 | DATABASE_NAME: FILL_ME_IN 21 | RDS_INSTANCE_ID: FILL_ME_IN 22 | USERNAME: FILL_ME_IN 23 | PASSWORD: FILL_ME_IN 24 | REGION: FILL_ME_IN 25 | 26 | mysql: 27 | DATABASE: 28 | HOST: FILL_ME_IN 29 | PASSWORD: FILL_ME_IN 30 | USERNAME: FILL_ME_IN 31 | 32 | redshift: 33 | CLUSTER_ID: FILL_ME_IN 34 | DATABASE_NAME: FILL_ME_IN 35 | HOST: FILL_ME_IN 36 | PASSWORD: FILL_ME_IN 37 | USERNAME: FILL_ME_IN 38 | PORT: 5439 39 | -------------------------------------------------------------------------------- /examples/resources/scripts/word_mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple mapper for word count example""" 3 | 4 | import sys 5 | 6 | def read_input(file): 7 | """Reads the stdin line by line 8 | """ 9 | for line in file: 10 | # split the line into words 11 | yield line.split() 12 | 13 | def main(separator='\t'): 14 | """Read the data and split the lines and emit the words 15 | Args: 16 | separator(str): Separator to be used between key and value 17 | """ 18 | # input comes from STDIN (standard input) 19 | data = read_input(sys.stdin) 20 | for words in data: 21 | # write the results to STDOUT 
(standard output); 22 | # what we output here will be the input for the 23 | # Reduce step, i.e. the input for reducer.py 24 | # 25 | # tab-delimited; the trivial word count is 1 26 | for word in words: 27 | print '%s%s%d' % (word, separator, 1) 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /docs/dataduct.database.sql.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.sql.tests package 2 | =================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.database.sql.tests.test_sql_script module 8 | -------------------------------------------------- 9 | 10 | .. automodule:: dataduct.database.sql.tests.test_sql_script 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.database.sql.tests.test_sql_statement module 16 | ----------------------------------------------------- 17 | 18 | .. automodule:: dataduct.database.sql.tests.test_sql_statement 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.database.sql.tests.test_sql_utils module 24 | ------------------------------------------------- 25 | 26 | .. automodule:: dataduct.database.sql.tests.test_sql_utils 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: dataduct.database.sql.tests 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /dataduct/steps/__init__.py: -------------------------------------------------------------------------------- 1 | from .column_check import ColumnCheckStep 2 | from .count_check import CountCheckStep 3 | from .create_load_redshift import CreateAndLoadStep 4 | from .create_update_sql import CreateUpdateSqlStep 5 | from .delta_load import DeltaLoadStep 6 | from .emr_job import EMRJobStep 7 | from .emr_streaming import EMRStreamingStep 8 | from .etl_step import ETLStep 9 | from .extract_local import ExtractLocalStep 10 | from .extract_rds import ExtractRdsStep 11 | from .extract_redshift import ExtractRedshiftStep 12 | from .extract_postgres import ExtractPostgresStep 13 | from .extract_s3 import ExtractS3Step 14 | from .load_redshift import LoadRedshiftStep 15 | from .load_postgres import LoadPostgresStep 16 | from .load_reload_pk import LoadReloadAndPrimaryKeyStep 17 | from .pipeline_dependencies import PipelineDependenciesStep 18 | from .primary_key_check import PrimaryKeyCheckStep 19 | from .qa_transform import QATransformStep 20 | from .reload import ReloadStep 21 | from .sql_command import SqlCommandStep 22 | from .transform import TransformStep 23 | from .upsert import UpsertStep 24 | -------------------------------------------------------------------------------- /dataduct/config/config_actions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script that has action functions for config 3 | """ 4 | from .config import Config 5 | from ..s3 import S3Path 6 | from ..s3 import S3File 7 | 8 | from .constants import CONFIG_STR 9 | from .constants import CFG_FILE 10 | 11 | 12 | config = Config() 13 | 14 | def s3_config_path(): 15 | """S3 uri for the config files 16 | """ 17 | key = [config.etl.get('S3_BASE_PATH', ''), CONFIG_STR, CFG_FILE] 18 | return S3Path(bucket=config.etl['S3_ETL_BUCKET'], key=key) 19 | 20 | 21 | def sync_to_s3(): 22 | """Upload the config file 
to an S3 location 23 | """ 24 | s3_file = S3File(text=config.raw_config(), s3_path=s3_config_path()) 25 | s3_file.upload_to_s3() 26 | 27 | 28 | def sync_from_s3(filename): 29 | """Read the config file from S3 30 | """ 31 | s3_file = S3File(s3_path=s3_config_path()) 32 | text = s3_file.text 33 | 34 | if filename is None: 35 | raise ValueError('Filename for config sync must be provided') 36 | else: 37 | with open(filename, 'w') as op_file: 38 | op_file.write(text) 39 | -------------------------------------------------------------------------------- /dataduct/pipeline/precondition.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for the precondition step 3 | """ 4 | 5 | from .pipeline_object import PipelineObject 6 | 7 | 8 | class Precondition(PipelineObject): 9 | """Precondition object added to all pipelines 10 | """ 11 | 12 | def __init__(self, 13 | id, 14 | is_directory=True, 15 | **kwargs): 16 | """Constructor for the Precondition class 17 | 18 | Args: 19 | id(str): id of the precondition object 20 | is_directory(bool): if s3 path is a directory or not 21 | **kwargs(optional): Keyword arguments directly passed to base class 22 | """ 23 | 24 | if is_directory: 25 | super(Precondition, self).__init__( 26 | id=id, 27 | type='S3PrefixNotEmpty', 28 | s3Prefix="#{node.directoryPath}", 29 | ) 30 | else: 31 | super(Precondition, self).__init__( 32 | id=id, 33 | type='S3KeyExists', 34 | s3Prefix="#{node.filePath}", 35 | ) 36 | -------------------------------------------------------------------------------- /dataduct/steps/reload.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for Reload SQL script 2 | """ 3 | from .upsert import UpsertStep 4 | 5 | 6 | class ReloadStep(UpsertStep): 7 | """Reload Step class that helps run a step on the emr cluster 8 | """ 9 | 10 | def __init__(self, **kwargs): 11 | """Constructor for the ReloadStep class 12 | 13 | Args: 14 | **kwargs(optional): Keyword arguments directly passed to base class 15 | """ 16 | 17 | # Enforce PK by default. 
18 | if 'enforce_primary_key' not in kwargs: 19 | kwargs['enforce_primary_key'] = True 20 | super(ReloadStep, self).__init__(**kwargs) 21 | 22 | @classmethod 23 | def arguments_processor(cls, etl, input_args): 24 | """Parse the step arguments according to the ETL pipeline 25 | 26 | Args: 27 | etl(ETLPipeline): Pipeline object containing resources and steps 28 | step_args(dict): Dictionary of the step arguments for the class 29 | """ 30 | input_args['delete_existing'] = True 31 | return super(ReloadStep, cls).arguments_processor(etl, input_args) 32 | -------------------------------------------------------------------------------- /examples/example_double_output.yaml: -------------------------------------------------------------------------------- 1 | name: example_double_output 2 | frequency: one-time 3 | load_time: 01:00 # Hour:Min in UTC 4 | 5 | description: Example for the transform step with multiple outputs 6 | 7 | steps: 8 | - step_type: extract-local 9 | name: step1_a 10 | path: data/test_table1.tsv 11 | 12 | - step_type: extract-local 13 | name: step1_b 14 | path: data/test_table2.tsv 15 | 16 | - step_type: transform 17 | command: cp -r $INPUT1_STAGING_DIR/* $OUTPUT1_STAGING_DIR 18 | input_node: 19 | step1_a: step2_a 20 | step1_b: step2_b 21 | output_node: 22 | - step2_a 23 | - step2_b 24 | 25 | - step_type: transform 26 | name: profiler_1 27 | script: scripts/s3_profiler.py 28 | input_node: step2_a 29 | script_arguments: 30 | - --input=INPUT1_STAGING_DIR 31 | - --output=OUTPUT1_STAGING_DIR 32 | - -f 33 | 34 | - step_type: transform 35 | name: profiler_2 36 | script: scripts/s3_profiler.py 37 | input_node: step2_b 38 | script_arguments: 39 | - --input=INPUT1_STAGING_DIR 40 | - --output=OUTPUT1_STAGING_DIR 41 | - -f 42 | -------------------------------------------------------------------------------- /dataduct/database/select_statement.py: -------------------------------------------------------------------------------- 1 | """Script containing the SelectStatement object 2 | """ 3 | 4 | from .sql import SqlStatement 5 | from .column import Column 6 | from .parsers import parse_select_dependencies 7 | from .parsers import parse_select_columns 8 | from .parsers import parse_column_name 9 | 10 | 11 | class SelectStatement(SqlStatement): 12 | """Class representing SelectStatement from a sql_statement 13 | """ 14 | def __init__(self, sql): 15 | """Constructor for SelectStatement class 16 | """ 17 | super(SelectStatement, self).__init__(sql) 18 | 19 | self._dependencies = parse_select_dependencies(self.sql()) 20 | self._raw_columns = parse_select_columns(self.sql()) 21 | self._columns = [ 22 | Column(parse_column_name(c), None) for c in self._raw_columns] 23 | 24 | @property 25 | def dependencies(self): 26 | """Table dependencies of the select statement 27 | """ 28 | return self._dependencies 29 | 30 | def columns(self): 31 | """Table columns of the select statement 32 | """ 33 | return self._columns 34 | -------------------------------------------------------------------------------- /dataduct/qa/primary_key_check.py: -------------------------------------------------------------------------------- 1 | """QA test for checking duplicate primary keys inside redshift 2 | """ 3 | 4 | from .check import Check 5 | from .utils import render_output 6 | 7 | 8 | class PrimaryKeyCheck(Check): 9 | """QA test for checking duplicate primary keys inside redshift 10 | """ 11 | def __init__(self, duplicate_count=0, **kwargs): 12 | """Constructor for Primary Key Check 13 | 14 | Args: 15 | 
duplicate_count(int): Number of duplicates 16 | """ 17 | super(PrimaryKeyCheck, self).__init__(**kwargs) 18 | self.duplicate_count = duplicate_count 19 | 20 | @property 21 | def error_rate(self): 22 | """The error rate for the QA test 23 | """ 24 | return self.duplicate_count 25 | 26 | @property 27 | def summary(self): 28 | """Summary of the test results for the SNS message 29 | """ 30 | return render_output( 31 | [ 32 | 'Test Name: %s' % self.name, 33 | 'Success: %s' % self.success, 34 | 'Tolerance: %d' % self.tolerance, 35 | 'Error Rate: %d' % self.error_rate, 36 | ] 37 | ) 38 | -------------------------------------------------------------------------------- /docs/dataduct.s3.rst: -------------------------------------------------------------------------------- 1 | dataduct.s3 package 2 | =================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.s3.s3_directory module 8 | ------------------------------- 9 | 10 | .. automodule:: dataduct.s3.s3_directory 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.s3.s3_file module 16 | -------------------------- 17 | 18 | .. automodule:: dataduct.s3.s3_file 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.s3.s3_log_path module 24 | ------------------------------ 25 | 26 | .. automodule:: dataduct.s3.s3_log_path 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | dataduct.s3.s3_path module 32 | -------------------------- 33 | 34 | .. automodule:: dataduct.s3.s3_path 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | dataduct.s3.utils module 40 | ------------------------ 41 | 42 | .. automodule:: dataduct.s3.utils 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | 48 | Module contents 49 | --------------- 50 | 51 | .. automodule:: dataduct.s3 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | -------------------------------------------------------------------------------- /dataduct/database/sql/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared utility functions 3 | """ 4 | from ..parsers import remove_comments 5 | from ..parsers import remove_empty_statements 6 | from ..parsers import split_statements 7 | from ..parsers import remove_transactional 8 | from ..parsers import remove_newlines 9 | 10 | 11 | def balanced_parenthesis(statement): 12 | """Check if the SQL statement is balanced 13 | """ 14 | counter = 0 15 | for character in statement: 16 | if character == '(': 17 | counter += 1 18 | if character == ')': 19 | counter -= 1 20 | if counter < 0: 21 | return False 22 | return counter == 0 23 | 24 | 25 | def sanitize_sql(sql, keep_transaction=False): 26 | """Sanitize the sql string 27 | """ 28 | # remove comments 29 | string = remove_comments(sql) 30 | 31 | # remove transactionals 32 | if not keep_transaction: 33 | string = remove_transactional(string) 34 | 35 | # remove new lines 36 | string = remove_newlines(string) 37 | 38 | # remove empty statements 39 | string = remove_empty_statements(string) 40 | 41 | # split into multiple statements 42 | return split_statements(string) 43 | -------------------------------------------------------------------------------- /docs/dataduct.qa.rst: -------------------------------------------------------------------------------- 1 | dataduct.qa package 2 | =================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.qa.check module 8 | ------------------------ 9 | 10 | .. 
automodule:: dataduct.qa.check 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.qa.column_check module 16 | ------------------------------- 17 | 18 | .. automodule:: dataduct.qa.column_check 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.qa.count_check module 24 | ------------------------------ 25 | 26 | .. automodule:: dataduct.qa.count_check 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | dataduct.qa.primary_key_check module 32 | ------------------------------------ 33 | 34 | .. automodule:: dataduct.qa.primary_key_check 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | dataduct.qa.utils module 40 | ------------------------ 41 | 42 | .. automodule:: dataduct.qa.utils 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | 48 | Module contents 49 | --------------- 50 | 51 | .. automodule:: dataduct.qa 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 |
-------------------------------------------------------------------------------- /dataduct/database/sql/tests/test_sql_utils.py: --------------------------------------------------------------------------------
1 | """Tests the utils functions 2 | """ 3 | from unittest import TestCase 4 | from nose.tools import eq_ 5 | 6 | from ..utils import balanced_parenthesis 7 | from ..utils import sanitize_sql 8 | 9 | 10 | class TestSqlUtils(TestCase): 11 | """Tests for sql utils functions 12 | """ 13 | @staticmethod 14 | def test_balanced_parenthesis(): 15 | """Test for balanced_parenthesis 16 | """ 17 | eq_(balanced_parenthesis('SELECT 1;'), True) 18 | eq_(balanced_parenthesis('SELECT 1(;'), False) 19 | eq_(balanced_parenthesis('SELECT 1();'), True) 20 | eq_(balanced_parenthesis('SELECT 1(abcd);'), True) 21 | eq_(balanced_parenthesis('SELECT 1(ab[cd);'), True) 22 | eq_(balanced_parenthesis('SELECT 1(ab[cd));'), False) 23 | eq_(balanced_parenthesis('SELECT 1);'), False) 24 | eq_(balanced_parenthesis('SELECT 1(ab)(ab);'), True) 25 | eq_(balanced_parenthesis('SELECT 1(a(ab)b);'), True) 26 | 27 | @staticmethod 28 | def test_sanitize_sql(): 29 | """Test for sanitize_sql 30 | """ 31 | sql = "SELECT 1 if x='x;y'; SELECT 1 ;" 32 | eq_(sanitize_sql(sql), ["SELECT 1 if x='x;y'", 'SELECT 1']) 33 |
-------------------------------------------------------------------------------- /dataduct/s3/s3_log_path.py: --------------------------------------------------------------------------------
1 | """ 2 | Class for storing a S3 Log Path 3 | """ 4 | 5 | from os.path import join 6 | from .s3_path import S3Path 7 | 8 | 9 | class S3LogPath(S3Path): 10 | """S3 Log path for data pipeline 11 | S3LogPath only exists to correct the use of S3 URIs by Data 12 | Pipeline. In most cases, one should use a trailing slash to disambiguate 13 | prefixes. For instance, the former prefix includes the latter 14 | unless there is a trailing slash: 15 | 16 | :: 17 | s3://coursera-bucket/dev 18 | s3://coursera-bucket/dev_log_dir 19 | 20 | However, if one adds a trailing slash to the log s3 URI, Data Pipeline 21 | will add another slash before adding subdirectories. These 22 | double slashes break boto.
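For illustration only (this assumes the inherited S3Path constructor accepts ``bucket`` and ``key`` keyword arguments; see s3_path.py for the actual signature), the ``uri`` property below strips the trailing slash before the path is handed to Data Pipeline:

::
    S3LogPath(bucket='coursera-bucket', key='dev_log_dir/').uri
    # hypothetical call; would return 's3://coursera-bucket/dev_log_dir'
    # the trailing '/' is removed so Data Pipeline cannot produce '//'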
23 | """ 24 | def __init(self, **kwargs): 25 | """Constructor for S3LogPath 26 | """ 27 | super(S3LogPath, self).__init__(**kwargs) 28 | 29 | @property 30 | def uri(self): 31 | """Get the log directory path 32 | 33 | Returns: 34 | s3_uri(str): s3_log path without the trailing '/' 35 | """ 36 | if self.key is None: 37 | return None 38 | return join('s3://', self.bucket, self.key).rstrip('/') 39 | -------------------------------------------------------------------------------- /docs/dataduct.utils.rst: -------------------------------------------------------------------------------- 1 | dataduct.utils package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.utils.cli module 8 | ------------------------- 9 | 10 | .. automodule:: dataduct.utils.cli 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.utils.constants module 16 | ------------------------------- 17 | 18 | .. automodule:: dataduct.utils.constants 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.utils.exceptions module 24 | -------------------------------- 25 | 26 | .. automodule:: dataduct.utils.exceptions 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | dataduct.utils.helpers module 32 | ----------------------------- 33 | 34 | .. automodule:: dataduct.utils.helpers 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | dataduct.utils.slack_hook module 40 | -------------------------------- 41 | 42 | .. automodule:: dataduct.utils.slack_hook 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | 48 | Module contents 49 | --------------- 50 | 51 | .. automodule:: dataduct.utils 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | 5 | sudo: false 6 | 7 | addons: 8 | apt_packages: 9 | - graphviz 10 | # command to install dependencies 11 | install: 12 | - pip install coveralls 13 | - pip install -r requirements.txt 14 | 15 | # Setup config file 16 | before_script: 17 | - mkdir ~/.dataduct 18 | - |+ 19 | echo " 20 | etl: 21 | ROLE: DataPipelineDefaultRole 22 | RESOURCE_ROLE: DataPipelineDefaultResourceRole 23 | S3_ETL_BUCKET: FILL_ME_IN 24 | 25 | ec2: 26 | CORE_INSTANCE_TYPE: m1.large 27 | 28 | emr: 29 | CLUSTER_AMI: 2.4.7 30 | 31 | redshift: 32 | DATABASE_NAME: FILL_ME_IN 33 | CLUSTER_ID: FILL_ME_IN 34 | USERNAME: FILL_ME_IN 35 | PASSWORD: FILL_ME_IN 36 | 37 | postgres: 38 | DATABASE_NAME: FILL_ME_IN 39 | RDS_INSTANCE_ID: FILL_ME_IN 40 | USERNAME: FILL_ME_IN 41 | PASSWORD: FILL_ME_IN 42 | REGION: FILL_ME_IN 43 | 44 | mysql: 45 | DATABASE_KEY: 46 | HOST: FILL_ME_IN 47 | USERNAME: FILL_ME_IN 48 | PASSWORD: FILL_ME_IN" > ~/.dataduct/dataduct.cfg 49 | 50 | # Run tests 51 | script: nosetests --with-coverage --cover-package=. --cover-erase 52 | after_success: 53 | coveralls 54 | -------------------------------------------------------------------------------- /docs/dataduct.database.sql.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.sql package 2 | ============================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.database.sql.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | dataduct.database.sql.sql_script module 15 | --------------------------------------- 16 | 17 | .. 
automodule:: dataduct.database.sql.sql_script 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | dataduct.database.sql.sql_statement module 23 | ------------------------------------------ 24 | 25 | .. automodule:: dataduct.database.sql.sql_statement 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | dataduct.database.sql.transaction module 31 | ---------------------------------------- 32 | 33 | .. automodule:: dataduct.database.sql.transaction 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | dataduct.database.sql.utils module 39 | ---------------------------------- 40 | 41 | .. automodule:: dataduct.database.sql.utils 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: dataduct.database.sql 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /examples/resources/scripts/word_reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple reducer for the word count example""" 3 | 4 | from itertools import groupby 5 | from operator import itemgetter 6 | import sys 7 | 8 | def read_mapper_output(file, separator='\t'): 9 | """Reads the stdin line by line 10 | """ 11 | for line in file: 12 | yield line.rstrip().split(separator, 1) 13 | 14 | def main(separator='\t'): 15 | """Read the key value pairs and count the number of words 16 | Args: 17 | separator(str): Separator to be used between key and value 18 | """ 19 | 20 | # input comes from STDIN (standard input) 21 | data = read_mapper_output(sys.stdin, separator=separator) 22 | # groupby groups multiple word-count pairs by word, 23 | # and creates an iterator that returns consecutive keys and their group: 24 | # current_word - string containing a word (the key) 25 | for current_word, group in groupby(data, itemgetter(0)): 26 | try: 27 | total_count = sum(int(count) for current_word, count in group) 28 | print "%s%s%d" % (current_word, separator, total_count) 29 | except ValueError: 30 | # count was not a number, so silently discard this item 31 | pass 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /dataduct/steps/executors/primary_key_check.py: -------------------------------------------------------------------------------- 1 | """Script that checks for primary key violations on the input table 2 | """ 3 | 4 | import argparse 5 | import pandas.io.sql as pdsql 6 | from dataduct.data_access import redshift_connection 7 | from dataduct.database import SqlScript 8 | from dataduct.database import Table 9 | from dataduct.qa import PrimaryKeyCheck 10 | 11 | 12 | def primary_key_check(): 13 | parser = argparse.ArgumentParser() 14 | 15 | parser.add_argument('--table', dest='table', required=True) 16 | parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) 17 | parser.add_argument('--test_name', dest='test_name', 18 | default="Check Primary Key") 19 | parser.add_argument('--log_to_s3', action='store_true', default=False) 20 | parser.add_argument('--path_suffix', dest='path_suffix', default=None) 21 | 22 | args = parser.parse_args() 23 | 24 | connection = redshift_connection() 25 | table = Table(SqlScript(args.table)) 26 | result = pdsql.read_sql(table.select_duplicates_script().sql(), connection) 27 | check = PrimaryKeyCheck(len(result), 
name=args.test_name, 28 | sns_topic_arn=args.sns_topic_arn) 29 | check.publish(args.log_to_s3, table=table.full_name, 30 | path_suffix=args.path_suffix) 31 | connection.close() 32 | -------------------------------------------------------------------------------- /docs/dataduct.config.rst: -------------------------------------------------------------------------------- 1 | dataduct.config package 2 | ======================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.config.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | dataduct.config.config module 15 | ----------------------------- 16 | 17 | .. automodule:: dataduct.config.config 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | dataduct.config.config_actions module 23 | ------------------------------------- 24 | 25 | .. automodule:: dataduct.config.config_actions 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | dataduct.config.constants module 31 | -------------------------------- 32 | 33 | .. automodule:: dataduct.config.constants 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | dataduct.config.credentials module 39 | ---------------------------------- 40 | 41 | .. automodule:: dataduct.config.credentials 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | dataduct.config.logger_config module 47 | ------------------------------------ 48 | 49 | .. automodule:: dataduct.config.logger_config 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | 55 | Module contents 56 | --------------- 57 | 58 | .. automodule:: dataduct.config 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | -------------------------------------------------------------------------------- /dataduct/data_access/open_shell.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ..utils.hook import hook 3 | 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | @hook('connect_to_redshift') 8 | def open_psql_shell(redshift_creds, **kwargs): 9 | command = [ 10 | "psql", 11 | "-h", redshift_creds["HOST"], 12 | "-p", str(redshift_creds["PORT"]), 13 | "-U", redshift_creds["USERNAME"], 14 | "-d", redshift_creds["DATABASE_NAME"], 15 | "-vPROMPT1=%[%033[0m%]" + redshift_creds["CLUSTER_ID"] + "%R%[%033[0m%]%# ", 16 | "-vPROMPT2=%[%033[0m%]" + redshift_creds["CLUSTER_ID"] + "%R%[%033[0m%]%# ", 17 | ] 18 | env = dict(os.environ) 19 | env['PGPASSWORD'] = redshift_creds["PASSWORD"] 20 | logger.info("Running command: {}".format(' '.join(command))) 21 | os.execvpe(command[0], command, env=env) 22 | 23 | 24 | @hook('connect_to_mysql') 25 | def open_mysql_shell(sql_creds, **kwargs): 26 | command = [ 27 | "mysql", 28 | "-h", sql_creds["HOST"], 29 | "-u", sql_creds["USERNAME"], 30 | "--default-character-set=utf8" 31 | ] 32 | if sql_creds.get("DATABASE"): 33 | command.extend(["-D", sql_creds["DATABASE"]]) 34 | 35 | env = dict(os.environ) 36 | env['MYSQL_PWD'] = sql_creds["PASSWORD"] 37 | logger.info("Running command: {}".format(' '.join(command))) 38 | os.execvpe(command[0], command, env=env) 39 | -------------------------------------------------------------------------------- /docs/dataduct.database.parsers.tests.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.parsers.tests package 2 | ======================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | dataduct.database.parsers.tests.test_create_table module 8 | 
-------------------------------------------------------- 9 | 10 | .. automodule:: dataduct.database.parsers.tests.test_create_table 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | dataduct.database.parsers.tests.test_create_view module 16 | ------------------------------------------------------- 17 | 18 | .. automodule:: dataduct.database.parsers.tests.test_create_view 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | dataduct.database.parsers.tests.test_select_query module 24 | -------------------------------------------------------- 25 | 26 | .. automodule:: dataduct.database.parsers.tests.test_select_query 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | dataduct.database.parsers.tests.test_transfrom module 32 | ----------------------------------------------------- 33 | 34 | .. automodule:: dataduct.database.parsers.tests.test_transfrom 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | 40 | Module contents 41 | --------------- 42 | 43 | .. automodule:: dataduct.database.parsers.tests 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | -------------------------------------------------------------------------------- /dataduct/database/parsers/helpers.py: -------------------------------------------------------------------------------- 1 | """SQL parser helpers 2 | """ 3 | from pyparsing import delimitedList 4 | from pyparsing import Optional 5 | from pyparsing import ParseResults 6 | 7 | from .utils import _db_name 8 | from .utils import _temp 9 | from .utils import _temporary 10 | from .utils import _if_not_exists 11 | from .utils import _or_replace 12 | 13 | # Functions 14 | isNotEmpty = lambda x: len(x) > 0 15 | 16 | temporary_check = Optional(_temp | _temporary).setParseAction(isNotEmpty) 17 | 18 | replace_check = Optional(_or_replace).setParseAction(isNotEmpty) 19 | 20 | existance_check = Optional(_if_not_exists).setParseAction(isNotEmpty) 21 | 22 | 23 | def paranthesis_list(output_name, input_var=_db_name): 24 | """Parser for a delimiedList enclosed in paranthesis 25 | """ 26 | return '(' + delimitedList(input_var).setResultsName(output_name) + ')' 27 | 28 | 29 | def exists(parser, output_name): 30 | """Get a parser that returns boolean on existance 31 | """ 32 | return parser.setParseAction(isNotEmpty).setResultsName(output_name) 33 | 34 | 35 | def to_dict(input): 36 | """Purge the ParseResults from output dictionary 37 | """ 38 | output = dict() 39 | for key, value in input.asDict().iteritems(): 40 | if isinstance(value, ParseResults): 41 | output[key] = value.asList() 42 | else: 43 | output[key] = value 44 | 45 | return output 46 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Dataduct |build-status| |coverage-status| 2 | ----------------------------------------- 3 | Dataduct is a wrapper built on top of AWS Datapipeline which makes it easy to 4 | create ETL jobs. All jobs can be specified as a series of steps in a YAML file 5 | and would automatically be translated into datapipeline with appropriate 6 | pipeline objects. 7 | 8 | **Documentation and Details** 9 | 10 | Documentation and more details can be found at http://dataduct.readthedocs.org/en/latest/ 11 | 12 | **License** 13 | 14 | Copyright [2014] [Coursera] 15 | 16 | Licensed under the Apache License, Version 2.0 (the "License"); 17 | you may not use this file except in compliance with the License. 
18 | You may obtain a copy of the License at 19 | 20 | http://www.apache.org/licenses/LICENSE-2.0 21 | 22 | Unless required by applicable law or agreed to in writing, software 23 | distributed under the License is distributed on an "AS IS" BASIS, 24 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 | See the License for the specific language governing permissions and 26 | limitations under the License. 27 | 28 | .. |build-status| 29 | image:: https://travis-ci.org/coursera/dataduct.svg?branch=develop 30 | :target: https://travis-ci.org/coursera/dataduct 31 | 32 | .. |coverage-status| 33 | image:: https://coveralls.io/repos/coursera/dataduct/badge.svg?branch=develop 34 | :target: https://coveralls.io/r/coursera/dataduct?branch=develop 35 | -------------------------------------------------------------------------------- /dataduct/pipeline/postgres_node.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for SqlNode 3 | """ 4 | 5 | from ..utils.exceptions import ETLInputError 6 | from .pipeline_object import PipelineObject 7 | from .schedule import Schedule 8 | 9 | 10 | class PostgresNode(PipelineObject): 11 | """SQL Data Node class 12 | """ 13 | 14 | def __init__(self, id, schedule, host, database, username, password, 15 | select_query, insert_query, table, depends_on=None): 16 | """Constructor for the SqlNode class 17 | 18 | Args: 19 | id(str): id of the object 20 | schedule(Schedule): pipeline schedule 21 | database(str): database name on the RDS host 22 | sql(str): sql to be executed 23 | table(str): table to be read 24 | """ 25 | 26 | # Validate inputs 27 | if not isinstance(schedule, Schedule): 28 | raise ETLInputError( 29 | 'Input schedule must be of the type Schedule') 30 | 31 | if not depends_on: 32 | depends_on = list() 33 | 34 | kwargs = { 35 | 'id': id, 36 | 'type': 'SqlDataNode', 37 | 'schedule': schedule, 38 | 'database': database, 39 | 'selectQuery': select_query, 40 | 'insertQuery': insert_query, 41 | 'table': table, 42 | 'dependsOn': depends_on, 43 | } 44 | super(PostgresNode, self).__init__(**kwargs) 45 | -------------------------------------------------------------------------------- /dataduct/database/parsers/create_view.py: -------------------------------------------------------------------------------- 1 | """Create SQL parser 2 | """ 3 | from pyparsing import Group 4 | from pyparsing import Optional 5 | from pyparsing import StringEnd 6 | from pyparsing import Word 7 | from pyparsing import ZeroOrMore 8 | from pyparsing import printables 9 | 10 | from .utils import _as 11 | from .utils import _create 12 | from .utils import _db_name 13 | from .utils import _view 14 | 15 | from .helpers import replace_check 16 | from .helpers import to_dict 17 | 18 | 19 | merge = lambda x: ' '.join(x[0]) 20 | 21 | 22 | def rreplace(s, old, new): 23 | li = s.rsplit(old, 1) 24 | return new.join(li) 25 | 26 | 27 | def parse_create_view(string): 28 | """Parse the create view sql query and return metadata 29 | 30 | Args: 31 | string(str): Input sql string that should be parsed 32 | 33 | Returns: 34 | view_data(dict): view_data dictionary for instantiating a view object 35 | """ 36 | 37 | string = rreplace(string, ')', ' )') 38 | 39 | end = Optional(')') + StringEnd() 40 | select = Group(ZeroOrMore(~end + Word(printables))) 41 | 42 | parser = _create + replace_check.setResultsName('replace') + _view 43 | parser += _db_name.setResultsName('view_name') + _as + Optional('(') 44 | parser += 
select.setParseAction(merge).setResultsName('select_statement') 45 | parser += end 46 | 47 | # Parse the base table definitions 48 | view_data = to_dict(parser.parseString(string)) 49 | 50 | return view_data 51 | -------------------------------------------------------------------------------- /dataduct/utils/constants.py: -------------------------------------------------------------------------------- 1 | """Constants shared across dataduct 2 | """ 3 | 4 | # Constants 5 | ZERO = 0 6 | ONE = 1 7 | NONE = None 8 | EMPTY_STR = '' 9 | NULL_STR = 'NULL' 10 | DEFAULT_DELAY = '10 Minutes' 11 | DEFAULT_TIMEOUT = '6 Hours' 12 | 13 | # ETL Constants 14 | EMR_CLUSTER_STR = 'emr' 15 | EC2_RESOURCE_STR = 'ec2' 16 | M1_LARGE = 'm1.large' 17 | 18 | LOG_STR = 'logs' 19 | DATA_STR = 'data' 20 | SRC_STR = 'src' 21 | QA_STR = 'qa' 22 | 23 | # Commands 24 | COMMAND_TEMPLATE = 'python -c "from {file} import {func}; {func}()" "$@"' 25 | 26 | COUNT_CHECK_COMMAND = COMMAND_TEMPLATE.format( 27 | file='dataduct.steps.executors.count_check', 28 | func='count_check') 29 | 30 | COLUMN_CHECK_COMMAND = COMMAND_TEMPLATE.format( 31 | file='dataduct.steps.executors.column_check', 32 | func='column_check') 33 | 34 | LOAD_COMMAND = COMMAND_TEMPLATE.format( 35 | file='dataduct.steps.executors.create_load_redshift', 36 | func='create_load_redshift_runner') 37 | 38 | PK_CHECK_COMMAND = COMMAND_TEMPLATE.format( 39 | file='dataduct.steps.executors.primary_key_check', 40 | func='primary_key_check') 41 | 42 | DEPENDENCY_COMMAND = COMMAND_TEMPLATE.format( 43 | file='dataduct.steps.executors.dependency_check', 44 | func='dependency_check') 45 | 46 | SCRIPT_RUNNER_COMMAND = COMMAND_TEMPLATE.format( 47 | file='dataduct.steps.executors.runner', func='script_runner') 48 | 49 | SQL_RUNNER_COMMAND = COMMAND_TEMPLATE.format( 50 | file='dataduct.steps.executors.runner', func='sql_runner') 51 | -------------------------------------------------------------------------------- /dataduct/steps/delta_load.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for delta loading a table based on a date column 2 | """ 3 | from ..database import SqlScript 4 | from ..database import Table 5 | from ..utils.helpers import parse_path 6 | from .upsert import UpsertStep 7 | 8 | 9 | class DeltaLoadStep(UpsertStep): 10 | """DeltaLoadStep Step class that creates the table if needed and loads data 11 | """ 12 | 13 | def __init__(self, destination, date_column, window=0, **kwargs): 14 | """Constructor for the DeltaLoadStep class 15 | 16 | Args: 17 | date_column(string): name of column (of type date) to use as the 18 | delta value (i.e., only load the last X days) 19 | window(int): number of days before last loaded day to update 20 | **kwargs(optional): Keyword arguments directly passed to base class 21 | """ 22 | dest = Table(SqlScript(filename=parse_path(destination))) 23 | delta_clause = """ 24 | WHERE {date_column} >= 25 | COALESCE( 26 | (SELECT MAX({date_column}) FROM {destination}), 27 | '1800-01-01'::DATE 28 | ) - {window} 29 | """.format(date_column=date_column, 30 | destination=dest.full_name, 31 | window=window) 32 | super(DeltaLoadStep, self).__init__(destination=destination, 33 | filter_clause=delta_clause, 34 | **kwargs) 35 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
dataduct documentation master file, created by 2 | sphinx-quickstart on Mon Nov 10 17:50:14 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Dataduct 7 | ======== 8 | 9 | Dataduct - DataPipeline for humans 10 | 11 | `Dataduct `__ is a wrapper built 12 | on top of `AWS 13 | Datapipeline `__ 14 | which makes it easy to create ETL jobs. All jobs can be specified as a 15 | series of steps in a YAML file and would automatically be translated 16 | into datapipeline with appropriate pipeline objects. 17 | 18 | Features include: 19 | 20 | - Visualizing pipeline activities 21 | - Extracting data from different sources such as RDS, S3, local files 22 | - Transforming data using EC2 and EMR 23 | - Loading data into redshift 24 | - Transforming data inside redshift 25 | - QA data between the source system and warehouse 26 | 27 | It is easy to create custom steps to augment the DSL as per the 28 | requirements. As well as running a backfill with the command line 29 | interface. 30 | 31 | 32 | Contents: 33 | 34 | .. toctree:: 35 | :maxdepth: 2 36 | 37 | introduction 38 | installation 39 | commands 40 | config 41 | creating_an_etl 42 | steps 43 | input_output 44 | hooks 45 | dataduct 46 | 47 | Indices and tables 48 | ================== 49 | 50 | * :ref:`genindex` 51 | * :ref:`modindex` 52 | * :ref:`search` 53 | 54 | -------------------------------------------------------------------------------- /dataduct/pipeline/default_object.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for default metadata 3 | """ 4 | 5 | from ..config import Config 6 | from ..utils import constants as const 7 | from .pipeline_object import PipelineObject 8 | 9 | config = Config() 10 | ROLE = config.etl['ROLE'] 11 | RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] 12 | MAX_ACTIVE_INSTANCES = config.etl.get('MAX_ACTIVE_INSTANCES', const.ONE) 13 | 14 | 15 | class DefaultObject(PipelineObject): 16 | """Default object added to all pipelines 17 | """ 18 | 19 | def __init__(self, id, pipeline_log_uri, sns=None, scheduleType='cron', 20 | failureAndRerunMode='CASCADE', **kwargs): 21 | """Constructor for the DefaultObject class 22 | 23 | Args: 24 | id(str): must be 'Default' for this class 25 | sns(sns): notify on failure 26 | scheduleType(str): frequency type for the pipeline 27 | failureAndRerunMode(str): aws input argument for failure mode 28 | **kwargs(optional): Keyword arguments directly passed to base class 29 | 30 | Note: 31 | id must be Default for this object 32 | """ 33 | 34 | super(DefaultObject, self).__init__( 35 | id='Default', # This should always have the default id 36 | scheduleType=scheduleType, 37 | failureAndRerunMode=failureAndRerunMode, 38 | role=ROLE, 39 | resourceRole=RESOURCE_ROLE, 40 | maxActiveInstances=MAX_ACTIVE_INSTANCES, 41 | pipelineLogUri=pipeline_log_uri, 42 | onFail=sns 43 | ) 44 | -------------------------------------------------------------------------------- /dataduct/steps/extract_local.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for creating an S3 node for input from local files 3 | """ 4 | from ..s3 import S3File 5 | from ..utils.exceptions import ETLInputError 6 | from .etl_step import ETLStep 7 | 8 | 9 | class ExtractLocalStep(ETLStep): 10 | """ExtractLocal Step class that helps get data from a local file 11 | """ 12 | 13 | def __init__(self, path, output_path=None, **kwargs): 14 
| """Constructor for the ExtractLocalStep class 15 | 16 | Args: 17 | path(str): local path for data 18 | **kwargs(optional): Keyword arguments directly passed to base class 19 | """ 20 | super(ExtractLocalStep, self).__init__(**kwargs) 21 | self._output = self.create_s3_data_node( 22 | S3File(path=path, s3_path=self.get_output_s3_path(output_path))) 23 | 24 | @classmethod 25 | def arguments_processor(cls, etl, input_args): 26 | """Parse the step arguments according to the ETL pipeline 27 | 28 | Args: 29 | etl(ETLPipeline): Pipeline object containing resources and steps 30 | step_args(dict): Dictionary of the step arguments for the class 31 | """ 32 | input_args = cls.pop_inputs(input_args) 33 | step_args = cls.base_arguments_processor(etl, input_args) 34 | 35 | step_args.pop('resource', None) 36 | step_args.pop('worker_group', None) 37 | if etl.frequency != 'one-time': 38 | raise ETLInputError( 39 | 'Extract Local can be used for one-time pipelines only') 40 | 41 | return step_args 42 | -------------------------------------------------------------------------------- /dataduct/pipeline/postgres_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for Rds database 3 | """ 4 | 5 | from ..config import Config 6 | from .pipeline_object import PipelineObject 7 | from ..utils.exceptions import ETLConfigError 8 | 9 | config = Config() 10 | 11 | if not hasattr(config, 'postgres'): 12 | raise ETLConfigError('Postgres credentials missing from config') 13 | 14 | REGION = config.postgres['REGION'] 15 | RDS_INSTANCE_ID = config.postgres['RDS_INSTANCE_ID'] 16 | USERNAME = config.postgres['USERNAME'] 17 | PASSWORD = config.postgres['PASSWORD'] 18 | 19 | 20 | class PostgresDatabase(PipelineObject): 21 | """Postgres resource class 22 | """ 23 | 24 | def __init__(self, 25 | id, 26 | region=REGION, 27 | rds_instance_id=RDS_INSTANCE_ID, 28 | username=USERNAME, 29 | password=PASSWORD): 30 | """Constructor for the Postgres class 31 | 32 | Args: 33 | id(str): id of the object 34 | region(str): code for the region where the database exists 35 | rds_instance_id(str): identifier of the DB instance 36 | username(str): username for the database 37 | password(str): password for the database 38 | """ 39 | 40 | kwargs = { 41 | 'id': id, 42 | 'type': 'RdsDatabase', 43 | 'region': region, 44 | 'rdsInstanceId': rds_instance_id, 45 | 'username': username, 46 | '*password': password, 47 | } 48 | super(PostgresDatabase, self).__init__(**kwargs) 49 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | CHANGES.md 3 | CONTRIBUTING.md 4 | LICENSE.md 5 | README.rst 6 | setup.py 7 | bin/README.md 8 | dataduct/__init__.py 9 | dataduct/definition_parser.py 10 | dataduct/etl_pipeline.py 11 | dataduct/pipeline/__init__.py 12 | dataduct/pipeline/activity.py 13 | dataduct/pipeline/copy_activity.py 14 | dataduct/pipeline/data_pipeline.py 15 | dataduct/pipeline/default_object.py 16 | dataduct/pipeline/ec2_resource.py 17 | dataduct/pipeline/emr_activity.py 18 | dataduct/pipeline/emr_resource.py 19 | dataduct/pipeline/mysql_node.py 20 | dataduct/pipeline/pipeline_object.py 21 | dataduct/pipeline/precondition.py 22 | dataduct/pipeline/redshift_copy_activity.py 23 | dataduct/pipeline/redshift_database.py 24 | dataduct/pipeline/redshift_node.py 25 | dataduct/pipeline/s3_node.py 26 | 
dataduct/pipeline/schedule.py 27 | dataduct/pipeline/shell_command_activity.py 28 | dataduct/pipeline/sns_alarm.py 29 | dataduct/pipeline/sql_activity.py 30 | dataduct/pipeline/utils.py 31 | dataduct/s3/__init__.py 32 | dataduct/s3/s3_directory.py 33 | dataduct/s3/s3_file.py 34 | dataduct/s3/s3_log_path.py 35 | dataduct/s3/s3_path.py 36 | dataduct/s3/utils.py 37 | dataduct/steps/__init__.py 38 | dataduct/steps/emr_streaming.py 39 | dataduct/steps/etl_step.py 40 | dataduct/steps/extract_local.py 41 | dataduct/steps/extract_rds.py 42 | dataduct/steps/extract_redshift.py 43 | dataduct/steps/extract_s3.py 44 | dataduct/steps/load_redshift.py 45 | dataduct/steps/sql_command.py 46 | dataduct/steps/transform.py 47 | dataduct/utils/__init__.py 48 | dataduct/utils/exceptions.py 49 | dataduct/utils/helpers.py 50 | scripts/README.md 51 | -------------------------------------------------------------------------------- /dataduct/s3/s3_directory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for storing a S3 File 3 | """ 4 | from .s3_path import S3Path 5 | from .utils import upload_dir_to_s3 6 | from ..utils.helpers import parse_path 7 | from ..utils.exceptions import ETLInputError 8 | 9 | 10 | class S3Directory(object): 11 | """S3 Directory object helps operate with a directory on S3 12 | 13 | The S3Directory acts much like the S3File. 14 | It represents a directory. Tries to unify the concept of a directory 15 | stored locally with one stored in S3. 16 | 17 | """ 18 | def __init__(self, path=None, s3_path=None): 19 | """Constructor for the S3 File object 20 | 21 | Args: 22 | path (str): Local path to file 23 | s3_path (S3Path, optional): s3_path of the file 24 | 25 | """ 26 | self.path = parse_path(path) 27 | self._s3_path = s3_path 28 | 29 | @property 30 | def s3_path(self): 31 | """Outputs the s3_path 32 | """ 33 | return self._s3_path 34 | 35 | @s3_path.setter 36 | def s3_path(self, value): 37 | """Set the S3 path for the file 38 | 39 | Args: 40 | value(S3Path): s3path of the directory 41 | """ 42 | if not isinstance(value, S3Path): 43 | raise ETLInputError('Input path should be of type S3Path') 44 | 45 | if not value.is_directory: 46 | raise ETLInputError('S3 path must be directory') 47 | self._s3_path = value 48 | 49 | def upload_to_s3(self): 50 | """Uploads the directory to the s3 directory 51 | """ 52 | upload_dir_to_s3(self._s3_path, self.path) 53 | -------------------------------------------------------------------------------- /dataduct/steps/primary_key_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for PK check step can be executed on Ec2 resource 3 | """ 4 | from ..config import Config 5 | from ..database import SqlStatement 6 | from ..database import Table 7 | from ..utils import constants as const 8 | from ..utils.helpers import parse_path 9 | from .qa_transform import QATransformStep 10 | 11 | config = Config() 12 | 13 | 14 | class PrimaryKeyCheckStep(QATransformStep): 15 | """PrimaryKeyCheckStep class that checks a table for PK violations 16 | """ 17 | 18 | def __init__(self, id, table_definition, script_arguments=None, 19 | log_to_s3=False, command=None, script=None, **kwargs): 20 | """Constructor for the PrimaryKeyCheckStep class 21 | 22 | Args: 23 | table_definition(file): table definition for the table to check 24 | **kwargs(optional): Keyword arguments directly passed to base class 25 | """ 26 | with open(parse_path(table_definition)) as f: 27 | 
table_def_string = f.read() 28 | 29 | if script_arguments is None: 30 | script_arguments = list() 31 | 32 | # We initialize the table object to check valid strings 33 | script_arguments.append( 34 | '--table=%s' % Table(SqlStatement(table_def_string)).sql()) 35 | 36 | if log_to_s3: 37 | script_arguments.append('--log_to_s3') 38 | 39 | if script is None and command is None: 40 | command = const.PK_CHECK_COMMAND 41 | 42 | super(PrimaryKeyCheckStep, self).__init__( 43 | id=id, command=command, script=script, 44 | script_arguments=script_arguments, **kwargs) 45 | -------------------------------------------------------------------------------- /dataduct/config/tests/test_credentials.py: -------------------------------------------------------------------------------- 1 | """Tests for credentials file 2 | """ 3 | from mock import patch 4 | from nose.tools import eq_ 5 | import json 6 | 7 | from ..credentials import get_aws_credentials_from_iam 8 | 9 | @patch('requests.get') 10 | def test_get_aws_credentials_from_iam(patched_requests_get): 11 | """Test for get credentials from IAM 12 | """ 13 | class MockedReturn: 14 | """Mock request response 15 | """ 16 | def __init__(self, content): 17 | self.content = content 18 | self.ok = True 19 | 20 | def json(self): 21 | """Returns a json for the content 22 | """ 23 | return json.loads(self.content) 24 | 25 | def server_response(url): 26 | """Mocked server responses 27 | """ 28 | if url == 'http://169.254.169.254/latest/meta-data/iam/security-credentials/': # NOQA 29 | return MockedReturn("role") 30 | if url == 'http://169.254.169.254/latest/meta-data/iam/security-credentials/role': # NOQA 31 | return MockedReturn(""" 32 | { 33 | "Code" : "Success", 34 | "LastUpdated" : "2012-04-26T16:39:16Z", 35 | "Type" : "AWS-HMAC", 36 | "AccessKeyId" : "access_id", 37 | "SecretAccessKey" : "secret_key", 38 | "Token" : "token", 39 | "Expiration" : "2012-04-27T22:39:16Z" 40 | } 41 | """) 42 | 43 | patched_requests_get.side_effect = server_response 44 | access_id, secret_key, token = get_aws_credentials_from_iam() 45 | eq_(access_id, 'access_id') 46 | eq_(secret_key, 'secret_key') 47 | eq_(token, 'token') 48 | -------------------------------------------------------------------------------- /docs/dataduct.database.parsers.rst: -------------------------------------------------------------------------------- 1 | dataduct.database.parsers package 2 | ================================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.database.parsers.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | dataduct.database.parsers.create_table module 15 | --------------------------------------------- 16 | 17 | .. automodule:: dataduct.database.parsers.create_table 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | dataduct.database.parsers.create_view module 23 | -------------------------------------------- 24 | 25 | .. automodule:: dataduct.database.parsers.create_view 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | dataduct.database.parsers.helpers module 31 | ---------------------------------------- 32 | 33 | .. automodule:: dataduct.database.parsers.helpers 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | dataduct.database.parsers.select_query module 39 | --------------------------------------------- 40 | 41 | .. 
automodule:: dataduct.database.parsers.select_query 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | dataduct.database.parsers.transform module 47 | ------------------------------------------ 48 | 49 | .. automodule:: dataduct.database.parsers.transform 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | dataduct.database.parsers.utils module 55 | -------------------------------------- 56 | 57 | .. automodule:: dataduct.database.parsers.utils 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | 63 | Module contents 64 | --------------- 65 | 66 | .. automodule:: dataduct.database.parsers 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /dataduct/steps/emr_job.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for EmrActivity can be executed on EMR Cluster 3 | """ 4 | from .etl_step import ETLStep 5 | from ..pipeline import EmrActivity 6 | from ..utils import constants as const 7 | 8 | 9 | class EMRJobStep(ETLStep): 10 | """EMR Step class that helps run a step on the emr cluster 11 | """ 12 | 13 | def __init__(self, 14 | step_string, 15 | **kwargs): 16 | """Constructor for the EMRJobStep class 17 | 18 | Args: 19 | step_string(str): Step string for the emr job to be executed 20 | **kwargs(optional): Keyword arguments directly passed to base class 21 | 22 | Note: 23 | In the step_string all comma within arguments should be escaped 24 | using 4 backslashes 25 | """ 26 | super(EMRJobStep, self).__init__(**kwargs) 27 | 28 | self.activity = self.create_pipeline_object( 29 | object_class=EmrActivity, 30 | resource=self.resource, 31 | worker_group=self.worker_group, 32 | input_node=self.input, 33 | schedule=self.schedule, 34 | emr_step_string=step_string, 35 | output_node=self.output, 36 | depends_on=self.depends_on, 37 | max_retries=self.max_retries 38 | ) 39 | 40 | @classmethod 41 | def arguments_processor(cls, etl, input_args): 42 | """Parse the step arguments according to the ETL pipeline 43 | 44 | Args: 45 | etl(ETLPipeline): Pipeline object containing resources and steps 46 | step_args(dict): Dictionary of the step arguments for the class 47 | """ 48 | step_args = cls.base_arguments_processor( 49 | etl, input_args, resource_type=const.EMR_CLUSTER_STR) 50 | 51 | return step_args 52 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============= 3 | 4 | `Dataduct `__ is a wrapper built 5 | on top of `AWS 6 | Datapipeline `__ 7 | which makes it easy to create ETL jobs. All jobs can be specified as a 8 | series of steps in a YAML file and would automatically be translated 9 | into datapipeline with appropriate pipeline objects. 10 | 11 | Features include: 12 | 13 | - Visualizing pipeline activities 14 | - Extracting data from different sources such as RDS, S3, local files 15 | - Transforming data using EC2 and EMR 16 | - Loading data into redshift 17 | - Transforming data inside redshift 18 | - QA data between the source system and warehouse 19 | It is easy to create custom steps to augment the DSL as per the 20 | requirements. As well as running a backfill with the command line 21 | interface. 22 | 23 | An example ETL from RDS would look like: 24 | 25 | .. 
code:: YAML 26 | 27 | name: example_upsert 28 | frequency: daily 29 | load_time: 01:00 # Hour:Min in UTC 30 | 31 | steps: 32 | - step_type: extract-rds 33 | host_name: test_host 34 | database: test_database 35 | sql: | 36 | SELECT * 37 | FROM test_table; 38 | 39 | - step_type: create-load-redshift 40 | table_definition: tables/dev.test_table.sql 41 | 42 | - step_type: upsert 43 | source: tables/dev.test_table.sql 44 | destination: tables/dev.test_table_2.sql 45 | 46 | This would first perform an extraction from the RDS database with the 47 | ``extract-rds`` step using the ``COPY ACTIVITY``. Then load the data 48 | into the ``dev.test_table`` in redshift with the 49 | ``create-load-redshift``. Then perform an ``upsert`` with the data into 50 | the ``test_table_2``. 51 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup file for installation of the dataduct code 3 | """ 4 | from setuptools import find_packages 5 | from setuptools import setup 6 | 7 | from dataduct import __version__ as version 8 | 9 | setup( 10 | name='dataduct', 11 | version=version, 12 | author='Coursera Inc.', 13 | packages=find_packages( 14 | exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), 15 | namespace_packages=['dataduct'], 16 | include_package_data=True, 17 | url='https://github.com/coursera/dataduct', 18 | long_description=open('README.rst').read(), 19 | author_email='data-infra@coursera.org', 20 | license='Apache License 2.0', 21 | description='DataPipeline for Humans', 22 | install_requires=[ 23 | 'boto>=2.38', 24 | 'MySQL-python>=1.2.3', 25 | 'matplotlib==1.5.3', 26 | 'pandas==0.18.1', 27 | 'psycopg2==2.6.0', 28 | 'pyparsing>=1.5.6', 29 | 'pytimeparse>=1.1.4', 30 | 'PyYAML>=3.11', 31 | 'testfixtures>=4.1.2', 32 | 'pyprind' 33 | ], 34 | scripts=['bin/dataduct'], 35 | classifiers=[ 36 | 'Development Status :: 5 - Production/Stable', 37 | 'Intended Audience :: Developers', 38 | 'License :: OSI Approved :: Apache Software License', 39 | 'Natural Language :: English', 40 | 'Operating System :: MacOS', 41 | 'Operating System :: MacOS :: MacOS 9', 42 | 'Operating System :: MacOS :: MacOS X', 43 | 'Operating System :: Unix', 44 | 'Programming Language :: Python :: 2.7', 45 | 'Programming Language :: Unix Shell', 46 | 'Topic :: Database', 47 | 'Topic :: Scientific/Engineering', 48 | 'Topic :: Scientific/Engineering :: Information Analysis', 49 | 'Topic :: Scientific/Engineering :: Visualization', 50 | 'Topic :: Utilities', 51 | ], 52 | ) 53 | -------------------------------------------------------------------------------- /docs/dataduct.database.rst: -------------------------------------------------------------------------------- 1 | dataduct.database package 2 | ========================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | dataduct.database.parsers 10 | dataduct.database.sql 11 | dataduct.database.tests 12 | 13 | Submodules 14 | ---------- 15 | 16 | dataduct.database.column module 17 | ------------------------------- 18 | 19 | .. automodule:: dataduct.database.column 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | dataduct.database.database module 25 | --------------------------------- 26 | 27 | .. automodule:: dataduct.database.database 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | dataduct.database.history_table module 33 | -------------------------------------- 34 | 35 | .. 
automodule:: dataduct.database.history_table 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | 40 | dataduct.database.relation module 41 | --------------------------------- 42 | 43 | .. automodule:: dataduct.database.relation 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | 48 | dataduct.database.select_statement module 49 | ----------------------------------------- 50 | 51 | .. automodule:: dataduct.database.select_statement 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | 56 | dataduct.database.table module 57 | ------------------------------ 58 | 59 | .. automodule:: dataduct.database.table 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | 64 | dataduct.database.view module 65 | ----------------------------- 66 | 67 | .. automodule:: dataduct.database.view 68 | :members: 69 | :undoc-members: 70 | :show-inheritance: 71 | 72 | 73 | Module contents 74 | --------------- 75 | 76 | .. automodule:: dataduct.database 77 | :members: 78 | :undoc-members: 79 | :show-inheritance: 80 | -------------------------------------------------------------------------------- /dataduct/pipeline/redshift_node.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for RedshiftNode 3 | """ 4 | 5 | from .pipeline_object import PipelineObject 6 | from .schedule import Schedule 7 | from ..utils.exceptions import ETLInputError 8 | 9 | 10 | class RedshiftNode(PipelineObject): 11 | """Redshift Data Node class 12 | """ 13 | 14 | def __init__(self, 15 | id, 16 | schedule, 17 | redshift_database, 18 | schema_name, 19 | table_name): 20 | """Constructor for the RedshiftNode class 21 | 22 | Args: 23 | id(str): id of the object 24 | schedule(Schedule): pipeline schedule 25 | redshift_database(RedshiftDatabase): database for the node 26 | schema_name(str): schema for node to extract or load data 27 | table_name(str): table for node to extract or load data 28 | """ 29 | 30 | # Validate inputs 31 | if not isinstance(schedule, Schedule): 32 | raise ETLInputError( 33 | 'Input schedule must be of the type Schedule') 34 | 35 | super(RedshiftNode, self).__init__( 36 | id=id, 37 | type='RedshiftDataNode', 38 | schedule=schedule, 39 | database=redshift_database, 40 | schemaName=schema_name, 41 | tableName=table_name, 42 | ) 43 | 44 | @property 45 | def schema(self): 46 | """Get the schema name for the redshift node 47 | 48 | Returns: 49 | result(str): schema name for this redshift node 50 | """ 51 | return self['schemaName'] 52 | 53 | @property 54 | def table(self): 55 | """Get the table name for the redshift node 56 | 57 | Returns: 58 | result(str): table name for this redshift node 59 | """ 60 | return self['tableName'] 61 | -------------------------------------------------------------------------------- /dataduct/pipeline/sns_alarm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for sns 3 | """ 4 | 5 | from ..config import Config 6 | from ..utils import constants as const 7 | from .pipeline_object import PipelineObject 8 | 9 | config = Config() 10 | SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) 11 | ROLE = config.etl['ROLE'] 12 | 13 | 14 | class SNSAlarm(PipelineObject): 15 | """SNS object added to all pipelines 16 | """ 17 | 18 | def __init__(self, 19 | id, 20 | pipeline_name=None, 21 | failure_message=None, 22 | topic_arn=None, 23 | **kwargs): 24 | """Constructor for the SNSAlarm class 25 | 26 | Args: 27 | 
id(str): id of the object 28 | pipeline_name(str): name of the pipeline, used in the alert subject and message 29 | failure_message(str): Message used in SNS on pipeline failures 30 | topic_arn(str): arn of the SNS topic to notify, defaults to SNS_TOPIC_ARN_FAILURE from the config 31 | **kwargs(optional): Keyword arguments directly passed to base class 32 | """ 33 | 34 | if not pipeline_name: 35 | pipeline_name = "None" 36 | 37 | if not failure_message: 38 | failure_message = '\n'.join([ 39 | 'Identifier: ' + pipeline_name, 40 | 'Object: #{node.name}', 41 | 'Object Scheduled Start Time: #{node.@scheduledStartTime}', 42 | 'Error Message: #{node.errorMessage}', 43 | 'Error Stack Trace: #{node.errorStackTrace}' 44 | ]) 45 | 46 | subject = 'Data Pipeline %s failed' % pipeline_name 47 | 48 | if topic_arn is None: 49 | topic_arn = SNS_TOPIC_ARN_FAILURE 50 | 51 | super(SNSAlarm, self).__init__( 52 | id=id, 53 | type='SnsAlarm', 54 | topicArn=topic_arn, 55 | role=ROLE, 56 | subject=subject, 57 | message=failure_message, 58 | ) 59 |
-------------------------------------------------------------------------------- /dataduct/config/logger_config.py: --------------------------------------------------------------------------------
1 | """Script that has the base logger configurations 2 | """ 3 | import os 4 | import logging 5 | from logging.handlers import RotatingFileHandler 6 | 7 | from .config import Config 8 | from .constants import CONFIG_DIR 9 | from .constants import LOG_FILE 10 | 11 | FILE_FORMAT_STR = '%(asctime)s [%(levelname)s]: %(message)s ' + \ 12 | '[in %(name)s:%(lineno)d in %(funcName)s]' 13 | CONSOLE_FORMAT_STR = '[%(levelname)s]: %(message)s' 14 | 15 | 16 | def logger_configuration(): 17 | """Set the logger configurations for dataduct 18 | """ 19 | config = Config() 20 | 21 | if not hasattr(config, 'logging'): 22 | raise Exception('logging section is missing in config') 23 | 24 | log_directory = os.path.expanduser(config.logging.get( 25 | 'LOG_DIR', os.path.join('~', CONFIG_DIR))) 26 | file_name = config.logging.get( 27 | 'LOG_FILE', LOG_FILE) 28 | 29 | console_level = config.logging.get( 30 | 'CONSOLE_DEBUG_LEVEL', logging.INFO) 31 | file_level = config.logging.get( 32 | 'FILE_DEBUG_LEVEL', logging.DEBUG) 33 | 34 | if not os.path.exists(log_directory): 35 | os.mkdir(log_directory) 36 | 37 | logger = logging.getLogger() 38 | logger.setLevel(logging.DEBUG) 39 | 40 | file_handler = RotatingFileHandler(os.path.join(log_directory, file_name), 41 | maxBytes=200000, 42 | backupCount=10) 43 | file_handler.setLevel(file_level) 44 | file_handler.setFormatter(logging.Formatter(FILE_FORMAT_STR, 45 | datefmt='%Y-%m-%d %H:%M')) 46 | 47 | console_handler = logging.StreamHandler() 48 | console_handler.setLevel(console_level) 49 | console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT_STR)) 50 | 51 | logger.addHandler(console_handler) 52 | logger.addHandler(file_handler) 53 |
-------------------------------------------------------------------------------- /dataduct/steps/extract_s3.py: --------------------------------------------------------------------------------
1 | """ 2 | ETL step wrapper for creating an S3 node for input 3 | """ 4 | from ..s3 import S3Path 5 | from ..utils.exceptions import ETLInputError 6 | from ..utils.helpers import exactly_one 7 | from ..utils.helpers import get_modified_s3_path 8 | from .etl_step import ETLStep 9 | 10 | 11 | class ExtractS3Step(ETLStep): 12 | """ExtractS3 Step class that helps get data from S3 13 | """ 14 | 15 | def __init__(self, directory_uri=None, file_uri=None, **kwargs): 16 | """Constructor for the ExtractS3Step class 17 | 18 | Args: 19 | directory_uri(str): s3 path for s3
data directory 20 | file_uri(str): s3 path for s3 data file 21 | **kwargs(optional): Keyword arguments directly passed to base class 22 | """ 23 | if not exactly_one(directory_uri, file_uri): 24 | raise ETLInputError('One of file_uri or directory_uri needed') 25 | 26 | super(ExtractS3Step, self).__init__(**kwargs) 27 | 28 | if directory_uri: 29 | directory_uri = get_modified_s3_path(directory_uri) 30 | s3_path = S3Path(uri=directory_uri, is_directory=True) 31 | else: 32 | file_uri = get_modified_s3_path(file_uri) 33 | s3_path = S3Path(uri=file_uri) 34 | self._output = self.create_s3_data_node(s3_path) 35 | 36 | @classmethod 37 | def arguments_processor(cls, etl, input_args): 38 | """Parse the step arguments according to the ETL pipeline 39 | 40 | Args: 41 | etl(ETLPipeline): Pipeline object containing resources and steps 42 | step_args(dict): Dictionary of the step arguments for the class 43 | """ 44 | input_args = cls.pop_inputs(input_args) 45 | step_args = cls.base_arguments_processor(etl, input_args) 46 | step_args.pop('resource', None) 47 | step_args.pop('worker_group', None) 48 | 49 | return step_args 50 | -------------------------------------------------------------------------------- /dataduct/steps/upsert.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for Upsert SQL script 2 | """ 3 | from ..database import HistoryTable 4 | from ..database import SelectStatement 5 | from ..database import SqlScript 6 | from ..database import Table 7 | from ..utils.helpers import exactly_one 8 | from ..utils.helpers import parse_path 9 | from .create_update_sql import CreateUpdateSqlStep 10 | 11 | 12 | class UpsertStep(CreateUpdateSqlStep): 13 | """Upsert Step class that helps run a step on the emr cluster 14 | """ 15 | 16 | def __init__(self, destination, sql=None, script=None, source=None, 17 | enforce_primary_key=True, delete_existing=False, history=None, 18 | analyze_table=True, filter_clause=None, **kwargs): 19 | """Constructor for the UpsertStep class 20 | 21 | Args: 22 | **kwargs(optional): Keyword arguments directly passed to base class 23 | """ 24 | self.s3_source_dir = kwargs['s3_source_dir'] 25 | assert exactly_one(sql, source, script), 'One of sql/source/script' 26 | 27 | # Input formatting 28 | dest = Table(SqlScript(filename=parse_path(destination))) 29 | 30 | if source is not None: 31 | source_relation = Table(SqlScript(filename=parse_path(source))) 32 | else: 33 | source_relation = SelectStatement( 34 | SqlScript(sql=sql, filename=parse_path(script)).sql()) 35 | 36 | # Create the destination table if doesn't exist 37 | sql_script = dest.upsert_script(source_relation, enforce_primary_key, 38 | delete_existing, filter_clause) 39 | 40 | if history: 41 | hist = HistoryTable(SqlScript( 42 | filename=parse_path(history))) 43 | sql_script.append(hist.update_history_script(dest)) 44 | 45 | super(UpsertStep, self).__init__( 46 | table_definition=destination, command=sql_script.sql(), 47 | analyze_table=analyze_table, **kwargs) 48 | -------------------------------------------------------------------------------- /dataduct/steps/qa_transform.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for QA step can be executed on Ec2 resource 3 | """ 4 | from .transform import TransformStep 5 | from ..config import Config 6 | 7 | config = Config() 8 | 9 | 10 | class QATransformStep(TransformStep): 11 | """QATransform Step class that helps run scripts on resouces for QA checks 12 
| """ 13 | 14 | def __init__(self, 15 | id, 16 | pipeline_name, 17 | script_arguments=None, 18 | sns_topic_arn=None, 19 | **kwargs): 20 | """Constructor for the QATransformStep class 21 | 22 | Args: 23 | sns_arn(str): sns topic arn for QA steps 24 | script_arguments(list of str): list of arguments to the script 25 | **kwargs(optional): Keyword arguments directly passed to base class 26 | """ 27 | 28 | if sns_topic_arn is None: 29 | sns_topic_arn = config.etl.get('SNS_TOPIC_ARN_WARNING', None) 30 | 31 | if script_arguments is None: 32 | script_arguments = list() 33 | 34 | script_arguments.append('--test_name=%s' % (pipeline_name + "." + id)) 35 | if sns_topic_arn: 36 | script_arguments.append('--sns_topic_arn=%s' % sns_topic_arn) 37 | 38 | super(QATransformStep, self).__init__( 39 | id=id, 40 | script_arguments=script_arguments, 41 | no_output=True, 42 | **kwargs) 43 | 44 | @classmethod 45 | def arguments_processor(cls, etl, input_args): 46 | """Parse the step arguments according to the ETL pipeline 47 | 48 | Args: 49 | etl(ETLPipeline): Pipeline object containing resources and steps 50 | step_args(dict): Dictionary of the step arguments for the class 51 | """ 52 | input_args = cls.pop_inputs(input_args) 53 | step_args = cls.base_arguments_processor(etl, input_args) 54 | step_args['pipeline_name'] = etl.name 55 | 56 | return step_args 57 | -------------------------------------------------------------------------------- /dataduct/database/view.py: -------------------------------------------------------------------------------- 1 | """Script containing the view class object 2 | """ 3 | from .parsers import parse_create_view 4 | from .sql import SqlScript 5 | from .select_statement import SelectStatement 6 | from .relation import Relation 7 | 8 | 9 | class View(Relation): 10 | """Class representing view in the database 11 | """ 12 | def __init__(self, sql): 13 | """Constructor for view class 14 | """ 15 | super(View, self).__init__() 16 | 17 | if isinstance(sql, SqlScript): 18 | # Take the first statement and ignore the rest 19 | sql = sql.statements[0] 20 | 21 | parameters = parse_create_view(sql.sql()) 22 | 23 | self.sql_statement = sql 24 | self.parameters = parameters 25 | 26 | self.full_name = parameters.get('view_name') 27 | self.replace_flag = parameters.get('replace', False) 28 | 29 | self.select_statement = SelectStatement(parameters.get('select_statement')) 30 | 31 | self.schema_name, self.view_name = self.initialize_name() 32 | 33 | @property 34 | def dependencies(self): 35 | """List of relations which this view references. 
36 | """ 37 | return self.select_statement.dependencies 38 | 39 | @property 40 | def columns(self): 41 | """List of columns in the view's select statement 42 | """ 43 | return self.select_statement.columns 44 | 45 | def drop_script(self): 46 | """Sql script to drop the view 47 | """ 48 | return SqlScript('DROP VIEW IF EXISTS %s CASCADE' % self.full_name) 49 | 50 | def check_not_exists_script(self): 51 | """Sql script to create statement if the table exists or not 52 | """ 53 | return SqlScript(""" 54 | SELECT NOT EXISTS( 55 | SELECT 1 56 | FROM information_schema.views 57 | WHERE table_schema = '%s' 58 | AND table_name = '%s' 59 | ) 60 | """ % (self.schema_name, self.view_name)) 61 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | We really appreciate any help we can get in making dataduct a successful project. 4 | There are a few guidelines that we need contributors to follow so that we can 5 | have a chance of keeping on top of things. 6 | 7 | ## Getting Started 8 | 9 | * Make sure you have a [GitHub account](https://github.com/signup/free) 10 | * Create an issue for the bug, assuming one does not already exist. 11 | * Clearly describe the issue including steps to reproduce when it is a bug. 12 | * Make sure you fill in the earliest version that you know has the issue. 13 | * Fork the repository on GitHub 14 | 15 | ## Making Changes 16 | 17 | * Create a topic branch from where you want to base your work. 18 | * This is usually the master branch. 19 | * Only target release branches if you are certain your fix must be on that 20 | branch. 21 | * To quickly create a topic branch based on master; `git checkout -b 22 | fix/master/my_contribution master`. Please avoid working directly on the 23 | `master` branch. 24 | * Make commits of logical units. 25 | * Check for unnecessary whitespace with `git diff --check` before committing. 26 | * Make sure your commit messages are in the proper format. 27 | * Make sure you have added the necessary tests for your changes. 28 | * Run _all_ the tests to assure nothing else was accidentally broken. 29 | * Make sure all the code follows PEP8 30 | 31 | ## Making Trivial Changes 32 | 33 | ### Documentation 34 | 35 | For changes of a trivial nature to comments and documentation, it is not 36 | always necessary to create a new issue. In this case, it is 37 | appropriate to start the first line of a commit with '(doc)' instead of 38 | a ticket number. 39 | 40 | ## Submitting Changes 41 | 42 | * Push your changes to a topic branch in your fork of the repository. 43 | * Submit a pull request to the repository in the coursera organization. 44 | * Reference the issue you created in the pull requrest 45 | 46 | # Additional Resources 47 | 48 | * [General GitHub documentation](http://help.github.com/) 49 | * [GitHub pull request documentation](http://help.github.com/send-pull-requests/) 50 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Installation using pip 5 | ---------------------- 6 | 7 | Dataduct can easily be installed using pip with the following commands. 
8 | 9 | :: 10 | 11 | pip install dataduct 12 | 13 | The major dependencies of dataduct are: 14 | 15 | - ``boto`` greater than version 2.34; older versions are missing some 16 | of the EMR functionality 17 | - ``PyYAML`` 18 | - ``pandas`` 19 | - ``psycopg2`` 20 | - ``pytimeparse`` 21 | - ``MySQL-python`` 22 | - ``pyparsing`` 23 | - ``testfixtures`` 24 | 25 | Ensure that a boto config file 26 | containing proper AWS credentials is present. 27 | 28 | The visualizations are created using: 29 | 30 | - ``graphviz`` 31 | - ``pygraphviz`` 32 | 33 | Autocomplete for the CLI is supported using: 34 | 35 | - ``argcomplete`` 36 | 37 | The documentation is created using: 38 | 39 | - ``sphinx`` 40 | - ``sphinx-napoleon`` 41 | - ``sphinx_rtd_theme`` 42 | 43 | Installing in the developer environment 44 | --------------------------------------- 45 | 46 | 1. Clone the Repo 47 | ^^^^^^^^^^^^^^^^^ 48 | 49 | :: 50 | 51 | git clone https://github.com/coursera/dataduct.git 52 | 53 | 2. Update PATH and PYTHONPATH 54 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 55 | 56 | Add these lines into your ``.bash_profile`` or ``.zshrc``, etc., based on 57 | your shell type. 58 | 59 | :: 60 | 61 | export PYTHONPATH=~/dataduct:$PYTHONPATH 62 | export PATH=~/dataduct/bin:$PATH 63 | 64 | 3. Config 65 | ^^^^^^^^^ 66 | 67 | Create a config file. Instructions for this are provided in the config 68 | section. 69 | 70 | Setup Autocomplete 71 | ------------------ 72 | 73 | Install argcomplete with ``pip install argcomplete``. 74 | 75 | If you're using ``bash`` then add the following to your 76 | ``.bash_profile``: 77 | 78 | :: 79 | 80 | eval "$(register-python-argcomplete dataduct)" 81 | 82 | If you're using ``zsh`` then add the following line to your ``.zshrc``: 83 | 84 | :: 85 | 86 | autoload bashcompinit 87 | bashcompinit 88 | eval "$(register-python-argcomplete dataduct)" 89 | -------------------------------------------------------------------------------- /dataduct/data_access/tests/test_connection.py: -------------------------------------------------------------------------------- 1 | """Tests for the connection file 2 | """ 3 | from unittest import TestCase 4 | from nose.tools import eq_ 5 | from nose.tools import raises 6 | 7 | from ...config import Config 8 | from ...utils.exceptions import ETLConfigError 9 | from ..
import connection 10 | 11 | 12 | class TestConnection(TestCase): 13 | """Tests for the connection file 14 | """ 15 | @staticmethod 16 | def test_get_redshift_config_correctly_returns(): 17 | """Tests that get_redshift_config can successfully retrieve the 18 | redshift config 19 | """ 20 | config = Config() 21 | config.redshift = 'test' 22 | eq_(connection.get_redshift_config(), 'test') 23 | 24 | @staticmethod 25 | @raises(ETLConfigError) 26 | def test_get_redshift_config_no_config_raises(): 27 | """Tests that get_redshift_config raises an exception if the redshift 28 | config cannot be found 29 | """ 30 | config = Config() 31 | del config.redshift 32 | connection.get_redshift_config() 33 | 34 | @staticmethod 35 | @raises(ETLConfigError) 36 | def test_sql_config_no_config_raises(): 37 | """Tests that get_sql_config raises an exception if the config cannot 38 | be found 39 | """ 40 | config = Config() 41 | del config.mysql 42 | connection.get_sql_config('test') 43 | 44 | @staticmethod 45 | @raises(ETLConfigError) 46 | def test_sql_config_cannot_find_hostname_raises(): 47 | """Tests that get_sql_config raises an exception if the hostname is not 48 | in the config 49 | """ 50 | config = Config() 51 | config.mysql = {'test': {}} 52 | connection.get_sql_config('test1') 53 | 54 | @staticmethod 55 | def test_sql_config_correctly_returns(): 56 | """Tests that get_sql_config can correctly retrieve the config 57 | """ 58 | config = Config() 59 | config.mysql = {'test': {'cred': 'data'}} 60 | result = connection.get_sql_config('test') 61 | eq_(result['DATABASE'], 'test') 62 | eq_(result['cred'], 'data') 63 | -------------------------------------------------------------------------------- /dataduct/database/sql/sql_statement.py: -------------------------------------------------------------------------------- 1 | """Script that contains the sql statement class 2 | """ 3 | from copy import deepcopy 4 | from .utils import sanitize_sql 5 | from ..parsers import parse_create_table 6 | from ..parsers import parse_create_view 7 | 8 | 9 | class SqlStatement(object): 10 | """Class representing a single SQL statement 11 | """ 12 | def __init__(self, sql=None, transactional=False): 13 | """Constructor for the SqlStatement class 14 | """ 15 | if sql is None: 16 | sql = '' 17 | self._raw_sql = sql 18 | self.transactional = transactional 19 | self._raw_statement = self._sanitize_sql() 20 | 21 | def __str__(self): 22 | """Print a SqlStatement object 23 | """ 24 | return self.sql() 25 | 26 | def copy(self): 27 | """Create a copy of the relation object 28 | """ 29 | return deepcopy(self) 30 | 31 | def sql(self): 32 | """Returns the raw_sql for the SqlStatement 33 | """ 34 | return self._raw_statement 35 | 36 | def _sanitize_sql(self): 37 | """Clean the SQL, remove comments and empty statements 38 | """ 39 | if self._raw_sql is None: 40 | return '' 41 | 42 | raw_statements = sanitize_sql(self._raw_sql, self.transactional) 43 | 44 | if len(raw_statements) > 1: 45 | raise ValueError('SQL Statement can not contain more than 1 query') 46 | elif len(raw_statements) == 1: 47 | return raw_statements[0] 48 | else: 49 | return '' 50 | 51 | def _validate_parser(self, func): 52 | """Check if a parser satisfies the sql statement 53 | """ 54 | try: 55 | func(self.sql()) 56 | except Exception: 57 | return False 58 | return True 59 | 60 | def creates_table(self): 61 | """SQL statement creates a table. 62 | """ 63 | return self._validate_parser(parse_create_table) 64 | 65 | def creates_view(self): 66 | """SQL statement creates a view. 
67 | """ 68 | return self._validate_parser(parse_create_view) 69 | -------------------------------------------------------------------------------- /dataduct/database/parsers/tests/test_create_table.py: -------------------------------------------------------------------------------- 1 | """Tests for create table parser 2 | """ 3 | 4 | from unittest import TestCase 5 | from nose.tools import eq_ 6 | from nose.tools import raises 7 | from pyparsing import ParseException 8 | 9 | from ..create_table import parse_create_table 10 | from ..create_table import create_exists_clone 11 | 12 | 13 | class TestCreateTableStatement(TestCase): 14 | """Tests for create table 15 | """ 16 | @staticmethod 17 | def test_basic(): 18 | """Basic test for create table 19 | """ 20 | query = ('CREATE TABLE orders (' 21 | 'customer_id INTEGER DISTKEY PRIMARY KEY,' 22 | 'customer_name VARCHAR(200))') 23 | 24 | output = parse_create_table(query) 25 | 26 | eq_(output['full_name'], 'orders') 27 | eq_(output['temporary'], False) 28 | eq_(output['exists_checks'], False) 29 | eq_(len(output['constraints']), 0) 30 | eq_(len(output['columns']), 2) 31 | 32 | @staticmethod 33 | def test_exists_clone(): 34 | """Basic test for create table clone with exists condition 35 | """ 36 | query = ('CREATE TABLE orders (' 37 | 'customer_id INTEGER DISTKEY PRIMARY KEY,' 38 | 'customer_name VARCHAR(200))') 39 | 40 | exists_clone = create_exists_clone(query) 41 | output = parse_create_table(exists_clone) 42 | eq_(output['full_name'], 'orders') 43 | eq_(output['temporary'], False) 44 | eq_(output['exists_checks'], True) 45 | 46 | @staticmethod 47 | @raises(ParseException) 48 | def test_bad_input(): 49 | """Feeding malformed input into create table 50 | """ 51 | query = 'CREATE TABLE orders (' +\ 52 | 'customer_id INTEGER DISTKEY PRIMARY KEY' 53 | parse_create_table(query) 54 | 55 | @staticmethod 56 | @raises(ParseException) 57 | def test_bad_input_in_columns(): 58 | """Feeding malformed input into create table 59 | """ 60 | query = 'CREATE TABLE orders (' +\ 61 | 'customer_id NEGATIVE DISTKEY PRIMARY KEY)' 62 | parse_create_table(query) 63 | -------------------------------------------------------------------------------- /dataduct/qa/count_check.py: -------------------------------------------------------------------------------- 1 | """QA test for comparing number of rows in the source system with the Warehouse 2 | """ 3 | 4 | from .check import Check 5 | from .utils import render_output 6 | 7 | 8 | class CountCheck(Check): 9 | """QA test for comparing number of rows across the ETL 10 | """ 11 | def __init__(self, source_count, destination_count, **kwargs): 12 | """Constructor for the Count based QA 13 | 14 | Args: 15 | source_count(int): Count of objects in the source system 16 | destination_count(int): Count of objects in the warehouse 17 | """ 18 | super(CountCheck, self).__init__(**kwargs) 19 | self.source_count = source_count 20 | self.destination_count = destination_count 21 | 22 | @property 23 | def error_rate(self): 24 | """The error rate. 25 | If there are no values in the source or destination, the error is 0. 
26 | If there are no values in the source but some in the destination, 27 | the error is None 28 | """ 29 | return self.calculate_error_rate(self.source_count, 30 | self.destination_count) 31 | 32 | @staticmethod 33 | def calculate_error_rate(source_count, destination_count): 34 | """Calculate the error rate based on the source and destination counts 35 | """ 36 | if source_count > 0: 37 | error_difference = float(source_count - destination_count) 38 | return abs(error_difference * 100) / source_count 39 | elif destination_count == 0: 40 | return 0 41 | else: 42 | return None 43 | 44 | @property 45 | def summary(self): 46 | """Summary of the test results for the SNS message 47 | """ 48 | return render_output( 49 | [ 50 | 'Test Name: %s' % self.name, 51 | 'Success: %s' % self.success, 52 | 'Tolerance: %0.4f%%' % self.tolerance, 53 | 'Error Rate: %0.4f%%' % self.error_rate, 54 | 'Source Count: %d' % self.source_count, 55 | 'Destination Count: %d' % self.destination_count, 56 | ] 57 | ) 58 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # Changes in dataduct 2 | 3 | ### 0.5.0 4 | - Cleanup commands being passed in QA steps 5 | - Add support for postgres 6 | - Status bar for uploading large files 7 | - Minor bug fixes 8 | 9 | ### 0.4.0 10 | - Support for starting database shell from dataduct CLI 11 | - Fix bug in logger configuration 12 | - More performance tuning for analyze and vacuum 13 | - Improved subject line for SNS messages 14 | - More informed logging for load errors 15 | - Improvements to decorators 16 | - PK enforcement changes 17 | - New load-reload-pk step 18 | - Support for worker groups 19 | - Steps to move away from scripts to all code being contained in the library 20 | 21 | ### 0.3.0 22 | - More documentation 23 | - Bug fixes in SQL parser 24 | - Hooks framework 25 | - Default bootstrap 26 | - Teardown 27 | - Frequency fixes 28 | 29 | ### 0.2.0 30 | - Travis integration for continuous builds 31 | - QA steps and logging to S3 32 | - Visualizing pipeline 33 | - Dataduct CLI updated as a single entry point 34 | - RDS connections for scripts 35 | - Bootstrap step for pipelines 36 | - Backfill or delay activation 37 | - Output path and input path options 38 | - Script directory for transform step 39 | - SQL sanitization for DBA actions 40 | - SQL parser for select and create table statements 41 | - Logging across the library 42 | - Support for custom steps 43 | - Pipeline dependency step 44 | - Reduce verbosity of imports 45 | - Step parsing is isolated in steps 46 | - More examples for steps 47 | - Sync config with S3 48 | - Config overrides with modes 49 | - Rename keywords and safe config failure handling 50 | - EMR Streaming support with hadoop 2 51 | - Exceptions cleanup 52 | - Read the docs support 53 | - Creating tables automatically for various steps 54 | - History table support 55 | - EC2 and EMR config control from YAML 56 | - Slack integration 57 | - Support for Regions in DP 58 | 59 | ### 0.1.0 60 | - Initial version of the dataduct library released 61 | - Support for the following steps: 62 | - emr_streaming 63 | - extract-local 64 | - extract-s3 65 | - extract-rds 66 | - extract-redshift 67 | - load-redshift 68 | - sql-command 69 | - transform 70 | - Examples and documentation added for all the steps 71 | -------------------------------------------------------------------------------- /dataduct/tests/test_import.py:
-------------------------------------------------------------------------------- 1 | """Tests for dependencies 2 | """ 3 | from unittest import TestCase 4 | 5 | 6 | class TestImports(TestCase): 7 | """Tests for dependencies 8 | """ 9 | @staticmethod 10 | def test_boto(): 11 | """Testing boto 12 | """ 13 | print 'Trying to import boto' 14 | import boto 15 | 16 | @staticmethod 17 | def test_mysqldb(): 18 | """Testing MySQLdb 19 | """ 20 | print 'Trying to import MySQLdb' 21 | import MySQLdb 22 | 23 | @staticmethod 24 | def test_pandas(): 25 | """Testing pandas 26 | """ 27 | print 'Trying to import pandas' 28 | import pandas 29 | print pandas.io.sql 30 | 31 | @staticmethod 32 | def test_psycopg2(): 33 | """Testing psycopg2 34 | """ 35 | print 'Trying to import psycopg2' 36 | import psycopg2 37 | 38 | @staticmethod 39 | def test_pygraphviz(): 40 | """Testing pygraphviz 41 | """ 42 | print 'Trying to import pygraphviz' 43 | import pygraphviz 44 | 45 | @staticmethod 46 | def test_pyparsing(): 47 | """Testing pyparsing 48 | """ 49 | print 'Trying to import pyparsing' 50 | import pyparsing 51 | 52 | @staticmethod 53 | def test_pyyaml(): 54 | """Testing PyYAML 55 | """ 56 | print 'Trying to import pyyaml' 57 | import yaml 58 | 59 | @staticmethod 60 | def test_setuptools(): 61 | """Testing setuptools 62 | """ 63 | print 'Trying to import setuptools' 64 | import setuptools 65 | 66 | @staticmethod 67 | def test_sphinx_rtd_theme(): 68 | """Testing sphinx_rtd_theme 69 | """ 70 | print 'Trying to import sphinx_rtd_theme' 71 | import sphinx_rtd_theme 72 | 73 | @staticmethod 74 | def test_testfixtures(): 75 | """Testing testfixtures 76 | """ 77 | print 'Trying to import testfixtures' 78 | import testfixtures 79 | 80 | @staticmethod 81 | def test_pytimeparse(): 82 | """Testing pytimeparse 83 | """ 84 | print 'Trying to import pytimeparse' 85 | import pytimeparse 86 | -------------------------------------------------------------------------------- /dataduct/utils/hook.py: -------------------------------------------------------------------------------- 1 | """Hook framework in dataduct. 2 | 3 | To make a function hookable, add the hook decorator like so: 4 | 5 | @hook('hook_name') 6 | def function(): 7 | ... 8 | """ 9 | import os 10 | import imp 11 | import sys 12 | 13 | from .helpers import parse_path 14 | 15 | 16 | def default_before_hook(*args, **kwargs): 17 | """The default before hook, will act like it's not even there 18 | """ 19 | return args, kwargs 20 | 21 | 22 | def default_after_hook(result): 23 | """The default after hook, will act like it's not even there 24 | """ 25 | return result 26 | 27 | 28 | def get_hooks(hook_name): 29 | """Returns the before hook and after hook (in a tuple) for a particular 30 | hook name 31 | """ 32 | from dataduct.config import Config 33 | config = Config() 34 | 35 | if 'HOOKS_BASE_PATH' not in config.etl: 36 | return default_before_hook, default_after_hook 37 | 38 | hook_file = parse_path(hook_name + '.py', 'HOOKS_BASE_PATH') 39 | if not os.path.isfile(hook_file): 40 | return default_before_hook, default_after_hook 41 | 42 | # Delete the previous custom hook, so the imports are not merged. 
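    # Python 2's imp.load_source() re-initialises an already-imported module
    # in place without clearing its namespace, so functions defined by a
    # previously loaded hook file (e.g. its before_hook) could otherwise leak
    # into the next hook loaded under the same 'custom_hook' module name.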
43 | if 'custom_hook' in sys.modules: 44 | del sys.modules['custom_hook'] 45 | 46 | # Get the hook functions, falling back to the default hooks 47 | custom_hook = imp.load_source('custom_hook', hook_file) 48 | before_hook = getattr(custom_hook, 'before_hook', default_before_hook) 49 | after_hook = getattr(custom_hook, 'after_hook', default_after_hook) 50 | 51 | return before_hook, after_hook 52 | 53 | 54 | def hook(hook_name): 55 | """The hook decorator creator 56 | """ 57 | before_hook, after_hook = get_hooks(hook_name) 58 | 59 | def hook_decorator(func): 60 | """The hook decorator 61 | """ 62 | def function_wrapper(*args, **kwargs): 63 | """The hook wrapper for the function 64 | """ 65 | new_args, new_kwargs = before_hook(*args, **kwargs) 66 | result = func(*new_args, **new_kwargs) 67 | new_result = after_hook(result) 68 | return new_result 69 | 70 | return function_wrapper 71 | 72 | return hook_decorator 73 | -------------------------------------------------------------------------------- /dataduct/pipeline/mysql_node.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for MysqlNode 3 | """ 4 | 5 | from .pipeline_object import PipelineObject 6 | from .schedule import Schedule 7 | from ..utils.exceptions import ETLInputError 8 | 9 | 10 | class MysqlNode(PipelineObject): 11 | """MySQL Data Node class 12 | """ 13 | 14 | def __init__(self, id, schedule, host, database, username, password, sql, 15 | table, depends_on=None): 16 | """Constructor for the MysqlNode class 17 | 18 | Args: 19 | id(str): id of the object 20 | schedule(Schedule): pipeline schedule 21 | host(str): hostname for the mysql database 22 | database(str): database name on the RDS host 23 | user(str): username for the database 24 | password(str): password for the database 25 | sql(str): sql to be executed 26 | table(str): table to be read 27 | """ 28 | 29 | # Validate inputs 30 | if not isinstance(schedule, Schedule): 31 | raise ETLInputError( 32 | 'Input schedule must be of the type Schedule') 33 | 34 | if not depends_on: 35 | depends_on = list() 36 | 37 | connection_string = "jdbc:mysql://" + host + ":3306/" + database 38 | 39 | kwargs = { 40 | 'id': id, 41 | 'type': 'SqlDataNode', 42 | 'schedule': schedule, 43 | 'connectionString': connection_string, 44 | 'username': username, 45 | '*password': password, 46 | 'selectQuery': sql, 47 | 'table': table, 48 | 'dependsOn': depends_on, 49 | } 50 | super(MysqlNode, self).__init__(**kwargs) 51 | 52 | @property 53 | def database(self): 54 | """Get the database name for the MySQL node 55 | 56 | Returns: 57 | result(str): database name for this MySQL node 58 | """ 59 | return self['connectionString'].split("/").pop() 60 | 61 | @property 62 | def table(self): 63 | """Get the table name for the MySQL node 64 | 65 | Returns: 66 | result(str): table name for this MySQL node 67 | """ 68 | return self['tableName'] 69 | -------------------------------------------------------------------------------- /dataduct/steps/create_load_redshift.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for loading into redshift with the COPY command 2 | """ 3 | from ..config import Config 4 | from ..database import SqlStatement 5 | from ..database import Table 6 | from ..utils import constants as const 7 | from ..utils.helpers import parse_path 8 | from .transform import TransformStep 9 | 10 | config = Config() 11 | 12 | 13 | class CreateAndLoadStep(TransformStep): 14 | 
"""CreateAndLoad Step class that creates table if needed and loads data 15 | """ 16 | 17 | def __init__(self, id, table_definition, input_node, 18 | script_arguments=None, **kwargs): 19 | """Constructor for the CreateAndLoadStep class 20 | 21 | Args: 22 | table_definition(filepath): schema file for the table to be loaded 23 | script_arguments(list of str): list of arguments to the script 24 | **kwargs(optional): Keyword arguments directly passed to base class 25 | """ 26 | with open(parse_path(table_definition)) as f: 27 | table_def_string = f.read() 28 | 29 | table = Table(SqlStatement(table_def_string)) 30 | 31 | if isinstance(input_node, dict): 32 | input_paths = [i.path().uri for i in input_node.values()] 33 | else: 34 | input_paths = [input_node.path().uri] 35 | 36 | if script_arguments is None: 37 | script_arguments = list() 38 | 39 | script_arguments.extend([ 40 | '--table_definition=%s' % table.sql().sql(), 41 | '--s3_input_paths'] + input_paths) 42 | 43 | super(CreateAndLoadStep, self).__init__( 44 | id=id, command=const.LOAD_COMMAND, 45 | script_arguments=script_arguments, no_input=True, no_output=True, 46 | **kwargs) 47 | 48 | @classmethod 49 | def arguments_processor(cls, etl, input_args): 50 | """Parse the step arguments according to the ETL pipeline 51 | 52 | Args: 53 | etl(ETLPipeline): Pipeline object containing resources and steps 54 | step_args(dict): Dictionary of the step arguments for the class 55 | """ 56 | step_args = cls.base_arguments_processor(etl, input_args) 57 | 58 | return step_args 59 | -------------------------------------------------------------------------------- /dataduct/etl/tests/test_etl_pipeline.py: -------------------------------------------------------------------------------- 1 | """Tests for the ETL Pipeline object 2 | """ 3 | import unittest 4 | from nose.tools import raises 5 | from nose.tools import eq_ 6 | 7 | from datetime import timedelta 8 | from ..etl_pipeline import ETLPipeline 9 | from ...utils.exceptions import ETLInputError 10 | 11 | 12 | class EtlPipelineTests(unittest.TestCase): 13 | """Tests for the ETL Pipeline object 14 | """ 15 | 16 | def setUp(self): 17 | """Setup text fixtures 18 | """ 19 | self.default_pipeline = ETLPipeline('test_pipeline') 20 | 21 | @staticmethod 22 | def test_construct_etl_pipeline(): 23 | """Test if the constructor for EtlPipeline is correct 24 | """ 25 | result = ETLPipeline( 26 | 'test_pipeline', 27 | frequency='one-time', 28 | ec2_resource_config={'terminate_after':'2 Hours'}, 29 | time_delta=timedelta(seconds=3600), 30 | emr_cluster_config={'cfg1': 'value'}, 31 | load_time='12:34', 32 | topic_arn='sns:topic-arn:test-case', 33 | max_retries=5, 34 | bootstrap={'cfg1': 'value'}, 35 | ) 36 | assert result.name.endswith('test_pipeline') 37 | eq_(result.frequency, 'one-time') 38 | eq_(result.ec2_resource_config, {'terminate_after':'2 Hours'}) 39 | eq_(result.load_hour, 12) 40 | eq_(result.load_min, 34) 41 | eq_(result.time_delta, timedelta(seconds=3600)) 42 | eq_(result.max_retries, 5) 43 | eq_(result.topic_arn, 'sns:topic-arn:test-case') 44 | eq_(result.bootstrap_definitions, {'cfg1': 'value'}) 45 | eq_(result.emr_cluster_config, {'cfg1': 'value'}) 46 | 47 | @staticmethod 48 | def test_no_load_time_default_none(): 49 | """Test if the load_hour and load_min get set to None 50 | if load_time is None 51 | """ 52 | result = ETLPipeline('no_load_time_pipeline', load_time=None) 53 | eq_(result.load_hour, None) 54 | eq_(result.load_min, None) 55 | 56 | @raises(ETLInputError) 57 | def 
test_bad_data_type_throws(self): 58 | """Test that exception is thrown if the data_type parameter for 59 | _s3_uri is bad 60 | """ 61 | self.default_pipeline._s3_uri('TEST_DATA_TYPE') 62 | -------------------------------------------------------------------------------- /dataduct/pipeline/redshift_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for redshift database 3 | """ 4 | 5 | from ..config import Config 6 | from .pipeline_object import PipelineObject 7 | from ..utils.exceptions import ETLConfigError 8 | 9 | config = Config() 10 | 11 | if not hasattr(config, 'redshift'): 12 | raise ETLConfigError('Redshift credentials missing from config') 13 | 14 | CLUSTER_ID = None 15 | CONNECTION_STRING = None 16 | DATABASE_NAME = config.redshift['DATABASE_NAME'] 17 | USERNAME = config.redshift['USERNAME'] 18 | PASSWORD = config.redshift['PASSWORD'] 19 | 20 | 21 | if 'CLUSTER_ID' in config.redshift and 'CONNECTION_STRING' in config.redshift: 22 | raise ETLConfigError('Redshift credentials - specify only one of CLUSTER_ID or CONNECTION_STRING in config') 23 | elif 'CLUSTER_ID' in config.redshift: 24 | CLUSTER_ID = config.redshift['CLUSTER_ID'] 25 | elif 'CONNECTION_STRING' in config.redshift: 26 | CONNECTION_STRING = config.redshift['CONNECTION_STRING'] 27 | 28 | class RedshiftDatabase(PipelineObject): 29 | """Redshift database class 30 | """ 31 | 32 | def __init__(self, 33 | id, 34 | database_name=DATABASE_NAME, 35 | cluster_id=CLUSTER_ID, 36 | connection_string=CONNECTION_STRING, 37 | username=USERNAME, 38 | password=PASSWORD): 39 | """Constructor for the RedshiftDatabase class 40 | 41 | Args: 42 | id(str): id of the object 43 | database_name(str): name of the database 44 | cluster_id(str): identifier for the redshift cluster in AWS 45 | connection_string(str): JDBC connection string for the Redshift cluster.
46 | username(str): username for the database 47 | password(str): password for the database 48 | """ 49 | 50 | kwargs = { 51 | 'id': id, 52 | 'type': 'RedshiftDatabase', 53 | 'databaseName': database_name, 54 | 'username': username, 55 | '*password': password 56 | } 57 | 58 | if CLUSTER_ID: 59 | kwargs['clusterId'] = CLUSTER_ID 60 | else: 61 | kwargs['connectionString'] = CONNECTION_STRING 62 | 63 | super(RedshiftDatabase, self).__init__(**kwargs) 64 | -------------------------------------------------------------------------------- /dataduct/database/sql/tests/test_sql_statement.py: -------------------------------------------------------------------------------- 1 | """Tests for the SqlStatement class 2 | """ 3 | from nose.tools import assert_not_equal 4 | from nose.tools import eq_ 5 | from nose.tools import raises 6 | from unittest import TestCase 7 | 8 | from ..sql_statement import SqlStatement 9 | 10 | 11 | class TestSqlStatement(TestCase): 12 | """Tests for sql statement function 13 | """ 14 | @staticmethod 15 | def test_basic(): 16 | """Basic test for statement declaration 17 | """ 18 | query = 'select \n 1;' 19 | result = 'select 1' 20 | 21 | eq_(SqlStatement(query).sql(), result) 22 | 23 | @staticmethod 24 | def test_sanatization(): 25 | """Sanatization of comments 26 | """ 27 | query = 'select 1 -- test connect \n;' 28 | result = 'select 1' 29 | 30 | eq_(SqlStatement(query).sql(), result) 31 | 32 | @staticmethod 33 | def test_sanatization_multiline_comment(): 34 | """Sanatization of comments 35 | """ 36 | query = '/* Comment */\n select 1;' 37 | result = 'select 1' 38 | 39 | eq_(SqlStatement(query).sql(), result) 40 | 41 | @staticmethod 42 | def test_sanatization_multiline_comment_nesting(): 43 | """Sanatization of comments 44 | """ 45 | query = '/* Comment /* nest */ */\n select 1;' 46 | result = 'select 1' 47 | 48 | eq_(SqlStatement(query).sql(), result) 49 | 50 | @staticmethod 51 | def test_sanatization_multiline_comment_partial_nesting(): 52 | """Sanatization of comments 53 | This is a test to highlight issue #134 which was marked as won't fix 54 | """ 55 | query = '/* Comment /* nest */\n select 1;' 56 | result = 'select 1' 57 | parsed_output = '/* Comment select 1' 58 | 59 | eq_(SqlStatement(query).sql(), parsed_output) 60 | assert_not_equal(SqlStatement(query).sql(), result) 61 | 62 | @staticmethod 63 | @raises(ValueError) 64 | def test_error(): 65 | """Raise error if multiple queries are passed 66 | """ 67 | query = 'select 1; select 2;' 68 | SqlStatement(query) 69 | 70 | @staticmethod 71 | def test_empty_declaration(): 72 | """Empty if no sql query is passed 73 | """ 74 | eq_(SqlStatement().sql(), '') 75 | -------------------------------------------------------------------------------- /docs/hooks.rst: -------------------------------------------------------------------------------- 1 | Hooks 2 | ===== 3 | 4 | Dataduct has some endpoints you can use to execute python scripts before and 5 | after certain events when using the CLI and library locally. 6 | 7 | Available Hooks 8 | ~~~~~~~~~~~~~~~ 9 | 10 | - ``activate_pipeline``, which hooks onto the ``activate_pipeline`` function in 11 | ``dataduct.etl.etl_actions``. 12 | - ``connect_to_redshift``, which hooks onto the ``redshift_connection`` function in 13 | ``dataduct.data_access``. 14 | 15 | Creating a hook 16 | ~~~~~~~~~~~~~~~ 17 | 18 | Dataduct tries to find available hooks by searching in the directory specified 19 | by the ``HOOKS_BASE_PATH`` config variable in the ``etl`` section, matching them 20 | by their filename. 
For example, a hook for the ``activate_pipeline`` 21 | endpoint would saved as ``activate_pipeline.py`` in that directory. 22 | 23 | Each hook has two endpoints: ``before_hook`` and ``after_hook``. To implement 24 | one of these endpoints, you declare them as functions inside the hook. You may 25 | implement only one or both endpoints per hook. 26 | 27 | ``before_hook`` is called before the hooked function is executed. The parameters 28 | passed into the hooked function will also be passed to the ``before_hook``. 29 | The ``before_hook`` is designed to allow you to manipulate the arguments of 30 | the hooked function before it is called. At the end of the ``before_hook``, 31 | return the ``args`` and ``kwargs`` of the hooked function as a tuple. 32 | 33 | Example ``before_hook``: 34 | 35 | .. code:: python 36 | 37 | # hooked function signature: 38 | # def example(arg_one, arg_two, arg_three='foo') 39 | 40 | def before_hook(arg_one, arg_two, arg_three='foo'): 41 | return [arg_one + 1, 'hello world'], {'arg_three': 'bar'} 42 | 43 | ``after_hook`` is called after the hooked function is executed. The result of the 44 | hooked function is passed into ``after_hook`` as a single parameter. 45 | The ``after_hook`` is designed to allow you to access or manipulate the result of 46 | the hooked function. At the end of the ``after_hook``, return the (modified) 47 | result of the hooked function. 48 | 49 | Example ``after_hook``: 50 | 51 | .. code:: python 52 | 53 | # hooked function result: {'foo': 1, 'bar': 'two'} 54 | 55 | def after_hook(result): 56 | result['foo'] = 2 57 | result['bar'] = result['bar'] + ' three' 58 | return result 59 | -------------------------------------------------------------------------------- /dataduct/pipeline/copy_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for CopyActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from .schedule import Schedule 7 | 8 | from ..config import Config 9 | from ..utils import constants as const 10 | from ..utils.exceptions import ETLInputError 11 | 12 | config = Config() 13 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 14 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 15 | 16 | 17 | class CopyActivity(Activity): 18 | """EC2 Resource class 19 | """ 20 | 21 | def __init__(self, 22 | id, 23 | input_node, 24 | output_node, 25 | schedule, 26 | resource=None, 27 | worker_group=None, 28 | max_retries=None, 29 | depends_on=None, 30 | **kwargs): 31 | """Constructor for the CopyActivity class 32 | 33 | Args: 34 | id(str): id of the object 35 | input_node(S3Node / list of S3Nodes): input nodes for the activity 36 | output_node(S3Node / list of S3Nodes): output nodes for activity 37 | schedule(Schedule): schedule of the pipeline 38 | resource(Ec2Resource / EmrResource): resource to run the activity on 39 | worker_group(str): the worker group to run the activity on 40 | max_retries(int): number of retries for the activity 41 | depends_on(list of activities): dependendent pipelines steps 42 | **kwargs(optional): Keyword arguments directly passed to base class 43 | """ 44 | 45 | # Validate inputs 46 | if not isinstance(schedule, Schedule): 47 | raise ETLInputError( 48 | 'Input schedule must be of the type Schedule') 49 | 50 | # Set default values 51 | if depends_on is None: 52 | depends_on = [] 53 | if max_retries is None: 54 | max_retries = MAX_RETRIES 55 | 56 | super(CopyActivity, self).__init__( 57 | id=id, 58 | 
retryDelay=RETRY_DELAY, 59 | type='CopyActivity', 60 | maximumRetries=max_retries, 61 | dependsOn=depends_on, 62 | input=input_node, 63 | output=output_node, 64 | runsOn=resource, 65 | workerGroup=worker_group, 66 | schedule=schedule, 67 | ) 68 | -------------------------------------------------------------------------------- /dataduct/pipeline/emr_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for EmrActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from ..config import Config 7 | from .schedule import Schedule 8 | from ..utils import constants as const 9 | from ..utils.exceptions import ETLInputError 10 | 11 | config = Config() 12 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 13 | 14 | 15 | class EmrActivity(Activity): 16 | """EMR Activity class 17 | """ 18 | 19 | def __init__(self, 20 | id, 21 | schedule, 22 | input_node, 23 | emr_step_string, 24 | resource=None, 25 | worker_group=None, 26 | output_node=None, 27 | additional_files=None, 28 | max_retries=None, 29 | depends_on=None): 30 | """Constructor for the EmrActivity class 31 | 32 | Args: 33 | id(str): id of the object 34 | schedule(Schedule): schedule of the pipeline 35 | emr_step_string(list of str): command string to be executed 36 | resource(Ec2Resource / EMRResource): resource to run the activity on 37 | worker_group(str): the worker group to run the activity on 38 | output_node(S3Node): output_node for the emr job 39 | additional_files(list of S3File): Additional files required for emr 40 | max_retries(int): number of retries for the activity 41 | depends_on(list of activities): dependendent pipelines steps 42 | """ 43 | 44 | # Validate inputs 45 | if not isinstance(schedule, Schedule): 46 | raise ETLInputError( 47 | 'Input schedule must be of the type Schedule') 48 | 49 | # Set default values 50 | if depends_on is None: 51 | depends_on = [] 52 | if max_retries is None: 53 | max_retries = MAX_RETRIES 54 | 55 | super(EmrActivity, self).__init__( 56 | id=id, 57 | type='EmrActivity', 58 | maximumRetries=max_retries, 59 | dependsOn=depends_on, 60 | runsOn=resource, 61 | workerGroup=worker_group, 62 | schedule=schedule, 63 | step=emr_step_string, 64 | output=output_node, 65 | input=input_node, 66 | ) 67 | 68 | self.add_additional_files(additional_files) 69 | -------------------------------------------------------------------------------- /dataduct/pipeline/activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base class for data pipeline instance 3 | """ 4 | 5 | from ..utils.exceptions import ETLInputError 6 | from ..utils.helpers import exactly_one 7 | from .pipeline_object import PipelineObject 8 | 9 | 10 | class Activity(PipelineObject): 11 | """Base class for pipeline activities 12 | """ 13 | 14 | def __init__(self, dependsOn, maximumRetries, runsOn, 15 | workerGroup, **kwargs): 16 | """Constructor for the activity class 17 | 18 | Args: 19 | dependsOn(list): list of dependent activities 20 | maximumRetries(int): maximum number of retries 21 | **kwargs(optional): Keyword arguments directly passed to base class 22 | 23 | Note: 24 | dependsOn and maximum retries are required fields for any activity 25 | """ 26 | if not exactly_one(runsOn, workerGroup): 27 | raise ETLInputError( 28 | 'Exactly one of runsOn or workerGroup allowed!') 29 | 30 | if runsOn: 31 | kwargs['runsOn'] = runsOn 32 | else: 33 | kwargs['workerGroup'] = workerGroup 34 | super(Activity, self).__init__( 35 | 
dependsOn=dependsOn, 36 | maximumRetries=maximumRetries, 37 | **kwargs 38 | ) 39 | 40 | def __str__(self): 41 | try: 42 | return "%s with id %s" % tuple(self.id.split(".", 1)[::-1]) 43 | except: 44 | return self.id 45 | 46 | @property 47 | def input(self): 48 | """Get the input node for the activity 49 | 50 | Returns: 51 | result: Input node for this activity 52 | """ 53 | return self['input'] 54 | 55 | @property 56 | def output(self): 57 | """Get the output node for the activity 58 | 59 | Returns: 60 | result: output node for this activity 61 | """ 62 | return self['output'] 63 | 64 | @property 65 | def depends_on(self): 66 | """Get the dependent activities for the activity 67 | 68 | Returns: 69 | result: dependent activities for this activity 70 | """ 71 | return self['dependsOn'] 72 | 73 | @property 74 | def maximum_retries(self): 75 | """Get the maximum retries for the activity 76 | 77 | Returns: 78 | result: maximum retries for this activity 79 | """ 80 | return self['maximumRetries'] 81 | -------------------------------------------------------------------------------- /dataduct/database/tests/test_table.py: -------------------------------------------------------------------------------- 1 | """Tests for Table 2 | """ 3 | from unittest import TestCase 4 | 5 | from .helpers import create_table 6 | from .helpers import compare_scripts 7 | 8 | 9 | class TestTable(TestCase): 10 | """Tests for tables 11 | """ 12 | 13 | def setUp(self): 14 | """Setup test fixtures for the table tests 15 | """ 16 | self.basic_table = create_table( 17 | 'CREATE TABLE test_table (id INTEGER);') 18 | 19 | def test_unload_script(self): 20 | """Tests if the unload script generates successfully 21 | """ 22 | result = [ 23 | ("UNLOAD ('SELECT * FROM test_table;') TO 's3://test/' " 24 | "CREDENTIALS 'aws_access_key_id=a;aws_secret_access_key=b' " 25 | "DELIMITER '\t' ESCAPE NULL AS 'NULL'") 26 | ] 27 | compare_scripts( 28 | self.basic_table.unload_script('s3://test/', 'a', 'b'), 29 | result) 30 | 31 | def test_unload_script_with_token(self): 32 | """Tests if the unload script generates successfully 33 | """ 34 | result = [ 35 | ("UNLOAD ('SELECT * FROM test_table;') TO 's3://test/' " 36 | "CREDENTIALS " 37 | "'aws_access_key_id=a;aws_secret_access_key=b;token=c' " 38 | "DELIMITER '\t' ESCAPE NULL AS 'NULL'") 39 | ] 40 | compare_scripts( 41 | self.basic_table.unload_script('s3://test/', 'a', 'b', 'c'), 42 | result) 43 | 44 | def test_load_script(self): 45 | """Tests if the unload script generates successfully 46 | """ 47 | result = [ 48 | ("COPY test_table FROM 's3://test/' " 49 | "CREDENTIALS 'aws_access_key_id=a;aws_secret_access_key=b' " 50 | "DELIMITER '\t' ESCAPE NULL AS 'NULL'") 51 | ] 52 | compare_scripts( 53 | self.basic_table.load_script('s3://test/', 'a', 'b'), 54 | result) 55 | 56 | def test_load_script_with_token(self): 57 | """Tests if the unload script generates successfully 58 | """ 59 | result = [ 60 | ("COPY test_table FROM 's3://test/' " 61 | "CREDENTIALS " 62 | "'aws_access_key_id=a;aws_secret_access_key=b;token=c' " 63 | "DELIMITER '\t' ESCAPE NULL AS 'NULL'") 64 | ] 65 | compare_scripts( 66 | self.basic_table.load_script('s3://test/', 'a', 'b', 'c'), 67 | result) 68 | -------------------------------------------------------------------------------- /dataduct/etl/utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for processing etl steps 2 | """ 3 | import imp 4 | from ..config import Config 5 | from ..steps import * # noqa 6 | from 
..utils.helpers import parse_path 7 | from ..utils.exceptions import ETLInputError 8 | 9 | STEP_CLASSES = { 10 | 'column-check': ColumnCheckStep, 11 | 'count-check': CountCheckStep, 12 | 'create-load-redshift': CreateAndLoadStep, 13 | 'create-update-sql': CreateUpdateSqlStep, 14 | 'delta-load': DeltaLoadStep, 15 | 'emr-step': EMRJobStep, 16 | 'emr-streaming': EMRStreamingStep, 17 | 'extract-local': ExtractLocalStep, 18 | 'extract-rds': ExtractRdsStep, 19 | 'extract-redshift': ExtractRedshiftStep, 20 | 'extract-postgres': ExtractPostgresStep, 21 | 'extract-s3': ExtractS3Step, 22 | 'load-redshift': LoadRedshiftStep, 23 | 'load-postgres': LoadPostgresStep, 24 | 'load-reload-pk': LoadReloadAndPrimaryKeyStep, 25 | 'pipeline-dependencies': PipelineDependenciesStep, 26 | 'primary-key-check': PrimaryKeyCheckStep, 27 | 'qa-transform': QATransformStep, 28 | 'reload': ReloadStep, 29 | 'sql-command': SqlCommandStep, 30 | 'transform': TransformStep, 31 | 'upsert': UpsertStep, 32 | } 33 | 34 | 35 | def get_custom_steps(): 36 | """Fetch the custom steps specified in config 37 | """ 38 | config = Config() 39 | custom_steps = dict() 40 | 41 | for step_def in getattr(config, 'custom_steps', list()): 42 | step_type = step_def['step_type'] 43 | path = parse_path(step_def['file_path'], 'CUSTOM_STEPS_PATH') 44 | 45 | # Load source from the file path provided 46 | step_mod = imp.load_source(step_type, path) 47 | 48 | # Get the step class based on class_name provided 49 | step_class = getattr(step_mod, step_def['class_name']) 50 | 51 | # Check if step_class is of type ETLStep 52 | if not issubclass(step_class, ETLStep): 53 | raise ETLInputError('Step type %s is not of type ETLStep' % 54 | step_class.__name__) 55 | 56 | custom_steps[step_type] = step_class 57 | return custom_steps 58 | 59 | 60 | STEP_CONFIG = STEP_CLASSES.copy() 61 | STEP_CONFIG.update(get_custom_steps()) 62 | 63 | 64 | def process_steps(steps_params): 65 | """Format the step parameters by changing step type to step class 66 | """ 67 | steps = [] 68 | for step_param in steps_params: 69 | params = step_param.copy() 70 | step_type = params.pop('step_type') 71 | params['step_class'] = STEP_CONFIG[step_type] 72 | steps.append(params) 73 | return steps 74 | -------------------------------------------------------------------------------- /dataduct/pipeline/sql_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for SqlActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from ..config import Config 7 | from .schedule import Schedule 8 | from ..s3 import S3File 9 | from ..utils import constants as const 10 | from ..utils.exceptions import ETLInputError 11 | 12 | config = Config() 13 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 14 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 15 | 16 | 17 | class SqlActivity(Activity): 18 | """Sql Activity class 19 | """ 20 | 21 | def __init__(self, 22 | id, 23 | schedule, 24 | script, 25 | database, 26 | resource=None, 27 | worker_group=None, 28 | script_arguments=None, 29 | queue=None, 30 | max_retries=None, 31 | depends_on=None): 32 | """Constructor for the SqlActivity class 33 | 34 | Args: 35 | id(str): id of the object 36 | schedule(Schedule): schedule of the pipeline 37 | script(S3File): s3 uri of the script 38 | database(RedshiftDatabase): database to execute commands on 39 | resource(Ec2Resource / EMRResource): resource to run the activity on 40 | worker_group(str): the worker group to run the 
activity on 41 | queue(str): queue in which the query should be executed 42 | max_retries(int): number of retries for the activity 43 | depends_on(list of activities): dependendent pipelines steps 44 | """ 45 | 46 | # Validate inputs 47 | if not isinstance(schedule, Schedule): 48 | raise ETLInputError( 49 | 'Input schedule must be of the type Schedule') 50 | 51 | if not isinstance(script, S3File): 52 | raise ETLInputError('script must be an S3File') 53 | 54 | # Set default values 55 | if depends_on is None: 56 | depends_on = [] 57 | if max_retries is None: 58 | max_retries = MAX_RETRIES 59 | 60 | super(SqlActivity, self).__init__( 61 | id=id, 62 | retryDelay=RETRY_DELAY, 63 | type='SqlActivity', 64 | maximumRetries=max_retries, 65 | dependsOn=depends_on, 66 | runsOn=resource, 67 | workerGroup=worker_group, 68 | schedule=schedule, 69 | scriptUri=script, 70 | scriptArgument=script_arguments, 71 | database=database, 72 | queue=queue 73 | ) 74 | -------------------------------------------------------------------------------- /dataduct/steps/extract_redshift.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for RedshiftCopyActivity to extract data to S3 3 | """ 4 | from .etl_step import ETLStep 5 | from ..pipeline import RedshiftNode 6 | from ..pipeline import RedshiftCopyActivity 7 | 8 | 9 | class ExtractRedshiftStep(ETLStep): 10 | """Extract Redshift Step class that helps get data out of redshift 11 | """ 12 | 13 | def __init__(self, 14 | schema, 15 | table, 16 | redshift_database, 17 | insert_mode="TRUNCATE", 18 | output_path=None, 19 | **kwargs): 20 | """Constructor for the ExtractRedshiftStep class 21 | 22 | Args: 23 | schema(str): schema from which table should be extracted 24 | table(path): table name for extract 25 | insert_mode(str): insert mode for redshift copy activity 26 | redshift_database(RedshiftDatabase): database to excute the query 27 | **kwargs(optional): Keyword arguments directly passed to base class 28 | """ 29 | super(ExtractRedshiftStep, self).__init__(**kwargs) 30 | 31 | # Create input node 32 | self._input_node = self.create_pipeline_object( 33 | object_class=RedshiftNode, 34 | schedule=self.schedule, 35 | redshift_database=redshift_database, 36 | schema_name=schema, 37 | table_name=table, 38 | ) 39 | 40 | self._output = self.create_s3_data_node( 41 | self.get_output_s3_path(output_path)) 42 | 43 | self.create_pipeline_object( 44 | object_class=RedshiftCopyActivity, 45 | max_retries=self.max_retries, 46 | input_node=self.input, 47 | output_node=self.output, 48 | insert_mode=insert_mode, 49 | resource=self.resource, 50 | worker_group=self.worker_group, 51 | schedule=self.schedule, 52 | depends_on=self.depends_on, 53 | command_options=["DELIMITER '\t' ESCAPE"], 54 | ) 55 | 56 | @classmethod 57 | def arguments_processor(cls, etl, input_args): 58 | """Parse the step arguments according to the ETL pipeline 59 | 60 | Args: 61 | etl(ETLPipeline): Pipeline object containing resources and steps 62 | step_args(dict): Dictionary of the step arguments for the class 63 | """ 64 | input_args = cls.pop_inputs(input_args) 65 | step_args = cls.base_arguments_processor(etl, input_args) 66 | step_args['redshift_database'] = etl.redshift_database 67 | 68 | return step_args 69 | -------------------------------------------------------------------------------- /dataduct/database/parsers/tests/test_select_query.py: -------------------------------------------------------------------------------- 1 | """Tests for select statement 
parser 2 | """ 3 | 4 | from nose.tools import eq_ 5 | from nose.tools import raises 6 | from pyparsing import ParseException 7 | from unittest import TestCase 8 | 9 | from ..select_query import parse_column_name 10 | from ..select_query import parse_select_columns 11 | from ..select_query import parse_select_dependencies 12 | 13 | 14 | class TestCreateTableStatement(TestCase): 15 | """Tests for create table 16 | """ 17 | @staticmethod 18 | def test_basic(): 19 | """Basic test for select statement 20 | """ 21 | query = ('SELECT x, y, z AS t FROM abc JOIN pqr USING(y) WHERE x=1') 22 | 23 | dependencies = parse_select_dependencies(query) 24 | eq_(dependencies, ['abc', 'pqr']) 25 | 26 | columns = parse_select_columns(query) 27 | eq_(columns, ['x', 'y', 'z AS t']) 28 | 29 | column_name = parse_column_name(columns[0]) 30 | eq_(column_name, 'x') 31 | 32 | column_name = parse_column_name(columns[2]) 33 | eq_(column_name, 't') 34 | 35 | @staticmethod 36 | @raises(ParseException) 37 | def test_bad_input(): 38 | """Feeding malformed input into create table 39 | """ 40 | query = 'SELECT x, y, z' 41 | parse_select_dependencies(query) 42 | 43 | @staticmethod 44 | def test_columns(): 45 | """Basic test for select statement 46 | """ 47 | query = ('SELECT x' 48 | ',CASE WHEN y=10 THEN 5 ELSE z AS a' 49 | ',CASE WHEN x THEN COUNT(MIN(x,y)) ELSE MIN(x) END AS b' 50 | ',COUNT(1) AS c' 51 | ",CASE WHEN course_platform = 'spark' THEN 'v1-' " 52 | "|| topic_id::VARCHAR ELSE course_id END AS course_id " 53 | 'FROM abc') 54 | 55 | result = [ 56 | 'x', 57 | 'CASE WHEN y=10 THEN 5 ELSE z AS a', 58 | 'CASE WHEN x THEN COUNT(MIN(x,y)) ELSE MIN(x) END AS b', 59 | 'COUNT(1) AS c', 60 | "CASE WHEN course_platform = 'spark' THEN 'v1-' " + 61 | "|| topic_id::VARCHAR ELSE course_id END AS course_id" 62 | ] 63 | 64 | result_names = ['x', 'a', 'b', 'c', 'course_id'] 65 | 66 | columns = parse_select_columns(query) 67 | eq_(columns, result) 68 | 69 | column_names = [parse_column_name(c) for c in columns] 70 | eq_(column_names, result_names) 71 | 72 | @staticmethod 73 | def test_with_query(): 74 | """Basic test for select statement with the with query 75 | """ 76 | query = ('WITH data AS (SELECT x, y FROM xy) SELECT x,y FROM data') 77 | 78 | columns = parse_select_columns(query) 79 | eq_(columns, ['x', 'y']) 80 | -------------------------------------------------------------------------------- /dataduct/database/column.py: -------------------------------------------------------------------------------- 1 | """Script containing the column class object 2 | """ 3 | 4 | 5 | class Column(object): 6 | """Class representing columns in a table 7 | """ 8 | def __init__(self, column_name, column_type, encoding=None, 9 | fk_reference=None, fk_table=None, is_distkey=False, 10 | is_sortkey=False, is_primarykey=False, is_null=False, 11 | is_not_null=False, position=None): 12 | """Constructor for Column class 13 | 14 | Args: 15 | column_name(str): The name of the column 16 | column_type(str): The type of the column 17 | encoding(str): The encoding type of the column 18 | fk_reference(str): The column that this key is referring to 19 | fk_table(str): The table that this key is referring to 20 | is_distkey(bool): Whether or not this column is the DISTKEY 21 | is_sortkey(bool): Whether or not this column is a SORTKEY 22 | is_primarykey(bool): Whether or not this column is a primary key 23 | is_null(bool): Whether or not is column is defaults to null 24 | is_not_null(bool): Whether or not is column is not nullable 25 | position(int): The position 
of the column 26 | """ 27 | 28 | self.column_name = column_name 29 | self.column_type = column_type 30 | self.encoding = encoding 31 | self.fk_reference = fk_reference 32 | self.fk_table = fk_table 33 | self.is_distkey = is_distkey 34 | self.is_sortkey = is_sortkey 35 | self.is_primarykey = is_primarykey 36 | self.is_null = is_null 37 | self.is_not_null = is_not_null 38 | self.position = position 39 | 40 | if is_null and is_not_null: 41 | raise ValueError('Column cannot be both NULL and NOT NULL together') # noqa 42 | 43 | if self.is_primarykey: 44 | self.is_not_null = True 45 | self.is_null = False 46 | 47 | def __str__(self): 48 | """String output for the columns 49 | """ 50 | if self.column_type is not None: 51 | return '%s %s' % (self.column_name, self.column_type) 52 | return self.column_name 53 | 54 | @property 55 | def primary(self): 56 | """Property for the column being part of primary key 57 | """ 58 | return self.is_primarykey 59 | 60 | @primary.setter 61 | def primary(self, value=True): 62 | """Set the primary flag for the column 63 | """ 64 | self.is_primarykey = value 65 | 66 | # Force not null for primary key columns 67 | if self.is_primarykey: 68 | self.is_not_null = True 69 | self.is_null = False 70 | 71 | @property 72 | def name(self): 73 | """Get the name of the column 74 | """ 75 | return self.column_name 76 | -------------------------------------------------------------------------------- /dataduct/qa/column_check.py: -------------------------------------------------------------------------------- 1 | """QA test for comparing columns in the source system with the Warehouse 2 | """ 3 | from .check import Check 4 | from .utils import render_output 5 | 6 | 7 | class ColumnCheck(Check): 8 | """QA test for comparing columns across the ETL 9 | """ 10 | def __init__(self, source_data, destination_data, **kwargs): 11 | """Constructor for the Count based QA 12 | 13 | Args: 14 | source_data(DataFrame): Sample of source data 15 | destination_data(DataFrame): Sample of destination data 16 | """ 17 | super(ColumnCheck, self).__init__(**kwargs) 18 | self.source_data = source_data 19 | self.destination_data = destination_data 20 | self.errors = [] 21 | self.observed = 0 22 | 23 | # Identify errors 24 | for key in source_data.index: 25 | if key not in destination_data.index: 26 | continue 27 | 28 | source_value = ColumnCheck.column_value(self.source_data, key) 29 | dest_value = ColumnCheck.column_value(self.destination_data, key) 30 | 31 | if source_value != dest_value: 32 | self.errors.append((key, source_value, dest_value)) 33 | self.observed += 1 34 | 35 | @property 36 | def error_rate(self): 37 | """The error rate for the column comparisons 38 | 39 | Note: 40 | The error is only calculated for keys that exist in both dataframes. 41 | Thus, we presume that issues dealing with row counts are addressed 42 | in a separate QA test. 
43 | """ 44 | if self.observed == 0: 45 | return None 46 | 47 | return float(len(self.errors) * 100) / self.observed 48 | 49 | @staticmethod 50 | def column_value(data, key): 51 | """Fetch the value for a key in the dataframe 52 | 53 | Args: 54 | data(DataFrame): Single column dataframe 55 | key(str): Key to lookup in the dataframe 56 | 57 | Returns: 58 | value(str): Value for the key, unicode values are encoded as utf-8 59 | """ 60 | value = data.loc[key].values[0] 61 | if isinstance(value, unicode): 62 | return value.encode('utf-8') 63 | return value 64 | 65 | @property 66 | def summary(self): 67 | """Summary of the test results for the SNS message 68 | """ 69 | return render_output( 70 | [ 71 | 'Test Name: %s' % self.name, 72 | 'Success: %s' % self.success, 73 | 'Tolerance: %0.4f%%' % self.tolerance, 74 | 'Error Rate: %0.4f%%' % self.error_rate, 75 | 'Observed: %d' % self.observed, 76 | ] 77 | ) 78 | 79 | @property 80 | def results(self): 81 | """Results from the the comparison of the errors 82 | """ 83 | return render_output([str(a) for a in self.errors]) 84 | -------------------------------------------------------------------------------- /dataduct/pipeline/ec2_resource.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for ec2 resource 3 | """ 4 | 5 | from ..config import Config 6 | from .pipeline_object import PipelineObject 7 | from ..s3 import S3LogPath 8 | from .schedule import Schedule 9 | from ..utils import constants as const 10 | from ..utils.exceptions import ETLInputError 11 | 12 | config = Config() 13 | ROLE = config.etl['ROLE'] 14 | RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] 15 | 16 | INSTANCE_TYPE = config.ec2.get('INSTANCE_TYPE', const.M1_LARGE) 17 | ETL_AMI = config.ec2.get('ETL_AMI', const.NONE) 18 | SECURITY_GROUP = config.ec2.get('SECURITY_GROUP', const.NONE) 19 | SECURITY_GROUP_ID = config.ec2.get('SECURITY_GROUP_ID', const.NONE) 20 | SUBNET_ID = config.ec2.get('SUBNET_ID', const.NONE) 21 | KEY_PAIR = config.etl.get('KEY_PAIR', const.NONE) 22 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 23 | 24 | 25 | class Ec2Resource(PipelineObject): 26 | """EC2 Resource class 27 | """ 28 | 29 | def __init__(self, 30 | id, 31 | s3_log_dir=None, 32 | schedule=None, 33 | terminate_after='6 Hours', 34 | instance_type=INSTANCE_TYPE, 35 | ami=ETL_AMI, 36 | security_group=SECURITY_GROUP, 37 | security_group_id=SECURITY_GROUP_ID, 38 | subnet_id=SUBNET_ID, 39 | **kwargs): 40 | """Constructor for the Ec2Resource class 41 | 42 | Args: 43 | id(str): id of the object 44 | s3_log_dir(S3Directory): s3 directory for pipeline logs 45 | schedule(Schedule): pipeline schedule used for the machine 46 | terminate_after(str): time to terminate the ec2resource after 47 | instance_type(str): machine type to be used eg. 
m1.large 48 | ami(str): ami id for the ec2 resource 49 | retry_delay(str): time delay between step retries 50 | **kwargs(optional): Keyword arguments directly passed to base class 51 | """ 52 | 53 | # Validate inputs 54 | if not isinstance(schedule, Schedule): 55 | raise ETLInputError( 56 | 'Input schedule must be of the type Schedule') 57 | if not isinstance(s3_log_dir, S3LogPath): 58 | raise ETLInputError( 59 | 's3 log directory must be of type S3LogPath') 60 | 61 | super(Ec2Resource, self).__init__( 62 | id=id, 63 | type='Ec2Resource', 64 | terminateAfter=terminate_after, 65 | logUri=s3_log_dir, 66 | schedule=schedule, 67 | imageId=ami, 68 | instanceType=instance_type, 69 | role=ROLE, 70 | resourceRole=RESOURCE_ROLE, 71 | keyPair=KEY_PAIR, 72 | retryDelay=RETRY_DELAY, 73 | securityGroups=security_group, 74 | securityGroupIds=security_group_id, 75 | subnetId=subnet_id 76 | ) 77 | -------------------------------------------------------------------------------- /dataduct/steps/create_update_sql.py: -------------------------------------------------------------------------------- 1 | """ETL step wrapper for sql command for inserting into tables 2 | """ 3 | from ..database import SqlScript 4 | from ..database import Table 5 | from ..s3 import S3File 6 | from ..utils import constants as const 7 | from ..utils.exceptions import ETLInputError 8 | from ..utils.helpers import exactly_one 9 | from ..utils.helpers import parse_path 10 | from .transform import TransformStep 11 | 12 | 13 | class CreateUpdateSqlStep(TransformStep): 14 | """Create and Insert step that creates a table and then uses the query to 15 | update the table data with any sql query provided 16 | """ 17 | 18 | def __init__(self, 19 | table_definition, 20 | script=None, 21 | command=None, 22 | analyze_table=True, 23 | script_arguments=None, 24 | non_transactional=False, 25 | **kwargs): 26 | """Constructor for the CreateUpdateStep class 27 | 28 | Args: 29 | **kwargs(optional): Keyword arguments directly passed to base class 30 | """ 31 | if not exactly_one(command, script): 32 | raise ETLInputError('Both command and script found') 33 | 34 | # Create S3File with script / command provided 35 | if script: 36 | update_script = SqlScript(filename=parse_path(script)) 37 | else: 38 | update_script = SqlScript(command) 39 | self.s3_source_dir = kwargs['s3_source_dir'] 40 | sql_script = self.create_script(S3File(text=update_script.sql())) 41 | sql_script.upload_to_s3() 42 | 43 | dest = Table(SqlScript(filename=parse_path(table_definition))) 44 | 45 | arguments = [ 46 | '--table_definition=%s' % dest.sql().sql(), 47 | '--sql=%s' % sql_script.s3_path.uri 48 | ] 49 | 50 | if analyze_table: 51 | arguments.append('--analyze') 52 | 53 | if non_transactional: 54 | arguments.append('--non_transactional') 55 | 56 | if script_arguments is not None: 57 | if not isinstance(script_arguments, list): 58 | raise ETLInputError( 59 | 'Script arguments for SQL steps should be a list') 60 | arguments.extend(script_arguments) 61 | 62 | super(CreateUpdateSqlStep, self).__init__( 63 | command=const.SQL_RUNNER_COMMAND, script_arguments=arguments, 64 | no_output=True, **kwargs) 65 | 66 | @classmethod 67 | def arguments_processor(cls, etl, input_args): 68 | """Parse the step arguments according to the ETL pipeline 69 | 70 | Args: 71 | etl(ETLPipeline): Pipeline object containing resources and steps 72 | step_args(dict): Dictionary of the step arguments for the class 73 | """ 74 | step_args = cls.base_arguments_processor(etl, input_args) 75 | cls.pop_inputs(step_args) 
76 | 77 | return step_args 78 | -------------------------------------------------------------------------------- /dataduct/config/credentials.py: -------------------------------------------------------------------------------- 1 | """Credentials utility functions for connecting to various services 2 | """ 3 | import os 4 | import requests 5 | import sys 6 | from ConfigParser import SafeConfigParser 7 | 8 | 9 | def get_aws_credentials_from_iam(): 10 | """Get aws credentials using the IAM api 11 | Note: this script only runs on an EC2 instance with the appropriate 12 | resource roles. For more information, see the following: 13 | http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/\ 14 | AESDG-chapter-instancedata.html 15 | 16 | Returns: 17 | access_key(str): AWS access key 18 | secret_key(str): AWS secret key 19 | token(str): Connection token 20 | """ 21 | url = 'http://169.254.169.254/latest/meta-data/iam/security-credentials/' 22 | 23 | # Get role name 24 | r = requests.get(url) 25 | 26 | if not r.ok: 27 | raise Exception('Request failed for url %s.' % url) 28 | 29 | # Add role name to url 30 | url += r.content 31 | 32 | # Get access keys 33 | r = requests.get(url) 34 | if not r.ok: 35 | raise Exception('Request failed for url %s.' % url) 36 | 37 | json_result = r.json() 38 | return (json_result['AccessKeyId'], 39 | json_result['SecretAccessKey'], 40 | json_result['Token']) 41 | 42 | 43 | def get_aws_credentials_from_file(filename=None): 44 | """Get the AWS credentials from boto config files 45 | 46 | Tries to load from the specified filename, if applicable, else follows what 47 | boto does by following the order specified at 48 | http://boto.cloudhackers.com/en/latest/boto_config_tut.html#details 49 | """ 50 | config = SafeConfigParser() 51 | cred_file = None 52 | if filename is not None and os.path.isfile(filename): 53 | cred_file = filename 54 | elif os.path.isfile('/etc/boto.cfg'): 55 | cred_file = '/etc/boto.cfg' 56 | elif os.path.isfile(os.path.expanduser('~/.aws/credentials')): 57 | cred_file = os.path.expanduser('~/.aws/credentials') 58 | elif os.path.isfile(os.path.expanduser('~/.boto')): 59 | cred_file = os.path.expanduser('~/.boto') 60 | else: 61 | raise Exception("Cannot find a credentials file") 62 | 63 | config.read(cred_file) 64 | aws_access_key_id = config.get('Credentials', 65 | 'aws_access_key_id') 66 | aws_secret_access_key = config.get('Credentials', 67 | 'aws_secret_access_key') 68 | return (aws_access_key_id, aws_secret_access_key, None) 69 | 70 | 71 | def get_aws_credentials(filename=None): 72 | """Get the aws credentials from IAM or files 73 | """ 74 | try: 75 | aws_key, aws_secret, token = get_aws_credentials_from_iam() 76 | except Exception, error: 77 | sys.stderr.write('Failed to get creds from IAM: %s \n' % error.message) 78 | aws_key, aws_secret, token = get_aws_credentials_from_file(filename) 79 | 80 | return aws_key, aws_secret, token 81 | -------------------------------------------------------------------------------- /dataduct/steps/executors/count_check.py: -------------------------------------------------------------------------------- 1 | """Script that compares the number of rows in the source select script with the 2 | number of rows in the destination table 3 | """ 4 | 5 | import argparse 6 | import pandas.io.sql as pdsql 7 | from dataduct.data_access import redshift_connection 8 | from dataduct.data_access import rds_connection 9 | from dataduct.qa import CountCheck 10 | 11 | 12 | def _get_source_data(sql, hostname): 13 | """Gets the DataFrame 
containing all the rows of the table 14 | The DataFrame will be indexed by the table's primary key(s) 15 | 16 | Args: 17 | sql(str): The table definition representing the table to query 18 | connection(Connection): A connection to the database 19 | 20 | Returns: 21 | DataFrame: The rows of the table 22 | """ 23 | connection = rds_connection(hostname) 24 | data = pdsql.read_sql(sql, connection) 25 | connection.close() 26 | return data.iloc[0][0] 27 | 28 | 29 | def _get_destination_data(sql): 30 | """Gets the DataFrame containing all the rows of the table 31 | The DataFrame will be indexed by the table's primary key(s) 32 | 33 | Args: 34 | sql(str): The table definition representing the table to query 35 | connection(Connection): A connection to the database 36 | 37 | Returns: 38 | DataFrame: The rows of the table 39 | """ 40 | connection = redshift_connection() 41 | data = pdsql.read_sql(sql, connection) 42 | connection.close() 43 | # All columns apart from last are PK columns 44 | return data.iloc[0][0] 45 | 46 | 47 | def count_check(): 48 | """Args (taken in through argparse): 49 | source_sql: SQL script of the source data 50 | destination_sql: SQL script of the destination data 51 | """ 52 | parser = argparse.ArgumentParser() 53 | 54 | parser.add_argument('--source_sql', dest='source_sql', required=True) 55 | parser.add_argument('--source_host', dest='source_host', required=True) 56 | parser.add_argument('--destination_sql', dest='destination_sql', 57 | required=True) 58 | parser.add_argument('--tolerance', type=float, dest='tolerance', 59 | default=1.0) 60 | parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) 61 | parser.add_argument('--test_name', dest='test_name', 62 | default='Check Count') 63 | parser.add_argument('--log_to_s3', action='store_true', default=False) 64 | parser.add_argument('--path_suffix', dest='path_suffix', default=None) 65 | 66 | args = parser.parse_args() 67 | 68 | source_count = _get_source_data(args.source_sql, args.source_host) 69 | destination_count = _get_destination_data(args.destination_sql) 70 | 71 | check = CountCheck(source_count, destination_count, 72 | name=args.test_name, 73 | sns_topic_arn=args.sns_topic_arn, 74 | tolerance=args.tolerance) 75 | 76 | check.publish(args.log_to_s3, dest_sql=args.destination_sql, 77 | path_suffix=args.path_suffix) 78 | -------------------------------------------------------------------------------- /dataduct/database/relation.py: -------------------------------------------------------------------------------- 1 | """Script containing the relation class object 2 | """ 3 | from copy import deepcopy 4 | from .sql import SqlScript 5 | from ..config import Config 6 | from ..utils.helpers import atleast_one 7 | 8 | 9 | class Relation(object): 10 | """Class representing a relation in the database 11 | """ 12 | 13 | def __str__(self): 14 | """Output for the print statement of the relation 15 | """ 16 | return self.sql_statement.sql() 17 | 18 | def sql(self): 19 | """SqlStatement for the table object 20 | """ 21 | return self.sql_statement 22 | 23 | def copy(self): 24 | """Create a copy of the relation object 25 | """ 26 | return deepcopy(self) 27 | 28 | def initialize_name(self): 29 | """Parse the full name to declare the schema and relation name 30 | """ 31 | split_name = self.full_name.split('.') 32 | if len(split_name) == 2: 33 | schema_name = split_name[0] 34 | relation_name = split_name[1] 35 | else: 36 | schema_name = None 37 | relation_name = self.full_name 38 | 39 | return schema_name, 
relation_name 40 | 41 | def _grant_sql_builder(self, permission, user=None, group=None): 42 | """Return the sql string for granting permissions 43 | """ 44 | if not atleast_one(user, group): 45 | raise ValueError('Atleast one of user / group needed') 46 | 47 | result = list() 48 | option_string = 'WITH GRANT OPTION' 49 | base = 'GRANT %s ON %s TO {user} {option}' % ( 50 | permission, self.full_name) 51 | 52 | if user is not None: 53 | result.append(base.format(user=user, option=option_string)) 54 | 55 | if group is not None: 56 | result.append(base.format(user='GROUP %s' % group, option='')) 57 | return result 58 | 59 | def grant_script(self): 60 | """Grant the permissions based on the config 61 | """ 62 | config = Config() 63 | if not hasattr(config, 'database'): 64 | return 65 | 66 | permissions = config.database.get('permissions', list()) 67 | 68 | sql = list() 69 | for permission in permissions: 70 | sql.extend(self._grant_sql_builder(**permission)) 71 | 72 | return SqlScript('; '.join(sql)) 73 | 74 | def select_script(self): 75 | """Select everything from the relation 76 | """ 77 | return SqlScript('SELECT * FROM %s' % self.full_name) 78 | 79 | def create_script(self, grant_permissions=True): 80 | """Create script for the table object 81 | """ 82 | script = SqlScript(statements=[self.sql_statement.copy()]) 83 | if grant_permissions: 84 | script.append(self.grant_script()) 85 | return script 86 | 87 | def recreate_script(self, grant_permissions=True): 88 | """Sql script to recreate the view 89 | """ 90 | script = self.drop_script() 91 | script.append(self.create_script(grant_permissions)) 92 | return script 93 | -------------------------------------------------------------------------------- /dataduct/config/tests/test_config.py: -------------------------------------------------------------------------------- 1 | """Tests that the config singleton is working properly 2 | """ 3 | from os.path import expanduser 4 | from os.path import join 5 | 6 | from unittest import TestCase 7 | from mock import patch 8 | from testfixtures import TempDirectory 9 | from nose.tools import eq_ 10 | from nose.tools import raises 11 | 12 | from ..config import get_config_files 13 | from ..config import load_yaml 14 | 15 | 16 | class TestConfig(TestCase): 17 | """Tests for config singleton 18 | """ 19 | def setUp(self): 20 | self.test_yaml_file = '\n'.join([ 21 | 'test:', 22 | ' test_sub:', 23 | ' - test_sub1: foo', 24 | ' test_sub1_other: bar', 25 | ' - test_sub2: foobar', 26 | ]) 27 | self.test_config_dict = { 28 | 'test': { 29 | 'test_sub': [ 30 | { 31 | 'test_sub1': 'foo', 32 | 'test_sub1_other': 'bar', 33 | }, 34 | { 35 | 'test_sub2': 'foobar', 36 | } 37 | ] 38 | } 39 | } 40 | 41 | @staticmethod 42 | @patch.dict('os.environ', {}, clear=True) 43 | def test_get_config_files_no_enviroment_variable(): 44 | """Tests that correct config file paths are returned when there's no 45 | enviroment variable 46 | """ 47 | expected = [ 48 | '/etc/dataduct.cfg', 49 | expanduser('~/.dataduct/dataduct.cfg'), 50 | ] 51 | result = get_config_files() 52 | eq_(result, expected) 53 | 54 | @staticmethod 55 | @patch.dict('os.environ', {'DATADUCT_CONFIG_PATH': '/test/test.cfg'}) 56 | def test_get_config_files_with_enviroment_variable(): 57 | """Tests that correct config file paths are returned when there is 58 | an enviroment variable 59 | """ 60 | expected = [ 61 | '/etc/dataduct.cfg', 62 | expanduser('~/.dataduct/dataduct.cfg'), 63 | '/test/test.cfg', 64 | ] 65 | result = get_config_files() 66 | eq_(result, expected) 67 | 68 | 
def test_load_yaml_works_correctly(self): 69 | """Tests that the yaml file can be loaded correctly 70 | """ 71 | with TempDirectory() as d: 72 | d.write('test.yaml', self.test_yaml_file) 73 | result = load_yaml([join(d.path, 'test.yaml')]) 74 | eq_(result, self.test_config_dict) 75 | 76 | @staticmethod 77 | @raises(IOError) 78 | def test_no_config_file_raises(): 79 | """Tests that an exception is raised if no yaml file path is passed in 80 | """ 81 | load_yaml([]) 82 | 83 | @staticmethod 84 | @raises(IOError) 85 | def test_cannot_find_config_file_raises(): 86 | """Tests that an exception is raised if it cannot find any yaml files 87 | """ 88 | with TempDirectory() as d: 89 | with TempDirectory() as d2: 90 | load_yaml([join(d.path, 'test.cfg'), 91 | join(d2.path, 'test.cfg')]) 92 | -------------------------------------------------------------------------------- /dataduct/steps/load_postgres.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for SQLActivity to load data into Postgres 3 | """ 4 | from ..config import Config 5 | from .etl_step import ETLStep 6 | from ..pipeline import PostgresNode 7 | from ..pipeline import PostgresDatabase 8 | from ..pipeline import PipelineObject 9 | from ..pipeline import CopyActivity 10 | 11 | config = Config() 12 | if not hasattr(config, 'postgres'): 13 | raise ETLInputError('Postgres config not specified in ETL') 14 | POSTGRES_CONFIG = config.postgres 15 | 16 | 17 | class LoadPostgresStep(ETLStep): 18 | """Load Postgres Step class that helps load data into postgres 19 | """ 20 | 21 | def __init__(self, 22 | table, 23 | postgres_database, 24 | insert_query, 25 | max_errors=None, 26 | replace_invalid_char=None, 27 | **kwargs): 28 | """Constructor for the LoadPostgresStep class 29 | 30 | Args: 31 | table(path): table name for load 32 | sql(str): sql query to be executed 33 | postgres_database(PostgresDatabase): database to excute the query 34 | output_path(str): s3 path where sql output should be saved 35 | **kwargs(optional): Keyword arguments directly passed to base class 36 | """ 37 | super(LoadPostgresStep, self).__init__(**kwargs) 38 | 39 | region = POSTGRES_CONFIG['REGION'] 40 | rds_instance_id = POSTGRES_CONFIG['RDS_INSTANCE_ID'] 41 | user = POSTGRES_CONFIG['USERNAME'] 42 | password = POSTGRES_CONFIG['PASSWORD'] 43 | database_node = self.create_pipeline_object( 44 | object_class=PostgresDatabase, 45 | region=region, 46 | rds_instance_id=rds_instance_id, 47 | username=user, 48 | password=password, 49 | ) 50 | 51 | # Create output node 52 | self._output = self.create_pipeline_object( 53 | object_class=PostgresNode, 54 | schedule=self.schedule, 55 | database=database_node, 56 | table=table, 57 | username=user, 58 | password=password, 59 | select_query=None, 60 | insert_query=insert_query, 61 | host=rds_instance_id, 62 | ) 63 | 64 | self.create_pipeline_object( 65 | object_class=CopyActivity, 66 | schedule=self.schedule, 67 | resource=self.resource, 68 | input_node=self.input, 69 | output_node=self.output, 70 | depends_on=self.depends_on, 71 | max_retries=self.max_retries, 72 | ) 73 | 74 | @classmethod 75 | def arguments_processor(cls, etl, input_args): 76 | """Parse the step arguments according to the ETL pipeline 77 | 78 | Args: 79 | etl(ETLPipeline): Pipeline object containing resources and steps 80 | step_args(dict): Dictionary of the step arguments for the class 81 | """ 82 | step_args = cls.base_arguments_processor(etl, input_args) 83 | step_args['postgres_database'] = 
etl.postgres_database 84 | 85 | return step_args 86 | -------------------------------------------------------------------------------- /dataduct/etl/tests/test_etl_actions.py: -------------------------------------------------------------------------------- 1 | """Tests for the ETL actions 2 | """ 3 | import os 4 | 5 | import unittest 6 | from testfixtures import TempDirectory 7 | from nose.tools import raises 8 | from nose.tools import eq_ 9 | 10 | from ..etl_actions import read_pipeline_definition 11 | from ..etl_actions import create_pipeline 12 | from ...utils.exceptions import ETLInputError 13 | 14 | 15 | class EtlActionsTests(unittest.TestCase): 16 | """Tests for the ETL actions 17 | """ 18 | 19 | def setUp(self): 20 | """Setup text fixtures 21 | """ 22 | self.load_hour = '01' 23 | self.load_min = '23' 24 | load_time = self.load_hour + ':' + self.load_min 25 | self.test_yaml = '\n'.join([ 26 | 'name: example_load_redshift', 27 | 'frequency: one-time', 28 | 'load_time: ' + load_time, 29 | 'max_retries: 5', 30 | 'description: Example for the load_redshift step', 31 | 'steps:', 32 | '- step_type: extract-local', 33 | ' path: data/test_table1.tsv', 34 | '- step_type: load-redshift', 35 | ' schema: dev', 36 | ' table: test_table', 37 | ]) 38 | # Definition has no description field 39 | self.test_definition = { 40 | 'name': 'example_load_redshift', 41 | 'frequency': 'one-time', 42 | 'description': 'Example for the load_redshift step', 43 | 'load_time': load_time, 44 | 'max_retries': 5, 45 | 'steps': [{ 46 | 'step_type': 'extract-local', 47 | 'path': 'data/test_table1.tsv', 48 | }, { 49 | 'step_type': 'load-redshift', 50 | 'schema': 'dev', 51 | 'table': 'test_table', 52 | }], 53 | } 54 | 55 | @staticmethod 56 | @raises(ETLInputError) 57 | def test_yaml_extension(): 58 | """Test if the yaml extension check works correctly 59 | for read_pipeline_definition 60 | """ 61 | read_pipeline_definition("name.txt") 62 | 63 | def test_read_pipeline_definition(self): 64 | """Test if the pipeline definition is parsed correctly 65 | """ 66 | with TempDirectory() as directory: 67 | directory.write('test_definition.yaml', self.test_yaml) 68 | result = read_pipeline_definition( 69 | os.path.join(directory.path, 'test_definition.yaml')) 70 | eq_(result, self.test_definition) 71 | 72 | def test_create_pipeline(self): 73 | """Test if simple pipeline creation is correct 74 | """ 75 | result = create_pipeline(self.test_definition) 76 | # Check that pipeline properties are accurate 77 | assert result.name.endswith(self.test_definition['name']) 78 | eq_(result.frequency, self.test_definition['frequency']) 79 | eq_(result.load_hour, int(self.load_hour)) 80 | eq_(result.load_min, int(self.load_min)) 81 | eq_(result.max_retries, self.test_definition['max_retries']) 82 | # Check that vital steps are created 83 | steps = result.steps 84 | assert 'ExtractLocalStep0' in steps 85 | assert 'LoadRedshiftStep0' in steps 86 | -------------------------------------------------------------------------------- /dataduct/steps/sql_command.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for SqlActivity can be executed on Ec2 3 | """ 4 | from .etl_step import ETLStep 5 | from ..pipeline import SqlActivity 6 | from ..database import SqlScript 7 | from ..s3 import S3File 8 | from ..utils.helpers import exactly_one 9 | from ..utils.helpers import parse_path 10 | from ..utils.exceptions import ETLInputError 11 | 12 | import logging 13 | logger = logging.getLogger(__name__) 
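# A hedged illustration of how this step is usually declared in a pipeline
# definition. The step_type value is inferred from the ExtractLocalStep /
# LoadRedshiftStep naming in test_etl_actions.py above and from the file name
# examples/example_sql_command.yaml, so treat it as an assumption:
#
#   steps:
#   - step_type: sql-command
#     command: |
#       DELETE FROM dev.test_table WHERE id < 0;
#
# Exactly one of command, script or sql_script may be supplied; the constructor
# below raises ETLInputError otherwise.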
14 | 15 | 16 | class SqlCommandStep(ETLStep): 17 | """SQL Command Step class that helps run scripts on resouces 18 | """ 19 | 20 | def __init__(self, 21 | redshift_database, 22 | script=None, 23 | script_arguments=None, 24 | queue=None, 25 | sql_script=None, 26 | command=None, 27 | wrap_transaction=True, 28 | **kwargs): 29 | """Constructor for the SqlCommandStep class 30 | 31 | Args: 32 | command(str): command to be executed directly 33 | script(path): local path to the script that should executed 34 | queue(str): query queue that should be used 35 | script_arguments(list of str): arguments to the SQL command 36 | redshift_database(RedshiftDatabase): database to excute the query 37 | **kwargs(optional): Keyword arguments directly passed to base class 38 | """ 39 | if not exactly_one(command, script, sql_script): 40 | raise ETLInputError('Both command and script found') 41 | 42 | if sql_script is not None and not isinstance(sql_script, SqlScript): 43 | raise ETLInputError('sql_script should be of the type SqlScript') 44 | 45 | super(SqlCommandStep, self).__init__(**kwargs) 46 | 47 | # Create S3File with script / command provided 48 | if script: 49 | sql_script = SqlScript(filename=parse_path(script)) 50 | elif command: 51 | sql_script = SqlScript(command) 52 | 53 | if wrap_transaction: 54 | sql_script = sql_script.wrap_transaction() 55 | 56 | script = self.create_script(S3File(text=sql_script.sql())) 57 | 58 | logger.debug('Sql Query:') 59 | logger.debug(sql_script) 60 | 61 | self.create_pipeline_object( 62 | object_class=SqlActivity, 63 | max_retries=self.max_retries, 64 | resource=self.resource, 65 | worker_group=self.worker_group, 66 | schedule=self.schedule, 67 | database=redshift_database, 68 | script_arguments=script_arguments, 69 | depends_on=self.depends_on, 70 | script=script, 71 | queue=queue, 72 | ) 73 | 74 | @classmethod 75 | def arguments_processor(cls, etl, input_args): 76 | """Parse the step arguments according to the ETL pipeline 77 | 78 | Args: 79 | etl(ETLPipeline): Pipeline object containing resources and steps 80 | step_args(dict): Dictionary of the step arguments for the class 81 | """ 82 | input_args = cls.pop_inputs(input_args) 83 | step_args = cls.base_arguments_processor(etl, input_args) 84 | step_args['redshift_database'] = etl.redshift_database 85 | 86 | return step_args 87 | -------------------------------------------------------------------------------- /dataduct/steps/load_redshift.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step wrapper for RedshiftCopyActivity to load data into Redshift 3 | """ 4 | from .etl_step import ETLStep 5 | from ..pipeline import RedshiftNode 6 | from ..pipeline import RedshiftCopyActivity 7 | 8 | 9 | class LoadRedshiftStep(ETLStep): 10 | """Load Redshift Step class that helps load data into redshift 11 | """ 12 | 13 | def __init__(self, 14 | schema, 15 | table, 16 | redshift_database, 17 | insert_mode="TRUNCATE", 18 | max_errors=None, 19 | replace_invalid_char=None, 20 | compression=None, 21 | **kwargs): 22 | """Constructor for the LoadRedshiftStep class 23 | 24 | Args: 25 | schema(str): schema from which table should be extracted 26 | table(path): table name for extract 27 | insert_mode(str): insert mode for redshift copy activity 28 | redshift_database(RedshiftDatabase): database to excute the query 29 | max_errors(int): Maximum number of errors to be ignored during load 30 | replace_invalid_char(char): char to replace not utf-8 with 31 | **kwargs(optional): Keyword arguments 
directly passed to base class 32 | """ 33 | super(LoadRedshiftStep, self).__init__(**kwargs) 34 | 35 | # Create output node 36 | self._output = self.create_pipeline_object( 37 | object_class=RedshiftNode, 38 | schedule=self.schedule, 39 | redshift_database=redshift_database, 40 | schema_name=schema, 41 | table_name=table, 42 | ) 43 | 44 | command_options = ["DELIMITER '\t' ESCAPE TRUNCATECOLUMNS"] 45 | command_options.append("NULL AS 'NULL' ") 46 | 47 | if compression == "gzip": 48 | command_options.append("GZIP") 49 | elif compression == "bzip2": 50 | command_options.append("BZIP2") 51 | elif compression == "lzo": 52 | command_options.append("lzop") 53 | if max_errors: 54 | command_options.append('MAXERROR %d' % int(max_errors)) 55 | if replace_invalid_char: 56 | command_options.append( 57 | "ACCEPTINVCHARS AS '%s'" %replace_invalid_char) 58 | 59 | self.create_pipeline_object( 60 | object_class=RedshiftCopyActivity, 61 | max_retries=self.max_retries, 62 | input_node=self.input, 63 | output_node=self.output, 64 | insert_mode=insert_mode, 65 | resource=self.resource, 66 | worker_group=self.worker_group, 67 | schedule=self.schedule, 68 | depends_on=self.depends_on, 69 | command_options=command_options, 70 | ) 71 | 72 | @classmethod 73 | def arguments_processor(cls, etl, input_args): 74 | """Parse the step arguments according to the ETL pipeline 75 | 76 | Args: 77 | etl(ETLPipeline): Pipeline object containing resources and steps 78 | step_args(dict): Dictionary of the step arguments for the class 79 | """ 80 | step_args = cls.base_arguments_processor(etl, input_args) 81 | step_args['redshift_database'] = etl.redshift_database 82 | 83 | return step_args 84 | -------------------------------------------------------------------------------- /dataduct/pipeline/shell_command_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for ShellCommandActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from ..config import Config 7 | from .schedule import Schedule 8 | from ..utils import constants as const 9 | from ..utils.exceptions import ETLInputError 10 | 11 | config = Config() 12 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 13 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 14 | 15 | 16 | class ShellCommandActivity(Activity): 17 | """ShellCommandActivity class 18 | """ 19 | 20 | def __init__(self, 21 | id, 22 | input_node, 23 | output_node, 24 | schedule, 25 | resource=None, 26 | worker_group=None, 27 | script_uri=None, 28 | script_arguments=None, 29 | command=None, 30 | max_retries=None, 31 | depends_on=None, 32 | additional_s3_files=None): 33 | """Constructor for the ShellCommandActivity class 34 | 35 | Args: 36 | id(str): id of the object 37 | input_node(S3Node / list of S3Nodes): input nodes for the activity 38 | output_node(S3Node / list of S3Nodes): output nodes for activity 39 | schedule(Schedule): schedule of the pipeline 40 | resource(Ec2Resource / EMRResource): resource to run the activity on 41 | worker_group(str): the worker group to run the activity on 42 | script_uri(S3File): s3 uri of the script 43 | script_arguments(list of str): command line arguments to the script 44 | command(str): command to be run as shell activity 45 | max_retries(int): number of retries for the activity 46 | depends_on(list of activities): dependendent pipelines steps 47 | additional_s3_files(list of s3File): additional files for activity 48 | """ 49 | 50 | # Validate inputs 51 | if not 
isinstance(schedule, Schedule): 52 | raise ETLInputError( 53 | 'Input schedule must be of the type Schedule') 54 | 55 | if command is not None and script_uri is not None: 56 | raise ETLInputError('command and script both can not be provided') 57 | 58 | # Set default values 59 | if depends_on is None: 60 | depends_on = [] 61 | if max_retries is None: 62 | max_retries = MAX_RETRIES 63 | # Set stage to true if we use either input or output node 64 | stage = 'true' if input_node or output_node else 'false' 65 | 66 | super(ShellCommandActivity, self).__init__( 67 | id=id, 68 | retryDelay=RETRY_DELAY, 69 | type='ShellCommandActivity', 70 | maximumRetries=max_retries, 71 | dependsOn=depends_on, 72 | stage=stage, 73 | input=input_node, 74 | output=output_node, 75 | runsOn=resource, 76 | workerGroup=worker_group, 77 | schedule=schedule, 78 | scriptUri=script_uri, 79 | scriptArgument=script_arguments, 80 | command=command 81 | ) 82 | 83 | # Add the additional s3 files 84 | self.add_additional_files(additional_s3_files) 85 | -------------------------------------------------------------------------------- /dataduct/pipeline/redshift_copy_activity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pipeline object class for RedshiftCopyActivity 3 | """ 4 | 5 | from .activity import Activity 6 | from ..config import Config 7 | from .redshift_node import RedshiftNode 8 | from .schedule import Schedule 9 | from ..utils import constants as const 10 | from ..utils.exceptions import ETLInputError 11 | 12 | config = Config() 13 | MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) 14 | RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) 15 | 16 | 17 | class RedshiftCopyActivity(Activity): 18 | """EMR Activity class 19 | """ 20 | 21 | def __init__(self, 22 | id, 23 | schedule, 24 | input_node, 25 | output_node, 26 | insert_mode, 27 | resource=None, 28 | worker_group=None, 29 | command_options=None, 30 | max_retries=None, 31 | depends_on=None): 32 | """Constructor for the RedshiftCopyActivity class 33 | 34 | Args: 35 | id(str): id of the object 36 | schedule(Schedule): schedule of the pipeline 37 | input_node(S3Node / RedshiftNode): input data node 38 | output_node(S3Node / RedshiftNode): output data node 39 | resource(Ec2Resource / EMRResource): resource to run the activity on 40 | worker_group(str): the worker group to run the activity on 41 | command_options(list of str): command options for the activity 42 | max_retries(int): number of retries for the activity 43 | depends_on(list of activities): dependendent pipelines steps 44 | """ 45 | 46 | # Validate inputs 47 | if not isinstance(schedule, Schedule): 48 | raise ETLInputError( 49 | 'Input schedule must be of the type Schedule') 50 | 51 | # Set default values 52 | if depends_on is None: 53 | depends_on = [] 54 | if max_retries is None: 55 | max_retries = MAX_RETRIES 56 | 57 | kwargs = { 58 | 'id': id, 59 | 'retryDelay': RETRY_DELAY, 60 | 'type': 'RedshiftCopyActivity', 61 | 'maximumRetries': max_retries, 62 | 'input': input_node, 63 | 'output': output_node, 64 | 'runsOn': resource, 65 | 'workerGroup': worker_group, 66 | 'insertMode': insert_mode, 67 | 'schedule': schedule, 68 | 'dependsOn': depends_on, 69 | 'commandOptions': command_options 70 | } 71 | 72 | if isinstance(input_node, RedshiftNode): 73 | # AWS BUG: AWS expects fully qualified name when extracting from 74 | # Redshift, but not when loading into redshift. 
Here, we enforce 75 | # a convention of providing schemaName and tableName separately. 76 | assert "." not in input_node['tableName'], \ 77 | "Using convention that table name is not fully qualified. " + \ 78 | "Provide the schema name separately from the table name." 79 | table_name = input_node['tableName'] 80 | del input_node['tableName'] 81 | input_node['tableName'] = "%s.%s" % (input_node['schemaName'], 82 | table_name) 83 | super(RedshiftCopyActivity, self).__init__(**kwargs) 84 | -------------------------------------------------------------------------------- /dataduct/database/parsers/utils.py: -------------------------------------------------------------------------------- 1 | """SQL parser utils and constants 2 | """ 3 | 4 | from pyparsing import CaselessKeyword 5 | from pyparsing import Combine 6 | from pyparsing import Forward 7 | from pyparsing import OneOrMore 8 | from pyparsing import Word 9 | from pyparsing import ZeroOrMore 10 | from pyparsing import alphanums 11 | from pyparsing import nums 12 | 13 | # Intermediate parsers 14 | _varchar_names = (CaselessKeyword('VARCHAR') | CaselessKeyword('TEXT')) 15 | _varchar_names |= CaselessKeyword('NVARCHAR') 16 | 17 | # Data types 18 | _smallint = (CaselessKeyword('SMALLINT') | CaselessKeyword('INT2')) 19 | _integer = CaselessKeyword('INTEGER') 20 | _integer |= CaselessKeyword('INT') | CaselessKeyword('INT4') 21 | _bigint = (CaselessKeyword('BIGINT') | CaselessKeyword('INT8')) 22 | _decimal = Combine((CaselessKeyword('DECIMAL') | CaselessKeyword('NUMERIC')) + '(' + Word(nums + ' ,') + ')') # noqa 23 | _real = (CaselessKeyword('REAL') | CaselessKeyword('FLOAT4')) 24 | _double = (CaselessKeyword('DOUBLE PRECISION') | CaselessKeyword('FLOAT') | CaselessKeyword('FLOAT8') | CaselessKeyword('DOUBLE')) # noqa 25 | _boolean = CaselessKeyword('BOOLEAN') 26 | _char = (CaselessKeyword('CHAR') | CaselessKeyword('CHARACTER')) 27 | _char |= (CaselessKeyword('NCHAR') | CaselessKeyword('BPCHAR')) 28 | _varchar = Combine(_varchar_names + '(' + Word(alphanums) + ')') 29 | _date = CaselessKeyword('DATE') 30 | _text = CaselessKeyword('TEXT') 31 | _timestamp = CaselessKeyword('TIMESTAMP') 32 | 33 | # Create SQL keywords 34 | _create = CaselessKeyword('CREATE') 35 | _table = CaselessKeyword('TABLE') 36 | _view = CaselessKeyword('VIEW') 37 | _temp = CaselessKeyword('TEMP') 38 | _temporary = CaselessKeyword('TEMPORARY') 39 | _if_not_exists = CaselessKeyword('IF NOT EXISTS') 40 | _or_replace = CaselessKeyword('OR REPLACE') 41 | _primary_key = CaselessKeyword('PRIMARY KEY') 42 | _foreign_key = CaselessKeyword('FOREIGN KEY') 43 | _references = CaselessKeyword('REFERENCES') 44 | _unique = CaselessKeyword('UNIQUE') 45 | _null = CaselessKeyword('NULL') 46 | _not_null = CaselessKeyword('NOT NULL') 47 | _distkey = CaselessKeyword('DISTKEY') 48 | _diststyle = CaselessKeyword('DISTSTYLE') 49 | _sortkey = CaselessKeyword('SORTKEY') 50 | _encode = CaselessKeyword('ENCODE') 51 | _all = CaselessKeyword('ALL') 52 | _even = CaselessKeyword('EVEN') 53 | _key = CaselessKeyword('KEY') 54 | 55 | # Select SQL Keywords 56 | _select = CaselessKeyword('SELECT') 57 | _with = CaselessKeyword('WITH') 58 | _from = CaselessKeyword('FROM') 59 | _as = CaselessKeyword('AS') 60 | _join = CaselessKeyword('JOIN') 61 | 62 | # Parsers 63 | _db_name = Word(alphanums+"_-.`") 64 | pk_check = (_primary_key | _unique) 65 | 66 | # Column types 67 | column_types = _smallint | _integer | _bigint | _decimal | _real | _double 68 | column_types |= _boolean | _char | _varchar | _date | _timestamp | 
_text 69 | 70 | # Define a field parser for create table fields or select query fields 71 | field_parser = Forward() 72 | subquery = Forward() 73 | 74 | # List of characters allowed in the query statements 75 | special_character = "\\_-. @*`>`__. The structure of a 14 | Dataduct YAML file can be broken down into 3 parts: 15 | 16 | - Header information 17 | - Description 18 | - Pipeline steps 19 | 20 | Example: 21 | 22 | .. code:: yaml 23 | 24 | # HEADER INFORMATION 25 | name : example_emr_streaming 26 | frequency : one-time 27 | load_time: 01:00 # Hour:Min in UTC 28 | topic_arn: 'arn:aws:sns:example_arn' 29 | emr_cluster_config: 30 | num_instances: 1 31 | instance_size: m1.xlarge 32 | bootstrap: 33 | string: "s3://elasticmapreduce/bootstrap-actions/configure-hadoop,--yarn-key-value, yarn.scheduler.maximum-allocation-mb=9500" 34 | 35 | # DESCRIPTION 36 | description : Example for the emr_streaming step 37 | 38 | # PIPELINE STEPS 39 | steps: 40 | - step_type: extract-local 41 | path: data/word_data.txt 42 | 43 | - step_type: emr-streaming 44 | mapper: scripts/word_mapper.py 45 | reducer: scripts/word_reducer.py 46 | 47 | - step_type: transform 48 | script: scripts/s3_profiler.py 49 | script_arguments: 50 | - --input=INPUT1_STAGING_DIR 51 | - --output=OUTPUT1_STAGING_DIR 52 | - -f 53 | 54 | 55 | Header Information 56 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 57 | 58 | The header includes configuration information for Data Pipeline and the 59 | Elastic MapReduce resource. 60 | 61 | The name field sets the overall pipeline name: 62 | 63 | .. code:: yaml 64 | 65 | name : example_emr_streaming 66 | 67 | The frequency represents how often the pipeline is run on a schedule 68 | basis. Currently supported intervals are *hourly, daily, one-time*: 69 | 70 | .. code:: yaml 71 | 72 | frequency : one-time 73 | 74 | The load time is what time of day (in UTC) the pipeline is scheduled to 75 | run. It is in the format of HH:MM so 01:00 would set the pipeline to run 76 | at 1AM UTC: 77 | 78 | .. code:: yaml 79 | 80 | load_time: 01:00 # Hour:Min in UTC 81 | 82 | In your config file, you have the option of specifying a default Amazon 83 | Resource Name that will be messaged if the pipeline fails, if you would wish to 84 | override this default ARN, you may use the topic_arn property: 85 | 86 | .. code:: yaml 87 | 88 | topic_arn: 'arn:aws:sns:example_arn' 89 | 90 | If the pipeline includes an EMR-streaming step, the EMR instance can be 91 | configured. For example, you can configure the bootstrap, number of core 92 | instances, and instance types: 93 | 94 | .. code:: yaml 95 | 96 | emr_cluster_config: 97 | num_instances: 1 98 | instance_size: m1.xlarge 99 | bootstrap: 100 | string: "s3://elasticmapreduce/bootstrap-actions/configure-hadoop,--yarn-key-value, yarn.scheduler.maximum-allocation-mb=9500" 101 | 102 | *Note: Arguments in the bootstrap step are delimited by commas, not spaces.* 103 | 104 | Description 105 | ^^^^^^^^^^^ 106 | 107 | The description allows the creator of the YAML file to clearly explain 108 | the purpose of the pipeline. 
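Once a definition like the one above is written, dataduct parses it into a
pipeline object. A minimal sketch of that flow, based on the helpers exercised
in ``dataduct/etl/tests/test_etl_actions.py`` (the module path and the presence
of a valid dataduct config are assumptions here):

.. code:: python

    from dataduct.etl.etl_actions import read_pipeline_definition
    from dataduct.etl.etl_actions import create_pipeline

    # The path must point at a .yaml file; other extensions raise ETLInputError.
    definition = read_pipeline_definition('example_emr_streaming.yaml')

    # Returns an ETL pipeline whose .steps dict is keyed by step names such as
    # 'ExtractLocalStep0', as the test shown earlier demonstrates.
    pipeline = create_pipeline(definition)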
109 | -------------------------------------------------------------------------------- /dataduct/steps/pipeline_dependencies.py: -------------------------------------------------------------------------------- 1 | """ 2 | ETL step for pipeline dependencies using transform step 3 | """ 4 | from ..config import Config 5 | from ..utils import constants as const 6 | from .transform import TransformStep 7 | 8 | config = Config() 9 | NAME_PREFIX = config.etl.get('NAME_PREFIX', '') 10 | DEPENDENCY_OVERRIDE = config.etl.get('DEPENDENCY_OVERRIDE', False) 11 | SNS_TOPIC_ARN = config.etl.get('SNS_TOPIC_ARN_FAILURE', None) 12 | 13 | 14 | class PipelineDependenciesStep(TransformStep): 15 | """PipelineDependencies Step class that helps wait for other pipelines 16 | to finish 17 | """ 18 | 19 | def __init__(self, 20 | id, 21 | pipeline_name, 22 | dependent_pipelines=None, 23 | dependent_pipelines_ok_to_fail=None, 24 | refresh_rate=300, 25 | start_date=None, 26 | script_arguments=None, 27 | **kwargs): 28 | """Constructor for the QATransformStep class 29 | 30 | Args: 31 | sns_arn(str): sns topic arn for QA steps 32 | script_arguments(list of str): list of arguments to the script 33 | **kwargs(optional): Keyword arguments directly passed to base class 34 | """ 35 | 36 | if script_arguments is None: 37 | script_arguments = list() 38 | 39 | if (dependent_pipelines is None and 40 | dependent_pipelines_ok_to_fail is None): 41 | raise ValueError('Must have some dependencies for dependency step') 42 | 43 | prefix_func = lambda p: p if not NAME_PREFIX else NAME_PREFIX + '_' + p 44 | argument_func = lambda x: [prefix_func(p) for p in x] 45 | 46 | if DEPENDENCY_OVERRIDE: 47 | command = 'ls' 48 | script_arguments = None 49 | else: 50 | command = const.DEPENDENCY_COMMAND 51 | if start_date is None: 52 | start_date = "#{format(@scheduledStartTime,'YYYY-MM-dd')}" 53 | 54 | script_arguments.extend( 55 | [ 56 | '--pipeline_name=%s' % pipeline_name, 57 | '--start_date=%s' % start_date, 58 | '--refresh_rate=%s' % str(refresh_rate), 59 | '--sns_topic_arn=%s' % SNS_TOPIC_ARN, 60 | ] 61 | ) 62 | 63 | if dependent_pipelines: 64 | script_arguments.append('--dependencies') 65 | script_arguments.extend(argument_func(dependent_pipelines)) 66 | 67 | if dependent_pipelines_ok_to_fail: 68 | script_arguments.append('--dependencies_ok_to_fail') 69 | script_arguments.extend( 70 | argument_func(dependent_pipelines_ok_to_fail)) 71 | 72 | super(PipelineDependenciesStep, self).__init__( 73 | id=id, 74 | command=command, 75 | script_arguments=script_arguments, 76 | no_output=True, 77 | **kwargs) 78 | 79 | self._output = None 80 | 81 | @classmethod 82 | def arguments_processor(cls, etl, input_args): 83 | """Parse the step arguments according to the ETL pipeline 84 | 85 | Args: 86 | etl(ETLPipeline): Pipeline object containing resources and steps 87 | step_args(dict): Dictionary of the step arguments for the class 88 | """ 89 | input_args = cls.pop_inputs(input_args) 90 | step_args = cls.base_arguments_processor(etl, input_args) 91 | step_args['pipeline_name'] = etl.name 92 | 93 | return step_args 94 | --------------------------------------------------------------------------------
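As a quick illustration of the step above: with NAME_PREFIX set to 'prod',
DEPENDENCY_OVERRIDE left unset, and dependent_pipelines=['upstream_etl'], the
constructor assembles script arguments along these lines (the concrete values
are hypothetical; the start_date default is the scheduled-start-time expression
from the code above):

    [
        '--pipeline_name=my_pipeline',
        "--start_date=#{format(@scheduledStartTime,'YYYY-MM-dd')}",
        '--refresh_rate=300',
        '--sns_topic_arn=arn:aws:sns:example_arn',
        '--dependencies',
        'prod_upstream_etl',   # NAME_PREFIX is prepended to each dependency name
    ]

If DEPENDENCY_OVERRIDE is set in the config, the step instead runs a no-op 'ls'
command and drops the script arguments entirely.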