├── analyses
│   └── .gitkeep
├── macros
│   └── .gitkeep
├── seeds
│   └── .gitkeep
├── snapshots
│   └── .gitkeep
├── tests
│   ├── .gitkeep
│   └── fact_trade__unique_trade.sql
├── models
│   ├── gold
│   │   ├── dim_date.sql
│   │   ├── dim_broker.sql
│   │   ├── fact_cash_balances.sql
│   │   ├── dim_trade.sql
│   │   ├── fact_cash_transactions.sql
│   │   ├── fact_watches.sql
│   │   ├── dim_account.sql
│   │   ├── fact_market_history.sql
│   │   ├── dim_security.sql
│   │   ├── fact_holdings.sql
│   │   ├── fact_trade.sql
│   │   ├── dim_company.sql
│   │   └── dim_customer.sql
│   ├── silver
│   │   ├── date.sql
│   │   ├── employees.sql
│   │   ├── cash_transactions.sql
│   │   ├── watches_history.sql
│   │   ├── holdings_history.sql
│   │   ├── trades.sql
│   │   ├── watches.sql
│   │   ├── companies.sql
│   │   ├── securities.sql
│   │   ├── daily_market.sql
│   │   ├── financials.sql
│   │   ├── trades_history.sql
│   │   ├── accounts.sql
│   │   └── customers.sql
│   ├── bronze
│   │   ├── hr
│   │   │   ├── hr_employee.sql
│   │   │   └── sources.yml
│   │   ├── crm
│   │   │   ├── crm_customer_mgmt.sql
│   │   │   └── sources.yml
│   │   ├── brokerage
│   │   │   ├── brokerage_trade.sql
│   │   │   ├── brokerage_holding_history.sql
│   │   │   ├── brokerage_trade_history.sql
│   │   │   ├── brokerage_cash_transaction.sql
│   │   │   ├── brokerage_watch_history.sql
│   │   │   ├── brokerage_daily_market.sql
│   │   │   └── sources.yml
│   │   ├── reference
│   │   │   ├── reference_industry.sql
│   │   │   ├── reference_tax_rate.sql
│   │   │   ├── reference_status_type.sql
│   │   │   ├── reference_trade_type.sql
│   │   │   ├── sources.yml
│   │   │   └── reference_date.sql
│   │   ├── finwire
│   │   │   ├── finwire_company.sql
│   │   │   ├── sources.yml
│   │   │   ├── finwire_security.sql
│   │   │   └── finwire_financial.sql
│   │   └── syndicated
│   │       ├── syndicated_prospect.sql
│   │       └── sources.yml
│   ├── example
│   │   ├── my_second_dbt_model.sql
│   │   ├── schema.yml
│   │   └── my_first_dbt_model.sql
│   └── work
│       └── wrk_company_financials.sql
├── packages.yml
├── images
│   ├── dbt-dag.png
│   ├── title-qr.png
│   ├── title-page.png
│   ├── tpcdi-help.png
│   ├── snowflake-dag.png
│   ├── refresh-conflict.png
│   ├── dbt-dynamic-tables.png
│   ├── tpc-di-etl-diagram.png
│   ├── tpc-di-logical-model.png
│   └── tpcdi-process-files-help.png
├── .gitignore
├── .github
│   ├── release-drafter.yml
│   └── workflows
│       └── release-drafter.yml
├── devrel
│   └── multi-record.txt
├── dbt_project.yml
├── environment.yml
├── README.md
├── tpcdi.py
└── demo.ipynb

/analyses/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/macros/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/seeds/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/snapshots/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/models/gold/dim_date.sql:
--------------------------------------------------------------------------------
select *
from {{ ref('date') }}
--------------------------------------------------------------------------------
/models/silver/date.sql:
--------------------------------------------------------------------------------
select *
from {{ ref('reference_date') }}
--------------------------------------------------------------------------------
/models/bronze/hr/hr_employee.sql:
--------------------------------------------------------------------------------
select *
from {{ source('hr', 'hr') }}
--------------------------------------------------------------------------------
/packages.yml:
--------------------------------------------------------------------------------
packages:
  - package: dbt-labs/dbt_utils
    version: 1.1.1
--------------------------------------------------------------------------------
/models/bronze/crm/crm_customer_mgmt.sql:
--------------------------------------------------------------------------------
select *
from {{ source('crm', 'customer_mgmt') }}
--------------------------------------------------------------------------------
/images/dbt-dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/dbt-dag.png
--------------------------------------------------------------------------------
/images/title-qr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/title-qr.png
--------------------------------------------------------------------------------
/images/title-page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/title-page.png
--------------------------------------------------------------------------------
/images/tpcdi-help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/tpcdi-help.png
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_trade.sql:
--------------------------------------------------------------------------------
select *
from {{ source('brokerage', 'trade') }}
--------------------------------------------------------------------------------
/models/bronze/reference/reference_industry.sql:
--------------------------------------------------------------------------------
select *
from {{ source('reference','industry') }}
--------------------------------------------------------------------------------
/models/bronze/reference/reference_tax_rate.sql:
--------------------------------------------------------------------------------
select *
from {{ source('reference','tax_rate') }}
--------------------------------------------------------------------------------
/images/snowflake-dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/snowflake-dag.png
--------------------------------------------------------------------------------
/models/bronze/finwire/finwire_company.sql:
--------------------------------------------------------------------------------

select
    *
from {{ source("finwire", "cmp") }}
--------------------------------------------------------------------------------
/models/bronze/reference/reference_status_type.sql:
--------------------------------------------------------------------------------
select *
from {{ source('reference','status_type') }}
--------------------------------------------------------------------------------
/models/bronze/reference/reference_trade_type.sql:
--------------------------------------------------------------------------------
select *
from {{ source('reference','trade_type') }}
--------------------------------------------------------------------------------
/models/bronze/syndicated/syndicated_prospect.sql:
--------------------------------------------------------------------------------
select *
from {{ source('syndicated', 'prospect') }}
--------------------------------------------------------------------------------
/images/refresh-conflict.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/refresh-conflict.png
--------------------------------------------------------------------------------
/images/dbt-dynamic-tables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/dbt-dynamic-tables.png
--------------------------------------------------------------------------------
/images/tpc-di-etl-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/tpc-di-etl-diagram.png
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_holding_history.sql:
--------------------------------------------------------------------------------
select *
from {{ source('brokerage', 'holding_history') }}
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_trade_history.sql:
--------------------------------------------------------------------------------
select *
from {{ source('brokerage', 'trade_history') }}
--------------------------------------------------------------------------------
/images/tpc-di-logical-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/tpc-di-logical-model.png
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_cash_transaction.sql:
--------------------------------------------------------------------------------
select *
from {{ source('brokerage', 'cash_transaction') }}
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_watch_history.sql:
--------------------------------------------------------------------------------
select
    *
from {{ source("brokerage", "watch_history") }}
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_daily_market.sql:
--------------------------------------------------------------------------------

select
    *
from {{ source("brokerage", "daily_market") }}
--------------------------------------------------------------------------------
/images/tpcdi-process-files-help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/tpcdi-process-files-help.png
--------------------------------------------------------------------------------
/tests/fact_trade__unique_trade.sql:
--------------------------------------------------------------------------------
select
    sk_trade_id,
    count(*) cnt
from {{ ref('fact_trade') }}
group by all
having cnt > 1
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

target/
dbt_packages/
logs/
credentials.properties
credentials.json
.java-version
.vscode/
__pycache__/
.DS_Store
.ipynb_checkpoints/
--------------------------------------------------------------------------------
/models/bronze/hr/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: hr
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: hr
--------------------------------------------------------------------------------
/models/example/my_second_dbt_model.sql:
--------------------------------------------------------------------------------

-- Use the `ref` function to select from other models
{{ config(enabled=false) }}
select *
from {{ ref('my_first_dbt_model') }}
where id = 1
--------------------------------------------------------------------------------
/models/bronze/crm/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: crm
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: customer_mgmt
--------------------------------------------------------------------------------
/models/bronze/syndicated/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: syndicated
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: prospect
--------------------------------------------------------------------------------
/models/bronze/finwire/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: finwire
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: cmp
      - name: sec
      - name: fin
--------------------------------------------------------------------------------
/models/bronze/reference/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: reference
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: status_type
      - name: industry
      - name: date
      - name: tax_rate
      - name: trade_type
--------------------------------------------------------------------------------
/models/gold/dim_broker.sql:
--------------------------------------------------------------------------------
select
    {{dbt_utils.generate_surrogate_key(['employee_id'])}} sk_broker_id,
    employee_id broker_id,
    manager_id,
    first_name,
    last_name,
    middle_initial,
    job_code,
    branch,
    office,
    phone
from
    {{ ref('employees') }}
--------------------------------------------------------------------------------
/models/gold/fact_cash_balances.sql:
--------------------------------------------------------------------------------
with s1 as (
    select *
    from {{ ref('fact_cash_transactions') }}
)
select
    sk_customer_id,
    sk_account_id,
    sk_transaction_date,
    sum(amount) amount,
    description
from s1
group by all
order by sk_transaction_date, sk_customer_id, sk_account_id
--------------------------------------------------------------------------------
/models/silver/employees.sql:
--------------------------------------------------------------------------------
select
    employee_id,
    manager_id,
    employee_first_name first_name,
    employee_last_name last_name,
    employee_mi middle_initial,
    employee_job_code job_code,
    employee_branch branch,
    employee_office office,
    employee_phone phone
from {{ ref('hr_employee') }}
--------------------------------------------------------------------------------
/models/bronze/brokerage/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: brokerage
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: daily_market
      - name: watch_history
      - name: trade
      - name: trade_history
      - name: holding_history
      - name: cash_transaction
--------------------------------------------------------------------------------
/models/gold/dim_trade.sql:
--------------------------------------------------------------------------------
select
    {{dbt_utils.generate_surrogate_key(['trade_id','t.effective_timestamp'])}} sk_trade_id,
    trade_id,
    trade_status status,
    transaction_type,
    trade_type type,
    executor_name executed_by,
    t.effective_timestamp,
    t.end_timestamp,
    t.IS_CURRENT
from
    {{ ref('trades_history') }} t
--------------------------------------------------------------------------------
/models/work/wrk_company_financials.sql:
--------------------------------------------------------------------------------
select
    sk_company_id,
    f.company_id,
    QUARTER_START_DATE,
    sum(eps) over (
        partition by f.company_id
        order by QUARTER_START_DATE
        rows between 4 preceding and current row
    ) - eps sum_basic_eps
from {{ ref("financials") }} f
join {{ ref("dim_company") }} c
    on f.company_id = c.company_id
    and f.effective_timestamp between c.effective_timestamp and c.end_timestamp
--------------------------------------------------------------------------------
/models/bronze/reference/reference_date.sql:
--------------------------------------------------------------------------------
select
    DATE_VALUE SK_DATE_ID,
    DATE_VALUE,
    DATE_DESC,
    CALENDAR_YEAR_ID,
    CALENDAR_YEAR_DESC,
    CALENDAR_QTR_ID,
    CALENDAR_QTR_DESC,
    CALENDAR_MONTH_ID,
    CALENDAR_MONTH_DESC,
    CALENDAR_WEEK_ID,
    CALENDAR_WEEK_DESC,
    DAY_OF_WEEK_NUM,
    DAY_OF_WEEK_DESC,
    FISCAL_YEAR_ID,
    FISCAL_YEAR_DESC,
    FISCAL_QTR_ID,
    FISCAL_QTR_DESC,
    HOLIDAY_FLAG
from {{ source('reference', 'date') }}
--------------------------------------------------------------------------------
/models/silver/cash_transactions.sql:
--------------------------------------------------------------------------------
with t as (
    select
        ct_ca_id account_id,
        ct_dts transaction_timestamp,
        ct_amt amount,
        ct_name description
    from
        {{ ref('brokerage_cash_transaction') }}
)
select
    a.customer_id,
    t.*
from
    t
join
    {{ ref('accounts') }} a
on
    t.account_id = a.account_id
and
    t.transaction_timestamp between a.effective_timestamp and a.end_timestamp

--------------------------------------------------------------------------------
/models/example/schema.yml:
--------------------------------------------------------------------------------

version: 2

models:
  - name: my_first_dbt_model
    description: "A starter dbt model"
    columns:
      - name: id
        description: "The primary key for this table"
        tests:
          - unique
          - not_null

  - name: my_second_dbt_model
    description: "A starter dbt model"
    columns:
      - name: id
        description: "The primary key for this table"
        tests:
          - unique
          - not_null
--------------------------------------------------------------------------------
/models/gold/fact_cash_transactions.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        *,
        to_date(transaction_timestamp) sk_transaction_date
    from
        {{ ref('cash_transactions') }}
)
select
    sk_customer_id,
    sk_account_id,
    sk_transaction_date,
    transaction_timestamp,
    amount,
    description
from
    s1
join
    {{ ref('dim_account') }} a
on
    s1.account_id = a.account_id
and
    s1.transaction_timestamp between a.effective_timestamp and a.end_timestamp
--------------------------------------------------------------------------------
/models/silver/watches_history.sql:
--------------------------------------------------------------------------------

with s1 as (
    select
        w_c_id customer_id,
        w_s_symb symbol,
        w_dts watch_timestamp,
        case w_action
            when 'ACTV' then 'Activate'
            when 'CNCL' then 'Cancelled'
            else null end action_type
    from
        {{ ref('brokerage_watch_history') }}
)
select
    s1.*,
    company_id,
    company_name,
    exchange_id,
    status security_status
from
    s1
join
    {{ ref('securities') }} s
using (symbol)
--------------------------------------------------------------------------------
/models/silver/holdings_history.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        HH_T_ID trade_id,
        HH_H_T_ID previous_trade_id,
        hh_before_qty previous_quantity,
        hh_after_qty quantity
    from {{ ref('brokerage_holding_history') }}
)
select s1.*,
    ct.account_id account_id,
    ct.symbol symbol,
    ct.create_timestamp,
    ct.close_timestamp,
    ct.trade_price,
    ct.bid_price,
    ct.fee,
    ct.commission
from s1
join {{ ref('trades') }} ct
using (trade_id)

--------------------------------------------------------------------------------
/models/gold/fact_watches.sql:
--------------------------------------------------------------------------------
select
    sk_customer_id,
    sk_security_id,
    to_date(placed_timestamp) sk_date_placed,
    to_date(removed_timestamp) sk_date_removed,
    1 as watch_cnt
from
    {{ ref('watches') }} w
join
    {{ ref('dim_customer') }} c
ON
    w.customer_id = c.customer_id
and
    placed_timestamp between c.effective_timestamp and c.end_timestamp
join
    {{ ref('dim_security') }} s
ON
    w.symbol = s.symbol
and
    placed_timestamp between s.effective_timestamp and s.end_timestamp
--------------------------------------------------------------------------------
/models/gold/dim_account.sql:
--------------------------------------------------------------------------------
SELECT
    {{dbt_utils.generate_surrogate_key(['account_id','a.effective_timestamp'])}} sk_account_id,
    a.account_id,
    sk_broker_id,
    sk_customer_id,
    a.status,
    account_desc,
    tax_status,
    a.effective_timestamp,
    a.end_timestamp,
    a.is_current
from
    {{ ref('accounts') }} a
join
    {{ ref('dim_customer') }} c
    on a.customer_id = c.customer_id
    and a.effective_timestamp between c.effective_timestamp and c.end_timestamp
join
    {{ ref('dim_broker') }} b
using (broker_id)
--------------------------------------------------------------------------------
/models/bronze/finwire/finwire_security.sql:
--------------------------------------------------------------------------------
with s1 as (
    select *,
        try_to_number(co_name_or_cik) as try_cik
    from {{ source("finwire", "sec") }}
)
select
    pts,
    symbol,
    issue_type,
    status,
    name,
    ex_id,
    to_number(sh_out) as sh_out,
    to_date(first_trade_date,'yyyymmdd') as first_trade_date,
    to_date(first_exchange_date,'yyyymmdd') as first_exchange_date,
    cast(dividend as float) as dividend,
    try_cik cik,
    case when try_cik is null then co_name_or_cik else null end company_name
from s1
--------------------------------------------------------------------------------
/models/example/my_first_dbt_model.sql:
--------------------------------------------------------------------------------

/*
    Welcome to your first dbt model!
    Did you know that you can also configure models directly within SQL files?
    This will override configurations stated in dbt_project.yml

    Try changing "table" to "view" below
*/
{{ config(enabled=false) }}
{{ config(materialized='table') }}

with source_data as (

    select 1 as id
    union all
    select null as id

)

select *
from source_data

/*
    Uncomment the line below to remove records with null `id` values
*/

-- where id is not null
--------------------------------------------------------------------------------
/models/silver/trades.sql:
--------------------------------------------------------------------------------
with s1 as (
    select distinct
        trade_id,
        account_id,
        trade_status,
        trade_type,
        transaction_type,
        symbol,
        executor_name,
        quantity,
        bid_price,
        trade_price,
        fee,
        commission,
        tax,
        min(effective_timestamp) over (partition by trade_id) create_timestamp,
        max(effective_timestamp) over (partition by trade_id) close_timestamp
    from
        {{ ref('trades_history') }}
    order by trade_id, create_timestamp
)
select *
from s1
--------------------------------------------------------------------------------
/models/gold/fact_market_history.sql:
--------------------------------------------------------------------------------
SELECT
    s.sk_security_id,
    s.sk_company_id,
    dm_date sk_date_id,
    --dmh.dm_close / sum_basic_eps AS peratio,
    (s.dividend / dmh.dm_close) / 100 yield,
    fifty_two_week_high,
    fifty_two_week_high_date sk_fifty_two_week_high_date,
    fifty_two_week_low,
    fifty_two_week_low_date sk_fifty_two_week_low_date,
    dm_close closeprice,
    dm_high dayhigh,
    dm_low daylow,
    dm_vol volume
FROM {{ ref("daily_market") }} dmh
JOIN {{ ref("dim_security") }} s
    ON s.symbol = dmh.dm_s_symb
    AND dmh.dm_date between s.effective_timestamp and s.end_timestamp
LEFT JOIN {{ ref("wrk_company_financials")}} f
USING (sk_company_id)

--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
name-template: 'v$RESOLVED_VERSION'
tag-template: 'v$RESOLVED_VERSION'
categories:
  - title: '🚀 Features'
    labels:
      - 'feature'
      - 'enhancement'
  - title: '🐛 Bug Fixes'
    labels:
      - 'fix'
      - 'bugfix'
      - 'bug'
  - title: '🧰 Maintenance'
    label: 'chore'
change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks.
version-resolver:
  major:
    labels:
      - 'major'
  minor:
    labels:
      - 'minor'
  patch:
    labels:
      - 'patch'
  default: patch
template: |
  ## Changes

  $CHANGES
--------------------------------------------------------------------------------
/models/gold/dim_security.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        symbol,
        issue_type issue,
        s.status,
        s.name,
        exchange_id,
        sk_company_id,
        shares_outstanding,
        first_trade_date,
        first_exchange_date,
        dividend,
        s.effective_timestamp,
        s.end_timestamp,
        s.IS_CURRENT
    from
        {{ ref("securities") }} s
    join
        {{ ref("dim_company") }} c
    on
        s.company_id = c.company_id
    and
        s.effective_timestamp between c.effective_timestamp and c.end_timestamp
)
select
    {{dbt_utils.generate_surrogate_key(['symbol','effective_timestamp'])}} sk_security_id,
    *
from
    s1
--------------------------------------------------------------------------------
/devrel/multi-record.txt:
--------------------------------------------------------------------------------
20151230-152248FIN201542015100120151230      4880880089.63       2473473307.30       4.82       4.42       0.51       200321223.47       139284472514.02       9402305760.19       512872010       5597926720000001595
20151230-152511SECAAAAAAAAAAAAKVDPREF_AACTVDJBJXyQHLBvn      EEOGAOvUNgL XwrOxQUBMrgPv      AMEX      982113436      1903022619730704      1.200000000254
20151230-163207CMPWWfcsOHprIDIUsPfRLrcLPlxaQ      0000004432ACTVMCA      1873092521088 Vessey Crescent      M5D 1Z1      Winnipeg      AL      United States of AmericaMoreno rlRIDCNz dVGrEzomCXIvZVZzFzxCzbGYIEbAXJMJlsYUQEV
--------------------------------------------------------------------------------
/models/gold/fact_holdings.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        *
    from {{ ref('holdings_history') }}
)
select
    ct.sk_trade_id sk_current_trade_id,
    pt.sk_trade_id,
    sk_customer_id,
    sk_account_id,
    sk_security_id,
    to_date(create_timestamp) sk_trade_date,
    create_timestamp trade_timestamp,
    trade_price current_price,
    quantity current_holding,
    bid_price current_bid_price,
    fee current_fee,
    commission current_commission
from s1
join {{ ref('dim_trade') }} ct
    using (trade_id)
join {{ ref('dim_trade') }} pt
    on s1.previous_trade_id = pt.trade_id
join {{ ref('dim_account') }} a
    on s1.account_id = a.account_id
    and s1.create_timestamp between a.effective_timestamp and a.end_timestamp
join {{ ref('dim_security') }} s
    on s1.symbol = s.symbol
--------------------------------------------------------------------------------
/models/gold/fact_trade.sql:
--------------------------------------------------------------------------------
select
    sk_trade_id,
    sk_broker_id,
    sk_customer_id,
    sk_account_id,
    sk_security_id,
    to_date(create_timestamp) sk_create_date,
    create_timestamp,
    to_date(close_timestamp) sk_close_date,
    close_timestamp,
    executed_by,
    quantity,
    bid_price,
    trade_price,
    fee,
    commission,
    tax
from {{ ref('trades') }} t
join {{ ref('dim_trade') }} dt
    on t.trade_id = dt.trade_id
    and t.create_timestamp between dt.effective_timestamp and dt.end_timestamp
join
    {{ ref('dim_account') }} a
on
    t.account_id = a.account_id
and
    t.create_timestamp between a.effective_timestamp and a.end_timestamp
join
    {{ ref('dim_security') }} s
on
    t.symbol = s.symbol
and
    t.create_timestamp between s.effective_timestamp and s.end_timestamp
--------------------------------------------------------------------------------
/models/bronze/finwire/finwire_financial.sql:
--------------------------------------------------------------------------------

with s1 as (
    select
        *,
        try_to_number(co_name_or_cik) as try_cik
    from {{ source("finwire", "fin") }}
)
select
    pts,
    to_number(year) as year,
    to_number(quarter) as quarter,
    to_date(quarter_start_date,'yyyymmdd') as quarter_start_date,
    to_date(posting_date,'yyyymmdd') as posting_date,
    cast(revenue as float) as revenue,
    cast(earnings as float) as earnings,
    cast(eps as float) as eps,
    cast(diluted_eps as float) as diluted_eps,
    cast(margin as float) as margin,
    cast(inventory as float) as inventory,
    cast(assets as float) as assets,
    cast(liabilities as float) as liabilities,
    to_number(sh_out) as sh_out,
    to_number(diluted_sh_out) as diluted_sh_out,
    try_cik cik,
    case when try_cik is null then co_name_or_cik else null end company_name
from s1
--------------------------------------------------------------------------------
/models/gold/dim_company.sql:
--------------------------------------------------------------------------------
select
    {{dbt_utils.generate_surrogate_key(['company_id','effective_timestamp'])}} sk_company_id,
    company_id,
    status,
    name,
    industry,
    ceo,
    address_line1,
    address_line2,
    postal_code,
    city,
    state_province,
    country,
    description,
    founding_date,
    sp_rating,
    case
        when
            sp_rating in (
                'BB',
                'B',
                'CCC',
                'CC',
                'C',
                'D',
                'BB+',
                'B+',
                'CCC+',
                'BB-',
                'B-',
                'CCC-'
            )
        then true
        else false
    end as is_lowgrade,
    effective_timestamp,
    end_timestamp,
    is_current
from {{ ref("companies") }}
--------------------------------------------------------------------------------
/models/silver/watches.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        customer_id,
        symbol,
        watch_timestamp,
        action_type,
        company_id,
        company_name,
        exchange_id,
        security_status,
        case action_type
            when 'Activate' then watch_timestamp
            else null
        end placed_timestamp,
        case action_type
            when 'Cancelled' then watch_timestamp
            else null
        end removed_timestamp
    from
        {{ ref('watches_history') }}
),
s2 as (
    select
        customer_id,
        symbol,
        company_id,
        company_name,
        exchange_id,
        security_status,
        min(placed_timestamp) placed_timestamp,
        max(removed_timestamp) removed_timestamp
    from s1
    group by all
)
select
    *,
    case
        when removed_timestamp is null then 'Active'
        else 'Inactive'
    end watch_status
from s2

--------------------------------------------------------------------------------
/models/silver/companies.sql:
--------------------------------------------------------------------------------

select
    cik as company_id,
    st.st_name status,
    company_name name,
    ind.in_name industry,
    ceo_name ceo,
    address_line1,
    address_line2,
    postal_code,
    city,
    state_province,
    country,
    description,
    founding_date,
    sp_rating,
    pts as effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(pts) over (
                partition by company_id
                order by
                    pts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by company_id
                order by
                    pts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from {{ ref("finwire_company") }} cmp
join {{ ref("reference_status_type") }} st on cmp.status = st.st_id
join {{ ref("reference_industry") }} ind on cmp.industry_id = ind.in_id
--------------------------------------------------------------------------------
/.github/workflows/release-drafter.yml:
--------------------------------------------------------------------------------
name: Release Drafter

on:
  push:
    # branches to consider in the event; optional, defaults to all
    branches:
      - main
  # pull_request event is required only for autolabeler
  pull_request:
    # Only following types are handled by the action, but one can default to all as well
    types: [opened, reopened, synchronize]
  # pull_request_target event is required for autolabeler to support PRs from forks
  # pull_request_target:
  #   types: [opened, reopened, synchronize]

permissions:
  contents: read

jobs:
  update_release_draft:
    permissions:
      # write permission is required to create a github release
      contents: write
      # write permission is required for autolabeler
      # otherwise, read permission is required at least
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      # Drafts your next Release notes as Pull Requests are merged into "master"
      - uses: release-drafter/release-drafter@v5
        with:
          commitish: main
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--------------------------------------------------------------------------------
/models/silver/securities.sql:
--------------------------------------------------------------------------------
select
    symbol,
    issue_type,
    case s.status
        when 'ACTV' then 'Active'
        when 'INAC' then 'Inactive'
        else null
    end status,
    s.name,
    ex_id exchange_id,
    sh_out shares_outstanding,
    first_trade_date,
    first_exchange_date,
    dividend,
    coalesce(c1.name,c2.name) company_name,
    coalesce(c1.company_id, c2.company_id) company_id,
    pts as effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(pts) over (
                partition by symbol
                order by
                    pts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by symbol
                order by
                    pts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from {{ ref('finwire_security') }} s
left join {{ ref('companies') }} c1
    on s.cik = c1.company_id
    and pts between c1.effective_timestamp and c1.end_timestamp
left join {{ ref('companies') }} c2
    on s.company_name = c2.name
    and pts between c2.effective_timestamp and c2.end_timestamp

--------------------------------------------------------------------------------
/models/silver/daily_market.sql:
--------------------------------------------------------------------------------
with
s1 as (
    select
        -- dm_date,
        min(dm_low) over (
            partition by dm_s_symb
            order by dm_date asc
            rows between 364 preceding and 0 following -- CURRENT ROW
        ) fifty_two_week_low,
        max(dm_high) over (
            partition by dm_s_symb
            order by dm_date asc
            rows between 364 preceding and 0 following -- CURRENT ROW
        ) fifty_two_week_high,
        *
    from {{ ref("brokerage_daily_market") }}
),
s2 as (
    select a.*,
        b.dm_date as fifty_two_week_low_date,
        c.dm_date as fifty_two_week_high_date
    from s1 a
    join
        s1 b
        on a.dm_s_symb = b.dm_s_symb
        and a.fifty_two_week_low = b.dm_low
        and b.dm_date between add_months(a.dm_date, -12) and a.dm_date
    join
        s1 c
        on a.dm_s_symb = c.dm_s_symb
        and a.fifty_two_week_high = c.dm_high
        and c.dm_date between add_months(a.dm_date, -12) and a.dm_date
)
select *
from s2
qualify
    row_number() over (
        partition by dm_s_symb, dm_date
        order by fifty_two_week_low_date, fifty_two_week_high_date
    ) = 1
--------------------------------------------------------------------------------
/models/silver/financials.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        YEAR,
        QUARTER,
        QUARTER_START_DATE,
        POSTING_DATE,
        REVENUE,
        EARNINGS,
        EPS,
        DILUTED_EPS,
        MARGIN,
        INVENTORY,
        ASSETS,
        LIABILITIES,
        SH_OUT,
        DILUTED_SH_OUT,
        coalesce(c1.name,c2.name) company_name,
        coalesce(c1.company_id, c2.company_id) company_id,
        pts as effective_timestamp
    from {{ ref('finwire_financial') }} s
    left join {{ ref('companies') }} c1
        on s.cik = c1.company_id
        and pts between c1.effective_timestamp and c1.end_timestamp
    left join {{ ref('companies') }} c2
        on s.company_name = c2.name
        and pts between c2.effective_timestamp and c2.end_timestamp
)
select
    *,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(effective_timestamp) over (
                partition by company_id
                order by
                    effective_timestamp desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by company_id
                order by
                    effective_timestamp desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from s1
--------------------------------------------------------------------------------
/models/silver/trades_history.sql:
--------------------------------------------------------------------------------
select
    t_id trade_id,
    t_dts trade_timestamp,
    t_ca_id account_id,
    ts.st_name trade_status,
    tt_name trade_type,
    case t_is_cash
        when true then 'Cash'
        when false then 'Margin'
    end transaction_type,
    t_s_symb symbol,
    t_exec_name executor_name,
    t_qty quantity,
    t_bid_price bid_price,
    t_trade_price trade_price,
    t_chrg fee,
    t_comm commission,
    t_tax tax,
    us.st_name update_status,
    th_dts effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(th_dts) over (
                partition by t_id
                order by
                    th_dts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by t_id
                order by
                    th_dts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from
    {{ ref('brokerage_trade') }}
join
    {{ ref('brokerage_trade_history') }}
on
    t_id = th_t_id
join
    {{ ref('reference_trade_type') }}
on
    t_tt_id = tt_id
join
    {{ ref('reference_status_type') }} ts
on
    t_st_id = ts.st_id
join
    {{ ref('reference_status_type') }} us
on th_st_id = us.st_id
--------------------------------------------------------------------------------
/dbt_project.yml:
--------------------------------------------------------------------------------

# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: 'dbt_tpcdi'
version: '1.0.0'
config-version: 2

# This setting configures which "profile" dbt uses for this project.
profile: 'tpcdi'

# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that models in this project can be
# found in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

clean-targets: # directories to be removed by `dbt clean`
  - "target"
  - "dbt_packages"

# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models

models:
  dbt_tpcdi:
    example:
      +materialized: view
    bronze:
      +schema: bronze
      +materialized: dynamic_table
      +snowflake_warehouse: tpcdi_large
      +target_lag: downstream
    silver:
      +schema: silver
      +materialized: dynamic_table
      +snowflake_warehouse: tpcdi_large
      +target_lag: '10 minutes'
    gold:
      +schema: gold
      +materialized: dynamic_table
      +snowflake_warehouse: tpcdi_large
      +target_lag: '20 minutes'
    work:
      +schema: work
      +materialized: ephemeral
--------------------------------------------------------------------------------
/models/silver/accounts.sql:
--------------------------------------------------------------------------------
select
    action_type,
    decode(action_type,
        'NEW','Active',
        'ADDACCT','Active',
        'UPDACCT','Active',
        'CLOSEACCT','Inactive') status,
    ca_id account_id,
    ca_name account_desc,
    c_id customer_id,
    c_tax_id tax_id,
    c_gndr gender,
    c_tier tier,
    c_dob dob,
    c_l_name last_name,
    c_f_name first_name,
    c_m_name middle_name,
    c_adline1 address_line1,
    c_adline2 address_line2,
    c_zipcode postal_code,
    c_city city,
    c_state_prov state_province,
    c_ctry country,
    c_prim_email primary_email,
    c_alt_email alternate_email,
    c_phone_1 phone1,
    c_phone_2 phone2,
    c_phone_3 phone3,
    c_lcl_tx_id local_tax_rate_name,
    ltx.tx_rate local_tax_rate,
    c_nat_tx_id national_tax_rate_name,
    ntx.tx_rate national_tax_rate,
    ca_tax_st tax_status,
    ca_b_id broker_id,
    action_ts as effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(action_ts) over (
                partition by ca_id
                order by
                    action_ts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by ca_id
                order by
                    action_ts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from
    {{ ref('crm_customer_mgmt') }} c
left join
    {{ ref('reference_tax_rate') }} ntx
on
    c.c_nat_tx_id = ntx.tx_id
left join
    {{ ref('reference_tax_rate') }} ltx
on
    c.c_lcl_tx_id = ltx.tx_id
where ca_id is not null
--------------------------------------------------------------------------------
/models/silver/customers.sql:
--------------------------------------------------------------------------------
select
    action_type,
    decode(action_type,
        'NEW','Active',
        'ADDACCT','Active',
        'UPDACCT','Active',
        'UPDCUST','Active',
        'INACT','Inactive') status,
    c_id customer_id,
    ca_id account_id,
    c_tax_id tax_id,
    c_gndr gender,
    c_tier tier,
    c_dob dob,
    c_l_name last_name,
    c_f_name first_name,
    c_m_name middle_name,
    c_adline1 address_line1,
    c_adline2 address_line2,
    c_zipcode postal_code,
    c_city city,
    c_state_prov state_province,
    c_ctry country,
    c_prim_email primary_email,
    c_alt_email alternate_email,
    c_phone_1 phone1,
    c_phone_2 phone2,
    c_phone_3 phone3,
    c_lcl_tx_id local_tax_rate_name,
    ltx.tx_rate local_tax_rate,
    c_nat_tx_id national_tax_rate_name,
    ntx.tx_rate national_tax_rate,
    ca_tax_st account_tax_status,
    ca_b_id broker_id,
    action_ts as effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(action_ts) over (
                partition by c_id
                order by
                    action_ts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by c_id
                order by
                    action_ts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from
    {{ ref('crm_customer_mgmt') }} c
left join
    {{ ref('reference_tax_rate') }} ntx
on
    c.c_nat_tx_id = ntx.tx_id
left join
    {{ ref('reference_tax_rate') }} ltx
on
    c.c_lcl_tx_id = ltx.tx_id
where action_type in ('NEW', 'INACT', 'UPDCUST')
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
name: tpcdi
channels:
  - defaults
dependencies:
  - abseil-cpp=20211102.0=hc377ac9_0
  - arrow-cpp=10.0.1=py38h0fd5f32_0
  - asn1crypto=1.5.1=py38hca03da5_0
  - aws-c-common=0.6.8=h80987f9_1
  - aws-c-event-stream=0.1.6=h313beb8_6
  - aws-checksums=0.1.11=h80987f9_2
  - aws-sdk-cpp=1.8.185=h4a942e0_0
  - blas=1.0=openblas
  - boost-cpp=1.73.0=h1a28f6b_12
  - brotli=1.0.9=h1a28f6b_7
  - brotli-bin=1.0.9=h1a28f6b_7
  - brotlipy=0.7.0=py38h1a28f6b_1002
  - bzip2=1.0.8=h620ffc9_4
  - c-ares=1.19.0=h80987f9_0
  - ca-certificates=2023.05.30=hca03da5_0
  - certifi=2023.5.7=py38hca03da5_0
  - cffi=1.15.1=py38h80987f9_3
  - charset-normalizer=2.0.4=pyhd3eb1b0_0
  - cloudpickle=2.0.0=pyhd3eb1b0_0
  - cryptography=39.0.1=py38h834c97f_0
  - filelock=3.9.0=py38hca03da5_0
  - gflags=2.2.2=hc377ac9_0
  - glog=0.5.0=hc377ac9_0
  - gmp=6.2.1=hc377ac9_3
  - grpc-cpp=1.46.1=h8e4afa7_1
  - icu=68.1=hc377ac9_0
  - idna=3.4=py38hca03da5_0
  - krb5=1.20.1=h8380606_1
  - libboost=1.73.0=h49e8a49_12
  - libbrotlicommon=1.0.9=h1a28f6b_7
  - libbrotlidec=1.0.9=h1a28f6b_7
  - libbrotlienc=1.0.9=h1a28f6b_7
  - libcurl=8.1.1=h0f1d93c_1
  - libcxx=14.0.6=h848a8c0_0
  - libedit=3.1.20221030=h80987f9_0
  - libev=4.33=h1a28f6b_1
  - libevent=2.1.12=hf27765b_0
  - libffi=3.4.4=hca03da5_0
  - libgfortran=5.0.0=11_3_0_hca03da5_28
  - libgfortran5=11.3.0=h009349e_28
  - libiconv=1.16=h1a28f6b_2
  - libnghttp2=1.52.0=h10c0552_1
  - libopenblas=0.3.21=h269037a_0
  - libprotobuf=3.20.3=h514c7bf_0
  - libssh2=1.10.0=h449679c_2
  - libthrift=0.13.0=hd358383_6
  - llvm-openmp=14.0.6=hc6e5704_0
  - lz4-c=1.9.4=h313beb8_0
  - ncurses=6.4=h313beb8_0
  - numpy=1.24.3=py38h1398885_0
  - numpy-base=1.24.3=py38h90707a3_0
  - openssl=1.1.1u=h1a28f6b_0
  - orc=1.7.4=hdca1487_1
  - oscrypto=1.2.1=pyhd3eb1b0_0
  - packaging=23.0=py38hca03da5_0
  - pip=23.1.2=py38hca03da5_0
  - pyarrow=10.0.1=py38h23c13bc_0
  - pycparser=2.21=pyhd3eb1b0_0
  - pycryptodomex=3.15.0=py38h1a28f6b_0
  - pyjwt=2.4.0=py38hca03da5_0
  - pyopenssl=23.0.0=py38hca03da5_0
  - pysocks=1.7.1=py38hca03da5_0
  - python=3.8.17=hc0d8a6c_0
  - pytz=2022.7=py38hca03da5_0
  - re2=2022.04.01=hc377ac9_0
  - readline=8.2=h1a28f6b_0
  - requests=2.29.0=py38hca03da5_0
  - setuptools=67.8.0=py38hca03da5_0
  - snappy=1.1.9=hc377ac9_0
  - snowflake-connector-python=3.0.3=py38h78102c4_0
  - snowflake-snowpark-python=1.5.1=py38hca03da5_0
  - sqlite=3.41.2=h80987f9_0
  - tk=8.6.12=hb8d0fd4_0
  - typing-extensions=4.6.3=py38hca03da5_0
  - typing_extensions=4.6.3=py38hca03da5_0
  - urllib3=1.26.16=py38hca03da5_0
  - utf8proc=2.6.1=h1a28f6b_0
  - wheel=0.38.4=py38hca03da5_0
  - xz=5.4.2=h80987f9_0
  - zlib=1.2.13=h5a0b063_0
  - zstd=1.5.5=hd90d995_0
  - pip:
      - click==8.1.6
      - markdown-it-py==3.0.0
      - mdurl==0.1.2
      - pygments==2.15.1
      - rich==13.4.2
      - typer==0.9.0
prefix: /opt/homebrew/Caskroom/miniconda/base/envs/tpcdi
--------------------------------------------------------------------------------
/models/gold/dim_customer.sql:
--------------------------------------------------------------------------------
with s1 as (
    select c.*,
        p.agency_id,
        p.credit_rating,
        p.net_worth
    FROM {{ ref('customers') }} c
    left join {{ ref('syndicated_prospect') }} p
        using (first_name, last_name, postal_code, address_line1, address_line2)
),
s2 as (
    SELECT
        {{dbt_utils.generate_surrogate_key(['customer_id','effective_timestamp'])}} sk_customer_id,
        customer_id,
        coalesce(tax_id, last_value(tax_id) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) tax_id,
        status,
        coalesce(last_name, last_value(last_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) last_name,
        coalesce(first_name, last_value(first_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) first_name,
        coalesce(middle_name, last_value(middle_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) middleinitial,
        coalesce(gender, last_value(gender) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) gender,
        coalesce(tier, last_value(tier) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) tier,
        coalesce(dob, last_value(dob) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) dob,
        coalesce(address_line1, last_value(address_line1) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) address_line1,
        coalesce(address_line2, last_value(address_line2) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) address_line2,
        coalesce(postal_code, last_value(postal_code) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) postal_code,
        coalesce(CITY, last_value(CITY) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) CITY,
        coalesce(state_province, last_value(state_province) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) state_province,
        coalesce(country, last_value(country) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) country,
        coalesce(phone1, last_value(phone1) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) phone1,
        coalesce(phone2, last_value(phone2) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) phone2,
        coalesce(phone3, last_value(phone3) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) phone3,
        coalesce(primary_email, last_value(primary_email) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) primary_email,
        coalesce(alternate_email, last_value(alternate_email) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) alternate_email,
        coalesce(local_tax_rate_name, last_value(local_tax_rate_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) local_tax_rate_name,
        coalesce(local_tax_rate, last_value(local_tax_rate) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) local_tax_rate,
        coalesce(national_tax_rate_name, last_value(national_tax_rate_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) national_tax_rate_name,
        coalesce(national_tax_rate, last_value(national_tax_rate) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) national_tax_rate,
        agency_id,
        credit_rating,
        net_worth,
        effective_timestamp,
        end_timestamp,
        is_current
    FROM s1
)
select *
from s2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# What is this?
This project contains two distinct portions:
1. A CLI developed using [Snowpark Python](https://docs.snowflake.com/en/developer-guide/snowpark/python/index) and [Typer](https://typer.tiangolo.com/). This CLI creates tables in Snowflake for the files generated by the [TPC-DI](https://www.tpc.org/tpcdi/) benchmark specification.
2. A [dbt](https://www.getdbt.com/) project using [Snowflake Dynamic Tables](https://docs.snowflake.com/en/user-guide/dynamic-tables-about) for building the data warehouse defined in the TPC-DI specification.


![Figure 1.2–1 from the TPC-DI specification describes the ETL process.](images/tpc-di-etl-diagram.png)

I should be clear about my goal here: I am not attempting to _actually run and measure_ the TPC-DI benchmark.
The `tpcdi.py` CLI in this repository is single-threaded and ingests the data sequentially, which would be the worst approach if trying to optimize for a benchmark.
Instead, I needed a dataset that could be used to mimic data engineering workloads inside of Snowflake, so I just wanted the files loaded.


![Figure 1.4–1 from the TPC-DI specification describes the target logical model. More on DimTrade later.](images/tpc-di-logical-model.png)


I took a few liberties with the TPC-DI specification to update it a bit for Snowflake. I replaced `CamelCase` names with `SNAKE_CASE`, mostly out of irritation with their readability.
Secondly, I just couldn't stand having the `DimTrade` table be "both a dimension table and a fact table, depending on how it is being used" as it was designed by TPC-DI.
That decision was made during an era when storage and compute were constrained, so in my design, I created both `DIM_TRADE` and `FACT_TRADE` tables.
Finally, I used a Medallion Lakehouse Architecture with Bronze, Silver, and Gold zones, with the logical model above materialized in the Gold zone.

# Demo Notebook
I've included a [demo Jupyter notebook](demo.ipynb) that walks through a subset of the instructions below.
This notebook pairs with a presentation I give that focuses more on how Snowpark Python was used in the CLI, as well as the motivation for using Dynamic Tables with dbt. But it may still be helpful.

# Using DIGen.jar to Generate Source Files
The Java program to generate the source files is downloaded by [filling out a form on the TPC-DI website](https://www.tpc.org/TPC_Documents_Current_Versions/download_programs/tools-download-request5.asp?bm_type=TPC-DI&bm_vers=1.1.0&mode=CURRENT-ONLY) and clicking a link in an email. Once unzipped, we have to make one slight change for running on macOS:

```bash
# The PDGF directory was clearly written on a case-insensitive OS.
unzip 66a2b600-af36-4198-bfbc-c94c40cc22af-tpc-di-tool.zip && \
mv Tools/PDGF Tools/pdgf && \
cd Tools
```

I couldn't find any way to execute the provided JAR with a Java version newer than `1.8`. I spent some time trying to rebuild the JAR file using a newer Java version for posterity, but it was a fool's errand.
I installed [Azul Zulu Java 1.8](https://www.azul.com/downloads/?package=jdk#zulu) and used [jEnv](https://www.jenv.be/) to set a local version, and with that, we can see the help context from the JAR:

```
jenv add /Library/Java/JavaVirtualMachines/zulu-8.jdk/Contents/Home && \
jenv local 1.8 && \
java -jar DIGen.jar --help

usage: DIGen
 -h         print this message
 -jvm       JVM options. E.g. -jvm "-Xms1g -Xmx2g"
 -o         Specify output directory. Default is output.
 -sf        Scale factor. Default value is 5. (range: 3 - 2147483647
 -v         print DIGen version
```

This utility will generate a bunch of different data files in various formats, with a scaling (or multiplication) factor determining how much data the files contain.
It mimics the ETL processes of the era when the specification was written, which generally relied on file extracts from source systems. It generates CSVs and pipe-separated files (PSVs?), which are quite simple to load with Snowpark.
The two file formats that proved the most fun and challenging were fixed-width fields and XML, as both required heavy DataFrame transformations. The files are generated in batches, with `Batch1` representing the historical load, and `Batch2` and `Batch3` representing various incremental loads.
Currently, I've only tested the loader against `Batch1`, and the dbt models have not yet been extended to handle additional batches.
Also, I haven't yet implemented the Audit portion of the specification, which is somewhat embarrassing as a former co-founder of a data quality company.
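
Since the fixed-width FINWIRE files come up again below, here is a minimal sketch of how one record splits apart -- an illustration only, not code from `tpcdi.py`. Every line leads with a 15-character posting timestamp (PTS) and a 3-character record type, and the layout of the remaining fields depends entirely on that type (the sample lines in `devrel/multi-record.txt` show all three types):

```python
# A minimal sketch of the FINWIRE fixed-width layout -- not the actual loader code,
# which does the equivalent slicing with Snowpark DataFrame expressions.
# The sample line is the first record from devrel/multi-record.txt, truncated here.
line = "20151230-152248FIN201542015100120151230      4880880089.63"

pts = line[0:15]        # posting timestamp: '20151230-152248'
rec_type = line[15:18]  # 'CMP', 'SEC', or 'FIN' -- selects the remaining layout
payload = line[18:]     # type-specific fixed-width fields

print(pts, rec_type, payload[:5])  # 20151230-152248 FIN 20154
```

With that detour noted, let's generate the files: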

```bash
java -jar DIGen.jar -o ~/dev/tpcdi-output -sf 10 && \
ls -lhtr ~/dev/tpcdi-output/Batch1

-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 StatusType_audit.csv
-rw-r--r--  1 stewartbryson  staff    89B Jul 21 14:30 StatusType.txt
-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 TaxRate_audit.csv
-rw-r--r--  1 stewartbryson  staff    17K Jul 21 14:30 TaxRate.txt
-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 Date_audit.csv
-rw-r--r--  1 stewartbryson  staff   3.3M Jul 21 14:30 Date.txt
-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 Time_audit.csv
-rw-r--r--  1 stewartbryson  staff   4.6M Jul 21 14:30 Time.txt
-rw-r--r--  1 stewartbryson  staff    12B Jul 21 14:30 BatchDate.txt
-rw-r--r--  1 stewartbryson  staff   3.9M Jul 21 14:30 HR.csv
-rw-r--r--  1 stewartbryson  staff    84B Jul 21 14:30 HR_audit.csv
-rw-r--r--  1 stewartbryson  staff   391B Jul 21 14:30 CustomerMgmt_audit.csv
-rw-r--r--  1 stewartbryson  staff    30M Jul 21 14:30 CustomerMgmt.xml
-rw-r--r--  1 stewartbryson  staff    10M Jul 21 14:30 Prospect.csv
-rw-r--r--  1 stewartbryson  staff   141B Jul 21 14:30 Prospect_audit.csv
-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 Industry_audit.csv
-rw-r--r--  1 stewartbryson  staff   2.6K Jul 21 14:30 Industry.txt
-rw-r--r--  1 stewartbryson  staff    81K Jul 21 14:30 FINWIRE1967Q1
-rw-r--r--  1 stewartbryson  staff   219B Jul 21 14:30 FINWIRE1967Q1_audit.csv
-rw-r--r--  1 stewartbryson  staff    77K Jul 21 14:30 FINWIRE1967Q2

{ Truncated }
```

# The Python Snowpark Loader: tpcdi.py
I used Conda to build my virtual environment, with all the steps to replicate in the snippet below:

```bash
conda env create -f environment.yml && \
conda activate tpcdi && \
python tpcdi.py --help
```

![Output from `tpcdi.py --help`](images/tpcdi-help.png)

I created the loader using Typer for the CLI interface and Snowpark for uploading files, creating DataFrames from those files -- in some cases doing heavy transformations -- and saving them as tables.
Credentials are provided using a `credentials.json` file in the root directory, which looks like this:

```json
{
    "account": "myaccount",
    "user": "myuser",
    "password": "mypassword",
    "role": "myrole",
    "warehouse": "stewart_dev",
    "database": "tpc_di",
    "schema": "digen"
}
```
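
To show how those pieces fit together, here is a heavily simplified sketch of the loader's shape -- a Typer command that builds a Snowpark session from that `credentials.json`, stages a file, and saves a DataFrame as a table. This is an illustration with the logic reduced to the `StatusType.txt` case, not the actual `tpcdi.py` implementation:

```python
# A simplified sketch of the loader's shape -- not the actual tpcdi.py.
# Typer turns process_files() into the `process-files` command shown below.
import json

import typer
from snowflake.snowpark import Session
from snowflake.snowpark.types import StringType, StructField, StructType

app = typer.Typer()


@app.command()
def process_files(
    output_directory: str = typer.Option(...),
    file_name: str = "StatusType.txt",
    overwrite: bool = False,
) -> None:
    # Build a session from the credentials.json shown above.
    with open("credentials.json") as creds:
        session = Session.builder.configs(json.load(creds)).create()

    # Stage the file; with overwrite=False, Snowpark skips files already staged.
    for result in session.file.put(
        f"{output_directory}/Batch1/{file_name}", "@~", overwrite=overwrite
    ):
        print(f"File {file_name}: {result.status}")

    # Read the pipe-delimited file (PUT gzips it by default) and save it as a table.
    schema = StructType([
        StructField("ST_ID", StringType()),
        StructField("ST_NAME", StringType()),
    ])
    df = session.read.schema(schema).option("field_delimiter", "|").csv(f"@~/{file_name}.gz")
    df.write.mode("overwrite").save_as_table("STATUS_TYPE")


if __name__ == "__main__":
    app()
```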
56 | 57 | ```bash 58 | java -jar DIGen.jar -o ~/dev/tpcdi-output -sf 10 && \ 59 | ls -lhtr ~/dev/tpcdi-output/Batch1 60 | 61 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 StatusType_audit.csv 62 | -rw-r--r-- 1 stewartbryson staff 89B Jul 21 14:30 StatusType.txt 63 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 TaxRate_audit.csv 64 | -rw-r--r-- 1 stewartbryson staff 17K Jul 21 14:30 TaxRate.txt 65 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 Date_audit.csv 66 | -rw-r--r-- 1 stewartbryson staff 3.3M Jul 21 14:30 Date.txt 67 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 Time_audit.csv 68 | -rw-r--r-- 1 stewartbryson staff 4.6M Jul 21 14:30 Time.txt 69 | -rw-r--r-- 1 stewartbryson staff 12B Jul 21 14:30 BatchDate.txt 70 | -rw-r--r-- 1 stewartbryson staff 3.9M Jul 21 14:30 HR.csv 71 | -rw-r--r-- 1 stewartbryson staff 84B Jul 21 14:30 HR_audit.csv 72 | -rw-r--r-- 1 stewartbryson staff 391B Jul 21 14:30 CustomerMgmt_audit.csv 73 | -rw-r--r-- 1 stewartbryson staff 30M Jul 21 14:30 CustomerMgmt.xml 74 | -rw-r--r-- 1 stewartbryson staff 10M Jul 21 14:30 Prospect.csv 75 | -rw-r--r-- 1 stewartbryson staff 141B Jul 21 14:30 Prospect_audit.csv 76 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 Industry_audit.csv 77 | -rw-r--r-- 1 stewartbryson staff 2.6K Jul 21 14:30 Industry.txt 78 | -rw-r--r-- 1 stewartbryson staff 81K Jul 21 14:30 FINWIRE1967Q1 79 | -rw-r--r-- 1 stewartbryson staff 219B Jul 21 14:30 FINWIRE1967Q1_audit.csv 80 | -rw-r--r-- 1 stewartbryson staff 77K Jul 21 14:30 FINWIRE1967Q2 81 | 82 | { Truncated } 83 | ``` 84 | 85 | # The Python Snowpark Loader: tpcdi.py 86 | I used Conda to build my virtual environment, with all the steps to replicate in the snippet below: 87 | 88 | ```bash 89 | conda env create -f environment.yml && \ 90 | conda activate tpcdi && \ 91 | python tpcdi.py --help 92 | ``` 93 | 94 | ![Output from `tpcdi.py --help`](images/tpcdi-help.png) 95 | 96 | I created the loader using Typer for the CLI interface and Snowpark for uploading files, creating DataFrames from those files -- in some cases doing heavy transformations -- and saving them as tables. 97 | Credentials are provided using a credentials.json file in the root directory, and looks like this: 98 | 99 | ```json 100 | { 101 | "account": "myaccount", 102 | "user": "myuser", 103 | "password": "mypassword", 104 | "role": "myrole", 105 | "warehouse": "stewart_dev", 106 | "database": "tpc_di", 107 | "schema": "digen" 108 | } 109 | ``` 110 | 111 | Some improvements could be made to the credentials, schema, and database handling. 112 | The utility writes the data to whatever database and schema is specified in the credentials, so those aspects of the connection are required. 113 | Loading the files is accomplished with the process-files command and we can see the help context below: 114 | 115 | ``` 116 | ❯ python tpcdi.py process-files --help 117 | ``` 118 | 119 | ![Output from `tpcdi.py process-files --help`](images/tpcdi-process-files-help.png) 120 | 121 | Let's start by loading a rather simple file - the `StatusType.txt`  which is pipe-delimited. 122 | I'll first demonstrate the `--show` option, which displays a sample of the DataFrame instead of loading it to a table. 123 | As you might have guessed, I added this option to aid in the development of the loader. 
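Under the hood, `--show` is just a branch around the DataFrame writer. Here's the relevant logic, condensed from the `save_df` helper in `tpcdi.py` further down:

```python
from snowflake.snowpark import DataFrame

def save_df(df: DataFrame, table_name: str, show: bool = False):
    if show:
        # development aid: print a sample instead of writing the table
        df.show()
    else:
        df.write.mode("overwrite").save_as_table(table_name)
        print(f"{table_name.upper()} table created.")
```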
124 | Then I'll demonstrate loading the table: 125 | 126 | ``` 127 | ❯ python tpcdi.py process-files --output-directory ~/dev/tpcdi-output \ 128 | --file-name StatusType.txt --show 129 | 130 | File StatusType.txt: UPLOADED 131 | ----------------------- 132 | |"ST_ID" |"ST_NAME" | 133 | ----------------------- 134 | |ACTV |Active | 135 | |CMPT |Completed | 136 | |CNCL |Canceled | 137 | |PNDG |Pending | 138 | |SBMT |Submitted | 139 | |INAC |Inactive | 140 | ----------------------- 141 | 142 | ❯ python tpcdi.py process-files --output-directory ~/dev/tpcdi-output \ 143 | --file-name StatusType.txt 144 | 145 | File StatusType.txt: SKIPPED 146 | STATUS_TYPE table created. 147 | 148 | ❯ snowsql -c demo 149 | * SnowSQL * v1.2.27 150 | Type SQL statements or !help 151 | stewartbryson#STEWART_DEV@TPC_DI.DIGEN>select * from STATUS_TYPE; 152 | +-------+-----------+ 153 | | ST_ID | ST_NAME | 154 | |-------+-----------| 155 | | ACTV | Active | 156 | | CMPT | Completed | 157 | | CNCL | Canceled | 158 | | PNDG | Pending | 159 | | SBMT | Submitted | 160 | | INAC | Inactive | 161 | +-------+-----------+ 162 | 6 Row(s) produced. Time Elapsed: 0.752s 163 | stewartbryson#STEWART_DEV@TPC_DI.DIGEN> 164 | ``` 165 | 166 | Notice that the second execution used Snowpark's `overwrite=False` feature of skipping already existing files during a put, which can be overridden with the `--overwrite` option. 167 | For now, let's get the rest of the files loaded so we can move on to the dbt models. 168 | All DataFrames are saved in `overwrite` mode, so we can run it again without duplicating data: 169 | 170 | ```bash 171 | ❯ python tpcdi.py process-files --output-directory ~/dev/tpcdi-output 172 | 173 | File Date.txt: UPLOADED 174 | DATE table created. 175 | File DailyMarket.txt: UPLOADED 176 | DAILY_MARKET table created. 177 | File Industry.txt: UPLOADED 178 | INDUSTRY table created. 179 | File Prospect.csv: UPLOADED 180 | PROSPECT table created. 181 | File CustomerMgmt.xml: UPLOADED 182 | CUSTOMER_MGMT table created. 183 | File TaxRate.txt: UPLOADED 184 | TAX_RATE table created. 185 | File HR.csv: UPLOADED 186 | HR table created. 187 | File WatchHistory.txt: UPLOADED 188 | WATCH_HISTORY table created. 189 | File Trade.txt: UPLOADED 190 | TRADE table created. 191 | File TradeHistory.txt: UPLOADED 192 | TRADE_HISTORY table created. 193 | File StatusType.txt: SKIPPED 194 | STATUS_TYPE table created. 195 | 196 | { Truncated } 197 | ``` 198 | 199 | # Building the Data Warehouse with dbt 200 | I could have also used Snowpark Python to build the transformation pipelines as well, but instead, I wanted to use Dynamic Tables, and the ability to easily do this using dbt made the decision easy. 201 | 202 | In the Medallion architecture, we typically append raw data in their original format into Bronze, business entities modeled in Silver, and our highly curated facts and dimensions in Gold. 203 | I'm loading with an x-small warehouse and 4 threads, with a `DIGen.jar` scaling factor of 10. 204 | The dbt DAG looks like this: 205 | 206 | ![dbt DAG](images/dbt-dag.png) 207 | 208 | ```bash 209 | ❯ dbt build 210 | 16:55:34 Running with dbt=1.7.2 211 | 16:55:34 Registered adapter: snowflake=1.7.0 212 | 16:55:34 Found 45 models, 1 test, 17 sources, 0 exposures, 0 metrics, 544 macros, 0 groups, 0 semantic models 213 | 16:55:34 214 | 16:55:36 Concurrency: 20 threads (target='dev') 215 | 16:55:36 216 | 16:55:36 1 of 45 START sql dynamic_table model dl_bronze.brokerage_cash_transaction ..... 
[RUN] 217 | 16:55:36 2 of 45 START sql dynamic_table model dl_bronze.brokerage_daily_market ......... [RUN] 218 | 16:55:36 3 of 45 START sql dynamic_table model dl_bronze.brokerage_holding_history ...... [RUN] 219 | 16:55:36 4 of 45 START sql dynamic_table model dl_bronze.brokerage_trade ................ [RUN] 220 | 16:55:36 5 of 45 START sql dynamic_table model dl_bronze.brokerage_trade_history ........ [RUN] 221 | 16:55:36 6 of 45 START sql dynamic_table model dl_bronze.brokerage_watch_history ........ [RUN] 222 | 16:55:36 7 of 45 START sql dynamic_table model dl_bronze.crm_customer_mgmt .............. [RUN] 223 | 16:55:36 8 of 45 START sql dynamic_table model dl_bronze.finwire_company ................ [RUN] 224 | 16:55:36 9 of 45 START sql dynamic_table model dl_bronze.finwire_financial .............. [RUN] 225 | 16:55:36 10 of 45 START sql dynamic_table model dl_bronze.finwire_security .............. [RUN] 226 | 16:55:36 11 of 45 START sql dynamic_table model dl_bronze.hr_employee ................... [RUN] 227 | 16:55:36 12 of 45 START sql dynamic_table model dl_bronze.reference_date ................ [RUN] 228 | 16:55:36 13 of 45 START sql dynamic_table model dl_bronze.reference_industry ............ [RUN] 229 | 16:55:36 14 of 45 START sql dynamic_table model dl_bronze.reference_status_type ......... [RUN] 230 | 16:55:36 15 of 45 START sql dynamic_table model dl_bronze.reference_tax_rate ............ [RUN] 231 | 16:55:36 16 of 45 START sql dynamic_table model dl_bronze.reference_trade_type .......... [RUN] 232 | 16:55:36 17 of 45 START sql dynamic_table model dl_bronze.syndicated_prospect ........... [RUN] 233 | 16:55:38 13 of 45 OK created sql dynamic_table model dl_bronze.reference_industry ....... [SUCCESS 1 in 2.54s] 234 | 16:55:39 12 of 45 OK created sql dynamic_table model dl_bronze.reference_date ........... [SUCCESS 1 in 2.85s] 235 | 16:55:39 18 of 45 START sql dynamic_table model dl_silver.date .......................... [RUN] 236 | 16:55:39 14 of 45 OK created sql dynamic_table model dl_bronze.reference_status_type .... [SUCCESS 1 in 3.09s] 237 | 16:55:39 15 of 45 OK created sql dynamic_table model dl_bronze.reference_tax_rate ....... [SUCCESS 1 in 3.09s] 238 | 16:55:39 16 of 45 OK created sql dynamic_table model dl_bronze.reference_trade_type ..... [SUCCESS 1 in 3.21s] 239 | 16:55:39 9 of 45 OK created sql dynamic_table model dl_bronze.finwire_financial ......... [SUCCESS 1 in 3.57s] 240 | 16:55:40 8 of 45 OK created sql dynamic_table model dl_bronze.finwire_company ........... [SUCCESS 1 in 4.08s] 241 | 16:55:40 11 of 45 OK created sql dynamic_table model dl_bronze.hr_employee .............. [SUCCESS 1 in 4.08s] 242 | 16:55:40 19 of 45 START sql dynamic_table model dl_silver.companies ..................... [RUN] 243 | 16:55:40 20 of 45 START sql dynamic_table model dl_silver.employees ..................... [RUN] 244 | 16:55:40 10 of 45 OK created sql dynamic_table model dl_bronze.finwire_security ......... [SUCCESS 1 in 4.18s] 245 | 16:55:40 7 of 45 OK created sql dynamic_table model dl_bronze.crm_customer_mgmt ......... [SUCCESS 1 in 4.32s] 246 | 16:55:40 21 of 45 START sql dynamic_table model dl_silver.accounts ...................... [RUN] 247 | 16:55:40 22 of 45 START sql dynamic_table model dl_silver.customers ..................... [RUN] 248 | 16:55:41 18 of 45 OK created sql dynamic_table model dl_silver.date ..................... [SUCCESS 1 in 2.45s] 249 | 16:55:41 23 of 45 START sql dynamic_table model dl_gold.dim_date ........................ 
[RUN] 250 | 16:55:41 17 of 45 OK created sql dynamic_table model dl_bronze.syndicated_prospect ...... [SUCCESS 1 in 5.55s] 251 | 16:55:42 1 of 45 OK created sql dynamic_table model dl_bronze.brokerage_cash_transaction [SUCCESS 1 in 6.55s] 252 | 16:55:43 21 of 45 OK created sql dynamic_table model dl_silver.accounts ................. [SUCCESS 1 in 2.52s] 253 | 16:55:43 24 of 45 START sql dynamic_table model dl_silver.cash_transactions ............. [RUN] 254 | 16:55:43 19 of 45 OK created sql dynamic_table model dl_silver.companies ................ [SUCCESS 1 in 2.77s] 255 | 16:55:43 26 of 45 START sql dynamic_table model dl_silver.financials .................... [RUN] 256 | 16:55:43 25 of 45 START sql dynamic_table model dl_gold.dim_company ..................... [RUN] 257 | 16:55:43 27 of 45 START sql dynamic_table model dl_silver.securities .................... [RUN] 258 | 16:55:44 22 of 45 OK created sql dynamic_table model dl_silver.customers ................ [SUCCESS 1 in 3.64s] 259 | 16:55:44 28 of 45 START sql dynamic_table model dl_gold.dim_customer .................... [RUN] 260 | 16:55:44 20 of 45 OK created sql dynamic_table model dl_silver.employees ................ [SUCCESS 1 in 4.11s] 261 | 16:55:44 29 of 45 START sql dynamic_table model dl_gold.dim_broker ...................... [RUN] 262 | 16:55:44 4 of 45 OK created sql dynamic_table model dl_bronze.brokerage_trade ........... [SUCCESS 1 in 8.22s] 263 | 16:55:44 2 of 45 OK created sql dynamic_table model dl_bronze.brokerage_daily_market .... [SUCCESS 1 in 8.23s] 264 | 16:55:44 30 of 45 START sql dynamic_table model dl_silver.daily_market .................. [RUN] 265 | 16:55:44 23 of 45 OK created sql dynamic_table model dl_gold.dim_date ................... [SUCCESS 1 in 2.94s] 266 | 16:55:44 3 of 45 OK created sql dynamic_table model dl_bronze.brokerage_holding_history . [SUCCESS 1 in 8.49s] 267 | 16:55:46 25 of 45 OK created sql dynamic_table model dl_gold.dim_company ................ [SUCCESS 1 in 3.33s] 268 | 16:55:47 6 of 45 OK created sql dynamic_table model dl_bronze.brokerage_watch_history ... [SUCCESS 1 in 10.86s] 269 | 16:55:47 29 of 45 OK created sql dynamic_table model dl_gold.dim_broker ................. [SUCCESS 1 in 2.98s] 270 | 16:55:47 27 of 45 OK created sql dynamic_table model dl_silver.securities ............... [SUCCESS 1 in 4.76s] 271 | 16:55:47 31 of 45 START sql dynamic_table model dl_gold.dim_security .................... [RUN] 272 | 16:55:47 32 of 45 START sql dynamic_table model dl_silver.watches_history ............... [RUN] 273 | 16:55:48 5 of 45 OK created sql dynamic_table model dl_bronze.brokerage_trade_history ... [SUCCESS 1 in 11.82s] 274 | 16:55:48 33 of 45 START sql dynamic_table model dl_silver.trades_history ................ [RUN] 275 | 16:55:48 28 of 45 OK created sql dynamic_table model dl_gold.dim_customer ............... [SUCCESS 1 in 4.58s] 276 | 16:55:48 34 of 45 START sql dynamic_table model dl_gold.dim_account ..................... [RUN] 277 | 16:55:49 24 of 45 OK created sql dynamic_table model dl_silver.cash_transactions ........ [SUCCESS 1 in 5.91s] 278 | 16:55:49 30 of 45 OK created sql dynamic_table model dl_silver.daily_market ............. [SUCCESS 1 in 4.63s] 279 | 16:55:50 26 of 45 OK created sql dynamic_table model dl_silver.financials ............... [SUCCESS 1 in 7.20s] 280 | 16:55:51 31 of 45 OK created sql dynamic_table model dl_gold.dim_security ............... 
[SUCCESS 1 in 3.81s] 281 | 16:55:51 35 of 45 START sql dynamic_table model dl_gold.fact_market_history ............. [RUN] 282 | 16:55:52 34 of 45 OK created sql dynamic_table model dl_gold.dim_account ................ [SUCCESS 1 in 3.71s] 283 | 16:55:52 36 of 45 START sql dynamic_table model dl_gold.fact_cash_transactions .......... [RUN] 284 | 16:55:54 32 of 45 OK created sql dynamic_table model dl_silver.watches_history .......... [SUCCESS 1 in 6.08s] 285 | 16:55:54 37 of 45 START sql dynamic_table model dl_silver.watches ....................... [RUN] 286 | 16:55:58 36 of 45 OK created sql dynamic_table model dl_gold.fact_cash_transactions ..... [SUCCESS 1 in 5.65s] 287 | 16:55:58 38 of 45 START sql dynamic_table model dl_gold.fact_cash_balances .............. [RUN] 288 | 16:56:00 37 of 45 OK created sql dynamic_table model dl_silver.watches .................. [SUCCESS 1 in 6.13s] 289 | 16:56:00 39 of 45 START sql dynamic_table model dl_gold.fact_watches .................... [RUN] 290 | 16:56:00 33 of 45 OK created sql dynamic_table model dl_silver.trades_history ........... [SUCCESS 1 in 12.60s] 291 | 16:56:00 40 of 45 START sql dynamic_table model dl_gold.dim_trade ....................... [RUN] 292 | 16:56:00 41 of 45 START sql dynamic_table model dl_silver.trades ........................ [RUN] 293 | 16:56:03 38 of 45 OK created sql dynamic_table model dl_gold.fact_cash_balances ......... [SUCCESS 1 in 5.45s] 294 | 16:56:05 40 of 45 OK created sql dynamic_table model dl_gold.dim_trade .................. [SUCCESS 1 in 4.51s] 295 | 16:56:06 39 of 45 OK created sql dynamic_table model dl_gold.fact_watches ............... [SUCCESS 1 in 6.40s] 296 | 16:56:08 41 of 45 OK created sql dynamic_table model dl_silver.trades ................... [SUCCESS 1 in 7.40s] 297 | 16:56:08 42 of 45 START sql dynamic_table model dl_silver.holdings_history .............. [RUN] 298 | 16:56:08 43 of 45 START sql dynamic_table model dl_gold.fact_trade ...................... [RUN] 299 | 16:56:15 42 of 45 OK created sql dynamic_table model dl_silver.holdings_history ......... [SUCCESS 1 in 7.03s] 300 | 16:56:15 44 of 45 START sql dynamic_table model dl_gold.fact_holdings ................... [RUN] 301 | 16:56:22 43 of 45 OK created sql dynamic_table model dl_gold.fact_trade ................. [SUCCESS 1 in 14.53s] 302 | 16:56:22 45 of 45 START test fact_trade__unique_trade ................................... [RUN] 303 | 16:56:23 45 of 45 PASS fact_trade__unique_trade ......................................... [PASS in 1.30s] 304 | 16:56:25 44 of 45 OK created sql dynamic_table model dl_gold.fact_holdings .............. [SUCCESS 1 in 9.95s] 305 | 16:56:44 35 of 45 OK created sql dynamic_table model dl_gold.fact_market_history ........ [SUCCESS 1 in 53.21s] 306 | 16:56:44 307 | 16:56:44 Finished running 44 dynamic_table models, 1 test in 0 hours 1 minutes and 10.51 seconds (70.51s). 308 | 16:56:45 309 | 16:56:45 Completed successfully 310 | 16:56:45 311 | 16:56:45 Done. PASS=45 WARN=0 ERROR=0 SKIP=0 TOTAL=45 312 | ``` 313 | 314 | Once the DAG has been created in Snowflake, we can browse it in the UI: 315 | 316 | ![Snowflake Graph](images/snowflake-dag.png) 317 | 318 | It's important to note that Snowflake is only aware of all the dependent relations _after_ the tables have been created. 319 | dbt is aware of them before. 320 | 321 | # Future Enhancements 322 | Although it wasn't my goal, it would be cool to enhance this project so that it could be used to run and measure the benchmark. 
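In the meantime, a crude stopwatch around the two phases gives a rough sense of end-to-end timing. This is a sketch only -- nothing like the audited TPC-DI metric -- and it assumes the loader and dbt project are already configured:

```python
import subprocess
import time
from pathlib import Path

def timed(cmd):
    # run a command and return its wall-clock duration in seconds
    start = time.monotonic()
    subprocess.run(cmd, check=True)
    return time.monotonic() - start

output_dir = str(Path.home() / "dev" / "tpcdi-output")
load_secs = timed(["python", "tpcdi.py", "process-files",
                   "--output-directory", output_dir])
build_secs = timed(["dbt", "build"])
print(f"load: {load_secs:.0f}s, transform: {build_secs:.0f}s")
```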
323 | These are my thoughts on where to take this next:
324 | 
325 | 1. Complete `Batch2` and `Batch3` using dbt incremental models, and put the audit queries in as dbt tests.
326 | 2. Refactor tpcdi.py to only upload the files and do that concurrently, then put all the Snowpark transformations into procedures so they can be executed as concurrent tasks.
327 | 3. Maybe take another pass at credential handling, using the `config.toml` from [Snowflake CLI](https://github.com/snowflake-labs/snowcli#cli-tour-and-quickstart).
328 | 4. Provide a command-line option `--schema` so it can be specified during loading, instead of using `CURRENT_SCHEMA`.
329 | 
330 | If you are interested in contributing, jump on board. You don't need my permission, or even incredible skill, clearly.
331 | Just open a pull request.
332 | 
--------------------------------------------------------------------------------
/tpcdi.py:
--------------------------------------------------------------------------------
1 | import sys, typer, json, re, logging
2 | from snowflake.snowpark import Session, DataFrame
3 | from snowflake.snowpark.types import *
4 | from snowflake.snowpark.functions import *
5 | from typing_extensions import Annotated
6 | from pathlib import Path
7 | 
8 | app = typer.Typer(help="A utility for loading TPC-DI generated files into Snowflake.")
9 | 
10 | def get_session():
11 |     # Read the credentials.json file
12 |     with open("credentials.json") as jsonfile:
13 |         credentials_dict = json.load(jsonfile)
14 | 
15 |     # build the session
16 |     session = Session.builder.configs(credentials_dict).create()
17 |     #print(f"Session: {session}")
18 | 
19 |     # and return it
20 |     return session
21 | 
22 | @app.command(help="CREATE or REPLACE the stage. Mostly useful while developing this utility.")
23 | def recreate_stage(
24 |     stage: Annotated[str, typer.Option(help="Name of the stage to recreate.")] = 'tpcdi',
25 | ):
26 |     session = get_session()
27 |     session.sql(f"create or replace stage {stage} directory = (enable = true)").collect()
28 |     print(f"Stage {stage} recreated.")
29 | 
30 | @app.command(help="DROP the stage. Useful when all the data has been successfully loaded.")
31 | def drop_stage(
32 |     stage: Annotated[str, typer.Option(help="Name of the stage to drop.")] = 'tpcdi',
33 | ):
34 |     session = get_session()
35 |     session.sql(f"drop stage {stage}").collect()
36 |     print(f"Stage {stage} dropped.")
37 | 
38 | @app.command(help="DROP a schema. Useful for cleaning up after a demo.")
39 | def drop_schema(
40 |     schema: Annotated[str, typer.Option(help="Name of the schema to drop.")],
41 | ):
42 |     session = get_session()
43 |     session.sql(f"drop schema if exists {schema}").collect()
44 |     print(f"Schema {schema} dropped.")
45 | 
46 | @app.command(help="Upload a file or files into the stage and build the dependent tables.")
47 | def process_files(
48 |     output_directory: Annotated[str, typer.Option(help='The output directory from the TPC-DI DIGen.jar execution.')],
49 |     file_name: Annotated[str, typer.Option(help="The TPC-DI file name to upload and process. Pass value 'FINWIRE' to process all of the financial wire files.")] = 'all',
50 |     stage: Annotated[str, typer.Option(help="The stage name to upload to, without specifying '@'.")] = 'tpcdi',
51 |     batch: Annotated[int, typer.Option(help="The TPC-DI batch number to process. 
Currently only supports the default of '1'.")] = 1, 52 | overwrite: Annotated[bool, typer.Option(help="Overwrite the file even if it exists?")] = False, 53 | skip_upload: Annotated[bool, typer.Option(help="Skip uploading the files?")] = False, 54 | show: Annotated[bool, typer.Option(help="Show the DataFrame instead of saving it as a table? This was useful during development.")] = False, 55 | ): 56 | session = get_session() 57 | 58 | # create the stage if it doesn't exist 59 | ( 60 | session 61 | .sql(f"create stage if not exists {stage} directory = (enable = true)") 62 | .collect() 63 | ) 64 | 65 | def get_stage_path( 66 | stage:str, 67 | file_name:str, 68 | ): 69 | return f"@{stage}/Batch{batch}/{file_name}" 70 | 71 | # method to control printing the dataframe or saving it 72 | def save_df( 73 | df: DataFrame, 74 | table_name: str 75 | ): 76 | if show: 77 | df.show() 78 | else: 79 | ( 80 | df.write 81 | .mode("overwrite") 82 | .save_as_table(table_name) 83 | ) 84 | 85 | print(f"{table_name.upper()} table created.") 86 | 87 | # method for uploading files 88 | def upload_files( 89 | file_name: str, 90 | stage_path: str, 91 | ): 92 | 93 | delimiter="|" 94 | 95 | if file_name == 'FINWIRE': 96 | pathlist = Path(output_directory).glob(f"Batch{batch}/FINWIRE??????") 97 | else: 98 | pathlist = Path(output_directory).glob(f"Batch{batch}/{file_name}") 99 | 100 | for file in pathlist: 101 | # capture the delimiter 102 | if file_name != 'FINWIRE': 103 | logging.info(f"Suffix: {file.suffix}") 104 | if file.suffix == '.csv': 105 | delimiter=',' 106 | logging.info(f"Delimiter: {delimiter}") 107 | 108 | # put the file(s) in the stage 109 | if not skip_upload: 110 | put_result = ( 111 | session 112 | .file 113 | .put( 114 | str(file), 115 | stage_path, 116 | overwrite=overwrite, 117 | parallel=4, 118 | auto_compress=True 119 | ) 120 | ) 121 | for result in put_result: 122 | print(f"File {result.source}: {result.status}") 123 | 124 | # return the delimiter 125 | return delimiter 126 | 127 | # method for creating a table from a simple CSV 128 | # works for either comma or pipe delimited, detecting that automatically by suffix 129 | def load_csv( 130 | schema: StructType, 131 | file_name: str, 132 | table_name: str, 133 | ): 134 | stage_path = get_stage_path(stage, file_name) 135 | delimiter=upload_files(file_name, stage_path) 136 | 137 | df = ( 138 | session 139 | .read 140 | .schema(schema) 141 | .option("field_delimiter", delimiter) 142 | .csv(stage_path) 143 | ) 144 | 145 | save_df(df, table_name) 146 | 147 | # Simplifies the DataFrame transformations for retrieving XML elements 148 | def get_xml_element( 149 | column:str, 150 | element:str, 151 | datatype:str, 152 | with_alias:bool = True 153 | ): 154 | new_element = get(xmlget(col(column), lit(element)), lit('$')).cast(datatype) 155 | # alias needs to be optional 156 | return ( 157 | new_element.alias(element) if with_alias else new_element 158 | ) 159 | 160 | # Simplifies the DataFrame transformations for retrieving XML attributes 161 | def get_xml_attribute( 162 | column:str, 163 | attribute: str, 164 | datatype: str, 165 | with_alias:bool = True 166 | ): 167 | new_attribute = get(col(column), lit(f"@{attribute}")).cast(datatype) 168 | # alias needs to be optional 169 | return ( 170 | new_attribute.alias(attribute) if with_alias else new_attribute 171 | ) 172 | 173 | # Simplifies the logic for constructing a phone number from multiple nested fields 174 | def get_phone_number( 175 | phone_id:str, 176 | separator:str = '-' 177 | ): 178 | return concat( 
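        # assembles country code, area code, and local number: each separator is
        # emitted only when the field before it is non-empty, and " ext: " only
        # precedes a non-empty extension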
179 | get_xml_element(f"phone{phone_id}", 'C_CTRY_CODE', 'STRING', False), 180 | when(get_xml_element(f"phone{phone_id}", 'C_CTRY_CODE', 'STRING', False) == '', '').otherwise(separator), 181 | get_xml_element(f"phone{phone_id}", 'C_AREA_CODE', 'STRING', False), 182 | when(get_xml_element(f"phone{phone_id}", 'C_AREA_CODE', 'STRING', False) == '', '').otherwise(separator), 183 | get_xml_element(f"phone{phone_id}", 'C_LOCAL', 'STRING', False), 184 | when(get_xml_element(f"phone{phone_id}", 'C_EXT', 'STRING', False) == '', '').otherwise(" ext: "), 185 | get_xml_element(f"phone{phone_id}", 'C_EXT', 'STRING', False) 186 | ).alias(f"c_phone_{phone_id}") 187 | 188 | 189 | ### Start defining and loading the actual tables 190 | ### Variable 'con_file_name' declaration acts as the comment. 191 | 192 | con_file_name = 'Date.txt' 193 | if file_name in ['all', con_file_name]: 194 | schema = StructType([ 195 | StructField("SK_DATE_ID", IntegerType(), False), 196 | StructField("DATE_VALUE", DateType(), False), 197 | StructField("DATE_DESC", StringType(), False), 198 | StructField("CALENDAR_YEAR_ID", IntegerType(), False), 199 | StructField("CALENDAR_YEAR_DESC", StringType(), False), 200 | StructField("CALENDAR_QTR_ID", IntegerType(), False), 201 | StructField("CALENDAR_QTR_DESC", StringType(), False), 202 | StructField("CALENDAR_MONTH_ID", IntegerType(), False), 203 | StructField("CALENDAR_MONTH_DESC", StringType(), False), 204 | StructField("CALENDAR_WEEK_ID", IntegerType(), False), 205 | StructField("CALENDAR_WEEK_DESC", StringType(), False), 206 | StructField("DAY_OF_WEEK_NUM", IntegerType(), False), 207 | StructField("DAY_OF_WEEK_DESC", StringType(), False), 208 | StructField("FISCAL_YEAR_ID", IntegerType(), False), 209 | StructField("FISCAL_YEAR_DESC", StringType(), False), 210 | StructField("FISCAL_QTR_ID", IntegerType(), False), 211 | StructField("FISCAL_QTR_DESC", StringType(), False), 212 | StructField("HOLIDAY_FLAG", BooleanType(), False), 213 | ]) 214 | load_csv(schema, con_file_name, 'date') 215 | 216 | con_file_name = 'DailyMarket.txt' 217 | if file_name in ['all', con_file_name]: 218 | schema = StructType([ 219 | StructField("DM_DATE", DateType(), False), 220 | StructField("DM_S_SYMB", StringType(), False), 221 | StructField("DM_CLOSE", FloatType(), False), 222 | StructField("DM_HIGH", FloatType(), False), 223 | StructField("DM_LOW", FloatType(), False), 224 | StructField("DM_VOL", FloatType(), False), 225 | ]) 226 | load_csv(schema, con_file_name, 'daily_market') 227 | 228 | con_file_name = 'Industry.txt' 229 | if file_name in ['all', con_file_name]: 230 | schema = StructType([ 231 | StructField("IN_ID", StringType(), False), 232 | StructField("IN_NAME", StringType(), False), 233 | StructField("IN_SC_ID", StringType(), False), 234 | ]) 235 | load_csv(schema, con_file_name, 'industry') 236 | 237 | con_file_name = 'Prospect.csv' 238 | if file_name in ['all', con_file_name]: 239 | schema = StructType([ 240 | StructField("AGENCY_ID", StringType(), False), 241 | StructField("LAST_NAME", StringType(), True), 242 | StructField("FIRST_NAME", StringType(), True), 243 | StructField("MIDDLE_INITIAL", StringType(), True), 244 | StructField("GENDER", StringType(), True), 245 | StructField("ADDRESS_LINE1", StringType(), True), 246 | StructField("ADDRESS_LINE2", StringType(), True), 247 | StructField("POSTAL_CODE", StringType(), True), 248 | StructField("CITY", StringType(), True), 249 | StructField("STATE", StringType(), True), 250 | StructField("COUNTRY", StringType(), True), 251 | StructField("PHONE", 
StringType(), True), 252 | StructField("INCOME", IntegerType(), True), 253 | StructField("NUMBER_CARS", IntegerType(), True), 254 | StructField("NUMBER_CHILDREN", IntegerType(), True), 255 | StructField("MARITAL_STATUS", StringType(), True), 256 | StructField("AGE", IntegerType(), True), 257 | StructField("CREDIT_RATING", IntegerType(), True), 258 | StructField("OWN_OR_RENT_FLAG", StringType(), True), 259 | StructField("EMPLOYER", StringType(), True), 260 | StructField("NUMBER_CREDIT_CARDS", IntegerType(), True), 261 | StructField("NET_WORTH", IntegerType(), True), 262 | ]) 263 | load_csv(schema, con_file_name, 'prospect') 264 | 265 | con_file_name = 'CustomerMgmt.xml' 266 | if file_name in ['all', con_file_name]: 267 | 268 | upload_files(con_file_name, get_stage_path(stage, con_file_name)) 269 | 270 | # this might get hairy 271 | df = ( 272 | session 273 | .read 274 | .option('STRIP_OUTER_ELEMENT', True) # Strips the TPCDI:Actions element 275 | .xml(get_stage_path(stage, con_file_name)) 276 | .select( 277 | # flatten out all of the nested elements 278 | xmlget(col('$1'), lit('Customer'), 0).alias('customer'), 279 | xmlget(col('customer'), lit('Name'), 0).alias('name'), 280 | xmlget(col('customer'), lit('Address'), 0).alias('address'), 281 | xmlget(col('customer'), lit('ContactInfo'), 0).alias('contact_info'), 282 | xmlget(col('contact_info'), lit('C_PHONE_1')).alias('phone1'), 283 | xmlget(col('contact_info'), lit('C_PHONE_2')).alias('phone2'), 284 | xmlget(col('contact_info'), lit('C_PHONE_3')).alias('phone3'), 285 | xmlget(col('customer'), lit('TaxInfo'), 0).alias('tax_info'), 286 | xmlget(col('customer'), lit('Account'), 0).alias('account'), 287 | # get the Action attributes 288 | get_xml_attribute('$1','ActionType','STRING'), 289 | get_xml_attribute('$1','ActionTS','STRING'), 290 | ) 291 | .select( 292 | # Handling Action attributes 293 | to_timestamp(col('ActionTs'), lit('yyyy-mm-ddThh:mi:ss')).alias('action_ts'), 294 | col('ActionType').alias('ACTION_TYPE'), 295 | # Get Customer Attributes 296 | get_xml_attribute('customer','C_ID','NUMBER'), 297 | get_xml_attribute('customer','C_TAX_ID','STRING'), 298 | get_xml_attribute('customer','C_GNDR','STRING'), 299 | try_cast(get_xml_attribute('customer','C_TIER','STRING', False),'NUMBER').alias('c_tier'), 300 | get_xml_attribute('customer','C_DOB','DATE'), 301 | # Get Name elements 302 | get_xml_element('name','C_L_NAME','STRING'), 303 | get_xml_element('name','C_F_NAME','STRING'), 304 | get_xml_element('name','C_M_NAME','STRING'), 305 | # Get Address elements 306 | get_xml_element('address','C_ADLINE1','STRING'), 307 | get_xml_element('address', 'C_ADLINE2', 'STRING'), 308 | get_xml_element('address','C_ZIPCODE','STRING'), 309 | get_xml_element('address','C_CITY','STRING'), 310 | get_xml_element('address','C_STATE_PROV','STRING'), 311 | get_xml_element('address','C_CTRY','STRING'), 312 | # Get Contact Info elements 313 | get_xml_element('contact_info','C_PRIM_EMAIL','STRING'), 314 | get_xml_element('contact_info','C_ALT_EMAIL','STRING'), 315 | # Contruct phone numbers from multi-nested elements 316 | get_phone_number('1'), 317 | get_phone_number('2'), 318 | get_phone_number('3'), 319 | # Get TaxInfo elements 320 | get_xml_element('tax_info','C_LCL_TX_ID','STRING'), 321 | get_xml_element('tax_info','C_NAT_TX_ID','STRING'), 322 | # Get Account Attributes 323 | get_xml_attribute('account','CA_ID','STRING'), 324 | get_xml_attribute('account','CA_TAX_ST','NUMBER'), 325 | # Get Account elements 326 | get_xml_element('account','CA_B_ID','NUMBER'), 
327 | get_xml_element('account','CA_NAME','STRING'), 328 | ) 329 | ) 330 | 331 | save_df(df, 'customer_mgmt') 332 | 333 | con_file_name = 'TaxRate.txt' 334 | if file_name in ['all', con_file_name]: 335 | schema = StructType([ 336 | StructField("TX_ID", StringType(), False), 337 | StructField("TX_NAME", StringType(), True), 338 | StructField("TX_RATE", FloatType(), True), 339 | ]) 340 | load_csv(schema, con_file_name, 'tax_rate') 341 | 342 | con_file_name = 'HR.csv' 343 | if file_name in ['all', con_file_name]: 344 | schema = StructType([ 345 | StructField("EMPLOYEE_ID", IntegerType(), False), 346 | StructField("MANAGER_ID", IntegerType(), False), 347 | StructField("EMPLOYEE_FIRST_NAME", StringType(), True), 348 | StructField("EMPLOYEE_LAST_NAME", StringType(), True), 349 | StructField("EMPLOYEE_MI", StringType(), True), 350 | StructField("EMPLOYEE_JOB_CODE", IntegerType(), True), 351 | StructField("EMPLOYEE_BRANCH", StringType(), True), 352 | StructField("EMPLOYEE_OFFICE", StringType(), True), 353 | StructField("EMPLOYEE_PHONE", StringType(), True) 354 | ]) 355 | load_csv(schema, con_file_name, 'hr') 356 | 357 | con_file_name = 'WatchHistory.txt' 358 | if file_name in ['all', con_file_name]: 359 | schema = StructType([ 360 | StructField("W_C_ID", IntegerType(), False), 361 | StructField("W_S_SYMB", StringType(), True), 362 | StructField("W_DTS", TimestampType(), True), 363 | StructField("W_ACTION", StringType(), True) 364 | ]) 365 | load_csv(schema, con_file_name, 'watch_history') 366 | 367 | con_file_name = 'Trade.txt' 368 | if file_name in ['all', con_file_name]: 369 | schema = StructType([ 370 | StructField("T_ID", IntegerType(), False), 371 | StructField("T_DTS", TimestampType(), False), 372 | StructField("T_ST_ID", StringType(), False), 373 | StructField("T_TT_ID", StringType(), False), 374 | StructField("T_IS_CASH", BooleanType(), False), 375 | StructField("T_S_SYMB", StringType(), False), 376 | StructField("T_QTY", FloatType(),False), 377 | StructField("T_BID_PRICE", FloatType(), False), 378 | StructField("T_CA_ID", IntegerType(), False), 379 | StructField("T_EXEC_NAME", StringType(), False), 380 | StructField("T_TRADE_PRICE", FloatType(), True), 381 | StructField("T_CHRG", FloatType(), True), 382 | StructField("T_COMM", FloatType(), True), 383 | StructField("T_TAX", FloatType(), True), 384 | ]) 385 | load_csv(schema, con_file_name, 'trade') 386 | 387 | con_file_name = 'TradeHistory.txt' 388 | if file_name in ['all', con_file_name]: 389 | schema = StructType([ 390 | StructField("TH_T_ID", IntegerType(), False), 391 | StructField("TH_DTS", TimestampType(), False), 392 | StructField("TH_ST_ID", StringType(), False), 393 | ]) 394 | load_csv(schema, con_file_name, 'trade_history') 395 | 396 | con_file_name = 'StatusType.txt' 397 | if file_name in ['all', con_file_name]: 398 | schema = StructType([ 399 | StructField("ST_ID", StringType(), False), 400 | StructField("ST_NAME", StringType(), False), 401 | ]) 402 | load_csv(schema, con_file_name, 'status_type') 403 | 404 | con_file_name = 'TradeType.txt' 405 | if file_name in ['all', con_file_name]: 406 | schema = StructType([ 407 | StructField("TT_ID", StringType(), False), 408 | StructField("TT_NAME", StringType(), False), 409 | StructField("TT_IS_SELL", BooleanType(), False), 410 | StructField("TT_IS_MARKET", BooleanType(), False), 411 | ]) 412 | load_csv(schema, con_file_name, 'trade_type') 413 | 414 | con_file_name = 'HoldingHistory.txt' 415 | if file_name in ['all', con_file_name]: 416 | schema = StructType([ 417 | 
StructField("HH_H_T_ID", IntegerType(), False), 418 | StructField("HH_T_ID", IntegerType(), False), 419 | StructField("HH_BEFORE_QTY", FloatType(), False), 420 | StructField("HH_AFTER_QTY", FloatType(), False), 421 | ]) 422 | load_csv(schema, con_file_name, 'holding_history') 423 | 424 | con_file_name = 'CashTransaction.txt' 425 | if file_name in ['all', con_file_name]: 426 | schema = StructType([ 427 | StructField("CT_CA_ID", IntegerType(), False), 428 | StructField("CT_DTS", TimestampType(), False), 429 | StructField("CT_AMT", FloatType(), False), 430 | StructField("CT_NAME", StringType(), False), 431 | ]) 432 | load_csv(schema, con_file_name, 'cash_transaction') 433 | 434 | con_file_name = 'FINWIRE' 435 | if file_name in ['all', con_file_name]: 436 | # These are fixed-width fields, so read the entire line in as "line" 437 | schema = StructType([ 438 | StructField("line", StringType(), False), 439 | ]) 440 | 441 | stage_path = get_stage_path(stage, con_file_name) 442 | upload_files(con_file_name, stage_path) 443 | 444 | # generic dataframe for all record types 445 | # create a temporary table 446 | df = ( 447 | session 448 | .read 449 | .schema(schema) 450 | .option('field_delimiter', '|') 451 | .csv(stage_path) 452 | .with_column('rec_type', substring(col("line"), lit(16), lit(3))) 453 | .with_column('pts', to_timestamp(substring(col("line"), lit(0), lit(15)), lit("yyyymmdd-hhmiss"))) 454 | .write.mode("overwrite").save_as_table("finwire", table_type="temporary") 455 | ) 456 | 457 | # CMP record types 458 | df = ( 459 | session 460 | .table('finwire') 461 | .where(col('rec_type') == 'CMP') 462 | .with_column('company_name', substr(col('line'), lit(19), lit(60))) 463 | .withColumn('cik', substring(col("line"), lit(79), lit(10))) 464 | .withColumn('status', substring(col("line"), lit(89), lit(4))) 465 | .withColumn('industry_id', substring(col("line"), lit(93), lit(2))) 466 | .withColumn('sp_rating', substring(col("line"), lit(95), lit(4))) 467 | .withColumn( 468 | 'founding_date', 469 | try_cast( 470 | trim( 471 | substring( 472 | col("line"), lit(99), lit(8) 473 | ) 474 | ), 475 | 'DATE' 476 | ) 477 | ) 478 | .withColumn('address_line1', substring(col("line"), lit(107), lit(80))) 479 | .withColumn('address_line2', substring(col("line"), lit(187), lit(80))) 480 | .withColumn('postal_code', substring(col("line"), lit(267), lit(12))) 481 | .withColumn('city', substring(col("line"), lit(279), lit(25))) 482 | .withColumn('state_province', substring(col("line"), lit(304), lit(20))) 483 | .withColumn('country', substring(col("line"), lit(324), lit(24))) 484 | .withColumn('ceo_name', substring(col("line"), lit(348), lit(46))) 485 | .withColumn('description', substring(col("line"), lit(394), lit(150))) 486 | .drop(col("line"), col("rec_type")) 487 | ) 488 | 489 | save_df(df, 'cmp') 490 | 491 | # SEC record types 492 | df = ( 493 | session 494 | .table('finwire') 495 | .where(col('rec_type') == 'SEC') 496 | .withColumn('symbol', substring(col("line"), lit(19), lit(15))) 497 | .withColumn('issue_type', substring(col("line"), lit(34), lit(6))) 498 | .withColumn('status', substring(col("line"), lit(40), lit(4))) 499 | .withColumn('name', substring(col("line"), lit(44), lit(70))) 500 | .withColumn('ex_id', substring(col("line"), lit(114), lit(6))) 501 | .withColumn('sh_out', substring(col("line"), lit(120), lit(13))) 502 | .withColumn('first_trade_date', substring(col("line"), lit(133), lit(8))) 503 | .withColumn('first_exchange_date', substring(col("line"), lit(141), lit(8))) 504 | 
            .withColumn('dividend', substring(col("line"), lit(149), lit(12)))
505 |             .withColumn('co_name_or_cik', substring(col("line"), lit(161), lit(60)))
506 |             .drop(col("line"), col("rec_type"))
507 |         )
508 | 
509 |         save_df(df, 'sec')
510 | 
511 |         # FIN record types
512 |         df = (
513 |             session
514 |             .table('finwire')
515 |             .where(col('rec_type') == 'FIN')
516 |             .withColumn('year', substring(col("line"), lit(19), lit(4)))
517 |             .withColumn('quarter', substring(col("line"), lit(23), lit(1)))
518 |             .withColumn('quarter_start_date', substring(col("line"), lit(24), lit(8)))
519 |             .withColumn('posting_date', substring(col("line"), lit(32), lit(8)))
520 |             .withColumn('revenue', substring(col("line"), lit(40), lit(17)))
521 |             .withColumn('earnings', substring(col("line"), lit(57), lit(17)))
522 |             .withColumn('eps', substring(col("line"), lit(74), lit(12)))
523 |             .withColumn('diluted_eps', substring(col("line"), lit(86), lit(12)))
524 |             .withColumn('margin', substring(col("line"), lit(98), lit(12)))
525 |             .withColumn('inventory', substring(col("line"), lit(110), lit(17)))
526 |             .withColumn('assets', substring(col("line"), lit(127), lit(17)))
527 |             .withColumn('liabilities', substring(col("line"), lit(144), lit(17)))
528 |             .withColumn('sh_out', substring(col("line"), lit(161), lit(13)))
529 |             .withColumn('diluted_sh_out', substring(col("line"), lit(174), lit(13)))
530 |             .withColumn('co_name_or_cik', substring(col("line"), lit(187), lit(60)))
531 |             .drop(col("line"), col("rec_type"))
532 |         )
533 | 
534 | 
535 |         save_df(df, 'fin')
536 | 
537 | if __name__ == "__main__":
538 |     app()
539 | 
--------------------------------------------------------------------------------
/demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "0c989f41",
6 |    "metadata": {
7 |     "slideshow": {
8 |      "slide_type": "slide"
9 |     }
10 |    },
11 |    "source": [
12 |     "![Title](images/title-page.png)"
13 |    ]
14 |   },
15 |   {
16 |    "cell_type": "markdown",
17 |    "id": "f90ad24f-5035-4c5d-81e4-a81d9dff1e9a",
18 |    "metadata": {
19 |     "slideshow": {
20 |      "slide_type": "slide"
21 |     }
22 |    },
23 |    "source": [
24 |     "![Title](images/title-qr.png)"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "markdown",
29 |    "id": "ed161285-5c6c-41de-a4ad-685a5de5ba0e",
30 |    "metadata": {},
31 |    "source": [
32 |     "### What is the TPC?\n",
33 |     "The TPC is a non-profit corporation focused on developing data-centric benchmark standards and disseminating objective, verifiable data to the industry.\n",
34 |     "\n",
35 |     "[tpc.org](https://www.tpc.org)\n",
36 |     "\n",
37 |     "### What is TPC-DI?\n",
38 |     "The TPC-DI benchmark combines and transforms data extracted from an On-Line Transaction Processing (OLTP) system along with other sources of data, and loads it into a data warehouse.\n",
39 |     "\n",
40 |     "[tpc.org/tpcdi](https://www.tpc.org/tpcdi/)"
41 |    ]
42 |   },
43 |   {
44 |    "cell_type": "markdown",
45 |    "id": "87a9e518-484a-4f18-80d4-4ae43c72312a",
46 |    "metadata": {},
47 |    "source": [
48 |     "![ETL Diagram](images/tpc-di-etl-diagram.png)"
49 |    ]
50 |   },
51 |   {
52 |    "cell_type": "markdown",
53 |    "metadata": {},
54 |    "source": [
55 |     "TPC-DI provides `DIGen.jar` to generate the source files.\n",
56 |     "\n",
57 |     "The JAR is dated and requires a 1.8 JDK."
58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "!jenv local 1.8\n", 67 | "!java -jar ~/dev/Tools/DIGen.jar --help" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!rm -rf ~/dev/tpcdi-output\n", 77 | "!mkdir -p ~/dev/tpcdi-output\n", 78 | "!cd ~/dev/Tools && java -jar ~/dev/Tools/DIGen.jar -o ~/dev/tpcdi-output -sf 10" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### The GitHub repository has a prebuilt CLI for easily loading the files.\n", 86 | "[github.com/stewartbryson/dbt-tpcdi](https://www.github.com/stewartbryson/dbt-tpcdi)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "!python tpcdi.py --help" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "!python tpcdi.py process-files --help" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "!python tpcdi.py process-files --output-directory ~/dev/tpcdi-output --file-name DailyMarket.txt --show" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### If you get nothing else from this video, know that there's an easy way to load this dataset into Snowflake.\n", 121 | "\n", 122 | "But I also wanted to show some interesting approaches using Snowpark.\n", 123 | "\n", 124 | "All of the code samples below are snippets from the CLI with abstractions removed.\n", 125 | "\n", 126 | "We start with a `credentials.json` file to store our Snowflake credentials. Something like this:\n", 127 | "\n", 128 | "```json\n", 129 | "{\n", 130 | " \"account\": \"myaccount\",\n", 131 | " \"user\": \"myuser\",\n", 132 | " \"password\": \"mypassword\",\n", 133 | " \"role\": \"myrole\",\n", 134 | " \"warehouse\": \"stewart_dev\",\n", 135 | " \"database\": \"tpcdi\",\n", 136 | " \"schema\": \"digen\"\n", 137 | "}\n", 138 | "```" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "We create a Snowflake session." 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "import json\n", 155 | "from snowflake.snowpark.types import *\n", 156 | "from snowflake.snowpark.functions import *\n", 157 | "from snowflake.snowpark import Session\n", 158 | "\n", 159 | "# Read the credentials.json file\n", 160 | "with open(\"credentials.json\") as jsonfile:\n", 161 | " credentials_dict = json.load(jsonfile)\n", 162 | "\n", 163 | "# build the session\n", 164 | "session = (\n", 165 | " Session\n", 166 | " .builder\n", 167 | " .configs(credentials_dict)\n", 168 | " .create()\n", 169 | ")\n", 170 | "\n", 171 | "# Constants\n", 172 | "source_path = '/Users/stewartbryson/dev/tpcdi-output/Batch1'" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Most of the files generated by `DIGen.jar` are pipe-separated files, very similar to CSV files.\n", 180 | "\n", 181 | "These are very simple to handle. 
First let's upload the file to a stage:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# Constants\n", 191 | "stage_path = \"@tpcdi/Batch1\"\n", 192 | "\n", 193 | "# Put the file\n", 194 | "put_result = (\n", 195 | " session\n", 196 | " .file\n", 197 | " .put(\n", 198 | " f\"{source_path}/DailyMarket.txt\",\n", 199 | " f\"{stage_path}/DailyMarket.txt\",\n", 200 | " parallel=4,\n", 201 | " auto_compress=True,\n", 202 | " )\n", 203 | ")\n", 204 | "\n", 205 | "# Report back file results\n", 206 | "for result in put_result:\n", 207 | " print(f\"File {result.source}: {result.status}\")" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "And now we'll create a table from that file:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "table_name = 'daily_market'\n", 224 | "# Define the schema\n", 225 | "schema = StructType([\n", 226 | " StructField(\"DM_DATE\", DateType(), False),\n", 227 | " StructField(\"DM_S_SYMB\", StringType(), False),\n", 228 | " StructField(\"DM_CLOSE\", FloatType(), False),\n", 229 | " StructField(\"DM_HIGH\", FloatType(), False),\n", 230 | " StructField(\"DM_LOW\", FloatType(), False),\n", 231 | " StructField(\"DM_VOL\", FloatType(), False),\n", 232 | " ])\n", 233 | "\n", 234 | "# create a table from a DataFrame\n", 235 | "df = (\n", 236 | " session\n", 237 | " .read\n", 238 | " .schema(schema)\n", 239 | " .option(\"field_delimiter\", '|')\n", 240 | " .csv(f\"{stage_path}/DailyMarket.txt\")\n", 241 | " .write\n", 242 | " .mode(\"overwrite\")\n", 243 | " .save_as_table(table_name)\n", 244 | ")\n", 245 | "\n", 246 | "print(f\"{table_name.upper()} table created.\")" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# show the table\n", 256 | "df = (\n", 257 | " session \n", 258 | " .table(table_name) \n", 259 | " .show()\n", 260 | ")" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "The `DIGen.jar` utility generates a series of \"finwire\" files.\n", 268 | "\n", 269 | "These files represent market history over time.\n", 270 | "\n", 271 | "They are fixed-width, multi-format files." 
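    "\n",
    "Every record shares an 18-character prefix: a `yyyymmdd-hhmiss` timestamp (`PTS`) followed by a 3-character record type. A minimal sketch of that slicing in plain Python, using a hypothetical record:\n",
    "\n",
    "```python\n",
    "line = '19670103-123456CMP...'              # hypothetical FINWIRE record\n",
    "pts, rec_type = line[0:15], line[15:18]     # '19670103-123456', 'CMP'\n",
    "```\n",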
272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Here's a sample of one of the files that has one of each type of record: `FIN`, `SEC`, and `CMP`:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "!cat devrel/multi-record.txt" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "We'll start by uploading all the files:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "from pathlib import Path\n", 304 | "\n", 305 | "# File paths\n", 306 | "stage_path = \"@tpcdi/Batch1/FINWIRE\"\n", 307 | "\n", 308 | "# glob the files\n", 309 | "pathlist = (\n", 310 | " Path(source_path)\n", 311 | " .glob(\"FINWIRE??????\")\n", 312 | ")\n", 313 | "\n", 314 | "for file in pathlist:\n", 315 | " # put the file(s) in the stage\n", 316 | " put_result = (\n", 317 | " session \n", 318 | " .file\n", 319 | " .put(\n", 320 | " str(file), \n", 321 | " stage_path, \n", 322 | " parallel=4, \n", 323 | " auto_compress=True\n", 324 | " )\n", 325 | " )\n", 326 | " for result in put_result:\n", 327 | " print(f\"File {result.source}: {result.status}\")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "The `CMP`, `SEC`, and `FIN` records all have two fields in common, so we want to create a generic DataFrame that contains the shared logic and we’ll save that DataFrame as a Snowflake temporary table called `FINWIRE`. We'll use `WITH_COLUMN` and `SUBSTRING`" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# These are fixed-width fields, so read the entire line in as \"line\"\n", 344 | "schema = StructType([\n", 345 | " StructField(\"line\", StringType(), False),\n", 346 | "])\n", 347 | "\n", 348 | "# generic dataframe for all record types\n", 349 | "# create a temporary table\n", 350 | "# The delimiter '|' seems safer\n", 351 | "df = (\n", 352 | " session\n", 353 | " .read\n", 354 | " .schema(schema)\n", 355 | " .option('field_delimiter', '|')\n", 356 | " .csv(stage_path)\n", 357 | " .with_column(\n", 358 | " 'pts', \n", 359 | " to_timestamp(\n", 360 | " substring(col(\"line\"), lit(0), lit(15)), \n", 361 | " lit(\"yyyymmdd-hhmiss\")\n", 362 | " )\n", 363 | " )\n", 364 | " .with_column(\n", 365 | " 'rec_type', \n", 366 | " substring(col(\"line\"), lit(16), lit(3))\n", 367 | " )\n", 368 | " .write\n", 369 | " .mode(\"overwrite\")\n", 370 | " .save_as_table(\"finwire\", table_type=\"temporary\")\n", 371 | ")\n", 372 | "\n", 373 | "# let's see the table\n", 374 | "df = (\n", 375 | " session \n", 376 | " .table('finwire') \n", 377 | " .show()\n", 378 | ")" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "Now we can create the three separate tables from this temporary table using the `WITH_COLUMN` and `SUBSTRING` functions.\n", 386 | "\n", 387 | "I'll only show the Security table as an example, but the other two are done the same way:" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "# SEC record types\n", 397 | "table_name = 'sec'\n", 398 | "df = (\n", 399 | " session\n", 400 | " .table('finwire')\n", 401 | " 
.where(col('rec_type') == 'SEC')\n",
402 |     "    .withColumn(\n",
403 |     "        'symbol', \n",
404 |     "        substring(col(\"line\"), lit(19), lit(15))\n",
405 |     "    )\n",
406 |     "    .withColumn(\n",
407 |     "        'issue_type', \n",
408 |     "        substring(col(\"line\"), lit(34), lit(6))\n",
409 |     "    )\n",
410 |     "    .withColumn(\n",
411 |     "        'status', \n",
412 |     "        substring(col(\"line\"), lit(40), lit(4))\n",
413 |     "    )\n",
414 |     "    .withColumn(\n",
415 |     "        'name', \n",
416 |     "        substring(col(\"line\"), lit(44), lit(70))\n",
417 |     "    )\n",
418 |     "    .withColumn(\n",
419 |     "        'ex_id', \n",
420 |     "        substring(col(\"line\"), lit(114), lit(6))\n",
421 |     "    )\n",
422 |     "    .withColumn(\n",
423 |     "        'sh_out', \n",
424 |     "        substring(col(\"line\"), lit(120), lit(13))\n",
425 |     "    )\n",
426 |     "    .withColumn(\n",
427 |     "        'first_trade_date', \n",
428 |     "        substring(col(\"line\"), lit(133), lit(8))\n",
429 |     "    )\n",
430 |     "    .withColumn(\n",
431 |     "        'first_exchange_date', \n",
432 |     "        substring(col(\"line\"), lit(141), lit(8))\n",
433 |     "    )\n",
434 |     "    .withColumn(\n",
435 |     "        'dividend', \n",
436 |     "        substring(col(\"line\"), lit(149), lit(12))\n",
437 |     "    )\n",
438 |     "    .withColumn(\n",
439 |     "        'co_name_or_cik', \n",
440 |     "        substring(col(\"line\"), lit(161), lit(60))\n",
441 |     "    )\n",
442 |     "    # these columns are no longer relevant\n",
443 |     "    .drop(col(\"line\"), col(\"rec_type\"))\n",
444 |     "    .write\n",
445 |     "    .mode(\"overwrite\")\n",
446 |     "    .save_as_table(table_name)\n",
447 |     ")\n",
448 |     "\n",
449 |     "print(f\"{table_name.upper()} table created.\")"
450 |    ]
451 |   },
452 |   {
453 |    "cell_type": "code",
454 |    "execution_count": null,
455 |    "metadata": {},
456 |    "outputs": [],
457 |    "source": [
458 |     "\n",
459 |     "# let's see the table\n",
460 |     "df = (\n",
461 |     "    session \n",
462 |     "    .table('sec') \n",
463 |     "    .show()\n",
464 |     ")"
465 |    ]
466 |   },
467 |   {
468 |    "cell_type": "markdown",
469 |    "metadata": {},
470 |    "source": [
471 |     "The `DIGen.jar` utility creates a single XML file called `CustomerMgmt.xml`; an abridged sample is below, with attribute values elided:\n",
472 |     "\n",
473 |     "```xml\n",
474 |     "<?xml version=\"1.0\"?>\n",
475 |     "<TPCDI:Actions xmlns:TPCDI=\"...\">\n",
476 |     " <TPCDI:Action ActionType=\"...\" ActionTS=\"...\">\n",
477 |     "  <Customer C_ID=\"...\" C_TAX_ID=\"...\" C_GNDR=\"...\" C_TIER=\"...\" C_DOB=\"...\">\n",
478 |     "   <Name>\n",
479 |     "    <C_L_NAME>Joannis</C_L_NAME>\n",
480 |     "    <C_F_NAME>Adara</C_F_NAME>\n",
481 |     "    <C_M_NAME/>\n",
482 |     "   </Name>\n",
483 |     "   <Address>\n",
484 |     "    <C_ADLINE1>4779 Weller Way</C_ADLINE1>\n",
485 |     "    <C_ADLINE2/>\n",
486 |     "    <C_ZIPCODE>92624</C_ZIPCODE>\n",
487 |     "    <C_CITY>Columbus</C_CITY>\n",
488 |     "    <C_STATE_PROV>Ontario</C_STATE_PROV>\n",
489 |     "    <C_CTRY>Canada</C_CTRY>\n",
490 |     "   </Address>\n",
491 |     "   <ContactInfo>\n",
492 |     "    <C_PRIM_EMAIL>Adara.Joannis@moose-mail.com</C_PRIM_EMAIL>\n",
493 |     "    <C_ALT_EMAIL>Adara.Joannis@gmx.com</C_ALT_EMAIL>\n",
494 |     "    <C_PHONE_1>\n",
495 |     "     <C_CTRY_CODE>1</C_CTRY_CODE>\n",
496 |     "     <C_AREA_CODE>872</C_AREA_CODE>\n",
497 |     "     <C_LOCAL>523-8928</C_LOCAL>\n",
498 |     "     <C_EXT/>\n",
499 |     "    </C_PHONE_1>\n",
500 |     "    <C_PHONE_2>\n",
501 |     "     <C_CTRY_CODE/>\n",
502 |     "     <C_AREA_CODE/>\n",
503 |     "     <C_LOCAL>492-3961</C_LOCAL>\n",
504 |     "     <C_EXT/>\n",
505 |     "    </C_PHONE_2>\n",
506 |     "    <C_PHONE_3>\n",
507 |     "     <C_CTRY_CODE/>\n",
508 |     "     <C_AREA_CODE/>\n",
509 |     "     <C_LOCAL/>\n",
510 |     "     <C_EXT/>\n",
511 |     "    </C_PHONE_3>\n",
512 |     "   </ContactInfo>\n",
513 |     "   <TaxInfo>\n",
514 |     "    <C_LCL_TX_ID>CA3</C_LCL_TX_ID>\n",
515 |     "    <C_NAT_TX_ID>YT3</C_NAT_TX_ID>\n",
516 |     "   </TaxInfo>\n",
517 |     "   <Account CA_ID=\"...\" CA_TAX_ST=\"...\">\n",
518 |     "    <CA_B_ID>17713</CA_B_ID>\n",
519 |     "    <CA_NAME>CJlmMuFyibKOmKLHIaTeWugvCgZdmcfpDsYb</CA_NAME>\n",
520 |     "   </Account>\n",
521 |     "  </Customer>\n",
522 |     " </TPCDI:Action>\n",
523 |     "</TPCDI:Actions>\n",
524 |     "```\n",
525 |     "\n",
526 |     "The hierarchical representation of a TPCDI:Action record, with @ signifying a node attribute as opposed to an element, is shown below:\n",
527 |     "\n",
528 |     "```\n",
529 |     "|-- TPCDI:Action\n",
530 |     "    |-- @ActionType: string\n",
531 |     "    |-- @ActionTS: timestamp\n",
532 |     "    |-- Customer\n",
533 |     "        |-- @C_ID: number\n",
534 |     "        |-- @C_TAX_ID: string\n",
535 |     "        |-- @C_GNDR: string\n",
536 |     "        |-- @C_TIER: number\n",
537 |     "        |-- @C_DOB: date\n",
538 |     "        |-- Name\n",
539 |     "            |-- C_F_NAME: string\n",
540 |     "            |-- C_L_NAME: string\n",
541 |     "            |-- C_M_NAME: string\n",
542 |     "        |-- Address\n",
543 |     "            |-- C_ADLINE1: string\n",
544 |     "            |-- C_ADLINE2: string\n",
545 |     "            |-- C_CITY: string\n",
546 |     "            |-- C_CTRY: string\n",
547 |     "            |-- C_STATE_PROV: string\n",
548 |     "            |-- C_ZIPCODE: string\n",
549 |     "        |-- ContactInfo\n",
550 |     "            |-- C_ALT_EMAIL: string\n",
551 |     "            |-- C_PHONE_1\n",
552 |     "                |-- C_AREA_CODE: number\n",
553 |     "                |-- C_CTRY_CODE: number\n",
554 |     "                |-- C_EXT: long\n",
555 |     "                |-- C_LOCAL: string\n",
556 |     "            |-- C_PHONE_2\n",
557 |     "                |-- C_AREA_CODE: number\n",
558 |     "                |-- C_CTRY_CODE: number\n",
559 |     "                |-- C_EXT: number\n",
560 |     "                |-- C_LOCAL: string\n",
561 |     "            |-- C_PHONE_3\n",
562 |     "                |-- C_AREA_CODE: number\n",
563 |     "                |-- C_CTRY_CODE: number\n",
564 |     "                |-- C_EXT: number\n",
565 |     "                |-- C_LOCAL: string\n",
566 |     "        |-- TaxInfo\n",
567 |     "            |-- C_LCL_TX_ID: string\n",
568 |     "            |-- C_NAT_TX_ID: string\n",
569 |     "        |-- Account\n",
570 |     "            |-- CA_B_ID: number\n",
571 |     "            |-- CA_NAME: string\n",
572 |     "            |-- @CA_ID: number\n",
573 |     "            |-- @CA_TAX_ST: number\n",
574 |     "```\n"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "markdown",
579 |    "metadata": {},
580 |    "source": [
581 |     "Let's create a DataFrame from the XML file and see what we get:"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "code",
586 |    "execution_count": null,
587 |    "metadata": {},
588 |    "outputs": [],
589 |    "source": [
590 |     "# File paths\n",
591 |     "stage_path = \"@tpcdi/Batch1\"\n",
592 |     "\n",
593 |     "# Put the file\n",
594 |     "put_result = (\n",
595 |     "    session\n",
596 |     "    .file\n",
597 |     "    .put(\n",
598 |     "        f\"{source_path}/CustomerMgmt.xml\",\n",
599 |     "        f\"{stage_path}/CustomerMgmt.xml\",\n",
600 |     "        parallel=4,\n",
601 |     "        auto_compress=True,\n",
602 |     "    )\n",
603 |     ")\n",
604 |     "for result in put_result:\n",
605 |     "    print(f\"File {result.source}: {result.status}\")\n",
606 |     "\n",
607 |     "# Read the XML file into a DataFrame and show it\n",
608 |     "df = (\n",
609 |     "    session\n",
610 |     "    .read\n",
611 |     "    .option('STRIP_OUTER_ELEMENT', True) # Strips TPCDI:Actions\n",
612 |     "    .xml(f\"{stage_path}/CustomerMgmt.xml\")\n",
613 |     "    .show(1, 100)\n",
614 |     ")"
615 |    ]
616 |   },
617 |   {
618 |    "cell_type": "markdown",
619 |    "metadata": {},
620 |    "source": [
621 |     "Snowflake does not support simple dot notation for XML the way it does for JSON.\n",
622 |     "\n",
623 |     "Instead we have to pair the `GET()` function with an `XMLGET()`, which can be quite tedious.\n",
624 |     "\n",
625 |     "So we can create helper functions to abstract the toil:"
626 |    ]
627 |   },
628 |   {
629 |    "cell_type": "code",
630 |    "execution_count": null,
631 |    "metadata": {},
632 |    "outputs": [],
633 |    "source": [
634 |     "# Simplifies retrieving XML elements\n",
635 |     "def get_xml_element(\n",
636 |     "    column:str,\n",
637 |     "    element:str,\n",
638 |     "    datatype:str,\n",
639 |     "    with_alias:bool = True\n",
640 |     "):\n",
641 |     "    new_element = (\n",
642 |     "        get(\n",
643 |     "            xmlget(\n",
644 |     "                col(column),\n",
645 |     "                lit(element),\n",
646 | 
" ),\n", 647 | " lit('$')\n", 648 | " )\n", 649 | " .cast(datatype)\n", 650 | " )\n", 651 | "\n", 652 | " # alias needs to be optional\n", 653 | " return (\n", 654 | " new_element.alias(element) if with_alias else new_element\n", 655 | " )\n", 656 | "\n", 657 | "# Simplifies retrieving XML attributes\n", 658 | "def get_xml_attribute(\n", 659 | " column:str,\n", 660 | " attribute:str,\n", 661 | " datatype:str,\n", 662 | " with_alias:bool = True\n", 663 | "):\n", 664 | " new_attribute = (\n", 665 | " get(\n", 666 | " col(column),\n", 667 | " lit(f\"@{attribute}\")\n", 668 | " )\n", 669 | " .cast(datatype)\n", 670 | " )\n", 671 | "\n", 672 | " # alias needs to be optional\n", 673 | " return (\n", 674 | " new_attribute.alias(attribute) if with_alias else new_attribute\n", 675 | " )\n", 676 | "\n", 677 | "# Constructs a phone number from multiple nested fields\n", 678 | "def get_phone_number(\n", 679 | " phone_id:str,\n", 680 | " separator:str = '-'\n", 681 | "):\n", 682 | " return (\n", 683 | " concat (\n", 684 | " get_xml_element(f\"phone{phone_id}\", 'C_CTRY_CODE', 'STRING', False),\n", 685 | " when(get_xml_element(f\"phone{phone_id}\", 'C_CTRY_CODE', 'STRING', False) == '', '')\n", 686 | " .otherwise(separator),\n", 687 | " get_xml_element(f\"phone{phone_id}\", 'C_AREA_CODE', 'STRING', False),\n", 688 | " when(get_xml_element(f\"phone{phone_id}\", 'C_AREA_CODE', 'STRING', False) == '', '')\n", 689 | " .otherwise(separator),\n", 690 | " get_xml_element(f\"phone{phone_id}\", 'C_LOCAL', 'STRING', False),\n", 691 | " when(get_xml_element(f\"phone{phone_id}\", 'C_EXT', 'STRING', False) == '', '')\n", 692 | " .otherwise(\" ext: \"),\n", 693 | " get_xml_element(f\"phone{phone_id}\", 'C_EXT', 'STRING', False)\n", 694 | " )\n", 695 | " .alias(f\"c_phone_{phone_id}\")\n", 696 | " )" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "Now let's put it all together and create our `customer_mgmt` table:" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "table_name = 'customer_mgmt'\n", 713 | "df = (\n", 714 | " session\n", 715 | " .read\n", 716 | " .option('STRIP_OUTER_ELEMENT', True) # Strips the TPCDI:Actions node\n", 717 | " .xml(f\"{stage_path}/CustomerMgmt.xml\")\n", 718 | " .select(\n", 719 | " # flatten out all of the nested elements\n", 720 | " xmlget(col('$1'), lit('Customer'), 0).alias('customer'),\n", 721 | " xmlget(col('customer'), lit('Name'), 0).alias('name'),\n", 722 | " xmlget(col('customer'), lit('Address'), 0).alias('address'),\n", 723 | " xmlget(col('customer'), lit('ContactInfo'), 0).alias('contact_info'),\n", 724 | " xmlget(col('contact_info'), lit('C_PHONE_1')).alias('phone1'),\n", 725 | " xmlget(col('contact_info'), lit('C_PHONE_2')).alias('phone2'),\n", 726 | " xmlget(col('contact_info'), lit('C_PHONE_3')).alias('phone3'),\n", 727 | " xmlget(col('customer'), lit('TaxInfo'), 0).alias('tax_info'),\n", 728 | " xmlget(col('customer'), lit('Account'), 0).alias('account'),\n", 729 | " # get the Action attributes\n", 730 | " get_xml_attribute('$1','ActionType','STRING'),\n", 731 | " get_xml_attribute('$1','ActionTS','STRING'),\n", 732 | " )\n", 733 | " .select(\n", 734 | " # Handling Action attributes\n", 735 | " to_timestamp(\n", 736 | " col('ActionTs'),\n", 737 | " lit('yyyy-mm-ddThh:mi:ss')\n", 738 | " ).alias('action_ts'),\n", 739 | " col('ActionType').alias('ACTION_TYPE'),\n", 740 | " # Get Customer Attributes\n", 741 | " 
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![dbt DAG](images/dbt-dag.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "469f485d-792c-46df-906f-cd09f364d7d0",
   "metadata": {},
   "source": [
    "### Enabling dynamic tables in our dbt project\n",
    "Make sure we're on version `1.7.0` of `dbt-snowflake`, then set the new `dynamic_table` materialization type in our `dbt_project.yml` file:\n",
    "\n",
    "```yaml\n",
    "models:\n",
    "  dbt_tpcdi:\n",
    "    example:\n",
    "      +materialized: view\n",
    "    bronze:\n",
    "      +schema: bronze\n",
    "      +materialized: dynamic_table\n",
    "      +snowflake_warehouse: tpcdi_large\n",
    "      +target_lag: downstream\n",
    "    silver:\n",
    "      +schema: silver\n",
    "      +materialized: dynamic_table\n",
    "      +snowflake_warehouse: tpcdi_large\n",
    "      +target_lag: '10 minutes'\n",
    "    gold:\n",
    "      +schema: gold\n",
    "      +materialized: dynamic_table\n",
    "      +snowflake_warehouse: tpcdi_large\n",
    "      +target_lag: '20 minutes'\n",
    "    work:\n",
    "      +schema: work\n",
    "      +materialized: ephemeral\n",
    "```"
   ]
  },
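  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With this materialization, dbt compiles each model into dynamic table DDL. Roughly (an illustrative sketch, not the exact statement dbt emits), a silver model becomes:\n",
    "\n",
    "```sql\n",
    "create or replace dynamic table dl_silver.trades\n",
    "  target_lag = '10 minutes'\n",
    "  warehouse = tpcdi_large\n",
    "  as (\n",
    "    -- the compiled SQL of models/silver/trades.sql\n",
    "    select ...\n",
    "  );\n",
    "```\n",
    "\n",
    "Snowflake then keeps each table refreshed within its target lag, while dbt remains responsible for creating the tables in dependency order."
   ]
  },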
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "420db6a7-ce06-4638-a3a5-1ac16aff2fc8",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!dbt build"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3eb79852",
   "metadata": {},
   "source": [
    "### dbt also has Tests\n",
    "\n",
    "We can run them when we build the Dynamic Table:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a62811a9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!dbt build --select fact_trade"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d564eded",
   "metadata": {},
   "source": [
    "Or we can schedule them to run periodically:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf40309e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!dbt test"
   ]
  },
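  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A singular test is just a SQL file in `tests/` that returns the failing rows. This project has `tests/fact_trade__unique_trade.sql`; a sketch of what such a uniqueness test might look like (illustrative, not necessarily the file's exact contents):\n",
    "\n",
    "```sql\n",
    "-- fails if any trade_id appears more than once\n",
    "select trade_id\n",
    "from {{ ref('fact_trade') }}\n",
    "group by trade_id\n",
    "having count(*) > 1\n",
    "```"
   ]
  },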
  {
   "cell_type": "markdown",
   "id": "ac3f442b",
   "metadata": {},
   "source": [
    "### dbt is more than a scheduler\n",
    "\n",
    "1. Cloud development environment for those who prefer it.\n",
    "1. CI/CD workflows for promoting Dynamic Table changes into Production.\n",
    "1. Perhaps there's promise in the Semantic Layer."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Snowflake has its own version of the DAG\n",
    "However, this DAG only exists after all the Dynamic Tables have been created; dbt understands the DAG _before_ anything is built.\n",
    "\n",
    "![Snowflake DAG](images/snowflake-dag.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22101a49",
   "metadata": {},
   "source": [
    "# Clean-up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c6a912a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!python tpcdi.py drop-schema --schema dl_gold\n",
    "!python tpcdi.py drop-schema --schema dl_silver\n",
    "!python tpcdi.py drop-schema --schema dl_bronze\n",
    "!python tpcdi.py drop-schema --schema dl_work"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
--------------------------------------------------------------------------------