├── analyses
│   └── .gitkeep
├── macros
│   └── .gitkeep
├── seeds
│   └── .gitkeep
├── snapshots
│   └── .gitkeep
├── tests
│   ├── .gitkeep
│   └── fact_trade__unique_trade.sql
├── models
│   ├── gold
│   │   ├── dim_date.sql
│   │   ├── dim_broker.sql
│   │   ├── fact_cash_balances.sql
│   │   ├── dim_trade.sql
│   │   ├── fact_cash_transactions.sql
│   │   ├── fact_watches.sql
│   │   ├── dim_account.sql
│   │   ├── fact_market_history.sql
│   │   ├── dim_security.sql
│   │   ├── fact_holdings.sql
│   │   ├── fact_trade.sql
│   │   ├── dim_company.sql
│   │   └── dim_customer.sql
│   ├── silver
│   │   ├── date.sql
│   │   ├── employees.sql
│   │   ├── cash_transactions.sql
│   │   ├── watches_history.sql
│   │   ├── holdings_history.sql
│   │   ├── trades.sql
│   │   ├── watches.sql
│   │   ├── companies.sql
│   │   ├── securities.sql
│   │   ├── daily_market.sql
│   │   ├── financials.sql
│   │   ├── trades_history.sql
│   │   ├── accounts.sql
│   │   └── customers.sql
│   ├── bronze
│   │   ├── hr
│   │   │   ├── hr_employee.sql
│   │   │   └── sources.yml
│   │   ├── crm
│   │   │   ├── crm_customer_mgmt.sql
│   │   │   └── sources.yml
│   │   ├── brokerage
│   │   │   ├── brokerage_trade.sql
│   │   │   ├── brokerage_holding_history.sql
│   │   │   ├── brokerage_trade_history.sql
│   │   │   ├── brokerage_cash_transaction.sql
│   │   │   ├── brokerage_watch_history.sql
│   │   │   ├── brokerage_daily_market.sql
│   │   │   └── sources.yml
│   │   ├── reference
│   │   │   ├── reference_industry.sql
│   │   │   ├── reference_tax_rate.sql
│   │   │   ├── reference_status_type.sql
│   │   │   ├── reference_trade_type.sql
│   │   │   ├── sources.yml
│   │   │   └── reference_date.sql
│   │   ├── finwire
│   │   │   ├── finwire_company.sql
│   │   │   ├── sources.yml
│   │   │   ├── finwire_security.sql
│   │   │   └── finwire_financial.sql
│   │   └── syndicated
│   │       ├── syndicated_prospect.sql
│   │       └── sources.yml
│   ├── example
│   │   ├── my_second_dbt_model.sql
│   │   ├── schema.yml
│   │   └── my_first_dbt_model.sql
│   └── work
│       └── wrk_company_financials.sql
├── packages.yml
├── images
│   ├── dbt-dag.png
│   ├── title-qr.png
│   ├── title-page.png
│   ├── tpcdi-help.png
│   ├── snowflake-dag.png
│   ├── refresh-conflict.png
│   ├── dbt-dynamic-tables.png
│   ├── tpc-di-etl-diagram.png
│   ├── tpc-di-logical-model.png
│   └── tpcdi-process-files-help.png
├── .gitignore
├── .github
│   ├── release-drafter.yml
│   └── workflows
│       └── release-drafter.yml
├── devrel
│   └── multi-record.txt
├── dbt_project.yml
├── environment.yml
├── README.md
├── tpcdi.py
└── demo.ipynb

/analyses/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/macros/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/seeds/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/snapshots/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/models/gold/dim_date.sql:
--------------------------------------------------------------------------------
select *
from {{ ref('date') }}
--------------------------------------------------------------------------------
/models/silver/date.sql:
--------------------------------------------------------------------------------
select *
from {{ ref('reference_date') }}
--------------------------------------------------------------------------------
/models/bronze/hr/hr_employee.sql:
--------------------------------------------------------------------------------
select *
from {{ source('hr', 'hr') }}
--------------------------------------------------------------------------------
/packages.yml:
--------------------------------------------------------------------------------
packages:
  - package: dbt-labs/dbt_utils
    version: 1.1.1
--------------------------------------------------------------------------------
/models/bronze/crm/crm_customer_mgmt.sql:
--------------------------------------------------------------------------------
select *
from {{ source('crm', 'customer_mgmt') }}
--------------------------------------------------------------------------------
/images/dbt-dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/dbt-dag.png
--------------------------------------------------------------------------------
/images/title-qr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/title-qr.png
--------------------------------------------------------------------------------
/images/title-page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/title-page.png
--------------------------------------------------------------------------------
/images/tpcdi-help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/tpcdi-help.png
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_trade.sql:
--------------------------------------------------------------------------------
select *
from {{ source('brokerage', 'trade') }}
--------------------------------------------------------------------------------
/models/bronze/reference/reference_industry.sql:
--------------------------------------------------------------------------------
select *
from {{ source('reference','industry') }}
--------------------------------------------------------------------------------
/models/bronze/reference/reference_tax_rate.sql:
--------------------------------------------------------------------------------
select *
from {{ source('reference','tax_rate') }}
--------------------------------------------------------------------------------
/images/snowflake-dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/snowflake-dag.png
--------------------------------------------------------------------------------
/models/bronze/finwire/finwire_company.sql:
--------------------------------------------------------------------------------

select
    *
from {{ source("finwire", "cmp") }}
--------------------------------------------------------------------------------
/models/bronze/reference/reference_status_type.sql:
--------------------------------------------------------------------------------
select *
from {{ source('reference','status_type') }}
--------------------------------------------------------------------------------
/models/bronze/reference/reference_trade_type.sql:
--------------------------------------------------------------------------------
select *
from {{ source('reference','trade_type') }}
--------------------------------------------------------------------------------
/models/bronze/syndicated/syndicated_prospect.sql:
--------------------------------------------------------------------------------
select *
from {{ source('syndicated', 'prospect') }}
--------------------------------------------------------------------------------
/images/refresh-conflict.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/refresh-conflict.png
--------------------------------------------------------------------------------
/images/dbt-dynamic-tables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/dbt-dynamic-tables.png
--------------------------------------------------------------------------------
/images/tpc-di-etl-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/tpc-di-etl-diagram.png
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_holding_history.sql:
--------------------------------------------------------------------------------
select *
from {{ source('brokerage', 'holding_history') }}
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_trade_history.sql:
--------------------------------------------------------------------------------
select *
from {{ source('brokerage', 'trade_history') }}
--------------------------------------------------------------------------------
/images/tpc-di-logical-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/tpc-di-logical-model.png
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_cash_transaction.sql:
--------------------------------------------------------------------------------
select *
from {{ source('brokerage', 'cash_transaction') }}
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_watch_history.sql:
--------------------------------------------------------------------------------
select
    *
from {{ source("brokerage", "watch_history") }}
--------------------------------------------------------------------------------
/models/bronze/brokerage/brokerage_daily_market.sql:
--------------------------------------------------------------------------------

select
    *
from {{ source("brokerage", "daily_market") }}
--------------------------------------------------------------------------------
/images/tpcdi-process-files-help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stewartbryson/dbt-tpcdi/HEAD/images/tpcdi-process-files-help.png
--------------------------------------------------------------------------------
/tests/fact_trade__unique_trade.sql:
--------------------------------------------------------------------------------
select
    sk_trade_id,
    count(*) cnt
from {{ ref('fact_trade') }}
group by all
having cnt > 1
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

target/
dbt_packages/
logs/
credentials.properties
credentials.json
.java-version
.vscode/
__pycache__/
.DS_Store
.ipynb_checkpoints/
--------------------------------------------------------------------------------
/models/bronze/hr/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: hr
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: hr
--------------------------------------------------------------------------------
/models/example/my_second_dbt_model.sql:
--------------------------------------------------------------------------------

-- Use the `ref` function to select from other models
{{ config(enabled=false) }}
select *
from {{ ref('my_first_dbt_model') }}
where id = 1
--------------------------------------------------------------------------------
/models/bronze/crm/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: crm
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: customer_mgmt
--------------------------------------------------------------------------------
/models/bronze/syndicated/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: syndicated
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: prospect
--------------------------------------------------------------------------------
/models/bronze/finwire/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: finwire
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: cmp
      - name: sec
      - name: fin
--------------------------------------------------------------------------------
/models/bronze/reference/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: reference
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: status_type
      - name: industry
      - name: date
      - name: tax_rate
      - name: trade_type
--------------------------------------------------------------------------------
/models/gold/dim_broker.sql:
--------------------------------------------------------------------------------
select
    {{dbt_utils.generate_surrogate_key(['employee_id'])}} sk_broker_id,
    employee_id broker_id,
    manager_id,
    first_name,
    last_name,
    middle_initial,
    job_code,
    branch,
    office,
    phone
from
    {{ ref('employees') }}
--------------------------------------------------------------------------------
/models/gold/fact_cash_balances.sql:
--------------------------------------------------------------------------------
with s1 as (
    select *
    from {{ ref('fact_cash_transactions') }}
)
select
    sk_customer_id,
    sk_account_id,
    sk_transaction_date,
    sum(amount) amount,
    description
from s1
group by all
order by sk_transaction_date, sk_customer_id, sk_account_id
--------------------------------------------------------------------------------
/models/silver/employees.sql:
--------------------------------------------------------------------------------
select
    employee_id,
    manager_id,
    employee_first_name first_name,
    employee_last_name last_name,
    employee_mi middle_initial,
    employee_job_code job_code,
    employee_branch branch,
    employee_office office,
    employee_phone phone
from {{ ref('hr_employee') }}
--------------------------------------------------------------------------------
/models/bronze/brokerage/sources.yml:
--------------------------------------------------------------------------------
version: 2

sources:
  - name: brokerage
    database: tpcdi
    schema: digen
    loader: snowpark

    tables:
      - name: daily_market
      - name: watch_history
      - name: trade
      - name: trade_history
      - name: holding_history
      - name: cash_transaction
--------------------------------------------------------------------------------
/models/gold/dim_trade.sql:
--------------------------------------------------------------------------------
select
    {{dbt_utils.generate_surrogate_key(['trade_id','t.effective_timestamp'])}} sk_trade_id,
    trade_id,
    trade_status status,
    transaction_type,
    trade_type type,
    executor_name executed_by,
    t.effective_timestamp,
    t.end_timestamp,
    t.IS_CURRENT
from
    {{ ref('trades_history') }} t
--------------------------------------------------------------------------------
/models/work/wrk_company_financials.sql:
--------------------------------------------------------------------------------
select
    sk_company_id,
    f.company_id,
    QUARTER_START_DATE,
    sum(eps) over (
        partition by f.company_id
        order by QUARTER_START_DATE
        rows between 4 preceding and current row
    ) - eps sum_basic_eps
from {{ ref("financials") }} f
join {{ ref("dim_company") }} c
    on f.company_id = c.company_id
    and f.effective_timestamp between c.effective_timestamp and c.end_timestamp
--------------------------------------------------------------------------------
/models/bronze/reference/reference_date.sql:
--------------------------------------------------------------------------------
select
    DATE_VALUE SK_DATE_ID,
    DATE_VALUE,
    DATE_DESC,
    CALENDAR_YEAR_ID,
    CALENDAR_YEAR_DESC,
    CALENDAR_QTR_ID,
    CALENDAR_QTR_DESC,
    CALENDAR_MONTH_ID,
    CALENDAR_MONTH_DESC,
    CALENDAR_WEEK_ID,
    CALENDAR_WEEK_DESC,
    DAY_OF_WEEK_NUM,
    DAY_OF_WEEK_DESC,
    FISCAL_YEAR_ID,
    FISCAL_YEAR_DESC,
    FISCAL_QTR_ID,
    FISCAL_QTR_DESC,
    HOLIDAY_FLAG
from {{ source('reference', 'date') }}
--------------------------------------------------------------------------------
/models/silver/cash_transactions.sql:
--------------------------------------------------------------------------------
with t as (
    select
        ct_ca_id account_id,
        ct_dts transaction_timestamp,
        ct_amt amount,
        ct_name description
    from
        {{ ref('brokerage_cash_transaction') }}
)
select
    a.customer_id,
    t.*
from
    t
join
    {{ ref('accounts') }} a
on
    t.account_id = a.account_id
and
    t.transaction_timestamp between a.effective_timestamp and a.end_timestamp

--------------------------------------------------------------------------------
/models/example/schema.yml:
--------------------------------------------------------------------------------

version: 2

models:
  - name: my_first_dbt_model
    description: "A starter dbt model"
    columns:
      - name: id
        description: "The primary key for this table"
        tests:
          - unique
          - not_null

  - name: my_second_dbt_model
    description: "A starter dbt model"
    columns:
      - name: id
        description: "The primary key for this table"
        tests:
          - unique
          - not_null
--------------------------------------------------------------------------------
/models/gold/fact_cash_transactions.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        *,
        to_date(transaction_timestamp) sk_transaction_date
    from
        {{ ref('cash_transactions') }}
)
select
    sk_customer_id,
    sk_account_id,
    sk_transaction_date,
    transaction_timestamp,
    amount,
    description
from
    s1
join
    {{ ref('dim_account') }} a
on
    s1.account_id = a.account_id
and
    s1.transaction_timestamp between a.effective_timestamp and a.end_timestamp
--------------------------------------------------------------------------------
/models/silver/watches_history.sql:
--------------------------------------------------------------------------------

with s1 as (
    select
        w_c_id customer_id,
        w_s_symb symbol,
        w_dts watch_timestamp,
        case w_action
            when 'ACTV' then 'Activate'
            when 'CNCL' then 'Cancelled'
            else null end action_type
    from
        {{ ref('brokerage_watch_history') }}
)
select
    s1.*,
    company_id,
    company_name,
    exchange_id,
    status security_status
from
    s1
join
    {{ ref('securities') }} s
using (symbol)
--------------------------------------------------------------------------------
/models/silver/holdings_history.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        HH_T_ID trade_id,
        HH_H_T_ID previous_trade_id,
        hh_before_qty previous_quantity,
        hh_after_qty quantity
    from {{ ref('brokerage_holding_history') }}
)
select s1.*,
    ct.account_id account_id,
    ct.symbol symbol,
    ct.create_timestamp,
    ct.close_timestamp,
    ct.trade_price,
    ct.bid_price,
    ct.fee,
    ct.commission
from s1
join {{ ref('trades') }} ct
using (trade_id)

--------------------------------------------------------------------------------
/models/gold/fact_watches.sql:
--------------------------------------------------------------------------------
select
    sk_customer_id,
    sk_security_id,
    to_date(placed_timestamp) sk_date_placed,
    to_date(removed_timestamp) sk_date_removed,
    1 as watch_cnt
from
    {{ ref('watches') }} w
join
    {{ ref('dim_customer') }} c
ON
    w.customer_id = c.customer_id
and
    placed_timestamp between c.effective_timestamp and c.end_timestamp
join
    {{ ref('dim_security') }} s
ON
    w.symbol = s.symbol
and
    placed_timestamp between s.effective_timestamp and s.end_timestamp
--------------------------------------------------------------------------------
/models/gold/dim_account.sql:
--------------------------------------------------------------------------------
SELECT
    {{dbt_utils.generate_surrogate_key(['account_id','a.effective_timestamp'])}} sk_account_id,
    a.account_id,
    sk_broker_id,
    sk_customer_id,
    a.status,
    account_desc,
    tax_status,
    a.effective_timestamp,
    a.end_timestamp,
    a.is_current
from
    {{ ref('accounts') }} a
join
    {{ ref('dim_customer') }} c
    on a.customer_id = c.customer_id
    and a.effective_timestamp between c.effective_timestamp and c.end_timestamp
join
    {{ ref('dim_broker') }} b
using (broker_id)
--------------------------------------------------------------------------------
/models/bronze/finwire/finwire_security.sql:
--------------------------------------------------------------------------------
with s1 as (
    select *,
        try_to_number(co_name_or_cik) as try_cik
    from {{ source("finwire", "sec") }}
)
select
    pts,
    symbol,
    issue_type,
    status,
    name,
    ex_id,
    to_number(sh_out) as sh_out,
    to_date(first_trade_date,'yyyymmdd') as first_trade_date,
    to_date(first_exchange_date,'yyyymmdd') as first_exchange_date,
    cast(dividend as float) as dividend,
    try_cik cik,
    case when try_cik is null then co_name_or_cik else null end company_name
from s1
--------------------------------------------------------------------------------
/models/example/my_first_dbt_model.sql:
--------------------------------------------------------------------------------

/*
    Welcome to your first dbt model!
    Did you know that you can also configure models directly within SQL files?
    This will override configurations stated in dbt_project.yml

    Try changing "table" to "view" below
*/
{{ config(enabled=false) }}
{{ config(materialized='table') }}

with source_data as (

    select 1 as id
    union all
    select null as id

)

select *
from source_data

/*
    Uncomment the line below to remove records with null `id` values
*/

-- where id is not null
--------------------------------------------------------------------------------
/models/silver/trades.sql:
--------------------------------------------------------------------------------
with s1 as (
    select distinct
        trade_id,
        account_id,
        trade_status,
        trade_type,
        transaction_type,
        symbol,
        executor_name,
        quantity,
        bid_price,
        trade_price,
        fee,
        commission,
        tax,
        min(effective_timestamp) over (partition by trade_id) create_timestamp,
        max(effective_timestamp) over (partition by trade_id) close_timestamp
    from
        {{ ref('trades_history') }}
    order by trade_id, create_timestamp
)
select *
from s1
--------------------------------------------------------------------------------
/models/gold/fact_market_history.sql:
--------------------------------------------------------------------------------
SELECT
    s.sk_security_id,
    s.sk_company_id,
    dm_date sk_date_id,
    --dmh.dm_close / sum_basic_eps AS peratio,
    (s.dividend / dmh.dm_close) / 100 yield,
    fifty_two_week_high,
    fifty_two_week_high_date sk_fifty_two_week_high_date,
    fifty_two_week_low,
    fifty_two_week_low_date sk_fifty_two_week_low_date,
    dm_close closeprice,
    dm_high dayhigh,
    dm_low daylow,
    dm_vol volume
FROM {{ ref("daily_market") }} dmh
JOIN {{ ref("dim_security") }} s
    ON s.symbol = dmh.dm_s_symb
    AND dmh.dm_date between s.effective_timestamp and s.end_timestamp
LEFT JOIN {{ ref("wrk_company_financials")}} f
USING (sk_company_id)

--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
name-template: 'v$RESOLVED_VERSION'
tag-template: 'v$RESOLVED_VERSION'
categories:
  - title: '🚀 Features'
    labels:
      - 'feature'
      - 'enhancement'
  - title: '🐛 Bug Fixes'
    labels:
      - 'fix'
      - 'bugfix'
      - 'bug'
  - title: '🧰 Maintenance'
    label: 'chore'
change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks.
version-resolver:
  major:
    labels:
      - 'major'
  minor:
    labels:
      - 'minor'
  patch:
    labels:
      - 'patch'
  default: patch
template: |
  ## Changes

  $CHANGES
--------------------------------------------------------------------------------
/models/gold/dim_security.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        symbol,
        issue_type issue,
        s.status,
        s.name,
        exchange_id,
        sk_company_id,
        shares_outstanding,
        first_trade_date,
        first_exchange_date,
        dividend,
        s.effective_timestamp,
        s.end_timestamp,
        s.IS_CURRENT
    from
        {{ ref("securities") }} s
    join
        {{ ref("dim_company") }} c
    on
        s.company_id = c.company_id
    and
        s.effective_timestamp between c.effective_timestamp and c.end_timestamp
)
select
    {{dbt_utils.generate_surrogate_key(['symbol','effective_timestamp'])}} sk_security_id,
    *
from
    s1
--------------------------------------------------------------------------------
/devrel/multi-record.txt:
--------------------------------------------------------------------------------
20151230-152248FIN201542015100120151230      4880880089.63       2473473307.30       4.82       4.42       0.51       200321223.47       139284472514.02       9402305760.19       512872010       5597926720000001595
20151230-152511SECAAAAAAAAAAAAKVDPREF_AACTVDJBJXyQHLBvn      EEOGAOvUNgL XwrOxQUBMrgPv      AMEX      982113436      1903022619730704      1.200000000254
20151230-163207CMPWWfcsOHprIDIUsPfRLrcLPlxaQ      0000004432ACTVMCA      1873092521088 Vessey Crescent      M5D 1Z1      Winnipeg      AL      United States of AmericaMoreno rlRIDCNz dVGrEzomCXIvZVZzFzxCzbGYIEbAXJMJlsYUQEV
--------------------------------------------------------------------------------
/models/gold/fact_holdings.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        *
    from {{ ref('holdings_history') }}
)
select
    ct.sk_trade_id sk_current_trade_id,
    pt.sk_trade_id,
    sk_customer_id,
    sk_account_id,
    sk_security_id,
    to_date(create_timestamp) sk_trade_date,
    create_timestamp trade_timestamp,
    trade_price current_price,
    quantity current_holding,
    bid_price current_bid_price,
    fee current_fee,
    commission current_commission
from s1
join {{ ref('dim_trade') }} ct
    using (trade_id)
join {{ ref('dim_trade') }} pt
    on s1.previous_trade_id = pt.trade_id
join {{ ref('dim_account') }} a
    on s1.account_id = a.account_id
    and s1.create_timestamp between a.effective_timestamp and a.end_timestamp
join {{ ref('dim_security') }} s
    on s1.symbol = s.symbol
--------------------------------------------------------------------------------
/models/gold/fact_trade.sql:
--------------------------------------------------------------------------------
select
    sk_trade_id,
    sk_broker_id,
    sk_customer_id,
    sk_account_id,
    sk_security_id,
    to_date(create_timestamp) sk_create_date,
    create_timestamp,
    to_date(close_timestamp) sk_close_date,
    close_timestamp,
    executed_by,
    quantity,
    bid_price,
    trade_price,
    fee,
    commission,
    tax
from {{ ref('trades') }} t
join {{ ref('dim_trade') }} dt
    on t.trade_id = dt.trade_id
    and t.create_timestamp between dt.effective_timestamp and dt.end_timestamp
join
    {{ ref('dim_account') }} a
on
    t.account_id = a.account_id
and
    t.create_timestamp between a.effective_timestamp and a.end_timestamp
join
    {{ ref('dim_security') }} s
on
    t.symbol = s.symbol
and
    t.create_timestamp between s.effective_timestamp and s.end_timestamp
--------------------------------------------------------------------------------
/models/bronze/finwire/finwire_financial.sql:
--------------------------------------------------------------------------------

with s1 as (
    select
        *,
        try_to_number(co_name_or_cik) as try_cik
    from {{ source("finwire", "fin") }}
)
select
    pts,
    to_number(year) as year,
    to_number(quarter) as quarter,
    to_date(quarter_start_date,'yyyymmdd') as quarter_start_date,
    to_date(posting_date,'yyyymmdd') as posting_date,
    cast(revenue as float) as revenue,
    cast(earnings as float) as earnings,
    cast(eps as float) as eps,
    cast(diluted_eps as float) as diluted_eps,
    cast(margin as float) as margin,
    cast(inventory as float) as inventory,
    cast(assets as float) as assets,
    cast(liabilities as float) as liabilities,
    to_number(sh_out) as sh_out,
    to_number(diluted_sh_out) as diluted_sh_out,
    try_cik cik,
    case when try_cik is null then co_name_or_cik else null end company_name
from s1
--------------------------------------------------------------------------------
/models/gold/dim_company.sql:
--------------------------------------------------------------------------------
select
    {{dbt_utils.generate_surrogate_key(['company_id','effective_timestamp'])}} sk_company_id,
    company_id,
    status,
    name,
    industry,
    ceo,
    address_line1,
    address_line2,
    postal_code,
    city,
    state_province,
    country,
    description,
    founding_date,
    sp_rating,
    case
        when
            sp_rating in (
                'BB',
                'B',
                'CCC',
                'CC',
                'C',
                'D',
                'BB+',
                'B+',
                'CCC+',
                'BB-',
                'B-',
                'CCC-'
            )
        then true
        else false
    end as is_lowgrade,
    effective_timestamp,
    end_timestamp,
    is_current
from {{ ref("companies") }}
--------------------------------------------------------------------------------
/models/silver/watches.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        customer_id,
        symbol,
        watch_timestamp,
        action_type,
        company_id,
        company_name,
        exchange_id,
        security_status,
        case action_type
            when 'Activate' then watch_timestamp
            else null
        end placed_timestamp,
        case action_type
            when 'Cancelled' then watch_timestamp
            else null
        end removed_timestamp
    from
        {{ ref('watches_history') }}
),
s2 as (
    select
        customer_id,
        symbol,
        company_id,
        company_name,
        exchange_id,
        security_status,
        min(placed_timestamp) placed_timestamp,
        max(removed_timestamp) removed_timestamp
    from s1
    group by all
)
select
    *,
    case
        when removed_timestamp is null then 'Active'
        else 'Inactive'
    end watch_status
from s2

--------------------------------------------------------------------------------
/models/silver/companies.sql:
--------------------------------------------------------------------------------

select
    cik as company_id,
    st.st_name status,
    company_name name,
    ind.in_name industry,
    ceo_name ceo,
    address_line1,
    address_line2,
    postal_code,
    city,
    state_province,
    country,
    description,
    founding_date,
    sp_rating,
    pts as effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(pts) over (
                partition by company_id
                order by
                    pts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by company_id
                order by
                    pts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from {{ ref("finwire_company") }} cmp
join {{ ref("reference_status_type") }} st on cmp.status = st.st_id
join {{ ref("reference_industry") }} ind on cmp.industry_id = ind.in_id
--------------------------------------------------------------------------------
/.github/workflows/release-drafter.yml:
--------------------------------------------------------------------------------
name: Release Drafter

on:
  push:
    # branches to consider in the event; optional, defaults to all
    branches:
      - main
  # pull_request event is required only for autolabeler
  pull_request:
    # Only following types are handled by the action, but one can default to all as well
    types: [opened, reopened, synchronize]
  # pull_request_target event is required for autolabeler to support PRs from forks
  # pull_request_target:
  #   types: [opened, reopened, synchronize]

permissions:
  contents: read

jobs:
  update_release_draft:
    permissions:
      # write permission is required to create a github release
      contents: write
      # write permission is required for autolabeler
      # otherwise, read permission is required at least
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      # Drafts your next Release notes as Pull Requests are merged into "master"
      - uses: release-drafter/release-drafter@v5
        with:
          commitish: main
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--------------------------------------------------------------------------------
/models/silver/securities.sql:
--------------------------------------------------------------------------------
select
    symbol,
    issue_type,
    case s.status
        when 'ACTV' then 'Active'
        when 'INAC' then 'Inactive'
        else null
    end status,
    s.name,
    ex_id exchange_id,
    sh_out shares_outstanding,
    first_trade_date,
    first_exchange_date,
    dividend,
    coalesce(c1.name,c2.name) company_name,
    coalesce(c1.company_id, c2.company_id) company_id,
    pts as effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(pts) over (
                partition by symbol
                order by
                    pts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by symbol
                order by
                    pts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from {{ ref('finwire_security') }} s
left join {{ ref('companies') }} c1
    on s.cik = c1.company_id
    and pts between c1.effective_timestamp and c1.end_timestamp
left join {{ ref('companies') }} c2
    on s.company_name = c2.name
    and pts between c2.effective_timestamp and c2.end_timestamp

--------------------------------------------------------------------------------
/models/silver/daily_market.sql:
--------------------------------------------------------------------------------
with
s1 as (
    select
        -- dm_date,
        min(dm_low) over (
            partition by dm_s_symb
            order by dm_date asc
            rows between 364 preceding and 0 following -- CURRENT ROW
        ) fifty_two_week_low,
        max(dm_high) over (
            partition by dm_s_symb
            order by dm_date asc
            rows between 364 preceding and 0 following -- CURRENT ROW
        ) fifty_two_week_high,
        *
    from {{ ref("brokerage_daily_market") }}
),
s2 as (
    select a.*,
        b.dm_date as fifty_two_week_low_date,
        c.dm_date as fifty_two_week_high_date
    from s1 a
    join
        s1 b
        on a.dm_s_symb = b.dm_s_symb
        and a.fifty_two_week_low = b.dm_low
        and b.dm_date between add_months(a.dm_date, -12) and a.dm_date
    join
        s1 c
        on a.dm_s_symb = c.dm_s_symb
        and a.fifty_two_week_high = c.dm_high
        and c.dm_date between add_months(a.dm_date, -12) and a.dm_date
)
select *
from s2
qualify
    row_number() over (
        partition by dm_s_symb, dm_date
        order by fifty_two_week_low_date, fifty_two_week_high_date
    ) = 1
--------------------------------------------------------------------------------
/models/silver/financials.sql:
--------------------------------------------------------------------------------
with s1 as (
    select
        YEAR,
        QUARTER,
        QUARTER_START_DATE,
        POSTING_DATE,
        REVENUE,
        EARNINGS,
        EPS,
        DILUTED_EPS,
        MARGIN,
        INVENTORY,
        ASSETS,
        LIABILITIES,
        SH_OUT,
        DILUTED_SH_OUT,
        coalesce(c1.name,c2.name) company_name,
        coalesce(c1.company_id, c2.company_id) company_id,
        pts as effective_timestamp
    from {{ ref('finwire_financial') }} s
    left join {{ ref('companies') }} c1
        on s.cik = c1.company_id
        and pts between c1.effective_timestamp and c1.end_timestamp
    left join {{ ref('companies') }} c2
        on s.company_name = c2.name
        and pts between c2.effective_timestamp and c2.end_timestamp
)
select
    *,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(effective_timestamp) over (
                partition by company_id
                order by
                    effective_timestamp desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by company_id
                order by
                    effective_timestamp desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from s1
--------------------------------------------------------------------------------
/models/silver/trades_history.sql:
--------------------------------------------------------------------------------
select
    t_id trade_id,
    t_dts trade_timestamp,
    t_ca_id account_id,
    ts.st_name trade_status,
    tt_name trade_type,
    case t_is_cash
        when true then 'Cash'
        when false then 'Margin'
    end transaction_type,
    t_s_symb symbol,
    t_exec_name executor_name,
    t_qty quantity,
    t_bid_price bid_price,
    t_trade_price trade_price,
    t_chrg fee,
    t_comm commission,
    t_tax tax,
    us.st_name update_status,
    th_dts effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(th_dts) over (
                partition by t_id
                order by
                    th_dts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by t_id
                order by
                    th_dts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from
    {{ ref('brokerage_trade') }}
join
    {{ ref('brokerage_trade_history') }}
on
    t_id = th_t_id
join
    {{ ref('reference_trade_type') }}
on
    t_tt_id = tt_id
join
    {{ ref('reference_status_type') }} ts
on
    t_st_id = ts.st_id
join
    {{ ref('reference_status_type') }} us
on th_st_id = us.st_id
--------------------------------------------------------------------------------
/dbt_project.yml:
--------------------------------------------------------------------------------

# Name your project! Project names should contain only lowercase characters
# and underscores. A good package name should reflect your organization's
# name or the intended use of these models
name: 'dbt_tpcdi'
version: '1.0.0'
config-version: 2

# This setting configures which "profile" dbt uses for this project.
profile: 'tpcdi'

# These configurations specify where dbt should look for different types of files.
# The `model-paths` config, for example, states that models in this project can be
# found in the "models/" directory. You probably won't need to change these!
model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

clean-targets: # directories to be removed by `dbt clean`
  - "target"
  - "dbt_packages"

# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models

models:
  dbt_tpcdi:
    example:
      +materialized: view
    bronze:
      +schema: bronze
      +materialized: dynamic_table
      +snowflake_warehouse: tpcdi_large
      +target_lag: downstream
    silver:
      +schema: silver
      +materialized: dynamic_table
      +snowflake_warehouse: tpcdi_large
      +target_lag: '10 minutes'
    gold:
      +schema: gold
      +materialized: dynamic_table
      +snowflake_warehouse: tpcdi_large
      +target_lag: '20 minutes'
    work:
      +schema: work
      +materialized: ephemeral
--------------------------------------------------------------------------------
/models/silver/accounts.sql:
--------------------------------------------------------------------------------
select
    action_type,
    decode(action_type,
        'NEW','Active',
        'ADDACCT','Active',
        'UPDACCT','Active',
        'CLOSEACCT','Inactive') status,
    ca_id account_id,
    ca_name account_desc,
    c_id customer_id,
    c_tax_id tax_id,
    c_gndr gender,
    c_tier tier,
    c_dob dob,
    c_l_name last_name,
    c_f_name first_name,
    c_m_name middle_name,
    c_adline1 address_line1,
    c_adline2 address_line2,
    c_zipcode postal_code,
    c_city city,
    c_state_prov state_province,
    c_ctry country,
    c_prim_email primary_email,
    c_alt_email alternate_email,
    c_phone_1 phone1,
    c_phone_2 phone2,
    c_phone_3 phone3,
    c_lcl_tx_id local_tax_rate_name,
    ltx.tx_rate local_tax_rate,
    c_nat_tx_id national_tax_rate_name,
    ntx.tx_rate national_tax_rate,
    ca_tax_st tax_status,
    ca_b_id broker_id,
    action_ts as effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(action_ts) over (
                partition by ca_id
                order by
                    action_ts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by ca_id
                order by
                    action_ts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from
    {{ ref('crm_customer_mgmt') }} c
left join
    {{ ref('reference_tax_rate') }} ntx
on
    c.c_nat_tx_id = ntx.tx_id
left join
    {{ ref('reference_tax_rate') }} ltx
on
    c.c_lcl_tx_id = ltx.tx_id
where ca_id is not null
--------------------------------------------------------------------------------
/models/silver/customers.sql:
--------------------------------------------------------------------------------
select
    action_type,
    decode(action_type,
        'NEW','Active',
        'ADDACCT','Active',
        'UPDACCT','Active',
        'UPDCUST','Active',
        'INACT','Inactive') status,
    c_id customer_id,
    ca_id account_id,
    c_tax_id tax_id,
    c_gndr gender,
    c_tier tier,
    c_dob dob,
    c_l_name last_name,
    c_f_name first_name,
    c_m_name middle_name,
    c_adline1 address_line1,
    c_adline2 address_line2,
    c_zipcode postal_code,
    c_city city,
    c_state_prov state_province,
    c_ctry country,
    c_prim_email primary_email,
    c_alt_email alternate_email,
    c_phone_1 phone1,
    c_phone_2 phone2,
    c_phone_3 phone3,
    c_lcl_tx_id local_tax_rate_name,
    ltx.tx_rate local_tax_rate,
    c_nat_tx_id national_tax_rate_name,
    ntx.tx_rate national_tax_rate,
    ca_tax_st account_tax_status,
    ca_b_id broker_id,
    action_ts as effective_timestamp,
    ifnull(
        timestampadd(
            'millisecond',
            -1,
            lag(action_ts) over (
                partition by c_id
                order by
                    action_ts desc
            )
        ),
        to_timestamp('9999-12-31 23:59:59.999')
    ) as end_timestamp,
    CASE
        WHEN (
            row_number() over (
                partition by c_id
                order by
                    action_ts desc
            ) = 1
        ) THEN TRUE
        ELSE FALSE
    END as IS_CURRENT
from
    {{ ref('crm_customer_mgmt') }} c
left join
    {{ ref('reference_tax_rate') }} ntx
on
    c.c_nat_tx_id = ntx.tx_id
left join
    {{ ref('reference_tax_rate') }} ltx
on
    c.c_lcl_tx_id = ltx.tx_id
where action_type in ('NEW', 'INACT', 'UPDCUST')
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
name: tpcdi
channels:
  - defaults
dependencies:
  - abseil-cpp=20211102.0=hc377ac9_0
  - arrow-cpp=10.0.1=py38h0fd5f32_0
  - asn1crypto=1.5.1=py38hca03da5_0
  - aws-c-common=0.6.8=h80987f9_1
  - aws-c-event-stream=0.1.6=h313beb8_6
  - aws-checksums=0.1.11=h80987f9_2
  - aws-sdk-cpp=1.8.185=h4a942e0_0
  - blas=1.0=openblas
  - boost-cpp=1.73.0=h1a28f6b_12
  - brotli=1.0.9=h1a28f6b_7
  - brotli-bin=1.0.9=h1a28f6b_7
  - brotlipy=0.7.0=py38h1a28f6b_1002
  - bzip2=1.0.8=h620ffc9_4
  - c-ares=1.19.0=h80987f9_0
  - ca-certificates=2023.05.30=hca03da5_0
  - certifi=2023.5.7=py38hca03da5_0
  - cffi=1.15.1=py38h80987f9_3
  - charset-normalizer=2.0.4=pyhd3eb1b0_0
  - cloudpickle=2.0.0=pyhd3eb1b0_0
  - cryptography=39.0.1=py38h834c97f_0
  - filelock=3.9.0=py38hca03da5_0
  - gflags=2.2.2=hc377ac9_0
  - glog=0.5.0=hc377ac9_0
  - gmp=6.2.1=hc377ac9_3
  - grpc-cpp=1.46.1=h8e4afa7_1
  - icu=68.1=hc377ac9_0
  - idna=3.4=py38hca03da5_0
  - krb5=1.20.1=h8380606_1
  - libboost=1.73.0=h49e8a49_12
  - libbrotlicommon=1.0.9=h1a28f6b_7
  - libbrotlidec=1.0.9=h1a28f6b_7
  - libbrotlienc=1.0.9=h1a28f6b_7
  - libcurl=8.1.1=h0f1d93c_1
  - libcxx=14.0.6=h848a8c0_0
  - libedit=3.1.20221030=h80987f9_0
  - libev=4.33=h1a28f6b_1
  - libevent=2.1.12=hf27765b_0
  - libffi=3.4.4=hca03da5_0
  - libgfortran=5.0.0=11_3_0_hca03da5_28
  - libgfortran5=11.3.0=h009349e_28
  - libiconv=1.16=h1a28f6b_2
  - libnghttp2=1.52.0=h10c0552_1
  - libopenblas=0.3.21=h269037a_0
  - libprotobuf=3.20.3=h514c7bf_0
  - libssh2=1.10.0=h449679c_2
  - libthrift=0.13.0=hd358383_6
  - llvm-openmp=14.0.6=hc6e5704_0
  - lz4-c=1.9.4=h313beb8_0
  - ncurses=6.4=h313beb8_0
  - numpy=1.24.3=py38h1398885_0
  - numpy-base=1.24.3=py38h90707a3_0
  - openssl=1.1.1u=h1a28f6b_0
  - orc=1.7.4=hdca1487_1
  - oscrypto=1.2.1=pyhd3eb1b0_0
  - packaging=23.0=py38hca03da5_0
  - pip=23.1.2=py38hca03da5_0
  - pyarrow=10.0.1=py38h23c13bc_0
  - pycparser=2.21=pyhd3eb1b0_0
  - pycryptodomex=3.15.0=py38h1a28f6b_0
  - pyjwt=2.4.0=py38hca03da5_0
  - pyopenssl=23.0.0=py38hca03da5_0
  - pysocks=1.7.1=py38hca03da5_0
  - python=3.8.17=hc0d8a6c_0
  - pytz=2022.7=py38hca03da5_0
  - re2=2022.04.01=hc377ac9_0
  - readline=8.2=h1a28f6b_0
  - requests=2.29.0=py38hca03da5_0
  - setuptools=67.8.0=py38hca03da5_0
  - snappy=1.1.9=hc377ac9_0
  - snowflake-connector-python=3.0.3=py38h78102c4_0
  - snowflake-snowpark-python=1.5.1=py38hca03da5_0
  - sqlite=3.41.2=h80987f9_0
  - tk=8.6.12=hb8d0fd4_0
  - typing-extensions=4.6.3=py38hca03da5_0
  - typing_extensions=4.6.3=py38hca03da5_0
  - urllib3=1.26.16=py38hca03da5_0
  - utf8proc=2.6.1=h1a28f6b_0
  - wheel=0.38.4=py38hca03da5_0
  - xz=5.4.2=h80987f9_0
  - zlib=1.2.13=h5a0b063_0
  - zstd=1.5.5=hd90d995_0
  - pip:
      - click==8.1.6
      - markdown-it-py==3.0.0
      - mdurl==0.1.2
      - pygments==2.15.1
      - rich==13.4.2
      - typer==0.9.0
prefix: /opt/homebrew/Caskroom/miniconda/base/envs/tpcdi
--------------------------------------------------------------------------------
/models/gold/dim_customer.sql:
--------------------------------------------------------------------------------
with s1 as (
    select c.*,
        p.agency_id,
        p.credit_rating,
        p.net_worth
    FROM {{ ref('customers') }} c
    left join {{ ref('syndicated_prospect') }} p
        using (first_name, last_name, postal_code, address_line1, address_line2)
),
s2 as (
    SELECT
        {{dbt_utils.generate_surrogate_key(['customer_id','effective_timestamp'])}} sk_customer_id,
        customer_id,
        coalesce(tax_id, last_value(tax_id) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) tax_id,
        status,
        coalesce(last_name, last_value(last_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) last_name,
        coalesce(first_name, last_value(first_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) first_name,
        coalesce(middle_name, last_value(middle_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) middleinitial,
        coalesce(gender, last_value(gender) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) gender,
        coalesce(tier, last_value(tier) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) tier,
        coalesce(dob, last_value(dob) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) dob,
        coalesce(address_line1, last_value(address_line1) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) address_line1,
        coalesce(address_line2, last_value(address_line2) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) address_line2,
        coalesce(postal_code, last_value(postal_code) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) postal_code,
        coalesce(CITY, last_value(CITY) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) CITY,
        coalesce(state_province, last_value(state_province) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) state_province,
        coalesce(country, last_value(country) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) country,
        coalesce(phone1, last_value(phone1) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) phone1,
        coalesce(phone2, last_value(phone2) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) phone2,
        coalesce(phone3, last_value(phone3) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) phone3,
        coalesce(primary_email, last_value(primary_email) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) primary_email,
        coalesce(alternate_email, last_value(alternate_email) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) alternate_email,
        coalesce(local_tax_rate_name, last_value(local_tax_rate_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) local_tax_rate_name,
        coalesce(local_tax_rate, last_value(local_tax_rate) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) local_tax_rate,
        coalesce(national_tax_rate_name, last_value(national_tax_rate_name) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) national_tax_rate_name,
        coalesce(national_tax_rate, last_value(national_tax_rate) IGNORE NULLS OVER (
            PARTITION BY customer_id
            ORDER BY effective_timestamp)) national_tax_rate,
        agency_id,
        credit_rating,
        net_worth,
        effective_timestamp,
        end_timestamp,
        is_current
    FROM s1
)
select *
from s2
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# What is this?
This project contains two distinct portions:
1. A CLI developed using [Snowpark Python](https://docs.snowflake.com/en/developer-guide/snowpark/python/index) and [Typer](https://typer.tiangolo.com/). This CLI creates tables in Snowflake for the files generated by the [TPC-DI](https://www.tpc.org/tpcdi/) benchmark specification.
2. A [dbt](https://www.getdbt.com/) project using [Snowflake Dynamic Tables](https://docs.snowflake.com/en/user-guide/dynamic-tables-about) for building the data warehouse defined in the TPC-DI specification.


![Figure 1.2–1 from the TPC-DI specification describes the ETL process.](images/tpc-di-etl-diagram.png)

I should be clear about my goal here: I am not attempting to _actually run and measure_ the TPC-DI benchmark.
The `tpcdi.py` CLI in this repository is single-threaded and ingests the data sequentially, which would be the worst approach if trying to optimize for a benchmark.
Instead, I needed a dataset that could be used to mimic data engineering workloads inside of Snowflake, so I just wanted the files loaded.


![Figure 1.4–1 from the TPC-DI specification describes the target logical model. More on DimTrade later.](images/tpc-di-logical-model.png)


I took a few liberties with the TPC-DI specification to update it a bit for Snowflake. I replaced `CamelCase` names with `SNAKE_CASE`, mostly out of irritation with their readability.
Secondly, I just couldn't stand having the `DimTrade` table be "both a dimension table and a fact table, depending on how it is being used" as it was designed by TPC-DI.
That decision was made during an era when storage and compute were constrained, so in my design, I created both `DIM_TRADE` and `FACT_TRADE` tables.
Finally, I used a Medallion Lakehouse Architecture with Bronze, Silver, and Gold zones, with the logical model above materialized in the Gold zone.

# Demo Notebook
I've included a [demo Jupyter notebook](demo.ipynb) that walks through a subset of the instructions below.
This notebook pairs with a presentation I give that focuses more on how Snowpark Python was used in the CLI, as well as the motivation for using Dynamic Tables with dbt. But it may still be helpful.

# Using DIGen.jar to Generate Source Files
The Java program to generate the source files is downloaded by [filling out a form on the TPC-DI website](https://www.tpc.org/TPC_Documents_Current_Versions/download_programs/tools-download-request5.asp?bm_type=TPC-DI&bm_vers=1.1.0&mode=CURRENT-ONLY) and clicking a link in an email. Once unzipped, we have to make one slight change for running on macOS:

```bash
# The PDGF directory was clearly written on a case-insensitive OS.
unzip 66a2b600-af36-4198-bfbc-c94c40cc22af-tpc-di-tool.zip && \
mv Tools/PDGF Tools/pdgf && \
cd Tools
```

I couldn't find any way to execute the provided JAR with a Java version newer than `1.8`. I spent some time trying to rebuild the JAR file using a newer Java version for posterity, but it was a fool's errand.
I installed [Azul Zulu Java 1.8](https://www.azul.com/downloads/?package=jdk#zulu) and used [jEnv](https://www.jenv.be/) to set a local version, and with that, we can see the help context from the JAR:

```
jenv add /Library/Java/JavaVirtualMachines/zulu-8.jdk/Contents/Home && \
jenv local 1.8 && \
java -jar DIGen.jar --help

usage: DIGen
 -h         print this message
 -jvm       JVM options. E.g. -jvm "-Xms1g -Xmx2g"
 -o         Specify output directory. Default is output.
 -sf        Scale factor. Default value is 5. (range: 3 - 2147483647
 -v         print DIGen version
```

This utility will generate a bunch of different data files in various formats, with a scaling (or multiplication) factor determining how much data the files contain.
It mimics the ETL processes of the era when the specification was written, which generally relied on file extracts from source systems. It generates CSVs and pipe-separated files (PSVs?), which are quite simple to load with Snowpark.
The two file formats that proved the most fun and challenging were fixed-width fields and XML, as both required heavy DataFrame transformations. The files are generated in batches, with `Batch1` representing the historical load, and `Batch2` and `Batch3` representing various incremental loads.
Currently, I've only tested the loader against `Batch1`, and the dbt models have not yet been extended to handle additional batches.
Also, I haven't yet implemented the Audit portion of the specification, which is somewhat embarrassing as a former co-founder of a data quality company.
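
Since the fixed-width FINWIRE files come up again below, here is a minimal sketch of how one record splits apart -- an illustration only, not code from `tpcdi.py`. Every line leads with a 15-character posting timestamp (PTS) and a 3-character record type, and the layout of the remaining fields depends entirely on that type (the sample lines in `devrel/multi-record.txt` show all three types):

```python
# A minimal sketch of the FINWIRE fixed-width layout -- not the actual loader code,
# which does the equivalent slicing with Snowpark DataFrame expressions.
# The sample line is the first record from devrel/multi-record.txt, truncated here.
line = "20151230-152248FIN201542015100120151230      4880880089.63"

pts = line[0:15]        # posting timestamp: '20151230-152248'
rec_type = line[15:18]  # 'CMP', 'SEC', or 'FIN' -- selects the remaining layout
payload = line[18:]     # type-specific fixed-width fields

print(pts, rec_type, payload[:5])  # 20151230-152248 FIN 20154
```

With that detour noted, let's generate the files: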

```bash
java -jar DIGen.jar -o ~/dev/tpcdi-output -sf 10 && \
ls -lhtr ~/dev/tpcdi-output/Batch1

-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 StatusType_audit.csv
-rw-r--r--  1 stewartbryson  staff    89B Jul 21 14:30 StatusType.txt
-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 TaxRate_audit.csv
-rw-r--r--  1 stewartbryson  staff    17K Jul 21 14:30 TaxRate.txt
-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 Date_audit.csv
-rw-r--r--  1 stewartbryson  staff   3.3M Jul 21 14:30 Date.txt
-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 Time_audit.csv
-rw-r--r--  1 stewartbryson  staff   4.6M Jul 21 14:30 Time.txt
-rw-r--r--  1 stewartbryson  staff    12B Jul 21 14:30 BatchDate.txt
-rw-r--r--  1 stewartbryson  staff   3.9M Jul 21 14:30 HR.csv
-rw-r--r--  1 stewartbryson  staff    84B Jul 21 14:30 HR_audit.csv
-rw-r--r--  1 stewartbryson  staff   391B Jul 21 14:30 CustomerMgmt_audit.csv
-rw-r--r--  1 stewartbryson  staff    30M Jul 21 14:30 CustomerMgmt.xml
-rw-r--r--  1 stewartbryson  staff    10M Jul 21 14:30 Prospect.csv
-rw-r--r--  1 stewartbryson  staff   141B Jul 21 14:30 Prospect_audit.csv
-rw-r--r--  1 stewartbryson  staff    52B Jul 21 14:30 Industry_audit.csv
-rw-r--r--  1 stewartbryson  staff   2.6K Jul 21 14:30 Industry.txt
-rw-r--r--  1 stewartbryson  staff    81K Jul 21 14:30 FINWIRE1967Q1
-rw-r--r--  1 stewartbryson  staff   219B Jul 21 14:30 FINWIRE1967Q1_audit.csv
-rw-r--r--  1 stewartbryson  staff    77K Jul 21 14:30 FINWIRE1967Q2

{ Truncated }
```

# The Python Snowpark Loader: tpcdi.py
I used Conda to build my virtual environment, with all the steps to replicate in the snippet below:

```bash
conda env create -f environment.yml && \
conda activate tpcdi && \
python tpcdi.py --help
```

![Output from `tpcdi.py --help`](images/tpcdi-help.png)

I created the loader using Typer for the CLI interface and Snowpark for uploading files, creating DataFrames from those files -- in some cases doing heavy transformations -- and saving them as tables.
Credentials are provided using a `credentials.json` file in the root directory, which looks like this:

```json
{
    "account": "myaccount",
    "user": "myuser",
    "password": "mypassword",
    "role": "myrole",
    "warehouse": "stewart_dev",
    "database": "tpc_di",
    "schema": "digen"
}
```
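
To show how those pieces fit together, here is a heavily simplified sketch of the loader's shape -- a Typer command that builds a Snowpark session from that `credentials.json`, stages a file, and saves a DataFrame as a table. This is an illustration with the logic reduced to the `StatusType.txt` case, not the actual `tpcdi.py` implementation:

```python
# A simplified sketch of the loader's shape -- not the actual tpcdi.py.
# Typer turns process_files() into the `process-files` command shown below.
import json

import typer
from snowflake.snowpark import Session
from snowflake.snowpark.types import StringType, StructField, StructType

app = typer.Typer()


@app.command()
def process_files(
    output_directory: str = typer.Option(...),
    file_name: str = "StatusType.txt",
    overwrite: bool = False,
) -> None:
    # Build a session from the credentials.json shown above.
    with open("credentials.json") as creds:
        session = Session.builder.configs(json.load(creds)).create()

    # Stage the file; with overwrite=False, Snowpark skips files already staged.
    for result in session.file.put(
        f"{output_directory}/Batch1/{file_name}", "@~", overwrite=overwrite
    ):
        print(f"File {file_name}: {result.status}")

    # Read the pipe-delimited file (PUT gzips it by default) and save it as a table.
    schema = StructType([
        StructField("ST_ID", StringType()),
        StructField("ST_NAME", StringType()),
    ])
    df = session.read.schema(schema).option("field_delimiter", "|").csv(f"@~/{file_name}.gz")
    df.write.mode("overwrite").save_as_table("STATUS_TYPE")


if __name__ == "__main__":
    app()
```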
56 | 57 | ```bash 58 | java -jar DIGen.jar -o ~/dev/tpcdi-output -sf 10 && \ 59 | ls -lhtr ~/dev/tpcdi-output/Batch1 60 | 61 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 StatusType_audit.csv 62 | -rw-r--r-- 1 stewartbryson staff 89B Jul 21 14:30 StatusType.txt 63 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 TaxRate_audit.csv 64 | -rw-r--r-- 1 stewartbryson staff 17K Jul 21 14:30 TaxRate.txt 65 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 Date_audit.csv 66 | -rw-r--r-- 1 stewartbryson staff 3.3M Jul 21 14:30 Date.txt 67 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 Time_audit.csv 68 | -rw-r--r-- 1 stewartbryson staff 4.6M Jul 21 14:30 Time.txt 69 | -rw-r--r-- 1 stewartbryson staff 12B Jul 21 14:30 BatchDate.txt 70 | -rw-r--r-- 1 stewartbryson staff 3.9M Jul 21 14:30 HR.csv 71 | -rw-r--r-- 1 stewartbryson staff 84B Jul 21 14:30 HR_audit.csv 72 | -rw-r--r-- 1 stewartbryson staff 391B Jul 21 14:30 CustomerMgmt_audit.csv 73 | -rw-r--r-- 1 stewartbryson staff 30M Jul 21 14:30 CustomerMgmt.xml 74 | -rw-r--r-- 1 stewartbryson staff 10M Jul 21 14:30 Prospect.csv 75 | -rw-r--r-- 1 stewartbryson staff 141B Jul 21 14:30 Prospect_audit.csv 76 | -rw-r--r-- 1 stewartbryson staff 52B Jul 21 14:30 Industry_audit.csv 77 | -rw-r--r-- 1 stewartbryson staff 2.6K Jul 21 14:30 Industry.txt 78 | -rw-r--r-- 1 stewartbryson staff 81K Jul 21 14:30 FINWIRE1967Q1 79 | -rw-r--r-- 1 stewartbryson staff 219B Jul 21 14:30 FINWIRE1967Q1_audit.csv 80 | -rw-r--r-- 1 stewartbryson staff 77K Jul 21 14:30 FINWIRE1967Q2 81 | 82 | { Truncated } 83 | ``` 84 | 85 | # The Python Snowpark Loader: tpcdi.py 86 | I used Conda to build my virtual environment, with all the steps to replicate in the snippet below: 87 | 88 | ```bash 89 | conda env create -f environment.yml && \ 90 | conda activate tpcdi && \ 91 | python tpcdi.py --help 92 | ``` 93 | 94 | ![Output from `tpcdi.py --help`](images/tpcdi-help.png) 95 | 96 | I created the loader using Typer for the CLI interface and Snowpark for uploading files, creating DataFrames from those files -- in some cases doing heavy transformations -- and saving them as tables. 97 | Credentials are provided using a credentials.json file in the root directory, and looks like this: 98 | 99 | ```json 100 | { 101 | "account": "myaccount", 102 | "user": "myuser", 103 | "password": "mypassword", 104 | "role": "myrole", 105 | "warehouse": "stewart_dev", 106 | "database": "tpc_di", 107 | "schema": "digen" 108 | } 109 | ``` 110 | 111 | Some improvements could be made to the credentials, schema, and database handling. 112 | The utility writes the data to whatever database and schema is specified in the credentials, so those aspects of the connection are required. 113 | Loading the files is accomplished with the process-files command and we can see the help context below: 114 | 115 | ``` 116 | ❯ python tpcdi.py process-files --help 117 | ``` 118 | 119 | ![Output from `tpcdi.py process-files --help`](images/tpcdi-process-files-help.png) 120 | 121 | Let's start by loading a rather simple file - the `StatusType.txt`  which is pipe-delimited. 122 | I'll first demonstrate the `--show` option, which displays a sample of the DataFrame instead of loading it to a table. 123 | As you might have guessed, I added this option to aid in the development of the loader. 
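Under the hood, `--show` is just a branch around the DataFrame writer. Here's the relevant logic, condensed from the `save_df` helper in `tpcdi.py` further down:

```python
from snowflake.snowpark import DataFrame

def save_df(df: DataFrame, table_name: str, show: bool = False):
    if show:
        # development aid: print a sample instead of writing the table
        df.show()
    else:
        df.write.mode("overwrite").save_as_table(table_name)
        print(f"{table_name.upper()} table created.")
```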
124 | Then I'll demonstrate loading the table: 125 | 126 | ``` 127 | ❯ python tpcdi.py process-files --output-directory ~/dev/tpcdi-output \ 128 | --file-name StatusType.txt --show 129 | 130 | File StatusType.txt: UPLOADED 131 | ----------------------- 132 | |"ST_ID" |"ST_NAME" | 133 | ----------------------- 134 | |ACTV |Active | 135 | |CMPT |Completed | 136 | |CNCL |Canceled | 137 | |PNDG |Pending | 138 | |SBMT |Submitted | 139 | |INAC |Inactive | 140 | ----------------------- 141 | 142 | ❯ python tpcdi.py process-files --output-directory ~/dev/tpcdi-output \ 143 | --file-name StatusType.txt 144 | 145 | File StatusType.txt: SKIPPED 146 | STATUS_TYPE table created. 147 | 148 | ❯ snowsql -c demo 149 | * SnowSQL * v1.2.27 150 | Type SQL statements or !help 151 | stewartbryson#STEWART_DEV@TPC_DI.DIGEN>select * from STATUS_TYPE; 152 | +-------+-----------+ 153 | | ST_ID | ST_NAME | 154 | |-------+-----------| 155 | | ACTV | Active | 156 | | CMPT | Completed | 157 | | CNCL | Canceled | 158 | | PNDG | Pending | 159 | | SBMT | Submitted | 160 | | INAC | Inactive | 161 | +-------+-----------+ 162 | 6 Row(s) produced. Time Elapsed: 0.752s 163 | stewartbryson#STEWART_DEV@TPC_DI.DIGEN> 164 | ``` 165 | 166 | Notice that the second execution used Snowpark's `overwrite=False` feature of skipping already existing files during a put, which can be overridden with the `--overwrite` option. 167 | For now, let's get the rest of the files loaded so we can move on to the dbt models. 168 | All DataFrames are saved in `overwrite` mode, so we can run it again without duplicating data: 169 | 170 | ```bash 171 | ❯ python tpcdi.py process-files --output-directory ~/dev/tpcdi-output 172 | 173 | File Date.txt: UPLOADED 174 | DATE table created. 175 | File DailyMarket.txt: UPLOADED 176 | DAILY_MARKET table created. 177 | File Industry.txt: UPLOADED 178 | INDUSTRY table created. 179 | File Prospect.csv: UPLOADED 180 | PROSPECT table created. 181 | File CustomerMgmt.xml: UPLOADED 182 | CUSTOMER_MGMT table created. 183 | File TaxRate.txt: UPLOADED 184 | TAX_RATE table created. 185 | File HR.csv: UPLOADED 186 | HR table created. 187 | File WatchHistory.txt: UPLOADED 188 | WATCH_HISTORY table created. 189 | File Trade.txt: UPLOADED 190 | TRADE table created. 191 | File TradeHistory.txt: UPLOADED 192 | TRADE_HISTORY table created. 193 | File StatusType.txt: SKIPPED 194 | STATUS_TYPE table created. 195 | 196 | { Truncated } 197 | ``` 198 | 199 | # Building the Data Warehouse with dbt 200 | I could have also used Snowpark Python to build the transformation pipelines as well, but instead, I wanted to use Dynamic Tables, and the ability to easily do this using dbt made the decision easy. 201 | 202 | In the Medallion architecture, we typically append raw data in their original format into Bronze, business entities modeled in Silver, and our highly curated facts and dimensions in Gold. 203 | I'm loading with an x-small warehouse and 4 threads, with a `DIGen.jar` scaling factor of 10. 204 | The dbt DAG looks like this: 205 | 206 | ![dbt DAG](images/dbt-dag.png) 207 | 208 | ```bash 209 | ❯ dbt build 210 | 16:55:34 Running with dbt=1.7.2 211 | 16:55:34 Registered adapter: snowflake=1.7.0 212 | 16:55:34 Found 45 models, 1 test, 17 sources, 0 exposures, 0 metrics, 544 macros, 0 groups, 0 semantic models 213 | 16:55:34 214 | 16:55:36 Concurrency: 20 threads (target='dev') 215 | 16:55:36 216 | 16:55:36 1 of 45 START sql dynamic_table model dl_bronze.brokerage_cash_transaction ..... 
[RUN] 217 | 16:55:36 2 of 45 START sql dynamic_table model dl_bronze.brokerage_daily_market ......... [RUN] 218 | 16:55:36 3 of 45 START sql dynamic_table model dl_bronze.brokerage_holding_history ...... [RUN] 219 | 16:55:36 4 of 45 START sql dynamic_table model dl_bronze.brokerage_trade ................ [RUN] 220 | 16:55:36 5 of 45 START sql dynamic_table model dl_bronze.brokerage_trade_history ........ [RUN] 221 | 16:55:36 6 of 45 START sql dynamic_table model dl_bronze.brokerage_watch_history ........ [RUN] 222 | 16:55:36 7 of 45 START sql dynamic_table model dl_bronze.crm_customer_mgmt .............. [RUN] 223 | 16:55:36 8 of 45 START sql dynamic_table model dl_bronze.finwire_company ................ [RUN] 224 | 16:55:36 9 of 45 START sql dynamic_table model dl_bronze.finwire_financial .............. [RUN] 225 | 16:55:36 10 of 45 START sql dynamic_table model dl_bronze.finwire_security .............. [RUN] 226 | 16:55:36 11 of 45 START sql dynamic_table model dl_bronze.hr_employee ................... [RUN] 227 | 16:55:36 12 of 45 START sql dynamic_table model dl_bronze.reference_date ................ [RUN] 228 | 16:55:36 13 of 45 START sql dynamic_table model dl_bronze.reference_industry ............ [RUN] 229 | 16:55:36 14 of 45 START sql dynamic_table model dl_bronze.reference_status_type ......... [RUN] 230 | 16:55:36 15 of 45 START sql dynamic_table model dl_bronze.reference_tax_rate ............ [RUN] 231 | 16:55:36 16 of 45 START sql dynamic_table model dl_bronze.reference_trade_type .......... [RUN] 232 | 16:55:36 17 of 45 START sql dynamic_table model dl_bronze.syndicated_prospect ........... [RUN] 233 | 16:55:38 13 of 45 OK created sql dynamic_table model dl_bronze.reference_industry ....... [SUCCESS 1 in 2.54s] 234 | 16:55:39 12 of 45 OK created sql dynamic_table model dl_bronze.reference_date ........... [SUCCESS 1 in 2.85s] 235 | 16:55:39 18 of 45 START sql dynamic_table model dl_silver.date .......................... [RUN] 236 | 16:55:39 14 of 45 OK created sql dynamic_table model dl_bronze.reference_status_type .... [SUCCESS 1 in 3.09s] 237 | 16:55:39 15 of 45 OK created sql dynamic_table model dl_bronze.reference_tax_rate ....... [SUCCESS 1 in 3.09s] 238 | 16:55:39 16 of 45 OK created sql dynamic_table model dl_bronze.reference_trade_type ..... [SUCCESS 1 in 3.21s] 239 | 16:55:39 9 of 45 OK created sql dynamic_table model dl_bronze.finwire_financial ......... [SUCCESS 1 in 3.57s] 240 | 16:55:40 8 of 45 OK created sql dynamic_table model dl_bronze.finwire_company ........... [SUCCESS 1 in 4.08s] 241 | 16:55:40 11 of 45 OK created sql dynamic_table model dl_bronze.hr_employee .............. [SUCCESS 1 in 4.08s] 242 | 16:55:40 19 of 45 START sql dynamic_table model dl_silver.companies ..................... [RUN] 243 | 16:55:40 20 of 45 START sql dynamic_table model dl_silver.employees ..................... [RUN] 244 | 16:55:40 10 of 45 OK created sql dynamic_table model dl_bronze.finwire_security ......... [SUCCESS 1 in 4.18s] 245 | 16:55:40 7 of 45 OK created sql dynamic_table model dl_bronze.crm_customer_mgmt ......... [SUCCESS 1 in 4.32s] 246 | 16:55:40 21 of 45 START sql dynamic_table model dl_silver.accounts ...................... [RUN] 247 | 16:55:40 22 of 45 START sql dynamic_table model dl_silver.customers ..................... [RUN] 248 | 16:55:41 18 of 45 OK created sql dynamic_table model dl_silver.date ..................... [SUCCESS 1 in 2.45s] 249 | 16:55:41 23 of 45 START sql dynamic_table model dl_gold.dim_date ........................ 
[RUN] 250 | 16:55:41 17 of 45 OK created sql dynamic_table model dl_bronze.syndicated_prospect ...... [SUCCESS 1 in 5.55s] 251 | 16:55:42 1 of 45 OK created sql dynamic_table model dl_bronze.brokerage_cash_transaction [SUCCESS 1 in 6.55s] 252 | 16:55:43 21 of 45 OK created sql dynamic_table model dl_silver.accounts ................. [SUCCESS 1 in 2.52s] 253 | 16:55:43 24 of 45 START sql dynamic_table model dl_silver.cash_transactions ............. [RUN] 254 | 16:55:43 19 of 45 OK created sql dynamic_table model dl_silver.companies ................ [SUCCESS 1 in 2.77s] 255 | 16:55:43 26 of 45 START sql dynamic_table model dl_silver.financials .................... [RUN] 256 | 16:55:43 25 of 45 START sql dynamic_table model dl_gold.dim_company ..................... [RUN] 257 | 16:55:43 27 of 45 START sql dynamic_table model dl_silver.securities .................... [RUN] 258 | 16:55:44 22 of 45 OK created sql dynamic_table model dl_silver.customers ................ [SUCCESS 1 in 3.64s] 259 | 16:55:44 28 of 45 START sql dynamic_table model dl_gold.dim_customer .................... [RUN] 260 | 16:55:44 20 of 45 OK created sql dynamic_table model dl_silver.employees ................ [SUCCESS 1 in 4.11s] 261 | 16:55:44 29 of 45 START sql dynamic_table model dl_gold.dim_broker ...................... [RUN] 262 | 16:55:44 4 of 45 OK created sql dynamic_table model dl_bronze.brokerage_trade ........... [SUCCESS 1 in 8.22s] 263 | 16:55:44 2 of 45 OK created sql dynamic_table model dl_bronze.brokerage_daily_market .... [SUCCESS 1 in 8.23s] 264 | 16:55:44 30 of 45 START sql dynamic_table model dl_silver.daily_market .................. [RUN] 265 | 16:55:44 23 of 45 OK created sql dynamic_table model dl_gold.dim_date ................... [SUCCESS 1 in 2.94s] 266 | 16:55:44 3 of 45 OK created sql dynamic_table model dl_bronze.brokerage_holding_history . [SUCCESS 1 in 8.49s] 267 | 16:55:46 25 of 45 OK created sql dynamic_table model dl_gold.dim_company ................ [SUCCESS 1 in 3.33s] 268 | 16:55:47 6 of 45 OK created sql dynamic_table model dl_bronze.brokerage_watch_history ... [SUCCESS 1 in 10.86s] 269 | 16:55:47 29 of 45 OK created sql dynamic_table model dl_gold.dim_broker ................. [SUCCESS 1 in 2.98s] 270 | 16:55:47 27 of 45 OK created sql dynamic_table model dl_silver.securities ............... [SUCCESS 1 in 4.76s] 271 | 16:55:47 31 of 45 START sql dynamic_table model dl_gold.dim_security .................... [RUN] 272 | 16:55:47 32 of 45 START sql dynamic_table model dl_silver.watches_history ............... [RUN] 273 | 16:55:48 5 of 45 OK created sql dynamic_table model dl_bronze.brokerage_trade_history ... [SUCCESS 1 in 11.82s] 274 | 16:55:48 33 of 45 START sql dynamic_table model dl_silver.trades_history ................ [RUN] 275 | 16:55:48 28 of 45 OK created sql dynamic_table model dl_gold.dim_customer ............... [SUCCESS 1 in 4.58s] 276 | 16:55:48 34 of 45 START sql dynamic_table model dl_gold.dim_account ..................... [RUN] 277 | 16:55:49 24 of 45 OK created sql dynamic_table model dl_silver.cash_transactions ........ [SUCCESS 1 in 5.91s] 278 | 16:55:49 30 of 45 OK created sql dynamic_table model dl_silver.daily_market ............. [SUCCESS 1 in 4.63s] 279 | 16:55:50 26 of 45 OK created sql dynamic_table model dl_silver.financials ............... [SUCCESS 1 in 7.20s] 280 | 16:55:51 31 of 45 OK created sql dynamic_table model dl_gold.dim_security ............... 
[SUCCESS 1 in 3.81s] 281 | 16:55:51 35 of 45 START sql dynamic_table model dl_gold.fact_market_history ............. [RUN] 282 | 16:55:52 34 of 45 OK created sql dynamic_table model dl_gold.dim_account ................ [SUCCESS 1 in 3.71s] 283 | 16:55:52 36 of 45 START sql dynamic_table model dl_gold.fact_cash_transactions .......... [RUN] 284 | 16:55:54 32 of 45 OK created sql dynamic_table model dl_silver.watches_history .......... [SUCCESS 1 in 6.08s] 285 | 16:55:54 37 of 45 START sql dynamic_table model dl_silver.watches ....................... [RUN] 286 | 16:55:58 36 of 45 OK created sql dynamic_table model dl_gold.fact_cash_transactions ..... [SUCCESS 1 in 5.65s] 287 | 16:55:58 38 of 45 START sql dynamic_table model dl_gold.fact_cash_balances .............. [RUN] 288 | 16:56:00 37 of 45 OK created sql dynamic_table model dl_silver.watches .................. [SUCCESS 1 in 6.13s] 289 | 16:56:00 39 of 45 START sql dynamic_table model dl_gold.fact_watches .................... [RUN] 290 | 16:56:00 33 of 45 OK created sql dynamic_table model dl_silver.trades_history ........... [SUCCESS 1 in 12.60s] 291 | 16:56:00 40 of 45 START sql dynamic_table model dl_gold.dim_trade ....................... [RUN] 292 | 16:56:00 41 of 45 START sql dynamic_table model dl_silver.trades ........................ [RUN] 293 | 16:56:03 38 of 45 OK created sql dynamic_table model dl_gold.fact_cash_balances ......... [SUCCESS 1 in 5.45s] 294 | 16:56:05 40 of 45 OK created sql dynamic_table model dl_gold.dim_trade .................. [SUCCESS 1 in 4.51s] 295 | 16:56:06 39 of 45 OK created sql dynamic_table model dl_gold.fact_watches ............... [SUCCESS 1 in 6.40s] 296 | 16:56:08 41 of 45 OK created sql dynamic_table model dl_silver.trades ................... [SUCCESS 1 in 7.40s] 297 | 16:56:08 42 of 45 START sql dynamic_table model dl_silver.holdings_history .............. [RUN] 298 | 16:56:08 43 of 45 START sql dynamic_table model dl_gold.fact_trade ...................... [RUN] 299 | 16:56:15 42 of 45 OK created sql dynamic_table model dl_silver.holdings_history ......... [SUCCESS 1 in 7.03s] 300 | 16:56:15 44 of 45 START sql dynamic_table model dl_gold.fact_holdings ................... [RUN] 301 | 16:56:22 43 of 45 OK created sql dynamic_table model dl_gold.fact_trade ................. [SUCCESS 1 in 14.53s] 302 | 16:56:22 45 of 45 START test fact_trade__unique_trade ................................... [RUN] 303 | 16:56:23 45 of 45 PASS fact_trade__unique_trade ......................................... [PASS in 1.30s] 304 | 16:56:25 44 of 45 OK created sql dynamic_table model dl_gold.fact_holdings .............. [SUCCESS 1 in 9.95s] 305 | 16:56:44 35 of 45 OK created sql dynamic_table model dl_gold.fact_market_history ........ [SUCCESS 1 in 53.21s] 306 | 16:56:44 307 | 16:56:44 Finished running 44 dynamic_table models, 1 test in 0 hours 1 minutes and 10.51 seconds (70.51s). 308 | 16:56:45 309 | 16:56:45 Completed successfully 310 | 16:56:45 311 | 16:56:45 Done. PASS=45 WARN=0 ERROR=0 SKIP=0 TOTAL=45 312 | ``` 313 | 314 | Once the DAG has been created in Snowflake, we can browse it in the UI: 315 | 316 | ![Snowflake Graph](images/snowflake-dag.png) 317 | 318 | It's important to note that Snowflake is only aware of all the dependent relations _after_ the tables have been created. 319 | dbt is aware of them before. 320 | 321 | # Future Enhancements 322 | Although it wasn't my goal, it would be cool to enhance this project so that it could be used to run and measure the benchmark. 
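In the meantime, a crude stopwatch around the two phases gives a rough sense of end-to-end timing. This is a sketch only -- nothing like the audited TPC-DI metric -- and it assumes the loader and dbt project are already configured:

```python
import subprocess
import time
from pathlib import Path

def timed(cmd):
    # run a command and return its wall-clock duration in seconds
    start = time.monotonic()
    subprocess.run(cmd, check=True)
    return time.monotonic() - start

output_dir = str(Path.home() / "dev" / "tpcdi-output")
load_secs = timed(["python", "tpcdi.py", "process-files",
                   "--output-directory", output_dir])
build_secs = timed(["dbt", "build"])
print(f"load: {load_secs:.0f}s, transform: {build_secs:.0f}s")
```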
323 | These are my thoughts on where to take this next:
324 | 
325 | 1. Complete `Batch2` and `Batch3` using dbt incremental models, and put the audit queries in as dbt tests.
326 | 2. Refactor tpcdi.py to only upload the files and do that concurrently, then put all the Snowpark transformations into procedures so they can be executed as concurrent tasks.
327 | 3. Maybe take another pass at credential handling, using the `config.toml` from [Snowflake CLI](https://github.com/snowflake-labs/snowcli#cli-tour-and-quickstart).
328 | 4. Provide a command-line option `--schema` so it can be specified during loading, instead of using `CURRENT_SCHEMA`.
329 | 
330 | If you are interested in contributing, jump on board. You don't need my permission, or even incredible skill, clearly.
331 | Just open a pull request.
332 | 
--------------------------------------------------------------------------------
/tpcdi.py:
--------------------------------------------------------------------------------
1 | import sys, typer, json, re, logging
2 | from snowflake.snowpark import Session, DataFrame
3 | from snowflake.snowpark.types import *
4 | from snowflake.snowpark.functions import *
5 | from typing_extensions import Annotated
6 | from pathlib import Path
7 | 
8 | app = typer.Typer(help="A utility for loading TPC-DI generated files into Snowflake.")
9 | 
10 | def get_session():
11 |     # Read the credentials.json file
12 |     with open("credentials.json") as jsonfile:
13 |         credentials_dict = json.load(jsonfile)
14 | 
15 |     # build the session
16 |     session = Session.builder.configs(credentials_dict).create()
17 |     #print(f"Session: {session}")
18 | 
19 |     # and return it
20 |     return session
21 | 
22 | @app.command(help="CREATE or REPLACE the stage. Mostly useful while developing this utility.")
23 | def recreate_stage(
24 |     stage: Annotated[str, typer.Option(help="Name of the stage to recreate.")] = 'tpcdi',
25 | ):
26 |     session = get_session()
27 |     session.sql(f"create or replace stage {stage} directory = (enable = true)").collect()
28 |     print(f"Stage {stage} recreated.")
29 | 
30 | @app.command(help="DROP the stage. Useful when all the data has been successfully loaded.")
31 | def drop_stage(
32 |     stage: Annotated[str, typer.Option(help="Name of the stage to drop.")] = 'tpcdi',
33 | ):
34 |     session = get_session()
35 |     session.sql(f"drop stage {stage}").collect()
36 |     print(f"Stage {stage} dropped.")
37 | 
38 | @app.command(help="DROP a schema. Useful for cleaning up after a demo.")
39 | def drop_schema(
40 |     schema: Annotated[str, typer.Option(help="Name of the schema to drop.")],
41 | ):
42 |     session = get_session()
43 |     session.sql(f"drop schema if exists {schema}").collect()
44 |     print(f"Schema {schema} dropped.")
45 | 
46 | @app.command(help="Upload a file or files into the stage and build the dependent tables.")
47 | def process_files(
48 |     output_directory: Annotated[str, typer.Option(help='The output directory from the TPC-DI DIGen.jar execution.')],
49 |     file_name: Annotated[str, typer.Option(help="The TPC-DI file name to upload and process. Pass value 'FINWIRE' to process all of the financial wire files.")] = 'all',
50 |     stage: Annotated[str, typer.Option(help="The stage name to upload to, without specifying '@'.")] = 'tpcdi',
51 |     batch: Annotated[int, typer.Option(help="The TPC-DI batch number to process. 
Currently only supports the default of '1'.")] = 1, 52 | overwrite: Annotated[bool, typer.Option(help="Overwrite the file even if it exists?")] = False, 53 | skip_upload: Annotated[bool, typer.Option(help="Skip uploading the files?")] = False, 54 | show: Annotated[bool, typer.Option(help="Show the DataFrame instead of saving it as a table? This was useful during development.")] = False, 55 | ): 56 | session = get_session() 57 | 58 | # create the stage if it doesn't exist 59 | ( 60 | session 61 | .sql(f"create stage if not exists {stage} directory = (enable = true)") 62 | .collect() 63 | ) 64 | 65 | def get_stage_path( 66 | stage:str, 67 | file_name:str, 68 | ): 69 | return f"@{stage}/Batch{batch}/{file_name}" 70 | 71 | # method to control printing the dataframe or saving it 72 | def save_df( 73 | df: DataFrame, 74 | table_name: str 75 | ): 76 | if show: 77 | df.show() 78 | else: 79 | ( 80 | df.write 81 | .mode("overwrite") 82 | .save_as_table(table_name) 83 | ) 84 | 85 | print(f"{table_name.upper()} table created.") 86 | 87 | # method for uploading files 88 | def upload_files( 89 | file_name: str, 90 | stage_path: str, 91 | ): 92 | 93 | delimiter="|" 94 | 95 | if file_name == 'FINWIRE': 96 | pathlist = Path(output_directory).glob(f"Batch{batch}/FINWIRE??????") 97 | else: 98 | pathlist = Path(output_directory).glob(f"Batch{batch}/{file_name}") 99 | 100 | for file in pathlist: 101 | # capture the delimiter 102 | if file_name != 'FINWIRE': 103 | logging.info(f"Suffix: {file.suffix}") 104 | if file.suffix == '.csv': 105 | delimiter=',' 106 | logging.info(f"Delimiter: {delimiter}") 107 | 108 | # put the file(s) in the stage 109 | if not skip_upload: 110 | put_result = ( 111 | session 112 | .file 113 | .put( 114 | str(file), 115 | stage_path, 116 | overwrite=overwrite, 117 | parallel=4, 118 | auto_compress=True 119 | ) 120 | ) 121 | for result in put_result: 122 | print(f"File {result.source}: {result.status}") 123 | 124 | # return the delimiter 125 | return delimiter 126 | 127 | # method for creating a table from a simple CSV 128 | # works for either comma or pipe delimited, detecting that automatically by suffix 129 | def load_csv( 130 | schema: StructType, 131 | file_name: str, 132 | table_name: str, 133 | ): 134 | stage_path = get_stage_path(stage, file_name) 135 | delimiter=upload_files(file_name, stage_path) 136 | 137 | df = ( 138 | session 139 | .read 140 | .schema(schema) 141 | .option("field_delimiter", delimiter) 142 | .csv(stage_path) 143 | ) 144 | 145 | save_df(df, table_name) 146 | 147 | # Simplifies the DataFrame transformations for retrieving XML elements 148 | def get_xml_element( 149 | column:str, 150 | element:str, 151 | datatype:str, 152 | with_alias:bool = True 153 | ): 154 | new_element = get(xmlget(col(column), lit(element)), lit('$')).cast(datatype) 155 | # alias needs to be optional 156 | return ( 157 | new_element.alias(element) if with_alias else new_element 158 | ) 159 | 160 | # Simplifies the DataFrame transformations for retrieving XML attributes 161 | def get_xml_attribute( 162 | column:str, 163 | attribute: str, 164 | datatype: str, 165 | with_alias:bool = True 166 | ): 167 | new_attribute = get(col(column), lit(f"@{attribute}")).cast(datatype) 168 | # alias needs to be optional 169 | return ( 170 | new_attribute.alias(attribute) if with_alias else new_attribute 171 | ) 172 | 173 | # Simplifies the logic for constructing a phone number from multiple nested fields 174 | def get_phone_number( 175 | phone_id:str, 176 | separator:str = '-' 177 | ): 178 | return concat( 
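        # assembles country code, area code, and local number: each separator is
        # emitted only when the field before it is non-empty, and " ext: " only
        # precedes a non-empty extension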
179 | get_xml_element(f"phone{phone_id}", 'C_CTRY_CODE', 'STRING', False), 180 | when(get_xml_element(f"phone{phone_id}", 'C_CTRY_CODE', 'STRING', False) == '', '').otherwise(separator), 181 | get_xml_element(f"phone{phone_id}", 'C_AREA_CODE', 'STRING', False), 182 | when(get_xml_element(f"phone{phone_id}", 'C_AREA_CODE', 'STRING', False) == '', '').otherwise(separator), 183 | get_xml_element(f"phone{phone_id}", 'C_LOCAL', 'STRING', False), 184 | when(get_xml_element(f"phone{phone_id}", 'C_EXT', 'STRING', False) == '', '').otherwise(" ext: "), 185 | get_xml_element(f"phone{phone_id}", 'C_EXT', 'STRING', False) 186 | ).alias(f"c_phone_{phone_id}") 187 | 188 | 189 | ### Start defining and loading the actual tables 190 | ### Variable 'con_file_name' declaration acts as the comment. 191 | 192 | con_file_name = 'Date.txt' 193 | if file_name in ['all', con_file_name]: 194 | schema = StructType([ 195 | StructField("SK_DATE_ID", IntegerType(), False), 196 | StructField("DATE_VALUE", DateType(), False), 197 | StructField("DATE_DESC", StringType(), False), 198 | StructField("CALENDAR_YEAR_ID", IntegerType(), False), 199 | StructField("CALENDAR_YEAR_DESC", StringType(), False), 200 | StructField("CALENDAR_QTR_ID", IntegerType(), False), 201 | StructField("CALENDAR_QTR_DESC", StringType(), False), 202 | StructField("CALENDAR_MONTH_ID", IntegerType(), False), 203 | StructField("CALENDAR_MONTH_DESC", StringType(), False), 204 | StructField("CALENDAR_WEEK_ID", IntegerType(), False), 205 | StructField("CALENDAR_WEEK_DESC", StringType(), False), 206 | StructField("DAY_OF_WEEK_NUM", IntegerType(), False), 207 | StructField("DAY_OF_WEEK_DESC", StringType(), False), 208 | StructField("FISCAL_YEAR_ID", IntegerType(), False), 209 | StructField("FISCAL_YEAR_DESC", StringType(), False), 210 | StructField("FISCAL_QTR_ID", IntegerType(), False), 211 | StructField("FISCAL_QTR_DESC", StringType(), False), 212 | StructField("HOLIDAY_FLAG", BooleanType(), False), 213 | ]) 214 | load_csv(schema, con_file_name, 'date') 215 | 216 | con_file_name = 'DailyMarket.txt' 217 | if file_name in ['all', con_file_name]: 218 | schema = StructType([ 219 | StructField("DM_DATE", DateType(), False), 220 | StructField("DM_S_SYMB", StringType(), False), 221 | StructField("DM_CLOSE", FloatType(), False), 222 | StructField("DM_HIGH", FloatType(), False), 223 | StructField("DM_LOW", FloatType(), False), 224 | StructField("DM_VOL", FloatType(), False), 225 | ]) 226 | load_csv(schema, con_file_name, 'daily_market') 227 | 228 | con_file_name = 'Industry.txt' 229 | if file_name in ['all', con_file_name]: 230 | schema = StructType([ 231 | StructField("IN_ID", StringType(), False), 232 | StructField("IN_NAME", StringType(), False), 233 | StructField("IN_SC_ID", StringType(), False), 234 | ]) 235 | load_csv(schema, con_file_name, 'industry') 236 | 237 | con_file_name = 'Prospect.csv' 238 | if file_name in ['all', con_file_name]: 239 | schema = StructType([ 240 | StructField("AGENCY_ID", StringType(), False), 241 | StructField("LAST_NAME", StringType(), True), 242 | StructField("FIRST_NAME", StringType(), True), 243 | StructField("MIDDLE_INITIAL", StringType(), True), 244 | StructField("GENDER", StringType(), True), 245 | StructField("ADDRESS_LINE1", StringType(), True), 246 | StructField("ADDRESS_LINE2", StringType(), True), 247 | StructField("POSTAL_CODE", StringType(), True), 248 | StructField("CITY", StringType(), True), 249 | StructField("STATE", StringType(), True), 250 | StructField("COUNTRY", StringType(), True), 251 | StructField("PHONE", 
StringType(), True), 252 | StructField("INCOME", IntegerType(), True), 253 | StructField("NUMBER_CARS", IntegerType(), True), 254 | StructField("NUMBER_CHILDREN", IntegerType(), True), 255 | StructField("MARITAL_STATUS", StringType(), True), 256 | StructField("AGE", IntegerType(), True), 257 | StructField("CREDIT_RATING", IntegerType(), True), 258 | StructField("OWN_OR_RENT_FLAG", StringType(), True), 259 | StructField("EMPLOYER", StringType(), True), 260 | StructField("NUMBER_CREDIT_CARDS", IntegerType(), True), 261 | StructField("NET_WORTH", IntegerType(), True), 262 | ]) 263 | load_csv(schema, con_file_name, 'prospect') 264 | 265 | con_file_name = 'CustomerMgmt.xml' 266 | if file_name in ['all', con_file_name]: 267 | 268 | upload_files(con_file_name, get_stage_path(stage, con_file_name)) 269 | 270 | # this might get hairy 271 | df = ( 272 | session 273 | .read 274 | .option('STRIP_OUTER_ELEMENT', True) # Strips the TPCDI:Actions element 275 | .xml(get_stage_path(stage, con_file_name)) 276 | .select( 277 | # flatten out all of the nested elements 278 | xmlget(col('$1'), lit('Customer'), 0).alias('customer'), 279 | xmlget(col('customer'), lit('Name'), 0).alias('name'), 280 | xmlget(col('customer'), lit('Address'), 0).alias('address'), 281 | xmlget(col('customer'), lit('ContactInfo'), 0).alias('contact_info'), 282 | xmlget(col('contact_info'), lit('C_PHONE_1')).alias('phone1'), 283 | xmlget(col('contact_info'), lit('C_PHONE_2')).alias('phone2'), 284 | xmlget(col('contact_info'), lit('C_PHONE_3')).alias('phone3'), 285 | xmlget(col('customer'), lit('TaxInfo'), 0).alias('tax_info'), 286 | xmlget(col('customer'), lit('Account'), 0).alias('account'), 287 | # get the Action attributes 288 | get_xml_attribute('$1','ActionType','STRING'), 289 | get_xml_attribute('$1','ActionTS','STRING'), 290 | ) 291 | .select( 292 | # Handling Action attributes 293 | to_timestamp(col('ActionTs'), lit('yyyy-mm-ddThh:mi:ss')).alias('action_ts'), 294 | col('ActionType').alias('ACTION_TYPE'), 295 | # Get Customer Attributes 296 | get_xml_attribute('customer','C_ID','NUMBER'), 297 | get_xml_attribute('customer','C_TAX_ID','STRING'), 298 | get_xml_attribute('customer','C_GNDR','STRING'), 299 | try_cast(get_xml_attribute('customer','C_TIER','STRING', False),'NUMBER').alias('c_tier'), 300 | get_xml_attribute('customer','C_DOB','DATE'), 301 | # Get Name elements 302 | get_xml_element('name','C_L_NAME','STRING'), 303 | get_xml_element('name','C_F_NAME','STRING'), 304 | get_xml_element('name','C_M_NAME','STRING'), 305 | # Get Address elements 306 | get_xml_element('address','C_ADLINE1','STRING'), 307 | get_xml_element('address', 'C_ADLINE2', 'STRING'), 308 | get_xml_element('address','C_ZIPCODE','STRING'), 309 | get_xml_element('address','C_CITY','STRING'), 310 | get_xml_element('address','C_STATE_PROV','STRING'), 311 | get_xml_element('address','C_CTRY','STRING'), 312 | # Get Contact Info elements 313 | get_xml_element('contact_info','C_PRIM_EMAIL','STRING'), 314 | get_xml_element('contact_info','C_ALT_EMAIL','STRING'), 315 | # Contruct phone numbers from multi-nested elements 316 | get_phone_number('1'), 317 | get_phone_number('2'), 318 | get_phone_number('3'), 319 | # Get TaxInfo elements 320 | get_xml_element('tax_info','C_LCL_TX_ID','STRING'), 321 | get_xml_element('tax_info','C_NAT_TX_ID','STRING'), 322 | # Get Account Attributes 323 | get_xml_attribute('account','CA_ID','STRING'), 324 | get_xml_attribute('account','CA_TAX_ST','NUMBER'), 325 | # Get Account elements 326 | get_xml_element('account','CA_B_ID','NUMBER'), 
327 | get_xml_element('account','CA_NAME','STRING'), 328 | ) 329 | ) 330 | 331 | save_df(df, 'customer_mgmt') 332 | 333 | con_file_name = 'TaxRate.txt' 334 | if file_name in ['all', con_file_name]: 335 | schema = StructType([ 336 | StructField("TX_ID", StringType(), False), 337 | StructField("TX_NAME", StringType(), True), 338 | StructField("TX_RATE", FloatType(), True), 339 | ]) 340 | load_csv(schema, con_file_name, 'tax_rate') 341 | 342 | con_file_name = 'HR.csv' 343 | if file_name in ['all', con_file_name]: 344 | schema = StructType([ 345 | StructField("EMPLOYEE_ID", IntegerType(), False), 346 | StructField("MANAGER_ID", IntegerType(), False), 347 | StructField("EMPLOYEE_FIRST_NAME", StringType(), True), 348 | StructField("EMPLOYEE_LAST_NAME", StringType(), True), 349 | StructField("EMPLOYEE_MI", StringType(), True), 350 | StructField("EMPLOYEE_JOB_CODE", IntegerType(), True), 351 | StructField("EMPLOYEE_BRANCH", StringType(), True), 352 | StructField("EMPLOYEE_OFFICE", StringType(), True), 353 | StructField("EMPLOYEE_PHONE", StringType(), True) 354 | ]) 355 | load_csv(schema, con_file_name, 'hr') 356 | 357 | con_file_name = 'WatchHistory.txt' 358 | if file_name in ['all', con_file_name]: 359 | schema = StructType([ 360 | StructField("W_C_ID", IntegerType(), False), 361 | StructField("W_S_SYMB", StringType(), True), 362 | StructField("W_DTS", TimestampType(), True), 363 | StructField("W_ACTION", StringType(), True) 364 | ]) 365 | load_csv(schema, con_file_name, 'watch_history') 366 | 367 | con_file_name = 'Trade.txt' 368 | if file_name in ['all', con_file_name]: 369 | schema = StructType([ 370 | StructField("T_ID", IntegerType(), False), 371 | StructField("T_DTS", TimestampType(), False), 372 | StructField("T_ST_ID", StringType(), False), 373 | StructField("T_TT_ID", StringType(), False), 374 | StructField("T_IS_CASH", BooleanType(), False), 375 | StructField("T_S_SYMB", StringType(), False), 376 | StructField("T_QTY", FloatType(),False), 377 | StructField("T_BID_PRICE", FloatType(), False), 378 | StructField("T_CA_ID", IntegerType(), False), 379 | StructField("T_EXEC_NAME", StringType(), False), 380 | StructField("T_TRADE_PRICE", FloatType(), True), 381 | StructField("T_CHRG", FloatType(), True), 382 | StructField("T_COMM", FloatType(), True), 383 | StructField("T_TAX", FloatType(), True), 384 | ]) 385 | load_csv(schema, con_file_name, 'trade') 386 | 387 | con_file_name = 'TradeHistory.txt' 388 | if file_name in ['all', con_file_name]: 389 | schema = StructType([ 390 | StructField("TH_T_ID", IntegerType(), False), 391 | StructField("TH_DTS", TimestampType(), False), 392 | StructField("TH_ST_ID", StringType(), False), 393 | ]) 394 | load_csv(schema, con_file_name, 'trade_history') 395 | 396 | con_file_name = 'StatusType.txt' 397 | if file_name in ['all', con_file_name]: 398 | schema = StructType([ 399 | StructField("ST_ID", StringType(), False), 400 | StructField("ST_NAME", StringType(), False), 401 | ]) 402 | load_csv(schema, con_file_name, 'status_type') 403 | 404 | con_file_name = 'TradeType.txt' 405 | if file_name in ['all', con_file_name]: 406 | schema = StructType([ 407 | StructField("TT_ID", StringType(), False), 408 | StructField("TT_NAME", StringType(), False), 409 | StructField("TT_IS_SELL", BooleanType(), False), 410 | StructField("TT_IS_MARKET", BooleanType(), False), 411 | ]) 412 | load_csv(schema, con_file_name, 'trade_type') 413 | 414 | con_file_name = 'HoldingHistory.txt' 415 | if file_name in ['all', con_file_name]: 416 | schema = StructType([ 417 | 
StructField("HH_H_T_ID", IntegerType(), False), 418 | StructField("HH_T_ID", IntegerType(), False), 419 | StructField("HH_BEFORE_QTY", FloatType(), False), 420 | StructField("HH_AFTER_QTY", FloatType(), False), 421 | ]) 422 | load_csv(schema, con_file_name, 'holding_history') 423 | 424 | con_file_name = 'CashTransaction.txt' 425 | if file_name in ['all', con_file_name]: 426 | schema = StructType([ 427 | StructField("CT_CA_ID", IntegerType(), False), 428 | StructField("CT_DTS", TimestampType(), False), 429 | StructField("CT_AMT", FloatType(), False), 430 | StructField("CT_NAME", StringType(), False), 431 | ]) 432 | load_csv(schema, con_file_name, 'cash_transaction') 433 | 434 | con_file_name = 'FINWIRE' 435 | if file_name in ['all', con_file_name]: 436 | # These are fixed-width fields, so read the entire line in as "line" 437 | schema = StructType([ 438 | StructField("line", StringType(), False), 439 | ]) 440 | 441 | stage_path = get_stage_path(stage, con_file_name) 442 | upload_files(con_file_name, stage_path) 443 | 444 | # generic dataframe for all record types 445 | # create a temporary table 446 | df = ( 447 | session 448 | .read 449 | .schema(schema) 450 | .option('field_delimiter', '|') 451 | .csv(stage_path) 452 | .with_column('rec_type', substring(col("line"), lit(16), lit(3))) 453 | .with_column('pts', to_timestamp(substring(col("line"), lit(0), lit(15)), lit("yyyymmdd-hhmiss"))) 454 | .write.mode("overwrite").save_as_table("finwire", table_type="temporary") 455 | ) 456 | 457 | # CMP record types 458 | df = ( 459 | session 460 | .table('finwire') 461 | .where(col('rec_type') == 'CMP') 462 | .with_column('company_name', substr(col('line'), lit(19), lit(60))) 463 | .withColumn('cik', substring(col("line"), lit(79), lit(10))) 464 | .withColumn('status', substring(col("line"), lit(89), lit(4))) 465 | .withColumn('industry_id', substring(col("line"), lit(93), lit(2))) 466 | .withColumn('sp_rating', substring(col("line"), lit(95), lit(4))) 467 | .withColumn( 468 | 'founding_date', 469 | try_cast( 470 | trim( 471 | substring( 472 | col("line"), lit(99), lit(8) 473 | ) 474 | ), 475 | 'DATE' 476 | ) 477 | ) 478 | .withColumn('address_line1', substring(col("line"), lit(107), lit(80))) 479 | .withColumn('address_line2', substring(col("line"), lit(187), lit(80))) 480 | .withColumn('postal_code', substring(col("line"), lit(267), lit(12))) 481 | .withColumn('city', substring(col("line"), lit(279), lit(25))) 482 | .withColumn('state_province', substring(col("line"), lit(304), lit(20))) 483 | .withColumn('country', substring(col("line"), lit(324), lit(24))) 484 | .withColumn('ceo_name', substring(col("line"), lit(348), lit(46))) 485 | .withColumn('description', substring(col("line"), lit(394), lit(150))) 486 | .drop(col("line"), col("rec_type")) 487 | ) 488 | 489 | save_df(df, 'cmp') 490 | 491 | # SEC record types 492 | df = ( 493 | session 494 | .table('finwire') 495 | .where(col('rec_type') == 'SEC') 496 | .withColumn('symbol', substring(col("line"), lit(19), lit(15))) 497 | .withColumn('issue_type', substring(col("line"), lit(34), lit(6))) 498 | .withColumn('status', substring(col("line"), lit(40), lit(4))) 499 | .withColumn('name', substring(col("line"), lit(44), lit(70))) 500 | .withColumn('ex_id', substring(col("line"), lit(114), lit(6))) 501 | .withColumn('sh_out', substring(col("line"), lit(120), lit(13))) 502 | .withColumn('first_trade_date', substring(col("line"), lit(133), lit(8))) 503 | .withColumn('first_exchange_date', substring(col("line"), lit(141), lit(8))) 504 | 
            .withColumn('dividend', substring(col("line"), lit(149), lit(12)))
505 |             .withColumn('co_name_or_cik', substring(col("line"), lit(161), lit(60)))
506 |             .drop(col("line"), col("rec_type"))
507 |         )
508 | 
509 |         save_df(df, 'sec')
510 | 
511 |         # FIN record types
512 |         df = (
513 |             session
514 |             .table('finwire')
515 |             .where(col('rec_type') == 'FIN')
516 |             .withColumn('year', substring(col("line"), lit(19), lit(4)))
517 |             .withColumn('quarter', substring(col("line"), lit(23), lit(1)))
518 |             .withColumn('quarter_start_date', substring(col("line"), lit(24), lit(8)))
519 |             .withColumn('posting_date', substring(col("line"), lit(32), lit(8)))
520 |             .withColumn('revenue', substring(col("line"), lit(40), lit(17)))
521 |             .withColumn('earnings', substring(col("line"), lit(57), lit(17)))
522 |             .withColumn('eps', substring(col("line"), lit(74), lit(12)))
523 |             .withColumn('diluted_eps', substring(col("line"), lit(86), lit(12)))
524 |             .withColumn('margin', substring(col("line"), lit(98), lit(12)))
525 |             .withColumn('inventory', substring(col("line"), lit(110), lit(17)))
526 |             .withColumn('assets', substring(col("line"), lit(127), lit(17)))
527 |             .withColumn('liabilities', substring(col("line"), lit(144), lit(17)))
528 |             .withColumn('sh_out', substring(col("line"), lit(161), lit(13)))
529 |             .withColumn('diluted_sh_out', substring(col("line"), lit(174), lit(13)))
530 |             .withColumn('co_name_or_cik', substring(col("line"), lit(187), lit(60)))
531 |             .drop(col("line"), col("rec_type"))
532 |         )
533 | 
534 | 
535 |         save_df(df, 'fin')
536 | 
537 | if __name__ == "__main__":
538 |     app()
539 | 
--------------------------------------------------------------------------------
/demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "id": "0c989f41",
6 |    "metadata": {
7 |     "slideshow": {
8 |      "slide_type": "slide"
9 |     }
10 |    },
11 |    "source": [
12 |     "![Title](images/title-page.png)"
13 |    ]
14 |   },
15 |   {
16 |    "cell_type": "markdown",
17 |    "id": "f90ad24f-5035-4c5d-81e4-a81d9dff1e9a",
18 |    "metadata": {
19 |     "slideshow": {
20 |      "slide_type": "slide"
21 |     }
22 |    },
23 |    "source": [
24 |     "![Title](images/title-qr.png)"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "markdown",
29 |    "id": "ed161285-5c6c-41de-a4ad-685a5de5ba0e",
30 |    "metadata": {},
31 |    "source": [
32 |     "### What is the TPC?\n",
33 |     "The TPC is a non-profit corporation focused on developing data-centric benchmark standards and disseminating objective, verifiable data to the industry.\n",
34 |     "\n",
35 |     "[tpc.org](https://www.tpc.org)\n",
36 |     "\n",
37 |     "### What is TPC-DI?\n",
38 |     "The TPC-DI benchmark combines and transforms data extracted from an On-Line Transaction Processing (OLTP) system along with other sources of data, and loads it into a data warehouse.\n",
39 |     "\n",
40 |     "[tpc.org/tpcdi](https://www.tpc.org/tpcdi/)"
41 |    ]
42 |   },
43 |   {
44 |    "cell_type": "markdown",
45 |    "id": "87a9e518-484a-4f18-80d4-4ae43c72312a",
46 |    "metadata": {},
47 |    "source": [
48 |     "![ETL Diagram](images/tpc-di-etl-diagram.png)"
49 |    ]
50 |   },
51 |   {
52 |    "cell_type": "markdown",
53 |    "metadata": {},
54 |    "source": [
55 |     "TPC-DI provides `DIGen.jar` to generate the source files.\n",
56 |     "\n",
57 |     "The JAR is dated and requires a 1.8 JDK."
58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "!jenv local 1.8\n", 67 | "!java -jar ~/dev/Tools/DIGen.jar --help" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!rm -rf ~/dev/tpcdi-output\n", 77 | "!mkdir -p ~/dev/tpcdi-output\n", 78 | "!cd ~/dev/Tools && java -jar ~/dev/Tools/DIGen.jar -o ~/dev/tpcdi-output -sf 10" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### The GitHub repository has a prebuilt CLI for easily loading the files.\n", 86 | "[github.com/stewartbryson/dbt-tpcdi](https://www.github.com/stewartbryson/dbt-tpcdi)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "!python tpcdi.py --help" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "!python tpcdi.py process-files --help" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "!python tpcdi.py process-files --output-directory ~/dev/tpcdi-output --file-name DailyMarket.txt --show" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "### If you get nothing else from this video, know that there's an easy way to load this dataset into Snowflake.\n", 121 | "\n", 122 | "But I also wanted to show some interesting approaches using Snowpark.\n", 123 | "\n", 124 | "All of the code samples below are snippets from the CLI with abstractions removed.\n", 125 | "\n", 126 | "We start with a `credentials.json` file to store our Snowflake credentials. Something like this:\n", 127 | "\n", 128 | "```json\n", 129 | "{\n", 130 | " \"account\": \"myaccount\",\n", 131 | " \"user\": \"myuser\",\n", 132 | " \"password\": \"mypassword\",\n", 133 | " \"role\": \"myrole\",\n", 134 | " \"warehouse\": \"stewart_dev\",\n", 135 | " \"database\": \"tpcdi\",\n", 136 | " \"schema\": \"digen\"\n", 137 | "}\n", 138 | "```" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "We create a Snowflake session." 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "import json\n", 155 | "from snowflake.snowpark.types import *\n", 156 | "from snowflake.snowpark.functions import *\n", 157 | "from snowflake.snowpark import Session\n", 158 | "\n", 159 | "# Read the credentials.json file\n", 160 | "with open(\"credentials.json\") as jsonfile:\n", 161 | " credentials_dict = json.load(jsonfile)\n", 162 | "\n", 163 | "# build the session\n", 164 | "session = (\n", 165 | " Session\n", 166 | " .builder\n", 167 | " .configs(credentials_dict)\n", 168 | " .create()\n", 169 | ")\n", 170 | "\n", 171 | "# Constants\n", 172 | "source_path = '/Users/stewartbryson/dev/tpcdi-output/Batch1'" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Most of the files generated by `DIGen.jar` are pipe-separated files, very similar to CSV files.\n", 180 | "\n", 181 | "These are very simple to handle. 
First let's upload the file to a stage:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# Constants\n", 191 | "stage_path = \"@tpcdi/Batch1\"\n", 192 | "\n", 193 | "# Put the file\n", 194 | "put_result = (\n", 195 | " session\n", 196 | " .file\n", 197 | " .put(\n", 198 | " f\"{source_path}/DailyMarket.txt\",\n", 199 | " f\"{stage_path}/DailyMarket.txt\",\n", 200 | " parallel=4,\n", 201 | " auto_compress=True,\n", 202 | " )\n", 203 | ")\n", 204 | "\n", 205 | "# Report back file results\n", 206 | "for result in put_result:\n", 207 | " print(f\"File {result.source}: {result.status}\")" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "And now we'll create a table from that file:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "table_name = 'daily_market'\n", 224 | "# Define the schema\n", 225 | "schema = StructType([\n", 226 | " StructField(\"DM_DATE\", DateType(), False),\n", 227 | " StructField(\"DM_S_SYMB\", StringType(), False),\n", 228 | " StructField(\"DM_CLOSE\", FloatType(), False),\n", 229 | " StructField(\"DM_HIGH\", FloatType(), False),\n", 230 | " StructField(\"DM_LOW\", FloatType(), False),\n", 231 | " StructField(\"DM_VOL\", FloatType(), False),\n", 232 | " ])\n", 233 | "\n", 234 | "# create a table from a DataFrame\n", 235 | "df = (\n", 236 | " session\n", 237 | " .read\n", 238 | " .schema(schema)\n", 239 | " .option(\"field_delimiter\", '|')\n", 240 | " .csv(f\"{stage_path}/DailyMarket.txt\")\n", 241 | " .write\n", 242 | " .mode(\"overwrite\")\n", 243 | " .save_as_table(table_name)\n", 244 | ")\n", 245 | "\n", 246 | "print(f\"{table_name.upper()} table created.\")" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# show the table\n", 256 | "df = (\n", 257 | " session \n", 258 | " .table(table_name) \n", 259 | " .show()\n", 260 | ")" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "The `DIGen.jar` utility generates a series of \"finwire\" files.\n", 268 | "\n", 269 | "These files represent market history over time.\n", 270 | "\n", 271 | "They are fixed-width, multi-format files." 
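    "\n",
    "Every record shares an 18-character prefix: a `yyyymmdd-hhmiss` timestamp (`PTS`) followed by a 3-character record type. A minimal sketch of that slicing in plain Python, using a hypothetical record:\n",
    "\n",
    "```python\n",
    "line = '19670103-123456CMP...'              # hypothetical FINWIRE record\n",
    "pts, rec_type = line[0:15], line[15:18]     # '19670103-123456', 'CMP'\n",
    "```\n",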
272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "Here's a sample of one of the files that has one of each type of record: `FIN`, `SEC`, and `CMP`:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "!cat devrel/multi-record.txt" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "We'll start by uploading all the files:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "from pathlib import Path\n", 304 | "\n", 305 | "# File paths\n", 306 | "stage_path = \"@tpcdi/Batch1/FINWIRE\"\n", 307 | "\n", 308 | "# glob the files\n", 309 | "pathlist = (\n", 310 | " Path(source_path)\n", 311 | " .glob(\"FINWIRE??????\")\n", 312 | ")\n", 313 | "\n", 314 | "for file in pathlist:\n", 315 | " # put the file(s) in the stage\n", 316 | " put_result = (\n", 317 | " session \n", 318 | " .file\n", 319 | " .put(\n", 320 | " str(file), \n", 321 | " stage_path, \n", 322 | " parallel=4, \n", 323 | " auto_compress=True\n", 324 | " )\n", 325 | " )\n", 326 | " for result in put_result:\n", 327 | " print(f\"File {result.source}: {result.status}\")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "The `CMP`, `SEC`, and `FIN` records all have two fields in common, so we want to create a generic DataFrame that contains the shared logic and we’ll save that DataFrame as a Snowflake temporary table called `FINWIRE`. We'll use `WITH_COLUMN` and `SUBSTRING`" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# These are fixed-width fields, so read the entire line in as \"line\"\n", 344 | "schema = StructType([\n", 345 | " StructField(\"line\", StringType(), False),\n", 346 | "])\n", 347 | "\n", 348 | "# generic dataframe for all record types\n", 349 | "# create a temporary table\n", 350 | "# The delimiter '|' seems safer\n", 351 | "df = (\n", 352 | " session\n", 353 | " .read\n", 354 | " .schema(schema)\n", 355 | " .option('field_delimiter', '|')\n", 356 | " .csv(stage_path)\n", 357 | " .with_column(\n", 358 | " 'pts', \n", 359 | " to_timestamp(\n", 360 | " substring(col(\"line\"), lit(0), lit(15)), \n", 361 | " lit(\"yyyymmdd-hhmiss\")\n", 362 | " )\n", 363 | " )\n", 364 | " .with_column(\n", 365 | " 'rec_type', \n", 366 | " substring(col(\"line\"), lit(16), lit(3))\n", 367 | " )\n", 368 | " .write\n", 369 | " .mode(\"overwrite\")\n", 370 | " .save_as_table(\"finwire\", table_type=\"temporary\")\n", 371 | ")\n", 372 | "\n", 373 | "# let's see the table\n", 374 | "df = (\n", 375 | " session \n", 376 | " .table('finwire') \n", 377 | " .show()\n", 378 | ")" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "Now we can create the three separate tables from this temporary table using the `WITH_COLUMN` and `SUBSTRING` functions.\n", 386 | "\n", 387 | "I'll only show the Security table as an example, but the other two are done the same way:" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "# SEC record types\n", 397 | "table_name = 'sec'\n", 398 | "df = (\n", 399 | " session\n", 400 | " .table('finwire')\n", 401 | " 
.where(col('rec_type') == 'SEC')\n",
402 |     "    .withColumn(\n",
403 |     "        'symbol', \n",
404 |     "        substring(col(\"line\"), lit(19), lit(15))\n",
405 |     "    )\n",
406 |     "    .withColumn(\n",
407 |     "        'issue_type', \n",
408 |     "        substring(col(\"line\"), lit(34), lit(6))\n",
409 |     "    )\n",
410 |     "    .withColumn(\n",
411 |     "        'status', \n",
412 |     "        substring(col(\"line\"), lit(40), lit(4))\n",
413 |     "    )\n",
414 |     "    .withColumn(\n",
415 |     "        'name', \n",
416 |     "        substring(col(\"line\"), lit(44), lit(70))\n",
417 |     "    )\n",
418 |     "    .withColumn(\n",
419 |     "        'ex_id', \n",
420 |     "        substring(col(\"line\"), lit(114), lit(6))\n",
421 |     "    )\n",
422 |     "    .withColumn(\n",
423 |     "        'sh_out', \n",
424 |     "        substring(col(\"line\"), lit(120), lit(13))\n",
425 |     "    )\n",
426 |     "    .withColumn(\n",
427 |     "        'first_trade_date', \n",
428 |     "        substring(col(\"line\"), lit(133), lit(8))\n",
429 |     "    )\n",
430 |     "    .withColumn(\n",
431 |     "        'first_exchange_date', \n",
432 |     "        substring(col(\"line\"), lit(141), lit(8))\n",
433 |     "    )\n",
434 |     "    .withColumn(\n",
435 |     "        'dividend', \n",
436 |     "        substring(col(\"line\"), lit(149), lit(12))\n",
437 |     "    )\n",
438 |     "    .withColumn(\n",
439 |     "        'co_name_or_cik', \n",
440 |     "        substring(col(\"line\"), lit(161), lit(60))\n",
441 |     "    )\n",
442 |     "    # these columns are no longer relevant\n",
443 |     "    .drop(col(\"line\"), col(\"rec_type\"))\n",
444 |     "    .write\n",
445 |     "    .mode(\"overwrite\")\n",
446 |     "    .save_as_table(table_name)\n",
447 |     ")\n",
448 |     "\n",
449 |     "print(f\"{table_name.upper()} table created.\")"
450 |    ]
451 |   },
452 |   {
453 |    "cell_type": "code",
454 |    "execution_count": null,
455 |    "metadata": {},
456 |    "outputs": [],
457 |    "source": [
458 |     "\n",
459 |     "# let's see the table\n",
460 |     "df = (\n",
461 |     "    session \n",
462 |     "    .table('sec') \n",
463 |     "    .show()\n",
464 |     ")"
465 |    ]
466 |   },
467 |   {
468 |    "cell_type": "markdown",
469 |    "metadata": {},
470 |    "source": [
471 |     "The `DIGen.jar` utility creates a single XML file called `CustomerMgmt.xml`; an abridged sample is below, with attribute values elided:\n",
472 |     "\n",
473 |     "```xml\n",
474 |     "<?xml version=\"1.0\"?>\n",
475 |     "<TPCDI:Actions xmlns:TPCDI=\"...\">\n",
476 |     " <TPCDI:Action ActionType=\"...\" ActionTS=\"...\">\n",
477 |     "  <Customer C_ID=\"...\" C_TAX_ID=\"...\" C_GNDR=\"...\" C_TIER=\"...\" C_DOB=\"...\">\n",
478 |     "   <Name>\n",
479 |     "    <C_L_NAME>Joannis</C_L_NAME>\n",
480 |     "    <C_F_NAME>Adara</C_F_NAME>\n",
481 |     "    <C_M_NAME/>\n",
482 |     "   </Name>\n",
483 |     "   <Address>\n",
484 |     "    <C_ADLINE1>4779 Weller Way</C_ADLINE1>\n",
485 |     "    <C_ADLINE2/>\n",
486 |     "    <C_ZIPCODE>92624</C_ZIPCODE>\n",
487 |     "    <C_CITY>Columbus</C_CITY>\n",
488 |     "    <C_STATE_PROV>Ontario</C_STATE_PROV>\n",
489 |     "    <C_CTRY>Canada</C_CTRY>\n",
490 |     "   </Address>\n",
491 |     "   <ContactInfo>\n",
492 |     "    <C_PRIM_EMAIL>Adara.Joannis@moose-mail.com</C_PRIM_EMAIL>\n",
493 |     "    <C_ALT_EMAIL>Adara.Joannis@gmx.com</C_ALT_EMAIL>\n",
494 |     "    <C_PHONE_1>\n",
495 |     "     <C_CTRY_CODE>1</C_CTRY_CODE>\n",
496 |     "     <C_AREA_CODE>872</C_AREA_CODE>\n",
497 |     "     <C_LOCAL>523-8928</C_LOCAL>\n",
498 |     "     <C_EXT/>\n",
499 |     "    </C_PHONE_1>\n",
500 |     "    <C_PHONE_2>\n",
501 |     "     <C_CTRY_CODE/>\n",
502 |     "     <C_AREA_CODE/>\n",
503 |     "     <C_LOCAL>492-3961</C_LOCAL>\n",
504 |     "     <C_EXT/>\n",
505 |     "    </C_PHONE_2>\n",
506 |     "    <C_PHONE_3>\n",
507 |     "     <C_CTRY_CODE/>\n",
508 |     "     <C_AREA_CODE/>\n",
509 |     "     <C_LOCAL/>\n",
510 |     "     <C_EXT/>\n",
511 |     "    </C_PHONE_3>\n",
512 |     "   </ContactInfo>\n",
513 |     "   <TaxInfo>\n",
514 |     "    <C_LCL_TX_ID>CA3</C_LCL_TX_ID>\n",
515 |     "    <C_NAT_TX_ID>YT3</C_NAT_TX_ID>\n",
516 |     "   </TaxInfo>\n",
517 |     "   <Account CA_ID=\"...\" CA_TAX_ST=\"...\">\n",
518 |     "    <CA_B_ID>17713</CA_B_ID>\n",
519 |     "    <CA_NAME>CJlmMuFyibKOmKLHIaTeWugvCgZdmcfpDsYb</CA_NAME>\n",
520 |     "   </Account>\n",
521 |     "  </Customer>\n",
522 |     " </TPCDI:Action>\n",
523 |     "</TPCDI:Actions>\n",
524 |     "```\n",
525 |     "\n",
526 |     "The hierarchical representation of a TPCDI:Action record, with @ signifying a node attribute as opposed to an element, is shown below:\n",
527 |     "\n",
528 |     "```\n",
529 |     "|-- TPCDI:Action\n",
530 |     "    |-- @ActionType: string\n",
531 |     "    |-- @ActionTS: timestamp\n",
532 |     "    |-- Customer\n",
533 |     "        |-- @C_ID: number\n",
534 |     "        |-- @C_TAX_ID: string\n",
535 |     "        |-- @C_GNDR: string\n",
536 |     "        |-- @C_TIER: number\n",
537 |     "        |-- @C_DOB: date\n",
538 |     "        |-- Name\n",
539 |     "            |-- C_F_NAME: string\n",
540 |     "            |-- C_L_NAME: string\n",
541 |     "            |-- C_M_NAME: string\n",
542 |     "        |-- Address\n",
543 |     "            |-- C_ADLINE1: string\n",
544 |     "            |-- C_ADLINE2: string\n",
545 |     "            |-- C_CITY: string\n",
546 |     "            |-- C_CTRY: string\n",
547 |     "            |-- C_STATE_PROV: string\n",
548 |     "            |-- C_ZIPCODE: string\n",
549 |     "        |-- ContactInfo\n",
550 |     "            |-- C_ALT_EMAIL: string\n",
551 |     "            |-- C_PHONE_1\n",
552 |     "                |-- C_AREA_CODE: number\n",
553 |     "                |-- C_CTRY_CODE: number\n",
554 |     "                |-- C_EXT: long\n",
555 |     "                |-- C_LOCAL: string\n",
556 |     "            |-- C_PHONE_2\n",
557 |     "                |-- C_AREA_CODE: number\n",
558 |     "                |-- C_CTRY_CODE: number\n",
559 |     "                |-- C_EXT: number\n",
560 |     "                |-- C_LOCAL: string\n",
561 |     "            |-- C_PHONE_3\n",
562 |     "                |-- C_AREA_CODE: number\n",
563 |     "                |-- C_CTRY_CODE: number\n",
564 |     "                |-- C_EXT: number\n",
565 |     "                |-- C_LOCAL: string\n",
566 |     "        |-- TaxInfo\n",
567 |     "            |-- C_LCL_TX_ID: string\n",
568 |     "            |-- C_NAT_TX_ID: string\n",
569 |     "        |-- Account\n",
570 |     "            |-- CA_B_ID: number\n",
571 |     "            |-- CA_NAME: string\n",
572 |     "            |-- @CA_ID: number\n",
573 |     "            |-- @CA_TAX_ST: number\n",
574 |     "```\n"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "markdown",
579 |    "metadata": {},
580 |    "source": [
581 |     "Let's create a DataFrame from the XML file and see what we get:"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "code",
586 |    "execution_count": null,
587 |    "metadata": {},
588 |    "outputs": [],
589 |    "source": [
590 |     "# File paths\n",
591 |     "stage_path = \"@tpcdi/Batch1\"\n",
592 |     "\n",
593 |     "# Put the file\n",
594 |     "put_result = (\n",
595 |     "    session\n",
596 |     "    .file\n",
597 |     "    .put(\n",
598 |     "        f\"{source_path}/CustomerMgmt.xml\",\n",
599 |     "        f\"{stage_path}/CustomerMgmt.xml\",\n",
600 |     "        parallel=4,\n",
601 |     "        auto_compress=True,\n",
602 |     "    )\n",
603 |     ")\n",
604 |     "for result in put_result:\n",
605 |     "    print(f\"File {result.source}: {result.status}\")\n",
606 |     "\n",
607 |     "# Read the XML file into a DataFrame and show it\n",
608 |     "df = (\n",
609 |     "    session\n",
610 |     "    .read\n",
611 |     "    .option('STRIP_OUTER_ELEMENT', True) # Strips TPCDI:Actions\n",
612 |     "    .xml(f\"{stage_path}/CustomerMgmt.xml\")\n",
613 |     "    .show(1, 100)\n",
614 |     ")"
615 |    ]
616 |   },
617 |   {
618 |    "cell_type": "markdown",
619 |    "metadata": {},
620 |    "source": [
621 |     "Snowflake does not support simple dot notation for XML the way it does for JSON.\n",
622 |     "\n",
623 |     "Instead we have to pair the `GET()` function with an `XMLGET()`, which can be quite tedious.\n",
624 |     "\n",
625 |     "So we can create helper functions to abstract the toil:"
626 |    ]
627 |   },
628 |   {
629 |    "cell_type": "code",
630 |    "execution_count": null,
631 |    "metadata": {},
632 |    "outputs": [],
633 |    "source": [
634 |     "# Simplifies retrieving XML elements\n",
635 |     "def get_xml_element(\n",
636 |     "    column:str,\n",
637 |     "    element:str,\n",
638 |     "    datatype:str,\n",
639 |     "    with_alias:bool = True\n",
640 |     "):\n",
641 |     "    new_element = (\n",
642 |     "        get(\n",
643 |     "            xmlget(\n",
644 |     "                col(column),\n",
645 |     "                lit(element),\n",
646 | 
" ),\n", 647 | " lit('$')\n", 648 | " )\n", 649 | " .cast(datatype)\n", 650 | " )\n", 651 | "\n", 652 | " # alias needs to be optional\n", 653 | " return (\n", 654 | " new_element.alias(element) if with_alias else new_element\n", 655 | " )\n", 656 | "\n", 657 | "# Simplifies retrieving XML attributes\n", 658 | "def get_xml_attribute(\n", 659 | " column:str,\n", 660 | " attribute:str,\n", 661 | " datatype:str,\n", 662 | " with_alias:bool = True\n", 663 | "):\n", 664 | " new_attribute = (\n", 665 | " get(\n", 666 | " col(column),\n", 667 | " lit(f\"@{attribute}\")\n", 668 | " )\n", 669 | " .cast(datatype)\n", 670 | " )\n", 671 | "\n", 672 | " # alias needs to be optional\n", 673 | " return (\n", 674 | " new_attribute.alias(attribute) if with_alias else new_attribute\n", 675 | " )\n", 676 | "\n", 677 | "# Constructs a phone number from multiple nested fields\n", 678 | "def get_phone_number(\n", 679 | " phone_id:str,\n", 680 | " separator:str = '-'\n", 681 | "):\n", 682 | " return (\n", 683 | " concat (\n", 684 | " get_xml_element(f\"phone{phone_id}\", 'C_CTRY_CODE', 'STRING', False),\n", 685 | " when(get_xml_element(f\"phone{phone_id}\", 'C_CTRY_CODE', 'STRING', False) == '', '')\n", 686 | " .otherwise(separator),\n", 687 | " get_xml_element(f\"phone{phone_id}\", 'C_AREA_CODE', 'STRING', False),\n", 688 | " when(get_xml_element(f\"phone{phone_id}\", 'C_AREA_CODE', 'STRING', False) == '', '')\n", 689 | " .otherwise(separator),\n", 690 | " get_xml_element(f\"phone{phone_id}\", 'C_LOCAL', 'STRING', False),\n", 691 | " when(get_xml_element(f\"phone{phone_id}\", 'C_EXT', 'STRING', False) == '', '')\n", 692 | " .otherwise(\" ext: \"),\n", 693 | " get_xml_element(f\"phone{phone_id}\", 'C_EXT', 'STRING', False)\n", 694 | " )\n", 695 | " .alias(f\"c_phone_{phone_id}\")\n", 696 | " )" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "Now let's put it all together and create our `customer_mgmt` table:" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": null, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "table_name = 'customer_mgmt'\n", 713 | "df = (\n", 714 | " session\n", 715 | " .read\n", 716 | " .option('STRIP_OUTER_ELEMENT', True) # Strips the TPCDI:Actions node\n", 717 | " .xml(f\"{stage_path}/CustomerMgmt.xml\")\n", 718 | " .select(\n", 719 | " # flatten out all of the nested elements\n", 720 | " xmlget(col('$1'), lit('Customer'), 0).alias('customer'),\n", 721 | " xmlget(col('customer'), lit('Name'), 0).alias('name'),\n", 722 | " xmlget(col('customer'), lit('Address'), 0).alias('address'),\n", 723 | " xmlget(col('customer'), lit('ContactInfo'), 0).alias('contact_info'),\n", 724 | " xmlget(col('contact_info'), lit('C_PHONE_1')).alias('phone1'),\n", 725 | " xmlget(col('contact_info'), lit('C_PHONE_2')).alias('phone2'),\n", 726 | " xmlget(col('contact_info'), lit('C_PHONE_3')).alias('phone3'),\n", 727 | " xmlget(col('customer'), lit('TaxInfo'), 0).alias('tax_info'),\n", 728 | " xmlget(col('customer'), lit('Account'), 0).alias('account'),\n", 729 | " # get the Action attributes\n", 730 | " get_xml_attribute('$1','ActionType','STRING'),\n", 731 | " get_xml_attribute('$1','ActionTS','STRING'),\n", 732 | " )\n", 733 | " .select(\n", 734 | " # Handling Action attributes\n", 735 | " to_timestamp(\n", 736 | " col('ActionTs'),\n", 737 | " lit('yyyy-mm-ddThh:mi:ss')\n", 738 | " ).alias('action_ts'),\n", 739 | " col('ActionType').alias('ACTION_TYPE'),\n", 740 | " # Get Customer Attributes\n", 741 | " 
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![dbt DAG](images/dbt-dag.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "469f485d-792c-46df-906f-cd09f364d7d0",
   "metadata": {},
   "source": [
    "### Enabling dynamic tables in our dbt project\n",
    "Make sure we're on version `1.7.0` of `dbt-snowflake`, then set the new `dynamic_table` materialization type in our `dbt_project.yml` file:\n",
    "\n",
    "```yaml\n",
    "models:\n",
    "  dbt_tpcdi:\n",
    "    example:\n",
    "      +materialized: view\n",
    "    bronze:\n",
    "      +schema: bronze\n",
    "      +materialized: dynamic_table\n",
    "      +snowflake_warehouse: tpcdi_large\n",
    "      +target_lag: downstream\n",
    "    silver:\n",
    "      +schema: silver\n",
    "      +materialized: dynamic_table\n",
    "      +snowflake_warehouse: tpcdi_large\n",
    "      +target_lag: '10 minutes'\n",
    "    gold:\n",
    "      +schema: gold\n",
    "      +materialized: dynamic_table\n",
    "      +snowflake_warehouse: tpcdi_large\n",
    "      +target_lag: '20 minutes'\n",
    "    work:\n",
    "      +schema: work\n",
    "      +materialized: ephemeral\n",
    "```"
   ]
  },
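  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With this materialization, dbt compiles each model into dynamic table DDL. Roughly (an illustrative sketch, not the exact statement dbt emits), a silver model becomes:\n",
    "\n",
    "```sql\n",
    "create or replace dynamic table dl_silver.trades\n",
    "  target_lag = '10 minutes'\n",
    "  warehouse = tpcdi_large\n",
    "  as (\n",
    "    -- the compiled SQL of models/silver/trades.sql\n",
    "    select ...\n",
    "  );\n",
    "```\n",
    "\n",
    "Snowflake then keeps each table refreshed within its target lag, while dbt remains responsible for creating the tables in dependency order."
   ]
  },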
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "420db6a7-ce06-4638-a3a5-1ac16aff2fc8",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!dbt build"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3eb79852",
   "metadata": {},
   "source": [
    "### dbt also has Tests\n",
    "\n",
    "We can run them when we build the Dynamic Table:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a62811a9",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!dbt build --select fact_trade"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d564eded",
   "metadata": {},
   "source": [
    "Or we can schedule them to run periodically:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf40309e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!dbt test"
   ]
  },
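  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A singular test is just a SQL file in `tests/` that returns the failing rows. This project has `tests/fact_trade__unique_trade.sql`; a sketch of what such a uniqueness test might look like (illustrative, not necessarily the file's exact contents):\n",
    "\n",
    "```sql\n",
    "-- fails if any trade_id appears more than once\n",
    "select trade_id\n",
    "from {{ ref('fact_trade') }}\n",
    "group by trade_id\n",
    "having count(*) > 1\n",
    "```"
   ]
  },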
  {
   "cell_type": "markdown",
   "id": "ac3f442b",
   "metadata": {},
   "source": [
    "### dbt is more than a scheduler\n",
    "\n",
    "1. Cloud development environment for those who prefer it.\n",
    "1. CI/CD workflows for promoting Dynamic Table changes into Production.\n",
    "1. Perhaps there's promise in the Semantic Layer."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Snowflake has its own version of the DAG\n",
    "However, this DAG only exists after all the Dynamic Tables have been created; dbt understands the DAG _before_ anything is built.\n",
    "\n",
    "![Snowflake DAG](images/snowflake-dag.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22101a49",
   "metadata": {},
   "source": [
    "# Clean-up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c6a912a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "!python tpcdi.py drop-schema --schema dl_gold\n",
    "!python tpcdi.py drop-schema --schema dl_silver\n",
    "!python tpcdi.py drop-schema --schema dl_bronze\n",
    "!python tpcdi.py drop-schema --schema dl_work"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
--------------------------------------------------------------------------------