├── __init__.py ├── dbt-proj ├── data │ └── .gitkeep ├── tests │ └── .gitkeep ├── analysis │ └── .gitkeep ├── macros │ └── .gitkeep ├── .gitignore ├── README.md ├── models │ └── example │ │ └── one_big_table.sql ├── profiles.yml └── dbt_project.yml ├── Analysis └── images │ ├── bigquery.png │ ├── snowflake.png │ ├── dc2.8xlarge_multi-node_first.png │ ├── dc2.large_single-node_first.png │ ├── dc2.8xlarge_multi-node_subsequent.png │ └── dc2.large_single-node_subsequent.png ├── static ├── slack_screenshot.png ├── relative_performance.png └── side_by_side_performance.png ├── test_queries ├── 01_simple_groupby │ ├── obt.sql │ └── star.sql ├── 10_simple_groupby_sum │ ├── obt.sql │ └── star.sql ├── 03_high_cardinality_groupby │ ├── obt.sql │ └── star.sql ├── 02_cross_table_multi_groupby │ ├── obt.sql │ └── star.sql ├── 09_simple_groupby_with_filter │ ├── obt.sql │ └── star.sql ├── 06_date_window_2 │ ├── obt.sql │ └── star.sql ├── 07_wait_buy_why │ ├── obt.sql │ └── star.sql ├── 04_window_function │ ├── obt.sql │ └── star.sql ├── 05_date_window │ ├── obt.sql │ └── star.sql └── 08_more_addresses │ ├── obt.sql │ └── star.sql ├── .gitignore ├── setup.sh ├── load_bigquery_data.sql ├── dbconf.yml ├── load_redshift_data.py ├── README.md ├── test_queries.py ├── db.py ├── load_commands.sql ├── results.md └── load_snowflake.sql /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt-proj/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt-proj/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt-proj/analysis/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt-proj/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dbt-proj/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /Analysis/images/bigquery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikekaminsky/compare-warehouse-distributions/HEAD/Analysis/images/bigquery.png -------------------------------------------------------------------------------- /Analysis/images/snowflake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikekaminsky/compare-warehouse-distributions/HEAD/Analysis/images/snowflake.png -------------------------------------------------------------------------------- /static/slack_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikekaminsky/compare-warehouse-distributions/HEAD/static/slack_screenshot.png -------------------------------------------------------------------------------- /test_queries/01_simple_groupby/obt.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ca_state 3 | , COUNT(*) 4 | FROM dbt.one_big_table 5 | GROUP BY 6 | ca_state 7 | -------------------------------------------------------------------------------- /static/relative_performance.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mikekaminsky/compare-warehouse-distributions/HEAD/static/relative_performance.png -------------------------------------------------------------------------------- /static/side_by_side_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikekaminsky/compare-warehouse-distributions/HEAD/static/side_by_side_performance.png -------------------------------------------------------------------------------- /test_queries/10_simple_groupby_sum/obt.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ca_state 3 | , sum(ss_net_profit) 4 | FROM dbt.one_big_table 5 | GROUP BY 6 | ca_state 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.sw* 2 | .envrc 3 | .user.yml 4 | __pycache__/ 5 | *.csv 6 | */.ipynb_checkpoints/ 7 | bq-compare-distribution-creds.json 8 | .DS_Store 9 | *.pyc 10 | -------------------------------------------------------------------------------- /Analysis/images/dc2.8xlarge_multi-node_first.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikekaminsky/compare-warehouse-distributions/HEAD/Analysis/images/dc2.8xlarge_multi-node_first.png -------------------------------------------------------------------------------- /Analysis/images/dc2.large_single-node_first.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikekaminsky/compare-warehouse-distributions/HEAD/Analysis/images/dc2.large_single-node_first.png -------------------------------------------------------------------------------- /test_queries/03_high_cardinality_groupby/obt.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | 
ca_zip 3 | , COUNT(*) 4 | FROM dbt.one_big_table 5 | GROUP BY 6 | ca_zip 7 | ORDER BY 2 DESC 8 | limit 20 9 | -------------------------------------------------------------------------------- /Analysis/images/dc2.8xlarge_multi-node_subsequent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikekaminsky/compare-warehouse-distributions/HEAD/Analysis/images/dc2.8xlarge_multi-node_subsequent.png -------------------------------------------------------------------------------- /Analysis/images/dc2.large_single-node_subsequent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikekaminsky/compare-warehouse-distributions/HEAD/Analysis/images/dc2.large_single-node_subsequent.png -------------------------------------------------------------------------------- /test_queries/02_cross_table_multi_groupby/obt.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ca_state 3 | , s_country 4 | , COUNT(*) 5 | FROM dbt.one_big_table 6 | GROUP BY 7 | ca_state 8 | , s_country 9 | -------------------------------------------------------------------------------- /test_queries/09_simple_groupby_with_filter/obt.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ca_state 3 | , COUNT(*) 4 | FROM dbt.one_big_table 5 | WHERE ca_state IN ('TX','NY','CA') 6 | GROUP BY 7 | ca_state 8 | -------------------------------------------------------------------------------- /test_queries/01_simple_groupby/star.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ca_state 3 | , COUNT(*) 4 | FROM public.store_sales 5 | LEFT JOIN public.customer_address 6 | ON store_sales.ss_addr_sk = customer_address.ca_address_sk 7 | GROUP BY 8 | ca_state 9 | -------------------------------------------------------------------------------- 
/test_queries/06_date_window_2/obt.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | d_year 3 | , d_moy 4 | , Sum(ss_net_profit)*100/Sum(Sum(ss_net_profit)) OVER (partition BY d_moy, d_year) 5 | FROM dbt.one_big_table 6 | GROUP BY d_year, d_moy 7 | ORDER BY d_year, d_moy 8 | -------------------------------------------------------------------------------- /test_queries/10_simple_groupby_sum/star.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ca_state 3 | , sum(ss_net_profit) 4 | FROM public.store_sales 5 | LEFT JOIN public.customer_address 6 | ON store_sales.ss_addr_sk = customer_address.ca_address_sk 7 | GROUP BY 8 | ca_state 9 | -------------------------------------------------------------------------------- /test_queries/03_high_cardinality_groupby/star.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ca_zip 3 | , COUNT(*) 4 | FROM public.store_sales 5 | LEFT JOIN public.customer_address 6 | ON store_sales.ss_addr_sk = customer_address.ca_address_sk 7 | GROUP BY 8 | ca_zip 9 | ORDER BY 2 DESC 10 | limit 20 11 | -------------------------------------------------------------------------------- /test_queries/09_simple_groupby_with_filter/star.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ca_state 3 | , COUNT(*) 4 | FROM public.store_sales 5 | LEFT JOIN public.customer_address 6 | ON store_sales.ss_addr_sk = customer_address.ca_address_sk 7 | WHERE ca_state IN ('TX','NY','CA') 8 | GROUP BY 9 | ca_state 10 | -------------------------------------------------------------------------------- /test_queries/07_wait_buy_why/obt.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(ss_net_profit) 3 | , d_moy 4 | , d_year 5 | , RANK() OVER ( 6 | PARTITION BY d_moy, d_year 7 | ORDER BY SUM(ss_net_profit) DESC 8 | ) AS 
rank_within_parent 9 | FROM dbt.one_big_table 10 | GROUP BY d_moy, d_year 11 | ORDER BY d_moy, rank_within_parent 12 | -------------------------------------------------------------------------------- /test_queries/04_window_function/obt.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(ss_net_profit) 3 | , s_state 4 | , s_county 5 | , RANK() OVER ( 6 | PARTITION BY s_state, s_county 7 | ORDER BY SUM(ss_net_profit) DESC 8 | ) AS rank_within_parent 9 | FROM dbt.one_big_table 10 | GROUP BY s_state, s_county 11 | ORDER BY s_state, rank_within_parent 12 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Log commands and quit if you encounter any error 4 | set -ex 5 | 6 | vlad --config-file dbconf.yml create_cluster 7 | vlad --config-file dbconf.yml add_security_group --use-my-current-ip 8 | vlad --config-file dbconf.yml attach_iam_policy RedshiftReadS3 9 | vlad --config-file dbconf.yml describe_clusters 10 | 11 | -------------------------------------------------------------------------------- /test_queries/06_date_window_2/star.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | d_year 3 | , d_moy 4 | , Sum(ss_net_profit)*100/Sum(Sum(ss_net_profit)) OVER (partition BY d_moy, d_year) 5 | FROM public.store_sales 6 | LEFT JOIN public.date_dim 7 | ON store_sales.ss_sold_date_sk = date_dim.d_date_sk 8 | GROUP BY 9 | d_year 10 | , d_moy 11 | ORDER BY 12 | d_year 13 | , d_moy 14 | -------------------------------------------------------------------------------- /dbt-proj/README.md: -------------------------------------------------------------------------------- 1 | ### dbt models for [your org/project name] 2 | 3 | --- 4 | - [What is dbt](https://dbt.readme.io/docs/overview)? 
5 | - Read the [dbt viewpoint](https://dbt.readme.io/docs/viewpoint) 6 | - [Installation](https://dbt.readme.io/docs/installation) 7 | - Join the [chat](http://ac-slackin.herokuapp.com/) on Slack for live questions and support. 8 | 9 | --- 10 | -------------------------------------------------------------------------------- /test_queries/02_cross_table_multi_groupby/star.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ca_state 3 | , s_country 4 | , COUNT(*) 5 | FROM public.store_sales 6 | LEFT JOIN public.customer_address 7 | ON store_sales.ss_addr_sk = customer_address.ca_address_sk 8 | LEFT JOIN public.store 9 | ON store_sales.ss_store_sk = store.s_store_sk 10 | GROUP BY 11 | ca_state 12 | , s_country 13 | 14 | -------------------------------------------------------------------------------- /test_queries/05_date_window/obt.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | 3 | cte_1 AS ( 4 | SELECT 5 | d_year 6 | , d_moy 7 | , SUM(ss_net_profit) AS ss_net_profit 8 | FROM dbt.one_big_table 9 | GROUP BY 10 | d_year 11 | , d_moy 12 | ) 13 | 14 | SELECT 15 | d_year 16 | , d_moy 17 | , ss_net_profit / 18 | SUM(ss_net_profit) OVER ( 19 | PARTITION BY 20 | d_year 21 | ) 22 | FROM cte_1 23 | -------------------------------------------------------------------------------- /test_queries/07_wait_buy_why/star.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(ss_net_profit) 3 | , d_moy 4 | , d_year 5 | , RANK() OVER ( 6 | PARTITION BY d_moy, d_year 7 | ORDER BY SUM(ss_net_profit) DESC 8 | ) AS rank_within_parent 9 | FROM public.store_sales 10 | LEFT JOIN public.date_dim 11 | ON store_sales.ss_sold_date_sk = date_dim.d_date_sk 12 | GROUP BY d_moy, d_year 13 | ORDER BY d_moy, rank_within_parent 14 | -------------------------------------------------------------------------------- /test_queries/04_window_function/star.sql: 
-------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(ss_net_profit) 3 | , s_state 4 | , s_county 5 | , RANK() OVER ( 6 | PARTITION BY s_state, s_county 7 | ORDER BY SUM(ss_net_profit) DESC 8 | ) AS rank_within_parent 9 | FROM public.store_sales 10 | LEFT JOIN public.store 11 | ON store_sales.ss_store_sk = store.s_store_sk 12 | GROUP BY s_state, s_county 13 | ORDER BY s_state, rank_within_parent 14 | -------------------------------------------------------------------------------- /test_queries/08_more_addresses/obt.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | 3 | cte AS ( 4 | SELECT 5 | ca_state 6 | , COUNT(*) total_in_state 7 | FROM dbt.one_big_table 8 | GROUP BY 9 | ca_state 10 | ) 11 | 12 | SELECT 13 | obt.ca_state 14 | , ca_zip 15 | , COUNT(*) / total_in_state 16 | FROM dbt.one_big_table obt 17 | LEFT JOIN cte 18 | ON obt.ca_state = cte.ca_state 19 | GROUP BY 20 | obt.ca_state 21 | , total_in_state 22 | , ca_zip 23 | -------------------------------------------------------------------------------- /test_queries/05_date_window/star.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | 3 | cte_1 AS ( 4 | SELECT 5 | d_year 6 | , d_moy 7 | , SUM(ss_net_profit) AS ss_net_profit 8 | FROM public.store_sales 9 | LEFT JOIN public.date_dim 10 | ON store_sales.ss_sold_date_sk = date_dim.d_date_sk 11 | GROUP BY 12 | d_year 13 | , d_moy 14 | ) 15 | 16 | SELECT 17 | d_year 18 | , d_moy 19 | , ss_net_profit / 20 | SUM(ss_net_profit) OVER ( 21 | PARTITION BY 22 | d_year 23 | ) 24 | FROM cte_1 25 | -------------------------------------------------------------------------------- /dbt-proj/models/example/one_big_table.sql: -------------------------------------------------------------------------------- 1 | {{ config(materialized='table', dist='ss_item_sk') }} 2 | 3 | SELECT * 4 | FROM public.store_sales 5 | LEFT JOIN public.date_dim 6 | 
ON store_sales.ss_sold_date_sk = date_dim.d_date_sk 7 | LEFT JOIN public.store 8 | ON store_sales.ss_store_sk = store.s_store_sk 9 | LEFT JOIN public.household_demographics 10 | ON store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk 11 | LEFT JOIN public.customer_address 12 | ON store_sales.ss_addr_sk = customer_address.ca_address_sk 13 | -------------------------------------------------------------------------------- /load_bigquery_data.sql: -------------------------------------------------------------------------------- 1 | create table public.date_dim as select * from `singular-vector-135519`.tpcds_100.date_dim; 2 | create table public.store as select * from `singular-vector-135519`.tpcds_100.store; 3 | create table public.household_demographics as select * from `singular-vector-135519`.tpcds_100.household_demographics; 4 | create table public.customer_address as select * from `singular-vector-135519`.tpcds_100.customer_address; 5 | create table public.store_sales as select * from `singular-vector-135519`.tpcds_100.store_sales; 6 | -------------------------------------------------------------------------------- /dbconf.yml: -------------------------------------------------------------------------------- 1 | redshift: 2 | cluster_identifier: compare-warehouse-distributions 3 | db_name: ENV['REDSHIFT_DBNAME'] 4 | user: ENV['REDSHIFT_USER'] 5 | password: ENV['REDSHIFT_PASSWORD'] 6 | port: 5439 7 | #cluster_type: multi-node 8 | #node_type: dc2.8xlarge 9 | number_of_nodes: 1 10 | cluster_type: single-node 11 | node_type: dc2.large 12 | 13 | snowflake: 14 | user: ENV['SNOWFLAKE_USER'] 15 | password: ENV['SNOWFLAKE_PASSWORD'] 16 | account: ENV['SNOWFLAKE_ACCOUNT'] 17 | 18 | bigquery: 19 | 20 | security_group_name: whitelist-ip2 21 | -------------------------------------------------------------------------------- /test_queries/08_more_addresses/star.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | 3 | cte AS ( 4 
| SELECT 5 | ca_state 6 | , COUNT(*) total_in_state 7 | FROM public.store_sales 8 | LEFT JOIN public.customer_address 9 | ON store_sales.ss_addr_sk = customer_address.ca_address_sk 10 | GROUP BY 11 | ca_state 12 | ) 13 | 14 | SELECT 15 | ca.ca_state 16 | , ca_zip 17 | , COUNT(*) / total_in_state 18 | FROM public.store_sales 19 | LEFT JOIN public.customer_address ca 20 | ON store_sales.ss_addr_sk = ca.ca_address_sk 21 | LEFT JOIN cte 22 | ON ca.ca_state = cte.ca_state 23 | GROUP BY 24 | ca.ca_state 25 | , total_in_state 26 | , ca_zip 27 | -------------------------------------------------------------------------------- /load_redshift_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import db 3 | 4 | ROLE_ARN = "arn:aws:iam::768805597102:role/RedshiftReadS3" 5 | 6 | TABLE_LIST = [ 7 | "date_dim", 8 | "store", 9 | "household_demographics", 10 | "customer_address", 11 | "store_sales" 12 | ] 13 | 14 | conn = db.get_connection('redshift') 15 | 16 | # Create table commands 17 | with open("load_commands.sql", "r") as f: 18 | cmds = f.readlines() 19 | 20 | for cmd in cmds: 21 | print(cmd) 22 | conn.cursor().execute(cmd) 23 | conn.commit() 24 | 25 | copy_template = """ 26 | copy public.{table_name} from 's3://fivetran-benchmark/tpcds_100_dat/{table_name}/' region 'us-east-1' format delimiter '|' acceptinvchars compupdate on iam_role '{role_arn}'; 27 | """ 28 | 29 | for table in TABLE_LIST: 30 | cmd = copy_template.format(table_name=table, role_arn=ROLE_ARN) 31 | print(cmd) 32 | conn.cursor().execute(cmd) 33 | conn.commit() 34 | -------------------------------------------------------------------------------- /dbt-proj/profiles.yml: -------------------------------------------------------------------------------- 1 | redshift: 2 | target: dev 3 | outputs: 4 | dev: 5 | type: redshift 6 | host: compare-warehouse-distributions.cfurovc8lmfv.us-east-1.redshift.amazonaws.com 7 | user: "{{ env_var('REDSHIFT_USER') }}" 8 | pass: 
"{{ env_var('REDSHIFT_PASSWORD') }}" 9 | port: 5439 10 | dbname: "{{ env_var('REDSHIFT_DBNAME') }}" 11 | schema: dbt 12 | threads: 4 13 | snowflake: 14 | target: dev 15 | outputs: 16 | dev: 17 | type: snowflake 18 | account: "{{ env_var('SNOWFLAKE_ACCOUNT') }}" 19 | user: "{{ env_var('SNOWFLAKE_USER') }}" 20 | password: "{{ env_var('SNOWFLAKE_PASSWORD') }}" 21 | role: SYSADMIN 22 | database: DEMO_DB 23 | warehouse: COMPUTE_WH 24 | schema: dbt 25 | threads: 4 26 | bigquery: 27 | target: dev 28 | outputs: 29 | dev: 30 | type: bigquery 31 | method: oauth 32 | project: compare-distributions 33 | schema: dbt 34 | threads: 4 35 | 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Comparing Data Warehouse Distribution Styles 2 | 3 | This repo contains the code for testing how different data warehouse data-organization schemes impact query performance. 4 | 5 | You can use this repo to either review the queries that were used in the test (in the [test_queries](/test_queries) folder) or to run the analysis yourself. 
6 | 7 | This guide assumes you have a Redshift cluster up and running with the following conditions: 8 | 9 | * You have query access to it from your IP address 10 | * The cluster has permission to read from S3 11 | 12 | You'll need to set the following environment variables: 13 | 14 | ``` 15 | export REDSHIFT_DBNAME=test_db 16 | export REDSHIFT_USER=test_user 17 | export REDSHIFT_PASSWORD=test_Pa55w0rd 18 | export DBT_PROFILES_DIR=path/to/profiles/dir 19 | export REDSHIFT_HOST_ROOT=something-something.aws.amazon.com 20 | ``` 21 | 22 | You can then run the script to load the data we need into the cluster: 23 | 24 | ``` 25 | python load_redshift_data.py 26 | ``` 27 | 28 | Once you've done that, you can use dbt to create the table we'll use: 29 | 30 | ``` 31 | cd dbt-proj 32 | dbt run 33 | ``` 34 | 35 | Finally, we can run the script to execute our test queries and persist the results: 36 | 37 | ``` 38 | cd .. # go back to the root project directory 39 | python test_queries.py redshift 40 | ``` 41 | 42 | You can then analyze the results using the Jupyter notebook in the `Analysis/` directory. 43 | -------------------------------------------------------------------------------- /dbt-proj/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your package! Package names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'my_new_package' 6 | version: '1.0' 7 | 8 | # This setting configures which "profile" dbt uses for this project. Profiles contain 9 | # database connection information, and should be configured in the ~/.dbt/profiles.yml file 10 | profile: 'redshift' 11 | 12 | # These configurations specify where dbt should look for different types of files. 13 | # The `source-paths` config, for example, states that source models can be found 14 | # in the "models/" directory.
You probably won't need to change these! 15 | source-paths: ["models"] 16 | analysis-paths: ["analysis"] 17 | test-paths: ["tests"] 18 | data-paths: ["data"] 19 | macro-paths: ["macros"] 20 | 21 | target-path: "target" # directory which will store compiled SQL files 22 | clean-targets: # directories to be removed by `dbt clean` 23 | - "target" 24 | - "dbt_modules" 25 | 26 | # You can define configurations for models in the `source-paths` directory here. 27 | # Using these configurations, you can enable or disable models, change how they 28 | # are materialized, and more! 29 | 30 | # In this example config, we tell dbt to build all models in the example/ directory 31 | # as views (the default). These settings can be overridden in the individual model files 32 | # using the `{{ config(...) }}` macro. 33 | models: 34 | my_new_package: 35 | # Applies to all files under models/example/ 36 | example: 37 | materialized: view 38 | -------------------------------------------------------------------------------- /test_queries.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import datetime 4 | 5 | import pandas as pd 6 | 7 | import db 8 | import sys 9 | 10 | 11 | db_type = sys.argv[1] 12 | print(db_type) 13 | 14 | today_str = datetime.datetime.today().strftime("%Y-%m-%d") 15 | 16 | disable_redshift_cache = """SET enable_result_cache_for_session TO OFF;""" 17 | disable_snowflake_cache = """ALTER SESSION SET USE_CACHED_RESULT=false;""" 18 | 19 | conn = db.get_connection(db_type) 20 | 21 | # Disable cache 22 | if db_type == "redshift": 23 | db.run_command(db_type, conn, disable_redshift_cache) 24 | if db_type == "snowflake": 25 | db.run_command(db_type, conn, "USE DATABASE DEMO_DB;") 26 | db.run_command(db_type, conn, disable_snowflake_cache) 27 | 28 | 29 | def time_it(test_dir, style, nth): 30 | fn = os.path.join("test_queries", test_dir, style + ".sql") 31 | print("Evaluating {}".format(fn)) 32 | with 
open(fn, "r") as f: 33 | query = f.read() 34 | if db_type == "snowflake": 35 | db.run_command(db_type, conn, disable_snowflake_cache) 36 | start_time = time.time() 37 | if db_type == "redshift": 38 | query = disable_redshift_cache + query 39 | query_res = db.get_query_results(db_type, conn, query) 40 | end_time = time.time() 41 | elapsed = end_time - start_time 42 | print("Completed in: {}".format(elapsed)) 43 | res = {"test": test_dir, "style": style, "time": elapsed, "nth": nth} 44 | pd_res = pd.DataFrame.from_records([res]) 45 | return pd_res 46 | 47 | 48 | row_list = [] 49 | for i in range(1, 6): 50 | for test_q in os.listdir("test_queries"): 51 | row_list.append(time_it(test_q, "star", i)) 52 | row_list.append(time_it(test_q, "obt", i)) 53 | 54 | results = pd.concat(row_list) 55 | results["date"] = today_str 56 | if db_type == "redshift": 57 | results["cluster_type"] = db.config["redshift"]["cluster_type"] 58 | results["node_type"] = db.config["redshift"]["node_type"] 59 | results["db_type"] = db_type 60 | 61 | results.to_csv("data/{}_results.csv".format(db_type), mode="a") 62 | -------------------------------------------------------------------------------- /db.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import os 3 | import yaml 4 | import snowflake.connector 5 | from google.cloud import bigquery 6 | 7 | with open("dbconf.yml", "r") as f: 8 | config = yaml.safe_load(f) 9 | 10 | 11 | def get_connection(db_type): 12 | if db_type == "redshift": 13 | host_root = os.getenv("REDSHIFT_HOST_ROOT") 14 | identifier = config["redshift"]["cluster_identifier"] 15 | host = ".".join([identifier, host_root]) 16 | dbname = config["redshift"]["db_name"] 17 | dbname = os.getenv("REDSHIFT_DBNAME") 18 | username = os.getenv("REDSHIFT_USER") 19 | password = os.getenv("REDSHIFT_PASSWORD") 20 | port = config["redshift"]["port"] 21 | conn = psycopg2.connect( 22 | host=host, port=port, dbname=dbname, user=username,
password=password 23 | ) 24 | return conn 25 | 26 | if db_type == "snowflake": 27 | password = os.getenv("SNOWFLAKE_PASSWORD") 28 | user = os.getenv("SNOWFLAKE_USER") 29 | account = os.getenv("SNOWFLAKE_ACCOUNT") 30 | conn = snowflake.connector.connect( 31 | user=user, password=password, account=account 32 | ) 33 | return conn 34 | 35 | if db_type == "bigquery": 36 | client = bigquery.Client() 37 | return client 38 | 39 | 40 | def run_command(db_type, conn, cmd): 41 | print(cmd) 42 | if db_type == "bigquery": 43 | 44 | job_config = bigquery.QueryJobConfig() 45 | job_config.use_query_cache = False 46 | query_job = conn.query(cmd, job_config=job_config) 47 | results = query_job.result() 48 | else: 49 | cur = conn.cursor() 50 | cur.execute(cmd) 51 | conn.commit() 52 | 53 | 54 | def get_query_results(db_type, conn, qry): 55 | if db_type == "bigquery": 56 | # run the query once only, with the result cache disabled below 57 | job_config = bigquery.QueryJobConfig() 58 | job_config.use_query_cache = False 59 | query_job = conn.query(qry, job_config=job_config) 60 | results = query_job.result() 61 | else: 62 | cur = conn.cursor() 63 | cur.execute(qry) 64 | results = cur.fetchall() 65 | return results 66 | -------------------------------------------------------------------------------- /load_commands.sql: -------------------------------------------------------------------------------- 1 | drop schema public cascade; 2 | create schema public; 3 | create table public.store_sales(ss_sold_date_sk bigint, ss_sold_time_sk bigint, ss_item_sk bigint, ss_customer_sk bigint, ss_cdemo_sk bigint, ss_hdemo_sk bigint, ss_addr_sk bigint, ss_store_sk bigint, ss_promo_sk bigint, ss_ticket_number bigint, ss_quantity int, ss_wholesale_cost double precision, ss_list_price double precision, ss_sales_price double precision, ss_ext_discount_amt double precision, ss_ext_sales_price double precision, ss_ext_wholesale_cost double precision, ss_ext_list_price double precision, ss_ext_tax double precision, ss_coupon_amt double precision,
ss_net_paid double precision, ss_net_paid_inc_tax double precision, ss_net_profit double precision ) distkey(ss_item_sk) interleaved sortkey (ss_sold_date_sk, ss_store_sk, ss_hdemo_sk, ss_addr_sk); 4 | create table public.date_dim(d_date_sk bigint, d_date_id char(16), d_date varchar(10), d_month_seq int, d_week_seq int, d_quarter_seq int, d_year int, d_dow int, d_moy int, d_dom int, d_qoy int, d_fy_year int, d_fy_quarter_seq int, d_fy_week_seq int, d_day_name char(9), d_quarter_name char(6), d_holiday char(1), d_weekend char(1), d_following_holiday char(1), d_first_dom int, d_last_dom int, d_same_day_ly int, d_same_day_lq int, d_current_day char(1), d_current_week char(1), d_current_month char(1), d_current_quarter char(1), d_current_year char(1) ) diststyle all sortkey(d_date_sk); 5 | create table public.customer_address(ca_address_sk bigint, ca_address_id char(16), ca_street_number char(10), ca_street_name varchar(60), ca_street_type char(15), ca_suite_number char(10), ca_city varchar(60), ca_county varchar(30), ca_state char(2), ca_zip char(10), ca_country varchar(20), ca_gmt_offset double precision, ca_location_type char(20) ) diststyle all sortkey(ca_address_sk); 6 | create table public.customer(c_customer_sk bigint, c_customer_id char(16), c_current_cdemo_sk bigint, c_current_hdemo_sk bigint, c_current_addr_sk bigint, c_first_shipto_date_sk bigint, c_first_sales_date_sk bigint, c_salutation char(10), c_first_name char(20), c_last_name char(30), c_preferred_cust_flag char(1), c_birth_day int, c_birth_month int, c_birth_year int, c_birth_country varchar(20), c_login char(13), c_email_address char(50), c_last_review_date char(10) ) diststyle all sortkey(c_customer_sk); 7 | create table public.household_demographics(hd_demo_sk bigint, hd_income_band_sk bigint, hd_buy_potential char(15), hd_dep_count int, hd_vehicle_count int ) diststyle all sortkey(hd_demo_sk); 8 | create table public.store(s_store_sk bigint, s_store_id char(16), s_rec_start_date varchar(10), 
s_rec_end_date varchar(10), s_closed_date_sk bigint, s_store_name varchar(50), s_number_employees int, s_floor_space int, s_hours char(20), s_manager varchar(40), s_market_id int, s_geography_class varchar(100), s_market_desc varchar(100), s_market_manager varchar(40), s_division_id int, s_division_name varchar(50), s_company_id int, s_company_name varchar(50), s_street_number varchar(10), s_street_name varchar(60), s_street_type char(15), s_suite_number char(10), s_city varchar(60), s_county varchar(30), s_state char(2), s_zip char(10), s_country varchar(20), s_gmt_offset double precision, s_tax_precentage double precision ) diststyle all sortkey(s_store_sk); 9 | -------------------------------------------------------------------------------- /results.md: -------------------------------------------------------------------------------- 1 | # For Data Warehouse Performance: One Big Table or Star Schema? 2 | 3 | Many have wondered, and some have even claimed to know, which data-organization style provides the best performance in a data warehouse. The two schools of thought are, roughly: 4 | 5 | * Denormalize the data into one-big-table (OBT) so that the warehouse never has to do any joins on-the-fly 6 | * Maintain a star schema that can take advantage of sort keys on the dimension tables 7 | 8 | Before writing this blog post, I was very much on team "star schema" -- I had been taught that organizing data in a star schema was _critical_ to performance in analytic data warehouses. It turns out, I was wrong. 9 | 10 | The first sign that I was wrong was when I wrote this slack message in the [dbt](https://www.getdbt.com) slack channel: 11 | 12 | ![A screenshot of me being an idiot.](/static/slack_screenshot.png) 13 | 14 | If I had learned anything from my history of making statements that claim 100% certainty, I'd know that this is a _strong_ predictor of me being wrong and having to come back and eat crow. So here we are. 
15 | 16 | ## The Goal: Understand Warehouse Performance for BI Workloads 17 | 18 | The objective of this analysis is to understand the performance implications of these different warehouse distribution patterns under normal BI-style workloads _within a given warehouse_. That is, we aren't trying to [benchmark warehouses against each other](https://fivetran.com/blog/warehouse-benchmark) or understand their relative performance and cost tradeoffs. We want to understand how different data architecture patterns perform once you've chosen which warehouse to use. 19 | 20 | In particular, this analysis is focused on architecture patterns to support business-intelligence style workloads, not necessarily the query performance of miscellaneous, arbitrarily complex ad-hoc queries. The way many people build their warehouses today (using an ELT paradigm with a tool like [dbt](https://www.getdbt.com)), the star schema is constructed at the end of an ELT run and is explicitly designed to support BI-type queries in tools like Looker or Periscope. With that in mind, the queries we use to test these different distribution styles are not especially complex: they're intentionally designed to reflect the common sorts of queries run by a BI tool -- aggregating measures over a variety of different dimensions, occasionally with a CTE or window function thrown in. 21 | 22 | You can review the queries we used for the test [here](https://github.com/mikekaminsky/compare-warehouse-distributions/tree/master/test_queries). 23 | 24 | ## The Results: Denormalized Tables Result in Faster Query Response 25 | 26 | For all three of the warehouses we tested -- Redshift, Snowflake, and Bigquery -- using a single denormalized table instead of a star schema leads to a substantial improvement in query times. The improvement ranges from 25% to 50%, depending on which warehouse you're using.
This amounts to a difference of about 10 seconds on a single-node cluster in Redshift. Excluding Redshift query compilation time, the improvements are: 27 | 28 | * Redshift: 25%-30% (depending on node type and number of nodes) 29 | * Snowflake: ~25% 30 | * Bigquery: ~50% 31 | 32 | ### Redshift 33 | 34 | For the Redshift results, we present data from runs using both a large multi-node cluster as well as a small single-node cluster. We also split the results between the first time a query was executed (which includes the time Redshift needs to compile the query) and subsequent runs, which only include compute time. 35 | 36 | #### Single-Node 37 | First run 38 | ![dc2.large, single-node, first run](/Analysis/images/dc2.large_single-node_first.png) 39 | 40 | Subsequent runs 41 | ![dc2.large, single-node, subsequent runs](/Analysis/images/dc2.large_single-node_subsequent.png) 42 | 43 | #### Multi-Node 44 | First run 45 | ![dc2.8xlarge, multi-node, first run](/Analysis/images/dc2.8xlarge_multi-node_first.png) 46 | 47 | Subsequent runs 48 | ![dc2.8xlarge, multi-node, subsequent runs](/Analysis/images/dc2.8xlarge_multi-node_subsequent.png) 49 | 50 | Here we can see that the OBT (denormalized) model outperforms the star-schema model in all but one of the 10 queries we tested[^1]. With the exception of the query-4 enigma, the denormalized table outperforms the star schema by 10% to 45%, depending on the query. 51 | 52 | ### Snowflake 53 | 54 | For Snowflake, the results are more mixed. While the OBT (denormalized) model is definitely faster than the star schema on the slowest queries (queries 8, 9, and 10), the star schema actually appears to outperform the OBT model on some of the simpler queries (namely 3, 4, and 7). Note that these queries _include_ query compilation time.
55 | 56 | ![Snowflake query results](/Analysis/images/snowflake.png) 57 | 58 | I do not have a good enough intuition for the inner workings of Snowflake to shed any light on _why_ this might be happening, but if any of our readers are Snowflake experts we'd love to hear your hypotheses! 59 | 60 | ### Bigquery 61 | 62 | For Bigquery, the results are even more dramatic than what we saw in Redshift -- the average improvement in query response time is 49%, with the denormalized table outperforming the star schema in every category. Note that these queries _include_ query compilation time. 63 | 64 | ![Bigquery query results](/Analysis/images/bigquery.png) 65 | 66 | One thing that's interesting to note is how dramatically different the _variances_ in query response times are between the two distribution styles -- the star schema has a much higher variance in query response time, which I assume has to do with how Bigquery plans the execution under the hood (but I'm definitely not a BQ expert, so I would love someone with more knowledge to weigh in on what's going on here). 67 | 68 | ## Analysis details 69 | 70 | This comparison was made using a subset of the data from the TPC-DS benchmark, kindly [made available](https://github.com/fivetran/benchmark/) by the folks at Fivetran. For all analyses, we used the TPC-DS "100" data. 71 | 72 | * Redshift: 73 | * dc2.large with 1 node 74 | * dc2.8xlarge cluster with three nodes 75 | * Snowflake: 76 | * X-Large warehouse (16 servers) 77 | * Bigquery: 78 | * I used the default configuration that comes with a fresh warehouse 79 | 80 | We make use of the following tables: `store_sales`, `date_dim`, `store`, `household_demographics`, and `customer_address`. 81 | 82 | For the star schema, I kept these tables as-is. In Redshift, I distributed the fact table by `ss_item_sk` and distributed the dimension tables across all nodes (`diststyle all`).
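For reference, the repo's Redshift load script declares these distribution choices directly in the DDL; an abbreviated sketch (full column lists omitted -- see the load script in the repo for the complete statements):

```sql
-- Fact table: distributed by ss_item_sk, interleaved sort on the join keys
create table public.store_sales (
    ss_sold_date_sk bigint,
    ss_item_sk bigint,
    -- ... remaining fact columns ...
    ss_net_profit double precision
)
distkey(ss_item_sk)
interleaved sortkey (ss_sold_date_sk, ss_store_sk, ss_hdemo_sk, ss_addr_sk);

-- Dimension tables are small, so a full copy lives on every node
create table public.date_dim (
    d_date_sk bigint
    -- ... remaining dimension columns ...
)
diststyle all
sortkey(d_date_sk);
```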
83 | 84 | For the timing test, we disable query caching mechanisms according to [these docs for Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_enable_result_cache_for_session.html), [these docs for Bigquery](https://cloud.google.com/bigquery/docs/cached-results), and [these docs for Snowflake](https://docs.snowflake.net/manuals/sql-reference/parameters.html#use-cached-result). 85 | 86 | For Snowflake, we exclude each query's initial run, which serves to "warm the cache" by reading the data from S3 onto the SSDs that Snowflake uses for caching. So while the _results_ of the query aren't being cached directly, we do want to assess Snowflake under circumstances where the data has already been read off of S3. You can read more about [Snowflake caching here](https://www.analytics.today/blog/caching-in-snowflake-data-warehouse). 87 | 88 | For the denormalized tables, I just do a simple join to bring everything together: 89 | 90 | ``` 91 | SELECT * 92 | FROM public.store_sales 93 | LEFT JOIN public.date_dim 94 | ON store_sales.ss_sold_date_sk = date_dim.d_date_sk 95 | LEFT JOIN public.store 96 | ON store_sales.ss_store_sk = store.s_store_sk 97 | LEFT JOIN public.household_demographics 98 | ON store_sales.ss_hdemo_sk = household_demographics.hd_demo_sk 99 | LEFT JOIN public.customer_address 100 | ON store_sales.ss_addr_sk = customer_address.ca_address_sk 101 | ``` 102 | 103 | All of the code to reproduce the analysis can be found in [this repo](https://github.com/mikekaminsky/compare-warehouse-distributions). 104 | 105 | ## Other considerations 106 | 107 | There are a few reasons why you might still want to consider using the star schema (or something like it): 108 | 109 | * The star schema promotes better ELT / ETL code conceptualization and organization. 110 | * The star schema is easier for end-users (analysts and other query-writers) to navigate. 111 | * The star schema takes up less disk space.
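On the disk-space point, Redshift makes the tradeoff easy to measure via the `svv_table_info` system view (per the Redshift docs, `size` is reported in 1 MB blocks); a sketch:

```sql
-- Per-table on-disk size in Redshift; "size" counts 1 MB blocks
SELECT "table" AS tablename,
       size    AS megabytes
FROM svv_table_info
WHERE schema = 'public'
ORDER BY size;
```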
112 | 113 | While the first two concerns are important, I think they can be handled pretty easily by staging your ELT process such that the data all get transformed into something like a star schema before everything gets re-joined back together for end-user querying. 114 | 115 | The third point deserves more consideration, especially in a data warehouse like Redshift -- materializing the denormalized table takes up a significant amount of disk space on the cluster. Simply materializing the table bumped the disk-space usage up from a bit over 30 gigabytes to over 90. 116 | 117 | ``` 118 | tablename | megabytes 119 | ------------------------+------- 120 | household_demographics | 8 121 | date_dim | 31 122 | store | 32 123 | customer_address | 56 124 | store_sales | 29778 125 | one_big_table | 60250 126 | ``` 127 | 128 | And this is only a subset of the data we could have joined to `store_sales`! In fact, when I initially started on this analysis I wanted to join all of the possible dimensions onto `store_sales` but couldn't, because Redshift ran out of disk space (on a dc2.large cluster with one node). 129 | 130 | Depending on the scale of your data, the storage cost of duplicating all of the dimensions on disk could just be too high[^2]. 131 | 132 | ## Get in touch 133 | 134 | If you have questions or thoughts on this analysis, I'd love to hear them. You can reach me via email at [kaminsky.michael@gmail.com](mailto:kaminsky.michael@gmail.com) or you can find me at my other blog, locallyoptimistic.com. 135 | 136 | 137 | [^1]: Determining why the star schema outperforms the denormalized table on query 4 in the single-node cluster (but _not_ the multi-node cluster) is left as an exercise for the reader. Mostly because I have no idea. 138 | 139 | [^2]: Because dbt doesn't have the ability to specify column compression or encoding style in Redshift, this is probably the worst possible case in terms of disk storage size.
I suspect that with proper column encoding you could alleviate a fair amount of this issue. 140 | 141 | 142 | -------------------------------------------------------------------------------- /load_snowflake.sql: -------------------------------------------------------------------------------- 1 | drop table if exists public.call_center; 2 | drop table if exists public.catalog_page; 3 | drop table if exists public.catalog_returns; 4 | drop table if exists public.catalog_sales ; 5 | drop table if exists public.customer_address ; 6 | drop table if exists public.customer_demographics ; 7 | drop table if exists public.customer ; 8 | drop table if exists public.date_dim ; 9 | drop table if exists public.household_demographics ; 10 | drop table if exists public.income_band; 11 | drop table if exists public.inventory ; 12 | drop table if exists public.item ; 13 | drop table if exists public.promotion ; 14 | drop table if exists public.reason; 15 | drop table if exists public.ship_mode; 16 | drop table if exists public.store_returns ; 17 | drop table if exists public.store_sales ; 18 | drop table if exists public.store ; 19 | drop table if exists public.time_dim ; 20 | drop table if exists public.warehouse; 21 | drop table if exists public.web_page; 22 | drop table if exists public.web_returns ; 23 | drop table if exists public.web_sales ; 24 | drop table if exists public.web_site ; 25 | 26 | create table public.call_center( 27 | cc_call_center_sk bigint 28 | , cc_call_center_id varchar 29 | , cc_rec_start_date varchar 30 | , cc_rec_end_date varchar 31 | , cc_closed_date_sk bigint 32 | , cc_open_date_sk bigint 33 | , cc_name varchar 34 | , cc_class varchar 35 | , cc_employees int 36 | , cc_sq_ft int 37 | , cc_hours varchar 38 | , cc_manager varchar 39 | , cc_mkt_id int 40 | , cc_mkt_class varchar 41 | , cc_mkt_desc varchar 42 | , cc_market_manager varchar 43 | , cc_division int 44 | , cc_division_name varchar 45 | , cc_company int 46 | , cc_company_name varchar 47 | , 
cc_street_number varchar 48 | , cc_street_name varchar 49 | , cc_street_type varchar 50 | , cc_suite_number varchar 51 | , cc_city varchar 52 | , cc_county varchar 53 | , cc_state varchar 54 | , cc_zip varchar 55 | , cc_country varchar 56 | , cc_gmt_offset double precision 57 | , cc_tax_percentage double precision 58 | ); 59 | 60 | create table public.catalog_page( 61 | cp_catalog_page_sk bigint 62 | , cp_catalog_page_id varchar 63 | , cp_start_date_sk bigint 64 | , cp_end_date_sk bigint 65 | , cp_department varchar 66 | , cp_catalog_number int 67 | , cp_catalog_page_number int 68 | , cp_description varchar 69 | , cp_type varchar 70 | ); 71 | 72 | create table public.catalog_returns( 73 | cr_returned_date_sk bigint, 74 | cr_returned_time_sk bigint, 75 | cr_item_sk bigint, 76 | cr_refunded_customer_sk bigint, 77 | cr_refunded_cdemo_sk bigint, 78 | cr_refunded_hdemo_sk bigint, 79 | cr_refunded_addr_sk bigint, 80 | cr_returning_customer_sk bigint, 81 | cr_returning_cdemo_sk bigint, 82 | cr_returning_hdemo_sk bigint, 83 | cr_returning_addr_sk bigint, 84 | cr_call_center_sk bigint, 85 | cr_catalog_page_sk bigint, 86 | cr_ship_mode_sk bigint, 87 | cr_warehouse_sk bigint, 88 | cr_reason_sk bigint, 89 | cr_order_number bigint, 90 | cr_return_quantity int, 91 | cr_return_amount double precision, 92 | cr_return_tax double precision, 93 | cr_return_amt_inc_tax double precision, 94 | cr_fee double precision, 95 | cr_return_ship_cost double precision, 96 | cr_refunded_cash double precision, 97 | cr_reversed_charge double precision, 98 | cr_store_credit double precision, 99 | cr_net_loss double precision 100 | ); 101 | 102 | create table public.catalog_sales ( 103 | cs_sold_date_sk bigint, 104 | cs_sold_time_sk bigint, 105 | cs_ship_date_sk bigint, 106 | cs_bill_customer_sk bigint, 107 | cs_bill_cdemo_sk bigint, 108 | cs_bill_hdemo_sk bigint, 109 | cs_bill_addr_sk bigint, 110 | cs_ship_customer_sk bigint, 111 | cs_ship_cdemo_sk bigint, 112 | cs_ship_hdemo_sk bigint, 113 | 
cs_ship_addr_sk bigint, 114 | cs_call_center_sk bigint, 115 | cs_catalog_page_sk bigint, 116 | cs_ship_mode_sk bigint, 117 | cs_warehouse_sk bigint, 118 | cs_item_sk bigint, 119 | cs_promo_sk bigint, 120 | cs_order_number bigint, 121 | cs_quantity int, 122 | cs_wholesale_cost double precision, 123 | cs_list_price double precision, 124 | cs_sales_price double precision, 125 | cs_ext_discount_amt double precision, 126 | cs_ext_sales_price double precision, 127 | cs_ext_wholesale_cost double precision, 128 | cs_ext_list_price double precision, 129 | cs_ext_tax double precision, 130 | cs_coupon_amt double precision, 131 | cs_ext_ship_cost double precision, 132 | cs_net_paid double precision, 133 | cs_net_paid_inc_tax double precision, 134 | cs_net_paid_inc_ship double precision, 135 | cs_net_paid_inc_ship_tax double precision, 136 | cs_net_profit double precision 137 | ); 138 | 139 | create table public.customer_address ( 140 | ca_address_sk bigint, 141 | ca_address_id varchar, 142 | ca_street_number varchar, 143 | ca_street_name varchar, 144 | ca_street_type varchar, 145 | ca_suite_number varchar, 146 | ca_city varchar, 147 | ca_county varchar, 148 | ca_state varchar, 149 | ca_zip varchar, 150 | ca_country varchar, 151 | ca_gmt_offset double precision, 152 | ca_location_type varchar 153 | ); 154 | 155 | create table public.customer_demographics ( 156 | cd_demo_sk bigint, 157 | cd_gender varchar, 158 | cd_marital_status varchar, 159 | cd_education_status varchar, 160 | cd_purchase_estimate int, 161 | cd_credit_rating varchar, 162 | cd_dep_count int, 163 | cd_dep_employed_count int, 164 | cd_dep_college_count int 165 | ); 166 | 167 | create table public.customer ( 168 | c_customer_sk bigint, 169 | c_customer_id varchar, 170 | c_current_cdemo_sk bigint, 171 | c_current_hdemo_sk bigint, 172 | c_current_addr_sk bigint, 173 | c_first_shipto_date_sk bigint, 174 | c_first_sales_date_sk bigint, 175 | c_salutation varchar, 176 | c_first_name varchar, 177 | c_last_name varchar, 
178 | c_preferred_cust_flag varchar, 179 | c_birth_day int, 180 | c_birth_month int, 181 | c_birth_year int, 182 | c_birth_country varchar, 183 | c_login varchar, 184 | c_email_address varchar, 185 | c_last_review_date varchar 186 | ); 187 | 188 | create table public.date_dim ( 189 | d_date_sk bigint, 190 | d_date_id varchar, 191 | d_date varchar, 192 | d_month_seq int, 193 | d_week_seq int, 194 | d_quarter_seq int, 195 | d_year int, 196 | d_dow int, 197 | d_moy int, 198 | d_dom int, 199 | d_qoy int, 200 | d_fy_year int, 201 | d_fy_quarter_seq int, 202 | d_fy_week_seq int, 203 | d_day_name varchar, 204 | d_quarter_name varchar, 205 | d_holiday varchar, 206 | d_weekend varchar, 207 | d_following_holiday varchar, 208 | d_first_dom int, 209 | d_last_dom int, 210 | d_same_day_ly int, 211 | d_same_day_lq int, 212 | d_current_day varchar, 213 | d_current_week varchar, 214 | d_current_month varchar, 215 | d_current_quarter varchar, 216 | d_current_year varchar 217 | ); 218 | 219 | create table public.household_demographics ( 220 | hd_demo_sk bigint, 221 | hd_income_band_sk bigint, 222 | hd_buy_potential varchar, 223 | hd_dep_count int, 224 | hd_vehicle_count int 225 | ); 226 | 227 | create table public.income_band( 228 | ib_income_band_sk bigint 229 | , ib_lower_bound int 230 | , ib_upper_bound int 231 | ); 232 | 233 | create table public.inventory ( 234 | inv_date_sk bigint, 235 | inv_item_sk bigint, 236 | inv_warehouse_sk bigint, 237 | inv_quantity_on_hand int 238 | ); 239 | 240 | create table public.item ( 241 | i_item_sk bigint, 242 | i_item_id varchar, 243 | i_rec_start_date varchar, 244 | i_rec_end_date varchar, 245 | i_item_desc varchar, 246 | i_current_price double precision, 247 | i_wholesale_cost double precision, 248 | i_brand_id int, 249 | i_brand varchar, 250 | i_class_id int, 251 | i_class varchar, 252 | i_category_id int, 253 | i_category varchar, 254 | i_manufact_id int, 255 | i_manufact varchar, 256 | i_size varchar, 257 | i_formulation varchar, 258 | 
i_color varchar, 259 | i_units varchar, 260 | i_container varchar, 261 | i_manager_id int, 262 | i_product_name varchar 263 | ); 264 | 265 | create table public.promotion ( 266 | p_promo_sk bigint, 267 | p_promo_id varchar, 268 | p_start_date_sk bigint, 269 | p_end_date_sk bigint, 270 | p_item_sk bigint, 271 | p_cost double precision, 272 | p_response_target int, 273 | p_promo_name varchar, 274 | p_channel_dmail varchar, 275 | p_channel_email varchar, 276 | p_channel_catalog varchar, 277 | p_channel_tv varchar, 278 | p_channel_radio varchar, 279 | p_channel_press varchar, 280 | p_channel_event varchar, 281 | p_channel_demo varchar, 282 | p_channel_details varchar, 283 | p_purpose varchar, 284 | p_discount_active varchar 285 | ); 286 | 287 | create table public.reason( 288 | r_reason_sk bigint 289 | , r_reason_id varchar 290 | , r_reason_desc varchar 291 | ); 292 | 293 | create table public.ship_mode( 294 | sm_ship_mode_sk bigint 295 | , sm_ship_mode_id varchar 296 | , sm_type varchar 297 | , sm_code varchar 298 | , sm_carrier varchar 299 | , sm_contract varchar 300 | ); 301 | 302 | create table public.store_returns ( 303 | sr_returned_date_sk bigint, 304 | sr_return_time_sk bigint, 305 | sr_item_sk bigint, 306 | sr_customer_sk bigint, 307 | sr_cdemo_sk bigint, 308 | sr_hdemo_sk bigint, 309 | sr_addr_sk bigint, 310 | sr_store_sk bigint, 311 | sr_reason_sk bigint, 312 | sr_ticket_number bigint, 313 | sr_return_quantity int, 314 | sr_return_amt double precision, 315 | sr_return_tax double precision, 316 | sr_return_amt_inc_tax double precision, 317 | sr_fee double precision, 318 | sr_return_ship_cost double precision, 319 | sr_refunded_cash double precision, 320 | sr_reversed_charge double precision, 321 | sr_store_credit double precision, 322 | sr_net_loss double precision 323 | ); 324 | 325 | create table public.store_sales ( 326 | ss_sold_date_sk bigint, 327 | ss_sold_time_sk bigint, 328 | ss_item_sk bigint, 329 | ss_customer_sk bigint, 330 | ss_cdemo_sk bigint, 
331 | ss_hdemo_sk bigint, 332 | ss_addr_sk bigint, 333 | ss_store_sk bigint, 334 | ss_promo_sk bigint, 335 | ss_ticket_number bigint, 336 | ss_quantity int, 337 | ss_wholesale_cost double precision, 338 | ss_list_price double precision, 339 | ss_sales_price double precision, 340 | ss_ext_discount_amt double precision, 341 | ss_ext_sales_price double precision, 342 | ss_ext_wholesale_cost double precision, 343 | ss_ext_list_price double precision, 344 | ss_ext_tax double precision, 345 | ss_coupon_amt double precision, 346 | ss_net_paid double precision, 347 | ss_net_paid_inc_tax double precision, 348 | ss_net_profit double precision 349 | ); 350 | 351 | create table public.store ( 352 | s_store_sk bigint, 353 | s_store_id varchar, 354 | s_rec_start_date varchar, 355 | s_rec_end_date varchar, 356 | s_closed_date_sk bigint, 357 | s_store_name varchar, 358 | s_number_employees int, 359 | s_floor_space int, 360 | s_hours varchar, 361 | s_manager varchar, 362 | s_market_id int, 363 | s_geography_class varchar, 364 | s_market_desc varchar, 365 | s_market_manager varchar, 366 | s_division_id int, 367 | s_division_name varchar, 368 | s_company_id int, 369 | s_company_name varchar, 370 | s_street_number varchar, 371 | s_street_name varchar, 372 | s_street_type varchar, 373 | s_suite_number varchar, 374 | s_city varchar, 375 | s_county varchar, 376 | s_state varchar, 377 | s_zip varchar, 378 | s_country varchar, 379 | s_gmt_offset double precision, 380 | s_tax_precentage double precision 381 | ); 382 | 383 | create table public.time_dim ( 384 | t_time_sk bigint, 385 | t_time_id varchar, 386 | t_time int, 387 | t_hour int, 388 | t_minute int, 389 | t_second int, 390 | t_am_pm varchar, 391 | t_shift varchar, 392 | t_sub_shift varchar, 393 | t_meal_time varchar 394 | ); 395 | 396 | create table public.warehouse( 397 | w_warehouse_sk bigint 398 | , w_warehouse_id varchar 399 | , w_warehouse_name varchar 400 | , w_warehouse_sq_ft int 401 | , w_street_number varchar 402 | , 
w_street_name varchar 403 | , w_street_type varchar 404 | , w_suite_number varchar 405 | , w_city varchar 406 | , w_county varchar 407 | , w_state varchar 408 | , w_zip varchar 409 | , w_country varchar 410 | , w_gmt_offset double precision 411 | ); 412 | 413 | create table public.web_page( 414 | wp_web_page_sk bigint 415 | , wp_web_page_id varchar 416 | , wp_rec_start_date varchar 417 | , wp_rec_end_date varchar 418 | , wp_creation_date_sk bigint 419 | , wp_access_date_sk bigint 420 | , wp_autogen_flag varchar 421 | , wp_customer_sk bigint 422 | , wp_url varchar 423 | , wp_type varchar 424 | , wp_char_count int 425 | , wp_link_count int 426 | , wp_image_count int 427 | , wp_max_ad_count int 428 | ); 429 | 430 | create table public.web_returns ( 431 | wr_returned_date_sk bigint, 432 | wr_returned_time_sk bigint, 433 | wr_item_sk bigint, 434 | wr_refunded_customer_sk bigint, 435 | wr_refunded_cdemo_sk bigint, 436 | wr_refunded_hdemo_sk bigint, 437 | wr_refunded_addr_sk bigint, 438 | wr_returning_customer_sk bigint, 439 | wr_returning_cdemo_sk bigint, 440 | wr_returning_hdemo_sk bigint, 441 | wr_returning_addr_sk bigint, 442 | wr_web_page_sk bigint, 443 | wr_reason_sk bigint, 444 | wr_order_number bigint, 445 | wr_return_quantity int, 446 | wr_return_amt double precision, 447 | wr_return_tax double precision, 448 | wr_return_amt_inc_tax double precision, 449 | wr_fee double precision, 450 | wr_return_ship_cost double precision, 451 | wr_refunded_cash double precision, 452 | wr_reversed_charge double precision, 453 | wr_account_credit double precision, 454 | wr_net_loss double precision 455 | ); 456 | 457 | create table public.web_sales ( 458 | ws_sold_date_sk bigint, 459 | ws_sold_time_sk bigint, 460 | ws_ship_date_sk bigint, 461 | ws_item_sk bigint, 462 | ws_bill_customer_sk bigint, 463 | ws_bill_cdemo_sk bigint, 464 | ws_bill_hdemo_sk bigint, 465 | ws_bill_addr_sk bigint, 466 | ws_ship_customer_sk bigint, 467 | ws_ship_cdemo_sk bigint, 468 | ws_ship_hdemo_sk 
bigint, 469 | ws_ship_addr_sk bigint, 470 | ws_web_page_sk bigint, 471 | ws_web_site_sk bigint, 472 | ws_ship_mode_sk bigint, 473 | ws_warehouse_sk bigint, 474 | ws_promo_sk bigint, 475 | ws_order_number bigint, 476 | ws_quantity int, 477 | ws_wholesale_cost double precision, 478 | ws_list_price double precision, 479 | ws_sales_price double precision, 480 | ws_ext_discount_amt double precision, 481 | ws_ext_sales_price double precision, 482 | ws_ext_wholesale_cost double precision, 483 | ws_ext_list_price double precision, 484 | ws_ext_tax double precision, 485 | ws_coupon_amt double precision, 486 | ws_ext_ship_cost double precision, 487 | ws_net_paid double precision, 488 | ws_net_paid_inc_tax double precision, 489 | ws_net_paid_inc_ship double precision, 490 | ws_net_paid_inc_ship_tax double precision, 491 | ws_net_profit double precision 492 | ); 493 | 494 | create table public.web_site ( 495 | web_site_sk bigint, 496 | web_site_id varchar, 497 | web_rec_start_date varchar, 498 | web_rec_end_date varchar, 499 | web_name varchar, 500 | web_open_date_sk bigint, 501 | web_close_date_sk bigint, 502 | web_class varchar, 503 | web_manager varchar, 504 | web_mkt_id int, 505 | web_mkt_class varchar, 506 | web_mkt_desc varchar, 507 | web_market_manager varchar, 508 | web_company_id int, 509 | web_company_name varchar, 510 | web_street_number varchar, 511 | web_street_name varchar, 512 | web_street_type varchar, 513 | web_suite_number varchar, 514 | web_city varchar, 515 | web_county varchar, 516 | web_state varchar, 517 | web_zip varchar, 518 | web_country varchar, 519 | web_gmt_offset double precision, 520 | web_tax_percentage double precision 521 | ); 522 | 523 | copy into public.call_center from 's3://fivetran-benchmark/tpcds_100_dat/call_center/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 524 | copy into public.catalog_page from 's3://fivetran-benchmark/tpcds_100_dat/catalog_page/' file_format = (type = 
csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 525 | copy into public.catalog_returns from 's3://fivetran-benchmark/tpcds_100_dat/catalog_returns/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 526 | copy into public.catalog_sales from 's3://fivetran-benchmark/tpcds_100_dat/catalog_sales/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 527 | copy into public.customer from 's3://fivetran-benchmark/tpcds_100_dat/customer/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 528 | copy into public.customer_address from 's3://fivetran-benchmark/tpcds_100_dat/customer_address/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 529 | copy into public.customer_demographics from 's3://fivetran-benchmark/tpcds_100_dat/customer_demographics/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 530 | copy into public.date_dim from 's3://fivetran-benchmark/tpcds_100_dat/date_dim/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 531 | copy into public.household_demographics from 's3://fivetran-benchmark/tpcds_100_dat/household_demographics/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 532 | copy into public.income_band from 's3://fivetran-benchmark/tpcds_100_dat/income_band/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 533 | copy into public.inventory from 's3://fivetran-benchmark/tpcds_100_dat/inventory/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 534 | copy into public.item from 
's3://fivetran-benchmark/tpcds_100_dat/item/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 535 | copy into public.promotion from 's3://fivetran-benchmark/tpcds_100_dat/promotion/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 536 | copy into public.reason from 's3://fivetran-benchmark/tpcds_100_dat/reason/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 537 | copy into public.ship_mode from 's3://fivetran-benchmark/tpcds_100_dat/ship_mode/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 538 | copy into public.store from 's3://fivetran-benchmark/tpcds_100_dat/store/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 539 | copy into public.store_returns from 's3://fivetran-benchmark/tpcds_100_dat/store_returns/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 540 | copy into public.store_sales from 's3://fivetran-benchmark/tpcds_100_dat/store_sales/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 541 | copy into public.time_dim from 's3://fivetran-benchmark/tpcds_100_dat/time_dim/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 542 | copy into public.warehouse from 's3://fivetran-benchmark/tpcds_100_dat/warehouse/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 543 | copy into public.web_page from 's3://fivetran-benchmark/tpcds_100_dat/web_page/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 544 | copy into public.web_returns from 's3://fivetran-benchmark/tpcds_100_dat/web_returns/' 
file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 545 | copy into public.web_sales from 's3://fivetran-benchmark/tpcds_100_dat/web_sales/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 546 | copy into public.web_site from 's3://fivetran-benchmark/tpcds_100_dat/web_site/' file_format = (type = csv field_delimiter='|' error_on_column_count_mismatch=false VALIDATE_UTF8=false ); 547 | --------------------------------------------------------------------------------