├── .github └── workflows │ ├── cd.yml │ ├── ci.yml │ └── ci_teardown.yml ├── .gitignore ├── .sqlfluff ├── README.md ├── data-tests ├── assert_total_payment_amount_is_positive.sql └── generic │ ├── is_even.sql │ └── is_odd.sql ├── dbt-requirements.txt ├── dbt_project.yml ├── macros ├── cents_to_dollars.sql └── drop_pr_staging_schemas.sql ├── models └── jaffle-shop │ ├── marts │ ├── agg_orders.sql │ ├── agg_orders.yml │ ├── dim_customers.sql │ ├── dim_customers.yml │ ├── fact_orders.sql │ └── fact_orders.yml │ └── staging │ ├── _sources.yml │ ├── stg_customers.sql │ ├── stg_customers.yml │ ├── stg_orders.sql │ ├── stg_orders.yml │ ├── stg_payments.sql │ └── stg_payments.yml ├── packages.yml ├── profiles.yml ├── seeds └── country_codes.csv └── snapshots └── snapshot_customers.sql /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: CD_action 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | CD_job: 10 | runs-on: ubuntu-latest 11 | 12 | env: 13 | GCLOUD_SERVICE_KEY: ${{ secrets.GCLOUD_SERVICE_KEY }} 14 | GCLOUD_STORAGE_PATH_MANIFEST: ${{ secrets.GCLOUD_STORAGE_PATH_MANIFEST }} 15 | DBT_ENV_SECRET_TYPE: ${{ secrets.DBT_ENV_SECRET_TYPE }} 16 | DBT_ENV_SECRET_PROJECT_ID: ${{ secrets.DBT_ENV_SECRET_PROJECT_ID }} 17 | DBT_ENV_SECRET_PRIVATE_KEY_ID: ${{ secrets.DBT_ENV_SECRET_PRIVATE_KEY_ID }} 18 | DBT_ENV_SECRET_PRIVATE_KEY: ${{ secrets.DBT_ENV_SECRET_PRIVATE_KEY }} 19 | DBT_ENV_SECRET_CLIENT_EMAIL: ${{ secrets.DBT_ENV_SECRET_CLIENT_EMAIL }} 20 | DBT_ENV_SECRET_CLIENT_ID: ${{ secrets.DBT_ENV_SECRET_CLIENT_ID }} 21 | DBT_ENV_SECRET_AUTH_URI: ${{ secrets.DBT_ENV_SECRET_AUTH_URI }} 22 | DBT_ENV_SECRET_TOKEN_URI: ${{ secrets.DBT_ENV_SECRET_TOKEN_URI }} 23 | DBT_ENV_SECRET_AUTH_PROVIDER_X509_CERT_URL: ${{ secrets.DBT_ENV_SECRET_AUTH_PROVIDER_X509_CERT_URL }} 24 | DBT_ENV_SECRET_CLIENT_X509_CERT_URL: ${{ secrets.DBT_ENV_SECRET_CLIENT_X509_CERT_URL }} 25 | 26 | steps: 27 | - name: Checkout repository 
28 | uses: actions/checkout@v3 29 | 30 | - name: Set up Python 31 | uses: actions/setup-python@v3 32 | with: 33 | python-version: '3.12' 34 | 35 | - name: Install dependencies 36 | run: pip install -r dbt-requirements.txt 37 | 38 | - name: Authenticate to Google Cloud 39 | uses: google-github-actions/auth@v0.4.3 40 | with: 41 | credentials_json: ${{ secrets.GCLOUD_SERVICE_KEY }} 42 | 43 | - name: Set up Cloud SDK 44 | uses: google-github-actions/setup-gcloud@v1 45 | with: 46 | project_id: ${{ secrets.DBT_ENV_SECRET_PROJECT_ID }} 47 | 48 | - name: Copy manifest.json from Google Cloud Storage 49 | run: | 50 | gsutil cp gs://${{ env.GCLOUD_STORAGE_PATH_MANIFEST }}/manifest.json ./ || echo "Manifest not found" 51 | 52 | - name: Run dbt debug 53 | run: dbt debug --target prod 54 | 55 | - name: Run dbt deps 56 | run: dbt deps --target prod 57 | 58 | - name: Run dbt build 59 | run: | 60 | if [ -f "./manifest.json" ]; then 61 | dbt build -s 'state:modified+' --state ./ --target prod 62 | else 63 | dbt build --target prod 64 | fi 65 | 66 | - name: Copy new manifest.json to Google Cloud Storage 67 | run: | 68 | gsutil cp ./target/manifest.json gs://${{ env.GCLOUD_STORAGE_PATH_MANIFEST }}/ -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI_action 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | CI_job: 10 | runs-on: ubuntu-latest 11 | 12 | env: 13 | 14 | GCLOUD_SERVICE_KEY: ${{ secrets.GCLOUD_SERVICE_KEY }} 15 | GCLOUD_STORAGE_PATH_MANIFEST: ${{ secrets.GCLOUD_STORAGE_PATH_MANIFEST }} 16 | DBT_ENV_SECRET_TYPE: ${{ secrets.DBT_ENV_SECRET_TYPE }} 17 | DBT_ENV_SECRET_PROJECT_ID: ${{ secrets.DBT_ENV_SECRET_PROJECT_ID }} 18 | DBT_ENV_SECRET_PRIVATE_KEY_ID: ${{ secrets.DBT_ENV_SECRET_PRIVATE_KEY_ID }} 19 | DBT_ENV_SECRET_PRIVATE_KEY: ${{ secrets.DBT_ENV_SECRET_PRIVATE_KEY }} 20 | DBT_ENV_SECRET_CLIENT_EMAIL: 
${{ secrets.DBT_ENV_SECRET_CLIENT_EMAIL }} 21 | DBT_ENV_SECRET_CLIENT_ID: ${{ secrets.DBT_ENV_SECRET_CLIENT_ID }} 22 | DBT_ENV_SECRET_AUTH_URI: ${{ secrets.DBT_ENV_SECRET_AUTH_URI }} 23 | DBT_ENV_SECRET_TOKEN_URI: ${{ secrets.DBT_ENV_SECRET_TOKEN_URI }} 24 | DBT_ENV_SECRET_AUTH_PROVIDER_X509_CERT_URL: ${{ secrets.DBT_ENV_SECRET_AUTH_PROVIDER_X509_CERT_URL }} 25 | DBT_ENV_SECRET_CLIENT_X509_CERT_URL: ${{ secrets.DBT_ENV_SECRET_CLIENT_X509_CERT_URL }} 26 | 27 | steps: 28 | - name: Checkout repository 29 | uses: actions/checkout@v3 30 | 31 | - name: Set up Python 32 | uses: actions/setup-python@v3 33 | with: 34 | python-version: '3.12' 35 | 36 | - name: Install dependencies 37 | run: pip install -r dbt-requirements.txt 38 | 39 | - name: Authenticate to Google Cloud 40 | uses: google-github-actions/auth@v0.4.3 41 | with: 42 | credentials_json: ${{ secrets.GCLOUD_SERVICE_KEY }} 43 | 44 | - name: Set up Cloud SDK 45 | uses: google-github-actions/setup-gcloud@v1 46 | with: 47 | project_id: ${{ secrets.DBT_ENV_SECRET_PROJECT_ID }} 48 | 49 | - name: Copy manifest.json from Google Cloud Storage 50 | run: | 51 | gsutil cp gs://${{ env.GCLOUD_STORAGE_PATH_MANIFEST }}/manifest.json ./ || echo "Manifest not found" 52 | 53 | - name: Get Schema ID 54 | id: schema_id 55 | run: echo "SCHEMA_ID=${{ github.event.pull_request.number }}__${{ github.sha }}" >> $GITHUB_ENV 56 | 57 | - name: Run dbt debug 58 | run: | 59 | dbt debug --target pr --vars "schema_id: $SCHEMA_ID" 60 | 61 | - name: Run dbt deps 62 | run: | 63 | dbt deps --target pr --vars "schema_id: $SCHEMA_ID" 64 | 65 | - name: Run dbt build 66 | run: | 67 | if [ -f "./manifest.json" ]; then 68 | dbt build -s 'state:modified+' --defer --state ./ --target pr --vars "schema_id: $SCHEMA_ID" 69 | else 70 | dbt build --target pr --vars "schema_id: $SCHEMA_ID" 71 | fi 72 | -------------------------------------------------------------------------------- /.github/workflows/ci_teardown.yml: 
-------------------------------------------------------------------------------- 1 | name: CI schema teardown on PR close 2 | 3 | on: 4 | pull_request: 5 | types: 6 | - closed 7 | 8 | jobs: 9 | CI_TEARDOWN_job: 10 | runs-on: ubuntu-latest 11 | 12 | env: 13 | GCLOUD_SERVICE_KEY: ${{ secrets.GCLOUD_SERVICE_KEY }} 14 | GCLOUD_STORAGE_PATH_MANIFEST: ${{ secrets.GCLOUD_STORAGE_PATH_MANIFEST }} 15 | DBT_ENV_SECRET_TYPE: ${{ secrets.DBT_ENV_SECRET_TYPE }} 16 | DBT_ENV_SECRET_PROJECT_ID: ${{ secrets.DBT_ENV_SECRET_PROJECT_ID }} 17 | DBT_ENV_SECRET_PRIVATE_KEY_ID: ${{ secrets.DBT_ENV_SECRET_PRIVATE_KEY_ID }} 18 | DBT_ENV_SECRET_PRIVATE_KEY: ${{ secrets.DBT_ENV_SECRET_PRIVATE_KEY }} 19 | DBT_ENV_SECRET_CLIENT_EMAIL: ${{ secrets.DBT_ENV_SECRET_CLIENT_EMAIL }} 20 | DBT_ENV_SECRET_CLIENT_ID: ${{ secrets.DBT_ENV_SECRET_CLIENT_ID }} 21 | DBT_ENV_SECRET_AUTH_URI: ${{ secrets.DBT_ENV_SECRET_AUTH_URI }} 22 | DBT_ENV_SECRET_TOKEN_URI: ${{ secrets.DBT_ENV_SECRET_TOKEN_URI }} 23 | DBT_ENV_SECRET_AUTH_PROVIDER_X509_CERT_URL: ${{ secrets.DBT_ENV_SECRET_AUTH_PROVIDER_X509_CERT_URL }} 24 | DBT_ENV_SECRET_CLIENT_X509_CERT_URL: ${{ secrets.DBT_ENV_SECRET_CLIENT_X509_CERT_URL }} 25 | 26 | steps: 27 | - name: checkout 28 | uses: actions/checkout@v3 29 | 30 | - name: Set up Python 31 | uses: actions/setup-python@v3 32 | with: 33 | python-version: '3.12' 34 | 35 | - name: Install dependencies 36 | run: pip install -r dbt-requirements.txt 37 | 38 | - name: Authenticate to Google Cloud 39 | uses: google-github-actions/auth@v0.4.3 40 | with: 41 | credentials_json: ${{ secrets.GCLOUD_SERVICE_KEY }} 42 | 43 | - name: Set up Cloud SDK 44 | uses: google-github-actions/setup-gcloud@v1 45 | with: 46 | project_id: ${{ secrets.DBT_ENV_SECRET_PROJECT_ID }} 47 | 48 | - name: Get PR NUM 49 | id: schema_id 50 | run: echo "PR_NUM=${{ github.event.number }}" >> $GITHUB_ENV 51 | 52 | - name: dbt deps 53 | run: dbt deps 54 | 55 | - name: drop PR schemas 56 | run: dbt run-operation drop_pr_staging_schemas --args 
"{'project_id': '${DBT_ENV_SECRET_PROJECT_ID}', 'PR_number': '${PR_NUM}'}" --profiles-dir ./ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_packages/ 3 | logs/ 4 | .user.yml 5 | package-lock.yml 6 | venv/ 7 | *.DS_Store -------------------------------------------------------------------------------- /.sqlfluff: -------------------------------------------------------------------------------- 1 | [sqlfluff] 2 | dialect = trino 3 | templater = jinja 4 | sql_file_exists = .sql 5 | exclude_rules = ambiguous.column_count, structure.column_order, ambiguous.join, LT01 6 | max_line_length = 120 7 | 8 | [sqlfluff:templater:dbt] 9 | project_dir = ./ 10 | 11 | [sqlfluff:layout:type:comma] 12 | line_position = leading 13 | 14 | [sqlfluff:rules:layout.select_targets] 15 | wildcard_policy = single 16 | 17 | [sqlfluff:rules:jinja.padding] 18 | single_space = true 19 | 20 | [sqlfluff:indentation] 21 | allow_implicit_indents = True 22 | indented_joins = False 23 | indented_using_on = True 24 | template_blocks_indent = True 25 | indented_on_contents = False 26 | indented_ctes = True 27 | indented_then = True 28 | 29 | [sqlfluff:templater] 30 | unwrap_wrapped_queries = True 31 | 32 | [sqlfluff:rules] 33 | allow_scalar = True 34 | single_table_references = consistent 35 | unquoted_identifiers_policy = all 36 | LT01 = disable 37 | 38 | [sqlfluff:rules:capitalisation.keywords] 39 | capitalisation_policy = lower 40 | 41 | [sqlfluff:rules:capitalisation.identifiers] 42 | capitalisation_policy = lower 43 | 44 | [sqlfluff:rules:capitalisation.functions] 45 | capitalisation_policy = lower 46 | extended_capitalisation_policy = lower 47 | 48 | [sqlfluff:rules:capitalisation.literals] 49 | capitalisation_policy = lower 50 | 51 | [sqlfluff:rules:capitalisation.types] 52 | extended_capitalisation_policy = lower 53 | 54 | 
[sqlfluff:rules:aliasing.table] 55 | aliasing = explicit 56 | 57 | [sqlfluff:rules:aliasing.column] 58 | aliasing = explicit 59 | 60 | [sqlfluff:rules:aliasing.length] 61 | min_alias_length = 3 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dbt-ci-cd 2 | 3 | ## Overview 4 | 5 | This repository contains the dbt project for Jaffle Shop, configured with a CI/CD pipeline using GitHub Actions and Google Cloud Platform (GCP). This setup is designed to demonstrate and implement best practices for testing and deploying dbt models. It's adaptable for other platforms beyond GCP. 6 | 7 | ## CI/CD Pipeline Explanation 8 | 9 | The GitHub Actions configuration is divided into two primary workflow files: `ci.yml` for Continuous Integration and `cd.yml` for Continuous Deployment. 10 | 11 | ### Continuous Integration (CI) Workflow 12 | 13 | The `ci.yml` workflow triggers on every pull request to the `main` branch to ensure that changes are tested before merging. The workflow details are as follows: 14 | 15 | 1. **Dependencies Installation** 16 | - **Python Dependencies**: Install all required Python packages specified in `dbt-requirements.txt`. 17 | - **Google Cloud SDK Installation**: Install the Google Cloud SDK to interact with Google Cloud resources. 18 | 19 | 2. **Authentication** 20 | - **Service Account Authentication**: Utilize the Google Cloud SDK to authenticate using a service account key, crucial for accessing GCP services securely. 21 | 22 | 3. **Schema Management** 23 | - **Dynamic Schema Creation**: Generate a unique schema identifier based on the pull request ID and commit hash. This schema is used to isolate testing from the production environment. 24 | 25 | 4. **dbt Commands Execution** 26 | - **Debugging and Dependencies**: Run `dbt debug` and `dbt deps` to verify configurations and fetch dependencies. 
27 | - **Conditional Build**: Attempt to download the `manifest.json` to compare with the local changes. 28 | - If present, dbt builds only modified and downstream resources (`state:modified+`) in the new schema. To be able to build only modified resources, it references the upstream resources from the production schema, using the `--defer` flag. 29 | - This comparison ensures that only resources affected by the changes are tested. 30 | - If no manifest.json is found, it performs a full build. 31 | - **Schema Cleanup**: After testing, the temporary schema is dropped to clean up resources. This step is actually an on-run-end hook, and not a pipeline step. 32 | 33 | ### Continuous Deployment (CD) Workflow 34 | 35 | The `cd.yml` workflow executes when changes are merged into the main branch. It builds modified dbt resources and downstream dependencies in the production schema, ensuring that production data remains up-to-date: 36 | 37 | 1. **Environment and Authentication Setup** 38 | - As in the CI workflow, prepare environment variables and authenticate using the Google Cloud SDK. 39 | 40 | 2. **Retrieve Production Manifest** 41 | - **Manifest Download**: Download the existing `manifest.json` from GCP to handle incremental builds by identifying changed models. 42 | 43 | 3. **Production dbt Execution** 44 | - **Production Build**: Execute dbt commands to update the production environment. It builds modified resources and their downstream dependencies (`state:modified+`), using the updated manifest to determine changes. If no manifest.json is found, it performs a full build (useful when running the project for the first time). 45 | - **Manifest Update**: After successful deployment, upload the current `manifest.json` back to GCP. This step updates the manifest to reflect the latest state, for subsequent CI runs. 46 | 47 | ## Platform Flexibility 48 | 49 | This pipeline is designed with flexibility in mind, particularly in terms of platform dependency. 
The steps involving authentication and manifest handling are currently tailored for GCP but can be adapted for other platforms like AWS, Azure, or even on-premise solutions. 50 | 51 | ### Modifying for Other Platforms 52 | 53 | - **Authentication**: Replace GCP authentication steps with corresponding steps for AWS (using AWS CLI and IAM roles) or Azure (using Azure CLI and service principals). 54 | - **Manifest Storage**: Change commands related to `gsutil` (used for interacting with Google Cloud Storage) to equivalent commands for other services like Amazon S3 or Azure Blob Storage. 55 | 56 | ## Setup and Configuration 57 | 58 | To implement this pipeline: 59 | 1. **Clone the Repository**: Get a copy of this repository. 60 | 2. **Configure Environment Variables**: Set all necessary variables in GitHub Secrets settings. 61 | -------------------------------------------------------------------------------- /data-tests/assert_total_payment_amount_is_positive.sql: -------------------------------------------------------------------------------- 1 | -- Refunds have a negative amount, so the total amount should always be >= 0. 2 | -- Therefore return records where this isn't true to make the test fail 3 | select 4 | id 5 | , sum(amount) 6 | from {{ ref('fact_orders' )}} 7 | group by 1 8 | having not(sum(amount) >= 0) -------------------------------------------------------------------------------- /data-tests/generic/is_even.sql: -------------------------------------------------------------------------------- 1 | {% test is_even(model, column_name) %} 2 | 3 | with validation as ( 4 | 5 | select 6 | {{ column_name }} as even_field 7 | 8 | from {{ model }} 9 | 10 | ), 11 | 12 | validation_errors as ( 13 | 14 | select 15 | even_field 16 | 17 | from validation 18 | -- if this is true, then even_field is actually odd! 
19 | where (even_field % 2) = 1 20 | 21 | ) 22 | 23 | select * 24 | from validation_errors 25 | 26 | {% endtest %} -------------------------------------------------------------------------------- /data-tests/generic/is_odd.sql: -------------------------------------------------------------------------------- 1 | {% test is_odd(model, column_name) %} 2 | 3 | with validation as ( 4 | 5 | select 6 | {{ column_name }} as odd_field 7 | 8 | from {{ model }} 9 | 10 | ), 11 | 12 | validation_errors as ( 13 | 14 | select 15 | odd_field 16 | 17 | from validation 18 | -- if this is true, then odd_field is actually even! 19 | where (odd_field % 2) = 0 20 | 21 | ) 22 | 23 | select * 24 | from validation_errors 25 | 26 | {% endtest %} -------------------------------------------------------------------------------- /dbt-requirements.txt: -------------------------------------------------------------------------------- 1 | dbt-bigquery==1.8.0 2 | sqlfluff==3.0.6 -------------------------------------------------------------------------------- /dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'ci_cd_project' 6 | version: '1.0.0' 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'bigquery_sandbox' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `model-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 
14 | model-paths: ["models"] 15 | analysis-paths: ["analyses"] 16 | test-paths: ["tests"] 17 | seed-paths: ["seeds"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | clean-targets: # directories to be removed by `dbt clean` 22 | - "target" 23 | - "dbt_packages" 24 | 25 | 26 | # Configuring models 27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 28 | 29 | # In this example config, we tell dbt to build all models in the example/ 30 | # directory as views. These settings can be overridden in the individual model 31 | # files using the `{{ config(...) }}` macro. 32 | models: 33 | ci_cd_project: 34 | +materialized: table 35 | dbt_project_evaluator: 36 | +enabled: "{{ (env_var('ENABLE_DBT_PROJECT_EVALUATOR', 'false') | lower == 'true') | as_bool }}" 37 | 38 | data_tests: 39 | dbt_project_evaluator: 40 | +severity: "{{ env_var('DBT_PROJECT_EVALUATOR_SEVERITY', 'warn') }}" 41 | 42 | seeds: 43 | dbt_project_evaluator: 44 | +enabled: "{{ (env_var('ENABLE_DBT_PROJECT_EVALUATOR', 'false') | lower == 'true') | as_bool }}" -------------------------------------------------------------------------------- /macros/cents_to_dollars.sql: -------------------------------------------------------------------------------- 1 | {# A basic example for a project-wide macro to cast a column uniformly #} 2 | 3 | {% macro cents_to_dollars(column_name) -%} 4 | {{ return(adapter.dispatch('cents_to_dollars')(column_name)) }} 5 | {%- endmacro %} 6 | 7 | {% macro trino__cents_to_dollars(column_name) -%} 8 | cast(({{ column_name }} / 100) as decimal(16, 2)) 9 | {%- endmacro %} 10 | 11 | {% macro default__cents_to_dollars(column_name) -%} 12 | ({{ column_name }} / 100)::numeric(16, 2) 13 | {%- endmacro %} 14 | 15 | {% macro postgres__cents_to_dollars(column_name) -%} 16 | ({{ column_name }}::numeric(16, 2) / 100) 17 | {%- endmacro %} 18 | 19 | {% macro bigquery__cents_to_dollars(column_name) %} 20 | round(cast(({{ column_name }} / 100) as numeric), 2) 21 | 
{% endmacro %} 22 | -------------------------------------------------------------------------------- /macros/drop_pr_staging_schemas.sql: -------------------------------------------------------------------------------- 1 | {%- macro drop_pr_staging_schemas(project_id, PR_number) %} 2 | 3 | {% set pr_cleanup_query %} 4 | with pr_staging_schemas as ( 5 | select schema_name 6 | from {{ project_id }}.region-us.INFORMATION_SCHEMA.SCHEMATA 7 | where 8 | schema_name like 'pr_'||{{ PR_number }}||'__%' 9 | ) 10 | 11 | select 12 | 'drop schema if exists '||schema_name||' cascade;' as drop_command 13 | from pr_staging_schemas 14 | {% endset %} 15 | 16 | {% do log(pr_cleanup_query, info=TRUE) %} 17 | 18 | {% set drop_commands = run_query(pr_cleanup_query).columns[0].values() %} 19 | 20 | {% if drop_commands %} 21 | {% for drop_command in drop_commands %} 22 | {% do log(drop_command, True) %} 23 | {% do run_query(drop_command) %} 24 | {% endfor %} 25 | {% else %} 26 | {% do log('No schemas to drop.', True) %} 27 | {% endif %} 28 | 29 | {%- endmacro -%} 30 | -------------------------------------------------------------------------------- /models/jaffle-shop/marts/agg_orders.sql: -------------------------------------------------------------------------------- 1 | with 2 | fact_orders as ( 3 | select 4 | order_date 5 | , status 6 | , amount 7 | from {{ ref('fact_orders') }} 8 | ) 9 | 10 | , aggregated as ( 11 | select 12 | order_date 13 | , status 14 | , sum(amount) as total_amount 15 | from fact_orders 16 | group by 17 | order_date 18 | , status 19 | ) 20 | 21 | select * 22 | from aggregated 23 | where true 24 | -------------------------------------------------------------------------------- /models/jaffle-shop/marts/agg_orders.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: agg_orders 5 | description: "This aggregated table summarizes orders by date, status, and payment method, providing the total 
amount for each combination." 6 | columns: 7 | - name: order_date 8 | description: "The date when the orders were placed." 9 | 10 | - name: status 11 | description: "The status of the orders." 12 | 13 | - name: total_amount 14 | description: "The total amount for orders grouped by order_date, status, and payment_method." 15 | 16 | unit_tests: 17 | - name: test_amount_sum 18 | model: agg_orders 19 | given: 20 | - input: ref('fact_orders') 21 | format: csv 22 | rows: | 23 | order_date,status,amount 24 | '2024-01-01',completed,100 25 | '2024-01-01',completed,100 26 | '2024-01-02',cancelled,100 27 | '2024-01-02',shipped,100 28 | expect: 29 | format: csv 30 | rows: | 31 | order_date,status,total_amount 32 | '2024-01-01',completed,200 33 | '2024-01-02',cancelled,100 34 | '2024-01-02',shipped,100 -------------------------------------------------------------------------------- /models/jaffle-shop/marts/dim_customers.sql: -------------------------------------------------------------------------------- 1 | with 2 | stg_customers as ( 3 | select 4 | id 5 | , first_name 6 | , last_name 7 | , last_updated_dt 8 | , country_code 9 | , dbt_valid_to 10 | , dbt_valid_from 11 | from {{ ref('stg_customers') }} 12 | ) 13 | 14 | , country_codes as ( 15 | select 16 | code 17 | , name 18 | from {{ ref('country_codes') }} 19 | ) 20 | 21 | , joined as ( 22 | select 23 | stg_customers.id 24 | , stg_customers.first_name 25 | , stg_customers.last_name 26 | , country_codes.name as country_name 27 | , stg_customers.last_updated_dt 28 | , stg_customers.dbt_valid_from 29 | , stg_customers.dbt_valid_to 30 | from stg_customers 31 | left join country_codes 32 | on stg_customers.country_code = country_codes.code 33 | ) 34 | 35 | select * 36 | from joined 37 | -------------------------------------------------------------------------------- /models/jaffle-shop/marts/dim_customers.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - 
name: dim_customers 5 | description: "This model combines customer data from the staging table with country information, providing a unified view of customers with their respective country names." 6 | columns: 7 | - name: id 8 | description: "Unique identifier for each customer." 9 | data_tests: 10 | - unique 11 | - not_null 12 | 13 | - name: first_name 14 | description: "First name of the customer." 15 | data_tests: 16 | - not_null 17 | 18 | - name: last_name 19 | description: "Last name of the customer." 20 | data_tests: 21 | - not_null 22 | 23 | - name: country_name 24 | description: "The name of the country associated with the customer's country code." 25 | data_tests: 26 | - not_null 27 | 28 | - name: last_updated_dt 29 | description: "The date and time when the customer's information was last updated." 30 | data_tests: 31 | - not_null 32 | 33 | - name: dbt_valid_from 34 | description: "Timestamp indicating the validity period's start for the customer record." 35 | data_tests: 36 | - not_null 37 | 38 | - name: dbt_valid_to 39 | description: "Timestamp indicating the validity period's end for the customer record." 
40 | 41 | 42 | -------------------------------------------------------------------------------- /models/jaffle-shop/marts/fact_orders.sql: -------------------------------------------------------------------------------- 1 | with 2 | stg_orders as ( 3 | select 4 | id 5 | , user_id 6 | , order_date 7 | , status 8 | , last_updated_dt 9 | from {{ ref('stg_orders') }} 10 | ) 11 | 12 | , stg_payments as ( 13 | select 14 | id 15 | , order_id 16 | , payment_method 17 | , amount 18 | from {{ ref('stg_payments') }} 19 | ) 20 | 21 | , payment_methods_agg as ( 22 | select 23 | order_id 24 | , string_agg(payment_method) as payment_methods 25 | , sum(amount) as amount 26 | from stg_payments 27 | group by order_id 28 | ) 29 | 30 | , joined as ( 31 | select 32 | stg_orders.id 33 | , stg_orders.user_id 34 | , stg_orders.order_date 35 | , stg_orders.status 36 | , payment_methods_agg.payment_methods 37 | , payment_methods_agg.amount 38 | from stg_orders 39 | left join payment_methods_agg 40 | on stg_orders.id = payment_methods_agg.order_id 41 | ) 42 | 43 | select * 44 | from joined 45 | -------------------------------------------------------------------------------- /models/jaffle-shop/marts/fact_orders.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: fact_orders 5 | description: "This fact table combines data from orders and payments, linking each order with its payment details." 6 | columns: 7 | - name: id 8 | description: "Unique identifier for each order." 9 | data_tests: 10 | - unique 11 | - not_null 12 | 13 | - name: user_id 14 | description: "Identifier for the user who placed the order." 15 | data_tests: 16 | - not_null 17 | - relationships: 18 | to: ref('dim_customers') 19 | field: id 20 | 21 | - name: order_date 22 | description: "The date when the order was placed." 23 | data_tests: 24 | - not_null 25 | 26 | - name: status 27 | description: "Current status of the order." 
28 | data_tests: 29 | - not_null 30 | 31 | - name: payment_methods 32 | description: "The methods of payment used for the order." 33 | data_tests: 34 | - not_null 35 | 36 | - name: amount 37 | description: "The total amount paid for the order." 38 | data_tests: 39 | - not_null 40 | - dbt_expectations.expect_column_values_to_be_between: 41 | min_value: 0 42 | max_value: 100000 43 | -------------------------------------------------------------------------------- /models/jaffle-shop/staging/_sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: raw 5 | tables: 6 | - name: raw_orders 7 | - name: raw_customers 8 | - name: raw_payments 9 | 10 | freshness: 11 | warn_after: 12 | count: 1 13 | period: day 14 | error_after: 15 | count: 7 16 | period: day 17 | loaded_at_field: "cast(last_updated_dt as timestamp)" -------------------------------------------------------------------------------- /models/jaffle-shop/staging/stg_customers.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='table' 3 | ) }} 4 | 5 | with 6 | staging as ( 7 | select 8 | id 9 | , first_name 10 | , last_name 11 | , last_updated_dt 12 | , country_code 13 | , dbt_valid_to 14 | , dbt_valid_from 15 | , row_number() over( 16 | partition by id 17 | order by dbt_valid_to desc 18 | ) as row_num 19 | --, null as dummy_column_to_test_cicd 20 | , null as dummy_column_to_test_cicd_2 21 | from {{ ref('snapshot_customers') }} 22 | ) 23 | 24 | select * 25 | from staging 26 | where row_num = 1 27 | -------------------------------------------------------------------------------- /models/jaffle-shop/staging/stg_customers.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_customers 5 | description: staging table for customers 6 | columns: 7 | - name: id 8 | data_tests: 9 | - unique 10 | - 
not_null 11 | - name: first_name 12 | data_tests: 13 | - not_null 14 | - name: last_name 15 | data_tests: 16 | - not_null 17 | - name: dbt_valid_from 18 | - name: dbt_valid_to -------------------------------------------------------------------------------- /models/jaffle-shop/staging/stg_orders.sql: -------------------------------------------------------------------------------- 1 | with 2 | staging as ( 3 | select 4 | id 5 | , user_id 6 | , order_date 7 | , status 8 | , last_updated_dt 9 | from {{ source('raw', 'raw_orders') }} 10 | ) 11 | 12 | select * 13 | from staging 14 | -------------------------------------------------------------------------------- /models/jaffle-shop/staging/stg_orders.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: stg_orders 5 | description: staging table for orders 6 | columns: 7 | - name: id 8 | data_tests: 9 | - unique 10 | - not_null 11 | - name: user_id 12 | data_tests: 13 | - not_null 14 | - relationships: 15 | to: ref('stg_customers') 16 | field: id 17 | - name: order_date 18 | data_tests: 19 | - not_null 20 | - name: status 21 | data_tests: 22 | - accepted_values: 23 | values: 24 | - return_pending 25 | - returned 26 | - completed 27 | - placed 28 | - shipped 29 | - name: last_updated_dt 30 | data_tests: 31 | - not_null -------------------------------------------------------------------------------- /models/jaffle-shop/staging/stg_payments.sql: -------------------------------------------------------------------------------- 1 | with 2 | staging as ( 3 | select 4 | id 5 | , order_id 6 | , payment_method 7 | , amount 8 | , last_updated_dt 9 | from {{ source('raw', 'raw_payments') }} 10 | ) 11 | 12 | select * 13 | from staging 14 | -------------------------------------------------------------------------------- /models/jaffle-shop/staging/stg_payments.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 
| 3 | models: 4 | - name: stg_payments 5 | description: staging table for orders 6 | columns: 7 | - name: id 8 | data_tests: 9 | - unique 10 | - not_null 11 | - name: order_id 12 | data_tests: 13 | - not_null 14 | - relationships: 15 | to: ref('stg_orders') 16 | field: id 17 | - name: payment_method 18 | data_tests: 19 | - accepted_values: 20 | values: 21 | - coupon 22 | - credit_card 23 | - bank_transfer 24 | - gift_card 25 | - name: amount 26 | data_tests: 27 | - not_null 28 | - name: last_updated_dt 29 | data_tests: 30 | - not_null -------------------------------------------------------------------------------- /packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | - package: dbt-labs/dbt_project_evaluator 5 | version: 0.11.0 6 | - package: calogica/dbt_expectations 7 | version: 0.10.3 8 | - package: dbt-labs/codegen 9 | version: 0.12.1 -------------------------------------------------------------------------------- /profiles.yml: -------------------------------------------------------------------------------- 1 | bigquery_sandbox: 2 | target: dev 3 | outputs: 4 | dev: 5 | type: bigquery 6 | method: service-account-json 7 | project: "{{ env_var('DBT_ENV_SECRET_PROJECT_ID') }}" 8 | dataset: "dev" 9 | threads: 4 # Must be a value of 1 or greater 10 | 11 | # These fields come from the service account json keyfile 12 | keyfile_json: 13 | type: "{{ env_var('DBT_ENV_SECRET_TYPE') }}" 14 | project_id: "{{ env_var('DBT_ENV_SECRET_PROJECT_ID') }}" 15 | private_key_id: "{{ env_var('DBT_ENV_SECRET_PRIVATE_KEY_ID') }}" 16 | private_key: "{{ env_var('DBT_ENV_SECRET_PRIVATE_KEY') }}" 17 | client_email: "{{ env_var('DBT_ENV_SECRET_CLIENT_EMAIL') }}" 18 | client_id: "{{ env_var('DBT_ENV_SECRET_CLIENT_ID') }}" 19 | auth_uri: "{{ env_var('DBT_ENV_SECRET_AUTH_URI') }}" 20 | token_uri: "{{ env_var('DBT_ENV_SECRET_TOKEN_URI') }}" 21 | auth_provider_x509_cert_url: "{{ 
env_var('DBT_ENV_SECRET_AUTH_PROVIDER_X509_CERT_URL') }}" 22 | client_x509_cert_url: "{{ env_var('DBT_ENV_SECRET_CLIENT_X509_CERT_URL') }}" 23 | 24 | prod: 25 | type: bigquery 26 | method: service-account-json 27 | project: "{{ env_var('DBT_ENV_SECRET_PROJECT_ID') }}" 28 | dataset: "prod" 29 | threads: 4 # Must be a value of 1 or greater 30 | 31 | # These fields come from the service account json keyfile 32 | keyfile_json: 33 | type: "{{ env_var('DBT_ENV_SECRET_TYPE') }}" 34 | project_id: "{{ env_var('DBT_ENV_SECRET_PROJECT_ID') }}" 35 | private_key_id: "{{ env_var('DBT_ENV_SECRET_PRIVATE_KEY_ID') }}" 36 | private_key: "{{ env_var('DBT_ENV_SECRET_PRIVATE_KEY') }}" 37 | client_email: "{{ env_var('DBT_ENV_SECRET_CLIENT_EMAIL') }}" 38 | client_id: "{{ env_var('DBT_ENV_SECRET_CLIENT_ID') }}" 39 | auth_uri: "{{ env_var('DBT_ENV_SECRET_AUTH_URI') }}" 40 | token_uri: "{{ env_var('DBT_ENV_SECRET_TOKEN_URI') }}" 41 | auth_provider_x509_cert_url: "{{ env_var('DBT_ENV_SECRET_AUTH_PROVIDER_X509_CERT_URL') }}" 42 | client_x509_cert_url: "{{ env_var('DBT_ENV_SECRET_CLIENT_X509_CERT_URL') }}" 43 | 44 | pr: 45 | type: bigquery 46 | method: service-account-json 47 | project: "{{ env_var('DBT_ENV_SECRET_PROJECT_ID') }}" 48 | dataset: "pr_{{ var('schema_id') }}" 49 | threads: 4 # Must be a value of 1 or greater 50 | 51 | # These fields come from the service account json keyfile 52 | keyfile_json: 53 | type: "{{ env_var('DBT_ENV_SECRET_TYPE') }}" 54 | project_id: "{{ env_var('DBT_ENV_SECRET_PROJECT_ID') }}" 55 | private_key_id: "{{ env_var('DBT_ENV_SECRET_PRIVATE_KEY_ID') }}" 56 | private_key: "{{ env_var('DBT_ENV_SECRET_PRIVATE_KEY') }}" 57 | client_email: "{{ env_var('DBT_ENV_SECRET_CLIENT_EMAIL') }}" 58 | client_id: "{{ env_var('DBT_ENV_SECRET_CLIENT_ID') }}" 59 | auth_uri: "{{ env_var('DBT_ENV_SECRET_AUTH_URI') }}" 60 | token_uri: "{{ env_var('DBT_ENV_SECRET_TOKEN_URI') }}" 61 | auth_provider_x509_cert_url: "{{ env_var('DBT_ENV_SECRET_AUTH_PROVIDER_X509_CERT_URL') }}" 62 | 
client_x509_cert_url: "{{ env_var('DBT_ENV_SECRET_CLIENT_X509_CERT_URL') }}" -------------------------------------------------------------------------------- /seeds/country_codes.csv: -------------------------------------------------------------------------------- 1 | code,name 2 | BR,Brazil 3 | GB,United Kingdom 4 | US,United States 5 | JP,Japan 6 | CN,China 7 | CA,Canada 8 | DE,Germany 9 | FR,France 10 | IT,Italy 11 | IN,India -------------------------------------------------------------------------------- /snapshots/snapshot_customers.sql: -------------------------------------------------------------------------------- 1 | {% snapshot snapshot_customers %} 2 | 3 | {{ 4 | config( 5 | target_schema='snapshots', 6 | unique_key='id', 7 | 8 | strategy='timestamp', 9 | updated_at='last_updated_dt', 10 | ) 11 | }} 12 | 13 | select * from {{ source('raw', 'raw_customers') }} 14 | 15 | {% endsnapshot %} --------------------------------------------------------------------------------