├── .circleci └── config.yml ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── pull_request_template.md └── workflows │ ├── ci.yml │ ├── stale.yml │ └── triage-labels.yml ├── .gitignore ├── .vscode └── settings.json ├── CHANGELOG.md ├── LICENSE ├── README.md ├── RELEASE.md ├── dbt_project.yml ├── integration_tests ├── .env │ └── postgres.env ├── analyses │ ├── compare_column_values_smoke_test.sql │ └── compare_relation_columns_smoke_test.sql ├── dbt_project.yml ├── macros │ └── unit_tests │ │ └── struct_generation_macros.sql ├── models │ ├── data_tests │ │ ├── compare_all_columns_concat_pk_with_summary.sql │ │ ├── compare_all_columns_concat_pk_without_summary.sql │ │ ├── compare_all_columns_where_clause.sql │ │ ├── compare_all_columns_with_summary.sql │ │ ├── compare_all_columns_with_summary_and_exclude.sql │ │ ├── compare_all_columns_without_summary.sql │ │ ├── compare_and_classify_query_results.sql │ │ ├── compare_queries.sql │ │ ├── compare_queries_concat_pk_without_summary.sql │ │ ├── compare_queries_with_summary.sql │ │ ├── compare_queries_without_summary.sql │ │ ├── compare_relation_columns.sql │ │ ├── compare_relations_concat_pk_without_summary.sql │ │ ├── compare_relations_with_exclude.sql │ │ ├── compare_relations_with_summary.sql │ │ ├── compare_relations_without_exclude.sql │ │ ├── compare_relations_without_summary.sql │ │ ├── compare_row_counts.sql │ │ ├── compare_which_columns_differ.sql │ │ ├── compare_which_columns_differ_exclude_cols.sql │ │ └── schema.yml │ ├── unit_test_placeholder_models │ │ ├── unit_test_model_a.sql │ │ ├── unit_test_model_b.sql │ │ ├── unit_test_struct_model_a.sql │ │ └── unit_test_struct_model_b.sql │ └── unit_test_wrappers │ │ ├── unit_compare_classify.sql │ │ ├── unit_compare_classify.yml │ │ ├── unit_compare_classify_struct.sql │ │ ├── unit_compare_classify_struct.yml │ │ ├── unit_compare_queries.sql │ │ ├── unit_compare_queries.yml │ │ ├── 
unit_compare_which_query_columns_differ.sql │ │ ├── unit_compare_which_query_columns_differ.yml │ │ ├── unit_ensure_all_pks_are_in_column_set.sql │ │ ├── unit_ensure_all_pks_are_in_column_set.yml │ │ ├── unit_quick_are_queries_identical.sql │ │ └── unit_quick_are_queries_identical.yml ├── package-lock.yml ├── packages.yml ├── profiles.yml ├── seeds │ ├── data_compare_all_columns__albertsons_produce.csv │ ├── data_compare_all_columns__albertsons_produce__concat_pk.csv │ ├── data_compare_all_columns__market_of_choice_produce.csv │ ├── data_compare_all_columns__market_of_choice_produce__concat_pk.csv │ ├── data_compare_relation_columns_a.csv │ ├── data_compare_relation_columns_b.csv │ ├── data_compare_relations__a_relation.csv │ ├── data_compare_relations__b_relation.csv │ ├── data_compare_which_columns_differ_a.csv │ ├── data_compare_which_columns_differ_b.csv │ ├── expected_results__compare_all_columns_concat_pk_with_summary.csv │ ├── expected_results__compare_all_columns_concat_pk_without_summary.csv │ ├── expected_results__compare_all_columns_where_clause.csv │ ├── expected_results__compare_all_columns_with_summary.csv │ ├── expected_results__compare_all_columns_with_summary_and_exclude.csv │ ├── expected_results__compare_all_columns_without_summary.csv │ ├── expected_results__compare_relation_columns.csv │ ├── expected_results__compare_relations_with_exclude.csv │ ├── expected_results__compare_relations_without_exclude.csv │ ├── expected_results__compare_row_counts.csv │ ├── expected_results__compare_which_columns_differ.csv │ ├── expected_results__compare_which_columns_differ_exclude_cols.csv │ ├── expected_results__compare_with_summary.csv │ └── expected_results__compare_without_summary.csv └── tests │ └── fixtures │ ├── complex_struct.sql │ ├── complex_struct_different_order.sql │ ├── complex_struct_different_value.sql │ ├── simple_struct.sql │ ├── simple_struct_different_order.sql │ └── simple_struct_removed_key.sql ├── macros ├── compare_all_columns.sql ├── 
compare_and_classify_query_results.sql ├── compare_and_classify_relation_rows.sql ├── compare_column_values.sql ├── compare_column_values_verbose.sql ├── compare_queries.sql ├── compare_relation_columns.sql ├── compare_relations.sql ├── compare_row_counts.sql ├── compare_which_query_columns_differ.sql ├── compare_which_relation_columns_differ.sql ├── quick_are_queries_identical.sql ├── quick_are_relations_identical.sql └── utils │ ├── _classify_audit_row_status.sql │ ├── _count_num_rows_in_status.sql │ ├── _ensure_all_pks_are_in_column_set.sql │ ├── _generate_null_safe_sk.sql │ ├── _generate_set_results.sql │ ├── _get_comparison_bounds.sql │ └── _get_intersecting_columns_from_relations.sql ├── packages.yml ├── supported_adapters.env └── tox.ini /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | jobs: 5 | build: 6 | docker: 7 | - image: cimg/python:3.9.9 8 | - image: cimg/postgres:14.0 9 | auth: 10 | username: dbt-labs 11 | password: '' 12 | environment: 13 | POSTGRES_HOST: localhost 14 | POSTGRES_USER: root 15 | POSTGRES_PORT: 5432 16 | POSTGRES_DATABASE: circle_test 17 | POSTGRES_SCHEMA: dbt_utils_integration_tests_postgres 18 | DBT_ENV_SECRET_POSTGRES_PASS: '' 19 | 20 | steps: 21 | - checkout 22 | 23 | - run: 24 | name: setup_creds 25 | command: | 26 | echo $BIGQUERY_SERVICE_ACCOUNT_JSON > ${HOME}/bigquery-service-key.json 27 | 28 | - restore_cache: 29 | key: deps1-{{ .Branch }} 30 | 31 | - run: 32 | name: "Setup dbt" 33 | command: | 34 | set -x 35 | 36 | python -m venv dbt_venv 37 | . 
dbt_venv/bin/activate 38 | 39 | python -m pip install --upgrade pip setuptools 40 | python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks 41 | 42 | - run: 43 | name: "Run Tests - Postgres" 44 | environment: 45 | POSTGRES_HOST: localhost 46 | POSTGRES_USER: root 47 | POSTGRES_PORT: 5432 48 | POSTGRES_DATABASE: circle_test 49 | POSTGRES_SCHEMA: dbt_utils_integration_tests_postgres 50 | DBT_ENV_SECRET_POSTGRES_PASS: '' 51 | command: | 52 | . dbt_venv/bin/activate 53 | cd integration_tests 54 | dbt deps --target postgres 55 | dbt seed --target postgres --full-refresh 56 | dbt run --target postgres --exclude tag:skip+ tag:temporary_skip+ 57 | dbt test --target postgres --exclude tag:skip+ tag:temporary_skip+ 58 | 59 | - run: 60 | name: "Run Tests - Redshift" 61 | command: | 62 | . dbt_venv/bin/activate 63 | echo `pwd` 64 | cd integration_tests 65 | dbt deps --target redshift 66 | dbt seed --target redshift --full-refresh 67 | dbt run --target redshift --exclude tag:skip+ tag:temporary_skip+ 68 | dbt test --target redshift --exclude tag:skip+ tag:temporary_skip+ 69 | 70 | - run: 71 | name: "Run Tests - Snowflake" 72 | command: | 73 | . dbt_venv/bin/activate 74 | echo `pwd` 75 | cd integration_tests 76 | dbt deps --target snowflake 77 | dbt seed --target snowflake --full-refresh 78 | dbt run --target snowflake --exclude tag:skip+ tag:temporary_skip+ 79 | dbt test --target snowflake --exclude tag:skip+ tag:temporary_skip+ 80 | 81 | - run: 82 | name: "Run Tests - BigQuery" 83 | environment: 84 | BIGQUERY_SERVICE_KEY_PATH: "/home/circleci/bigquery-service-key.json" 85 | 86 | command: | 87 | . 
dbt_venv/bin/activate 88 | echo `pwd` 89 | cd integration_tests 90 | dbt deps --target bigquery 91 | dbt seed --target bigquery --full-refresh 92 | dbt run --target bigquery --full-refresh --exclude tag:skip+ tag:temporary_skip+ 93 | dbt test --target bigquery --exclude tag:skip+ tag:temporary_skip+ 94 | 95 | - run: 96 | name: "Run Tests - Databricks" 97 | command: | 98 | . dbt_venv/bin/activate 99 | echo `pwd` 100 | cd integration_tests 101 | dbt deps --target databricks 102 | dbt seed --target databricks --full-refresh 103 | dbt run --target databricks --exclude tag:skip+ tag:temporary_skip+ 104 | dbt test --target databricks --exclude tag:skip+ tag:temporary_skip+ 105 | 106 | - save_cache: 107 | key: deps1-{{ .Branch }} 108 | paths: 109 | - "dbt_venv" 110 | 111 | 112 | - store_artifacts: 113 | path: integration_tests/logs 114 | - store_artifacts: 115 | path: integration_tests/target 116 | 117 | workflows: 118 | version: 2 119 | test-all: 120 | jobs: 121 | - build: 122 | context: 123 | - profile-redshift 124 | - profile-snowflake 125 | - profile-bigquery 126 | - profile-databricks 127 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @clrcrl 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report a bug or an issue you've found with this package 4 | title: '' 5 | labels: bug, triage 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Describe the bug 11 | 14 | 15 | ### Steps to reproduce 16 | 19 | 20 | ### Expected results 21 | 24 | 25 | ### Actual results 26 | 29 | 30 | ### Screenshots and log output 31 | 34 | 35 | ### System information 36 | **The contents of your `packages.yml` file:** 37 | 38 | **Which database are you using dbt with?** 
39 | - [ ] postgres 40 | - [ ] redshift 41 | - [ ] bigquery 42 | - [ ] snowflake 43 | - [ ] other (specify: ____________) 44 | 45 | 46 | **The output of `dbt --version`:** 47 | ``` 48 | 49 | ``` 50 | 51 | **The operating system you're using:** 52 | 53 | **The output of `python --version`:** 54 | 55 | ### Additional context 56 | 59 | 60 | ### Are you interested in contributing the fix? 61 | 64 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this package 4 | title: '' 5 | labels: enhancement, triage 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Describe the feature 11 | A clear and concise description of what you want to happen. 12 | 13 | ### Describe alternatives you've considered 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | 16 | ### Additional context 17 | Is this feature database-specific? Which database(s) is/are relevant? Please include any other relevant context here. 18 | 19 | ### Who will this benefit? 20 | What kind of use case will this feature be useful for? Please be specific and provide examples, this will help us prioritize properly. 21 | 22 | ### Are you interested in contributing this feature? 
23 | 26 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description & motivation 2 | 5 | 6 | ## Checklist 7 | - [ ] I have verified that these changes work locally 8 | - [ ] I have updated the README.md (if applicable) 9 | - [ ] I have added tests & descriptions to my models (and macros if applicable) 10 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # **what?** 2 | # Run tests for dbt-audit-helper against supported adapters 3 | 4 | # **why?** 5 | # To ensure that dbt-audit-helper works as expected with all supported adapters 6 | 7 | # **when?** 8 | # On every PR, and every push to main and when manually triggered 9 | 10 | name: Package Integration Tests 11 | 12 | on: 13 | push: 14 | branches: 15 | - main 16 | pull_request_target: 17 | workflow_dispatch: 18 | 19 | jobs: 20 | run-tests: 21 | uses: dbt-labs/dbt-package-testing/.github/workflows/run_tox.yml@v1 22 | # this just tests with postgres so no variables need to be passed through. 
23 | # When it's time to add more adapters you will need to pass through inputs for 24 | # the other adapters as shown in the below example for redshift 25 | # with: 26 | # # redshift 27 | # REDSHIFT_HOST: ${{ vars.REDSHIFT_HOST }} 28 | # REDSHIFT_USER: ${{ vars.REDSHIFT_USER }} 29 | # REDSHIFT_DATABASE: ${{ vars.REDSHIFT_DATABASE }} 30 | # REDSHIFT_SCHEMA: "integration_tests_redshift_${{ github.run_number }}" 31 | # REDSHIFT_PORT: ${{ vars.REDSHIFT_PORT }} 32 | # secrets: 33 | # DBT_ENV_SECRET_REDSHIFT_PASS: ${{ secrets.DBT_ENV_SECRET_REDSHIFT_PASS }} 34 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # **what?** 2 | # For issues that have been open for awhile without activity, label 3 | # them as stale with a warning that they will be closed out. If 4 | # anyone comments to keep the issue open, it will automatically 5 | # remove the stale label and keep it open. 6 | 7 | # Stale label rules: 8 | # awaiting_response, more_information_needed -> 90 days 9 | # good_first_issue, help_wanted -> 360 days (a year) 10 | # tech_debt -> 720 (2 years) 11 | # all else defaults -> 180 days (6 months) 12 | 13 | # **why?** 14 | # To keep the repo in a clean state from issues that aren't relevant anymore 15 | 16 | # **when?** 17 | # Once a day 18 | 19 | name: "Close stale issues and PRs" 20 | on: 21 | schedule: 22 | - cron: "30 1 * * *" 23 | 24 | permissions: 25 | issues: write 26 | pull-requests: write 27 | 28 | jobs: 29 | stale: 30 | uses: dbt-labs/actions/.github/workflows/stale-bot-matrix.yml@main 31 | -------------------------------------------------------------------------------- /.github/workflows/triage-labels.yml: -------------------------------------------------------------------------------- 1 | # **what?** 2 | # When the maintenance team triages, we sometimes need more information from the issue creator. 
In 3 | # those cases we remove the `triage` label and add the `awaiting_response` label. Once we 4 | # receive a response in the form of a comment, we want the `awaiting_response` label removed 5 | # in favor of the `triage` label so we are aware that the issue needs action. 6 | 7 | # **why?** 8 | # To help with our team triage issue tracking 9 | 10 | # **when?** 11 | # This will run when a comment is added to an issue and that issue has the `awaiting_response` label. 12 | 13 | name: Update Triage Label 14 | 15 | on: issue_comment 16 | 17 | defaults: 18 | run: 19 | shell: bash 20 | 21 | permissions: 22 | issues: write 23 | 24 | jobs: 25 | triage_label: 26 | if: contains(github.event.issue.labels.*.name, 'awaiting_response') 27 | uses: dbt-labs/actions/.github/workflows/swap-labels.yml@main 28 | with: 29 | add_label: "triage" 30 | remove_label: "awaiting_response" 31 | secrets: inherit 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_packages/ 3 | logs/ 4 | logfile 5 | .DS_Store 6 | package-lock.yml 7 | integration_tests/package-lock.yml 8 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "yaml.schemas": { 3 | "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_yml_files-latest.json": [ 4 | "/**/*.yml", 5 | "!profiles.yml", 6 | "!dbt_project.yml", 7 | "!packages.yml", 8 | "!selectors.yml", 9 | "!profile_template.yml" 10 | ], 11 | "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_project-latest.json": [ 12 | "dbt_project.yml" 13 | ], 14 | "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/selectors-latest.json": [ 15 | "selectors.yml" 16 | ], 17 | 
"https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/packages-latest.json": [ 18 | "packages.yml" 19 | ] 20 | }, 21 | } -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # audit_helper 0.6.0 2 | 🚨 This version requires dbt Core 1.2 or above, and is ready for dbt utils 1.0. 3 | 4 | Changed: 5 | * add column_name to output of compare_column_values by @leoebfolsom in https://github.com/dbt-labs/dbt-audit-helper/pull/47 6 | * Easier switching between summary and details by @christineberger in https://github.com/dbt-labs/dbt-audit-helper/pull/52 7 | * Removes references to dbt_utils for cross-db macros 8 | 9 | New features: 10 | * dbt Cloud instructions for compare_column_values by @SamHarting in https://github.com/dbt-labs/dbt-audit-helper/pull/45 11 | * Compare all columns macro by @leoebfolsom in https://github.com/dbt-labs/dbt-audit-helper/pull/50 12 | 13 | 14 | # audit_helper 0.5.0 15 | This version brings full compatibility with dbt-core 1.0. It requires any version (minor and patch) of v1, which means far less need for compatibility releases in the future. 16 | 17 | # audit_helper 0.4.1 18 | 🚨 This is a compatibility release in preparation for `dbt-core` v1.0.0 (🎉). Projects using this version with dbt-core v1.0.x can expect to see a deprecation warning. This will be resolved in the next minor release. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dbt-audit-helper 2 | 3 | Useful macros when performing data audits 4 | 5 | ## Contents 6 | 7 | - [Installation instructions](#installation-instructions) 8 | - [Compare Data Outputs](#compare-data-outputs) 9 | - Compare and classify: 10 | - [compare_and_classify_query_results](#compare_and_classify_query_results-source) 11 | - [compare_and_classify_relation_rows](#compare_and_classify_relation_rows-source) 12 | - Quick identical check: 13 | - [quick_are_queries_identical](#quick_are_queries_identical-source) 14 | - [quick_are_relations_identical](#quick_are_relations_identical-source) 15 | - [compare\_row\_counts](#compare_row_counts-source) 16 | - [Compare Columns](#compare-columns) 17 | - [compare\_column\_values](#compare_column_values-source) 18 | - [compare\_all\_columns](#compare_all_columns-source) 19 | - Compare which columns differ: 20 | - [compare\_which\_query\_columns\_differ](#compare_which_query_columns_differ-source) 21 | - 
[compare\_which\_relation\_columns\_differ](#compare_which_relation_columns_differ-source) 22 | - [compare\_relation\_columns](#compare_relation_columns-source) 23 | - [Advanced Usage](#advanced-usage) 24 | - [Print Output To Logs](#print-output-to-logs) 25 | - [Use Output For Custom Singular Test](#use-output-for-custom-singular-test) 26 | - [Legacy Macros](#legacy-macros) 27 | - [compare\_queries](#compare_queries-source) 28 | - [compare\_relations](#compare_relations-source) 29 | - [Internal Macros](#internal-macros) 30 | 31 | ## Installation instructions 32 | 33 | New to dbt packages? Read more about them [here](https://docs.getdbt.com/docs/building-a-dbt-project/package-management/). 34 | 35 | 1. Include this package in your `packages.yml` file — check [here](https://hub.getdbt.com/dbt-labs/audit_helper/latest/) for the latest version number. 36 | 2. Run `dbt deps` to install the package. 37 | 38 | ## Compare Data Outputs 39 | 40 | ### compare_and_classify_query_results ([source](macros/compare_and_classify_query_results.sql)) 41 | 42 | Generates a row-by-row comparison of two queries, as well as summary stats of added, removed, identical and modified records. This prevents you from having to query your comparison tables multiple times to get raw data and summary data. 
43 | 44 | #### Output 45 | 46 | | order_id | order_date | status | dbt_audit_in_a | dbt_audit_in_b | dbt_audit_row_status | dbt_audit_num_rows_in_status | dbt_audit_sample_number | 47 | |----------|------------|-----------|----------------|----------------|----------------------|------------------------------|-------------------------| 48 | | 1 | 2024-01-01 | completed | True | True | identical | 1 | 1 | 49 | | 2 | 2024-01-02 | completed | True | False | modified | 2 | 1 | 50 | | 2 | 2024-01-02 | returned | False | True | modified | 2 | 1 | 51 | | 3 | 2024-01-03 | completed | True | False | modified | 2 | 2 | 52 | | 3 | 2024-01-03 | completed | False | True | modified | 2 | 2 | 53 | | 4 | 2024-01-04 | completed | False | True | added | 1 | 1 | 54 | 55 | Note that there are 4 rows with the `modified` status, but `dbt_audit_num_rows_in_status` says 2. This is because it is counting each primary key only once. 56 | 57 | #### Arguments 58 | 59 | - `a_query` and `b_query`: The queries you want to compare. 60 | - `primary_key_columns` (required): A list of primary key column(s) used to join the queries together for comparison. 61 | - `columns` (required): The columns present in the two queries you want to compare. 62 | - `sample_limit`: Number of sample records to return per status. Defaults to 20. 
63 | 64 | #### Usage 65 | 66 | ```sql 67 | 68 | {% set old_query %} 69 | select 70 | id as order_id, 71 | amount, 72 | customer_id 73 | from old_database.old_schema.fct_orders 74 | {% endset %} 75 | 76 | {% set new_query %} 77 | select 78 | order_id, 79 | amount, 80 | customer_id 81 | from {{ ref('fct_orders') }} 82 | {% endset %} 83 | 84 | {{ 85 | audit_helper.compare_and_classify_query_results( 86 | old_query, 87 | new_query, 88 | primary_key_columns=['order_id'], 89 | columns=['order_id', 'amount', 'customer_id'] 90 | ) 91 | }} 92 | 93 | ``` 94 | 95 | ### compare_and_classify_relation_rows ([source](macros/compare_and_classify_relation_rows.sql)) 96 | 97 | A wrapper to `compare_and_classify_query_results`, except it takes two [Relations](https://docs.getdbt.com/reference/dbt-classes#relation) (instead of two queries). 98 | 99 | Each relation must have the same columns with the same names, but they do not have to be in the same order. 100 | 101 | #### Arguments 102 | 103 | - `a_relation` and `b_relation`: The [relations](https://docs.getdbt.com/reference/dbt-classes#relation) you want to compare. 104 | - `primary_key_columns` (required): A list of primary key column(s) used to join the queries together for comparison. 105 | - `columns` (optional): The columns present in the two queries you want to compare. Build long lists with a few exclusions with `dbt_utils.get_filtered_columns_in_relation`, or pass `None` and the macro will find all intersecting columns automatically. 106 | - `sample_limit`: Number of sample records to return per status. Defaults to 20. 
107 | 108 | #### Usage 109 | 110 | ```sql 111 | 112 | {% set old_relation = adapter.get_relation( 113 | database = "old_database", 114 | schema = "old_schema", 115 | identifier = "fct_orders" 116 | ) -%} 117 | 118 | {% set dbt_relation = ref('fct_orders') %} 119 | 120 | {{ audit_helper.compare_and_classify_relation_rows( 121 | a_relation = old_relation, 122 | b_relation = dbt_relation, 123 | primary_key_columns = ["order_id"], 124 | columns = None 125 | ) }} 126 | 127 | ``` 128 | 129 | ### quick_are_queries_identical ([source](macros/quick_are_queries_identical.sql)) 130 | 131 | On supported adapters (currently Snowflake and BigQuery), take a hash of all rows in two queries and compare them. 132 | 133 | This can be calculated relatively quickly compared to other macros in this package and can efficiently provide reassurance that a refactor introduced no changes. 134 | 135 | #### Output 136 | 137 | | are_tables_identical | 138 | |----------------------| 139 | | true | 140 | 141 | #### Arguments 142 | 143 | - `a_query` and `b_query`: The queries you want to compare. 144 | - `columns` (required): The columns present in the two queries you want to compare. 145 | 146 | #### Usage 147 | 148 | ```sql 149 | 150 | {% set old_query %} 151 | select * from old_database.old_schema.dim_product 152 | {% endset %} 153 | 154 | {% set new_query %} 155 | select * from {{ ref('dim_product') }} 156 | {% endset %} 157 | 158 | {{ audit_helper.quick_are_queries_identical( 159 | query_a = old_query, 160 | query_b = new_query, 161 | columns=['order_id', 'amount', 'customer_id'] 162 | ) 163 | }} 164 | 165 | ``` 166 | 167 | ### quick_are_relations_identical ([source](macros/quick_are_relations_identical.sql)) 168 | 169 | A wrapper to `quick_are_queries_identical`, except it takes two [Relations](https://docs.getdbt.com/reference/dbt-classes#relation) (instead of two queries). 
170 | 171 | Each relation must have the same columns with the same names, but they do not have to be in the same order. Build long lists with a few exclusions with `dbt_utils.get_filtered_columns_in_relation`, or pass `None` and the macro will find all intersecting columns automatically. 172 | 173 | #### Usage 174 | 175 | ```sql 176 | 177 | {% set old_relation = adapter.get_relation( 178 | database = "old_database", 179 | schema = "old_schema", 180 | identifier = "fct_orders" 181 | ) -%} 182 | 183 | {% set dbt_relation = ref('fct_orders') %} 184 | 185 | {{ audit_helper.quick_are_relations_identical( 186 | a_relation = old_relation, 187 | b_relation = dbt_relation, 188 | columns = None 189 | ) }} 190 | 191 | ``` 192 | 193 | ### compare_row_counts ([source](macros/compare_row_counts.sql)) 194 | 195 | This macro does a simple comparison of the row counts in two relations. 196 | 197 | #### Output 198 | 199 | Calling this macro on two different relations will return a very simple table comparing the row counts in each relation. 200 | 201 | | relation_name | total_records | 202 | |----------------------------------------------|---------------:| 203 | | target_database.target_schema.my_a_relation | 34,231 | 204 | | target_database.target_schema.my_b_relation | 24,789 | 205 | 206 | #### Arguments 207 | 208 | - `a_relation` and `b_relation`: The [Relations](https://docs.getdbt.com/reference/dbt-classes#relation) you want to compare. 
209 | 210 | #### Usage 211 | 212 | ```sql 213 | 214 | {% set old_relation = adapter.get_relation( 215 | database = "old_database", 216 | schema = "old_schema", 217 | identifier = "fct_orders" 218 | ) -%} 219 | 220 | {% set dbt_relation = ref('fct_orders') %} 221 | 222 | {{ audit_helper.compare_row_counts( 223 | a_relation = old_relation, 224 | b_relation = dbt_relation 225 | ) }} 226 | 227 | ``` 228 | 229 | ## Compare Columns 230 | 231 | ### compare_which_query_columns_differ ([source](macros/compare_which_query_columns_differ.sql)) 232 | 233 | This macro generates SQL that can be used to detect which columns returned by two queries contain _any_ value level changes. 234 | 235 | It does not return the magnitude of the change, only whether or not a difference has occurred. Only records that exist in both queries (as determined by the primary key) are considered. 236 | 237 | #### Output 238 | 239 | The generated query returns whether or not each column has any differences: 240 | 241 | | column_name | has_difference | 242 | |-------------|----------------| 243 | | order_id | False | 244 | | customer_id | False | 245 | | order_date | True | 246 | | status | False | 247 | | amount | True | 248 | 249 | #### Arguments 250 | 251 | - `a_query` and `b_query`: The queries to compare 252 | - `primary_key_columns` (required): A list of primary key column(s) used to join the queries together for comparison. 253 | - `columns` (required): The columns present in the two queries you want to compare. 254 | 255 | ### compare_which_relation_columns_differ ([source](macros/compare_which_relation_columns_differ.sql)) 256 | 257 | A wrapper to `compare_which_query_columns_differ`, except it takes two [Relations](https://docs.getdbt.com/reference/dbt-classes#relation) (instead of two queries). 258 | 259 | Each relation must have the same columns with the same names, but they do not have to be in the same order. 
Build long lists with a few exclusions with `dbt_utils.get_filtered_columns_in_relation`, or pass `None` and the macro will find all intersecting columns automatically. 260 | 261 | #### Usage 262 | 263 | ```sql 264 | 265 | {% set old_relation = adapter.get_relation( 266 | database = "old_database", 267 | schema = "old_schema", 268 | identifier = "fct_orders" 269 | ) -%} 270 | 271 | {% set dbt_relation = ref('fct_orders') %} 272 | 273 | {{ audit_helper.compare_which_relation_columns_differ( 274 | a_relation = old_relation, 275 | b_relation = dbt_relation, 276 | primary_key_columns = ["order_id"], 277 | columns = None 278 | ) }} 279 | 280 | ``` 281 | 282 | ```sql 283 | 284 | {% set old_relation = adapter.get_relation( 285 | database = "old_database", 286 | schema = "old_schema", 287 | identifier = "fct_orders" 288 | ) -%} 289 | 290 | {% set dbt_relation = ref('fct_orders') %} 291 | 292 | {% set columns = dbt_utils.get_filtered_columns_in_relation(old_relation, exclude=["loaded_at"]) %} 293 | 294 | {{ audit_helper.compare_which_relation_columns_differ( 295 | a_relation = old_relation, 296 | b_relation = dbt_relation, 297 | primary_key_columns = ["order_id"], 298 | columns = columns 299 | ) }} 300 | 301 | ``` 302 | 303 | ### compare_column_values ([source](macros/compare_column_values.sql)) 304 | 305 | This macro generates SQL that can be used to compare a column's values across two queries. This macro is useful when you've used the `compare_which_query_columns_differ` macro to identify a column with differing values and want to understand how many discrepancies are caused by that column. 
306 | 307 | #### Output 308 | 309 | The generated query returns a summary of the count of rows where the column's values: 310 | 311 | - match perfectly 312 | - differ 313 | - are null in `a` or `b` or both 314 | - are missing from `a` or `b` 315 | 316 | | match_status | count | percent_of_total | 317 | |-----------------------------|-------:|-----------------:| 318 | | ✅: perfect match | 37,721 | 79.03 | 319 | | ✅: both are null | 5,789 | 12.13 | 320 | | 🤷: missing from a | 5 | 0.01 | 321 | | 🤷: missing from b | 20 | 0.04 | 322 | | 🤷: value is null in a only | 59 | 0.12 | 323 | | 🤷: value is null in b only | 73 | 0.15 | 324 | | ❌: ‍values do not match | 4,064 | 8.51 | 325 | 326 | #### Arguments 327 | 328 | - `a_query` and `b_query`: The queries you want to compare. 329 | - `primary_key`: The primary key of the model. Used to sort unmatched results for row-by-row validation. Must be a unique key (unique and never `null`) in both tables, otherwise the join won't work as expected. 330 | - `column_to_compare`: The column you want to compare. 331 | - `emojis` (optional): Boolean argument that defaults to `true` and displays ✅, 🤷 and ❌ for easier visual scanning. If you don't want to include emojis in the output, set it to `false`. 332 | - `a_relation_name` and `b_relation_name` (optional): Names of the queries you want displayed in the output. Default is `a` and `b`. 
333 | 334 | #### Usage 335 | 336 | ```sql 337 | 338 | {% set old_query %} 339 | select * from old_database.old_schema.dim_product 340 | where is_latest 341 | {% endset %} 342 | 343 | {% set new_query %} 344 | select * from {{ ref('dim_product') }} 345 | {% endset %} 346 | 347 | {{ audit_helper.compare_column_values( 348 | a_query = old_query, 349 | b_query = new_query, 350 | primary_key = "product_id", 351 | column_to_compare = "status" 352 | ) }} 353 | 354 | ``` 355 | 356 | ### compare_all_columns ([source](macros/compare_all_columns.sql)) 357 | 358 | Similar to `compare_column_values`, except it can be used to compare _all_ columns' values across two _relations_. This macro is useful when you've used the `compare_queries` macro and found that a significant number of your records don't match and want to understand how many discrepancies are caused by each column. 359 | 360 | #### Output 361 | 362 | By default, the generated query returns a summary of the count of rows where each column's values: 363 | 364 | - match perfectly 365 | - differ 366 | - are null in `a` or `b` or both 367 | - are missing from `a` or `b` 368 | 369 | | column_name | perfect_match | null_in_a | null_in_b | missing_from_a | missing_from_b | conflicting_values | 370 | |-------|-------:|------:|-----------------:|------:|------:|------:| 371 | | order_id | 10 | 0 | 0 | 0 | 0 | 0 | 372 | | order_date | 2 | 0 | 0 | 0 | 0 | 8 | 373 | | order_status | 6 | 4 | 4 | 0 | 0 | 0 | 374 | 375 | Setting the `summarize` argument to `false` lets you check the match status of a specific column value of a specific row: 376 | 377 | | primary_key | column_name | perfect_match | null_in_a | null_in_b | missing_from_a | missing_from_b | conflicting_values | 378 | |-------|-------|-------:|------:|-----------------:|------:|------:|------:| 379 | | 1 | order_id | true | false | false | false | false | false | 380 | | 1 | order_date | false | false | false | false | false | true | 381 | | 1 | order_status | false 
| true | true | false | false | false | 382 | | ... | ... | ... | ... | ... | ... | ... | ... | 383 | 384 | #### Arguments 385 | 386 | - `a_relation` and `b_relation`: The [relations](https://docs.getdbt.com/reference/dbt-classes#relation) you want to compare. Any two relations that have the same columns can be used. 387 | - `primary_key`: The primary key of the model (or concatenated sql to create the primary key). Used to sort unmatched results for row-by-row validation. Must be a unique key (unique and never `null`) in both tables, otherwise the join won't work as expected. 388 | - `exclude_columns` (optional): Any columns you wish to exclude from the validation. 389 | - `summarize` (optional): Allows you to switch between a summary or detailed view of the compared data. Defaults to `true`. 390 | 391 | #### Usage 392 | 393 | ```sql 394 | 395 | {% set old_relation = adapter.get_relation( 396 | database = "old_database", 397 | schema = "old_schema", 398 | identifier = "fct_orders" 399 | ) -%} 400 | 401 | {% set dbt_relation = ref('fct_orders') %} 402 | 403 | {{ audit_helper.compare_all_columns( 404 | a_relation = old_relation, 405 | b_relation = dbt_relation, 406 | primary_key = "order_id" 407 | ) }} 408 | 409 | ``` 410 | 411 | ### compare_relation_columns ([source](macros/compare_relation_columns.sql)) 412 | 413 | This macro generates SQL that can be used to compare the schema (ordinal position and data types of columns) of two relations. 
This is especially useful when: 414 | 415 | - Comparing a new version of a relation with an old one, to make sure that the structure is the same 416 | - Helping figure out why a `union` of two relations won't work (often because the data types are different) 417 | 418 | #### Output 419 | 420 | | column_name | a_ordinal_position | b_ordinal_position | a_data_type | b_data_type | has_ordinal_position_match | has_data_type_match | in_a_only | in_b_only | in_both | 421 | |-------------|--------------------|--------------------|-------------------|-------------------| -------------------------- | ------------------- | --------- | --------- | ------- | 422 | | order_id | 1 | 1 | integer | integer | True | True | False | False | True | 423 | | customer_id | 2 | 2 | integer | integer | True | True | False | False | True | 424 | | order_date | 3 | 3 | timestamp | date | True | False | False | False | True | 425 | | status | 4 | 5 | character varying | character varying | False | True | False | False | True | 426 | | amount | 5 | 4 | bigint | bigint | False | True | False | False | True | 427 | 428 | Note: For adapters other than BigQuery, Postgres, Redshift, and Snowflake, the ordinal position is inferred based on the response from dbt Core's `adapter.get_columns_in_relation()`, as opposed to being loaded from the information schema. 429 | 430 | #### Arguments 431 | 432 | - `a_relation` and `b_relation`: The [relations](https://docs.getdbt.com/reference/dbt-classes#relation) you want to compare. 
433 | 434 | #### Usage 435 | 436 | ```sql 437 | 438 | {% set old_relation = adapter.get_relation( 439 | database = "old_database", 440 | schema = "old_schema", 441 | identifier = "fct_orders" 442 | ) -%} 443 | 444 | {% set dbt_relation = ref('fct_orders') %} 445 | 446 | {{ audit_helper.compare_relation_columns( 447 | a_relation=old_relation, 448 | b_relation=dbt_relation 449 | ) }} 450 | 451 | ``` 452 | 453 | ## Advanced Usage 454 | 455 | ### Print Output To Logs 456 | 457 | You may want to print the output of the query generated by an audit helper macro to your logs (instead of previewing the results). 458 | 459 | To do so, you can alternatively store the results of your query and print it to the logs. 460 | 461 | For example, using the `compare_column_values` macro: 462 | 463 | ```sql 464 | {% set old_query %} 465 | select * from old_database.old_schema.dim_product 466 | where is_latest 467 | {% endset %} 468 | 469 | {% set new_query %} 470 | select * from {{ ref('dim_product') }} 471 | {% endset %} 472 | 473 | {% set audit_query = audit_helper.compare_column_values( 474 | a_query = old_query, 475 | b_query = new_query, 476 | primary_key = "product_id", 477 | column_to_compare = "status" 478 | ) %} 479 | 480 | {% set audit_results = run_query(audit_query) %} 481 | 482 | {% if execute %} 483 | {% do audit_results.print_table() %} 484 | {% endif %} 485 | ``` 486 | 487 | The `.print_table()` function is not compatible with dbt Cloud, so an adjustment needs to be made in order to print the results. 
Add the following code to a new macro file: 488 | 489 | ```sql 490 | {% macro print_audit_output() %} 491 | {%- set columns_to_compare=adapter.get_columns_in_relation(ref('fct_orders')) -%} 492 | 493 | {% set old_etl_relation_query %} 494 | select * from public.dim_product 495 | {% endset %} 496 | 497 | {% set new_etl_relation_query %} 498 | select * from {{ ref('fct_orders') }} 499 | {% endset %} 500 | 501 | {% if execute %} 502 | {% for column in columns_to_compare %} 503 | {{ log('Comparing column "' ~ column.name ~'"', info=True) }} 504 | {% set audit_query = audit_helper.compare_column_values( 505 | a_query=old_etl_relation_query, 506 | b_query=new_etl_relation_query, 507 | primary_key="order_id", 508 | column_to_compare=column.name 509 | ) %} 510 | 511 | {% set audit_results = run_query(audit_query) %} 512 | 513 | {% do log(audit_results.column_names, info=True) %} 514 | {% for row in audit_results.rows %} 515 | {% do log(row.values(), info=True) %} 516 | {% endfor %} 517 | {% endfor %} 518 | {% endif %} 519 | 520 | {% endmacro %} 521 | ``` 522 | 523 | To run the macro, execute `dbt run-operation print_audit_output` in the command bar. 524 | 525 | ### Use Output For Custom Singular Test 526 | 527 | If desired, you can use the audit helper macros to add a dbt test to your project to protect against unwanted changes to your data outputs. 528 | 529 | For example, using the `compare_all_columns` macro, you could set up a test that will fail if any column values do not match. 530 | 531 | Users can configure what exactly constitutes a value match or failure. If there is a test failure, results can be inspected in the warehouse. The primary key and the column name can be included in the test output that gets written to the warehouse. This enables the user to join test results to relevant tables in your dev or prod schema to investigate the error. 
532 | 533 | _Note: this test should only be used on (and will only work on) models that have a primary key that is reliably `unique` and `not_null`. [Generic dbt tests](https://docs.getdbt.com/docs/building-a-dbt-project/tests#generic-tests) should be used to ensure the model being tested meets the requirements of `unique` and `not_null`._ 534 | 535 | To create a test for the `stg_customers` model, create a custom test 536 | in the `tests` subdirectory of your dbt project that looks like this: 537 | 538 | ```sql 539 | {{ 540 | audit_helper.compare_all_columns( 541 | a_relation=ref('stg_customers'), -- in a test, this ref will compile as your dev or PR schema. 542 | b_relation=api.Relation.create(database='dbt_db', schema='analytics_prod', identifier='stg_customers'), -- you can explicitly write a relation to select your production schema, or any other db/schema/table you'd like to use for comparison testing. 543 | exclude_columns=['updated_at'], 544 | primary_key='id' 545 | ) 546 | }} 547 | where not perfect_match 548 | ``` 549 | 550 | The `where not perfect_match` statement is an example of a filter you can apply to define what constitutes a test failure. The test will fail if any rows don't meet the requirement of a perfect match. Failures would include: 551 | 552 | - If the primary key exists in both relations, but one model has a null value in a column. 553 | - If a primary key is missing from one relation. 554 | - If the primary key exists in both relations, but the value conflicts. 
555 | 556 | If you'd like the test to only fail when there are conflicting values, you could configure it like this: 557 | 558 | ```sql 559 | {{ 560 | audit_helper.compare_all_columns( 561 | a_relation=ref('stg_customers'), 562 | b_relation=api.Relation.create(database='dbt_db', schema='analytics_prod', identifier='stg_customers'), 563 | primary_key='id' 564 | ) 565 | }} 566 | where conflicting_values 567 | ``` 568 | 569 | If you want to create test results that include columns from the model itself for easier inspection, that can be written into the test: 570 | 571 | ```sql 572 | {{ 573 | audit_helper.compare_all_columns( 574 | a_relation=ref('stg_customers'), 575 | b_relation=api.Relation.create(database='dbt_db', schema='analytics_prod', identifier='stg_customers'), 576 | exclude_columns=['updated_at'], 577 | primary_key='id' 578 | ) 579 | }} 580 | left join {{ ref('stg_customers') }} using(id) 581 | ``` 582 | 583 | This structure also allows for the test to group or filter by any attribute in the model or in the macro's output as part of the test, for example: 584 | 585 | ```sql 586 | with base_test_cte as ( 587 | {{ 588 | audit_helper.compare_all_columns( 589 | a_relation=ref('stg_customers'), 590 | b_relation=api.Relation.create(database='dbt_db', schema='analytics_prod', identifier='stg_customers'), 591 | exclude_columns=['updated_at'], 592 | primary_key='id' 593 | ) 594 | }} 595 | left join {{ ref('stg_customers') }} using(id) 596 | where conflicting_values 597 | ) 598 | select 599 | status, -- assume there's a "status" column in stg_customers 600 | count(distinct case when conflicting_values then id end) as conflicting_values 601 | from base_test_cte 602 | group by 1 603 | ``` 604 | 605 | You can write a `compare_all_columns` test on an individual table, and the test will be run as part of a full test suite run - `dbt test --select stg_customers`. 
606 | 607 | If you want to [store results in the warehouse for further analysis](https://docs.getdbt.com/docs/building-a-dbt-project/tests#storing-test-failures), add the `--store-failures` flag. 608 | 609 | ## Legacy Macros 610 | 611 | ### compare_queries ([source](macros/compare_queries.sql)) 612 | 613 | > [!TIP] 614 | > Consider `compare_and_classify_query_results` instead 615 | 616 | This macro generates SQL that can be used to do a row-by-row comparison of two queries. This macro is particularly useful when you want to check that a refactored model (or a model that you are moving over from a legacy system) is identical to the original. `compare_queries` provides flexibility when: 617 | 618 | - You need to filter out records from one of the relations. 619 | - You need to rename or recast some columns to get them to match up. 620 | - You only want to compare a small number of columns, so it's easier to write the columns you want to compare, rather than the columns you want to exclude. 621 | 622 | #### Output 623 | 624 | By default, the generated query returns a summary of the count of rows that are unique to `a`, unique to `b`, and identical: 625 | 626 | | in_a | in_b | count | percent_of_total | 627 | |-------|-------|------:|-----------------:| 628 | | True | True | 6870 | 99.74 | 629 | | True | False | 9 | 0.13 | 630 | | False | True | 9 | 0.13 | 631 | 632 | Setting the `summarize` argument to `false` lets you check which rows do not match between relations: 633 | 634 | | order_id | order_date | status | in_a | in_b | 635 | |----------|------------|-----------|-------|-------| 636 | | 1 | 2018-01-01 | completed | True | False | 637 | | 1 | 2018-01-01 | returned | False | True | 638 | | 2 | 2018-01-02 | completed | True | False | 639 | | 2 | 2018-01-02 | returned | False | True | 640 | 641 | #### Arguments 642 | 643 | - `a_query` and `b_query`: The queries you want to compare. 
644 | - `primary_key` (optional): The primary key of the model (or concatenated sql to create the primary key). Used to sort unmatched results for row-by-row validation. 645 | - `summarize` (optional): Allows you to switch between a summary or detailed view of the compared data. Accepts `true` or `false` values. Defaults to `true`. 646 | - `limit` (optional): Allows you to limit the number of rows returned when `summarize = False`. Defaults to `None` (no limit). 647 | 648 | #### Usage 649 | 650 | ```sql 651 | 652 | {% set old_query %} 653 | select 654 | id as order_id, 655 | amount, 656 | customer_id 657 | from old_database.old_schema.fct_orders 658 | {% endset %} 659 | 660 | {% set new_query %} 661 | select 662 | order_id, 663 | amount, 664 | customer_id 665 | from {{ ref('fct_orders') }} 666 | {% endset %} 667 | 668 | {{ audit_helper.compare_queries( 669 | a_query = old_query, 670 | b_query = new_query, 671 | primary_key = "order_id" 672 | ) }} 673 | 674 | ``` 675 | 676 | ### compare_relations ([source](macros/compare_relations.sql)) 677 | 678 | > [!TIP] 679 | > Consider `compare_and_classify_relation_rows` instead 680 | 681 | A wrapper to `compare_queries`, except it takes two [Relations](https://docs.getdbt.com/reference/dbt-classes#relation) (instead of two queries). 682 | 683 | Each relation must have the same columns with the same names, but they do not have to be in the same order. Use `exclude_columns` if some columns only exist in one relation. 684 | 685 | #### Arguments 686 | 687 | - `a_relation` and `b_relation`: The [relations](https://docs.getdbt.com/reference/dbt-classes#relation) you want to compare. 688 | - `primary_key` (optional): The primary key of the model (or concatenated sql to create the primary key). Used to sort unmatched results for row-by-row validation. 689 | - `exclude_columns` (optional): Any columns you wish to exclude from the validation. 
690 | - `summarize` (optional): Allows you to switch between a summary or detailed view of the compared data. Accepts `true` or `false` values. Defaults to `true`. 691 | - `limit` (optional): Allows you to limit the number of rows returned when `summarize = False`. Defaults to `None` (no limit). 692 | 693 | #### Usage 694 | 695 | ```sql 696 | 697 | {% set old_relation = adapter.get_relation( 698 | database = "old_database", 699 | schema = "old_schema", 700 | identifier = "fct_orders" 701 | ) -%} 702 | 703 | {% set dbt_relation = ref('fct_orders') %} 704 | 705 | {{ audit_helper.compare_relations( 706 | a_relation = old_relation, 707 | b_relation = dbt_relation, 708 | exclude_columns = ["loaded_at"], 709 | primary_key = "order_id" 710 | ) }} 711 | 712 | ``` 713 | 714 | ## Internal Macros 715 | 716 | Macros prefixed with an `_` (such as those in the `utils/` subdirectory) are for private use. They are not documented or contracted and can change without notice. 717 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # dbt-audit-helper releases 2 | 3 | ## When do we release? 4 | There are a few scenarios that might prompt a release: 5 | 6 | | Scenario | Release type | 7 | |--------------------------------------------|--------------| 8 | | Breaking changes to existing macros | minor | 9 | | New functionality | minor | 10 | | Fixes to existing macros | patch | 11 | 12 | ## Release process 13 | 14 | 1. Begin a new release by clicking [here](https://github.com/dbt-labs/dbt-audit-helper/releases/new) 15 | 1. Click "Choose a tag", then paste your version number (with no "v" in the name), then click "Create new tag: x.y.z on publish" 16 | - The “Release title” will be identical to the tag name 17 | 1. Click the "Generate release notes" button 18 | 1. 
Copy and paste the generated release notes into `CHANGELOG.md`, commit, and merge into the `main` branch 19 | 1. Click the "Publish release" button 20 | - This will automatically create an "Assets" section containing: 21 | - Source code (zip) 22 | - Source code (tar.gz) 23 | -------------------------------------------------------------------------------- /dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'audit_helper' 2 | version: '0.4.0' 3 | config-version: 2 4 | 5 | require-dbt-version: [">=1.2.0", "<2.0.0"] 6 | 7 | target-path: "target" 8 | clean-targets: ["target", "dbt_packages"] 9 | macro-paths: ["macros"] 10 | log-path: "logs" 11 | -------------------------------------------------------------------------------- /integration_tests/.env/postgres.env: -------------------------------------------------------------------------------- 1 | POSTGRES_HOST=localhost 2 | POSTGRES_USER=root 3 | DBT_ENV_SECRET_POSTGRES_PASS=password 4 | POSTGRES_PORT=5432 5 | POSTGRES_DATABASE=audit_helper_test 6 | POSTGRES_SCHEMA=audit_helper_integration_tests_postgres 7 | -------------------------------------------------------------------------------- /integration_tests/analyses/compare_column_values_smoke_test.sql: -------------------------------------------------------------------------------- 1 | {% set a_query %} 2 | select * from {{ ref('data_compare_relations__a_relation') }} 3 | {% endset %} 4 | 5 | {% set audit_query = audit_helper.compare_column_values( 6 | a_query=a_query, 7 | b_query=a_query, 8 | primary_key="col_a", 9 | column_to_compare="col_b" 10 | ) %} 11 | 12 | {{ audit_query }} 13 | 14 | {% if execute %} 15 | 16 | {% set audit_results = run_query(audit_query) %} 17 | 18 | {% do audit_results.print_table() %} 19 | 20 | {% endif %} 21 | -------------------------------------------------------------------------------- /integration_tests/analyses/compare_relation_columns_smoke_test.sql: 
-------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_relations__a_relation') %} 2 | 3 | {% set compare_relation_columns_sql = audit_helper.compare_relation_columns( 4 | a_relation, 5 | a_relation 6 | ) %} 7 | 8 | {{ compare_relation_columns_sql }} 9 | 10 | {% if execute %} 11 | 12 | {% set results = run_query(compare_relation_columns_sql) %} 13 | {% do results.print_table() %} 14 | 15 | {% endif %} 16 | -------------------------------------------------------------------------------- /integration_tests/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'audit_helper_integration_tests' 2 | version: '1.0' 3 | config-version: 2 4 | 5 | profile: 'integration_tests' 6 | 7 | model-paths: ["models"] 8 | analysis-paths: ["analyses"] 9 | test-paths: ["tests"] 10 | seed-paths: ["seeds"] 11 | macro-paths: ["macros"] 12 | 13 | target-path: "target" # directory which will store compiled SQL files 14 | clean-targets: # directories to be removed by `dbt clean` 15 | - "target" 16 | - "dbt_packages" 17 | 18 | seeds: 19 | +quote_columns: false 20 | 21 | vars: 22 | compare_queries_summarize: true 23 | primary_key_columns_var: ['col1'] 24 | columns_var: ['col1'] 25 | event_time_var: 26 | quick_are_queries_identical_cols: ['col1'] 27 | 28 | flags: 29 | send_anonymous_usage_stats: False 30 | use_colors: True -------------------------------------------------------------------------------- /integration_tests/macros/unit_tests/struct_generation_macros.sql: -------------------------------------------------------------------------------- 1 | {#- Renders a bare SQL function name chosen by target.type: object_construct on snowflake, json_object on bigquery, map on databricks. Any other target type raises a compiler error, but only when `execute` is true, so parsing the project on an unsupported adapter does not fail. Takes no arguments; the caller supplies the argument list. 
-#} {%- macro _basic_json_function() -%} 2 | {%- if target.type == 'snowflake' -%} 3 | object_construct 4 | {%- elif target.type == 'bigquery' -%} 5 | json_object 6 | {%- elif target.type == 'databricks' -%} 7 | map 8 | {%- elif execute -%} 9 | {# Only raise exception if it's actually being called, not during parsing #} 10 | {%- do
exceptions.raise_compiler_error("Unknown adapter '"~ target.type ~ "'") -%} 11 | {%- endif -%} 12 | {%- endmacro -%} 13 | 14 | {#- Renders SQL that wraps the `json` argument in the adapter's JSON-parsing call: json_parse(...) on redshift, from_json(..., schema_of_json(...)) on databricks, parse_json(...) on snowflake and bigquery. Any other target type raises a compiler error, but only when `execute` is true. NOTE(review): `json` is interpolated verbatim, so it is presumably expected to already be a valid SQL expression such as a quoted string literal; confirm against callers. -#} {% macro _complex_json_function(json) %} 15 | 16 | {% if target.type == 'redshift' %} 17 | json_parse({{ json }}) 18 | {% elif target.type == 'databricks' %} 19 | from_json({{ json }}, schema_of_json({{ json }})) 20 | {% elif target.type in ['snowflake', 'bigquery'] %} 21 | parse_json({{ json }}) 22 | {% elif execute %} 23 | {# Only raise exception if it's actually being called, not during parsing #} 24 | {%- do exceptions.raise_compiler_error("Unknown adapter '"~ target.type ~ "'") -%} 25 | {% endif %} 26 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_all_columns_concat_pk_with_summary.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_all_columns__market_of_choice_produce__concat_pk')%} 2 | 3 | {% set b_relation=ref('data_compare_all_columns__albertsons_produce__concat_pk') %} 4 | 5 | {{ audit_helper.compare_all_columns( 6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | primary_key=dbt_utils.generate_surrogate_key(['produce_category', 'id']) 9 | ) }} 10 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_all_columns_concat_pk_without_summary.sql: -------------------------------------------------------------------------------- 1 | {% set 
a_relation=ref('data_compare_all_columns__market_of_choice_produce__concat_pk')%} 2 | 3 | {% set b_relation=ref('data_compare_all_columns__albertsons_produce__concat_pk') %} 4 | 5 | {{ audit_helper.compare_all_columns( 6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | primary_key=dbt_utils.generate_surrogate_key(['produce_category', 'id']), 9 | summarize=false 10 | ) }} 11 | 
-------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_all_columns_where_clause.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_all_columns__market_of_choice_produce')%} 2 | 3 | {% set b_relation=ref('data_compare_all_columns__albertsons_produce') %} 4 | 5 | {{ audit_helper.compare_all_columns( 6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | primary_key="id", 9 | summarize=false 10 | ) }} 11 | where not perfect_match -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_all_columns_with_summary.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_all_columns__market_of_choice_produce')%} 2 | 3 | {% set b_relation=ref('data_compare_all_columns__albertsons_produce') %} 4 | 5 | {{ audit_helper.compare_all_columns( 6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | primary_key="id" 9 | ) }} 10 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_all_columns_with_summary_and_exclude.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_all_columns__market_of_choice_produce')%} 2 | 3 | {% set b_relation=ref('data_compare_all_columns__albertsons_produce') %} 4 | 5 | {{ audit_helper.compare_all_columns( 6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | primary_key="id", 9 | exclude_columns=['ripeness'] 10 | ) }} 11 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_all_columns_without_summary.sql: -------------------------------------------------------------------------------- 1 | {% set 
a_relation=ref('data_compare_all_columns__market_of_choice_produce')%} 2 | 3 | {% set b_relation=ref('data_compare_all_columns__albertsons_produce') %} 4 | 5 | {{ audit_helper.compare_all_columns( 6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | primary_key="id", 9 | summarize=false 10 | ) }} 11 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_and_classify_query_results.sql: -------------------------------------------------------------------------------- 1 | -- this has no tests, it's just making sure that the introspective queries for event_time actually run 2 | 3 | {{ 4 | audit_helper.compare_and_classify_query_results( 5 | a_query="select * from " ~ ref('unit_test_model_a') ~ " where 1=1", 6 | b_query="select * from " ~ ref('unit_test_model_b') ~ " where 1=1", 7 | primary_key_columns=['id'], 8 | columns=['id', 'col1', 'col2'], 9 | event_time='created_at' 10 | ) 11 | }} -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_queries.sql: -------------------------------------------------------------------------------- 1 | {% set a_query %} 2 | select * from {{ ref('data_compare_relations__a_relation') }} 3 | {% endset %} 4 | 5 | {% set b_query %} 6 | select * from {{ ref('data_compare_relations__b_relation') }} 7 | {% endset %} 8 | 9 | {{ audit_helper.compare_queries( 10 | a_query=a_query, 11 | b_query=b_query, 12 | primary_key="order_id" 13 | ) }} 14 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_queries_concat_pk_without_summary.sql: -------------------------------------------------------------------------------- 1 | {% set a_query %} 2 | select * from {{ ref('data_compare_relations__a_relation') }} 3 | {% endset %} 4 | 5 | {% set b_query %} 6 | select * from {{ ref('data_compare_relations__b_relation') }} 7 | {% endset %} 8 | 9
| {{ audit_helper.compare_queries( 10 | a_query=a_query, 11 | b_query=b_query, 12 | primary_key=dbt_utils.generate_surrogate_key(['col_a', 'col_b']), 13 | summarize=false 14 | ) }} 15 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_queries_with_summary.sql: -------------------------------------------------------------------------------- 1 | {% set a_query %} 2 | select * from {{ ref('data_compare_relations__a_relation') }} 3 | {% endset %} 4 | 5 | {% set b_query %} 6 | select * from {{ ref('data_compare_relations__b_relation') }} 7 | {% endset %} 8 | 9 | {{ audit_helper.compare_queries( 10 | a_query=a_query, 11 | b_query=b_query, 12 | primary_key="col_a" 13 | ) }} -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_queries_without_summary.sql: -------------------------------------------------------------------------------- 1 | {% set a_query %} 2 | select * from {{ ref('data_compare_relations__a_relation') }} 3 | {% endset %} 4 | 5 | {% set b_query %} 6 | select * from {{ ref('data_compare_relations__b_relation') }} 7 | {% endset %} 8 | 9 | {{ audit_helper.compare_queries( 10 | a_query=a_query, 11 | b_query=b_query, 12 | primary_key="col_a", 13 | summarize=false 14 | ) }} -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_relation_columns.sql: -------------------------------------------------------------------------------- 1 | 2 | with audit_helper_results as ( 3 | {{ audit_helper.compare_relation_columns( 4 | a_relation=ref('data_compare_relation_columns_a'), 5 | b_relation=ref('data_compare_relation_columns_b') 6 | ) }} 7 | ) 8 | 9 | select 10 | --These need to be cast, otherwise they are technically typed as "sql_identifier" or "cardinal_number" on Redshift 11 | {{ "lower(" if target.type == 'snowflake' }} cast(column_name as {{ 
dbt.type_string() }}) {{ ")" if target.type == 'snowflake' }} as column_name, 12 | cast(a_ordinal_position as {{ dbt.type_int() }}) as a_ordinal_position, 13 | cast(b_ordinal_position as {{ dbt.type_int() }}) as b_ordinal_position, 14 | --not checking the specific datatypes, as long as they match/don't match as expected then that's still checking the audit behaviour 15 | has_ordinal_position_match, 16 | has_data_type_match 17 | from audit_helper_results -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_relations_concat_pk_without_summary.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_relations__a_relation')%} 2 | 3 | {% set b_relation=ref('data_compare_relations__b_relation') %} 4 | 5 | {{ audit_helper.compare_relations( 6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | primary_key=dbt_utils.generate_surrogate_key(['col_a', 'col_b']), 9 | summarize=false 10 | ) }} 11 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_relations_with_exclude.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_relations__a_relation')%} 2 | 3 | {% set b_relation=ref('data_compare_relations__b_relation') %} 4 | 5 | {{ audit_helper.compare_relations( 6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | exclude_columns=['col_b'], 9 | primary_key="col_a" 10 | ) }} 11 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_relations_with_summary.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_relations__a_relation')%} 2 | 3 | {% set b_relation=ref('data_compare_relations__b_relation') %} 4 | 5 | {{ audit_helper.compare_relations( 
6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | primary_key="col_a" 9 | ) }} 10 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_relations_without_exclude.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_relations__a_relation')%} 2 | 3 | {% set b_relation=ref('data_compare_relations__b_relation') %} 4 | 5 | {{ audit_helper.compare_relations( 6 | a_relation=a_relation, 7 | b_relation=b_relation 8 | ) }} 9 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_relations_without_summary.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_relations__a_relation')%} 2 | 3 | {% set b_relation=ref('data_compare_relations__b_relation') %} 4 | 5 | {{ audit_helper.compare_relations( 6 | a_relation=a_relation, 7 | b_relation=b_relation, 8 | primary_key="col_a", 9 | summarize=false 10 | ) }} 11 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_row_counts.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_relations__a_relation')%} 2 | 3 | {% set b_relation=ref('data_compare_relations__b_relation') %} 4 | 5 | select 6 | case 7 | when relation_name = '{{ a_relation }}' 8 | then 'a' 9 | else 'b' 10 | end as relation_name, 11 | total_records 12 | 13 | from ( 14 | 15 | {{ audit_helper.compare_row_counts( 16 | a_relation=a_relation, 17 | b_relation=b_relation 18 | ) }} 19 | 20 | ) as base_query -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_which_columns_differ.sql: -------------------------------------------------------------------------------- 1 | 
{% set a_relation=ref('data_compare_which_columns_differ_a')%} 2 | 3 | {% set b_relation=ref('data_compare_which_columns_differ_b') %} 4 | 5 | -- lowercase for CI 6 | 7 | select 8 | lower(column_name) as column_name, 9 | has_difference 10 | from ( 11 | 12 | {{ audit_helper.compare_which_relation_columns_differ( 13 | a_relation=a_relation, 14 | b_relation=b_relation, 15 | primary_key_columns=["id"] 16 | ) }} 17 | ) as macro_output 18 | -------------------------------------------------------------------------------- /integration_tests/models/data_tests/compare_which_columns_differ_exclude_cols.sql: -------------------------------------------------------------------------------- 1 | {% set a_relation=ref('data_compare_which_columns_differ_a')%} 2 | 3 | {% set b_relation=ref('data_compare_which_columns_differ_b') %} 4 | 5 | {% set pk_cols = ['id'] %} 6 | {% set cols = ['id','value_changes','becomes_not_null','does_not_change'] %} 7 | 8 | {% if target.type == 'snowflake' %} 9 | {% set pk_cols = pk_cols | map("upper") | list %} 10 | {% set cols = cols | map("upper") | list %} 11 | {% endif %} 12 | 13 | select 14 | lower(column_name) as column_name, 15 | has_difference 16 | from ( 17 | 18 | {{ audit_helper.compare_which_relation_columns_differ( 19 | a_relation=a_relation, 20 | b_relation=b_relation, 21 | primary_key_columns=pk_cols, 22 | columns=cols 23 | ) }} 24 | 25 | ) as macro_output -------------------------------------------------------------------------------- /integration_tests/models/data_tests/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: compare_queries 5 | data_tests: 6 | - dbt_utils.equality: 7 | compare_model: ref('expected_results__compare_relations_without_exclude') 8 | 9 | - name: compare_queries_concat_pk_without_summary 10 | data_tests: 11 | - dbt_utils.equality: 12 | compare_model: ref('expected_results__compare_without_summary') 13 | 14 | - name: 
compare_queries_with_summary 15 | data_tests: 16 | - dbt_utils.equality: 17 | compare_model: ref('expected_results__compare_with_summary') 18 | 19 | - name: compare_queries_without_summary 20 | data_tests: 21 | - dbt_utils.equality: 22 | compare_model: ref('expected_results__compare_without_summary') 23 | 24 | - name: compare_relations_with_summary 25 | data_tests: 26 | - dbt_utils.equality: 27 | compare_model: ref('expected_results__compare_with_summary') 28 | 29 | - name: compare_relations_without_summary 30 | data_tests: 31 | - dbt_utils.equality: 32 | compare_model: ref('expected_results__compare_without_summary') 33 | 34 | - name: compare_relations_with_exclude 35 | data_tests: 36 | - dbt_utils.equality: 37 | compare_model: ref('expected_results__compare_relations_with_exclude') 38 | 39 | - name: compare_relations_without_exclude 40 | data_tests: 41 | - dbt_utils.equality: 42 | compare_model: ref('expected_results__compare_relations_without_exclude') 43 | 44 | - name: compare_all_columns_with_summary 45 | data_tests: 46 | - dbt_utils.equality: 47 | compare_model: ref('expected_results__compare_all_columns_with_summary') 48 | 49 | - name: compare_all_columns_without_summary 50 | data_tests: 51 | - dbt_utils.equality: 52 | compare_model: ref('expected_results__compare_all_columns_without_summary') 53 | 54 | - name: compare_all_columns_concat_pk_with_summary 55 | data_tests: 56 | - dbt_utils.equality: 57 | compare_model: ref('expected_results__compare_all_columns_concat_pk_with_summary') 58 | 59 | - name: compare_all_columns_concat_pk_without_summary 60 | data_tests: 61 | - dbt_utils.equality: 62 | compare_model: ref('expected_results__compare_all_columns_concat_pk_without_summary') 63 | 64 | - name: compare_all_columns_with_summary_and_exclude 65 | data_tests: 66 | - dbt_utils.equality: 67 | compare_model: ref('expected_results__compare_all_columns_with_summary_and_exclude') 68 | 69 | - name: compare_all_columns_where_clause 70 | data_tests: 71 | - 
dbt_utils.equality: 72 | compare_model: ref('expected_results__compare_all_columns_where_clause') 73 | 74 | - name: compare_relation_columns 75 | data_tests: 76 | - dbt_utils.equality: 77 | compare_model: ref('expected_results__compare_relation_columns') 78 | 79 | - name: compare_relations_concat_pk_without_summary 80 | data_tests: 81 | - dbt_utils.equality: 82 | compare_model: ref('expected_results__compare_without_summary') 83 | 84 | - name: compare_which_columns_differ 85 | data_tests: 86 | - dbt_utils.equality: 87 | compare_model: ref('expected_results__compare_which_columns_differ') 88 | 89 | - name: compare_which_columns_differ_exclude_cols 90 | data_tests: 91 | - dbt_utils.equality: 92 | compare_model: ref('expected_results__compare_which_columns_differ_exclude_cols') 93 | 94 | - name: compare_row_counts 95 | data_tests: 96 | - dbt_utils.equality: 97 | compare_model: ref('expected_results__compare_row_counts') 98 | -------------------------------------------------------------------------------- /integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql: -------------------------------------------------------------------------------- 1 | select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at -------------------------------------------------------------------------------- /integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql: -------------------------------------------------------------------------------- 1 | select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at -------------------------------------------------------------------------------- /integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql: -------------------------------------------------------------------------------- 1 | {{ config(tags=['skip' if (target.type in ['postgres']) else 'runnable']) }} 2 | 3 | {% if 
target.type != 'redshift' %} {# branch on adapter type, not profile target name: _basic_json_function() has no Redshift implementation, so Redshift builds col2 with json_parse in the else branch #} 4 | 5 | select 6 | 1 as id, 7 | 'John Doe' as col1, 8 | {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 9 | 10 | {% else %} 11 | 12 | select 13 | 1 AS id, 14 | 'John Doe' AS col1, 15 | json_parse('{"street": "123 Main St", "city": "Anytown", "state": "CA"}') AS col2 16 | {% endif %} -------------------------------------------------------------------------------- /integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql: -------------------------------------------------------------------------------- 1 | {{ config(tags=['skip' if (target.type in ['postgres']) else 'runnable']) }} 2 | 3 | {% if target.type != 'redshift' %} {# branch on adapter type, not profile target name: _basic_json_function() has no Redshift implementation, so Redshift builds col2 with json_parse in the else branch #} 4 | 5 | select 6 | 1 as id, 7 | 'John Doe' as col1, 8 | {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 9 | 10 | {% else %} 11 | 12 | select 13 | 1 AS id, 14 | 'John Doe' AS col1, 15 | json_parse('{"street": "123 Main St", "city": "Anytown", "state": "CA"}') AS col2 16 | {% endif %} -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_compare_classify.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | audit_helper.compare_and_classify_query_results( 3 | "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", 4 | "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", 5 | primary_key_columns=var('primary_key_columns_var'), 6 | columns=var('columns_var'), 7 | event_time=var('event_time_var') 8 | ) 9 | }} -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_compare_classify.yml: -------------------------------------------------------------------------------- 1 | unit_tests: 2 | - name: compare_classify_identical_tables 3 | model:
unit_compare_classify 4 | 5 | given: 6 | - input: ref('unit_test_model_a') 7 | rows: 8 | - { "id": 1, "col1": "abc", "col2": "def" } 9 | - { "id": 2, "col1": "hij", "col2": "klm" } 10 | - { "id": 3, "col1": "nop", "col2": "qrs" } 11 | - input: ref('unit_test_model_b') 12 | rows: 13 | - { "id": 1, "col1": "abc", "col2": "def" } 14 | - { "id": 2, "col1": "hij", "col2": "klm" } 15 | - { "id": 3, "col1": "nop", "col2": "qrs" } 16 | 17 | expect: 18 | rows: 19 | - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 3} 20 | - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} 21 | - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} 22 | 23 | overrides: 24 | vars: 25 | columns_var: ['id', 'col1', 'col2'] 26 | event_time_var: 27 | primary_key_columns_var: ['id'] 28 | 29 | - name: compare_classify_identical_tables_event_time_filter 30 | model: unit_compare_classify 31 | overrides: 32 | vars: 33 | columns_var: ['id', 'col1', 'col2', 'created_at'] 34 | event_time_var: 'created_at' 35 | primary_key_columns_var: ['id'] 36 | macros: 37 | audit_helper._get_comparison_bounds: 38 | "min_event_time": "2024-01-02" 39 | "max_event_time": "2024-01-03" 40 | "event_time": 'created_at' 41 | 42 | given: 43 | - input: ref('unit_test_model_a') 44 | rows: 45 | - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } 46 | - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } 47 | - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } 48 | - input: ref('unit_test_model_b') 49 | rows: 50 | - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } 51 | - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } 52 | 53 | expect: 54 | rows: 55 | - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 2} 56 | - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 2} 57 | 58 | - name: 
compare_classify_all_statuses 59 | model: unit_compare_classify 60 | overrides: 61 | vars: 62 | columns_var: ['id', 'col1', 'col2'] 63 | event_time_var: 64 | primary_key_columns_var: ['id'] 65 | given: 66 | - input: ref('unit_test_model_a') 67 | rows: 68 | - { "id": 1, "col1": "abc", "col2": "def" } 69 | - { "id": 2, "col1": "hij", "col2": "klm" } 70 | - { "id": 3, "col1": "nop", "col2": "qrs" } 71 | - input: ref('unit_test_model_b') 72 | rows: 73 | - { "id": 1, "col1": "abc", "col2": "def" } 74 | - { "id": 2, "col1": "changed", "col2": "values" } 75 | - { "id": 4, "col1": "nop", "col2": "qrs" } 76 | 77 | expect: 78 | rows: 79 | - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} 80 | - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} 81 | - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} 82 | - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} 83 | - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} 84 | config: 85 | tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 86 | 87 | - name: compare_classify_identical_tables_multiple_pk_cols 88 | model: unit_compare_classify 89 | overrides: 90 | vars: 91 | columns_var: ['id', 'id_2', 'col1', 'col2'] 92 | event_time_var: 93 | primary_key_columns_var: ['id', 'id_2'] 94 | given: 95 | - input: ref('unit_test_model_a') 96 | rows: 97 | - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } 98 | - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } 99 | - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } 100 | - input: ref('unit_test_model_b') 101 | rows: 102 | - { "id": 12, "id_2": 3, "col1": "abc", "col2": "def" } 103 | - { "id": 1, "id_2": 23, "col1": "hij", "col2": "klm" } 104 | - { "id": 3, "id_2": 4, "col1": "nop", "col2": "qrs" } 105 | expect: 106 | rows: 107 | - {"dbt_audit_row_status": 
'identical', 'id': 12, "id_2": 3, "dbt_audit_num_rows_in_status": 3} 108 | - {"dbt_audit_row_status": 'identical', 'id': 1, "id_2": 23, "dbt_audit_num_rows_in_status": 3} 109 | - {"dbt_audit_row_status": 'identical', 'id': 3, "id_2": 4, "dbt_audit_num_rows_in_status": 3} 110 | 111 | - name: compare_classify_identical_tables_single_null_pk 112 | model: unit_compare_classify 113 | description: "`nonunique_pk` status checks whether a PK is unique. It's intended to avoid arbitrary comparisons, not protect against null records (that's what constraints or tests are for)." 114 | 115 | given: 116 | - input: ref('unit_test_model_a') 117 | rows: 118 | - { "id": , "col1": "abc", "col2": "def" } 119 | - { "id": 2, "col1": "hij", "col2": "klm" } 120 | - { "id": 3, "col1": "nop", "col2": "qrs" } 121 | - input: ref('unit_test_model_b') 122 | rows: 123 | - { "id": , "col1": "abc", "col2": "def" } 124 | - { "id": 2, "col1": "hij", "col2": "klm" } 125 | - { "id": 3, "col1": "nop", "col2": "qrs" } 126 | 127 | expect: 128 | rows: 129 | - {"dbt_audit_row_status": 'identical', 'id': , dbt_audit_num_rows_in_status: 3} 130 | - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} 131 | - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} 132 | 133 | overrides: 134 | vars: 135 | columns_var: ['id', 'col1', 'col2'] 136 | event_time_var: 137 | primary_key_columns_var: ['id'] 138 | config: 139 | tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 140 | 141 | - name: compare_classify_identical_tables_multiple_null_pk 142 | model: unit_compare_classify 143 | 144 | given: 145 | - input: ref('unit_test_model_a') 146 | rows: 147 | - { "id": , "col1": "abc", "col2": "def" } 148 | - { "id": , "col1": "hij", "col2": "klm" } 149 | - { "id": 3, "col1": "nop", "col2": "qrs" } 150 | - input: ref('unit_test_model_b') 151 | rows: 152 | - { "id": , "col1": "abc", "col2": 
"def" } 153 | - { "id": , "col1": "hij", "col2": "klm" } 154 | - { "id": 3, "col1": "nop", "col2": "qrs" } 155 | 156 | expect: 157 | rows: 158 | - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} 159 | - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 2} 160 | - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} 161 | 162 | overrides: 163 | vars: 164 | columns_var: ['id', 'col1', 'col2'] 165 | event_time_var: 166 | primary_key_columns_var: ['id'] 167 | config: 168 | tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 169 | 170 | - name: compare_classify_identical_tables_multi_null_pk_dupe_rows 171 | description: All rows with a null ID are identical. They should be returned as individual rows instead of being combined 172 | model: unit_compare_classify 173 | 174 | given: 175 | - input: ref('unit_test_model_a') 176 | rows: 177 | - { "id": , "col1": "abc", "col2": "def" } 178 | - { "id": , "col1": "abc", "col2": "def" } 179 | - { "id": 3, "col1": "nop", "col2": "qrs" } 180 | - input: ref('unit_test_model_b') 181 | rows: 182 | - { "id": , "col1": "abc", "col2": "def" } 183 | - { "id": , "col1": "abc", "col2": "def" } 184 | - { "id": , "col1": "abc", "col2": "def" } 185 | - { "id": 3, "col1": "nop", "col2": "qrs" } 186 | 187 | expect: 188 | rows: 189 | - {"dbt_audit_row_status": 'identical', 'id': 3, dbt_audit_num_rows_in_status: 1} 190 | - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} 191 | - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} 192 | - {"dbt_audit_row_status": 'nonunique_pk', 'id': , dbt_audit_num_rows_in_status: 3} 193 | 194 | overrides: 195 | vars: 196 | columns_var: ['id', 'col1', 'col2'] 197 | event_time_var: 198 | primary_key_columns_var: ['id'] 199 | config: 200 | tags: "{{ 'temporary_skip' if (target.type in 
['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-redshift/issues/821 201 | 202 | - name: compare_classify_all_statuses_different_column_set 203 | model: unit_compare_classify 204 | overrides: 205 | vars: 206 | primary_key_columns_var: ['id'] 207 | columns_var: ['id', 'col1'] 208 | event_time_var: 209 | given: 210 | - input: ref('unit_test_model_a') 211 | rows: 212 | - { "id": 1, "col1": "abc", "col2": "def" } 213 | - { "id": 2, "col1": "hij", "col2": "klm" } 214 | - { "id": 3, "col1": "nop", "col2": "qrs" } 215 | - input: ref('unit_test_model_b') 216 | rows: 217 | - { "id": 1, "col1": "abc" } 218 | - { "id": 2, "col1": "ddd" } 219 | - { "id": 4, "col1": "nop" } 220 | 221 | expect: 222 | rows: 223 | - {"dbt_audit_row_status": 'added', 'id': 4, dbt_audit_num_rows_in_status: 1} 224 | - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 1} 225 | - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} 226 | - {"dbt_audit_row_status": 'modified', 'id': 2, dbt_audit_num_rows_in_status: 1} 227 | - {"dbt_audit_row_status": 'removed', 'id': 3, dbt_audit_num_rows_in_status: 1} 228 | config: 229 | tags: "{{ 'temporary_skip' if (target.type in ['redshift']) else 'runnable' }}" #https://github.com/dbt-labs/dbt-core/issues/10167 230 | 231 | - name: compare_classify_identical_tables_without_pk_in_cols_list 232 | model: unit_compare_classify 233 | 234 | given: 235 | - input: ref('unit_test_model_a') 236 | rows: 237 | - { "id": 1, "col1": "abc", "col2": "def" } 238 | - { "id": 2, "col1": "hij", "col2": "klm" } 239 | - { "id": 3, "col1": "nop", "col2": "qrs" } 240 | - input: ref('unit_test_model_b') 241 | rows: 242 | - { "id": 1, "col1": "abc", "col2": "def" } 243 | - { "id": 2, "col1": "hij", "col2": "klm" } 244 | - { "id": 3, "col1": "nop", "col2": "qrs" } 245 | 246 | expect: 247 | rows: 248 | - {"dbt_audit_row_status": 'identical', 'id': 1, dbt_audit_num_rows_in_status: 3} 249 | - {"dbt_audit_row_status": 
'identical', 'id': 3, dbt_audit_num_rows_in_status: 3} 250 | - {"dbt_audit_row_status": 'identical', 'id': 2, dbt_audit_num_rows_in_status: 3} 251 | 252 | overrides: 253 | vars: 254 | columns_var: ['col1', 'col2'] 255 | event_time_var: 256 | primary_key_columns_var: ['id'] 257 | -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | audit_helper.compare_and_classify_query_results( 3 | "select * from " ~ ref('unit_test_struct_model_a') ~ " where 1=1", 4 | "select * from " ~ ref('unit_test_struct_model_b') ~ " where 1=1", 5 | primary_key_columns=var('primary_key_columns_var'), 6 | columns=var('columns_var'), 7 | event_time=var('event_time_var') 8 | ) 9 | }} -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_compare_classify_struct.yml: -------------------------------------------------------------------------------- 1 | unit_tests: 2 | - name: compare_classify_simple_struct 3 | model: unit_compare_classify_struct 4 | given: 5 | - input: ref('unit_test_struct_model_a') 6 | format: sql 7 | fixture: simple_struct 8 | - input: ref('unit_test_struct_model_b') 9 | format: sql 10 | fixture: simple_struct 11 | expect: 12 | rows: 13 | - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} 14 | overrides: 15 | vars: 16 | columns_var: ['id', 'col1', 'col2'] 17 | event_time_var: 18 | primary_key_columns_var: ['id'] 19 | 20 | - name: unit_compare_classify_struct_identical_values_different_order 21 | model: unit_compare_classify_struct 22 | description: Objects' keys are generally sorted alphabetically, so sort order is ignored. 
23 | given: 24 | - input: ref('unit_test_struct_model_a') 25 | format: sql 26 | fixture: simple_struct 27 | - input: ref('unit_test_struct_model_b') 28 | format: sql 29 | fixture: simple_struct_different_order 30 | expect: 31 | rows: 32 | - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} 33 | overrides: 34 | vars: 35 | columns_var: ['id', 'col1', 'col2'] 36 | event_time_var: 37 | primary_key_columns_var: ['id'] 38 | config: 39 | #Databricks cares about the order and considers it a difference. We're not trying to have identical behaviour across warehouses so that's OK. 40 | tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" 41 | 42 | - name: unit_compare_classify_struct_identical_values_different_order_dbx 43 | model: unit_compare_classify_struct 44 | description: Most platforms don't care about sort order. Databricks does. 45 | given: 46 | - input: ref('unit_test_struct_model_a') 47 | format: sql 48 | fixture: simple_struct 49 | - input: ref('unit_test_struct_model_b') 50 | format: sql 51 | fixture: simple_struct_different_order 52 | expect: 53 | rows: 54 | - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} 55 | - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} 56 | overrides: 57 | vars: 58 | columns_var: ['id', 'col1', 'col2'] 59 | event_time_var: 60 | primary_key_columns_var: ['id'] 61 | config: 62 | #Only for databricks 63 | tags: "{{ 'skip' if (target.type not in ['databricks']) else 'runnable' }}" 64 | 65 | - name: unit_compare_classify_struct_removed_key 66 | model: unit_compare_classify_struct 67 | given: 68 | - input: ref('unit_test_struct_model_a') 69 | format: sql 70 | fixture: simple_struct 71 | - input: ref('unit_test_struct_model_b') 72 | format: sql 73 | fixture: simple_struct_removed_key 74 | expect: 75 | rows: 76 | - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} 77 | - {"id": 1, 
"dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} 78 | overrides: 79 | vars: 80 | columns_var: ['id', 'col1', 'col2'] 81 | event_time_var: 82 | primary_key_columns_var: ['id'] 83 | # config: 84 | # tags: "{{ 'skip' if (target.type in ['databricks']) else 'runnable' }}" #Can't do set operations on even simple JSON cols 85 | 86 | - name: compare_classify_complex_struct 87 | model: unit_compare_classify_struct 88 | given: 89 | - input: ref('unit_test_struct_model_a') 90 | format: sql 91 | fixture: complex_struct 92 | - input: ref('unit_test_struct_model_b') 93 | format: sql 94 | fixture: complex_struct 95 | expect: 96 | rows: 97 | - {"id": 1, "dbt_audit_row_status": "identical", "dbt_audit_num_rows_in_status": 1} 98 | overrides: 99 | vars: 100 | columns_var: ['id', 'col1', 'col2'] 101 | event_time_var: 102 | primary_key_columns_var: ['id'] 103 | # config: 104 | # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet 105 | 106 | - name: compare_classify_complex_struct_different_values 107 | model: unit_compare_classify_struct 108 | given: 109 | - input: ref('unit_test_struct_model_a') 110 | format: sql 111 | fixture: complex_struct 112 | - input: ref('unit_test_struct_model_b') 113 | format: sql 114 | fixture: complex_struct_different_value 115 | 116 | expect: 117 | rows: 118 | - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} 119 | - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} 120 | overrides: 121 | vars: 122 | columns_var: ['id', 'col1', 'col2'] 123 | event_time_var: 124 | primary_key_columns_var: ['id'] 125 | # config: 126 | # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet 127 | 128 | - name: unit_compare_classify_complex_struct_identical_values_different_order 129 | model: unit_compare_classify_struct 130 | 
description: Snowflake sorts objects' keys alphabetically, but respects the order items are added to arrays so differences are detected. 131 | given: 132 | - input: ref('unit_test_struct_model_a') 133 | format: sql 134 | fixture: complex_struct 135 | - input: ref('unit_test_struct_model_b') 136 | format: sql 137 | fixture: complex_struct_different_order 138 | expect: 139 | rows: 140 | - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} 141 | - {"id": 1, "dbt_audit_row_status": "modified", "dbt_audit_num_rows_in_status": 1} 142 | overrides: 143 | vars: 144 | columns_var: ['id', 'col1', 'col2'] 145 | event_time_var: 146 | primary_key_columns_var: ['id'] 147 | # config: 148 | # tags: "{{ 'skip' if (target.type in ['redshift', 'databricks']) else 'runnable' }}" #haven't ported these to be multi-warehouse yet 149 | -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_compare_queries.sql: -------------------------------------------------------------------------------- 1 | 2 | {{ 3 | audit_helper.compare_queries( 4 | "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", 5 | "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", 6 | summarize = var('compare_queries_summarize') 7 | ) 8 | }} -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_compare_queries.yml: -------------------------------------------------------------------------------- 1 | unit_tests: 2 | - name: identical_records_compare_queries 3 | model: unit_compare_queries 4 | description: The world's most basic unit test. 
5 | 6 | given: 7 | - input: ref('unit_test_model_a') 8 | rows: 9 | - { "id": 1, "col1": "abc", "col2": "def" } 10 | - { "id": 2, "col1": "hij", "col2": "klm" } 11 | - { "id": 3, "col1": "nop", "col2": "qrs" } 12 | - input: ref('unit_test_model_b') 13 | rows: 14 | - { "id": 1, "col1": "abc", "col2": "def" } 15 | - { "id": 2, "col1": "hij", "col2": "klm" } 16 | - { "id": 3, "col1": "nop", "col2": "qrs" } 17 | 18 | expect: 19 | rows: 20 | - {"in_a": true, "in_b": true} 21 | 22 | overrides: 23 | vars: 24 | compare_queries_summarize: true 25 | 26 | - name: identical_records_compare_queries_no_summarize 27 | model: unit_compare_queries 28 | description: The world's second most basic unit test. 29 | 30 | given: 31 | - input: ref('unit_test_model_a') 32 | rows: 33 | - { "id": 1, "col1": "abc", "col2": "def" } 34 | - { "id": 2, "col1": "hij", "col2": "klm" } 35 | - { "id": 3, "col1": "nop", "col2": "qrs" } 36 | - input: ref('unit_test_model_b') 37 | rows: 38 | - { "id": 1, "col1": "abc", "col2": "def" } 39 | - { "id": 2, "col1": "hij", "col2": "klm" } 40 | - { "id": 3, "col1": "nop", "col2": "qrs" } 41 | 42 | expect: 43 | rows: [] 44 | 45 | overrides: 46 | vars: 47 | compare_queries_summarize: false 48 | -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.sql: -------------------------------------------------------------------------------- 1 | {% set pk_cols = var('primary_key_columns_var') %} 2 | {% set cols = var('columns_var') %} 3 | 4 | {% if target.type == 'snowflake' and flags.WHICH == 'run' %} 5 | {% set pk_cols = pk_cols | map("upper") | list %} 6 | {% set cols = cols | map("upper") | list %} 7 | {% endif %} 8 | 9 | {{ 10 | audit_helper.compare_which_query_columns_differ( 11 | a_query = "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", 12 | b_query = "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", 13 | primary_key_columns = pk_cols, 
14 | columns = cols, 15 | event_time = var('event_time_var') 16 | ) 17 | }} -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_compare_which_query_columns_differ.yml: -------------------------------------------------------------------------------- 1 | unit_tests: 2 | - name: compare_cols_identical_tables 3 | model: unit_compare_which_query_columns_differ 4 | 5 | given: 6 | - input: ref('unit_test_model_a') 7 | rows: 8 | - { "id": 1, "col1": "abc", "col2": "def" } 9 | - { "id": 2, "col1": "hij", "col2": "klm" } 10 | - { "id": 3, "col1": "nop", "col2": "qrs" } 11 | - input: ref('unit_test_model_b') 12 | rows: 13 | - { "id": 1, "col1": "abc", "col2": "def" } 14 | - { "id": 2, "col1": "hij", "col2": "klm" } 15 | - { "id": 3, "col1": "nop", "col2": "qrs" } 16 | 17 | expect: 18 | rows: 19 | - {"column_name": 'id', 'has_difference': false} 20 | - {"column_name": 'col1', 'has_difference': false} 21 | - {"column_name": 'col2', 'has_difference': false} 22 | 23 | overrides: 24 | vars: 25 | columns_var: ['id', 'col1', 'col2'] 26 | event_time_var: 27 | primary_key_columns_var: ['id'] 28 | config: 29 | tags: "{{ 'skip' if (target.type in ['snowflake']) else 'runnable' }}" #Case sensitivity 30 | 31 | - name: compare_cols_identical_tables_event_time_filter 32 | model: unit_compare_which_query_columns_differ 33 | overrides: 34 | vars: 35 | columns_var: ['id', 'col1', 'col2', 'created_at'] 36 | event_time_var: 'created_at' 37 | primary_key_columns_var: ['id'] 38 | macros: 39 | audit_helper._get_comparison_bounds: 40 | "min_event_time": "2024-01-02" 41 | "max_event_time": "2024-01-03" 42 | "event_time": 'created_at' 43 | 44 | given: 45 | - input: ref('unit_test_model_a') 46 | rows: 47 | - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } 48 | - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } 49 | - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": 
'2024-01-03' } 50 | - input: ref('unit_test_model_b') 51 | rows: 52 | - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } 53 | - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } 54 | 55 | expect: 56 | rows: 57 | - {"column_name": 'id', "has_difference": false} 58 | - {"column_name": 'col1', "has_difference": false} 59 | - {"column_name": 'col2', "has_difference": false} 60 | - {"column_name": 'created_at', "has_difference": false} 61 | config: 62 | tags: "{{ 'skip' if (target.type in ['snowflake']) else 'runnable' }}" #Case sensitivity 63 | 64 | - name: compare_cols_identical_tables_snowflake 65 | model: unit_compare_which_query_columns_differ 66 | 67 | given: 68 | - input: ref('unit_test_model_a') 69 | rows: 70 | - { "id": 1, "col1": "abc", "col2": "def" } 71 | - { "id": 2, "col1": "hij", "col2": "klm" } 72 | - { "id": 3, "col1": "nop", "col2": "qrs" } 73 | - input: ref('unit_test_model_b') 74 | rows: 75 | - { "id": 1, "col1": "abc", "col2": "def" } 76 | - { "id": 2, "col1": "hij", "col2": "klm" } 77 | - { "id": 3, "col1": "nop", "col2": "qrs" } 78 | 79 | expect: 80 | rows: 81 | - {"column_name": 'ID', 'has_difference': false} 82 | - {"column_name": 'COL1', 'has_difference': false} 83 | - {"column_name": 'COL2', 'has_difference': false} 84 | 85 | overrides: 86 | vars: 87 | columns_var: ['ID', 'COL1', 'COL2'] 88 | event_time_var: 89 | primary_key_columns_var: ['ID'] 90 | config: 91 | tags: "{{ 'skip' if (target.type not in ['snowflake']) else 'runnable' }}" #Case sensitivity 92 | 93 | - name: compare_cols_identical_tables_event_time_filter_snowflake 94 | model: unit_compare_which_query_columns_differ 95 | overrides: 96 | vars: 97 | columns_var: ['ID', 'COL1', 'COL2', 'CREATED_AT'] 98 | event_time_var: 'CREATED_AT' 99 | primary_key_columns_var: ['ID'] 100 | macros: 101 | audit_helper._get_comparison_bounds: 102 | "min_event_time": "2024-01-02" 103 | "max_event_time": "2024-01-03" 104 | "event_time": 'created_at' 105 | 106 | 
given: 107 | - input: ref('unit_test_model_a') 108 | rows: 109 | - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } 110 | - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } 111 | - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } 112 | - input: ref('unit_test_model_b') 113 | rows: 114 | - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } 115 | - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } 116 | 117 | expect: 118 | rows: 119 | - {"column_name": 'ID', "has_difference": false} 120 | - {"column_name": 'COL1', "has_difference": false} 121 | - {"column_name": 'COL2', "has_difference": false} 122 | - {"column_name": 'CREATED_AT', "has_difference": false} 123 | config: 124 | tags: "{{ 'skip' if (target.type not in ['snowflake']) else 'runnable' }}" #Case sensitivity -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.sql: -------------------------------------------------------------------------------- 1 | {# Unit-test wrapper: renders every entry returned by audit_helper._ensure_all_pks_are_in_column_set as one row (col, row_index), so the returned list and its order can be asserted row by row. #} {% set results = 2 | audit_helper._ensure_all_pks_are_in_column_set( 3 | primary_key_columns=var('primary_key_columns_var', ['a_column_with_a_large_unwieldy_name']), 4 | columns=var('columns_var', ['b_column_with_a_large_unwieldy_name']), 5 | ) 6 | %} 7 | {# Read both vars with the same fallbacks as the macro call above so this guard also renders when the vars are undefined. When both lists are empty the loop below emits no SQL, so a zero-row select supplies the result shape instead. #} 8 | {% if (var('primary_key_columns_var', ['a_column_with_a_large_unwieldy_name']) | length == 0) and (var('columns_var', ['b_column_with_a_large_unwieldy_name']) | length == 0) %} 9 | -- need to still provide a table shape 10 | select 'abcdefabcdef' as col, 1 as row_index 11 | limit 0 12 | {% endif %} 13 | {# One select per returned column name, chained with union all; row_index is the 1-based position in the returned list. #} 14 | {% for result in results %} 15 | select '{{ result }}' as col, {{ loop.index }} as row_index 16 | {% if not loop.last %} 17 | union all 18 | {% endif %} 19 | {% endfor %} -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_ensure_all_pks_are_in_column_set.yml:
-------------------------------------------------------------------------------- 1 | unit_tests: 2 | - name: ensure_all_pks_in_columns 3 | model: unit_ensure_all_pks_are_in_column_set 4 | given: [] 5 | overrides: 6 | vars: 7 | primary_key_columns_var: ['pk1', 'pk2'] 8 | columns_var: ['pk1', 'pk2', 'column_a', 'column_b'] 9 | 10 | expect: 11 | rows: 12 | - {"col": 'pk1', "row_index": 1} 13 | - {"col": 'pk2', "row_index": 2} 14 | - {"col": 'column_a', "row_index": 3} 15 | - {"col": 'column_b', "row_index": 4} 16 | 17 | - name: ensure_all_pks_in_columns_pks_at_end 18 | model: unit_ensure_all_pks_are_in_column_set 19 | description: PKs are specified in `columns` so should be at end of list 20 | given: [] 21 | overrides: 22 | vars: 23 | primary_key_columns_var: ['pk1', 'pk2'] 24 | columns_var: ['column_a', 'column_b', 'pk1', 'pk2'] 25 | 26 | expect: 27 | rows: 28 | - {"col": 'column_a', "row_index": 1} 29 | - {"col": 'column_b', "row_index": 2} 30 | - {"col": 'pk1', "row_index": 3} 31 | - {"col": 'pk2', "row_index": 4} 32 | 33 | - name: ensure_all_pks_in_columns_one_missing_pk 34 | model: unit_ensure_all_pks_are_in_column_set 35 | description: PK specified in `columns` should be at end of list, missing PK will be added at front 36 | given: [] 37 | overrides: 38 | vars: 39 | primary_key_columns_var: ['pk1', 'pk2'] 40 | columns_var: ['column_a', 'column_b', 'pk2'] 41 | 42 | expect: 43 | rows: 44 | - {"col": 'pk1', "row_index": 1} 45 | - {"col": 'column_a', "row_index": 2} 46 | - {"col": 'column_b', "row_index": 3} 47 | - {"col": 'pk2', "row_index": 4} 48 | 49 | - name: ensure_all_pks_in_columns_empty_sets 50 | model: unit_ensure_all_pks_are_in_column_set 51 | given: [] 52 | overrides: 53 | vars: 54 | primary_key_columns_var: [] 55 | columns_var: [] 56 | 57 | expect: 58 | rows: [] 59 | 60 | - name: ensure_all_pks_in_columns_no_pks 61 | model: unit_ensure_all_pks_are_in_column_set 62 | given: [] 63 | overrides: 64 | vars: 65 | primary_key_columns_var: [] 66 | columns_var: 
['column_a', 'column_b'] 67 | 68 | expect: 69 | rows: 70 | - {"col": 'column_a', "row_index": 1} 71 | - {"col": 'column_b', "row_index": 2} 72 | 73 | - name: ensure_all_pks_in_columns_no_cols 74 | model: unit_ensure_all_pks_are_in_column_set 75 | given: [] 76 | overrides: 77 | vars: 78 | primary_key_columns_var: ['pk1', 'pk2'] 79 | columns_var: [] 80 | 81 | expect: 82 | rows: 83 | - {"col": 'pk1', "row_index": 1} 84 | - {"col": 'pk2', "row_index": 2} 85 | 86 | - name: ensure_all_pks_in_columns_caps_pk 87 | model: unit_ensure_all_pks_are_in_column_set 88 | given: [] 89 | overrides: 90 | vars: 91 | primary_key_columns_var: ['pk2', 'PK1'] 92 | columns_var: ['pk1', 'pk2', 'column_a', 'column_b'] 93 | 94 | expect: 95 | rows: 96 | - {"col": 'pk1', "row_index": 1} 97 | - {"col": 'pk2', "row_index": 2} 98 | - {"col": 'column_a', "row_index": 3} 99 | - {"col": 'column_b', "row_index": 4} 100 | 101 | - name: ensure_all_pks_in_columns_caps_col 102 | model: unit_ensure_all_pks_are_in_column_set 103 | given: [] 104 | overrides: 105 | vars: 106 | primary_key_columns_var: ['pk2', 'pk1'] 107 | columns_var: ['pk1', 'pk2', 'COLUMN_A', 'column_b'] 108 | 109 | expect: 110 | rows: 111 | - {"col": 'pk1', "row_index": 1} 112 | - {"col": 'pk2', "row_index": 2} 113 | - {"col": 'COLUMN_A', "row_index": 3} 114 | - {"col": 'column_b', "row_index": 4} 115 | 116 | - name: ensure_all_pks_in_columns_caps_pk_in_both 117 | model: unit_ensure_all_pks_are_in_column_set 118 | given: [] 119 | overrides: 120 | vars: 121 | primary_key_columns_var: ['pk2', 'PK1'] 122 | columns_var: ['PK1', 'pk2', 'column_a', 'column_b'] 123 | 124 | expect: 125 | rows: 126 | - {"col": 'PK1', "row_index": 1} 127 | - {"col": 'pk2', "row_index": 2} 128 | - {"col": 'column_a', "row_index": 3} 129 | - {"col": 'column_b', "row_index": 4} 130 | -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql: 
-------------------------------------------------------------------------------- 1 | {{ config(tags=['skip' if (target.type in ['redshift', 'postgres', 'databricks']) else 'runnable']) }} 2 | {# Wrapper model that lets dbt unit tests exercise audit_helper.quick_are_queries_identical on unit_test_model_a vs unit_test_model_b. The compared columns and the event_time column are read from vars (no defaults here). The model is tagged 'skip' on redshift, postgres and databricks. NOTE(review): presumably the macro is not supported on those adapters - confirm. #} 3 | {{ 4 | audit_helper.quick_are_queries_identical( 5 | "select * from " ~ ref('unit_test_model_a') ~ " where 1=1", 6 | "select * from " ~ ref('unit_test_model_b') ~ " where 1=1", 7 | columns=var('quick_are_queries_identical_cols'), 8 | event_time=var('event_time_var') 9 | ) 10 | }} -------------------------------------------------------------------------------- /integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.yml: -------------------------------------------------------------------------------- 1 | unit_tests: 2 | - name: quick_are_queries_identical_identical_tables 3 | model: unit_quick_are_queries_identical 4 | 5 | given: 6 | - input: ref('unit_test_model_a') 7 | rows: 8 | - { "id": 1, "col1": "abc", "col2": "def" } 9 | - { "id": 2, "col1": "hij", "col2": "klm" } 10 | - { "id": 3, "col1": "nop", "col2": "qrs" } 11 | - input: ref('unit_test_model_b') 12 | rows: 13 | - { "id": 1, "col1": "abc", "col2": "def" } 14 | - { "id": 2, "col1": "hij", "col2": "klm" } 15 | - { "id": 3, "col1": "nop", "col2": "qrs" } 16 | 17 | expect: 18 | rows: 19 | - {"are_tables_identical": true} 20 | 21 | overrides: 22 | vars: 23 | quick_are_queries_identical_cols: ['id', 'col1', 'col2'] 24 | event_time_var: 25 | 26 | - name: quick_are_queries_identical_identical_tables_event_time_filter 27 | model: unit_quick_are_queries_identical 28 | overrides: 29 | vars: 30 | quick_are_queries_identical_cols: ['id', 'col1', 'col2', 'created_at'] 31 | event_time_var: 'created_at' 32 | macros: 33 | audit_helper._get_comparison_bounds: 34 | "min_event_time": "2024-01-02" 35 | "max_event_time": "2024-01-03" 36 | "event_time": 'created_at' 37 | 38 | given: 39 | - input: ref('unit_test_model_a') 40 | rows: 41 | - { "id": 1, "col1": "abc", "col2": "def", "created_at": '2024-01-01' } 42 | - 
{ "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } 43 | - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } 44 | - input: ref('unit_test_model_b') 45 | rows: 46 | - { "id": 2, "col1": "hij", "col2": "klm", "created_at": '2024-01-02' } 47 | - { "id": 3, "col1": "nop", "col2": "qrs", "created_at": '2024-01-03' } 48 | 49 | expect: 50 | rows: 51 | - {"are_tables_identical": true} 52 | 53 | - name: quick_are_queries_identical_differences 54 | model: unit_quick_are_queries_identical 55 | overrides: 56 | vars: 57 | quick_are_queries_identical_cols: ['id', 'col1', 'col2'] 58 | event_time_var: 59 | given: 60 | - input: ref('unit_test_model_a') 61 | rows: 62 | - { "id": 1, "col1": "abc", "col2": "def" } 63 | - { "id": 2, "col1": "hij", "col2": "klm" } 64 | - { "id": 3, "col1": "nop", "col2": "qrs" } 65 | - input: ref('unit_test_model_b') 66 | rows: 67 | - { "id": 1, "col1": "abc", "col2": "def" } 68 | - { "id": 2, "col1": "changed", "col2": "values" } 69 | - { "id": 4, "col1": "nop", "col2": "qrs" } 70 | 71 | expect: 72 | rows: 73 | - {"are_tables_identical": false} 74 | 75 | - name: quick_are_queries_identical_identical_tables_with_null_pks 76 | model: unit_quick_are_queries_identical 77 | 78 | given: 79 | - input: ref('unit_test_model_a') 80 | rows: 81 | - { "id":, "col1": "abc", "col2": "def" } 82 | - { "id":, "col1": "hij", "col2": "klm" } 83 | - { "id": 3, "col1": "nop", "col2": "qrs" } 84 | - input: ref('unit_test_model_b') 85 | rows: 86 | - { "id":, "col1": "abc", "col2": "def" } 87 | - { "id":, "col1": "hij", "col2": "klm" } 88 | - { "id": 3, "col1": "nop", "col2": "qrs" } 89 | 90 | expect: 91 | rows: 92 | - {"are_tables_identical": true} 93 | 94 | overrides: 95 | vars: 96 | quick_are_queries_identical_cols: ['id', 'col1', 'col2'] 97 | event_time_var: 98 | -------------------------------------------------------------------------------- /integration_tests/package-lock.yml: 
-------------------------------------------------------------------------------- 1 | packages: 2 | - local: ../ 3 | - package: dbt-labs/dbt_utils 4 | version: 1.1.1 5 | sha1_hash: de2deba3d66ce03d8c02949013650cc9b94f6030 6 | -------------------------------------------------------------------------------- /integration_tests/packages.yml: -------------------------------------------------------------------------------- 1 | 2 | packages: 3 | - local: ../ 4 | -------------------------------------------------------------------------------- /integration_tests/profiles.yml: -------------------------------------------------------------------------------- 1 | integration_tests: 2 | target: postgres 3 | outputs: 4 | postgres: 5 | type: "postgres" 6 | host: "{{ env_var('POSTGRES_HOST') }}" 7 | user: "{{ env_var('POSTGRES_USER') }}" 8 | pass: "{{ env_var('DBT_ENV_SECRET_POSTGRES_PASS') }}" 9 | port: "{{ env_var('POSTGRES_PORT') | as_number }}" 10 | dbname: "{{ env_var('POSTGRES_DATABASE') }}" 11 | schema: "{{ env_var('POSTGRES_SCHEMA') }}" 12 | threads: 5 13 | 14 | redshift: 15 | type: redshift 16 | host: "{{ env_var('REDSHIFT_TEST_HOST') }}" 17 | user: "{{ env_var('REDSHIFT_TEST_USER') }}" 18 | pass: "{{ env_var('REDSHIFT_TEST_PASS') }}" 19 | dbname: "{{ env_var('REDSHIFT_TEST_DBNAME') }}" 20 | port: "{{ env_var('REDSHIFT_TEST_PORT') | as_number }}" 21 | schema: audit_helper_integration_tests_redshift 22 | threads: 8 23 | 24 | bigquery: 25 | type: bigquery 26 | method: service-account 27 | keyfile: "{{ env_var('BIGQUERY_SERVICE_KEY_PATH') }}" 28 | project: "{{ env_var('BIGQUERY_TEST_DATABASE') }}" 29 | schema: audit_helper_integration_tests_bigquery 30 | threads: 8 31 | 32 | snowflake: 33 | type: snowflake 34 | account: "{{ env_var('SNOWFLAKE_TEST_ACCOUNT') }}" 35 | user: "{{ env_var('SNOWFLAKE_TEST_USER') }}" 36 | password: "{{ env_var('SNOWFLAKE_TEST_PASSWORD') }}" 37 | role: "{{ env_var('SNOWFLAKE_TEST_ROLE') }}" 38 | database: "{{ env_var('SNOWFLAKE_TEST_DATABASE') }}" 39 
| warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}" 40 | schema: audit_helper_integration_tests_snowflake 41 | threads: 8 42 | 43 | databricks: 44 | type: databricks 45 | schema: audit_helper_integration_tests_databricks # same audit_helper_integration_tests_<target> naming as the other outputs 46 | host: "{{ env_var('DATABRICKS_TEST_HOST') }}" 47 | http_path: "{{ env_var('DATABRICKS_TEST_HTTP_PATH') }}" 48 | token: "{{ env_var('DATABRICKS_TEST_ACCESS_TOKEN') }}" 49 | threads: 10 50 | 51 | -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_all_columns__albertsons_produce.csv: -------------------------------------------------------------------------------- 1 | id,fruit,ripeness 2 | 1,banana,yellow 3 | 2,banana,brown 4 | 3,banana,brown 5 | 4,orange,green 6 | 5,orange,orange 7 | 6,,brown 8 | 7,orange,orange 9 | 9,apple,mushy 10 | 10,apple, -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_all_columns__albertsons_produce__concat_pk.csv: -------------------------------------------------------------------------------- 1 | produce_category,id,produce,ripeness 2 | vegetable,1,spinach,wilted 3 | fruit,1,banana,yellow 4 | fruit,2,banana,brown 5 | fruit,3,banana,brown 6 | fruit,4,orange,green 7 | fruit,5,orange,orange 8 | fruit,6,,brown 9 | fruit,7,orange,orange 10 | fruit,9,apple,mushy 11 | fruit,10,apple, -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_all_columns__market_of_choice_produce.csv: -------------------------------------------------------------------------------- 1 | id,fruit,ripeness 2 | 1,banana,yellow 3 | 2,banana,green 4 | 3,banana,brown 5 | 4,orange,green 6 | 5,orange,orange 7 | 6,orange,brown 8 | 7,orange, 9 | 8,apple,mushy -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_all_columns__market_of_choice_produce__concat_pk.csv:
-------------------------------------------------------------------------------- 1 | produce_category,id,produce,ripeness 2 | vegetable,1,spinach,wilted 3 | fruit,1,banana,yellow 4 | fruit,2,banana,green 5 | fruit,3,banana,brown 6 | fruit,4,orange,green 7 | fruit,5,orange,orange 8 | fruit,6,orange,brown 9 | fruit,7,orange, 10 | fruit,8,apple,mushy -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_relation_columns_a.csv: -------------------------------------------------------------------------------- 1 | awesome_column,zany_column,brave_column,young_column,cool_column,xcellent_column 2 | testing_is_fun,2022-02-22,1234,9.8765,false,2020-01-01T21:08:17 -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_relation_columns_b.csv: -------------------------------------------------------------------------------- 1 | magnificent_column,zany_column,brave_column,young_column,cool_column,xpeditionary_column,awesome_column 2 | 2022-02-22,my_string_here,1234,9.8765,true,2020-01-01T21:08:17,testing_is_fun -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_relations__a_relation.csv: -------------------------------------------------------------------------------- 1 | col_a,col_b 2 | 1,a 3 | 2,b 4 | -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_relations__b_relation.csv: -------------------------------------------------------------------------------- 1 | col_a,col_b 2 | 1,a 3 | 2,c 4 | -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_which_columns_differ_a.csv: -------------------------------------------------------------------------------- 1 | id,value_changes,becomes_null,becomes_not_null,does_not_change 2 | 1,pink,22,a,dave 3 | 
2,blue,33,,dave 4 | 3,green,44,c,dave 5 | 4,yellow,55,d,dave -------------------------------------------------------------------------------- /integration_tests/seeds/data_compare_which_columns_differ_b.csv: -------------------------------------------------------------------------------- 1 | id,value_changes,becomes_null,becomes_not_null,does_not_change 2 | 1,red,22,a,dave 3 | 2,blue,,b,dave 4 | 3,green,44,c,dave 5 | 4,yellow,55,d,dave -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_all_columns_concat_pk_with_summary.csv: -------------------------------------------------------------------------------- 1 | column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values 2 | ID,8,0,0,2,1,0 3 | PRODUCE,7,0,1,2,1,1 4 | PRODUCE_CATEGORY,8,0,0,2,1,0 5 | RIPENESS,6,1,1,2,1,2 -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_all_columns_concat_pk_without_summary.csv: -------------------------------------------------------------------------------- 1 | primary_key,column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values 2 | 00f0200cfb8e8443dfa3566bd60170a7,PRODUCE_CATEGORY,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 3 | 00f0200cfb8e8443dfa3566bd60170a7,PRODUCE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 4 | 00f0200cfb8e8443dfa3566bd60170a7,ID,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 5 | 00f0200cfb8e8443dfa3566bd60170a7,RIPENESS,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 6 | 0262eff11e473d76cf5e71ba1bb9adde,ID,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE 7 | 0262eff11e473d76cf5e71ba1bb9adde,PRODUCE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE 8 | 0262eff11e473d76cf5e71ba1bb9adde,PRODUCE_CATEGORY,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE 9 | 0262eff11e473d76cf5e71ba1bb9adde,RIPENESS,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE 10 | 231ee7461c22557b0b811bc510df9c3f,PRODUCE_CATEGORY,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE 
11 | 231ee7461c22557b0b811bc510df9c3f,PRODUCE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE 12 | 231ee7461c22557b0b811bc510df9c3f,RIPENESS,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE 13 | 231ee7461c22557b0b811bc510df9c3f,ID,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE 14 | 51f71ec6b715b6071a0b6a9647bce8a7,PRODUCE_CATEGORY,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 15 | 51f71ec6b715b6071a0b6a9647bce8a7,RIPENESS,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 16 | 51f71ec6b715b6071a0b6a9647bce8a7,PRODUCE,FALSE,FALSE,TRUE,FALSE,FALSE,TRUE 17 | 51f71ec6b715b6071a0b6a9647bce8a7,ID,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 18 | 559c0b59e42ff35a37de91977b660800,RIPENESS,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 19 | 559c0b59e42ff35a37de91977b660800,PRODUCE_CATEGORY,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 20 | 559c0b59e42ff35a37de91977b660800,PRODUCE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 21 | 559c0b59e42ff35a37de91977b660800,ID,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 22 | 5f3bd1bba6beca5a23d4cde34a9bd96b,RIPENESS,FALSE,TRUE,FALSE,FALSE,FALSE,TRUE 23 | 5f3bd1bba6beca5a23d4cde34a9bd96b,PRODUCE_CATEGORY,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 24 | 5f3bd1bba6beca5a23d4cde34a9bd96b,ID,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 25 | 5f3bd1bba6beca5a23d4cde34a9bd96b,PRODUCE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 26 | a971c9a048ccd0fd4d282cc2a55734bc,PRODUCE_CATEGORY,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE 27 | a971c9a048ccd0fd4d282cc2a55734bc,RIPENESS,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE 28 | a971c9a048ccd0fd4d282cc2a55734bc,PRODUCE,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE 29 | a971c9a048ccd0fd4d282cc2a55734bc,ID,FALSE,FALSE,FALSE,FALSE,TRUE,FALSE 30 | bd483dcfa375c6fd78c89072de1eea20,PRODUCE_CATEGORY,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 31 | bd483dcfa375c6fd78c89072de1eea20,PRODUCE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 32 | bd483dcfa375c6fd78c89072de1eea20,ID,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 33 | bd483dcfa375c6fd78c89072de1eea20,RIPENESS,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 34 | cfd3543ee591403d825bf0a1618b1709,ID,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 35 | 
cfd3543ee591403d825bf0a1618b1709,PRODUCE_CATEGORY,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 36 | cfd3543ee591403d825bf0a1618b1709,RIPENESS,FALSE,FALSE,FALSE,FALSE,FALSE,TRUE 37 | cfd3543ee591403d825bf0a1618b1709,PRODUCE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 38 | eae3b305c437133aebdd66788f38e262,ID,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 39 | eae3b305c437133aebdd66788f38e262,RIPENESS,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 40 | eae3b305c437133aebdd66788f38e262,PRODUCE_CATEGORY,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 41 | eae3b305c437133aebdd66788f38e262,PRODUCE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 42 | f2448f021cb149747e9ada2531d5116d,PRODUCE_CATEGORY,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 43 | f2448f021cb149747e9ada2531d5116d,RIPENESS,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 44 | f2448f021cb149747e9ada2531d5116d,PRODUCE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE 45 | f2448f021cb149747e9ada2531d5116d,ID,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_all_columns_where_clause.csv: -------------------------------------------------------------------------------- 1 | primary_key,column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values 2 | 2,RIPENESS,false,false,false,false,false,true 3 | 6,FRUIT,false,false,true,false,false,true 4 | 7,RIPENESS,false,true,false,false,false,true 5 | 8,ID,false,false,false,false,true,false 6 | 8,FRUIT,false,false,false,false,true,false 7 | 8,RIPENESS,false,false,false,false,true,false 8 | 9,ID,false,false,false,true,false,false 9 | 9,FRUIT,false,false,false,true,false,false 10 | 9,RIPENESS,false,false,false,true,false,false 11 | 10,ID,false,false,false,true,false,false 12 | 10,FRUIT,false,false,false,true,false,false 13 | 10,RIPENESS,false,false,true,true,false,false -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_all_columns_with_summary.csv: 
-------------------------------------------------------------------------------- 1 | column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values 2 | FRUIT,6,0,1,2,1,1 3 | ID,7,0,0,2,1,0 4 | RIPENESS,5,1,1,2,1,2 -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_all_columns_with_summary_and_exclude.csv: -------------------------------------------------------------------------------- 1 | column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values 2 | FRUIT,6,0,1,2,1,1 3 | ID,7,0,0,2,1,0 -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_all_columns_without_summary.csv: -------------------------------------------------------------------------------- 1 | primary_key,column_name,perfect_match,null_in_a,null_in_b,missing_from_a,missing_from_b,conflicting_values 2 | 1,ID,true,false,false,false,false,false 3 | 2,ID,true,false,false,false,false,false 4 | 3,ID,true,false,false,false,false,false 5 | 4,ID,true,false,false,false,false,false 6 | 5,ID,true,false,false,false,false,false 7 | 6,ID,true,false,false,false,false,false 8 | 7,ID,true,false,false,false,false,false 9 | 8,ID,false,false,false,false,true,false 10 | 9,ID,false,false,false,true,false,false 11 | 10,ID,false,false,false,true,false,false 12 | 1,FRUIT,true,false,false,false,false,false 13 | 2,FRUIT,true,false,false,false,false,false 14 | 3,FRUIT,true,false,false,false,false,false 15 | 4,FRUIT,true,false,false,false,false,false 16 | 5,FRUIT,true,false,false,false,false,false 17 | 6,FRUIT,false,false,true,false,false,true 18 | 7,FRUIT,true,false,false,false,false,false 19 | 8,FRUIT,false,false,false,false,true,false 20 | 9,FRUIT,false,false,false,true,false,false 21 | 10,FRUIT,false,false,false,true,false,false 22 | 1,RIPENESS,true,false,false,false,false,false 23 | 
2,RIPENESS,false,false,false,false,false,true 24 | 3,RIPENESS,true,false,false,false,false,false 25 | 4,RIPENESS,true,false,false,false,false,false 26 | 5,RIPENESS,true,false,false,false,false,false 27 | 6,RIPENESS,true,false,false,false,false,false 28 | 7,RIPENESS,false,true,false,false,false,true 29 | 8,RIPENESS,false,false,false,false,true,false 30 | 9,RIPENESS,false,false,false,true,false,false 31 | 10,RIPENESS,false,false,true,true,false,false -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_relation_columns.csv: -------------------------------------------------------------------------------- 1 | COLUMN_NAME,A_ORDINAL_POSITION,B_ORDINAL_POSITION,HAS_ORDINAL_POSITION_MATCH,HAS_DATA_TYPE_MATCH 2 | awesome_column,1,7,false,true 3 | magnificent_column,,1,false,false 4 | zany_column,2,2,true,false 5 | brave_column,3,3,true,true 6 | young_column,4,4,true,true 7 | cool_column,5,5,true,true 8 | xpeditionary_column,,6,false,false 9 | xcellent_column,6,,false,false -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_relations_with_exclude.csv: -------------------------------------------------------------------------------- 1 | in_a,in_b,count,percent_of_total 2 | True,True,2,100 3 | -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_relations_without_exclude.csv: -------------------------------------------------------------------------------- 1 | in_a,in_b,count,percent_of_total 2 | True,True,1,33.33 3 | True,False,1,33.33 4 | False,True,1,33.33 5 | -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_row_counts.csv: -------------------------------------------------------------------------------- 1 | relation_name,total_records 2 | a,2 3 | b,2 
-------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_which_columns_differ.csv: -------------------------------------------------------------------------------- 1 | column_name,has_difference 2 | id,false 3 | value_changes,true 4 | becomes_null,true 5 | becomes_not_null,true 6 | does_not_change,false -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_which_columns_differ_exclude_cols.csv: -------------------------------------------------------------------------------- 1 | column_name,has_difference 2 | id,false 3 | value_changes,true 4 | becomes_not_null,true 5 | does_not_change,false -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_with_summary.csv: -------------------------------------------------------------------------------- 1 | in_a,in_b,count,percent_of_total 2 | True,True,1,33.33 3 | True,False,1,33.33 4 | False,True,1,33.33 -------------------------------------------------------------------------------- /integration_tests/seeds/expected_results__compare_without_summary.csv: -------------------------------------------------------------------------------- 1 | col_a,col_b,in_a,in_b 2 | 2,b,true,false 3 | 2,c,false,true -------------------------------------------------------------------------------- /integration_tests/tests/fixtures/complex_struct.sql: -------------------------------------------------------------------------------- 1 | {% set json %} 2 | '{"emails":["john.doe@example.com","john.d@example.com"],"phones":[{"number":"123-456-7890","type":"home"},{"number":"987-654-3210","type":"work"}]}' 3 | {% endset %} 4 | 5 | select 6 | 1 as id, 7 | 'John Doe' as col1, 8 | {{ audit_helper_integration_tests._complex_json_function(json) }} as col2 
-------------------------------------------------------------------------------- /integration_tests/tests/fixtures/complex_struct_different_order.sql: -------------------------------------------------------------------------------- 1 | {% set json %} 2 | '{"emails":["john.doe@example.com","john.d@example.com"],"phones":[{"number":"987-654-3210","type":"work"}, {"number":"123-456-7890","type":"home"}]}' 3 | {% endset %} 4 | 5 | select 6 | 1 as id, 7 | 'John Doe' as col1, 8 | {{ audit_helper_integration_tests._complex_json_function(json) }} as col2 -------------------------------------------------------------------------------- /integration_tests/tests/fixtures/complex_struct_different_value.sql: -------------------------------------------------------------------------------- 1 | {% set json %} 2 | '{"emails":["john.smith@example.com","john.s@example.com"],"phones":[{"number":"123-456-7890","type":"home"},{"number":"987-654-3210","type":"work"}]}' 3 | {% endset %} 4 | 5 | select 6 | 1 as id, 7 | 'John Doe' as col1, 8 | {{ audit_helper_integration_tests._complex_json_function(json) }} as col2 -------------------------------------------------------------------------------- /integration_tests/tests/fixtures/simple_struct.sql: -------------------------------------------------------------------------------- 1 | {% if target.name != 'redshift' %} 2 | 3 | select 4 | 1 as id, 5 | 'John Doe' as col1, 6 | {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2 7 | 8 | {% else %} 9 | 10 | select 11 | 1 AS id, 12 | 'John Doe' AS col1, 13 | json_parse('{"street": "123 Main St", "city": "Anytown", "state": "CA"}') AS col2 14 | {% endif %} -------------------------------------------------------------------------------- /integration_tests/tests/fixtures/simple_struct_different_order.sql: -------------------------------------------------------------------------------- 1 | {% if target.name != 'redshift' %} 2 | 3 
| select 4 | 1 as id, 5 | 'John Doe' as col1, 6 | {{ audit_helper_integration_tests._basic_json_function() -}}( 'state', 'CA', 'street', '123 Main St', 'city', 'Anytown') as col2 7 | 8 | {% else %} 9 | 10 | select 11 | 1 AS id, 12 | 'John Doe' AS col1, 13 | json_parse('{"state": "CA", "street": "123 Main St", "city": "Anytown"}') AS col2 14 | {% endif %} -------------------------------------------------------------------------------- /integration_tests/tests/fixtures/simple_struct_removed_key.sql: -------------------------------------------------------------------------------- 1 | {% if target.name != 'redshift' %} 2 | 3 | select 4 | 1 as id, 5 | 'John Doe' as col1, 6 | {{ audit_helper_integration_tests._basic_json_function() -}}('street', '123 Main St', 'state', 'CA') as col2 7 | 8 | {% else %} 9 | 10 | select 11 | 1 AS id, 12 | 'John Doe' AS col1, 13 | json_parse('{"street": "123 Main St", "state": "CA"}') AS col2 14 | {% endif %} -------------------------------------------------------------------------------- /macros/compare_all_columns.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_all_columns( a_relation, b_relation, primary_key, exclude_columns=[],summarize=true ) -%} 2 | {{ return(adapter.dispatch('compare_all_columns', 'audit_helper')( a_relation, b_relation, primary_key, exclude_columns, summarize )) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__compare_all_columns( a_relation, b_relation, primary_key, exclude_columns=[], summarize=true ) -%} 6 | 7 | {% set column_names = dbt_utils.get_filtered_columns_in_relation(from=a_relation, except=exclude_columns) %} 8 | 9 | {# We explictly select the primary_key and rename to support any sql as the primary_key - 10 | a column or concatenated columns. 
this assumes that a_relation and b_relation do not already 11 | have a field named dbt_audit_helper_pk #} 12 | 13 | {% set a_query %} 14 | select 15 | *, 16 | {{ primary_key }} as dbt_audit_helper_pk 17 | from {{ a_relation }} 18 | {% endset %} 19 | 20 | {% set b_query %} 21 | select 22 | *, 23 | {{ primary_key }} as dbt_audit_helper_pk 24 | from {{ b_relation }} 25 | {% endset %} 26 | 27 | {% for column_name in column_names %} 28 | 29 | {% set audit_query = audit_helper.compare_column_values_verbose( 30 | a_query=a_query, 31 | b_query=b_query, 32 | primary_key="dbt_audit_helper_pk", 33 | column_to_compare=column_name 34 | ) %} 35 | 36 | /* Create a query combining results from all columns so that the user, or the 37 | test suite, can examine all at once. 38 | */ 39 | 40 | {% if loop.first %} 41 | 42 | /* Create a CTE that wraps all the unioned subqueries that are created 43 | in this for loop 44 | */ 45 | with main as ( 46 | 47 | {% endif %} 48 | 49 | /* There will be one audit_query subquery for each column 50 | */ 51 | ( {{ audit_query }} ) 52 | 53 | {% if not loop.last %} 54 | 55 | union all 56 | 57 | {% else %} 58 | 59 | ), 60 | 61 | {%- if summarize %} 62 | 63 | final as ( 64 | select 65 | upper(column_name) as column_name, 66 | sum(case when perfect_match then 1 else 0 end) as perfect_match, 67 | sum(case when null_in_a then 1 else 0 end) as null_in_a, 68 | sum(case when null_in_b then 1 else 0 end) as null_in_b, 69 | sum(case when missing_from_a then 1 else 0 end) as missing_from_a, 70 | sum(case when missing_from_b then 1 else 0 end) as missing_from_b, 71 | sum(case when conflicting_values then 1 else 0 end) as conflicting_values 72 | from main 73 | group by 1 74 | order by column_name 75 | ) 76 | 77 | {%- else %} 78 | 79 | final as ( 80 | select 81 | primary_key, 82 | upper(column_name) as column_name, 83 | perfect_match, 84 | null_in_a, 85 | null_in_b, 86 | missing_from_a, 87 | missing_from_b, 88 | conflicting_values 89 | from main 90 | order by 
primary_key 91 | ) 92 | 93 | {%- endif %} 94 | 95 | select * from final 96 | 97 | {% endif %} 98 | 99 | {% endfor %} 100 | 101 | {% endmacro %} -------------------------------------------------------------------------------- /macros/compare_and_classify_query_results.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_and_classify_query_results(a_query, b_query, primary_key_columns=[], columns=[], event_time=None, sample_limit=20) %} 2 | 3 | {% set columns = audit_helper._ensure_all_pks_are_in_column_set(primary_key_columns, columns) %} 4 | {% set joined_cols = columns | join(", ") %} 5 | 6 | {% if event_time %} 7 | {% set event_time_props = audit_helper._get_comparison_bounds(a_query, b_query, event_time) %} 8 | {% endif %} 9 | 10 | with 11 | 12 | {{ audit_helper._generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props)}} 13 | 14 | , 15 | 16 | all_records as ( 17 | 18 | select 19 | *, 20 | true as dbt_audit_in_a, 21 | true as dbt_audit_in_b 22 | from a_intersect_b 23 | 24 | union all 25 | 26 | select 27 | *, 28 | true as dbt_audit_in_a, 29 | false as dbt_audit_in_b 30 | from a_except_b 31 | 32 | union all 33 | 34 | select 35 | *, 36 | false as dbt_audit_in_a, 37 | true as dbt_audit_in_b 38 | from b_except_a 39 | 40 | ), 41 | 42 | classified as ( 43 | select 44 | *, 45 | {{ audit_helper._classify_audit_row_status() }} as dbt_audit_row_status 46 | from all_records 47 | ), 48 | 49 | final as ( 50 | select 51 | *, 52 | {{ audit_helper._count_num_rows_in_status() }} as dbt_audit_num_rows_in_status, 53 | -- using dense_rank so that modified rows (which have a full row for both the left and right side) both get picked up in the sample. 
54 | -- For every other type this is equivalent to a row_number() 55 | dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num) as dbt_audit_sample_number 56 | from classified 57 | ) 58 | 59 | select * from final 60 | {% if sample_limit %} 61 | where dbt_audit_sample_number <= {{ sample_limit }} 62 | {% endif %} 63 | order by dbt_audit_row_status, dbt_audit_sample_number 64 | 65 | {% endmacro %} -------------------------------------------------------------------------------- /macros/compare_and_classify_relation_rows.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_and_classify_relation_rows(a_relation, b_relation, primary_key_columns=[], columns=None, event_time=None, sample_limit=20) %} 2 | {%- if not columns -%} 3 | {%- set columns = audit_helper._get_intersecting_columns_from_relations(a_relation, b_relation) -%} 4 | {%- endif -%} 5 | 6 | {{ 7 | audit_helper.compare_and_classify_query_results( 8 | "select * from " ~ a_relation, 9 | "select * from " ~ b_relation, 10 | primary_key_columns, 11 | columns, 12 | event_time, 13 | sample_limit 14 | ) 15 | }} 16 | {% endmacro %} -------------------------------------------------------------------------------- /macros/compare_column_values.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_column_values(a_query, b_query, primary_key, column_to_compare, emojis=True, a_relation_name='a', b_relation_name='b') -%} 2 | {{ return(adapter.dispatch('compare_column_values', 'audit_helper')(a_query, b_query, primary_key, column_to_compare, emojis, a_relation_name, b_relation_name)) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__compare_column_values(a_query, b_query, primary_key, column_to_compare, emojis, a_relation_name, b_relation_name) -%} 6 | with a_query as ( 7 | {{ a_query }} 8 | ), 9 | 10 | b_query as ( 11 | {{ b_query }} 12 | ), 13 | 14 | joined as ( 15 | 
select 16 | coalesce(a_query.{{ primary_key }}, b_query.{{ primary_key }}) as {{ primary_key }}, 17 | a_query.{{ column_to_compare }} as a_query_value, 18 | b_query.{{ column_to_compare }} as b_query_value, 19 | case 20 | when a_query.{{ column_to_compare }} = b_query.{{ column_to_compare }} then '{% if emojis %}✅: {% endif %}perfect match' 21 | when a_query.{{ column_to_compare }} is null and b_query.{{ column_to_compare }} is null then '{% if emojis %}✅: {% endif %}both are null' 22 | when a_query.{{ primary_key }} is null then '{% if emojis %}🤷: {% endif %}missing from {{ a_relation_name }}' 23 | when b_query.{{ primary_key }} is null then '{% if emojis %}🤷: {% endif %}missing from {{ b_relation_name }}' 24 | when a_query.{{ column_to_compare }} is null then '{% if emojis %}🤷: {% endif %}value is null in {{ a_relation_name }} only' 25 | when b_query.{{ column_to_compare }} is null then '{% if emojis %}🤷: {% endif %}value is null in {{ b_relation_name }} only' 26 | when a_query.{{ column_to_compare }} != b_query.{{ column_to_compare }} then '{% if emojis %}❌: {% endif %}‍values do not match' 27 | else 'unknown' -- this should never happen 28 | end as match_status, 29 | case 30 | when a_query.{{ column_to_compare }} = b_query.{{ column_to_compare }} then 0 31 | when a_query.{{ column_to_compare }} is null and b_query.{{ column_to_compare }} is null then 1 32 | when a_query.{{ primary_key }} is null then 2 33 | when b_query.{{ primary_key }} is null then 3 34 | when a_query.{{ column_to_compare }} is null then 4 35 | when b_query.{{ column_to_compare }} is null then 5 36 | when a_query.{{ column_to_compare }} != b_query.{{ column_to_compare }} then 6 37 | else 7 -- this should never happen 38 | end as match_order 39 | 40 | from a_query 41 | 42 | full outer join b_query on a_query.{{ primary_key }} = b_query.{{ primary_key }} 43 | ), 44 | 45 | aggregated as ( 46 | select 47 | '{{ column_to_compare }}' as column_name, 48 | match_status, 49 | match_order, 50 | 
count(*) as count_records 51 | from joined 52 | 53 | group by column_name, match_status, match_order 54 | ) 55 | 56 | select 57 | column_name, 58 | match_status, 59 | count_records, 60 | round(100.0 * count_records / sum(count_records) over (), 2) as percent_of_total 61 | 62 | from aggregated 63 | 64 | order by match_order 65 | 66 | {% endmacro %} 67 | -------------------------------------------------------------------------------- /macros/compare_column_values_verbose.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_column_values_verbose(a_query, b_query, primary_key, column_to_compare) -%} 2 | {{ return(adapter.dispatch('compare_column_values_verbose', 'audit_helper')(a_query, b_query, primary_key, column_to_compare)) }} 3 | {%- endmacro %} 4 | {# default__compare_column_values_verbose: full outer joins a_query to b_query on primary_key and emits one row per key for column_to_compare, with the boolean flags perfect_match, null_in_a, null_in_b, missing_from_a, missing_from_b and conflicting_values. #} 5 | 6 | {% macro default__compare_column_values_verbose(a_query, b_query, primary_key, column_to_compare) -%} 7 | with a_query as ( 8 | {{ a_query }} 9 | ), 10 | 11 | b_query as ( 12 | {{ b_query }} 13 | ) 14 | select 15 | coalesce(a_query.{{ primary_key }}, b_query.{{ primary_key }}) as primary_key, 16 | {# Branch on the adapter (target.type) rather than only on the label the user gave their target (target.name); the name check is kept so targets that relied on it render exactly as before. #} 17 | {% if target.type in ['postgres', 'redshift'] or target.name in ['postgres', 'redshift'] %} 18 | '{{ column_to_compare }}'::text as column_name, 19 | {% else %} 20 | '{{ column_to_compare }}' as column_name, 21 | {% endif %} 22 | 23 | coalesce( 24 | a_query.{{ column_to_compare }} = b_query.{{ column_to_compare }} and 25 | a_query.{{ primary_key }} is not null and b_query.{{ primary_key }} is not null, 26 | (a_query.{{ column_to_compare }} is null and b_query.{{ column_to_compare }} is null), 27 | false 28 | ) as perfect_match, 29 | a_query.{{ column_to_compare }} is null and a_query.{{ primary_key }} is not null as null_in_a, 30 | b_query.{{ column_to_compare }} is null and b_query.{{ primary_key }} is not null as null_in_b, 31 | a_query.{{ primary_key }} is null as missing_from_a, 32 | b_query.{{ primary_key }} is null as missing_from_b, 33 | coalesce( 34 | a_query.{{ primary_key }} is
not null and b_query.{{ primary_key }} is not null and 35 | -- ensure that neither value is missing before considering it a conflict 36 | ( 37 | a_query.{{ column_to_compare }} != b_query.{{ column_to_compare }} or -- two not-null values that do not match 38 | (a_query.{{ column_to_compare }} is not null and b_query.{{ column_to_compare }} is null) or -- null in b and not null in a 39 | (a_query.{{ column_to_compare }} is null and b_query.{{ column_to_compare }} is not null) -- null in a and not null in b 40 | ), 41 | false 42 | ) as conflicting_values 43 | -- considered a conflict if the values do not match AND at least one of the values is not null. 44 | 45 | from a_query 46 | 47 | full outer join b_query on (a_query.{{ primary_key }} = b_query.{{ primary_key }}) 48 | 49 | 50 | 51 | {% endmacro %} 52 | -------------------------------------------------------------------------------- /macros/compare_queries.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_queries(a_query, b_query, primary_key=None, summarize=true, limit=None) -%} 2 | {{ return(adapter.dispatch('compare_queries', 'audit_helper')(a_query, b_query, primary_key, summarize, limit)) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__compare_queries(a_query, b_query, primary_key=None, summarize=true, limit=None) %} 6 | 7 | with a as ( 8 | 9 | {{ a_query }} 10 | 11 | ), 12 | 13 | b as ( 14 | 15 | {{ b_query }} 16 | 17 | ), 18 | 19 | a_intersect_b as ( 20 | 21 | select * from a 22 | {{ dbt.intersect() }} 23 | select * from b 24 | 25 | ), 26 | 27 | a_except_b as ( 28 | 29 | select * from a 30 | {{ dbt.except() }} 31 | select * from b 32 | 33 | ), 34 | 35 | b_except_a as ( 36 | 37 | select * from b 38 | {{ dbt.except() }} 39 | select * from a 40 | 41 | ), 42 | 43 | all_records as ( 44 | 45 | select 46 | *, 47 | true as in_a, 48 | true as in_b 49 | from a_intersect_b 50 | 51 | union all 52 | 53 | select 54 | *, 55 | true as in_a, 56 | false as in_b 57 |
from a_except_b 58 | 59 | union all 60 | 61 | select 62 | *, 63 | false as in_a, 64 | true as in_b 65 | from b_except_a 66 | 67 | ), 68 | 69 | {%- if summarize %} 70 | 71 | summary_stats as ( 72 | 73 | select 74 | 75 | in_a, 76 | in_b, 77 | count(*) as count 78 | 79 | from all_records 80 | group by 1, 2 81 | 82 | ), 83 | 84 | final as ( 85 | 86 | select 87 | 88 | *, 89 | round(100.0 * count / sum(count) over (), 2) as percent_of_total 90 | 91 | from summary_stats 92 | order by in_a desc, in_b desc 93 | 94 | ) 95 | 96 | {%- else %} 97 | 98 | final as ( 99 | 100 | select * from all_records 101 | where not (in_a and in_b) 102 | order by {{ primary_key ~ ", " if primary_key is not none }} in_a desc, in_b desc 103 | 104 | ) 105 | 106 | {%- endif %} 107 | 108 | select * from final 109 | {%- if limit and not summarize %} 110 | limit {{ limit }} 111 | {%- endif %} 112 | 113 | 114 | {% endmacro %} 115 | -------------------------------------------------------------------------------- /macros/compare_relation_columns.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_relation_columns(a_relation, b_relation) %} 2 | {{ return(adapter.dispatch('compare_relation_columns', 'audit_helper')(a_relation, b_relation)) }} 3 | {% endmacro %} 4 | 5 | {% macro default__compare_relation_columns(a_relation, b_relation) %} 6 | 7 | with a_cols as ( 8 | {{ audit_helper.get_columns_in_relation_sql(a_relation) }} 9 | ), 10 | 11 | b_cols as ( 12 | {{ audit_helper.get_columns_in_relation_sql(b_relation) }} 13 | ) 14 | 15 | select 16 | column_name, 17 | a_cols.ordinal_position as a_ordinal_position, 18 | b_cols.ordinal_position as b_ordinal_position, 19 | a_cols.data_type as a_data_type, 20 | b_cols.data_type as b_data_type, 21 | coalesce(a_cols.ordinal_position = b_cols.ordinal_position, false) as has_ordinal_position_match, 22 | coalesce(a_cols.data_type = b_cols.data_type, false) as has_data_type_match, 23 | a_cols.data_type is not null and 
b_cols.data_type is null as in_a_only, 24 | b_cols.data_type is not null and a_cols.data_type is null as in_b_only, 25 | b_cols.data_type is not null and a_cols.data_type is not null as in_both 26 | from a_cols 27 | full outer join b_cols using (column_name) 28 | order by coalesce(a_cols.ordinal_position, b_cols.ordinal_position) 29 | 30 | {% endmacro %} 31 | 32 | 33 | {% macro get_columns_in_relation_sql(relation) %} 34 | 35 | {{ adapter.dispatch('get_columns_in_relation_sql', 'audit_helper')(relation) }} 36 | 37 | {% endmacro %} 38 | 39 | {% macro default__get_columns_in_relation_sql(relation) %} 40 | 41 | {% set columns = adapter.get_columns_in_relation(relation) %} 42 | {% for column in columns %} 43 | select 44 | {{ dbt.string_literal(column.name) }} as column_name, 45 | {{ loop.index }} as ordinal_position, 46 | {{ dbt.string_literal(column.data_type) }} as data_type 47 | 48 | {% if not loop.last -%} 49 | union all 50 | {%- endif %} 51 | {% endfor %} 52 | 53 | 54 | {% endmacro %} 55 | 56 | {% macro redshift__get_columns_in_relation_sql(relation) %} 57 | {# You can't store the results of an info schema query to a table/view in Redshift, because the data only lives on the leader node #} 58 | {{ return (audit_helper.default__get_columns_in_relation_sql(relation)) }} 59 | {% endmacro %} 60 | 61 | 62 | {% macro snowflake__get_columns_in_relation_sql(relation) %} 63 | {#- 64 | From: https://github.com/dbt-labs/dbt/blob/dev/louisa-may-alcott/plugins/snowflake/dbt/include/snowflake/macros/adapters.sql#L48 65 | Edited to include ordinal_position 66 | -#} 67 | select 68 | ordinal_position, 69 | column_name, 70 | data_type, 71 | character_maximum_length, 72 | numeric_precision, 73 | numeric_scale 74 | 75 | from 76 | {{ relation.information_schema('columns') }} 77 | 78 | where table_name ilike '{{ relation.identifier }}' 79 | {% if relation.schema %} 80 | and table_schema ilike '{{ relation.schema }}' 81 | {% endif %} 82 | {% if relation.database %} 83 | and 
table_catalog ilike '{{ relation.database }}' 84 | {% endif %} 85 | order by ordinal_position 86 | {% endmacro %} 87 | 88 | 89 | {% macro postgres__get_columns_in_relation_sql(relation) %} 90 | {#- 91 | From: https://github.com/dbt-labs/dbt/blob/23484b18b71010f701b5312f920f04529ceaa6b2/plugins/postgres/dbt/include/postgres/macros/adapters.sql#L32 92 | Edited to include ordinal_position 93 | -#} 94 | select 95 | ordinal_position, 96 | column_name, 97 | data_type, 98 | character_maximum_length, 99 | numeric_precision, 100 | numeric_scale 101 | 102 | from {{ relation.information_schema('columns') }} 103 | where table_name = '{{ relation.identifier }}' 104 | {% if relation.schema %} 105 | and table_schema = '{{ relation.schema }}' 106 | {% endif %} 107 | order by ordinal_position 108 | {% endmacro %} 109 | 110 | 111 | {% macro bigquery__get_columns_in_relation_sql(relation) %} 112 | 113 | select 114 | ordinal_position, 115 | column_name, 116 | data_type 117 | 118 | from `{{ relation.database }}`.`{{ relation.schema }}`.INFORMATION_SCHEMA.COLUMNS 119 | where table_name = '{{ relation.identifier }}' 120 | 121 | {% endmacro %} 122 | -------------------------------------------------------------------------------- /macros/compare_relations.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_relations(a_relation, b_relation, exclude_columns=[], primary_key=None, summarize=true, limit=None) %} 2 | 3 | {% set column_names = dbt_utils.get_filtered_columns_in_relation(from=a_relation, except=exclude_columns) %} 4 | 5 | {% set column_selection %} 6 | 7 | {% for column_name in column_names %} 8 | {{ adapter.quote(column_name) }} 9 | {% if not loop.last %} 10 | , 11 | {% endif %} 12 | {% endfor %} 13 | 14 | {% endset %} 15 | 16 | {% set a_query %} 17 | select 18 | 19 | {{ column_selection }} 20 | 21 | from {{ a_relation }} 22 | {% endset %} 23 | 24 | {% set b_query %} 25 | select 26 | 27 | {{ column_selection }} 28 | 29 | from 
{{ b_relation }} 30 | {% endset %} 31 | 32 | {{ audit_helper.compare_queries(a_query, b_query, primary_key, summarize, limit) }} 33 | 34 | {% endmacro %} 35 | -------------------------------------------------------------------------------- /macros/compare_row_counts.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_row_counts(a_relation, b_relation) %} 2 | {{ return(adapter.dispatch('compare_row_counts', 'audit_helper')(a_relation, b_relation)) }} 3 | {% endmacro %} 4 | 5 | {% macro default__compare_row_counts(a_relation, b_relation) %} 6 | 7 | select 8 | '{{ a_relation }}' as relation_name, 9 | count(*) as total_records 10 | from {{ a_relation }} 11 | 12 | union all 13 | 14 | select 15 | '{{ b_relation }}' as relation_name, 16 | count(*) as total_records 17 | from {{ b_relation }} 18 | 19 | {% endmacro %} -------------------------------------------------------------------------------- /macros/compare_which_query_columns_differ.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_which_query_columns_differ(a_query, b_query, primary_key_columns=[], columns=[], event_time=None) %} 2 | {{ return(adapter.dispatch('compare_which_query_columns_differ', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time)) }} 3 | {% endmacro %} 4 | {# default__compare_which_query_columns_differ: inner joins a_query to b_query on a null-safe surrogate key built from primary_key_columns and returns one row per column (column_name, has_difference), where has_difference is the bool_or of a null-aware inequality over all joined rows. #} 5 | {% macro default__compare_which_query_columns_differ(a_query, b_query, primary_key_columns, columns, event_time) %} 6 | {% set columns = audit_helper._ensure_all_pks_are_in_column_set(primary_key_columns, columns) %} 7 | {% if event_time %} 8 | {% set event_time_props = audit_helper._get_comparison_bounds(a_query, b_query, event_time) %} 9 | {% endif %} 10 | {# The bounds helper is called with both queries plus event_time, matching every other call site in this package; when event_time is not supplied, event_time_props is left unset and passed through to event_time_filter as-is. #} 11 | {% set joined_cols = columns | join (", ") %} 12 | 13 | with a as ( 14 | select 15 | {{ joined_cols }}, 16 | {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key 17 | from ({{ a_query }}) as a_subq 18 | {{
audit_helper.event_time_filter(event_time_props) }} 19 | ), 20 | b as ( 21 | select 22 | {{ joined_cols }}, 23 | {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key 24 | from ({{ b_query }}) as b_subq 25 | {{ audit_helper.event_time_filter(event_time_props) }} 26 | ), 27 | 28 | calculated as ( 29 | select 30 | {% for column in columns %} 31 | {% set quoted_column = adapter.quote(column) %} 32 | {% set compare_statement %} 33 | ( 34 | (a.{{ quoted_column }} != b.{{ quoted_column }}) 35 | or (a.{{ quoted_column }} is null and b.{{ quoted_column }} is not null) 36 | or (a.{{ quoted_column }} is not null and b.{{ quoted_column }} is null) 37 | ) 38 | {% endset %} 39 | 40 | {{ dbt.bool_or(compare_statement) }} as {{ column | lower }}_has_difference 41 | 42 | {%- if not loop.last %}, {% endif %} 43 | {% endfor %} 44 | from a 45 | inner join b on a.dbt_audit_surrogate_key = b.dbt_audit_surrogate_key 46 | ) 47 | {# Unpivot: one select per column, reading that column's flag from the single-row calculated CTE. #} 48 | {% for column in columns %} 49 | 50 | select 51 | '{{ column }}' as column_name, 52 | {{ column | lower }}_has_difference as has_difference 53 | 54 | from calculated 55 | 56 | {% if not loop.last %} 57 | 58 | union all 59 | 60 | {% endif %} 61 | 62 | {% endfor %} 63 | 64 | {% endmacro %} 65 | -------------------------------------------------------------------------------- /macros/compare_which_relation_columns_differ.sql: -------------------------------------------------------------------------------- 1 | {% macro compare_which_relation_columns_differ(a_relation, b_relation, primary_key_columns=[], columns=[], event_time=None) %} 2 | {%- if not columns -%} 3 | {%- set columns = audit_helper._get_intersecting_columns_from_relations(a_relation, b_relation) -%} 4 | {%- endif -%} 5 | 6 | {{ 7 | audit_helper.compare_which_query_columns_differ( 8 | "select * from " ~ a_relation, 9 | "select * from " ~ b_relation, 10 | primary_key_columns, 11 | columns, 12 | event_time 13 | ) 14 | }} 15 | {% endmacro %}
-------------------------------------------------------------------------------- /macros/quick_are_queries_identical.sql: -------------------------------------------------------------------------------- 1 | /* 2 | As described by the Infinite Lambda team here: https://infinitelambda.com/data-validation-refactoring-snowflake/ 3 | 4 | Some platforms let you take a hash of the whole table, which can be very very fast compared to comparing each row. 5 | 6 | If you run this and it returns false, you still have to run the more in-depth queries to find out what specific changes there are, 7 | but it's a good way to quickly verify identical results if that's what you're expecting. 8 | */ 9 | 10 | {% macro quick_are_queries_identical(query_a, query_b, columns=[], event_time=None) %} 11 | {{ return (adapter.dispatch('quick_are_queries_identical', 'audit_helper')(query_a, query_b, columns, event_time)) }} 12 | {% endmacro %} 13 | 14 | {% macro default__quick_are_queries_identical(query_a, query_b, columns, event_time) %} 15 | {% if execute %} 16 | {# Need to only throw this error when the macro is actually trying to be used, not during initial parse phase #} 17 | {# if/when unit tests get support for `enabled` config, this check can be removed as they won't be supplied for parse anyway #} 18 | {% do exceptions.raise_compiler_error("quick_are_queries_identical() is not implemented for adapter '"~ target.type ~ "'" ) %} 19 | {% endif %} 20 | {% endmacro %} 21 | {# BigQuery: XOR-aggregates farm_fingerprint(to_json_string(row)) over each query and reports whether both aggregates are equal. The comparison bounds are computed from this macro's own parameters, query_a and query_b. #} 22 | {% macro bigquery__quick_are_queries_identical(query_a, query_b, columns, event_time) %} 23 | {% set joined_cols = columns | join(", ") %} 24 | {% if event_time %} 25 | {% set event_time_props = audit_helper._get_comparison_bounds(query_a, query_b, event_time) %} 26 | {% endif %} 27 | 28 | with query_a as ( 29 | select {{ joined_cols }} 30 | from ({{ query_a }}) 31 | {{ audit_helper.event_time_filter(event_time_props) }} 32 | ), 33 | query_b as ( 34 | select {{ joined_cols }} 35 | from ({{ query_b }}) 36 | {{
audit_helper.event_time_filter(event_time_props) }} 37 | ) 38 | 39 | select count(distinct hash_result) = 1 as are_tables_identical 40 | from ( 41 | select bit_xor(farm_fingerprint(to_json_string(query_a))) as hash_result 42 | from query_a 43 | 44 | union all 45 | 46 | select bit_xor(farm_fingerprint(to_json_string(query_b))) as hash_result 47 | from query_b 48 | ) as hashes 49 | {% endmacro %} 50 | {# Snowflake: hash_agg over the selected columns of each query; the two hashes are compared the same way. The comparison bounds are computed from this macro's own parameters, query_a and query_b. #} 51 | {% macro snowflake__quick_are_queries_identical(query_a, query_b, columns, event_time) %} 52 | {% set joined_cols = columns | join(", ") %} 53 | {% if event_time %} 54 | {% set event_time_props = audit_helper._get_comparison_bounds(query_a, query_b, event_time) %} 55 | {% endif %} 56 | 57 | select count(distinct hash_result) = 1 as are_tables_identical 58 | from ( 59 | select hash_agg({{ joined_cols }}) as hash_result 60 | from ({{ query_a }}) query_a_subq 61 | {{ audit_helper.event_time_filter(event_time_props) }} 62 | 63 | union all 64 | 65 | select hash_agg({{ joined_cols }}) as hash_result 66 | from ({{ query_b }}) query_b_subq 67 | {{ audit_helper.event_time_filter(event_time_props) }} 68 | 69 | ) as hashes 70 | {% endmacro %} -------------------------------------------------------------------------------- /macros/quick_are_relations_identical.sql: -------------------------------------------------------------------------------- 1 | {% macro quick_are_relations_identical(a_relation, b_relation, columns=None, event_time=None) %} 2 | {% if not columns %} 3 | {% set columns = audit_helper._get_intersecting_columns_from_relations(a_relation, b_relation) %} 4 | {% endif %} 5 | 6 | {{ 7 | audit_helper.quick_are_queries_identical( 8 | "select * from " ~ a_relation, 9 | "select * from " ~ b_relation, 10 | columns, 11 | event_time 12 | ) 13 | }} 14 | {% endmacro %} -------------------------------------------------------------------------------- /macros/utils/_classify_audit_row_status.sql: --------------------------------------------------------------------------------
1 | {% macro _classify_audit_row_status() %} 2 | {{ return(adapter.dispatch('_classify_audit_row_status', 'audit_helper')()) }} 3 | {% endmacro %} 4 | {# _classify_audit_row_status dispatches to an adapter-specific implementation. Each one renders a CASE expression that labels a row, checked in this order: 'nonunique_pk' (the highest dbt_audit_pk_row_num for the row's surrogate key is above 1), 'identical' (flagged in both a and b), 'modified' (within the same surrogate key + row number, some row is flagged in a and some row is flagged in b), 'removed' (in a), 'added' (in b). #} 5 | {%- macro default___classify_audit_row_status() -%} 6 | case 7 | when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' 8 | when dbt_audit_in_a and dbt_audit_in_b then 'identical' 9 | when {{ dbt.bool_or('dbt_audit_in_a') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) 10 | and {{ dbt.bool_or('dbt_audit_in_b') }} over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) 11 | then 'modified' 12 | when dbt_audit_in_a then 'removed' 13 | when dbt_audit_in_b then 'added' 14 | end 15 | {% endmacro %} 16 | {# Redshift variant: same branches in the same order, with bool_or replaced by max(case when ... then 1 else 0 end) = 1 over the same partition. #} 17 | 18 | {%- macro redshift___classify_audit_row_status() -%} 19 | {#- Redshift doesn't support bitwise operations (e.g. bool_or) inside of a window function :( -#} 20 | case 21 | when max(dbt_audit_pk_row_num) over (partition by dbt_audit_surrogate_key) > 1 then 'nonunique_pk' 22 | when dbt_audit_in_a and dbt_audit_in_b then 'identical' 23 | when max(case when dbt_audit_in_a then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 24 | and max(case when dbt_audit_in_b then 1 else 0 end) over (partition by dbt_audit_surrogate_key, dbt_audit_pk_row_num) = 1 25 | then 'modified' 26 | when dbt_audit_in_a then 'removed' 27 | when dbt_audit_in_b then 'added' 28 | end{% endmacro %} -------------------------------------------------------------------------------- /macros/utils/_count_num_rows_in_status.sql: -------------------------------------------------------------------------------- 1 | {% macro _count_num_rows_in_status() %} 2 | {{ return(adapter.dispatch('_count_num_rows_in_status', 'audit_helper')()) }} 3 | {% endmacro %} 4 | {# _count_num_rows_in_status dispatches to an adapter-specific window expression. The default counts distinct (dbt_audit_surrogate_key, dbt_audit_pk_row_num) pairs within each dbt_audit_row_status partition. #} 5 | {%- macro default___count_num_rows_in_status() -%} 6 | count(distinct dbt_audit_surrogate_key, dbt_audit_pk_row_num) over (partition by dbt_audit_row_status) 7 | {% endmacro %} 8 | 9 |
{# BigQuery counter: counts distinct values of the concatenated surrogate key and row number #}
{# within each dbt_audit_row_status partition. #}
{# NOTE(review): presumably concatenated because a multi-column count(distinct ...) is not accepted there - confirm. #}
{%- macro bigquery___count_num_rows_in_status() -%}
    count(distinct {{ dbt.concat(["dbt_audit_surrogate_key", "dbt_audit_pk_row_num"]) }}) over (partition by dbt_audit_row_status)
{% endmacro %}

{# Postgres counter: delegates to the dense_rank-based helper below. #}
{%- macro postgres___count_num_rows_in_status() -%}
    {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }}
{% endmacro %}

{# Databricks counter: delegates to the dense_rank-based helper below. #}
{%- macro databricks___count_num_rows_in_status() -%}
    {{ audit_helper._count_num_rows_in_status_without_distinct_window_func() }}
{% endmacro %}

{# Shared fallback: ascending dense_rank plus descending dense_rank minus 1, computed over #}
{# (surrogate key, row number) within each dbt_audit_row_status partition. #}
{% macro _count_num_rows_in_status_without_distinct_window_func() %}
    {#- Some platforms don't support count(distinct) inside of window functions -#}
    {#- You can get the same outcome by dense_rank, assuming no nulls (we've already handled that) #}
    {# https://stackoverflow.com/a/22347502 -#}
    dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key, dbt_audit_pk_row_num)
    + dense_rank() over (partition by dbt_audit_row_status order by dbt_audit_surrogate_key desc, dbt_audit_pk_row_num desc)
    - 1
{% endmacro %}
--------------------------------------------------------------------------------
/macros/utils/_ensure_all_pks_are_in_column_set.sql:
--------------------------------------------------------------------------------
{# If someone forgot to include the PK columns in their main set of columns, fix it up for them #}
{# Assuming that the PKs are the most important columns, so they go to the front of the list #}

{#
    Args:
        primary_key_columns: list of primary key column names.
        columns: list of column names to compare.
    Returns:
        `columns` unchanged when every primary key is already present (compared case-insensitively);
        otherwise the missing primary keys, in their original order and casing, followed by `columns`.
#}
{% macro _ensure_all_pks_are_in_column_set(primary_key_columns, columns) %}
    {# Lower-cased copy used only for the membership test; the returned names keep their casing #}
    {% set lower_cols = columns | map('lower') | list %}
    {% set missing_pks = [] %}

    {% for pk in primary_key_columns %}
        {% if pk | lower not in lower_cols %}
            {% do missing_pks.append(pk) %}
        {% endif %}
    {% endfor %}

    {% if missing_pks | length > 0 %}
        {% set columns = missing_pks + columns %}
    {% endif %}

    {% do return (columns) %}
{% endmacro %}
--------------------------------------------------------------------------------
/macros/utils/_generate_null_safe_sk.sql:
--------------------------------------------------------------------------------
{# Taken from https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/generate_surrogate_key.sql but without the option to treat nulls as empty strings #}

{# Dispatching entry point: returns the adapter-specific surrogate key expression for field_list. #}
{%- macro _generate_null_safe_surrogate_key(field_list) -%}
    {{ return(adapter.dispatch('_generate_null_safe_surrogate_key', 'audit_helper')(field_list)) }}
{% endmacro %}

{#
    Default implementation. Each field is cast to the adapter's string type and null values are
    replaced by the sentinel '_dbt_audit_helper_surrogate_key_null_'; the pieces are separated
    by '-' literals, concatenated with dbt.concat and wrapped in dbt.hash.
#}
{%- macro default___generate_null_safe_surrogate_key(field_list) -%}

{%- set fields = [] -%}

{%- for field in field_list -%}

    {%- do fields.append(
        "coalesce(cast(" ~ field ~ " as " ~ dbt.type_string() ~ "), '_dbt_audit_helper_surrogate_key_null_')"
    ) -%}

    {%- if not loop.last %}
        {%- do fields.append("'-'") -%}
    {%- endif -%}

{%- endfor -%}

{{ dbt.hash(dbt.concat(fields)) }}

{%- endmacro -%}
--------------------------------------------------------------------------------
/macros/utils/_generate_set_results.sql:
--------------------------------------------------------------------------------
{#-
  Set generation is dispatched because it's possible to get performance optimisations
  on some platforms, while keeping the post-processing standardised
  See https://infinitelambda.com/data-validation-refactoring-snowflake/ for an example and background
-#}

{#
    Dispatching entry point. Every implementation below renders a comma-separated list of CTE
    definitions (no leading `with`) that ends with a_intersect_b, a_except_b and b_except_a,
    built on CTEs named a and b that carry dbt_audit_surrogate_key and dbt_audit_pk_row_num.
    NOTE(review): the caller presumably supplies the `with` keyword and the final select - confirm.
#}
{% macro _generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props=None) %}
    {{ return(adapter.dispatch('_generate_set_results', 'audit_helper')(a_query, b_query, primary_key_columns, columns, event_time_props)) }}
{% endmacro %}

{# Default implementation: uses the adapter's intersect / except operators (dbt.intersect, dbt.except) on full rows. #}
{% macro default___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %}
    {% set joined_cols = columns | join(", ") %}

    a_base as (
        select
            {{ joined_cols }},
            {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key
        from ( {{- a_query -}} ) a_base_subq
        {{ audit_helper.event_time_filter(event_time_props) }}
    ),

    b_base as (
        select
            {{ joined_cols }},
            {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key
        from ( {{- b_query -}} ) b_base_subq
        {{ audit_helper.event_time_filter(event_time_props) }}
    ),

    a as (
        select
            *,
            row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num
        from a_base
    ),

    b as (
        select
            *,
            row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key) as dbt_audit_pk_row_num
        from b_base
    ),

    a_intersect_b as (

        select * from a
        {{ dbt.intersect() }}
        select * from b

    ),

    a_except_b as (

        select * from a
        {{ dbt.except() }}
        select * from b

    ),

    b_except_a as (

        select * from b
        {{ dbt.except() }}
        select * from a

    )
{% endmacro %}

{# BigQuery implementation: fingerprints each row (farm_fingerprint over to_json_string of the row) #}
{# into dbt_audit_row_hash and emulates intersect / except with in / not in on that hash. #}
{% macro bigquery___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %}
    {% set joined_cols = columns | join(", ") %}
    {% set surrogate_key = audit_helper._generate_null_safe_surrogate_key(primary_key_columns) %}
    subset_columns_a as (
        select
            {{ joined_cols }},
            {{ surrogate_key }} as dbt_audit_surrogate_key,
            row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num
        from ( {{- a_query -}} )
        {{ audit_helper.event_time_filter(event_time_props) }}
    ),

    subset_columns_b as (
        select
            {{ joined_cols }},
            {{ surrogate_key }} as dbt_audit_surrogate_key,
            row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num
        from ( {{- b_query -}} )
        {{ audit_helper.event_time_filter(event_time_props) }}
    ),

    a as (
        select
            *,
            farm_fingerprint(to_json_string(subset_columns_a)) as dbt_audit_row_hash
        from subset_columns_a
    ),

    b as (
        select
            *,
            farm_fingerprint(to_json_string(subset_columns_b)) as dbt_audit_row_hash
        from subset_columns_b
    ),

    a_intersect_b as (

        select * from a
        where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b)

    ),

    a_except_b as (

        select * from a
        where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b)

    ),

    b_except_a as (

        select * from b
        where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a)

    )
{% endmacro %}

{# Databricks implementation: every compared column is cast to string, then xxhash64 over the cast #}
{# columns plus dbt_audit_pk_row_num gives dbt_audit_row_hash; set operations use in / not in on that hash. #}
{% macro databricks___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %}
    {% set cast_columns = [] %}
    {# Map types can't be compared by default (you need to opt in to a legacy behaviour flag) #}
    {# so everything needs to be cast as a string first :( #}
    {% for col in columns %}
        {% do cast_columns.append(dbt.cast(col, api.Column.translate_type("string"))) %}
    {% endfor %}
    {% set joined_cols = cast_columns | join(", ") %}
    {% set surrogate_key = audit_helper._generate_null_safe_surrogate_key(primary_key_columns) %}
    a as (
        select
            {{ joined_cols }},
            {{ surrogate_key }} as dbt_audit_surrogate_key,
            row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num,
            xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash
        from ( {{- a_query -}} )
        {{ audit_helper.event_time_filter(event_time_props) }}
    ),

    b as (
        select
            {{ joined_cols }},
            {{ surrogate_key }} as dbt_audit_surrogate_key,
            row_number() over (partition by {{ surrogate_key }} order by 1 ) as dbt_audit_pk_row_num,
            xxhash64({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash
        from ( {{- b_query -}} )
        {{ audit_helper.event_time_filter(event_time_props) }}
    ),

    a_intersect_b as (

        select * from a
        where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b)

    ),

    a_except_b as (

        select * from a
        where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b)

    ),

    b_except_a as (

        select * from b
        where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a)

    )
{% endmacro %}

{# Snowflake implementation: hash() over the compared columns plus dbt_audit_pk_row_num gives #}
{# dbt_audit_row_hash; set operations use in / not in on that hash. The row_number and hash #}
{# expressions refer to aliases defined earlier in the same select list. #}
{% macro snowflake___generate_set_results(a_query, b_query, primary_key_columns, columns, event_time_props) %}
    {% set joined_cols = columns | join(", ") %}
    a as (
        select
            {{ joined_cols }},
            {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key,
            row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num,
            hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash
        from ( {{- a_query -}} )
        {{ audit_helper.event_time_filter(event_time_props) }}
    ),

    b as (
        select
            {{ joined_cols }},
            {{ audit_helper._generate_null_safe_surrogate_key(primary_key_columns) }} as dbt_audit_surrogate_key,
            row_number() over (partition by dbt_audit_surrogate_key order by dbt_audit_surrogate_key ) as dbt_audit_pk_row_num,
            hash({{ joined_cols }}, dbt_audit_pk_row_num) as dbt_audit_row_hash
        from ( {{- b_query -}} )
        {{ audit_helper.event_time_filter(event_time_props) }}
    ),

    a_intersect_b as (

        select * from a
        where a.dbt_audit_row_hash in (select b.dbt_audit_row_hash from b)

    ),

    a_except_b as (

        select * from a
        where a.dbt_audit_row_hash not in (select b.dbt_audit_row_hash from b)

    ),

    b_except_a as (

        select * from b
        where b.dbt_audit_row_hash not in (select a.dbt_audit_row_hash from a)

    )
{% endmacro %}
--------------------------------------------------------------------------------
/macros/utils/_get_comparison_bounds.sql:
--------------------------------------------------------------------------------
/*
The idea here is that if the event_time is set, we will only compare records enclosed in both models.
This improves performance and allows us to compare apples to apples, instead of detecting millions/billions
of "deletions" identified due to prod having all data while CI only has a few days' worth.

In the diagram below, the thatched section is the comparison bounds. You can think of it as

    greatest(model_a.min_value, model_b.min_value)
    least(model_a.max_value, model_b.max_value)

                      ┌────────────────────────────┐
    a min_value       │               a max_value  │
         └──► ┌───────┼────────────────────┐ ◄───┘ │
              │       │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│       │
    model_a   │       │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│       │ model_b
              │       │┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼┼│       │
              └───────┼────────────────────┘       │
                 ┌──► └────────────────────────────┘ ◄────┐
            b min_value                              b max_value
*/
{#
    Runs one query against both inputs and returns a dict with three keys:
      event_time     - the column name that was passed in
      min_event_time - the larger of the two queries' minimum event_time values
      max_event_time - the smaller of the two queries' maximum event_time values
    The query is executed through dbt_utils.get_query_results_as_dict when this macro is called.
#}
{% macro _get_comparison_bounds(a_query, b_query, event_time) %}
    {% set min_max_queries %}
        with min_maxes as (
            select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time
            from ({{ a_query }}) a_subq
            union all
            select min({{ event_time }}) as min_event_time, max({{ event_time }}) as max_event_time
            from ({{ b_query }}) b_subq
        )
        select max(min_event_time) as min_event_time, min(max_event_time) as max_event_time
        from min_maxes
    {% endset %}

    {% set query_response = dbt_utils.get_query_results_as_dict(min_max_queries) %}

    {% set event_time_props = {"event_time": event_time} %}

    {# query_response.keys() are only `min_event_time` and `max_event_time`, but they have indeterminate capitalisation #}
    {# hence the dynamic approach for what is otherwise just two well-known values #}
    {% for k in query_response.keys() %}
        {# Index 0 takes the first value of each result column #}
        {% do event_time_props.update({k | lower: query_response[k][0]}) %}
    {% endfor %}

    {% do return(event_time_props) %}
{% endmacro %}

{# Renders an inclusive `where` clause on the event_time column from the dict built above. #}
{# Renders nothing when event_time_props is falsy. Both bounds are emitted as single-quoted literals. #}
{% macro event_time_filter(event_time_props) %}
    {% if event_time_props %}
        where {{ event_time_props["event_time"] }} >= '{{ event_time_props["min_event_time"] }}'
        and {{ event_time_props["event_time"] }} <= '{{ event_time_props["max_event_time"] }}'
    {% endif %}
{% endmacro %}
--------------------------------------------------------------------------------
/macros/utils/_get_intersecting_columns_from_relations.sql:
--------------------------------------------------------------------------------
{# Returns the columns of a_relation that also appear in b_relation, in a_relation's order. #}
{# Membership is tested with `in` on the values dbt_utils returns, i.e. an exact match with no case folding. #}
{% macro _get_intersecting_columns_from_relations(a_relation, b_relation) %}
    {%- set a_cols = dbt_utils.get_filtered_columns_in_relation(a_relation) -%}
    {%- set b_cols = dbt_utils.get_filtered_columns_in_relation(b_relation) -%}

    {%- set intersection = [] -%}
    {%- for col in a_cols -%}
        {%- if col in b_cols -%}
            {%- do intersection.append(col) -%}
        {%- endif -%}
    {%- endfor -%}

    {% do return(intersection) %}
{% endmacro %}
--------------------------------------------------------------------------------
/packages.yml:
--------------------------------------------------------------------------------
packages:
  - package: dbt-labs/dbt_utils
    version: [">=0.9.0", "<2.0.0"]

--------------------------------------------------------------------------------
/supported_adapters.env:
--------------------------------------------------------------------------------
SUPPORTED_ADAPTERS=postgres 2 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | skipsdist = True 3 | envlist = lint_all, testenv 4 | 5 | [testenv] 6 | passenv = 7 | # postgres env vars 8 | POSTGRES_HOST 9 | POSTGRES_USER 10 | DBT_ENV_SECRET_POSTGRES_PASS 11 | POSTGRES_PORT 12 | POSTGRES_DATABASE 13 | POSTGRES_SCHEMA 14 | 15 | # Postgres integration tests for centralized dbt testing 16 | # run dbt commands directly, assumes dbt is already installed in environment 17 | [testenv:dbt_integration_postgres] 18 | changedir = integration_tests 19 | allowlist_externals = 20 | dbt 21 | skip_install = true 22 | commands = 23 | dbt --version 24 | dbt debug --target postgres 25 | --------------------------------------------------------------------------------