├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── ci-pr.yml │ └── docs.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .sqlfluffignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── dbt_project.yml ├── docs ├── assets │ ├── css │ │ └── termynal.css │ ├── favicon.ico │ ├── img │ │ ├── Sample_DAG_of_Tasks.png │ │ ├── data-diff.jpeg │ │ ├── favicon.ico │ │ ├── il-logo.png │ │ └── sis_ui.png │ └── js │ │ ├── custom.js │ │ ├── feedback.js │ │ └── termynal.js ├── contributing.md ├── hooks.py ├── index.md └── overrides │ └── main.html ├── integration_tests ├── ci │ ├── ci.profiles.yml │ ├── profiles.yml │ └── sf-init.sql ├── dbt_project.yml ├── models │ ├── example │ │ ├── my_first_dbt_model.sql │ │ └── my_second_dbt_model.sql │ ├── unit │ │ ├── unit.yml │ │ ├── verify_configured_tables.sql │ │ ├── verify_configured_tables.yml │ │ ├── verify_get_namespace.sql │ │ ├── verify_log_for_validation.sql │ │ └── verify_log_for_validation.yml │ └── verify_run │ │ ├── verify_configured_tables_data.sql │ │ ├── verify_created_sprocs.sql │ │ ├── verify_created_tasks.sql │ │ ├── verify_log_entry.sql │ │ └── verify_run.yml ├── package-lock.yml └── packages.yml ├── macros ├── data_diff.yml ├── data_diff__cleanup.sql ├── data_diff__poll_status_async.sql ├── data_diff__run.sql ├── data_diff__run_async.sql ├── resources │ ├── create_resources.sql │ ├── refresh_resource_data.sql │ ├── resources.yml │ └── stored-procedures │ │ ├── create__check_data_diff.sql │ │ ├── create__check_key.sql │ │ ├── create__check_schema.sql │ │ └── stored_procedures.yml ├── sis │ ├── diff_helper.py │ ├── sis.yml │ └── sis_deploy__diff_helper.sql └── utilities │ ├── escape_single_quote_value.sql │ ├── escape_single_quote_value.yml │ ├── get_namespace.sql │ └── get_namespace.yml ├── mkdocs.yml ├── models ├── 01_key_diff │ ├── key_check.sql │ ├── key_check.yml │ ├── key_check_summary.sql │ └── key_check_summary.yml ├── 02_schema_diff │ ├── schema_check.sql │ ├── schema_check.yml │ ├── schema_check_summary.sql │ └── schema_check_summary.yml ├── 03_content_diff │ ├── data_diff_check_summary.sql │ └── data_diff_check_summary.yml ├── configured_tables.sql ├── configured_tables.yml ├── log_for_validation.sql └── log_for_validation.yml ├── package-lock.yml ├── packages.yml ├── poetry.lock └── pyproject.toml /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report a bug or an issue you've found with this package 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Describe the bug 11 | 14 | 15 | ### Steps to reproduce 16 | 19 | 20 | ### Expected results 21 | 24 | 25 | ### Actual results 26 | 29 | 30 | ### Screenshots and log output 31 | 34 | 35 | ### System information 36 | 37 | **The contents of your `packages.yml` file:** 38 | 39 | **Which database are you using dbt with?** 40 | 41 | - [ ] snowflake 42 | - [ ] other (specify: ____________) 43 | 44 | **The output of `dbt --version`:** 45 | 46 | ```log 47 | 48 | ``` 49 | 50 | ### Additional context 51 | 54 | 55 | ### Are you interested in contributing the fix? 
56 | 59 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this package 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Describe the feature 11 | 12 | A clear and concise description of what you want to happen. 13 | 14 | ### Describe alternatives you've considered 15 | 16 | A clear and concise description of any alternative solutions or features you've considered. 17 | 18 | ### Additional context 19 | 20 | Is this feature database-specific? Which database(s) is/are relevant? Please include any other relevant context here. 21 | 22 | ### Who will this benefit? 23 | 24 | What kind of use case will this feature be useful for? Please be specific and provide examples; this will help us prioritize properly. 25 | 26 | ### Are you interested in contributing this feature? 27 | 30 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | resolves # 3 | 4 | This is a: 5 | 6 | - [ ] documentation update 7 | - [ ] bug fix with no breaking changes 8 | - [ ] new functionality 9 | - [ ] a breaking change 10 | 11 | All pull requests from community contributors should target the `main` branch (default). 12 | 13 | ## Description & motivation 14 | 17 | 18 | ## Checklist 19 | 20 | - [ ] This code is associated with an Issue which has been triaged and [accepted for development](https://docs.getdbt.com/docs/contributing/oss-expectations#pull-requests) 21 | - [ ] I have verified that these changes work locally on the following warehouses (Note: it's okay if you do not have access to all warehouses; this helps us understand what has been covered) 22 | - [ ] Snowflake 23 | - [ ] I have updated the README.md (if applicable) 24 | - [ ] I have added tests & descriptions to my models (and macros if applicable) 25 | -------------------------------------------------------------------------------- /.github/workflows/ci-pr.yml: -------------------------------------------------------------------------------- 1 | name: testing on pull request 2 | 3 | on: 4 | pull_request: 5 | branches: [ "main" ] 6 | 7 | jobs: 8 | snowflake: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v3 13 | with: 14 | fetch-depth: 0 15 | submodules: recursive 16 | - uses: actions/cache@v2 17 | with: 18 | key: ${{ github.ref }} 19 | path: .cache 20 | - uses: abatilo/actions-poetry@v2 21 | with: 22 | poetry-version: 1.7.1 23 | 24 | - name: Install dependencies 25 | run: | 26 | poetry config installer.max-workers 1 27 | poetry config virtualenvs.in-project true 28 | poetry install 29 | 30 | - name: Code Quality 31 | run: | 32 | poetry run poe lint 33 | 34 | - name: Test data diff 35 | run: | 36 | # Single thread 37 | poetry run poe data-diff 38 | poetry run poe data-diff-test 39 | 40 | # Multi thread 41 | poetry run poe data-diff-async-wait 42 | poetry run poe data-diff-test 43 | env: 44 | DBT_SNOWFLAKE_TEST_ACCOUNT: ${{ secrets.DBT_SNOWFLAKE_TEST_ACCOUNT }} 45 | DBT_SNOWFLAKE_TEST_USER: ${{ secrets.DBT_SNOWFLAKE_TEST_USER }} 46 | DBT_ENV_SECRET_SNOWFLAKE_TEST_PASSWORD: ${{ secrets.DBT_ENV_SECRET_SNOWFLAKE_TEST_PASSWORD }} 47 | DBT_SNOWFLAKE_TEST_ROLE: ${{ secrets.DBT_SNOWFLAKE_TEST_ROLE }} 48 | DBT_SNOWFLAKE_TEST_DATABASE: ${{
secrets.DBT_SNOWFLAKE_TEST_DATABASE }} 49 | DBT_SNOWFLAKE_TEST_WAREHOUSE: ${{ secrets.DBT_SNOWFLAKE_TEST_WAREHOUSE }} 50 | DBT_SCHEMA: PRCI_${{ github.sha }} 51 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: deploy documentation site 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version: 7 | description: Document version e.g. 1.1, 1.2 8 | required: true 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | with: 17 | fetch-depth: 0 18 | submodules: recursive 19 | - uses: actions/cache@v2 20 | with: 21 | key: ${{ github.ref }} 22 | path: .cache 23 | - uses: abatilo/actions-poetry@v2 24 | with: 25 | poetry-version: 1.7.1 26 | 27 | - name: Install dependencies 28 | run: | 29 | poetry config installer.max-workers 1 30 | poetry config virtualenvs.in-project true 31 | poetry install 32 | 33 | - name: Build the docs site 34 | run: | 35 | mkdir -p ~/.dbt 36 | cp integration_tests/ci/ci.profiles.yml ~/.dbt/profiles.yml 37 | poetry run poe build-doc 38 | env: 39 | DBT_SNOWFLAKE_TEST_ACCOUNT: ${{ secrets.DBT_SNOWFLAKE_TEST_ACCOUNT }} 40 | DBT_SNOWFLAKE_TEST_USER: ${{ secrets.DBT_SNOWFLAKE_TEST_USER }} 41 | DBT_ENV_SECRET_SNOWFLAKE_TEST_PASSWORD: ${{ secrets.DBT_ENV_SECRET_SNOWFLAKE_TEST_PASSWORD }} 42 | DBT_SNOWFLAKE_TEST_ROLE: ${{ secrets.DBT_SNOWFLAKE_TEST_ROLE }} 43 | DBT_SNOWFLAKE_TEST_DATABASE: ${{ secrets.DBT_SNOWFLAKE_TEST_DATABASE }} 44 | DBT_SNOWFLAKE_TEST_WAREHOUSE: ${{ secrets.DBT_SNOWFLAKE_TEST_WAREHOUSE }} 45 | DBT_SCHEMA: DOCS 46 | 47 | - name: Configure Git user 48 | run: | 49 | git config --local user.email "github-actions[bot]@users.noreply.github.com" 50 | git config --local user.name "github-actions[bot]" 51 | 52 | - name: Publish site 53 | run: | 54 | poetry run mike deploy --push --update-aliases ${{ inputs.version }} latest 55 | env: 56 | GOOGLE_ANALYTICS_KEY: ${{ secrets.GOOGLE_ANALYTICS_KEY }} 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dbt 2 | dbt_packages/ 3 | logs/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | target-*/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # poetry 103 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 106 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 107 | #poetry.lock 108 | 109 | # pdm 110 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 111 | #pdm.lock 112 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 113 | # in version control. 114 | # https://pdm.fming.dev/#use-with-ide 115 | .pdm.toml 116 | 117 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 118 | __pypackages__/ 119 | 120 | # Celery stuff 121 | celerybeat-schedule 122 | celerybeat.pid 123 | 124 | # SageMath parsed files 125 | *.sage.py 126 | 127 | # Environments 128 | .env 129 | .venv 130 | env/ 131 | venv/ 132 | ENV/ 133 | env.bak/ 134 | venv.bak/ 135 | 136 | # Spyder project settings 137 | .spyderproject 138 | .spyproject 139 | 140 | # Rope project settings 141 | .ropeproject 142 | 143 | # mkdocs documentation 144 | /site 145 | 146 | # mypy 147 | .mypy_cache/ 148 | .dmypy.json 149 | dmypy.json 150 | 151 | # Pyre type checker 152 | .pyre/ 153 | 154 | # pytype static type analyzer 155 | .pytype/ 156 | 157 | # Cython debug symbols 158 | cython_debug/ 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
165 | #.idea/ 166 | 167 | .secrets 168 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | fail_fast: true 2 | repos: 3 | - repo: https://github.com/commitizen-tools/commitizen 4 | rev: v2.20.0 5 | hooks: 6 | - id: commitizen 7 | name: Check Commit Message 8 | stages: [commit-msg] 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v2.2.1 11 | hooks: 12 | - id: trailing-whitespace 13 | - id: end-of-file-fixer 14 | -------------------------------------------------------------------------------- /.sqlfluffignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_modules/ 3 | dbt_packages/ 4 | sf-init.sql 5 | /macros 6 | /integration_tests 7 | /site 8 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | the [Infinite Lambda Contact](https://infinitelambda.com/contacts/). 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at the [Code of Conduct v2](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html). 119 | 120 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 121 | enforcement ladder](https://github.com/mozilla/diversity). 122 | 123 | [homepage]: https://www.contributor-covenant.org 124 | 125 | For answers to common questions about this code of conduct, see the [FAQ](https://www.contributor-covenant.org/faq). Translations are available [here](https://www.contributor-covenant.org/translations). 
126 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `dbt-data-diff` 2 | 3 | `dbt-data-diff` is an open-source dbt package ❤️. Whether you are a seasoned open-source contributor or a first-time committer, we welcome and encourage you to contribute code, documentation, ideas, or problem statements to this project. 4 | 5 | - [Contributing to `dbt-data-diff`](#contributing-to-dbt-data-diff) 6 | - [About this document](#about-this-document) 7 | - [Getting the code](#getting-the-code) 8 | - [Installing git](#installing-git) 9 | - [External contributors](#external-contributors) 10 | - [Setting up an environment](#setting-up-an-environment) 11 | - [Tools](#tools) 12 | - [Get dbt profile ready](#get-dbt-profile-ready) 13 | - [Linting](#linting) 14 | - [Testing](#testing) 15 | - [Committing](#committing) 16 | - [Submitting a Pull Request](#submitting-a-pull-request) 17 | 18 | ## About this document 19 | 20 | There are many ways to contribute to the ongoing development of `dbt-data-diff`, such as by participating in discussions and issues. 21 | 22 | The rest of this document serves as a more granular guide for contributing code changes to `dbt-data-diff` (this repository). It is not intended as a guide for using `dbt-data-diff`, and some pieces assume a level of familiarity with Python development with `poetry`. Specific code snippets in this guide assume you are using macOS or Linux and are comfortable with the command line. 23 | 24 | - **Branches:** All pull requests from community contributors should target the `main` branch (default). If the change is needed as a patch for a minor version of dbt that has already been released (or is already a release candidate), a maintainer will backport the changes in your PR to the relevant "latest" release branch (`1.0.`, `1.1.`, ...). If an issue fix applies to a release branch, that fix should be first committed to the development branch and then to the release branch (rarely, release-branch fixes may not apply to `main`). 25 | - **Releases**: Before releasing a new minor version, we prepare a series of beta release candidates to allow users to test the new version in live environments. This is an important quality assurance step, as it exposes the new code to a wide variety of complicated deployments and can surface bugs before official release. Releases are accessible via [dbt Hub](https://hub.getdbt.com/infinitelambda/data_diff). 26 | 27 | ## Getting the code 28 | 29 | ### Installing git 30 | 31 | You will need `git` in order to download and modify the `dbt-data-diff` source code. On macOS, the best way to download git is to just install [Xcode](https://developer.apple.com/support/xcode/). 32 | 33 | ### External contributors 34 | 35 | You can contribute to `dbt-data-diff` by forking the `dbt-data-diff` repository. For a detailed overview on forking, check out the [GitHub docs on forking](https://help.github.com/en/articles/fork-a-repo). In short, you will need to: 36 | 37 | 1. Fork the `dbt-data-diff` repository 38 | 2. Clone your fork locally 39 | 3. Check out a new branch for your proposed changes 40 | 4. Push changes to your fork 41 | 5. Open a pull request against `infinitelambda/dbt-data-diff` from your forked repository 42 | 43 | ## Setting up an environment 44 | 45 | There are some tools that will be helpful to you in developing locally.
While this is the list relevant for `dbt-data-diff` development, many of these tools are used commonly across open-source Python projects. 46 | 47 | ### Tools 48 | 49 | We use `poetry` in `dbt-data-diff` development and testing. 50 | 51 | First, install poetry via pip or via the [official installer](https://python-poetry.org/docs/#installing-with-the-official-installer); please check the exact version used in the [poetry.lock](/poetry.lock) file. Then, start installing the local environment: 52 | 53 | ```bash 54 | poetry install 55 | poetry shell 56 | poe git-hooks 57 | ``` 58 | 59 | ### Get dbt profile ready 60 | 61 | Please check [the sample script](/integration_tests/ci/sf-init.sql) in the `integration_tests/ci` directory to initialize the Snowflake environment and get your database freshly created. 62 | 63 | Next, you should follow the [dbt profile instructions](https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles) and set up your dedicated profile. Again, you could [try our sample](/integration_tests/ci/profiles.yml) in the same directory as above. 64 | 65 | Run `poe data-diff-verify` to verify the connection ✅ 66 | 67 | ## Linting 68 | 69 | We also try to maintain code quality by leveraging [sqlfluff](https://sqlfluff.com/). 70 | 71 | It is highly encouraged that you format the code before committing using the below `poe` helpers: 72 | 73 | ```bash 74 | poe lint # check your code; we run this check in CI 75 | poe format # format your code to match sqlfluff configs 76 | ``` 77 | 78 | ## Testing 79 | 80 | Once you're able to manually test that your code change is working as expected, it's important to run the existing automated tests, as well as add some new ones. These tests will ensure that: 81 | 82 | - Your code changes do not unexpectedly break other established functionality 83 | - Your code changes can handle all known edge cases 84 | - The functionality you're adding will _keep_ working in the future 85 | 86 | See below for details on running existing integration tests and adding new ones: 87 | 88 | **An integration test typically involves making 1) a new seed file, 2) a new model file, and 3) a generic test to assert anticipated behaviour.** 89 | 90 | Once you've added all of these files, in the `poetry shell`, you should be able to run: 91 | 92 | ```bash 93 | poe data-diff-migration # create resources 94 | poe data-diff-bg # prepare blue/green data 95 | poe data-diff-run # trigger the data-diff 96 | poe data-diff-test # test the package and the data-diff result 97 | ``` 98 | 99 | Alternatively, you could use a single command: `poe data-diff-run` OR `poe data-diff-async-wait` 👍 100 | 101 | ## Committing 102 | 103 | Upon running `poe git-hooks`, we make sure that you provide commit messages that are as clean and neat as possible. 104 | 105 | There are 2 main checks: 106 | 107 | - Trailing whitespace: if any is found, the hook will try to fix it for us, and we have to stage the changes before committing 108 | - Commit message: it must follow the [commitizen](https://commitizen-tools.github.io/commitizen/) convention as `{change_type}: {message}` 109 | - `change_type`: is one of `feat|fix|chore|refactor|perf|BREAKING CHANGE` 110 | 111 | ## Submitting a Pull Request 112 | 113 | Code can be merged into the current development branch `main` by opening a pull request. A `dbt-data-diff` maintainer will review your PR. They may suggest code revisions for style or clarity, or request that you add unit or integration test(s). These are good things!
We believe that, with a little bit of help, anyone can contribute high-quality code. 114 | 115 | Automated tests run via GitHub Actions. If you're a first-time contributor, all tests (including code checks and unit tests) will require a maintainer to approve. Changes in the `dbt-data-diff` repository trigger integration tests against Snowflake 💰. 116 | 117 | Once all tests are passing and your PR has been approved, a `dbt-data-diff` maintainer will merge your changes into the active development branch. And that's it! 118 | 119 | **_Happy Developing 🎉_** 120 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 Infinite Lambda 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # dbt-data-diff 3 | 4 | 5 | 6 | [![dbt-hub](https://img.shields.io/badge/Visit-dbt--hub%20↗️-FF694B?logo=dbt&logoColor=FF694B)](https://hub.getdbt.com/infinitelambda/data_diff) 7 | [![support-snowflake](https://img.shields.io/badge/support-Snowflake-7faecd?logo=snowflake&logoColor=7faecd)](https://docs.snowflake.com?ref=infinitelambda) 8 | [![support-dbt](https://img.shields.io/badge/support-dbt%20v1.6+-FF694B?logo=dbt&logoColor=FF694B)](https://docs.getdbt.com?ref=infinitelambda) 9 | [![built-in-sis](https://img.shields.io/badge/built--in-SiS-BD4042?logo=streamlit&logoColor=FF694B)](https://www.snowflake.com/en/data-cloud/overview/streamlit-in-snowflake?ref=infinitelambda) 10 | 11 | Data-diff solution for dbt-ers with Snowflake ❄️ 🚀 12 | 13 | > [!TIP] 14 | > 📖 For more details, please visit [the documentation site](https://data-diff.iflambda.com/latest/) or go to [docs/index.md](./docs/index.md) 15 | 16 | Sample diffing 17 | 18 | ## Installation 19 | 20 | - Add to `packages.yml` file: 21 | 22 | ```yml 23 | packages: 24 | - package: infinitelambda/data_diff 25 | version: [">=1.0.0", "<1.1.0"] 26 | ``` 27 | 28 | Or use the latest version from git: 29 | 30 | ```yml 31 | packages: 32 | - git: "https://github.com/infinitelambda/dbt-data-diff.git" 33 | revision: 1.0.0 # 1.0.0b1 34 | ``` 35 | 36 | - (Optional) Configure database & schema in `dbt_project.yml` file: 37 | 38 | ```yml 39 | vars: 40 | # (optional) default to `target.database` if not specified 41 | data_diff__database: COMMON 42 | # (optional) default to `target.schema` if not specified 43 | data_diff__schema: DATA_DIFF 44 | ``` 45 | 46 | - Create/Migrate the `data-diff`'s DDL resources 47 | 48 | ```bash 49 | dbt deps 50 | dbt run -s data_diff --vars '{data_diff__on_migration: true}' 51 | ``` 52 | 53 | ## Quick Demo 54 | 55 | Let's jump to the [Quick Start](https://data-diff.iflambda.com/latest/#quick-start) section, and then to the [Demo](https://data-diff.iflambda.com/latest/#demo) section 🏃 56 | 57 | 📊 Here is the sample Streamlit in Snowflake application based on the result produced by the package: 58 | 59 | Sample SiS 60 | 61 | ## How to Contribute 62 | 63 | `dbt-data-diff` is an open-source dbt package. Whether you are a seasoned open-source contributor or a first-time committer, we welcome and encourage you to contribute code, documentation, ideas, or problem statements to this project. 64 | 65 | 👉 See the [CONTRIBUTING guidelines](https://data-diff.iflambda.com/latest/nav/dev/contributing.html) for more details or check out [CONTRIBUTING.md](./CONTRIBUTING.md) 66 | 67 | 🌟 And then, kudos to **our beloved Contributors**: 68 | 69 | 70 | Contributors 71 | 72 | 73 | ⭐ Special Credits to [👱 Attila Berecz](https://www.linkedin.com/in/attila-berecz-a0bb5ba2/) who is the OG Contributor of the Core Concept and all the Snowflake Stored Procedures 74 | 75 | ## About Infinite Lambda 76 | 77 | Infinite Lambda is a cloud and data consultancy. We build strategies, help organizations implement them, and pass on the expertise to look after the infrastructure. 78 | 79 | We are an Elite Snowflake Partner, a Platinum dbt Partner, and a two-time Fivetran Innovation Partner of the Year for EMEA.
80 | 81 | Naturally, we love exploring innovative solutions and sharing knowledge, so go ahead and: 82 | 83 | 🔧 Take a look around our [Git](https://github.com/infinitelambda) 84 | 85 | ✏️ Browse our [tech blog](https://infinitelambda.com/category/tech-blog/) 86 | 87 | We are also chatty, so: 88 | 89 | 👀 Follow us on [LinkedIn](https://www.linkedin.com/company/infinite-lambda/) 90 | 91 | 👋🏼 Or just [get in touch](https://infinitelambda.com/contacts/) 92 | 93 | [About IL](https://infinitelambda.com/) 94 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | | Version | Supported | Until When? | 6 | | ------- | ------------------ | ------------------ | 7 | | 1.0.x | :white_check_mark: | | 8 | 9 | ## Reporting a Vulnerability 10 | 11 | The repository currently has [Privately reporting a security vulnerability](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) enabled. Once any security issue pops up, the `dbt-data-diff` maintainers will be responsible for making an update and might release patches if necessary. 12 | 13 | Otherwise, please explicitly raise awareness via [Report a security vulnerability issue](https://github.com/infinitelambda/dbt-data-diff/security/advisories/new/?title=[SEC]). 14 | -------------------------------------------------------------------------------- /dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'data_diff' 2 | config-version: 2 3 | require-dbt-version: [">=1.6.0", "<2.0.0"] 4 | 5 | model-paths: ["models"] 6 | macro-paths: ["macros"] 7 | 8 | clean-targets: 9 | - "target" 10 | - "dbt_packages" 11 | - "logs" 12 | 13 | vars: 14 | # data_diff__database: COMMON # This will be passed through `generate_database_name` macro 15 | # data_diff__schema: DATA_DIFF # This will be passed through `generate_schema_name` macro 16 | data_diff__on_migration: false # Enable to deploy the package's resources e.g. stored proc; takes precedence over `data_diff__on_migration_data` 17 | data_diff__on_migration_data: false # Enable to refresh the list of tables configured for the validation 18 | data_diff__on_run_hook: false # Enable to run data diff's hook 19 | # data_diff__full_refresh: false # Enable to re-create the data-diff models for which the native dbt `full-refresh` option is currently disabled 20 | data_diff__configured_tables__source_fixed_naming: true # Set false to let (only) source db & schema naming follow the configured one in dbt 21 | data_diff__configured_tables__target_fixed_naming: true # Set false to let (only) target db & schema naming follow the configured one in dbt 22 | # data_diff__configured_tables: 23 | # - src_db: src_db # optional, empty to get target.database 24 | # src_schema: src_schema # optional, empty to get target.schema 25 | # src_table: table1 # mandatory 26 | # trg_db: trg_db # optional, empty to get target.database 27 | # trg_schema: trg_schema # optional, empty to get target.schema 28 | # trg_table: table1 # optional, empty to get src_table 29 | # pk: key # mandatory, multiple columns split by comma e.g.
key1,key2 30 | # include_columns: [] # optional, [] to include all 31 | # exclude_columns: ["timestamp"] # optional, [] to exclude nothing 32 | # where: "1=1" # optional, 1=1 by default 33 | # pipe_name: '' # in non-async mode, it must be empty if specified 34 | # # in async mode, it can be any value e.g. "1", "batch_dat", "batch_milan" 35 | data_diff__auto_pipe: false # Set true & re-run migration to auto-set `pipe_name` config if it's not specified: Each compared entity will be set in a single thread 36 | 37 | on-run-end: 38 | - > # migration hook 39 | {% if var("data_diff__on_migration", false) %} 40 | {{ data_diff.create_resources() }} 41 | 42 | {% if execute and var("data_diff__on_migration_data", true) %} 43 | {{ data_diff.refresh_resource_data() }} 44 | {% endif %} 45 | 46 | {% endif %} 47 | -------------------------------------------------------------------------------- /docs/assets/css/termynal.css: -------------------------------------------------------------------------------- 1 | /** 2 | * termynal.js 3 | * 4 | * @author Ines Montani 5 | * @version 0.0.1 6 | * @license MIT 7 | */ 8 | 9 | :root { 10 | --color-bg: #252a33; 11 | --color-text: #eee; 12 | --color-text-subtle: #a2a2a2; 13 | } 14 | 15 | [data-termynal] { 16 | width: 750px; 17 | max-width: 100%; 18 | background: var(--color-bg); 19 | color: var(--color-text); 20 | font-size: 12px; 21 | font-family: 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; 22 | border-radius: 4px; 23 | padding: 75px 45px 35px; 24 | position: relative; 25 | -webkit-box-sizing: border-box; 26 | box-sizing: border-box; 27 | } 28 | 29 | [data-termynal]:before { 30 | content: ''; 31 | position: absolute; 32 | top: 15px; 33 | left: 15px; 34 | display: inline-block; 35 | width: 15px; 36 | height: 15px; 37 | border-radius: 50%; 38 | /* A little hack to display the window buttons in one pseudo element. */ 39 | background: #d9515d; 40 | -webkit-box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930; 41 | box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930; 42 | } 43 | 44 | [data-termynal]:after { 45 | content: 'bash'; 46 | position: absolute; 47 | color: var(--color-text-subtle); 48 | top: 5px; 49 | left: 0; 50 | width: 100%; 51 | text-align: center; 52 | } 53 | 54 | [data-ty] { 55 | display: block; 56 | line-height: 2; 57 | } 58 | 59 | [data-ty]:before { 60 | /* Set up defaults and ensure empty lines are displayed. 
*/ 61 | content: ''; 62 | display: inline-block; 63 | vertical-align: middle; 64 | } 65 | 66 | [data-ty="input"]:before, 67 | [data-ty-prompt]:before { 68 | margin-right: 0.75em; 69 | color: var(--color-text-subtle); 70 | } 71 | 72 | [data-ty="input"]:before { 73 | content: '$'; 74 | } 75 | 76 | [data-ty][data-ty-prompt]:before { 77 | content: attr(data-ty-prompt); 78 | } 79 | 80 | [data-ty-cursor]:after { 81 | content: attr(data-ty-cursor); 82 | font-family: monospace; 83 | margin-left: 0.5em; 84 | -webkit-animation: blink 1s infinite; 85 | animation: blink 1s infinite; 86 | } 87 | 88 | 89 | /* Cursor animation */ 90 | 91 | @-webkit-keyframes blink { 92 | 50% { 93 | opacity: 0; 94 | } 95 | } 96 | 97 | @keyframes blink { 98 | 50% { 99 | opacity: 0; 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /docs/assets/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infinitelambda/dbt-data-diff/5ca23538738eec6b9c1f8ebdf2921e953c372427/docs/assets/favicon.ico -------------------------------------------------------------------------------- /docs/assets/img/Sample_DAG_of_Tasks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infinitelambda/dbt-data-diff/5ca23538738eec6b9c1f8ebdf2921e953c372427/docs/assets/img/Sample_DAG_of_Tasks.png -------------------------------------------------------------------------------- /docs/assets/img/data-diff.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infinitelambda/dbt-data-diff/5ca23538738eec6b9c1f8ebdf2921e953c372427/docs/assets/img/data-diff.jpeg -------------------------------------------------------------------------------- /docs/assets/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infinitelambda/dbt-data-diff/5ca23538738eec6b9c1f8ebdf2921e953c372427/docs/assets/img/favicon.ico -------------------------------------------------------------------------------- /docs/assets/img/il-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infinitelambda/dbt-data-diff/5ca23538738eec6b9c1f8ebdf2921e953c372427/docs/assets/img/il-logo.png -------------------------------------------------------------------------------- /docs/assets/img/sis_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infinitelambda/dbt-data-diff/5ca23538738eec6b9c1f8ebdf2921e953c372427/docs/assets/img/sis_ui.png -------------------------------------------------------------------------------- /docs/assets/js/custom.js: -------------------------------------------------------------------------------- 1 | const div = document.querySelector('.github-topic-projects') 2 | 3 | async function getDataBatch(page) { 4 | const response = await fetch(`https://api.github.com/search/repositories?q=topic:dbt-data-diff&per_page=100&page=${page}`, { headers: { Accept: 'application/vnd.github.mercy-preview+json' } }) 5 | const data = await response.json() 6 | return data 7 | } 8 | 9 | async function getData() { 10 | let page = 1 11 | let data = [] 12 | let dataBatch = await getDataBatch(page) 13 | data = data.concat(dataBatch.items) 14 | const totalCount = dataBatch.total_count 15 | while (data.length < totalCount) { 16 | 
page += 1 17 | dataBatch = await getDataBatch(page) 18 | data = data.concat(dataBatch.items) 19 | } 20 | return data 21 | } 22 | 23 | function setupTermynal() { 24 | document.querySelectorAll(".termynal").forEach(node => { 25 | node.style.display = "block"; 26 | new Termynal(node, { 27 | lineDelay: 500 28 | }); 29 | }); 30 | const progressLiteralStart = "---> 100%"; 31 | const promptLiteralStart = "$ "; 32 | const customPromptLiteralStart = "# "; 33 | const termynalActivateClass = "termy"; 34 | let termynals = []; 35 | 36 | function createTermynals() { 37 | document 38 | .querySelectorAll(`.${termynalActivateClass} .highlight`) 39 | .forEach(node => { 40 | const text = node.textContent; 41 | const lines = text.split("\n"); 42 | const useLines = []; 43 | let buffer = []; 44 | function saveBuffer() { 45 | if (buffer.length) { 46 | let isBlankSpace = true; 47 | buffer.forEach(line => { 48 | if (line) { 49 | isBlankSpace = false; 50 | } 51 | }); 52 | dataValue = {}; 53 | if (isBlankSpace) { 54 | dataValue["delay"] = 0; 55 | } 56 | if (buffer[buffer.length - 1] === "") { 57 | // A last single
won't have effect 58 | // so put an additional one 59 | buffer.push(""); 60 | } 61 | const bufferValue = buffer.join("
"); 62 | dataValue["value"] = bufferValue; 63 | useLines.push(dataValue); 64 | buffer = []; 65 | } 66 | } 67 | for (let line of lines) { 68 | if (line === progressLiteralStart) { 69 | saveBuffer(); 70 | useLines.push({ 71 | type: "progress" 72 | }); 73 | } else if (line.startsWith(promptLiteralStart)) { 74 | saveBuffer(); 75 | const value = line.replace(promptLiteralStart, "").trimEnd(); 76 | useLines.push({ 77 | type: "input", 78 | value: value 79 | }); 80 | } else if (line.startsWith("// ")) { 81 | saveBuffer(); 82 | const value = "💬 " + line.replace("// ", "").trimEnd(); 83 | useLines.push({ 84 | value: value, 85 | class: "termynal-comment", 86 | delay: 0 87 | }); 88 | } else if (line.startsWith(customPromptLiteralStart)) { 89 | saveBuffer(); 90 | const promptStart = line.indexOf(promptLiteralStart); 91 | if (promptStart === -1) { 92 | console.error("Custom prompt found but no end delimiter", line) 93 | } 94 | const prompt = line.slice(0, promptStart).replace(customPromptLiteralStart, "") 95 | let value = line.slice(promptStart + promptLiteralStart.length); 96 | useLines.push({ 97 | type: "input", 98 | value: value, 99 | prompt: prompt 100 | }); 101 | } else { 102 | buffer.push(line); 103 | } 104 | } 105 | saveBuffer(); 106 | const div = document.createElement("div"); 107 | node.replaceWith(div); 108 | const termynal = new Termynal(div, { 109 | lineData: useLines, 110 | noInit: true, 111 | lineDelay: 500 112 | }); 113 | termynals.push(termynal); 114 | }); 115 | } 116 | 117 | function loadVisibleTermynals() { 118 | termynals = termynals.filter(termynal => { 119 | if (termynal.container.getBoundingClientRect().top - innerHeight <= 0) { 120 | termynal.init(); 121 | return false; 122 | } 123 | return true; 124 | }); 125 | } 126 | window.addEventListener("scroll", loadVisibleTermynals); 127 | createTermynals(); 128 | loadVisibleTermynals(); 129 | } 130 | 131 | async function main() { 132 | if (div) { 133 | data = await getData() 134 | div.innerHTML = '
<ul></ul>' 135 | const ul = document.querySelector('.github-topic-projects ul') 136 | data.forEach(v => { 137 | if (v.full_name === 'infinitelambda/dbt-data-diff') { 138 | return 139 | } 140 | const li = document.createElement('li') 141 | li.innerHTML = `★ ${v.stargazers_count} - ${v.full_name} by @${v.owner.login}` 142 | ul.append(li) 143 | }) 144 | } 145 | 146 | setupTermynal(); 147 | } 148 | 149 | main() 150 | -------------------------------------------------------------------------------- /docs/assets/js/feedback.js: -------------------------------------------------------------------------------- 1 | var feedback = document.forms.feedback 2 | feedback.addEventListener("submit", function(ev) { 3 | ev.preventDefault() 4 | 5 | /* Retrieve page and feedback value */ 6 | var page = document.location.pathname 7 | var data = ev.submitter.getAttribute("data-md-value") 8 | 9 | /* Send feedback value */ 10 | console.log(page, data) 11 | }) 12 | -------------------------------------------------------------------------------- /docs/assets/js/termynal.js: -------------------------------------------------------------------------------- 1 | /** 2 | * termynal.js 3 | * A lightweight, modern and extensible animated terminal window, using 4 | * async/await. 5 | * 6 | * @author Ines Montani 7 | * @version 0.0.1 8 | * @license MIT 9 | */ 10 | 11 | 'use strict'; 12 | 13 | /** Generate a terminal widget. */ 14 | class Termynal { 15 | /** 16 | * Construct the widget's settings. 17 | * @param {(string|Node)=} container - Query selector or container element. 18 | * @param {Object=} options - Custom settings. 19 | * @param {string} options.prefix - Prefix to use for data attributes. 20 | * @param {number} options.startDelay - Delay before animation, in ms. 21 | * @param {number} options.typeDelay - Delay between each typed character, in ms. 22 | * @param {number} options.lineDelay - Delay between each line, in ms. 23 | * @param {number} options.progressLength - Number of characters displayed as progress bar. 24 | * @param {string} options.progressChar – Character to use for progress bar, defaults to █. 25 | * @param {number} options.progressPercent - Max percent of progress. 26 | * @param {string} options.cursor – Character to use for cursor, defaults to ▋. 27 | * @param {Object[]} lineData - Dynamically loaded line data objects. 28 | * @param {boolean} options.noInit - Don't initialise the animation. 29 | */ 30 | constructor(container = '#termynal', options = {}) { 31 | this.container = (typeof container === 'string') ?
document.querySelector(container) : container; 32 | this.pfx = `data-${options.prefix || 'ty'}`; 33 | this.startDelay = options.startDelay 34 | || parseFloat(this.container.getAttribute(`${this.pfx}-startDelay`)) || 600; 35 | this.typeDelay = options.typeDelay 36 | || parseFloat(this.container.getAttribute(`${this.pfx}-typeDelay`)) || 90; 37 | this.lineDelay = options.lineDelay 38 | || parseFloat(this.container.getAttribute(`${this.pfx}-lineDelay`)) || 1500; 39 | this.progressLength = options.progressLength 40 | || parseFloat(this.container.getAttribute(`${this.pfx}-progressLength`)) || 40; 41 | this.progressChar = options.progressChar 42 | || this.container.getAttribute(`${this.pfx}-progressChar`) || '█'; 43 | this.progressPercent = options.progressPercent 44 | || parseFloat(this.container.getAttribute(`${this.pfx}-progressPercent`)) || 100; 45 | this.cursor = options.cursor 46 | || this.container.getAttribute(`${this.pfx}-cursor`) || '▋'; 47 | this.lineData = this.lineDataToElements(options.lineData || []); 48 | if (!options.noInit) this.init() 49 | } 50 | 51 | /** 52 | * Initialise the widget, get lines, clear container and start animation. 53 | */ 54 | init() { 55 | // Appends dynamically loaded lines to existing line elements. 56 | this.lines = [...this.container.querySelectorAll(`[${this.pfx}]`)].concat(this.lineData); 57 | 58 | /** 59 | * Calculates width and height of Termynal container. 60 | * If container is empty and lines are dynamically loaded, defaults to browser `auto` or CSS. 61 | */ 62 | const containerStyle = getComputedStyle(this.container); 63 | this.container.style.width = containerStyle.width !== '0px' ? 64 | containerStyle.width : undefined; 65 | this.container.style.minHeight = containerStyle.height !== '0px' ? 66 | containerStyle.height : undefined; 67 | 68 | this.container.setAttribute('data-termynal', ''); 69 | this.container.innerHTML = ''; 70 | this.start(); 71 | } 72 | 73 | /** 74 | * Start the animation and render the lines depending on their data attributes. 75 | */ 76 | async start() { 77 | await this._wait(this.startDelay); 78 | 79 | for (let line of this.lines) { 80 | const type = line.getAttribute(this.pfx); 81 | const delay = line.getAttribute(`${this.pfx}-delay`) || this.lineDelay; 82 | 83 | if (type == 'input') { 84 | line.setAttribute(`${this.pfx}-cursor`, this.cursor); 85 | await this.type(line); 86 | await this._wait(delay); 87 | } 88 | 89 | else if (type == 'progress') { 90 | await this.progress(line); 91 | await this._wait(delay); 92 | } 93 | 94 | else { 95 | this.container.appendChild(line); 96 | await this._wait(delay); 97 | } 98 | 99 | line.removeAttribute(`${this.pfx}-cursor`); 100 | } 101 | } 102 | 103 | /** 104 | * Animate a typed line. 105 | * @param {Node} line - The line element to render. 106 | */ 107 | async type(line) { 108 | const chars = [...line.textContent]; 109 | const delay = line.getAttribute(`${this.pfx}-typeDelay`) || this.typeDelay; 110 | line.textContent = ''; 111 | this.container.appendChild(line); 112 | 113 | for (let char of chars) { 114 | await this._wait(delay); 115 | line.textContent += char; 116 | } 117 | } 118 | 119 | /** 120 | * Animate a progress bar. 121 | * @param {Node} line - The line element to render.
122 | */ 123 | async progress(line) { 124 | const progressLength = line.getAttribute(`${this.pfx}-progressLength`) 125 | || this.progressLength; 126 | const progressChar = line.getAttribute(`${this.pfx}-progressChar`) 127 | || this.progressChar; 128 | const chars = progressChar.repeat(progressLength); 129 | const progressPercent = line.getAttribute(`${this.pfx}-progressPercent`) 130 | || this.progressPercent; 131 | line.textContent = ''; 132 | this.container.appendChild(line); 133 | 134 | for (let i = 1; i < chars.length + 1; i++) { 135 | await this._wait(this.typeDelay); 136 | const percent = Math.round(i / chars.length * 100); 137 | line.textContent = `${chars.slice(0, i)} ${percent}%`; 138 | if (percent > progressPercent) { 139 | break; 140 | } 141 | } 142 | } 143 | 144 | /** 145 | * Helper function for animation delays, called with `await`. 146 | * @param {number} time - Timeout, in ms. 147 | */ 148 | _wait(time) { 149 | return new Promise(resolve => setTimeout(resolve, time)); 150 | } 151 | 152 | /** 153 | * Converts line data objects into line elements. 154 | * 155 | * @param {Object[]} lineData - Dynamically loaded lines. 156 | * @param {Object} line - Line data object. 157 | * @returns {Element[]} - Array of line elements. 158 | */ 159 | lineDataToElements(lineData) { 160 | return lineData.map(line => { 161 | let div = document.createElement('div'); 162 | div.innerHTML = `<span ${this._attributes(line)}>${line.value || ''}</span>`; 163 | 164 | return div.firstElementChild; 165 | }); 166 | } 167 | 168 | /** 169 | * Helper function for generating attributes string. 170 | * 171 | * @param {Object} line - Line data object. 172 | * @returns {string} - String of attributes. 173 | */ 174 | _attributes(line) { 175 | let attrs = ''; 176 | for (let prop in line) { 177 | attrs += this.pfx; 178 | 179 | if (prop === 'type') { 180 | attrs += `="${line[prop]}" ` 181 | } else if (prop !== 'value') { 182 | attrs += `-${prop}="${line[prop]}" ` 183 | } 184 | } 185 | 186 | return attrs; 187 | } 188 | } 189 | 190 | /** 191 | * HTML API: If current script has container(s) specified, initialise Termynal. 192 | */ 193 | if (document.currentScript.hasAttribute('data-termynal-container')) { 194 | const containers = document.currentScript.getAttribute('data-termynal-container'); 195 | containers.split('|') 196 | .forEach(container => new Termynal(container)) 197 | } 198 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to `dbt-data-diff` 2 | 3 | `dbt-data-diff` is an open-source dbt package ❤️. Whether you are a seasoned open-source contributor or a first-time committer, we welcome and encourage you to contribute code, documentation, ideas, or problem statements to this project. 4 | 5 | - [Contributing to `dbt-data-diff`](#contributing-to-dbt-data-diff) 6 | - [About this document](#about-this-document) 7 | - [Getting the code](#getting-the-code) 8 | - [Installing git](#installing-git) 9 | - [External contributors](#external-contributors) 10 | - [Setting up an environment](#setting-up-an-environment) 11 | - [Tools](#tools) 12 | - [Get dbt profile ready](#get-dbt-profile-ready) 13 | - [Linting](#linting) 14 | - [Testing](#testing) 15 | - [Committing](#committing) 16 | - [Submitting a Pull Request](#submitting-a-pull-request) 17 | 18 | ## About this document 19 | 20 | There are many ways to contribute to the ongoing development of `dbt-data-diff`, such as by participating in discussions and issues.
21 | 22 | The rest of this document serves as a more granular guide for contributing code changes to `dbt-data-diff` (this repository). It is not intended as a guide for using `dbt-data-diff`, and some pieces assume a level of familiarity with Python development with `poetry`. Specific code snippets in this guide assume you are using macOS or Linux and are comfortable with the command line. 23 | 24 | - **Branches:** All pull requests from community contributors should target the `main` branch (default). If the change is needed as a patch for a minor version of dbt that has already been released (or is already a release candidate), a maintainer will backport the changes in your PR to the relevant "latest" release branch (`1.0.`, `1.1.`, ...). If an issue fix applies to a release branch, that fix should first be committed to the development branch and then to the release branch (rarely, release-branch fixes may not apply to `main`). 25 | - **Releases:** Before releasing a new minor version, we prepare a series of beta release candidates to allow users to test the new version in live environments. This is an important quality assurance step, as it exposes the new code to a wide variety of complicated deployments and can surface bugs before official release. Releases are accessible via [dbt Hub](https://hub.getdbt.com/infinitelambda/data_diff). 26 | 27 | ## Getting the code 28 | 29 | ### Installing git 30 | 31 | You will need `git` in order to download and modify the `dbt-data-diff` source code. On macOS, the best way to download git is to just install [Xcode](https://developer.apple.com/support/xcode/). 32 | 33 | ### External contributors 34 | 35 | You can contribute to `dbt-data-diff` by forking the `dbt-data-diff` repository. For a detailed overview of forking, check out the [GitHub docs on forking](https://help.github.com/en/articles/fork-a-repo). In short, you will need to: 36 | 37 | 1. Fork the `dbt-data-diff` repository 38 | 2. Clone your fork locally 39 | 3. Check out a new branch for your proposed changes 40 | 4. Push changes to your fork 41 | 5. Open a pull request against `infinitelambda/dbt-data-diff` from your forked repository 42 | 43 | ## Setting up an environment 44 | 45 | There are some tools that will be helpful to you in developing locally. While this is the list relevant for `dbt-data-diff` development, many of these tools are used commonly across open-source Python projects. 46 | 47 | ### Tools 48 | 49 | We use `poetry` for `dbt-data-diff` development and testing. 50 | 51 | First, install poetry via pip or via the [official installer](https://python-poetry.org/docs/#installing-with-the-official-installer); please check the version pinned in the [poetry.lock](https://github.com/infinitelambda/dbt-data-diff/blob/main/poetry.lock) file. Then, install the local environment: 52 | 53 | ```bash 54 | poetry install 55 | poetry shell 56 | poe git-hooks 57 | ``` 58 | 59 | ### Get dbt profile ready 60 | 61 | Please check [the sample script](https://github.com/infinitelambda/dbt-data-diff/blob/main/integration_tests/ci/sf-init.sql) in the `integration_tests/ci` directory to initialize the Snowflake environment and get your database freshly created. 62 | 63 | Next, follow the [dbt profile instructions](https://docs.getdbt.com/docs/core/connect-data-platform/connection-profiles) to set up your dedicated profile. Again, you can [try our sample](https://github.com/infinitelambda/dbt-data-diff/blob/main/integration_tests/ci/profiles.yml) in the same directory.
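For illustration, a minimal local profile might look like the sketch below; the account, user, and password values are placeholders to replace with your own (the repo's sample also defines a matching `green` target for the Blue/Green setup):

```yaml
data_diff:
  target: blue
  outputs:
    blue:
      type: snowflake
      account: your_account          # placeholder
      user: your_user@example.com    # placeholder
      password: your_password        # placeholder
      role: role_data_diff
      warehouse: wh_data_diff
      database: data_diff
      schema: blue_dat
      threads: 10
```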
64 | 65 | Run `poe data-diff-verify` to verify the connection ✅ 66 | 67 | ## Linting 68 | 69 | We also aim to maintain code quality by leveraging [sqlfluff](https://sqlfluff.com/). 70 | 71 | It is highly encouraged that you format the code before committing using the below `poe` helpers: 72 | 73 | ```bash 74 | poe lint # check your code, we run this check in CI 75 | poe format # format your code to match sqlfluff configs 76 | ``` 77 | 78 | ## Testing 79 | 80 | Once you're able to manually test that your code change is working as expected, it's important to run existing automated tests, as well as adding some new ones. These tests will ensure that: 81 | 82 | - Your code changes do not unexpectedly break other established functionality 83 | - Your code changes can handle all known edge cases 84 | - The functionality you're adding will _keep_ working in the future 85 | 86 | See below for details on running existing integration tests and adding new ones: 87 | 88 | **An integration test typically involves making 1) a new seed file 2) a new model file 3) a generic test to assert anticipated behaviour.** 89 | 90 | Once you've added all of these files, in the `poetry shell`, you should be able to run: 91 | 92 | ```bash 93 | poe data-diff-migration # create resources 94 | poe data-diff-bg # prepare blue/green data 95 | poe data-diff-run # trigger the data-diff 96 | poe data-diff-test # test the package and the data-diff result 97 | ``` 98 | 99 | Alternatively, you could use a single command: `poe data-diff-run` OR `poe data-diff-async-wait` 👍 100 | 101 | ## Committing 102 | 103 | Upon running `poe git-hooks`, we make sure that your commit messages are as clean and neat as possible. 104 | 105 | There are 2 main checks: 106 | 107 | - Trailing whitespace: If any, it will be fixed automatically; you just have to stage the changes before committing 108 | - Commit message: It must follow the [commitizen](https://commitizen-tools.github.io/commitizen/) convention as `{change_type}: {message}` (see the example at the end of this guide) 109 | - `change_type`: is one of `feat|fix|chore|refactor|perf|BREAKING CHANGE` 110 | 111 | ## Submitting a Pull Request 112 | 113 | Code can be merged into the current development branch `main` by opening a pull request. A `dbt-data-diff` maintainer will review your PR. They may suggest code revisions for style or clarity, or request that you add unit or integration test(s). These are good things! We believe that, with a little bit of help, anyone can contribute high-quality code. 114 | 115 | Automated tests run via GitHub Actions. If you're a first-time contributor, all tests (including code checks and unit tests) will require a maintainer's approval before they run. Changes in the `dbt-data-diff` repository trigger integration tests against Snowflake 💰. 116 | 117 | Once all tests are passing and your PR has been approved, a `dbt-data-diff` maintainer will merge your changes into the active development branch. And that's it!
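To recap the commit convention from the Committing section above, a well-formed commit could look like the following (the message itself is a hypothetical example):

```bash
git commit -m "feat: support multi-column primary keys in key diff"
```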
118 | 119 | **_Happy Developing 🎉_** 120 | -------------------------------------------------------------------------------- /docs/hooks.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import os 3 | 4 | def on_post_build(config, **kwargs): 5 | site_dir = config['site_dir'] 6 | shutil.copytree("integration_tests/target/", os.path.join(site_dir, 'dbt-docs')) 7 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | 2 | # dbt-data-diff 3 | 4 | 5 | 6 | [![dbt-hub](https://img.shields.io/badge/Visit-dbt--hub%20↗️-FF694B?logo=dbt&logoColor=FF694B)](https://hub.getdbt.com/infinitelambda/data_diff) 7 | [![support-snowflake](https://img.shields.io/badge/support-Snowflake-7faecd?logo=snowflake&logoColor=7faecd)](https://docs.snowflake.com?ref=infinitelambda) 8 | [![support-dbt](https://img.shields.io/badge/support-dbt%20v1.6+-FF694B?logo=dbt&logoColor=FF694B)](https://docs.getdbt.com?ref=infinitelambda) 9 | [![built-in-sis](https://img.shields.io/badge/built--in-SiS-BD4042?logo=streamlit&logoColor=FF694B)](https://www.snowflake.com/en/data-cloud/overview/streamlit-in-snowflake?ref=infinitelambda) 10 | 11 | Data-diff solution for dbt-ers with Snowflake ❄️ 🌟 12 | 13 | **_Who is this for?_** 14 | 15 | - Primarily for people who want to perform Data-diff validation on **[the Blue-Green deployment](https://discourse.getdbt.com/t/performing-a-blue-green-deploy-of-your-dbt-project-on-snowflake/1349)** 🚀 16 | - Other good considerations 👍 17 | - UAT validation: data-diff with PROD 18 | - Code-Refactoring validation: data diff between old vs new 19 | - Migration to Snowflake: data diff between old vs new (requires landing the old data in Snowflake) 20 | - CI: future consideration only ⚠️ 21 | 22 | ## Core Concept 🌟 23 | 24 | The `dbt-data-diff` package provides the diff results in 3 categories, or 3 levels of diff, as follows: 25 | 26 | - 🥉 **Key diff** ([models](https://github.com/infinitelambda/dbt-data-diff/tree/main/models/01_key_diff/)): Compare the Primary Key (`pk`) only 27 | - 🥈 **Schema diff** ([models](https://github.com/infinitelambda/dbt-data-diff/tree/main/models/02_schema_diff/)): Compare the list of column names and data types 28 | - 🥇 **Content diff** (aka Data diff) ([models](https://github.com/infinitelambda/dbt-data-diff/tree/main/models/03_content_diff/)): Compare all cell values. The columns will be filtered by each table's configuration (`include_columns` and `exclude_columns`), and the data can also be filtered by the `where` config. Behind the scenes, this operation does not require the Primary Key (`pk`) config; it performs bulk set operations (`INTERSECT` or `MINUS`) and aggregates the result to compute the column-level match percentage 29 | 30 | Sample diffing: 31 | Sample diffing 32 | 33 | Behind the scenes, this package leverages the ❄️ [Scripting Stored Procedure](https://docs.snowflake.com/en/developer-guide/stored-procedure/stored-procedures-snowflake-scripting), which provides the 3 procedures corresponding to the 3 categories above.
Moreover, it utilizes the [DAG of Tasks](https://docs.snowflake.com/en/user-guide/tasks-intro?utm_source=legacy&utm_medium=serp&utm_term=task+DAG#label-task-dag) to optimize speed with parallelism once enabled by configuration 🚀 34 | 35 | Sample DAG: 36 | 37 | Sample DAG 38 | 39 | ## Installation 40 | 41 | - Add to `packages.yml` file: 42 | 43 | ```yml 44 | packages: 45 | - package: infinitelambda/data_diff 46 | version: [">=1.0.0", "<1.1.0"] 47 | ``` 48 | 49 | Or use the latest version from git: 50 | 51 | ```yml 52 | packages: 53 | - git: "https://github.com/infinitelambda/dbt-data-diff" 54 | revision: 1.0.0 # 1.0.0b1 55 | ``` 56 | 57 | - (Optional) Configure database & schema in `dbt_project.yml` file: 58 | 59 | ```yml 60 | vars: 61 | # (optional) default to `target.database` if not specified 62 | data_diff__database: COMMON 63 | # (optional) default to `target.schema` if not specified 64 | data_diff__schema: DATA_DIFF 65 | ``` 66 | 67 | - Create/Migrate the `data-diff`'s DDL resources 68 | 69 | ```bash 70 | dbt deps 71 | dbt run -s data_diff --vars '{data_diff__on_migration: true}' 72 | ``` 73 | 74 | ## Quick Start 75 | 76 | ### 1. Configure the tables that need comparing in `dbt_project.yml` 77 | 78 | We're going to use the `data_diff__configured_tables` variable (Check out the [dbt_project.yml](https://github.com/infinitelambda/dbt-data-diff/tree/main/dbt_project.yml)/`vars` section for more details!) 79 | 80 | For example, we want to compare `table_x` between the **PROD** db and the **DEV** one: 81 | 82 | ```yaml 83 | vars: 84 | data_diff__configured_tables: 85 | - src_db: your_prod 86 | src_schema: your_schema 87 | src_table: table_x 88 | trg_db: your_dev 89 | trg_schema: your_schema 90 | trg_table: table_x 91 | pk: key # multiple columns separated by commas 92 | include_columns: [] # [] to include all 93 | exclude_columns: ["loaded_at"] # e.g. exclude the loaded_at field 94 | ``` 95 | 96 | ### 2. Refresh the configured tables' data 97 | 98 | You can skip this step if you have already done it. If not, run the command below: 99 | 100 | ```bash 101 | dbt run -s data_diff \ 102 | --full-refresh \ 103 | --vars '{data_diff__on_migration: true, data_diff__on_migration_data: true, data_diff__full_refresh: true}' 104 | ``` 105 | 106 | !!! note "In the above:" 107 | 108 | - `--full-refresh` and `data_diff__full_refresh`: To re-create all data-diff models 109 | - `data_diff__on_migration: true`: To re-create the stored procedures 110 | - `data_diff__on_migration_data: true`: To reset the configured data 111 | 112 | ### 3. Trigger the validation via dbt operation 113 | 114 | Now, let's start the diff run: 115 | 116 | ```bash 117 | dbt run-operation data_diff__run # normal mode, run in sequence, wait until finished 118 | # OR 119 | dbt run-operation data_diff__run_async # async mode, parallel, no waiting 120 | dbt run-operation data_diff__run_async --args '{is_polling_status: true}' 121 | # async mode, parallel, status polling 122 | ``` 123 | 124 | !!! tip "In the Async Mode" 125 | We leverage the DAG of Tasks, therefore the dbt ROLE will need to be granted the additional privilege: 126 | 127 | ```sql 128 | use role accountadmin; 129 | grant execute task on account to role {{ target.role }}; 130 | ``` 131 | 132 |
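If you triggered the async mode without polling, you can check on the DAG later via the `data_diff__poll_status_async` operation; the invocation id below is purely illustrative:

```bash
dbt run-operation data_diff__poll_status_async \
  --args '{p_invocation_id: "9642e2dd-9c10-4a10-a979-7d169b5d3731", poll_times: 10, poll_wait_in_s: 60}'
```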
133 | 📖 Or via dbt hook by default (it will run an incremental load for all models) 134 | 135 | ```yaml 136 | # Add into dbt_project.yml file 137 | 138 | # normal mode 139 | on-run-end: 140 | - > # run data-diff hook 141 | {% if var("data_diff__on_run_hook", false) %} 142 | {{ data_diff.data_diff__run(in_hook=true) }} 143 | {% endif %} 144 | 145 | # async mode 146 | on-run-end: 147 | - > # run data-diff hook 148 | {% if var("data_diff__on_run_hook", false) %} 149 | {{ data_diff.data_diff__run_async(in_hook=true) }} 150 | {% endif %} 151 | 152 | ``` 153 | 154 | ```bash 155 | # terminal 156 | dbt run -s data_diff --vars '{data_diff__on_run_hook: true}' 157 | ``` 158 | 159 |
160 | 161 | ### 4. [Bonus] Deploy the helper 🤩 162 | 163 | Our helper is the [Streamlit in Snowflake](https://www.snowflake.com/en/data-cloud/overview/streamlit-in-snowflake/) (SiS) application, built on top of the latest diff result, to help us examine the actual result without typing SQL. 164 | 165 | Let's deploy the Streamlit app by running the dbt command as follows: 166 | 167 | ```bash 168 | dbt run-operation sis_deploy__diff_helper 169 | ``` 170 | 171 |
    172 | Sample logs 173 | 174 | ```log 175 | 02:44:50 Running with dbt=1.7.4 176 | 02:44:52 Registered adapter: snowflake=1.7.1 177 | 02:44:53 Found 16 models, 2 operations, 21 tests, 0 sources, 0 exposures, 0 metrics, 558 macros, 0 groups, 0 semantic models 178 | 02:44:53 [RUN]: sis_deploy__diff_helper 179 | 02:44:53 query: 180 | 181 | create schema if not exists data_diff.blue_dat_common; 182 | create or replace stage data_diff.blue_dat_common.stage_diff_helper 183 | directory = ( enable = true ) 184 | comment = 'Named stage for diff helper SiS appilication'; 185 | 186 | PUT file://dbt_packages/data_diff/macros/sis/diff_helper.py @data_diff.blue_dat_common.stage_diff_helper overwrite=true auto_compress=false; 187 | 188 | create or replace streamlit data_diff.blue_dat_common.data_diff_helper 189 | root_location = '@data_diff.blue_dat_common.stage_diff_helper' 190 | main_file = '/diff_helper.py' 191 | query_warehouse = wh_data_diff 192 | comment = 'Streamlit app for the dbt-data-diff package'; 193 | 194 | 02:45:02 )> 195 | ``` 196 | 197 |
198 | 199 | Once it's done, you can access the app via the **Streamlit menu** / **DATA_DIFF_HELPER**, or via this quick link: 200 | 201 | ```log 202 | {BASE_SNOWFLAKE_URL}/#/streamlit-apps/{DATABASE}.{SCHEMA}.DATA_DIFF_HELPER 203 | ``` 204 | 205 |
    206 | 👉 Check out the sample app UI 207 | 208 | Sample SiS 209 | 210 |
211 | 212 | ## Demo 213 | 214 | **Part 1**: Configure and prepare Blue/Green 215 | 216 | [![Watch the video - P1](https://cdn.loom.com/sessions/thumbnails/2445f322720a4466ab9494c90e66946b-1705309091927-with-play.gif)](https://www.loom.com/share/2445f322720a4466ab9494c90e66946b?sid=9b5f354c-3611-412a-ac18-554e4b879913) 217 | 218 | **Part 2**: Run data diff & examine the result 219 | 220 | [![Watch the video - P2](https://cdn.loom.com/sessions/thumbnails/c4dc4179a4ee4a0d9583db405b46e969-1705308496485-with-play.gif)](https://www.loom.com/share/c4dc4179a4ee4a0d9583db405b46e969?sid=fc6e2dd8-c456-4888-8eaf-64883423270d) 221 | 222 | ## Variables 223 | 224 | !!! tip "See `dbt_project.yml` file" 225 | Go to `vars` section [here](https://github.com/infinitelambda/dbt-data-diff/blob/main/dbt_project.yml#L12) 🏃 226 | 227 | For now we provide inline comments only; a dedicated page with more detailed explanations is coming soon. 228 | 229 | Here is the full list of built-in variables: 230 | 231 | - `data_diff__database` 232 | - `data_diff__schema` 233 | - `data_diff__on_migration` 234 | - `data_diff__on_migration_data` 235 | - `data_diff__on_run_hook` 236 | - `data_diff__full_refresh` 237 | - `data_diff__configured_tables__source_fixed_naming` 238 | - `data_diff__configured_tables__target_fixed_naming` 239 | - `data_diff__configured_tables` 240 | - `data_diff__auto_pipe` 241 | 242 | ## How to Contribute ❤️ 243 | 244 | `dbt-data-diff` is an open-source dbt package. Whether you are a seasoned open-source contributor or a first-time committer, we welcome and encourage you to contribute code, documentation, ideas, or problem statements to this project. 245 | 246 | 👉 See [CONTRIBUTING guideline](https://data-diff.iflambda.com/latest/nav/dev/contributing.html) for more details or check out [CONTRIBUTING.md](https://github.com/infinitelambda/dbt-data-diff/tree/main/CONTRIBUTING.md) 247 | 248 | 🌟 And then, kudos to **our beloved Contributors**: 249 | 250 | 251 | Contributors 252 | 253 | 254 | ⭐ Special Credits to [👱 Attila Berecz](https://www.linkedin.com/in/attila-berecz-a0bb5ba2/) who is the OG Contributor of the Core Concept and all the Snowflake Stored Procedures 255 | 256 | ## Features comparison to the alternative packages 257 | 258 | | Feature | Supported Package | Notes | 259 | |:----------------------|:-----------------------------------------------------------|:-----------------| 260 | | Key diff |
• `dbt_data_diff` <br> • [`data_diff`](https://github.com/datafold/data_diff) <br> • [`dbt_audit_helper`](https://github.com/dbt-labs/dbt_audit_helper) | ✅ all available |
261 | | Schema diff | • `dbt_data_diff` <br> • [`data_diff`(*)](https://github.com/datafold/data_diff) <br> • [`dbt_audit_helper`](https://github.com/dbt-labs/dbt_audit_helper) | (*): Only available in the paid version 💰 |
262 | | Content diff | • `dbt_data_diff` <br> • [`data_diff`(*)](https://github.com/datafold/data_diff) <br> • [`dbt_audit_helper`](https://github.com/dbt-labs/dbt_audit_helper) | (*): Only available in the paid version 💰 |
263 | | Yaml Configuration | • `dbt_data_diff` | `data_diff` uses a `toml` file; `dbt_audit_helper` requires creating new models for each comparison |
264 | | Query & Execution log | • `dbt_data_diff` | Besides dbt's log, this package is very transparent about which diff queries were executed; they are exposed in the [`log_for_validation`](https://github.com/infinitelambda/dbt-data-diff/tree/main/models/log_for_validation.yml) model |
265 | | Snowflake-native Stored Proc | • `dbt_data_diff` | Purely built as Snowflake SQL native stored procedures |
266 | | Parallelism | • `dbt_data_diff` <br> • [`data_diff`](https://github.com/datafold/data_diff) <br> • [`dbt_audit_helper`](https://github.com/dbt-labs/dbt_audit_helper) | `dbt_data_diff` leverages Snowflake's Task DAG; the others use Python threading |
267 | | Asynchronous | • `dbt_data_diff` | Trigger the run & go away. Optionally, keep polling the run status and wait until it finishes |
268 | | Multi-warehouse supported | • `dbt_data_diff`(*) <br> • [`data_diff`](https://github.com/datafold/data_diff) <br> • [`dbt_audit_helper`](https://github.com/dbt-labs/dbt_audit_helper)
    | (*): Future Consideration 🏃 | 269 | 270 | ## About Infinite Lambda 271 | 272 | Infinite Lambda is a cloud and data consultancy. We build strategies, help organizations implement them, and pass on the expertise to look after the infrastructure. 273 | 274 | We are an Elite Snowflake Partner, a Platinum dbt Partner, and a two-time Fivetran Innovation Partner of the Year for EMEA. 275 | 276 | Naturally, we love exploring innovative solutions and sharing knowledge, so go ahead and: 277 | 278 | 🔧 Take a look around our [Git](https://github.com/infinitelambda) 279 | 280 | ✏️ Browse our [tech blog](https://infinitelambda.com/category/tech-blog/) 281 | 282 | We are also chatty, so: 283 | 284 | 👀 Follow us on [LinkedIn](https://www.linkedin.com/company/infinite-lambda/) 285 | 286 | 👋🏼 Or just [get in touch](https://infinitelambda.com/contacts/) 287 | 288 | [About IL](https://infinitelambda.com/) 289 | -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block outdated %} 4 | You're not viewing the latest version. 5 | 6 | Click here to go to latest. 7 | 8 | {% endblock %} 9 | 10 | {% block extrahead %} 11 | 12 | {% endblock %} 13 | -------------------------------------------------------------------------------- /integration_tests/ci/ci.profiles.yml: -------------------------------------------------------------------------------- 1 | config: 2 | send_anonymous_usage_stats: False 3 | use_colors: True 4 | 5 | data_diff: 6 | target: blue 7 | outputs: 8 | blue: 9 | type: snowflake 10 | account: "{{ env_var('DBT_SNOWFLAKE_TEST_ACCOUNT') }}" 11 | user: "{{ env_var('DBT_SNOWFLAKE_TEST_USER') }}" 12 | password: "{{ env_var('DBT_ENV_SECRET_SNOWFLAKE_TEST_PASSWORD') }}" 13 | role: "{{ env_var('DBT_SNOWFLAKE_TEST_ROLE') }}" 14 | database: "{{ env_var('DBT_SNOWFLAKE_TEST_DATABASE') }}" 15 | warehouse: "{{ env_var('DBT_SNOWFLAKE_TEST_WAREHOUSE') }}" 16 | schema: "{{ env_var('DBT_SCHEMA', 'dbt_data_diff_it') }}" 17 | threads: 10 18 | green: 19 | type: snowflake 20 | account: "{{ env_var('DBT_SNOWFLAKE_TEST_ACCOUNT') }}" 21 | user: "{{ env_var('DBT_SNOWFLAKE_TEST_USER') }}" 22 | password: "{{ env_var('DBT_ENV_SECRET_SNOWFLAKE_TEST_PASSWORD') }}" 23 | role: "{{ env_var('DBT_SNOWFLAKE_TEST_ROLE') }}" 24 | database: "{{ env_var('DBT_SNOWFLAKE_TEST_DATABASE') }}" 25 | warehouse: "{{ env_var('DBT_SNOWFLAKE_TEST_WAREHOUSE') }}" 26 | schema: "{{ env_var('DBT_SCHEMA', 'dbt_data_diff_it') }}" 27 | threads: 10 28 | -------------------------------------------------------------------------------- /integration_tests/ci/profiles.yml: -------------------------------------------------------------------------------- 1 | data_diff: 2 | outputs: 3 | blue: 4 | type: snowflake 5 | account: xxx 6 | warehouse: wh_data_diff 7 | database: data_diff 8 | role: role_data_diff 9 | schema: blue_dat 10 | threads: 10 11 | user: xxx@infinitelambda.com 12 | password: xxx 13 | # authenticator: externalbrowser 14 | green: 15 | type: snowflake 16 | account: xxx 17 | warehouse: wh_data_diff 18 | database: data_diff 19 | role: role_data_diff 20 | schema: green_dat 21 | threads: 10 22 | user: xxx@infinitelambda.com 23 | password: xxx 24 | # authenticator: externalbrowser 25 | target: blue 26 | -------------------------------------------------------------------------------- /integration_tests/ci/sf-init.sql: 
-------------------------------------------------------------------------------- 1 | use role sysadmin; 2 | use warehouse wh_compute; 3 | create or replace database data_diff with comment = 'Database for data_diff'; 4 | 5 | use role accountadmin; 6 | create or replace resource monitor rm_data_diff with 7 | credit_quota = 1 8 | frequency = daily 9 | start_timestamp = immediately 10 | notify_users = ("") 11 | triggers 12 | on 100 percent do suspend_immediate 13 | ; 14 | 15 | create or replace warehouse wh_data_diff with 16 | warehouse_type = 'standard' 17 | warehouse_size = 'xsmall' 18 | auto_suspend = 60 19 | auto_resume = true 20 | initially_suspended = true 21 | resource_monitor = rm_data_diff 22 | comment = 'Warehouse for data_diff'; 23 | 24 | use role securityadmin; 25 | create or replace role role_data_diff with comment = 'Role for data_diff'; 26 | 27 | grant usage on warehouse wh_data_diff to role role_data_diff; 28 | grant usage on database data_diff to role role_data_diff; 29 | grant all privileges on database data_diff to role role_data_diff; 30 | grant all privileges on all schemas in database data_diff to role role_data_diff; 31 | grant all privileges on future schemas in database data_diff to role role_data_diff; 32 | grant all privileges on all tables in database data_diff to role role_data_diff; 33 | grant all privileges on future tables in database data_diff to role role_data_diff; 34 | grant all privileges on all views in database data_diff to role role_data_diff; 35 | grant all privileges on future views in database data_diff to role role_data_diff; 36 | grant usage, create schema on database data_diff to role role_data_diff; 37 | grant role role_data_diff to role sysadmin; 38 | 39 | use role role_data_diff; 40 | use database data_diff; 41 | -------------------------------------------------------------------------------- /integration_tests/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'data_diff_test' 2 | version: '0.0.0' 3 | config-version: 2 4 | 5 | profile: 'data_diff' 6 | 7 | clean-targets: 8 | - "target" 9 | - "dbt_packages" 10 | - "logs" 11 | 12 | on-run-start: 13 | - > # re-create common schema 14 | {% if var("fresh", false) %} 15 | create or replace schema {{ data_diff.get_namespace() }}; 16 | {% endif %} 17 | 18 | vars: 19 | # data_diff__database: data_diff 20 | data_diff__schema: datadiff 21 | # For normal (non-async) mode 22 | data_diff__configured_tables: 23 | - # src_db: data_diff 24 | src_schema: dbt_blue 25 | src_table: my_first_dbt_model 26 | # trg_db: data_diff 27 | trg_schema: dbt_green 28 | trg_table: my_first_dbt_model 29 | pk: id # id1,id2 30 | include_columns: [] 31 | exclude_columns: ["loaded_at"] 32 | # pipe_name: awesome_thread 33 | - src_schema: dbt_blue 34 | src_table: my_second_dbt_model 35 | trg_schema: dbt_green 36 | trg_table: my_second_dbt_model 37 | pk: id 38 | include_columns: [] 39 | exclude_columns: ["loaded_at"] 40 | # data_diff__auto_pipe: true 41 | -------------------------------------------------------------------------------- /integration_tests/models/example/my_first_dbt_model.sql: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | Welcome to your first dbt model! 4 | Did you know that you can also configure models directly within SQL files? 
5 | This will override configurations stated in dbt_project.yml 6 | 7 | Try changing "table" to "view" below 8 | */ 9 | 10 | {{ config(materialized='table') }} 11 | 12 | with source_data as ( 13 | 14 | select 1 as id, 'id 1' as id_description, {% if target.name.lower() == 'blue' %} '100' {%else %} 100 {% endif %} as amount 15 | union all 16 | select null as id, 'null' as id_description, {% if target.name.lower() == 'blue' %} '100' {%else %} 100 {% endif %} as amount 17 | union all 18 | select 2 as id, 'id 2' as id_description, {% if target.name.lower() == 'blue' %} '100' {%else %} 100 {% endif %} as amount 19 | union all 20 | select 3 as id, {% if target.name.lower() == 'blue' %}'id 3 blue'{% else %}'id 3 green'{% endif %}as id_description, {% if target.name.lower() == 'blue' %} '100' {%else %} 100 {% endif %} as amount 21 | {% if target.name.lower() == 'blue' %} 22 | union all 23 | select 4 as id, 'id 4' as id_description, {% if target.name.lower() == 'blue' %} '100' {%else %} 100 {% endif %} as amount 24 | {% endif %} 25 | 26 | ) 27 | 28 | select * 29 | from source_data 30 | 31 | /* 32 | Uncomment the line below to remove records with null `id` values 33 | */ 34 | 35 | -- where id is not null 36 | -------------------------------------------------------------------------------- /integration_tests/models/example/my_second_dbt_model.sql: -------------------------------------------------------------------------------- 1 | 2 | -- Use the `ref` function to select from other models 3 | 4 | select id, id_description 5 | from {{ ref('my_first_dbt_model') }} 6 | where true 7 | {# and id is not null #} 8 | and coalesce(id, 1) = 1 9 | -------------------------------------------------------------------------------- /integration_tests/models/unit/unit.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: verify_get_namespace 3 | tests: 4 | - dbt_utils.expression_is_true: 5 | expression: actual = expected 6 | -------------------------------------------------------------------------------- /integration_tests/models/unit/verify_configured_tables.sql: -------------------------------------------------------------------------------- 1 | select * from {{ ref('configured_tables') }} 2 | -------------------------------------------------------------------------------- /integration_tests/models/unit/verify_configured_tables.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: verify_configured_tables 3 | tests: 4 | - dbt_utils.unique_combination_of_columns: 5 | combination_of_columns: 6 | - src_db 7 | - src_schema 8 | - src_table 9 | - trg_db 10 | - trg_schema 11 | - trg_table 12 | columns: 13 | - name: src_db 14 | tests: 15 | - not_null 16 | - name: src_schema 17 | tests: 18 | - not_null 19 | - name: src_table 20 | tests: 21 | - not_null 22 | - name: trg_db 23 | tests: 24 | - not_null 25 | - name: trg_schema 26 | tests: 27 | - not_null 28 | - name: trg_table 29 | tests: 30 | - not_null 31 | - name: pk 32 | tests: 33 | - not_null 34 | - name: where_condition 35 | tests: 36 | - not_null 37 | - name: is_enabled 38 | tests: 39 | - not_null 40 | - accepted_values: 41 | values: [TRUE, FALSE] 42 | -------------------------------------------------------------------------------- /integration_tests/models/unit/verify_get_namespace.sql: -------------------------------------------------------------------------------- 1 | {% set expected_namespace -%} 2 | {{ generate_database_name(var("data_diff__database", 
target.database)) }}.{{ generate_schema_name(var("data_diff__schema", target.schema)) }} 3 | {%- endset -%} 4 | 5 | select '' as test_case, '{{ data_diff.get_namespace() }}' as actual, '{{ expected_namespace }}' as expected 6 | -------------------------------------------------------------------------------- /integration_tests/models/unit/verify_log_for_validation.sql: -------------------------------------------------------------------------------- 1 | select * from {{ ref('log_for_validation') }} 2 | -------------------------------------------------------------------------------- /integration_tests/models/unit/verify_log_for_validation.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: verify_log_for_validation 3 | columns: 4 | - name: start_time 5 | tests: 6 | - not_null 7 | - name: end_time 8 | description: End time of the execution 9 | tests: 10 | - not_null: 11 | where: diff_type not ilike 'DAG of Task%' 12 | - name: sql_statement 13 | tests: 14 | - not_null 15 | - name: diff_start_time 16 | tests: 17 | - not_null 18 | - name: diff_type 19 | tests: 20 | - not_null 21 | -------------------------------------------------------------------------------- /integration_tests/models/verify_run/verify_configured_tables_data.sql: -------------------------------------------------------------------------------- 1 | {% set configured_rows = var("data_diff__configured_tables", []) | length %} 2 | 3 | select count(*) as actual, {{ configured_rows }} as expected 4 | from {{ ref('configured_tables') }} 5 | -------------------------------------------------------------------------------- /integration_tests/models/verify_run/verify_created_sprocs.sql: -------------------------------------------------------------------------------- 1 | {% set ns = data_diff.get_namespace() %} 2 | 3 | {{ 4 | config( 5 | materialized='table', 6 | sql_header='show procedures in schema ' ~ ns ~ ';' 7 | ) 8 | }} 9 | 10 | {% set sproc_names = [ 11 | "check_data_diff", 12 | "check_key", 13 | "check_schema", 14 | ] %} 15 | 16 | with show_data as ( 17 | 18 | select * 19 | from table(result_scan(last_query_id())) 20 | where "description" = 'user-defined procedure' 21 | 22 | ) 23 | {% for item in sproc_names %} 24 | 25 | select upper(concat("catalog_name",'.',"schema_name",'.',"name")) as actual 26 | ,upper('{{ ns }}.{{ item }}') as expected 27 | from show_data 28 | where "name" ilike '{{ item }}' 29 | {% if not loop.last %} union all {% endif %} 30 | 31 | {% endfor %} 32 | -------------------------------------------------------------------------------- /integration_tests/models/verify_run/verify_created_tasks.sql: -------------------------------------------------------------------------------- 1 | {% set ns = data_diff.get_namespace() %} 2 | 3 | {{ 4 | config( 5 | materialized='table', 6 | sql_header='show tasks in schema ' ~ ns ~ ';' 7 | ) 8 | }} 9 | 10 | {% set dag_log_entries = dbt_utils.get_column_values( 11 | table=ref('log_for_validation'), 12 | column="upper(trim(split(diff_type, ':')[1]))", 13 | where="diff_type ilike 'DAG%' and value is not null" 14 | ) or [] 15 | %} 16 | 17 | with show_data as ( 18 | 19 | select * 20 | from table(result_scan(last_query_id())) 21 | 22 | ) 23 | 24 | {% if dag_log_entries | length == 0 %} 25 | 26 | select 'Normal run (non-async)' as test_case, count(*) as actual, 0 as expected 27 | from show_data 28 | 29 | {% else %} 30 | --Assuming data_diff__auto_pipe = true 31 | {% set configured_rows = var("data_diff__configured_tables", []) | 
length %} 32 | select 'key tasks' as test_case, count(*) as actual, {{ configured_rows }} as expected 33 | from show_data 34 | where "name" ilike '%check_key_%{{ dag_log_entries[0] }}%' 35 | 36 | union all 37 | 38 | select 'schema tasks' as test_case, count(*) as actual, {{ configured_rows }} as expected 39 | from show_data 40 | where "name" ilike '%check_schema_%{{ dag_log_entries[0] }}%' 41 | 42 | union all 43 | 44 | select 'data-diff tasks' as test_case, count(*) as actual, {{ configured_rows }} as expected 45 | from show_data 46 | where "name" ilike '%check_data_diff_%{{ dag_log_entries[0] }}%' 47 | 48 | union all 49 | 50 | select 'root task' as test_case, count(*) as actual, 1 as expected 51 | from show_data 52 | where "name" ilike '%task_root%{{ dag_log_entries[0] }}%' 53 | 54 | union all 55 | 56 | select 'end task' as test_case, count(*) as actual, 1 as expected 57 | from show_data 58 | where "name" ilike '%task_end%{{ dag_log_entries[0] }}%' 59 | 60 | {% endif %} 61 | -------------------------------------------------------------------------------- /integration_tests/models/verify_run/verify_log_entry.sql: -------------------------------------------------------------------------------- 1 | {% set configured_rows = var("data_diff__configured_tables", []) | length %} 2 | 3 | select 'key' as diff, count(*) as actual, {{ configured_rows }} as expected 4 | from {{ ref('log_for_validation') }} 5 | where diff_type = 'key' 6 | 7 | union all 8 | 9 | select 'schema' as diff, count(*) as actual, {{ configured_rows }} as expected 10 | from {{ ref('log_for_validation') }} 11 | where diff_type = 'schema' 12 | 13 | union all 14 | 15 | select 'data' as diff, count(*) as actual, {{ configured_rows * 2 }} as expected 16 | from {{ ref('log_for_validation') }} 17 | where diff_type = 'data-diff' 18 | -------------------------------------------------------------------------------- /integration_tests/models/verify_run/verify_run.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: verify_configured_tables_data 3 | tests: 4 | - dbt_utils.expression_is_true: 5 | expression: actual = expected 6 | - name: verify_created_sprocs 7 | tests: 8 | - dbt_utils.expression_is_true: 9 | expression: actual = expected 10 | - name: verify_created_tasks 11 | tests: 12 | - dbt_utils.expression_is_true: 13 | expression: actual = expected 14 | - name: verify_log_entry 15 | tests: 16 | - dbt_utils.expression_is_true: 17 | expression: actual = expected 18 | -------------------------------------------------------------------------------- /integration_tests/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - local: ../ 3 | - package: dbt-labs/dbt_utils 4 | version: 1.1.1 5 | sha1_hash: de2deba3d66ce03d8c02949013650cc9b94f6030 6 | -------------------------------------------------------------------------------- /integration_tests/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - local: ../ 3 | -------------------------------------------------------------------------------- /macros/data_diff.yml: -------------------------------------------------------------------------------- 1 | macros: 2 | - name: data_diff__run 3 | description: | 4 | Execute (or Produce the script of) the Data Diff's stored procedures. 
5 | arguments: 6 | - name: in_hook 7 | type: boolean [default=false] 8 | description: True to generate the SQL script only, False to also execute the SQL right after generating it 9 | - name: is_cleanup 10 | type: boolean [default=false] 11 | description: True to perform an additional SQL script for cleaning up (log only) by calling `data_diff__cleanup` 12 | 13 | - name: data_diff__run_async 14 | description: | 15 | Execute (or Produce the script of) the Data Diff's stored procedures, wrapped into a DAG of Tasks. 16 | arguments: 17 | - name: is_polling_status [default=false] 18 | type: boolean 19 | description: True to keep polling the task status until it finishes or times out. Note, it is not supported when running in a hook 20 | - name: in_hook [default=false] 21 | type: boolean 22 | description: True to generate the SQL script only, False to also execute the SQL right after generating it 23 | - name: is_cleanup 24 | type: boolean [default=false] 25 | description: True to perform an additional SQL script for cleaning up (log & task dag) by calling `data_diff__cleanup` 26 | 27 | - name: data_diff__poll_status_async 28 | description: | 29 | Execute the status polling of the executed DAG of Tasks based on the corresponding dbt invocation id 30 | 31 | Usage: 32 | ```bash 33 | dbt run-operation \ 34 | data_diff__poll_status_async \ 35 | --args '{p_invocation_id:"9642e2dd-9c10-4a10-a979-7d169b5d3731", poll_times:10, poll_wait_in_s:60}' 36 | ``` 37 | arguments: 38 | - name: p_invocation_id 39 | type: string 40 | description: The corresponding dbt invocation_id 41 | - name: poll_times [default=100] 42 | type: integer 43 | description: Number of times to execute the polling script 44 | - name: poll_wait_in_s [default=10] 45 | type: integer 46 | description: Number of seconds to wait between each poll 47 | 48 | - name: data_diff__cleanup 49 | description: TODO clean up diff tables 50 | -------------------------------------------------------------------------------- /macros/data_diff__cleanup.sql: -------------------------------------------------------------------------------- 1 | {% macro data_diff__cleanup(in_hook=false, p_invocation_id=none) -%} 2 | {{ return(adapter.dispatch('data_diff__cleanup')(in_hook=in_hook, p_invocation_id=p_invocation_id)) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__data_diff__cleanup(in_hook=false, p_invocation_id=none) -%} 6 | 7 | {% set namespace = data_diff.get_namespace() %} 8 | 9 | {% set query -%} 10 | 11 | TODO: clean up log table - keep today data 12 | TODO: clean up DAG tasks - keep invocation_id passed in & delete others, none for doing nothing 13 | 14 | {%- endset %} 15 | 16 | {% if in_hook %} 17 | {{ log("[SCRIPT]: data_diff__cleanup", info=True) if execute }} 18 | {{ return(query) }} 19 | {% else %} 20 | {{ log("[RUN]: data_diff__cleanup", info=True) }} 21 | {% set results = run_query(query) %} 22 | {{ log(results.rows, info=True) }} 23 | {% endif %} 24 | 25 | {%- endmacro %} 26 | -------------------------------------------------------------------------------- /macros/data_diff__poll_status_async.sql: -------------------------------------------------------------------------------- 1 | {% macro data_diff__poll_status_async(p_invocation_id, poll_times=100, poll_wait_in_s=10) -%} 2 | {{ return( 3 | adapter.dispatch('data_diff__poll_status_async')( 4 | p_invocation_id=p_invocation_id, 5 | poll_times=poll_times, 6 | poll_wait_in_s=poll_wait_in_s 7 | ) 8 | ) }} 9 | {%- endmacro %} 10 | 11 | {% macro default__data_diff__poll_status_async(p_invocation_id, poll_times=100, poll_wait_in_s=10) -%}
12 | 13 | {% set namespace = data_diff.get_namespace() %} 14 | {% set dbt_invocation_id = p_invocation_id | replace("-", "_") %} 15 | {% set end_task = "data_diff__task_end_" ~ dbt_invocation_id %} 16 | 17 | {% set query -%} 18 | 19 | use schema {{ namespace }}; 20 | 21 | call system$wait({{ poll_wait_in_s }}, 'SECONDS'); 22 | 23 | select state -- poll until SUCCEEDED 24 | from table(information_schema.task_history( 25 | task_name => '{{ end_task | upper }}' 26 | )) 27 | order by scheduled_time desc 28 | limit 1; 29 | 30 | {%- endset %} 31 | 32 | {% for item in range(0, poll_times) %} 33 | 34 | {% set query_state = dbt_utils.get_single_value(query, default="") %} 35 | {{ log("[RUN] Polling #" ~ item ~ ": " ~ (query_state or 'SCHEDULED'), info=True) }} 36 | 37 | {% if query_state == "SUCCEEDED" %} 38 | {{ return(none) }} 39 | {% endif %} 40 | 41 | {% endfor %} 42 | 43 | 44 | {%- endmacro %} 45 | -------------------------------------------------------------------------------- /macros/data_diff__run.sql: -------------------------------------------------------------------------------- 1 | {% macro data_diff__run(in_hook=false, is_cleanup=false) -%} 2 | {{ return(adapter.dispatch('data_diff__run')(in_hook=in_hook, is_cleanup=is_cleanup)) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__data_diff__run(in_hook=false, is_cleanup=false) -%} 6 | 7 | {% set namespace = data_diff.get_namespace() %} 8 | 9 | {% set query -%} 10 | 11 | call {{ namespace }}.check_key('', '{{ invocation_id }}'); 12 | call {{ namespace }}.check_schema('', '{{ invocation_id }}'); 13 | call {{ namespace }}.check_data_diff('', '{{ invocation_id }}'); 14 | 15 | {% if is_cleanup -%} 16 | {{ data_diff.data_diff__cleanup(in_hook=true) }} 17 | {%- endif %} 18 | 19 | {%- endset %} 20 | 21 | {% if in_hook %} 22 | {{ log("[SCRIPT]: data_diff__run", info=True) if execute }} 23 | {{ return(query) }} 24 | {% else %} 25 | {{ log("[RUN]: data_diff__run", info=True) }} 26 | {% set results = run_query(query) %} 27 | {{ log("Completed", info=True) }} 28 | {% endif %} 29 | 30 | {%- endmacro %} 31 | -------------------------------------------------------------------------------- /macros/data_diff__run_async.sql: -------------------------------------------------------------------------------- 1 | {% macro data_diff__run_async(is_polling_status=false, in_hook=false, is_cleanup=false) -%} 2 | {{ return( 3 | adapter.dispatch('data_diff__run_async')( 4 | is_polling_status=is_polling_status, 5 | in_hook=in_hook, 6 | is_cleanup=is_cleanup 7 | ) 8 | ) }} 9 | {%- endmacro %} 10 | 11 | {% macro default__data_diff__run_async(is_polling_status=false, in_hook=false, is_cleanup=false) -%} 12 | 13 | {% set namespace = data_diff.get_namespace() %} 14 | {% set dbt_invocation_id = invocation_id | replace("-", "_") %} 15 | 16 | {% set root_task = "data_diff__task_root_" ~ dbt_invocation_id %} 17 | {% set end_task = "data_diff__task_end_" ~ dbt_invocation_id %} 18 | {% set prefix_batch_task__check_key = "data_diff__task__check_key_batch_" ~ dbt_invocation_id ~ "_" %} 19 | {% set prefix_batch_task__check_schema = "data_diff__task__check_schema_batch_" ~ dbt_invocation_id ~ "_" %} 20 | {% set prefix_batch_task__check_data_diff = "data_diff__task__check_data_diff_batch_" ~ dbt_invocation_id ~ "_" %} 21 | 22 | {% set batches = dbt_utils.get_column_values(table=ref('configured_tables'), column='pipe_name') or [] %} 23 | {% if batches | length == 0 %} 24 | {{ log("No configured entity found!", info=True) if execute }} 25 | {{ return("") }} 26 | {% endif %} 27 | 28 | {% 
set log_model_fqn -%} {{ ref("log_for_validation") }} {%- endset %} 29 | 30 | {% set utcnow = modules.datetime.datetime.utcnow() %} 31 | {% set diff_run_id = "DATA-DIFF-RUN-" ~ invocation_id ~ "-" ~ utcnow.strftime("%Y%m%d-%H%M%S") %} 32 | {% set query -%} 33 | --1. Build the DAG 34 | --root task 35 | create or replace task {{ namespace }}.{{ root_task }} 36 | warehouse = {{ target.warehouse }} 37 | as 38 | insert into {{ log_model_fqn }} (start_time, end_time, sql_statement, diff_start_time, diff_type, diff_run_id) 39 | values (sysdate(), null, 'execute task {{ namespace }}.{{ root_task }}', '{{ utcnow }}', 'DAG of Task: {{ dbt_invocation_id }}', '{{ diff_run_id }}'); 40 | --end task 41 | create or replace task {{ namespace }}.{{ end_task }} 42 | warehouse = {{ target.warehouse }} 43 | as 44 | insert into {{ log_model_fqn }} (start_time, end_time, sql_statement, diff_start_time, diff_type, diff_run_id) 45 | values (sysdate(), null, 'execute task {{ namespace }}.{{ end_task }}', '{{ utcnow }}', 'DAG of Task: {{ dbt_invocation_id }}', '{{ diff_run_id }}'); 46 | 47 | {% for batch_id in batches %} 48 | 49 | --key task(s) 50 | create or replace task {{ namespace }}.{{ prefix_batch_task__check_key }}{{ batch_id }} 51 | warehouse = {{ target.warehouse }} 52 | after {{ namespace }}.{{ root_task }} 53 | as 54 | call {{ namespace }}.check_key('{{ batch_id }}', '{{ diff_run_id }}'); 55 | alter task {{ namespace }}.{{ prefix_batch_task__check_key }}{{ batch_id }} resume; 56 | 57 | --schema task(s): run after key check 58 | create or replace task {{ namespace }}.{{ prefix_batch_task__check_schema }}{{ batch_id }} 59 | warehouse = {{ target.warehouse }} 60 | after {{ namespace }}.{{ prefix_batch_task__check_key }}{{ batch_id }} 61 | as 62 | call {{ namespace }}.check_schema('{{ batch_id }}', '{{ diff_run_id }}'); 63 | alter task {{ namespace }}.{{ prefix_batch_task__check_schema }}{{ batch_id }} resume; 64 | 65 | --data diff task(s): run after schema task & depends on its result 66 | create or replace task {{ namespace }}.{{ prefix_batch_task__check_data_diff }}{{ batch_id }} 67 | warehouse = {{ target.warehouse }} 68 | after {{ namespace }}.{{ prefix_batch_task__check_schema }}{{ batch_id }} 69 | as 70 | call {{ namespace }}.check_data_diff('{{ batch_id }}', '{{ diff_run_id }}'); 71 | alter task {{ namespace }}.{{ prefix_batch_task__check_data_diff }}{{ batch_id }} resume; 72 | 73 | --end task 74 | alter task {{ namespace }}.{{ end_task }} add after {{ namespace }}.{{ prefix_batch_task__check_data_diff }}{{ batch_id }}; 75 | 76 | {%- endfor %} 77 | alter task {{ namespace }}.{{ end_task }} resume; 78 | 79 | --2. 
Execute root task 80 | execute task {{ namespace }}.{{ root_task }}; 81 | 82 | --Clean up 83 | {% if is_cleanup -%} 84 | {{ log('is_cleanup: ' ~ is_cleanup, info=True) }} 85 | {{ data_diff.data_diff__cleanup(in_hook=true, p_invocation_id=dbt_invocation_id) }} 86 | {%- endif %} 87 | 88 | {%- endset %} 89 | 90 | {% if in_hook %} 91 | {{ log("[SCRIPT]: data_diff__run_async", info=True) if execute }} 92 | {{ return(query) }} {# polling status doesn't support in hook #} 93 | {% else %} 94 | {{ log("[RUN]: data_diff__run_async", info=True) }} 95 | {% set results = run_query(query) %} 96 | {{ log(results.rows, info=True) }} 97 | 98 | {% if is_polling_status -%} 99 | {{ data_diff.data_diff__poll_status_async(p_invocation_id=dbt_invocation_id) }} 100 | {%- endif %} 101 | {% endif %} 102 | 103 | {{ log( 104 | ( 105 | "👉 Visit the root task at: " 106 | "https://{SF_BASE_URL}/#/data/" 107 | "databases/" ~ (generate_database_name(var("data_diff__database", target.database)) | upper) ~ "/" 108 | "schemas/" ~ (generate_schema_name(var("data_diff__schema", target.schema)) | upper) ~ "/" 109 | "task/" ~ (root_task | upper) ~ "/" 110 | "graph" 111 | " to monitor the DAG execution..." 112 | ), 113 | info=True 114 | ) 115 | }} 116 | {% if not is_polling_status or in_hook -%} 117 | {{ log("💡 Poll status of " ~ (end_task | upper) ~ " to know if the DAG finished", info=True) }} 118 | {%- endif %} 119 | 120 | {%- endmacro %} 121 | -------------------------------------------------------------------------------- /macros/resources/create_resources.sql: -------------------------------------------------------------------------------- 1 | {% macro create_resources() -%} 2 | {{ return(adapter.dispatch('create_resources')()) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__create_resources() -%} 6 | 7 | {{ log("[SCRIPT]: create_resources", info=True) if execute }} 8 | {{ data_diff.create__check_key() }} 9 | {{ data_diff.create__check_schema() }} 10 | {{ data_diff.create__check_data_diff() }} 11 | 12 | {%- endmacro %} 13 | -------------------------------------------------------------------------------- /macros/resources/refresh_resource_data.sql: -------------------------------------------------------------------------------- 1 | {% macro refresh_resource_data() %} 2 | 3 | {% set configured_tables = var("data_diff__configured_tables", []) %} 4 | {% set source_fixed_naming = var("data_diff__configured_tables__source_fixed_naming", true) %} 5 | {% set target_fixed_naming = var("data_diff__configured_tables__target_fixed_naming", true) %} 6 | {% set configured_table_model -%} {{ ref("configured_tables") }} {%- endset %} 7 | 8 | {% set query -%} 9 | 10 | truncate table {{ configured_table_model }}; 11 | insert into {{ configured_table_model }} 12 | (src_db,src_schema,src_table,trg_db,trg_schema,trg_table,pk,include_columns,exclude_columns,where_condition,is_enabled,pipe_name) 13 | 14 | {% for item in configured_tables -%} 15 | 16 | select 17 | 18 | {% if source_fixed_naming -%} 19 | '{{ item.get("src_db", target.database) }}' as src_db 20 | ,'{{ item.get("src_schema", target.schema) }}' as src_schema 21 | {%- else -%} 22 | '{{ generate_database_name(item.get("src_db")) }}' as src_db 23 | ,'{{ generate_schema_name(item.get("src_schema")) }}' as src_schema 24 | {%- endif -%} 25 | ,'{{ item.get("src_table") }}' as src_table 26 | 27 | {% if target_fixed_naming -%} 28 | ,'{{ item.get("trg_db", target.database) }}' as trg_db 29 | ,'{{ item.get("trg_schema", target.schema) }}' as trg_schema 30 | {%- else -%} 31 | ,'{{ 
generate_database_name(item.get("trg_db")) }}' as trg_db 32 | ,'{{ generate_schema_name(item.get("trg_schema")) }}' as trg_schema 33 | {%- endif -%} 34 | ,'{{ item.get("trg_table", item.get("src_table")) }}' as trg_table 35 | 36 | ,'{{ item.get("pk") }}' as pk 37 | ,{{ item.get("include_columns", []) | upper }} as include_columns 38 | ,{{ item.get("exclude_columns", []) | upper }} as exclude_columns 39 | ,'{{ data_diff.escape_single_quote_value(item.get("where", "1=1")) }}' as where_condition 40 | ,True as is_enabled 41 | 42 | {% if var("data_diff__auto_pipe", false) -%} 43 | ,coalesce( 44 | nullif('{{ item.get("pipe_name", "") }}', ''), 45 | concat(src_db,'_',src_schema,'_',src_table,'__',trg_db,'_',trg_schema,'_',trg_table) 46 | ) as pipe_name 47 | {%- else -%} 48 | ,'{{ item.get("pipe_name", "") }}' as pipe_name 49 | {%- endif %} 50 | 51 | {% if not loop.last -%} 52 | union all 53 | {% endif %} 54 | 55 | {%- endfor %}; 56 | 57 | {%- endset %} 58 | 59 | {{ log("[SCRIPT]: refresh_resource_data", info=True) if execute }} 60 | {{ return(query) }} 61 | 62 | {% endmacro %} 63 | -------------------------------------------------------------------------------- /macros/resources/resources.yml: -------------------------------------------------------------------------------- 1 | macros: 2 | - name: create_resources 3 | description: Produce SQL script of this package's resources 4 | - name: refresh_resource_data 5 | description: Reset data of `configured_tables` model based on the `data_diff__configured_tables` variable 6 | -------------------------------------------------------------------------------- /macros/resources/stored-procedures/create__check_data_diff.sql: -------------------------------------------------------------------------------- 1 | {% macro create__check_data_diff() -%} 2 | {{ return(adapter.dispatch('create__check_data_diff')()) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__create__check_data_diff() -%} 6 | 7 | {% set configured_table_model -%} {{ ref("configured_tables").identifier }} {%- endset %} 8 | {% set log_model -%} {{ ref("log_for_validation").identifier }} {%- endset %} 9 | {% set result_schema_model -%} {{ ref("schema_check").identifier }} {%- endset %} 10 | {% set result_model -%} {{ ref("data_diff_check_summary").identifier }} {%- endset %} 11 | 12 | {% set namespace = data_diff.get_namespace() %} 13 | 14 | {% set query -%} 15 | 16 | create or replace procedure {{ namespace }}.check_data_diff(p_batch varchar, p_diff_run_id varchar) 17 | returns varchar 18 | language sql 19 | as 20 | $$ 21 | declare 22 | sql_statement varchar; 23 | run_timestamp timestamp; 24 | 25 | c1 cursor for 26 | 27 | with {{ configured_table_model }}_tmp as ( 28 | 29 | select * 30 | from {{ configured_table_model }} 31 | where true 32 | and is_enabled = true 33 | and coalesce(pipe_name, '') = ? 
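-- Note on the bind above: the single `?` is bound to `p_batch`, so one diff run only picks up
-- the configured rows whose `pipe_name` matches the batch being executed (rows without a
-- pipe_name match only the empty-string batch, thanks to the coalesce). A sketch of how a
-- batch is kicked off, with a hypothetical pipe name and a placeholder namespace:
--   call <namespace>.check_data_diff('my_pipe', 'DATA-DIFF-RUN-...');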
34 | 35 | ), 36 | 37 | pk_base as ( 38 | 39 | select {{ configured_table_model }}_tmp.* 40 | ,table1.value 41 | ,'ifnull(nullif(upper(trim(cast(trg.'|| table1.value ||' as varchar))), ''''), ''^^'')' as trg_pk_null 42 | ,'ifnull(nullif(upper(trim(cast(src.'|| table1.value ||' as varchar))), ''''), ''^^'')' as src_pk_null 43 | ,'upper(trim(cast('|| table1.value ||' as varchar)))' as combined_pk 44 | ,'upper(trim(cast(trg.'|| table1.value ||' as varchar)))' as trg_pk 45 | ,'upper(trim(cast(src.'|| table1.value ||' as varchar)))' as src_pk 46 | 47 | from {{ configured_table_model }}_tmp, table(split_to_table(pk, ',')) as table1 48 | 49 | ), 50 | 51 | {{ configured_table_model }}_final as ( 52 | 53 | select src_db 54 | ,src_schema 55 | ,src_table 56 | ,trg_db 57 | ,trg_schema 58 | ,trg_table 59 | ,pk 60 | ,include_columns 61 | ,exclude_columns 62 | ,where_condition 63 | ,listagg(combined_pk ,'||') as combined_unique_key 64 | ,listagg(src_pk ,'||') as src_unique_key 65 | ,listagg(trg_pk ,'||') as trg_unique_key 66 | from pk_base 67 | group by all 68 | 69 | ), 70 | 71 | schema_validation as ( 72 | 73 | select * 74 | from {{ result_schema_model }} 75 | where true 76 | and common_col = 1 -- only available mutual columns 77 | qualify row_number() over( 78 | partition by src_db, src_schema, src_table, trg_db, trg_schema, trg_table, column_name, pipe_name 79 | order by last_data_diff_timestamp desc 80 | ) = 1 --get last schema diff result 81 | 82 | ), 83 | 84 | base as ( 85 | 86 | select t.* 87 | ,listagg(v.column_name, ',') as col_list 88 | ,'cast(md5_binary(concat_ws(''||'',' 89 | || listagg('ifnull(nullif(upper(trim(cast(' || v.column_name || ' as varchar))), ''''), ''^^'')', ',' ) 90 | || ' )) as binary(16)) as hashdiff' as hash_calc 91 | ,listagg('ifnull(nullif(upper(trim(cast(src.'|| v.column_name ||' as varchar))),''''),''^^'')= ifnull(nullif(upper(trim(cast(trg.'|| v.column_name ||' as varchar))),''''),''^^'') as '|| v.column_name || '_is_equal', ',' ) as is_equal 92 | ,listagg('sum(case when '|| v.column_name ||'_is_equal then 1 else 0 end) as '|| v.column_name || '_diff', ',' ) as diff_calc 93 | ,listagg(v.column_name ||'_diff / cnt as '|| v.column_name, ',') as result_calc 94 | 95 | from {{ configured_table_model }}_final as t 96 | join schema_validation as v 97 | on t.src_schema = v.src_schema 98 | and t.src_table = v.src_table 99 | where true 100 | --excluded columns i.e always changing column, added or removed column 101 | and (not array_contains(upper(v.column_name)::variant, t.exclude_columns)) 102 | and ( 103 | case 104 | when array_size(t.include_columns) > 0 105 | then array_contains(v.column_name::variant, t.include_columns) 106 | else true 107 | end 108 | ) 109 | group by all 110 | 111 | ) 112 | 113 | select ' 114 | create or replace table {{ result_model }}_' || src_table || '_' || to_char(sysdate(),'yyyymmdd') || ' 115 | as 116 | with different_in_source as ( 117 | (select ' || concat(col_list, ',' , combined_unique_key || ' as combined_unique_key') || ' from '|| src_db || '.' || src_schema || '.' || src_table || ' where ' || where_condition || ') 118 | except 119 | (select ' || concat(col_list, ',' , combined_unique_key || ' as combined_unique_key') || ' from '|| trg_db || '.' || trg_schema || '.' || trg_table || ' where ' || where_condition || ') 120 | ), 121 | different_in_target as ( 122 | (select ' || concat(col_list, ',' , combined_unique_key || ' as combined_unique_key') || ' from '|| trg_db || '.' || trg_schema || '.' 
|| trg_table || ' where ' || where_condition || ') 123 | except 124 | (select ' || concat(col_list, ',' , combined_unique_key || ' as combined_unique_key') || ' from '|| src_db || '.' || src_schema || '.' || src_table || ' where ' || where_condition || ') 125 | ), 126 | compare_content as ( 127 | 128 | select ''different_in_source'' as type_of_diff, * from different_in_source 129 | union 130 | select ''different_in_target'' as type_of_diff, * from different_in_target 131 | ) 132 | select * 133 | , ''' || ? || ''' as last_data_diff_timestamp 134 | , ''' || ? || ''' as diff_run_id 135 | from compare_content' as sql_data_diff__for_a_table, 136 | 137 | ' 138 | insert into {{ result_model }} ( 139 | src_db 140 | ,src_schema 141 | ,src_table 142 | ,trg_db 143 | ,trg_schema 144 | ,trg_table 145 | ,column_name 146 | ,diff_count 147 | ,table_count 148 | ,diff_feeded_rate 149 | ,match_percentage 150 | ,last_data_diff_timestamp 151 | ,diff_run_id 152 | ) 153 | with compare_content as ( 154 | 155 | select * from {{ result_model }}_' || src_table || '_' || to_char(sysdate(),'yyyymmdd') || ' 156 | 157 | ), 158 | column_compare as ( 159 | select cc.combined_unique_key, ' || is_equal ||' 160 | from compare_content as cc 161 | join '|| src_db || '.' || src_schema || '.'|| src_table || ' as src 162 | on '|| src_unique_key || ' = cc.combined_unique_key 163 | join '|| trg_db || '.' || trg_schema || '.'|| trg_table || ' as trg 164 | on '|| trg_unique_key || ' = cc.combined_unique_key 165 | ), 166 | calc as ( 167 | select '''|| src_db || ''' as src_db 168 | ,'''|| src_schema || ''' as src_schema 169 | ,'''|| src_table || ''' as src_table 170 | ,'''|| trg_db || ''' as trg_db 171 | ,'''|| trg_schema || ''' as trg_schema 172 | ,'''|| trg_table || ''' as trg_table 173 | ,count(*) as cnt 174 | ,'|| diff_calc || 175 | ', '|| result_calc || ' 176 | from column_compare 177 | ) 178 | 179 | select src_db 180 | ,src_schema 181 | ,src_table 182 | ,trg_db 183 | ,trg_schema 184 | ,trg_table 185 | ,column_name 186 | ,cnt as diff_count 187 | ,(select count(*) from '|| src_db || '.' || src_schema || '.' || src_table || ' where ' || where_condition || ') as total_count 188 | ,(1 - match_rate) as diff_feeded_rate 189 | ,(1 - diff_count / 2 * 1.0 / total_count) as match_percentage 190 | ,''' || ? ||''' as last_data_diff_timestamp 191 | ,''' || ? 
||''' as diff_run_id 192 | from calc 193 | unpivot ( 194 | match_rate 195 | for column_name in (' || col_list || ') 196 | ) 197 | where match_rate < 1' as sql_data_diff__pivot_summary 198 | 199 | from base 200 | order by src_table; 201 | 202 | begin 203 | run_timestamp := sysdate(); 204 | 205 | open c1 using(:p_batch, :run_timestamp, :p_diff_run_id, :run_timestamp, :p_diff_run_id); 206 | 207 | for record in c1 do 208 | sql_statement := record.sql_data_diff__for_a_table; 209 | 210 | insert into {{ log_model }} (start_time, end_time, sql_statement, diff_start_time, diff_type, diff_run_id) 211 | values (sysdate(), null, :sql_statement, :run_timestamp, 'data-diff', :p_diff_run_id); 212 | 213 | execute immediate :sql_statement; 214 | 215 | update {{ log_model }} 216 | set end_time = sysdate() 217 | where diff_run_id = :p_diff_run_id 218 | and sql_statement = :sql_statement; 219 | 220 | 221 | sql_statement := record.sql_data_diff__pivot_summary; 222 | 223 | insert into {{ log_model }} (start_time, end_time, sql_statement, diff_start_time, diff_type, diff_run_id) 224 | values (sysdate(), null, :sql_statement, :run_timestamp, 'data-diff', :p_diff_run_id); 225 | 226 | execute immediate :sql_statement; 227 | 228 | update {{ log_model }} 229 | set end_time = sysdate() 230 | where diff_run_id = :p_diff_run_id 231 | and sql_statement = :sql_statement; 232 | 233 | end for; 234 | 235 | close c1; 236 | 237 | end; 238 | $$ 239 | ; 240 | 241 | {% endset %} 242 | 243 | {{ return(query) }} 244 | 245 | {%- endmacro %} 246 | -------------------------------------------------------------------------------- /macros/resources/stored-procedures/create__check_key.sql: -------------------------------------------------------------------------------- 1 | {% macro create__check_key() -%} 2 | {{ return(adapter.dispatch('create__check_key')()) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__create__check_key() %} 6 | 7 | {% set configured_table_model -%} {{ ref("configured_tables").identifier }} {%- endset %} 8 | {% set log_model -%} {{ ref("log_for_validation").identifier }} {%- endset %} 9 | {% set result_model -%} {{ ref("key_check").identifier }} {%- endset %} 10 | 11 | {% set namespace = data_diff.get_namespace() %} 12 | 13 | {% set query -%} 14 | 15 | create or replace procedure {{ namespace }}.check_key(p_batch varchar, p_diff_run_id varchar) 16 | returns varchar 17 | language sql 18 | as 19 | $$ 20 | declare 21 | sql_statement varchar; 22 | run_timestamp timestamp; 23 | 24 | c1 cursor for 25 | 26 | with {{ configured_table_model }}_tmp as ( 27 | 28 | select * 29 | from {{ configured_table_model }} 30 | where true 31 | and is_enabled = true 32 | and coalesce(pipe_name, '') = ? 
33 | 34 | ), 35 | 36 | pk_base as ( 37 | 38 | select {{ configured_table_model }}_tmp.* 39 | ,table1.value 40 | ,'ifnull(nullif(upper(trim(cast(trg.'|| table1.value ||' as varchar))), ''''), ''^^'')' as trg_pk_null 41 | ,'ifnull(nullif(upper(trim(cast(src.'|| table1.value ||' as varchar))), ''''), ''^^'')' as src_pk_null 42 | ,'upper(trim(cast(trg.'|| table1.value ||' as varchar)))' as trg_pk 43 | ,'upper(trim(cast(src.'|| table1.value ||' as varchar)))' as src_pk 44 | 45 | from {{ configured_table_model }}_tmp, table(split_to_table(pk, ',')) as table1 46 | 47 | ), 48 | 49 | final as ( 50 | 51 | select src_db 52 | ,src_schema 53 | ,src_table 54 | ,trg_db 55 | ,trg_schema 56 | ,trg_table 57 | ,pk 58 | ,include_columns 59 | ,exclude_columns 60 | ,where_condition 61 | ,listagg(src_pk ,'||') as src_unique_key 62 | ,listagg(trg_pk ,'||') as trg_unique_key 63 | 64 | from pk_base 65 | group by all 66 | ) 67 | 68 | select ' 69 | insert into {{ result_model }} 70 | ( 71 | src_db 72 | ,src_schema 73 | ,src_table 74 | ,trg_db 75 | ,trg_schema 76 | ,trg_table 77 | ,pk 78 | ,key_value 79 | ,is_exclusive_src 80 | ,is_exclusive_trg 81 | ,is_diff_unique_key 82 | ,last_data_diff_timestamp 83 | ,diff_run_id 84 | ) 85 | with 86 | src_data as ( 87 | select * 88 | from ' || t.src_db || '.'|| t.src_schema || '.'|| t.src_table || ' 89 | where ' || t.where_condition || ' 90 | ), 91 | trg_data as ( 92 | select * 93 | from ' || t.trg_db || '.'|| t.trg_schema || '.'|| t.trg_table || ' 94 | where ' || t.where_condition || ' 95 | ), 96 | insert_part as ( 97 | select ''' || t.src_db || ''' as src_db 98 | , ''' || t.src_schema || ''' as src_schema 99 | , ''' || t.src_table || ''' as src_table 100 | , ''' || t.trg_db || ''' as trg_db 101 | , ''' || t.trg_schema || ''' as trg_schema 102 | , ''' || t.trg_table || ''' as trg_table 103 | , ''' || t.pk || ''' as pk 104 | , ' || src_unique_key || ' as src_pk 105 | , ' || trg_unique_key || ' as trg_pk 106 | , coalesce(src_pk, trg_pk) as key_value 107 | , (trg_pk is null) as is_exclusive_src 108 | , (src_pk is null) as is_exclusive_trg 109 | , case when src_pk is distinct from trg_pk then 1 else 0 end as is_diff_unique_key 110 | , ''' || ? || ''' as last_data_diff_timestamp 111 | , ''' || ? 
|| ''' as diff_run_id 112 | from src_data as src 113 | full join trg_data as trg 114 | on src_pk = trg_pk 115 | where is_diff_unique_key = 1 116 | ) 117 | select src_db 118 | ,src_schema 119 | ,src_table 120 | ,trg_db 121 | ,trg_schema 122 | ,trg_table 123 | ,pk 124 | ,key_value 125 | ,is_exclusive_src 126 | ,is_exclusive_trg 127 | ,is_diff_unique_key 128 | ,last_data_diff_timestamp 129 | ,diff_run_id 130 | from insert_part 131 | ' as sql 132 | 133 | from final as t 134 | order by src_table; 135 | 136 | begin 137 | 138 | run_timestamp := sysdate(); 139 | 140 | open c1 using(:p_batch, :run_timestamp, :p_diff_run_id); 141 | 142 | for record in c1 do 143 | 144 | sql_statement := record.sql; 145 | 146 | insert into {{ log_model }} (start_time, end_time, sql_statement, diff_start_time, diff_type, diff_run_id) 147 | values (sysdate(), null, :sql_statement, :run_timestamp, 'key', :p_diff_run_id); 148 | 149 | execute immediate :sql_statement; 150 | 151 | update {{ log_model }} 152 | set end_time = sysdate() 153 | where diff_run_id = :p_diff_run_id 154 | and sql_statement = :sql_statement; 155 | 156 | end for; 157 | 158 | close c1; 159 | 160 | end; 161 | $$ 162 | ; 163 | 164 | {% endset %} 165 | 166 | {{ return(query) }} 167 | 168 | {%- endmacro %} 169 | -------------------------------------------------------------------------------- /macros/resources/stored-procedures/create__check_schema.sql: -------------------------------------------------------------------------------- 1 | {% macro create__check_schema() -%} 2 | {{ return(adapter.dispatch('create__check_schema')()) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__create__check_schema() %} 6 | 7 | {% set configured_table_model -%} {{ ref("configured_tables").identifier }} {%- endset %} 8 | {% set log_model -%} {{ ref("log_for_validation").identifier }} {%- endset %} 9 | {% set result_model -%} {{ ref("schema_check").identifier }} {%- endset %} 10 | 11 | {% set namespace = data_diff.get_namespace() %} 12 | 13 | {% set query -%} 14 | 15 | create or replace procedure {{ namespace }}.check_schema(p_batch varchar, p_diff_run_id varchar) 16 | returns varchar 17 | language sql 18 | as 19 | $$ 20 | declare 21 | 22 | sql_statement varchar; 23 | run_timestamp timestamp; 24 | 25 | c1 cursor for 26 | 27 | select ' 28 | insert into {{ result_model }} ( 29 | src_db 30 | ,src_schema 31 | ,src_table 32 | ,trg_db 33 | ,trg_schema 34 | ,trg_table 35 | ,column_name 36 | ,data_type 37 | ,datetime_precision 38 | ,numeric_precision 39 | ,numeric_scale 40 | ,common_col 41 | ,common_col_text 42 | ,is_exclusive_src 43 | ,is_exclusive_trg 44 | ,datatype_check 45 | ,last_data_diff_timestamp 46 | ,pipe_name 47 | ,diff_run_id 48 | ) 49 | 50 | with tables_to_compare as ( 51 | 52 | select * 53 | from {{ configured_table_model }} 54 | where true 55 | and is_enabled = true 56 | and src_db ilike ''' || b.src_db || ''' 57 | and src_schema ilike ''' || b.src_schema || ''' 58 | and src_table ilike ''' || b.src_table || ''' 59 | and trg_db ilike ''' || b.trg_db || ''' 60 | and trg_schema ilike ''' || b.trg_schema || ''' 61 | and trg_table ilike ''' || b.trg_table || ''' 62 | 63 | ), 64 | 65 | src_meta as ( 66 | 67 | select t.* 68 | ,table_schema 69 | ,table_name 70 | ,column_name 71 | ,data_type 72 | ,datetime_precision 73 | ,numeric_precision 74 | ,numeric_scale 75 | 76 | from '|| src_db ||'.information_schema.columns c 77 | join tables_to_compare t 78 | on t.src_schema ilike c.table_schema 79 | and t.src_table ilike c.table_name 80 | 81 | ), 82 | 83 | trg_meta as ( 84 | 85 | 
select t.* 86 | ,table_schema 87 | ,table_name 88 | ,column_name 89 | ,data_type 90 | ,datetime_precision 91 | ,numeric_precision 92 | ,numeric_scale 93 | 94 | from '|| trg_db ||'.information_schema.columns c 95 | join tables_to_compare t 96 | on t.trg_schema ilike c.table_schema 97 | and t.trg_table ilike c.table_name 98 | 99 | ), 100 | 101 | common_meta as ( 102 | 103 | select coalesce(src.src_db, trg.src_db) as src_db 104 | ,coalesce(src.src_schema, trg.src_schema) as src_schema 105 | ,coalesce(src.src_table, trg.src_table) as src_table 106 | ,coalesce(src.trg_db, trg.trg_db) as trg_db 107 | ,coalesce(src.trg_schema, trg.trg_schema) as trg_schema 108 | ,coalesce(src.trg_table, trg.trg_table) as trg_table 109 | ,coalesce(src.column_name, trg.column_name) as column_name 110 | ,coalesce(src.data_type, trg.data_type) as data_type 111 | ,coalesce(src.datetime_precision, trg.datetime_precision) as datetime_precision 112 | ,coalesce(src.numeric_precision, trg.numeric_precision) as numeric_precision 113 | ,coalesce(src.numeric_scale, trg.numeric_scale) as numeric_scale 114 | ,case when src.column_name = trg.column_name then 1 else 0 end as common_col 115 | ,case 116 | when src.column_name = trg.column_name then ''common'' 117 | when trg.column_name is not null then ''target only'' 118 | when src.column_name is not null then ''source only'' 119 | end as common_col_text 120 | ,case when trg.column_name is null then 1 else 0 end as is_exclusive_src 121 | ,case when src.column_name is null then 1 else 0 end as is_exclusive_trg 122 | ,case 123 | when concat( 124 | ifnull(nullif(upper(trim(cast(src.data_type as varchar))), ''''), ''^^''), 125 | ifnull(nullif(upper(trim(cast(src.datetime_precision as varchar))), ''''), ''^^''), 126 | ifnull(nullif(upper(trim(cast(src.numeric_precision as varchar))), ''''), ''^^''), 127 | ifnull(nullif(upper(trim(cast(src.numeric_scale as varchar))), ''''), ''^^'') 128 | ) = concat( 129 | ifnull(nullif(upper(trim(cast(trg.data_type as varchar))), ''''), ''^^''), 130 | ifnull(nullif(upper(trim(cast(trg.datetime_precision as varchar))), ''''), ''^^''), 131 | ifnull(nullif(upper(trim(cast(trg.numeric_precision as varchar))), ''''), ''^^''), 132 | ifnull(nullif(upper(trim(cast(trg.numeric_scale as varchar))), ''''), ''^^'') 133 | ) 134 | then 1 135 | else 0 136 | end as datatype_check 137 | ,''' || ? || ''' as last_data_diff_timestamp 138 | ,''' || ? || ''' as pipe_name 139 | ,''' || ? || ''' as diff_run_id 140 | 141 | from src_meta as src 142 | full join trg_meta as trg 143 | on trg.src_db = src.src_db 144 | and trg.src_schema = src.src_schema 145 | and trg.src_table = src.src_table 146 | and trg.trg_db = src.trg_db 147 | and trg.trg_schema = src.trg_schema 148 | and trg.trg_table = src.trg_table 149 | and trg.column_name = src.column_name 150 | 151 | ) 152 | 153 | select * 154 | from common_meta ' as sql 155 | 156 | from {{ configured_table_model }} as b 157 | where true 158 | and is_enabled = true 159 | and coalesce(pipe_name, '') = ? 
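-- Note: the binds are positional. In order of appearance, the four `?` placeholders map to
-- last_data_diff_timestamp (:run_timestamp), pipe_name (:p_batch) and diff_run_id (:p_diff_run_id)
-- inside the generated insert, plus the batch filter right above (:p_batch again); they are
-- supplied by the `open c1 using(...)` statement below.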
160 | order by src_table; 161 | 162 | begin 163 | 164 | run_timestamp := sysdate(); 165 | 166 | open c1 using(:run_timestamp, :p_batch, :p_diff_run_id, :p_batch); 167 | 168 | for record in c1 do 169 | 170 | sql_statement := record.sql; 171 | 172 | insert into {{ log_model }} (start_time, end_time, sql_statement, diff_start_time, diff_type, diff_run_id) 173 | values (sysdate(), null, :sql_statement, :run_timestamp, 'schema', :p_diff_run_id); 174 | 175 | execute immediate :sql_statement; 176 | 177 | update {{ log_model }} 178 | set end_time = sysdate() 179 | where diff_run_id = :p_diff_run_id 180 | and sql_statement = :sql_statement; 181 | 182 | end for; 183 | 184 | close c1; 185 | 186 | end; 187 | $$ 188 | ; 189 | 190 | {% endset %} 191 | 192 | {{ return(query) }} 193 | 194 | {%- endmacro %} 195 | -------------------------------------------------------------------------------- /macros/resources/stored-procedures/stored_procedures.yml: -------------------------------------------------------------------------------- 1 | macros: 2 | - name: create__check_key 3 | description: Produce SQL script (`create or replace`) of the Key Diff's stored procedure 4 | - name: create__check_schema 5 | description: Produce SQL script (`create or replace`) of the Schema Diff's stored procedure 6 | - name: create__check_data_diff 7 | description: Produce SQL script (`create or replace`) of the Data Diff's stored procedure 8 | -------------------------------------------------------------------------------- /macros/sis/diff_helper.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from snowflake.snowpark.context import get_active_session 3 | 4 | st.set_page_config(layout="wide") 5 | 6 | # Header 7 | st.title("Data Diff Helpers") 8 | st.write(""" 9 | Aggregation of the diff result produced by the package [dbt-data-diff](https://data-diff.iflambda.com/latest/) 10 | """) 11 | 12 | # Get the current credentials 13 | session = get_active_session() 14 | 15 | # Query the last Diff Run ID 16 | sql = "select diff_run_id from log_for_validation order by diff_start_time desc limit 1" 17 | data = session.sql(sql).collect() 18 | last_run_id = data[0].as_dict().get('DIFF_RUN_ID') if len(data) > 0 else None 19 | st.caption(f"Last Run ID: {last_run_id}") 20 | 21 | if not last_run_id: 22 | st.warning("No diff result found!") 23 | else: 24 | # Summary 25 | st.subheader("🥉 Key diff:") 26 | st.caption("Compare the Primary Key (`pk`) only") 27 | sql = f""" 28 | with 29 | 30 | last_key_check_summary as ( 31 | select * 32 | from key_check_summary 33 | where diff_run_id = '{last_run_id}' 34 | ) 35 | 36 | select case when r.src_db is null then '🟢' else '🔴' end as result 37 | ,concat(r.number_of_exclusive_src, ' (',upper(r.pk),')') as source_not_found 38 | ,concat(r.number_of_exclusive_trg, ' (',upper(r.pk),')') as target_not_found 39 | ,concat( 40 | c.src_db,'.',c.src_schema,'.',c.src_table, 41 | ' ▶️ ', 42 | c.trg_db,'.',c.trg_schema,'.',c.trg_table 43 | ) as entity 44 | 45 | from configured_tables as c 46 | left join last_key_check_summary as r 47 | on r.src_db = c.src_db 48 | and r.src_schema = c.src_schema 49 | and r.src_table = c.src_table 50 | and r.trg_db = c.trg_db 51 | and r.trg_schema = c.trg_schema 52 | and r.trg_table = c.trg_table 53 | 54 | where r.src_db is not null 55 | or c.is_enabled = true 56 | """ 57 | data = session.sql(sql).collect() 58 | st.dataframe(data, use_container_width=True) 59 | 60 | st.subheader("🥈 Schema diff:") 61 | st.caption("Compare the 
data structure (column names and data types)") 62 | sql = f""" 63 | with 64 | 65 | last_schema_check_summary as ( 66 | select * 67 | from schema_check_summary 68 | where diff_run_id = '{last_run_id}' 69 | ) 70 | 71 | select case when r.src_db is null then '🟢' else '🔴' end as result 72 | ,case 73 | when r.number_of_exclusive_source > 0 then concat(r.number_of_exclusive_source, ' (',upper(r.exclusive_source_list),')') 74 | end as source_not_found 75 | ,case 76 | when r.number_of_exclusive_target > 0 then concat(r.number_of_exclusive_target, ' (',upper(r.exclusive_target_list),')') 77 | end as target_not_found 78 | ,coalesce(1 - r.mutual_columns * 1.0 / r.number_of_columns, 0) as not_found_rate 79 | ,case 80 | when r.number_of_false_datatype_check > 0 then concat(r.number_of_false_datatype_check, ' (',upper(r.false_datatype_check_list),')') 81 | end as data_type_mismatched 82 | ,coalesce(r.number_of_false_datatype_check * 1.0 / r.number_of_columns, 0) as mismatched_rate 83 | ,concat( 84 | c.src_db,'.',c.src_schema,'.',c.src_table, 85 | ' ▶️ ', 86 | c.trg_db,'.',c.trg_schema,'.',c.trg_table 87 | ) as entity 88 | 89 | from configured_tables as c 90 | left join last_schema_check_summary as r 91 | on r.src_db = c.src_db 92 | and r.src_schema = c.src_schema 93 | and r.src_table = c.src_table 94 | and r.trg_db = c.trg_db 95 | and r.trg_schema = c.trg_schema 96 | and r.trg_table = c.trg_table 97 | 98 | where r.src_db is not null 99 | or c.is_enabled = true 100 | """ 101 | data = session.sql(sql).collect() 102 | st.dataframe(data, use_container_width=True) 103 | 104 | st.subheader("🥇 Data diff:") 105 | st.caption("Compare all cell values given matched columns and matched keys") 106 | sql = f""" 107 | with 108 | 109 | last_data_diff_check_summary as ( 110 | select * 111 | from data_diff_check_summary 112 | where diff_run_id = '{last_run_id}' 113 | ) 114 | 115 | select case when r.src_db is null then '🟢' else '🔴' end as result 116 | ,r.column_name 117 | ,concat(r.match_percentage * 100, ' %') as match_percentage 118 | ,concat(r.diff_feeded_rate * 100, ' %') as diff_feeded_rate 119 | ,concat(r.diff_count, '/', r.table_count) as diff_count_vs_total 120 | ,concat( 121 | c.src_db,'.',c.src_schema,'.',c.src_table, 122 | ' ▶️ ', 123 | c.trg_db,'.',c.trg_schema,'.',c.trg_table 124 | ) as entity 125 | 126 | from configured_tables as c 127 | left join last_data_diff_check_summary as r 128 | on r.src_db = c.src_db 129 | and r.src_schema = c.src_schema 130 | and r.src_table = c.src_table 131 | and r.trg_db = c.trg_db 132 | and r.trg_schema = c.trg_schema 133 | and r.trg_table = c.trg_table 134 | 135 | where r.src_db is not null 136 | or c.is_enabled = true 137 | 138 | order by 3 desc 139 | """ 140 | data = session.sql(sql).collect() 141 | st.dataframe(data, use_container_width=True) 142 | 143 | 144 | # Drill down 145 | def show_entity_diff_drilldown(session, entity_row, expanded: bool = False): 146 | entity_dict = entity_row.as_dict() 147 | with st.expander(f"{entity_dict.get('ENTITY')}", expanded=expanded): 148 | st.markdown("_(only 10 rows maximum)_") 149 | sql = entity_dict.get("DRILLDOWN_SCRIPT") 150 | sql_data = session.sql(sql).collect() 151 | st.dataframe(sql_data, use_container_width=True) 152 | st.markdown("Used query:") 153 | st.code(sql.replace(" "," "), language='sql') 154 | 155 | sql = f""" 156 | with 157 | 158 | last_data_diff_check_summary as ( 159 | select * 160 | from data_diff_check_summary 161 | where diff_run_id = '{last_run_id}' 162 | ) 163 | 164 | select concat( 165 | '🟡 **', column_name, 
'** / ', 166 | src_db,'.',src_schema,'.',src_table, 167 | ' ▶️ ', 168 | trg_db,'.',trg_schema,'.',trg_table 169 | ) as entity 170 | ,column_name 171 | ,'with 172 | 173 | src as ( 174 | select * 175 | from data_diff_check_summary_' || src_table || '_' || to_varchar(last_data_diff_timestamp, 'YYYYMMDD') || ' 176 | where type_of_diff = ''different_in_source'' 177 | ), 178 | 179 | trg as ( 180 | select * 181 | from data_diff_check_summary_' || src_table || '_' || to_varchar(last_data_diff_timestamp, 'YYYYMMDD') || ' 182 | where type_of_diff = ''different_in_target'' 183 | ) 184 | 185 | select src.' || column_name || ' as _source 186 | ,trg.' || column_name || ' as _target 187 | , src.combined_unique_key 188 | 189 | from src 190 | join trg using (combined_unique_key) 191 | 192 | where hash(src.' || column_name || ') != hash(trg.' || column_name || ') 193 | 194 | limit 10; 195 | ' as drilldown_script 196 | 197 | from data_diff_check_summary 198 | where {{where}} 199 | order by match_percentage 200 | """ 201 | entity_options = [ 202 | x.as_dict().get("ENTITY") 203 | for x in session.sql(f"{sql.format(where='1=1')}").collect() 204 | ] 205 | entity_option = st.selectbox( 206 | label="Let's drill down by selecting a diff entity to view the sample failure:", 207 | options=entity_options 208 | ) 209 | if entity_option: 210 | entity_drilldown_query = session.sql(sql.format(where=f"entity = '{entity_option}'")).collect() 211 | show_entity_diff_drilldown(session=session, entity_row=entity_drilldown_query[0], expanded=True) 212 | 213 | if entity_options: 214 | if st.button("Or see all (Top 10) Failure(s) ▶️"): 215 | data = session.sql(f"{sql.format(where='1=1')} limit 10").collect() 216 | for item in data: 217 | show_entity_diff_drilldown(session=session, entity_row=item) 218 | -------------------------------------------------------------------------------- /macros/sis/sis.yml: -------------------------------------------------------------------------------- 1 | macros: 2 | - name: sis_deploy__diff_helper 3 | description: | 4 | Deploy the Streamlit in Snowflake application - Data Diff Helper 5 | 6 | How to deploy: 7 | ```bash 8 | dbt run-operation sis_deploy__diff_helper 9 | # or 10 | dbt run-operation sis_deploy__diff_helper --args '{packages_install_path: your_specific_path}' 11 | ``` 12 | arguments: 13 | - name: packages_install_path 14 | type: string 15 | description: | 16 | Defaults to `dbt_packages`. 17 | 18 | If you configured `packages-install-path` with a specific value e.g. 
`my_installed_packages`, 19 | then the deployment command will need to use this argument, for example: 20 | 21 | ```bash 22 | dbt run-operation sis_deploy__diff_helper \ 23 | --args '{packages_install_path: my_installed_packages}' 24 | ``` 25 | -------------------------------------------------------------------------------- /macros/sis/sis_deploy__diff_helper.sql: -------------------------------------------------------------------------------- 1 | {% macro sis_deploy__diff_helper(packages_install_path='dbt_packages') -%} 2 | 3 | {% set ns = data_diff.get_namespace() %} 4 | {% set query %} 5 | 6 | create schema if not exists {{ ns }}; 7 | create or replace stage {{ ns }}.stage_diff_helper 8 | directory = ( enable = true ) 9 | comment = 'Named stage for the diff helper SiS application'; 10 | 11 | PUT file://{{ packages_install_path }}/data_diff/macros/sis/diff_helper.py @{{ ns }}.stage_diff_helper 12 | overwrite=true 13 | auto_compress=false; 14 | 15 | create or replace streamlit {{ ns }}.data_diff_helper 16 | root_location = '@{{ ns }}.stage_diff_helper' 17 | main_file = '/diff_helper.py' 18 | query_warehouse = {{ target.warehouse or 'compute_wh' }} 19 | comment = 'Streamlit app for the dbt-data-diff package'; 20 | {% endset %} 21 | 22 | {{ log("[RUN]: sis_deploy__diff_helper", info=True) if execute }} 23 | {{ log("query: " ~ query, info=True) if execute }} 24 | {% set results = run_query(query) %} 25 | {{ log(results.rows, info=True) }} 26 | 27 | {%- endmacro %} 28 | -------------------------------------------------------------------------------- /macros/utilities/escape_single_quote_value.sql: -------------------------------------------------------------------------------- 1 | {% macro escape_single_quote_value(value) %} 2 | 3 | {{ return(value | replace("'", "''")) }} 4 | 5 | {% endmacro %} 6 | -------------------------------------------------------------------------------- /macros/utilities/escape_single_quote_value.yml: -------------------------------------------------------------------------------- 1 | macros: 2 | - name: escape_single_quote_value 3 | description: | 4 | Escape a SQL value that contains single quotes. 
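      Typical usage from another macro or model (a minimal sketch; the input value is illustrative):

      ```sql
      {% set safe_value = data_diff.escape_single_quote_value("it's") %}
      select '{{ safe_value }}' as col -- renders: select 'it''s' as col
      ```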
5 | 6 | For example: `name = 'data'` will be escaped to `name = ''data''` 7 | -------------------------------------------------------------------------------- /macros/utilities/get_namespace.sql: -------------------------------------------------------------------------------- 1 | {% macro get_namespace() %} 2 | 3 | {% set namespace -%} 4 | {{ generate_database_name(var("data_diff__database", target.database)) }}.{{ generate_schema_name(var("data_diff__schema", target.schema)) }} 5 | {%- endset %} 6 | 7 | {{ return(namespace) }} 8 | 9 | {% endmacro %} 10 | -------------------------------------------------------------------------------- /macros/utilities/get_namespace.yml: -------------------------------------------------------------------------------- 1 | macros: 2 | - name: get_namespace 3 | description: Return the data-diff's object namespace (`<database>.<schema>`) 4 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: dbt Data Diff 2 | site_url: https://infinitelambda.github.io/dbt-data-diff/ 3 | site_author: Infinite Lambda 4 | site_description: Official documentation of the `dbt-data-diff` dbt package 5 | 6 | repo_name: infinitelambda/dbt-data-diff 7 | repo_url: https://github.com/infinitelambda/dbt-data-diff 8 | edit_uri: edit/main/docs/ 9 | 10 | copyright: Copyright © 2024 Infinite Lambda 11 | 12 | nav: 13 | - 📦 dbt-data-diff: index.md 14 | - 📖 dbt docs: dbt-docs/index.html # managed by hooks.copy_dbt_docs 15 | - Contribution Guideline: contributing.md 16 | - Change Log ↗️: https://github.com/infinitelambda/dbt-data-diff/releases" target="_blank 17 | 18 | hooks: 19 | - docs/hooks.py 20 | 72 | theme: 73 | name: material 74 | custom_dir: docs/overrides 75 | features: 76 | - announce.dismiss 77 | - content.action.edit 78 | - content.action.view 79 | - content.code.annotate 80 | - content.code.copy 81 | - content.tooltips 82 | - navigation.footer 83 | - navigation.indexes 84 | - navigation.sections 85 | - navigation.top 86 | - navigation.tracking 87 | - search.highlight 88 | - search.share 89 | - search.suggest 90 | - toc.follow 91 | 92 | palette: 93 | - media: "(prefers-color-scheme)" 94 | primary: black 95 | toggle: 96 | icon: material/brightness-auto 97 | name: 
Switch to light mode 98 | - media: "(prefers-color-scheme: light)" 99 | scheme: default 100 | primary: black 101 | toggle: 102 | icon: material/brightness-7 103 | name: Switch to dark mode 104 | - media: "(prefers-color-scheme: dark)" 105 | scheme: slate 106 | primary: black 107 | toggle: 108 | icon: material/brightness-4 109 | name: Switch to system preference 110 | 111 | font: 112 | text: Roboto 113 | code: Roboto Mono 114 | favicon: assets/favicon.ico 115 | logo: assets/favicon.ico 116 | icon: 117 | repo: fontawesome/brands/github 118 | 119 | plugins: 120 | - search: 121 | separator: '[\s\-,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' 122 | - minify: 123 | minify_html: true 124 | - offline 125 | 126 | extra: 127 | version: 128 | provider: mike 129 | annotate: 130 | json: [.s2] 131 | social: 132 | - icon: fontawesome/brands/github 133 | link: https://github.com/infinitelambda 134 | - icon: fontawesome/brands/linkedin 135 | link: https://www.linkedin.com/company/infinite-lambda/ 136 | consent: 137 | title: Cookie consent 138 | description: >- 139 | We use cookies to recognize your repeated visits and preferences, as well 140 | as to measure the effectiveness of our documentation and whether users 141 | find what they're searching for. With your consent, you're helping us to 142 | make our documentation better. 143 | analytics: 144 | provider: google 145 | property: !ENV GOOGLE_ANALYTICS_KEY 146 | feedback: 147 | title: Was this page helpful? 148 | ratings: 149 | - icon: material/heart 150 | name: This page was helpful 151 | data: 1 152 | note: >- 153 | Thanks for your feedback! 154 | - icon: material/heart-broken 155 | name: This page could be improved 156 | data: 0 157 | note: >- 158 | Thanks for your feedback! Help us improve this page by 159 | using our feedback form. 
160 | 161 | extra_css: 162 | - assets/css/termynal.css 163 | 164 | extra_javascript: 165 | - assets/js/feedback.js 166 | - assets/js/termynal.js 167 | - assets/js/custom.js 168 | 169 | markdown_extensions: 170 | - abbr 171 | - admonition 172 | - attr_list 173 | - def_list 174 | - footnotes 175 | - md_in_html 176 | - toc: 177 | permalink: true 178 | - pymdownx.arithmatex: 179 | generic: true 180 | - pymdownx.betterem: 181 | smart_enable: all 182 | - pymdownx.caret 183 | - pymdownx.details 184 | - pymdownx.emoji: 185 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 186 | emoji_index: !!python/name:material.extensions.emoji.twemoji 187 | - pymdownx.highlight: 188 | anchor_linenums: true 189 | line_spans: __span 190 | pygments_lang_class: true 191 | - pymdownx.inlinehilite 192 | - pymdownx.keys 193 | - pymdownx.magiclink: 194 | repo_url_shorthand: true 195 | user: squidfunk 196 | repo: mkdocs-material 197 | - pymdownx.mark 198 | - pymdownx.smartsymbols 199 | - pymdownx.superfences: 200 | custom_fences: 201 | - name: mermaid 202 | class: mermaid 203 | format: !!python/name:pymdownx.superfences.fence_code_format 204 | - pymdownx.tabbed: 205 | alternate_style: true 206 | - pymdownx.tasklist: 207 | custom_checkbox: true 208 | - pymdownx.tilde 209 | - codehilite: 210 | guess_lang: false 211 | -------------------------------------------------------------------------------- /models/01_key_diff/key_check.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | database = var('data_diff__database', target.database), 4 | schema = var("data_diff__schema", target.schema), 5 | materialized = 'incremental', 6 | on_schema_change = 'append_new_columns', 7 | full_refresh = var('data_diff__full_refresh', false) 8 | ) 9 | }} 10 | 11 | with dummy as (select 1 as col) 12 | 13 | select 14 | cast(null as {{ dbt.type_string() }}) as src_db 15 | , cast(null as {{ dbt.type_string() }}) as src_schema 16 | , cast(null as {{ dbt.type_string() }}) as src_table 17 | , cast(null as {{ dbt.type_string() }}) as trg_db 18 | , cast(null as {{ dbt.type_string() }}) as trg_schema 19 | , cast(null as {{ dbt.type_string() }}) as trg_table 20 | , cast(null as {{ dbt.type_string() }}) as pk 21 | , cast(null as {{ dbt.type_string() }}) as key_value 22 | , cast(null as {{ dbt.type_boolean() }}) as is_exclusive_src 23 | , cast(null as {{ dbt.type_boolean() }}) as is_exclusive_trg 24 | , cast(null as {{ dbt.type_boolean() }}) as is_diff_unique_key 25 | , cast(null as {{ dbt.type_timestamp() }}) as last_data_diff_timestamp 26 | , cast(null as {{ dbt.type_string() }}) as diff_run_id 27 | 28 | from dummy 29 | 30 | where 1 = 0 31 | -------------------------------------------------------------------------------- /models/01_key_diff/key_check.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: key_check 3 | description: | 4 | Result of the **Key** diff 5 | 6 | - 🟢 GOOD: No data 7 | - 🔴 BAD: Has data, each row stands for a key diff for a diff run 8 | 9 | columns: 10 | - name: src_db 11 | description: Database name of the source object to be compared 12 | - name: src_schema 13 | description: Schema name of the source object to be compared 14 | - name: src_table 15 | description: Source object name to be compared 16 | - name: trg_db 17 | description: Database name of the target object to be compared 18 | - name: trg_schema 19 | description: Schema name of the target object to be compared 20 | - name: trg_table 21 | 
description: Target object to be compared 22 | - name: pk 23 | description: Primary key of the objects to be compared 24 | - name: key_value 25 | description: Value of the primary key 26 | - name: is_exclusive_src 27 | description: true/false, where true means the key can be found only in the source object 28 | - name: is_exclusive_trg 29 | description: true/false, where true means the key can be found only in the target object 30 | - name: is_diff_unique_key 31 | description: true/false, where true means the key is missing from the source or target object 32 | - name: last_data_diff_timestamp 33 | description: Last data-diff run timestamp 34 | - name: diff_run_id 35 | description: Data diff's Run ID 36 | -------------------------------------------------------------------------------- /models/01_key_diff/key_check_summary.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | database = var('data_diff__database', target.database), 4 | schema = var("data_diff__schema", target.schema) 5 | ) 6 | }} 7 | 8 | select 9 | src_db 10 | , src_schema 11 | , src_table 12 | , trg_db 13 | , trg_schema 14 | , trg_table 15 | , pk 16 | , last_data_diff_timestamp 17 | , diff_run_id 18 | , sum(case when is_exclusive_src then 1 else 0 end) as number_of_exclusive_src 19 | , sum(case when is_exclusive_trg then 1 else 0 end) as number_of_exclusive_trg 20 | , sum(case when is_diff_unique_key then 1 else 0 end) as number_of_diff_pk 21 | 22 | from {{ ref('key_check') }} 23 | 24 | group by all 25 | -------------------------------------------------------------------------------- /models/01_key_diff/key_check_summary.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: key_check_summary 3 | description: | 4 | Aggregation of the **Key** diff failure only 5 | columns: 6 | - name: src_db 7 | description: Database name of the source object to be compared 8 | - name: src_schema 9 | description: Schema name of the source object to be compared 10 | - name: src_table 11 | description: Source object name to be compared 12 | - name: trg_db 13 | description: Database name of the target object to be compared 14 | - name: trg_schema 15 | description: Schema name of the target object to be compared 16 | - name: trg_table 17 | description: Target object to be compared 18 | - name: pk 19 | description: Primary key of the objects to be compared 20 | - name: number_of_exclusive_src 21 | description: Number of keys that can be found only in the source object 22 | - name: number_of_exclusive_trg 23 | description: Number of keys that can be found only in the target object 24 | - name: number_of_diff_pk 25 | description: Number of keys that are missing from either the source or the target 26 | - name: last_data_diff_timestamp 27 | description: Last data-diff run timestamp 28 | - name: diff_run_id 29 | description: Data diff's Run ID 30 | -------------------------------------------------------------------------------- /models/02_schema_diff/schema_check.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | database = var('data_diff__database', target.database), 4 | schema = var("data_diff__schema", target.schema), 5 | materialized = 'incremental', 6 | on_schema_change = 'append_new_columns', 7 | full_refresh = var('data_diff__full_refresh', false) 8 | ) 9 | }} 10 | 11 | with dummy as (select 1 as col) 12 | 13 | select 14 | cast(null as {{ dbt.type_string() }}) as src_db 15 | , 
cast(null as {{ dbt.type_string() }}) as src_schema 16 | , cast(null as {{ dbt.type_string() }}) as src_table 17 | , cast(null as {{ dbt.type_string() }}) as trg_db 18 | , cast(null as {{ dbt.type_string() }}) as trg_schema 19 | , cast(null as {{ dbt.type_string() }}) as trg_table 20 | , cast(null as {{ dbt.type_string() }}) as column_name 21 | , cast(null as {{ dbt.type_string() }}) as data_type 22 | , cast(null as {{ dbt.type_string() }}) as datetime_precision 23 | , cast(null as {{ dbt.type_string() }}) as numeric_precision 24 | , cast(null as {{ dbt.type_string() }}) as numeric_scale 25 | , cast(null as {{ dbt.type_boolean() }}) as common_col 26 | , cast(null as {{ dbt.type_string() }}) as common_col_text 27 | , cast(null as {{ dbt.type_boolean() }}) as is_exclusive_src 28 | , cast(null as {{ dbt.type_boolean() }}) as is_exclusive_trg 29 | , cast(null as {{ dbt.type_boolean() }}) as datatype_check 30 | , cast(null as {{ dbt.type_timestamp() }}) as last_data_diff_timestamp 31 | , cast(null as {{ dbt.type_string() }}) as pipe_name 32 | , cast(null as {{ dbt.type_string() }}) as diff_run_id 33 | 34 | from dummy 35 | 36 | where 1 = 0 37 | -------------------------------------------------------------------------------- /models/02_schema_diff/schema_check.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: schema_check 3 | description: | 4 | Result of the **Schema** diff 5 | 6 | - 🟢 GOOD: Has data, and `number_of_columns = mutual_columns` 7 | - 🔴 BAD: Has data, and `number_of_columns > mutual_columns` or `number_of_false_datatype_check > 0` 8 | 9 | columns: 10 | - name: src_db 11 | description: Database name of the source object to be compared 12 | - name: src_schema 13 | description: Schema name of the source object to be compared 14 | - name: src_table 15 | description: Source object name to be compared 16 | - name: trg_db 17 | description: Database name of the target object to be compared 18 | - name: trg_schema 19 | description: Schema name of the target object to be compared 20 | - name: trg_table 21 | description: Target object to be compared 22 | - name: column_name 23 | description: Name of the compared column 24 | - name: data_type 25 | description: Data type of the column 26 | - name: datetime_precision 27 | description: Precision in case of datetime data type 28 | - name: numeric_precision 29 | description: Precision in case of numeric data type 30 | - name: numeric_scale 31 | description: Scale in case of numeric data type 32 | - name: common_col 33 | description: true/false, where true means the column can be found in both the source and target tables 34 | - name: common_col_text 35 | description: | 36 | Possible values are: 37 | - common 38 | - source only 39 | - target only 40 | - name: is_exclusive_src 41 | description: true/false, where true means the column can be found only in the source object 42 | - name: is_exclusive_trg 43 | description: true/false, where true means the column can be found only in the target object 44 | - name: datatype_check 45 | description: true/false, where true means the data type is the same in both places 46 | - name: last_data_diff_timestamp 47 | description: Last data-diff run timestamp 48 | - name: pipe_name 49 | description: | 50 | Specify the pipe name used in the asynchronous run to enable parallelism. 51 | 52 | NOTE: This field must exist here so that it can be used in the data-diff async run. 
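      For illustration, every configured row that shares the same `pipe_name` is processed by one
      task chain, so distinct pipe names run in parallel chains. A sketch with hypothetical pipe
      names and placeholder values:

      ```sql
      -- one call per pipe, issued by the generated Snowflake tasks
      call <namespace>.check_schema('pipe_a', '<diff_run_id>');
      call <namespace>.check_schema('pipe_b', '<diff_run_id>');
      ```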
53 | - name: diff_run_id 54 | description: Data diff's Run ID 55 | -------------------------------------------------------------------------------- /models/02_schema_diff/schema_check_summary.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | database = var('data_diff__database', target.database), 4 | schema = var("data_diff__schema", target.schema) 5 | ) 6 | }} 7 | 8 | select 9 | src_db 10 | , src_schema 11 | , src_table 12 | , trg_db 13 | , trg_schema 14 | , trg_table 15 | , last_data_diff_timestamp 16 | , diff_run_id 17 | , count(*) as number_of_columns 18 | , sum(case when common_col then 1 else 0 end) as mutual_columns 19 | 20 | , sum(case when datatype_check then 0 else 1 end) as number_of_false_datatype_check 21 | , listagg( 22 | case when not datatype_check then column_name end, ', ' 23 | ) within group (order by column_name) as false_datatype_check_list 24 | 25 | , sum(case when is_exclusive_src then 1 else 0 end) as number_of_exclusive_target 26 | , listagg( 27 | case when is_exclusive_src then column_name end, ', ' 28 | ) within group (order by column_name) as exclusive_target_list 29 | 30 | , sum(case when is_exclusive_trg then 1 else 0 end) as number_of_exclusive_source 31 | , listagg( 32 | case when is_exclusive_trg then column_name end, ', ' 33 | ) within group (order by column_name) as exclusive_source_list 34 | 35 | from {{ ref('schema_check') }} 36 | 37 | group by all 38 | 39 | having 40 | number_of_columns != mutual_columns 41 | or number_of_false_datatype_check != 0 42 | -------------------------------------------------------------------------------- /models/02_schema_diff/schema_check_summary.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: schema_check_summary 3 | description: | 4 | Aggregation of the **Schema** diff failure only 5 | columns: 6 | - name: src_db 7 | description: Database name of the source object to be compared 8 | - name: src_schema 9 | description: Schema name of the source object to be compared 10 | - name: src_table 11 | description: Source object name to be compared 12 | - name: trg_db 13 | description: Database name of the target object to be compared 14 | - name: trg_schema 15 | description: Schema name of the target object to be compared 16 | - name: trg_table 17 | description: Target object to be compared 18 | - name: last_data_diff_timestamp 19 | description: Last data-diff run timestamp 20 | - name: diff_run_id 21 | description: Data diff's Run ID 22 | - name: number_of_columns 23 | description: Total number of columns 24 | - name: mutual_columns 25 | description: Total number of columns existing in both the target and the source 26 | - name: number_of_false_datatype_check 27 | description: Number of columns whose data type does not match 28 | - name: false_datatype_check_list 29 | description: List of columns whose data types do not match 30 | - name: number_of_exclusive_target 31 | description: Number of columns missing from the target (present in the source only) 32 | - name: exclusive_target_list 33 | description: List of columns missing from the target (present in the source only) 34 | - name: number_of_exclusive_source 35 | description: Number of columns missing from the source (present in the target only) 36 | - name: exclusive_source_list 37 | description: List of columns missing from the source (present in the target only) 38 | -------------------------------------------------------------------------------- /models/03_content_diff/data_diff_check_summary.sql: 
-------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | database = var('data_diff__database', target.database), 4 | schema = var("data_diff__schema", target.schema), 5 | materialized = 'incremental', 6 | on_schema_change = 'append_new_columns', 7 | full_refresh = var('data_diff__full_refresh', false) 8 | ) 9 | }} 10 | 11 | with dummy as (select 1 as col) 12 | 13 | select 14 | cast(null as {{ dbt.type_string() }}) as src_db 15 | , cast(null as {{ dbt.type_string() }}) as src_schema 16 | , cast(null as {{ dbt.type_string() }}) as src_table 17 | , cast(null as {{ dbt.type_string() }}) as trg_db 18 | , cast(null as {{ dbt.type_string() }}) as trg_schema 19 | , cast(null as {{ dbt.type_string() }}) as trg_table 20 | , cast(null as {{ dbt.type_string() }}) as column_name 21 | , cast(null as {{ dbt.type_int() }}) as diff_count 22 | , cast(null as {{ dbt.type_int() }}) as table_count 23 | , cast(null as {{ dbt.type_float() }}) as diff_feeded_rate 24 | , cast(null as {{ dbt.type_float() }}) as match_percentage 25 | , cast(null as {{ dbt.type_timestamp() }}) as last_data_diff_timestamp 26 | , cast(null as {{ dbt.type_string() }}) as diff_run_id 27 | 28 | from dummy 29 | 30 | where 1 = 0 31 | -------------------------------------------------------------------------------- /models/03_content_diff/data_diff_check_summary.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: data_diff_check_summary 3 | description: | 4 | Result of the **Data Content** diff failure only, aggregated by column. 5 | 6 | - 🟢 GOOD: No data, or has data and the `match_percentage` can be well explained, e.g. new columns, behavior changes 7 | - 🔴 BAD: Has data and the `match_percentage` cannot be explained 8 | 9 | It also links to the additional result tables, 10 | named `data_diff_check_summary_{source_table}_{YYYYMMDD}`, 11 | which hold the result of the EXCEPT operator and contain all the differing rows; 12 | their structure is the same as that of the compared objects 13 | 14 | NOTE: the additional result table(s) will replace any existing one(s) with the same name 15 | 16 | columns: 17 | - name: src_db 18 | description: Database name of the source object to be compared 19 | - name: src_schema 20 | description: Schema name of the source object to be compared 21 | - name: src_table 22 | description: Source object name to be compared 23 | - name: trg_db 24 | description: Database name of the target object to be compared 25 | - name: trg_schema 26 | description: Schema name of the target object to be compared 27 | - name: trg_table 28 | description: Target object to be compared 29 | - name: column_name 30 | description: Name of the compared column 31 | - name: diff_count 32 | description: Diff row count of the given column 33 | - name: table_count 34 | description: Total row count of the source table 35 | - name: diff_feeded_rate 36 | description: | 37 | Percentage of matching values of the given column compared to all the differences (*). 
38 | 39 | (*) = different_in_source + different_in_target 40 | 41 | For example, the Diff result of TableA has 10 diff rows: 42 | - 5 rows different_in_source 43 | - 5 rows different_in_target 44 | 45 | then, for each column: 46 | - Column1: count(matched) = 10 --> column feeded rate = 1 --> there is no diff in this column 47 | - Column2: count(matched) = 5 --> column feeded rate = 0.5 --> we have data diff in Column2 in 5 (out of 10) diff rows above 48 | - Column3: count(matched) = 0 --> column feeded rate = 0 --> we have a critical data diff in Column3 49 | - name: match_percentage 50 | description: | 51 | Percentage of matching values of the given column compared to the total table row count. 52 | 53 | For example, the Diff result of TableA has 10 diff rows: 54 | - 5 rows different_in_source 55 | - 5 rows different_in_target 56 | - Target table has 100 rows in total 57 | 58 | then, for each column: 59 | - Column1: count(matched) = 99 --> match % = 0.99 = 99% 60 | - Column2: count(matched) = 10 --> match % = 0.1 = 10% 61 | - name: last_data_diff_timestamp 62 | description: Last data-diff run timestamp 63 | - name: diff_run_id 64 | description: Data diff's Run ID 65 | -------------------------------------------------------------------------------- /models/configured_tables.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | database = var('data_diff__database', target.database), 4 | schema = var("data_diff__schema", target.schema), 5 | materialized = 'incremental', 6 | on_schema_change = 'append_new_columns', 7 | full_refresh = var('data_diff__full_refresh', false) 8 | ) 9 | }} 10 | 11 | with dummy as (select 1 as col) 12 | 13 | select 14 | cast(null as {{ dbt.type_string() }}) as src_db 15 | , cast(null as {{ dbt.type_string() }}) as src_schema 16 | , cast(null as {{ dbt.type_string() }}) as src_table 17 | , cast(null as {{ dbt.type_string() }}) as trg_db 18 | , cast(null as {{ dbt.type_string() }}) as trg_schema 19 | , cast(null as {{ dbt.type_string() }}) as trg_table 20 | , cast(null as {{ dbt.type_string() }}) as pk 21 | , cast(null as array) as include_columns 22 | , cast(null as array) as exclude_columns 23 | , cast(null as {{ dbt.type_string() }}) as where_condition 24 | , cast(null as {{ dbt.type_boolean() }}) as is_enabled 25 | , cast(null as {{ dbt.type_string() }}) as pipe_name 26 | 27 | from dummy 28 | 29 | where 1 = 0 30 | -------------------------------------------------------------------------------- /models/configured_tables.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: configured_tables 3 | description: | 4 | Configuration table that contains the list of objects that need to be compared. 5 | 6 | Data is managed/reset by [the migration step](https://data-diff.iflambda.com/latest/#2-refresh-the-configured-tabless-data), or behind the scenes, it is the `refresh_resource_data` macro. 7 | 8 | Additionally, the `is_enabled` flag allows us to manually disable a compared entity without re-running the migration. 
15 | 
16 |     columns:
17 |       - name: src_db
18 |         description: Database name of the source object to be compared
19 |       - name: src_schema
20 |         description: Schema name of the source object to be compared
21 |       - name: src_table
22 |         description: Source object name to be compared
23 |       - name: trg_db
24 |         description: Database name of the target object to be compared
25 |       - name: trg_schema
26 |         description: Schema name of the target object to be compared
27 |       - name: trg_table
28 |         description: Target object to be compared
29 |       - name: pk
30 |         description: Primary key of the objects to be compared
31 |       - name: include_columns
32 |         description: An array containing the list of columns to include in the comparison; defaults to empty, which means every mutual column will be compared
33 |       - name: exclude_columns
34 |         description: Array containing the list of columns to exclude from the comparison
35 |       - name: where_condition
36 |         description: Adds a filter to the compared objects, e.g. when a table is too big. Defaults to `1=1` if not specified
37 |       - name: is_enabled
38 |         description: Using this field, we can enable or disable the validation of an entry
39 |       - name: pipe_name
40 |         description: Specifies the pipe name used in the asynchronous run to enable parallel execution
41 | 
--------------------------------------------------------------------------------
/models/log_for_validation.sql:
--------------------------------------------------------------------------------
1 | {{
2 |   config(
3 |     database = var('data_diff__database', target.database),
4 |     schema = var("data_diff__schema", target.schema),
5 |     materialized = 'incremental',
6 |     on_schema_change = 'append_new_columns',
7 |     full_refresh = var('data_diff__full_refresh', false)
8 |   )
9 | }}
10 | 
11 | with dummy as (select 1 as col)
12 | 
13 | select
14 |   cast(null as {{ dbt.type_timestamp() }}) as start_time
15 |   , cast(null as {{ dbt.type_timestamp() }}) as end_time
16 |   , cast(null as {{ dbt.type_string() }}) as sql_statement
17 |   , cast(null as {{ dbt.type_timestamp() }}) as diff_start_time
18 |   , cast(null as {{ dbt.type_string() }}) as diff_type
19 |   , cast(null as {{ dbt.type_string() }}) as diff_run_id
20 | 
21 | from dummy
22 | 
23 | where 1 = 0
24 | 
--------------------------------------------------------------------------------
/models/log_for_validation.yml:
--------------------------------------------------------------------------------
1 | models:
2 |   - name: log_for_validation
3 |     description: Log entries produced by the data-diff operations
4 | 
5 |     columns:
6 |       - name: start_time
7 |         description: Start time of the execution
8 |       - name: end_time
9 |         description: End time of the execution
10 |       - name: sql_statement
11 |         description: The executed sql_statement
12 |       - name: diff_start_time
13 |         description: |
          Start time of all the executions within one diff type; it can be used as a join key to the appropriate result table.
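
          For instance, a minimal sketch of tying the log back to the
          content-diff summary; it joins on `diff_run_id`, which both models
          expose (whether `diff_start_time` lines up exactly with the summary's
          `last_data_diff_timestamp` is not guaranteed here, so the run id is
          the safer key in this sketch):

          ```sql
          select l.start_time, l.end_time, s.column_name, s.diff_count
          from log_for_validation as l
          join data_diff_check_summary as s
            on s.diff_run_id = l.diff_run_id
          ```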
14 |       - name: diff_type
15 |         description: "Diff type of the entry: key, schema or data"
16 |       - name: diff_run_id
17 |         description: Data diff's Run ID
18 | 
--------------------------------------------------------------------------------
/package-lock.yml:
--------------------------------------------------------------------------------
1 | packages:
2 |   - package: dbt-labs/dbt_utils
3 |     version: 1.1.1
4 |     sha1_hash: e6424ba9e5a22487e47f023803aa4f0411946808
5 | 
--------------------------------------------------------------------------------
/packages.yml:
--------------------------------------------------------------------------------
1 | packages:
2 |   - package: dbt-labs/dbt_utils
3 |     version: [">=1.1.0", "<2.0.0"]
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "dbt-data-diff"
3 | version = "0.0.0"
4 | description = "Data-diff solution for dbt-ers with Snowflake ❄️ 🚀"
5 | authors = ["Infinite Lambda "]
6 | readme = "README.md"
7 | license = "Apache-2.0"
8 | repository = "https://github.com/infinitelambda/dbt-data-diff"
9 | homepage = "https://infinitelambda.com"
10 | documentation = "https://data-diff.iflambda.com/latest/"
11 | keywords = ["packaging", "dbt", "data-quality", "data-diff", "markdown", "lint"]
12 | classifiers = [
13 |   "Topic :: dbt Package Development :: Documentation",
14 |   "Topic :: dbt Package Development :: Testing",
15 |   "Topic :: dbt Package :: Blue-Green",
16 | ]
17 | 
18 | [tool.poetry.dependencies]
19 | python = ">=3.9,<3.13"
20 | dbt-snowflake = "^1.7.0"
21 | 
22 | [tool.poetry.dev-dependencies]
23 | pre-commit = "^2.17.0"
24 | poethepoet = "^0.16.4"
25 | sqlfluff = "^2.3.5"
26 | sqlfluff-templater-dbt = "^2.3.5"
27 | mkdocs = "^1.5.3"
28 | mkdocs-material = "^9.4.8"
29 | mkdocs-minify-plugin = "^0.7.1"
30 | mike = "^2.0.0"
31 | 
32 | [build-system]
33 | requires = ["poetry-core>=1.0.0"]
34 | build-backend = "poetry.core.masonry.api"
35 | 
36 | [tool.sqlfluff.core]
37 | templater = "dbt"
38 | dialect = "snowflake"
39 | sql_file_exts = ".sql,.sql.j2,.dml,.ddl"
40 | max_line_length = 120
41 | 
42 | [tool.sqlfluff.templater.dbt]
43 | profile = "data_diff"
44 | 
45 | [tool.sqlfluff.rules]
46 | allow_scalar = true
47 | single_table_references = "consistent"
48 | unquoted_identifiers_policy = "all"
49 | 
50 | [tool.sqlfluff.layout.type.comma]
51 | line_position = "leading"
52 | 
53 | [tool.sqlfluff.indentation]
54 | tab_space_size = 2
55 | indent_unit = "space"
56 | 
57 | [tool.sqlfluff.rules.capitalisation.keywords]
58 | capitalisation_policy = "lower"
59 | 
60 | [tool.sqlfluff.rules.capitalisation.functions]
61 | capitalisation_policy = "lower"
62 | 
63 | [tool.sqlfluff.templater]
64 | unwrap_wrapped_queries = true
65 | 
66 | [tool.sqlfluff.templater.jinja]
67 | apply_dbt_builtins = true
68 | 
69 | [tool.poe.tasks]
70 | git-hooks = { shell = "pre-commit install --install-hooks && pre-commit install --hook-type commit-msg" }
71 | format = [
72 |   {cmd = "dbt clean"},
73 |   {cmd = "dbt deps"},
74 |   {cmd = "sqlfluff format . --dialect snowflake"},
75 | ]
76 | lint = [
77 |   {cmd = "dbt clean"},
78 |   {cmd = "dbt deps"},
79 |   {cmd = "sqlfluff lint . --dialect snowflake"},
80 | ]
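# Typical local usage of these tasks (a sketch, mirroring what CI runs; it
# assumes `poetry install` has been run and Snowflake credentials are set up):
#   poetry run poe lint            # code-quality check, same as the PR pipeline
#   poetry run poe data-diff       # migrate, build blue/green, then run the diff
#   poetry run poe data-diff-test  # verify the diff results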
81 | data-diff-verify = [
82 |   {cmd = "dbt deps --project-dir integration_tests"},
83 |   {cmd = "dbt debug --project-dir integration_tests"},
84 | ]
85 | data-diff-migration = [
86 |   {cmd = "dbt run -s data_diff --full-refresh --vars '{fresh: true, data_diff__on_migration: true, data_diff__on_migration_data: true, data_diff__full_refresh: true}' --project-dir integration_tests"},
87 | ]
88 | data-diff-migration-4async = [
89 |   {cmd = "dbt run -s data_diff --full-refresh --vars '{data_diff__on_migration: true, data_diff__on_migration_data: true, data_diff__full_refresh: true, data_diff__auto_pipe: true}' --project-dir integration_tests"},
90 | ]
91 | data-diff-bg = [
92 |   {cmd = "dbt build -s example --project-dir integration_tests -t blue"},
93 |   {cmd = "dbt build -s example --project-dir integration_tests -t green"},
94 | ]
95 | data-diff-run = [
96 |   {cmd = "dbt run-operation data_diff__run --project-dir integration_tests"},
97 | ]
98 | data-diff-run-async = [
99 |   {cmd = "dbt run-operation data_diff__run_async --project-dir integration_tests"},
100 | ]
101 | data-diff-run-async-wait = [
102 |   {cmd = "dbt run-operation data_diff__run_async --project-dir integration_tests --args '{is_polling_status: true}'"},
103 | ]
104 | data-diff-test = [
105 |   {cmd = "dbt build --exclude example --project-dir integration_tests"},
106 | ]
107 | data-diff = [
108 |   {cmd = "poe data-diff-migration"},
109 |   {cmd = "poe data-diff-bg"},
110 |   {cmd = "poe data-diff-run"}
111 | ]
112 | data-diff-async-wait = [
113 |   {cmd = "poe data-diff-migration-4async"},
114 |   {cmd = "poe data-diff-bg"},
115 |   {cmd = "poe data-diff-run-async-wait"}
116 | ]
117 | build-doc = [
118 |   {cmd = "dbt deps --project-dir integration_tests"},
119 |   {cmd = "dbt docs generate --select package:data_diff --project-dir integration_tests"},
120 |   {cmd = "mkdocs build"}
121 | ]
122 | build-doc-and-serve = [
123 |   {cmd = "poe build-doc"},
124 |   {cmd = "mkdocs serve"}
125 | ]
126 | git-push-github = [
127 |   {cmd = "git remote set-url origin https://github.com/infinitelambda/dbt-data-diff.git"},
128 |   {cmd = "git push"},
129 |   {cmd = "git remote set-url origin git@gitlab.infinitelambda.com:infinitelambda/bi-chapter/dbt-data-diff.git"}
130 | ]
131 | 
--------------------------------------------------------------------------------