├── .dockerignore ├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── request-support-for-a-database.md └── workflows │ ├── ci.yml │ └── ci_full.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── dev ├── Dockerfile.prestosql.340 ├── _bq_import_csv.py ├── benchmark.sh ├── dev.env ├── graph.py ├── prepare_db.pql ├── prepare_db_gaps.pql ├── presto-conf │ └── standalone │ │ ├── catalog │ │ ├── jmx.properties │ │ ├── memory.properties │ │ ├── postgresql.properties │ │ ├── tpcds.properties │ │ └── tpch.properties │ │ ├── config.properties │ │ ├── jvm.config │ │ ├── log.properties │ │ └── node.properties └── trino-conf │ └── etc │ ├── catalog │ ├── jms.properties │ ├── memory.properties │ ├── postgresql.properties │ ├── tpcds.properties │ └── tpch.properties │ ├── config.properties │ ├── jvm.config │ └── node.properties ├── docker-compose.yml ├── docs ├── Makefile ├── conf.py ├── how-to-use.md ├── index.rst ├── install.md ├── make.bat ├── new-database-driver-guide.rst ├── python-api.rst ├── requirements.txt ├── supported-databases.md └── technical-explanation.md ├── poetry.lock ├── pyproject.toml ├── readthedocs.yml ├── reladiff ├── __init__.py ├── __main__.py ├── config.py ├── databases │ ├── __init__.py │ ├── _connect.py │ ├── base.py │ ├── bigquery.py │ ├── clickhouse.py │ ├── databricks.py │ ├── duckdb.py │ ├── mysql.py │ ├── oracle.py │ ├── postgresql.py │ ├── presto.py │ ├── redshift.py │ ├── snowflake.py │ ├── trino.py │ └── vertica.py ├── diff_tables.py ├── hashdiff_tables.py ├── info_tree.py ├── joindiff_tables.py ├── parse_time.py ├── query_utils.py ├── table_segment.py ├── thread_utils.py └── utils.py ├── reladiff_logo.svg └── tests ├── __init__.py ├── common.py ├── test_api.py ├── test_cli.py ├── test_config.py ├── test_database_types.py ├── test_diff_tables.py ├── test_joindiff.py ├── test_parse_time.py ├── test_postgresql.py └── waiting_for_stack_up.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | ml-25m* 3 | dev/ml-25m* 4 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig spec for a consistent cross-editor style. 2 | # Read more: https://EditorConfig.org 3 | 4 | root = true 5 | 6 | [*] 7 | end_of_line = lf # Unix-style newlines with a newline ending every file 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | # 4 space indentation 11 | indent_style = space 12 | indent_size = 4 13 | 14 | [*.{md,py}] 15 | charset = utf-8 16 | 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | Make sure to include the following (minus sensitive information): 14 | - The command or code you used 15 | - The run output + error you're getting. (including tracestack) 16 | - Run reladiff with the `-d` switch for extra debug information. 17 | 18 | If possible, please paste these as text, and not a screenshot. 
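For example, a command like the sketch below (a hypothetical invocation; replace the URIs and table names with your own, and redact any credentials) produces the kind of output that is useful to paste:

```bash
# Hypothetical example invocation; substitute your own connection URIs and table names.
# The -d switch enables debug logging, and `tee` keeps a copy of the full output to attach.
reladiff \
  "postgresql://user:REDACTED@localhost/mydb" my_table \
  "mysql://user:REDACTED@localhost/mydb" my_table \
  -k id -d 2>&1 | tee reladiff_debug.log
```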
19 | 20 | **Describe the environment** 21 | 22 | Describe which OS you're using, which reladiff version, and any other information that might be relevant to this bug. 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/request-support-for-a-database.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request support for a database 3 | about: 'Request a driver to support a new database ' 4 | title: 'Add support for ' 5 | labels: new-db-driver 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI-COVER-VERSIONS 2 | 3 | on: 4 | push: 5 | paths: 6 | - '**.py' 7 | - 'pyproject.toml' 8 | - '.github/workflows/**' 9 | - 'docker-compose.yml' 10 | - '!dev/**' 11 | - '!docs/**' 12 | pull_request: 13 | paths: 14 | - '**.py' 15 | - 'pyproject.toml' 16 | - '.github/workflows/**' 17 | - 'docker-compose.yml' 18 | - '!dev/**' 19 | - '!docs/**' 20 | branches: [ master ] 21 | 22 | workflow_dispatch: 23 | 24 | jobs: 25 | unit_tests: 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | os: [ubuntu-latest] 30 | python-version: 31 | - "3.8" 32 | - "3.9" 33 | - "3.10" 34 | - "3.11" 35 | - "3.12" 36 | 37 | name: Check Python ${{ matrix.python-version }} on ${{ matrix.os }} 38 | runs-on: ${{ matrix.os }} 39 | steps: 40 | - uses: actions/checkout@v3 41 | 42 | - name: Setup Python ${{ matrix.python-version }} 43 | uses: actions/setup-python@v3 44 | with: 45 | python-version: ${{ matrix.python-version }} 46 | 47 | - name: Build the stack 48 | run: docker compose up -d mysql postgres trino clickhouse vertica 49 | 50 | - name: Install Poetry 51 | run: pip install poetry 52 | 53 | - name: Install package 54 | run: "poetry install" 55 | 56 | # BigQuery start 57 | # - id: 'auth' 58 | # uses: 'google-github-actions/auth@v1' 59 | # with: 60 | # credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' 61 | 62 | # - name: 'Set up BigQuery Cloud SDK' 63 | # uses: 'google-github-actions/setup-gcloud@v1' 64 | 65 | # - name: 'Use gcloud CLI' 66 | # run: "gcloud config configurations list" 67 | 68 | # - name: "Install BigQuery for Python" 69 | # run: poetry add google-cloud-bigquery 70 | 71 | # BigQuery end 72 | 73 | - name: Run unit tests 74 | env: 75 | # SNOWFLAKE_URI: '${{ secrets.SNOWFLAKE_URI }}' 76 | # PRESTO_URI: '${{ secrets.PRESTO_URI }}' 77 | TRINO_URI: 'trino://postgres@127.0.0.1:8081/postgresql/public' 78 | # BIGQUERY_URI: '${{ 
secrets.BIGQUERY_URI }}' 79 | CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse' 80 | VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica' 81 | REDSHIFT_URI: '${{ secrets.REDSHIFT_URI }}' 82 | run: | 83 | chmod +x tests/waiting_for_stack_up.sh 84 | ./tests/waiting_for_stack_up.sh && TEST_ACROSS_ALL_DBS=0 poetry run unittest-parallel -j 16 85 | -------------------------------------------------------------------------------- /.github/workflows/ci_full.yml: -------------------------------------------------------------------------------- 1 | name: CI-COVER-DATABASES 2 | 3 | on: 4 | # push: 5 | # paths: 6 | # - '**.py' 7 | # - '.github/workflows/**' 8 | # - '!dev/**' 9 | pull_request: 10 | paths: 11 | - '**.py' 12 | - 'pyproject.toml' 13 | - 'poetry.lock' 14 | - '.github/workflows/**' 15 | - 'docker-compose.yml' 16 | - '!dev/**' 17 | - '!docs/**' 18 | 19 | branches: [ master ] 20 | workflow_dispatch: 21 | 22 | permissions: 23 | id-token: write # This is required for requesting the JWT 24 | contents: read # This is required for actions/checkout 25 | 26 | jobs: 27 | unit_tests: 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | os: [ubuntu-latest] 32 | python-version: 33 | - "3.10" 34 | 35 | name: Check Python ${{ matrix.python-version }} on ${{ matrix.os }} 36 | runs-on: ${{ matrix.os }} 37 | steps: 38 | - uses: actions/checkout@v3 39 | 40 | - name: Setup Python ${{ matrix.python-version }} 41 | uses: actions/setup-python@v3 42 | with: 43 | python-version: ${{ matrix.python-version }} 44 | 45 | - name: Build the stack 46 | run: docker compose up -d mysql postgres trino vertica # presto clickhouse 47 | 48 | - name: Install Poetry 49 | run: pip install poetry 50 | 51 | - name: Install package 52 | run: "poetry install" 53 | 54 | # BigQuery start 55 | # - id: 'auth' 56 | # uses: 'google-github-actions/auth@v1' 57 | # with: 58 | # credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' 59 | 60 | # - name: 'Set up BigQuery Cloud SDK' 61 | # uses: 'google-github-actions/setup-gcloud@v1' 62 | 63 | # - name: "Install BigQuery for Python" 64 | # run: poetry add google-cloud-bigquery 65 | 66 | # BigQuery end 67 | 68 | - name: Run unit tests 69 | env: 70 | TRINO_URI: 'trino://postgres@127.0.0.1:8081/postgresql/public' 71 | SNOWFLAKE_URI: '${{ secrets.SNOWFLAKE_URI }}' 72 | # PRESTO_URI: '${{ secrets.PRESTO_URI }}' 73 | # CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse' 74 | VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica' 75 | # BIGQUERY_URI: '${{ secrets.BIGQUERY_URI }}' 76 | REDSHIFT_URI: '${{ secrets.REDSHIFT_URI }}' 77 | run: | 78 | chmod +x tests/waiting_for_stack_up.sh 79 | ./tests/waiting_for_stack_up.sh && poetry run unittest-parallel -j 16 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos 
into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # dev 132 | ml-25m* 133 | ratings*.csv 134 | drive 135 | mysqltuner.pl 136 | benchmark_*.jsonl 137 | benchmark_*.png 138 | 139 | # Mac 140 | .DS_Store 141 | 142 | # IntelliJ 143 | .idea 144 | 145 | # VSCode 146 | .vscode 147 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | Treat everyone with respect and patience. 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Reladiff 2 | 3 | Contributions are very welcome! We'll be happy to help you in the process. 4 | 5 | ## What should I know before I get started? 6 | 7 | Go through the README and the documentation, and make sure that you understand how Reladiff works. 8 | 9 | ## How to contribute? 10 | 11 | ### Reporting bugs 12 | 13 | Please report the bug with as many details as you can. 14 | 15 | 1. Include the exact command that you used. Make sure to run Reladiff with the `-d` flag for debug output. 16 | 2. Provide the entire output of the command. (stdout, logs, exception) 17 | 3. If possible, show us how we could reproduce the bug. i.e. how to set up an environment in which it occurs. 18 | 19 | (When pasting, always make sure to redact sensitive information, like passwords.) 20 | 21 | If Reladiff returns incorrect results, i.e. false-positive or false-negative, please also include the original values. 22 | 23 | Before you report a bug, make sure it doesn't already exist. 24 | 25 | See [issues](/erezsh/reladiff/issues/). 
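If the bug is reproducible against the local docker-compose databases (described in the Development Setup section below), a short script like the following sketch makes it much easier for us to investigate. The table names here are placeholders; use whatever data triggers the problem:

```bash
# A minimal reproduction sketch, assuming the local docker-compose setup described below.
# Table names are hypothetical placeholders.
docker-compose up -d mysql postgres          # start the local test databases
poetry run reladiff \
  "postgresql://postgres:Password1@localhost/postgres" my_table \
  "mysql://mysql:Password1@localhost/mysql" my_table \
  -k id -d                                   # -d prints the debug info we need
```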
26 | 
27 | ### Suggesting Enhancements 
28 | 
29 | We are always interested to hear about how we can make Reladiff better! 
30 | 
31 | If you'd like us to support a new database, you should open an issue for it, if there isn't one already. If it already exists, make sure to vote for it with a :thumbsup:, to help us prioritize it. 
32 | 
33 | The same goes for other technical requests, like missing features, or gaps in the documentation. 
34 | 
35 | See [issues](/erezsh/reladiff/issues/). 
36 | 
37 | For questions, and non-technical discussions, see [discussions](https://github.com/erezsh/reladiff/discussions). 
38 | 
39 | ### Contributing code 
40 | 
41 | #### Code style 
42 | 
43 | All code should be formatted with `black -l 120`. 
44 | 
45 | When in doubt, use the existing code as a guideline, or ask. 
46 | 
47 | #### Get started (setup) 
48 | 
49 | To get started, first clone the repository. For example `git clone https://github.com/erezsh/reladiff`. 
50 | 
51 | Once inside, you can install the dependencies. 
52 | 
53 | - Option 1: Run `poetry install` to install them in a virtual env. You can then run Reladiff using `poetry run reladiff ...`. 
54 | 
55 | - Option 2: Run `pip install -e .` to install them, and Reladiff, in the global context. 
56 | 
57 | At the bare minimum, you need MySQL to run the tests. 
58 | 
59 | You can create a local MySQL instance using `docker-compose up mysql`. The URI for it will be `mysql://mysql:Password1@localhost/mysql`. If you're using a different server, make sure to update `TEST_MYSQL_CONN_STRING` in `tests/common.py`. For your convenience, we recommend creating `tests/local_settings.py` and overriding the value there. 
60 | 
61 | You can also run a few servers at once. For example `docker-compose up mysql postgres presto`. 
62 | 
63 | Make sure to update the appropriate `TEST_*_CONN_STRING`, so that it will be included in the tests. 
64 | 
65 | #### Run the tests 
66 | 
67 | You can run the tests with `unittest`. 
68 | 
69 | When running against multiple databases, the tests can take a long while. 
70 | 
71 | To save time, we recommend running them with `unittest-parallel`. 
72 | 
73 | When debugging, we recommend using the `-f` flag, to stop on the first error. Also, use the `-k` flag to run only the individual test that you're trying to fix. 
74 | 
75 | #### Implementing a new database 
76 | 
77 | New databases should be added as a new module in the `reladiff/databases/` folder. 
78 | 
79 | If possible, please also add the database setup to `docker-compose.yml`, so that we can run and test it for ourselves. If you do, also update the CI (`ci.yml`). 
80 | 
81 | Guide to implementing a new database driver: https://reladiff.readthedocs.io/en/latest/new-database-driver-guide.html 
82 | 
83 | ## Development Setup 
84 | 
85 | The development setup centers around using `docker-compose` to boot up various 
86 | databases, and then inserting data into them. 
87 | 
88 | On Mac, for better Docker performance, we suggest enabling the following in the Docker UI: 
89 | 
90 | * Use new Virtualization Framework 
91 | * Enable VirtioFS accelerated directory sharing 
92 | 
93 | **1. Install Reladiff** 
94 | 
95 | When developing/debugging, it's recommended to install dependencies and run it 
96 | directly with `poetry`, rather than going through the package. 
97 | 
98 | ``` 
99 | $ brew install mysql postgresql # MacOS dependencies for C bindings 
100 | $ apt-get install libpq-dev libmysqlclient-dev # Debian dependencies 
101 | $ pip install poetry # Python dependency isolation tool 
102 | $ poetry install # Install dependencies 
103 | ``` 
104 | **2. 
Start Databases** 105 | 106 | [Install **docker-compose**][docker-compose] if you haven't already. 107 | 108 | ```shell-session 109 | $ docker-compose up -d mysql postgres # run mysql and postgres dbs in background 110 | ``` 111 | 112 | [docker-compose]: https://docs.docker.com/compose/install/ 113 | 114 | **3. Run Unit Tests** 115 | 116 | There are more than 1000 tests for all the different type and database 117 | combinations, so we recommend using `unittest-parallel` that's installed as a 118 | development dependency. 119 | 120 | ```shell-session 121 | $ poetry run unittest-parallel -j 16 # run all tests 122 | $ poetry run python -m unittest -k # run individual test 123 | ``` 124 | 125 | **4. Seed the Database(s) (optional)** 126 | 127 | First, download the CSVs of seeding data: 128 | 129 | ```shell-session 130 | $ curl https://datafold-public.s3.us-west-2.amazonaws.com/1m.csv -o dev/ratings.csv 131 | # For a larger data-set (but takes 25x longer to import): 132 | # - curl https://datafold-public.s3.us-west-2.amazonaws.com/25m.csv -o dev/ratings.csv 133 | ``` 134 | 135 | Now you can insert it into the testing database(s): 136 | 137 | ```shell-session 138 | # It's optional to seed more than one to run reladiff(1) against. 139 | $ poetry run preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql 140 | $ poetry run preql -f dev/prepare_db.pql postgresql://postgres:Password1@127.0.0.1:5432/postgres 141 | # Cloud databases 142 | $ poetry run preql -f dev/prepare_db.pql snowflake:// 143 | $ poetry run preql -f dev/prepare_db.pql mssql:// 144 | $ poetry run preql -f dev/prepare_db.pql bigquery:/// 145 | ``` 146 | 147 | **5. Run **Reladiff** against seeded database (optional)** 148 | 149 | ```bash 150 | poetry run python3 -m reladiff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose 151 | ``` 152 | 153 | **6. Run benchmarks (optional)** 154 | 155 | ```shell-session 156 | $ dev/benchmark.sh # runs benchmarks and puts results in benchmark_.csv 157 | $ poetry run python3 dev/graph.py # create graphs from benchmark_*.csv files 158 | ``` 159 | 160 | You can adjust how many rows we benchmark with by passing `N_SAMPLES` to `dev/benchmark.sh`: 161 | 162 | ```shell-session 163 | $ N_SAMPLES=100000000 dev/benchmark.sh # 100m which is our canonical target 164 | ``` 165 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | RUN apt-get update && apt-get install -y \ 3 | python3-dev libpq-dev wget unzip \ 4 | python3-setuptools gcc bc 5 | RUN pip install --no-cache-dir poetry==1.1.13 6 | COPY . /app 7 | WORKDIR /app 8 | # For now while we are in heavy development we install the latest with Poetry 9 | # and execute directly with Poetry. Later, we'll move to the released Pip package. 
10 | RUN poetry install 11 | ENTRYPOINT ["poetry", "run", "python3", "-m", "reladiff"] 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 Erez Shinnan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | -- 21 | 22 | Copyright 2022 DataFold Inc. 23 | 24 | Permission is hereby granted, free of charge, to any person obtaining a copy of 25 | this software and associated documentation files (the "Software"), to deal in 26 | the Software without restriction, including without limitation the rights to 27 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 28 | the Software, and to permit persons to whom the Software is furnished to do so, 29 | subject to the following conditions: 30 | 31 | The above copyright notice and this permission notice shall be included in all 32 | copies or substantial portions of the Software. 33 | 34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 36 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 37 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 38 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 39 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](reladiff_logo.svg) 2 | 3 |   4 |
5 |
6 | **Reladiff** is a high-performance tool and library designed for diffing large datasets across databases. By executing the diff calculation within the database itself, Reladiff minimizes data transfer and achieves optimal performance. 
7 | 
8 | This tool is specifically tailored for data professionals, DevOps engineers, and system administrators. 
9 | 
10 | Reladiff is free, open-source, user-friendly, extensively tested, and delivers fast results, even at massive scale. 
11 | 
12 | ### Key Features: 
13 | 
14 | 1. **Cross-Database Diff**: Reladiff employs a divide-and-conquer algorithm, based on matching hashes, to efficiently identify modified segments and download only the necessary data for comparison. This approach ensures exceptional performance when differences are minimal. 
15 | 
16 | - ⇄ Diffs across over a dozen different databases (e.g. *PostgreSQL* -> *Snowflake*)! 
17 | 
18 | - 🧠 Gracefully handles reduced precision (e.g., timestamp(9) -> timestamp(3)) by rounding according to the database specification. 
19 | 
20 | - 🔥 Benchmarked to diff over 25M rows in under 10 seconds and over 1B rows in approximately 5 minutes, given no differences. 
21 | 
22 | - ♾️ Capable of handling tables with tens of billions of rows. 
23 | 
24 | 
25 | 2. **Intra-Database Diff**: When both tables reside in the same database, Reladiff compares them using a join operation, with additional optimizations for enhanced speed. 
26 | 
27 | - Supports materializing the diff into a local table. 
28 | - Can collect various extra statistics about the tables. 
29 | 
30 | 3. **Threaded**: Utilizes multiple threads to significantly boost performance during diffing operations. 
31 | 
32 | 4. **Configurable**: Offers numerous options for power-users to customize and optimize their usage. 
33 | 
34 | 5. **Automation-Friendly**: Outputs both JSON and git-like diffs (with + and -), facilitating easy integration into CI/CD pipelines. 
35 | 
36 | 6. **Over a dozen databases supported**. MySQL, Postgres, Snowflake, Bigquery, Oracle, Clickhouse, and more. [See full list](https://reladiff.readthedocs.io/en/latest/supported-databases.html) 
37 | 
38 | 
39 | Reladiff is a fork of an archived project called [data-diff](https://github.com/datafold/data-diff). 
40 | 
41 | ## Get Started 
42 | 
43 | [**🗎 Read the Documentation**](https://reladiff.readthedocs.io/en/latest/) - our detailed documentation has everything you need to start diffing. 
44 | 
45 | ## Quickstart 
46 | 
47 | For the impatient ;) 
48 | 
49 | ### Install 
50 | 
51 | Reladiff is available on [PyPI](https://pypi.org/project/reladiff/). You may install it by running: 
52 | 
53 | ``` 
54 | pip install reladiff 
55 | ``` 
56 | 
57 | Requires Python 3.8+ with pip. 
58 | 
59 | We advise installing it within a virtual-env. 
60 | 
61 | ### How to Use 
62 | 
63 | Once you've installed Reladiff, you can run it from the command-line: 
64 | 
65 | ```bash 
66 | # Cross-DB diff, using hashes 
67 | reladiff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS] 
68 | ``` 
69 | 
70 | When both tables belong to the same database, a shorter syntax is available: 
71 | 
72 | ```bash 
73 | # Same-DB diff, using outer join 
74 | reladiff DB1_URI TABLE1_NAME TABLE2_NAME [OPTIONS] 
75 | ``` 
76 | 
77 | Or, you can import and run it from Python: 
78 | 
79 | ```python 
80 | from typing import Literal 
81 | from reladiff import connect_to_table, diff_tables 
82 | 
83 | table1 = connect_to_table("postgresql:///", "table_name", "id") 
84 | table2 = connect_to_table("mysql:///", "table_name", "id") 
85 | 
86 | sign: Literal["+", "-"] 
87 | row: tuple[str, ...] 
87 | for sign, row in diff_tables(table1, table2): 88 | print(sign, row) 89 | ``` 90 | 91 | Read our detailed instructions: 92 | 93 | * [How to use from the shell / command-line](https://reladiff.readthedocs.io/en/latest/how-to-use.html#how-to-use-from-the-shell-or-command-line) 94 | * [How to use with TOML configuration file](https://reladiff.readthedocs.io/en/latest/how-to-use.html#how-to-use-with-a-configuration-file) 95 | * [How to use from Python](https://reladiff.readthedocs.io/en/latest/how-to-use.html#how-to-use-from-python) 96 | 97 | 98 | #### "Real-world" example: Diff "events" table between Postgres and Snowflake 99 | 100 | ``` 101 | reladiff \ 102 | postgresql:/// \ 103 | events \ 104 | "snowflake://:@//?warehouse=&role=" \ 105 | events \ 106 | -k event_id \ # Identifier of event 107 | -c event_data \ # Extra column to compare 108 | -w "event_time < '2024-10-10'" # Filter the rows on both dbs 109 | ``` 110 | 111 | #### "Real-world" example: Diff "events" and "old_events" tables in the same Postgres DB 112 | 113 | Materializes the results into a new table, containing the current timestamp in its name. 114 | 115 | ``` 116 | reladiff \ 117 | postgresql:/// events old_events \ 118 | -k org_id \ 119 | -c created_at -c is_internal \ 120 | -w "org_id != 1 and org_id < 2000" \ 121 | -m test_results_%t \ 122 | --materialize-all-rows \ 123 | --table-write-limit 10000 124 | ``` 125 | 126 | ### Technical Explanation 127 | 128 | Check out this [technical explanation](https://reladiff.readthedocs.io/en/latest/technical-explanation.html) of how cross-database reladiff works. 129 | 130 | ### We're here to help! 131 | 132 | * Confused? Got a cool idea? Just want to share your thoughts? Let's discuss it in [GitHub Discussions](https://github.com/erezsh/reladiff/discussions). 133 | 134 | * Did you encounter a bug? [Open an issue](https://github.com/erezsh/reladiff/issues). 135 | 136 | ## How to Contribute 137 | * Please read the [contributing guidelines](https://github.com/erezsh/reladiff/blob/master/CONTRIBUTING.md) to get started. 138 | * Feel free to open a new issue or work on an existing one. 139 | 140 | Big thanks to everyone who contributed so far: 141 | 142 | 143 | 144 | 145 | 146 | 147 | ## License 148 | 149 | This project is licensed under the terms of the [MIT License](https://github.com/erezsh/reladiff/blob/master/LICENSE). 
150 | -------------------------------------------------------------------------------- /dev/Dockerfile.prestosql.340: -------------------------------------------------------------------------------- 1 | FROM openjdk:11-jdk-slim-buster 2 | 3 | ENV PRESTO_VERSION=340 4 | ENV PRESTO_SERVER_URL=https://repo1.maven.org/maven2/io/prestosql/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz 5 | ENV PRESTO_CLI_URL=https://repo1.maven.org/maven2/io/prestosql/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar 6 | ENV PRESTO_HOME=/opt/presto 7 | ENV PATH=${PRESTO_HOME}/bin:${PATH} 8 | 9 | WORKDIR $PRESTO_HOME 10 | 11 | RUN set -xe \ 12 | && apt-get update \ 13 | && apt-get install -y curl less python \ 14 | && curl -sSL $PRESTO_SERVER_URL | tar xz --strip 1 \ 15 | && curl -sSL $PRESTO_CLI_URL > ./bin/presto \ 16 | && chmod +x ./bin/presto \ 17 | && apt-get remove -y curl \ 18 | && rm -rf /var/lib/apt/lists/* 19 | 20 | VOLUME /data 21 | 22 | EXPOSE 8080 23 | 24 | ENTRYPOINT ["launcher"] 25 | CMD ["run"] 26 | -------------------------------------------------------------------------------- /dev/_bq_import_csv.py: -------------------------------------------------------------------------------- 1 | from google.cloud import bigquery 2 | 3 | client = bigquery.Client() 4 | 5 | table_id = "reladiff-dev-2.reladiff.tmp_rating" 6 | dataset_name = "reladiff" 7 | 8 | client.create_dataset(dataset_name, exists_ok=True) 9 | 10 | job_config = bigquery.LoadJobConfig( 11 | source_format=bigquery.SourceFormat.CSV, 12 | skip_leading_rows=1, 13 | autodetect=True, 14 | ) 15 | 16 | with open("ratings.csv", "rb") as source_file: 17 | job = client.load_table_from_file(source_file, table_id, job_config=job_config) 18 | 19 | job.result() # Waits for the job to complete. 20 | 21 | table = client.get_table(table_id) # Make an API request. 
22 | print("Loaded {} rows and {} columns to {}".format(table.num_rows, len(table.schema), table_id)) 23 | -------------------------------------------------------------------------------- /dev/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | run_test() { 4 | N_SAMPLES=${N_SAMPLES:-1000000} N_THREADS=${N_THREADS:-16} LOG_LEVEL=${LOG_LEVEL:-info} BENCHMARK=1 \ 5 | poetry run python3 -m unittest tests/test_database_types.py -v -k $1 6 | } 7 | 8 | run_test "postgresql_int_mysql_int" 9 | run_test "mysql_int_mysql_int" 10 | run_test "postgresql_int_postgresql_int" 11 | run_test "postgresql_ts6_n_tz_mysql_ts0" 12 | run_test "postgresql_ts6_n_tz_snowflake_ts9" 13 | run_test "postgresql_int_presto_int" 14 | run_test "postgresql_int_redshift_int" 15 | run_test "postgresql_int_snowflake_int" 16 | run_test "postgresql_int_bigquery_int" 17 | run_test "snowflake_int_snowflake_int" 18 | 19 | poetry run python dev/graph.py 20 | -------------------------------------------------------------------------------- /dev/dev.env: -------------------------------------------------------------------------------- 1 | POSTGRES_USER=postgres 2 | POSTGRES_PASSWORD=Password1 3 | POSTGRES_DB=postgres 4 | 5 | MYSQL_DATABASE=mysql 6 | MYSQL_USER=mysql 7 | MYSQL_PASSWORD=Password1 8 | MYSQL_ROOT_PASSWORD=RootPassword1 9 | 10 | CLICKHOUSE_USER=clickhouse 11 | CLICKHOUSE_PASSWORD=Password1 12 | CLICKHOUSE_DB=clickhouse 13 | CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1 14 | 15 | # Vertica credentials 16 | APP_DB_USER=vertica 17 | APP_DB_PASSWORD=Password1 18 | VERTICA_DB_NAME=vertica 19 | 20 | # To prevent generating sample demo VMart data (more about it here https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/GettingStartedGuide/IntroducingVMart/IntroducingVMart.htm), 21 | # leave VMART_DIR and VMART_ETL_SCRIPT empty. 
22 | VMART_DIR= 23 | VMART_ETL_SCRIPT= 24 | -------------------------------------------------------------------------------- /dev/graph.py: -------------------------------------------------------------------------------- 1 | # Use this to graph the benchmarking results (see benchmark.sh) 2 | # 3 | # To run this: 4 | # - pip install pandas 5 | # - pip install plotly 6 | # 7 | 8 | import pandas as pd 9 | import plotly.graph_objects as go 10 | from reladiff.utils import number_to_human 11 | import glob 12 | 13 | for benchmark_file in glob.glob("benchmark_*.jsonl"): 14 | rows = pd.read_json(benchmark_file, lines=True) 15 | rows["cloud"] = rows["test"].str.match(r".*(snowflake|redshift|presto|bigquery)") 16 | sha = benchmark_file.split("_")[1].split(".")[0] 17 | print(f"Generating graphs from {benchmark_file}..") 18 | 19 | for n_rows, group in rows.groupby(["rows"]): 20 | image_path = f"benchmark_{sha}_{number_to_human(n_rows)}.png" 21 | print(f"\t rows: {number_to_human(n_rows)}, image: {image_path}") 22 | 23 | r = group.drop_duplicates(subset=["name_human"]) 24 | r = r.sort_values(by=["cloud", "source_type", "target_type", "name_human"]) 25 | 26 | fig = go.Figure( 27 | data=[ 28 | go.Bar( 29 | name="count(*)", 30 | x=r["name_human"], 31 | y=r["count_max_sec"], 32 | text=r["count_max_sec"], 33 | textfont=dict(color="blue"), 34 | ), 35 | go.Bar( 36 | name="reladiff (checksum)", 37 | x=r["name_human"], 38 | y=r["checksum_sec"], 39 | text=r["checksum_sec"], 40 | textfont=dict(color="red"), 41 | ), 42 | go.Bar( 43 | name="Download and compare †", 44 | x=r["name_human"], 45 | y=r["download_sec"], 46 | text=r["download_sec"], 47 | textfont=dict(color="green"), 48 | ), 49 | ] 50 | ) 51 | # Change the bar mode 52 | fig.update_layout(title=f"reladiff {number_to_human(n_rows)} rows, sha: {sha}") 53 | fig.update_traces(texttemplate="%{text:.1f}", textposition="outside") 54 | fig.update_layout(uniformtext_minsize=2, uniformtext_mode="hide") 55 | fig.update_yaxes(title="Time") 56 | fig.write_image(image_path, scale=2) 57 | -------------------------------------------------------------------------------- /dev/prepare_db.pql: -------------------------------------------------------------------------------- 1 | // This is a Preql file, used for setting up a database for development and testing 2 | // 3 | // In loads a "rating" dataset and generates a set of tables from it, with various modifications. 
4 | 5 | // Declare table & functions 6 | func run_sql(code) { 7 | print code 8 | force_eval( SQL( nulltype, code )) 9 | } 10 | 11 | func drop_table(t) { 12 | run_sql("DROP TABLE IF EXISTS " + get_qualified_name(t)) 13 | } 14 | 15 | func create_indices(tbl) { 16 | tbl.add_index("id", true) 17 | tbl.add_index("timestamp") 18 | tbl.add_index(["id", "timestamp"]) 19 | } 20 | 21 | DATASET = "reladiff" // For BigQuery 22 | if (db_type == "bigquery") { 23 | set_active_dataset(DATASET) 24 | } 25 | 26 | // Cleanup 27 | func cleanup() { 28 | drop_table("rating") 29 | drop_table("tmp_rating") 30 | drop_table("rating_del1") 31 | drop_table("rating_update1") 32 | drop_table("rating_update001p") 33 | drop_table("rating_update1p") 34 | drop_table("rating_del1p") 35 | drop_table("rating_update50p") 36 | commit() 37 | } 38 | 39 | cleanup() 40 | 41 | // Import CSV 42 | if (db_type == "snowflake" or db_type == "redshift") { 43 | if (db_type == "snowflake") { 44 | print "Uploading ratings CSV" 45 | 46 | run_sql("RM @~/ratings.csv.gz") 47 | run_sql("PUT file://dev/ratings.csv @~") 48 | 49 | print "Loading ratings CSV" 50 | 51 | bare table tmp_rating { 52 | userid: int 53 | movieid: int 54 | rating: float 55 | timestamp: int 56 | } 57 | 58 | run_sql("COPY INTO tmp_rating FROM '@~/ratings.csv.gz' file_format=(skip_header=1)") 59 | 60 | } else if (db_type == "redshift") { 61 | // NOTE: Requires that the csv already exists on s3 in the given path 62 | print "Loading ratings CSV (already uploaded)" 63 | 64 | table tmp_rating { 65 | userid: int 66 | movieid: int 67 | rating: float 68 | timestamp: int 69 | } 70 | 71 | run_sql(""" 72 | COPY "public"."tmp_rating" (userid, movieid, rating, timestamp) 73 | FROM 's3://dev-cf-redshift-datafold-reladiff/ml/ratings.csv' 74 | IAM_ROLE 'arn:aws:iam::760878568205:role/dev-cf-redshift-reladiff' 75 | DELIMITER ',' 76 | IGNOREHEADER 1; 77 | """) 78 | 79 | } 80 | 81 | table rating { 82 | id: int // explicit id, instead of identity type 83 | userid: int 84 | movieid: int 85 | rating: float 86 | timestamp: int 87 | } 88 | 89 | run_sql(""" 90 | INSERT INTO rating(id, userid, movieid, rating, timestamp) 91 | SELECT row_number() over (order by userid, movieid, timestamp) AS id, userid, movieid, rating, timestamp 92 | FROM tmp_rating 93 | """) 94 | 95 | } else if (db_type == "mssql") { 96 | run_sql("drop table if exists tmp_rating") 97 | run_sql("create table tmp_rating(userid int, movieid int, rating float, timestamp int)") 98 | table tmp_rating {...} 99 | print "Loading ratings CSV" 100 | run_sql("BULK INSERT tmp_rating from 'dev/ratings.csv' with (fieldterminator = ',', rowterminator = '0x0a', FIRSTROW = 2);") 101 | print "Populating actual table" 102 | rating += tmp_rating 103 | commit() 104 | } else if (db_type == "bigquery") { 105 | print "Importing the CSV through the Python script (BigQuery)" 106 | PY("0", "import _bq_import_csv") 107 | 108 | table rating { 109 | id: int // explicit id, to avoid identity type 110 | userid: int 111 | movieid: int 112 | rating: float 113 | timestamp: int 114 | } 115 | 116 | run_sql(""" 117 | INSERT INTO reladiff.rating(id, userid, movieid, rating, timestamp) 118 | SELECT row_number() over (order by userid, movieid, timestamp) AS id, userid, movieid, rating, timestamp FROM reladiff.tmp_rating 119 | """) 120 | 121 | } else { 122 | print "Importing ratings CSV" 123 | 124 | table rating { 125 | userid: int 126 | movieid: int 127 | rating: float 128 | timestamp: int 129 | } 130 | import_csv(rating, 'dev/ratings.csv', true) 131 | create_indices(rating) 
132 | } 133 | 134 | drop_table("tmp_rating") 135 | commit() 136 | 137 | middle = count(rating) /~ 2 138 | 139 | // Code notes: 140 | // - We use 'const table' to avoid updating the ids 141 | 142 | print "Create tables" 143 | const table rating_del1 = rating 144 | const table rating_update1 = rating 145 | const table rating_update001p = rating 146 | const table rating_update1p = rating 147 | const table rating_del1p = rating 148 | const table rating_update50p = rating 149 | 150 | print "Create indexes" 151 | 152 | create_indices(rating_del1) 153 | create_indices(rating_update1) 154 | create_indices(rating_update001p) 155 | create_indices(rating_update1p) 156 | create_indices(rating_del1p) 157 | create_indices(rating_update50p) 158 | commit() 159 | 160 | print "Alter tables" 161 | rating_del1[middle..(middle+1)] delete [true] 162 | assert count(rating) == count(rating_del1) + 1 163 | rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1} 164 | 165 | rating_update001p[random() < 0.0001] update {timestamp: timestamp + 1} 166 | rating_update1p[random() < 0.01] update {timestamp: timestamp + 1} 167 | rating_update50p[random() < 0.5] update {timestamp: timestamp + 1} 168 | rating_del1p[random() < 0.01] delete [true] 169 | 170 | commit() 171 | -------------------------------------------------------------------------------- /dev/prepare_db_gaps.pql: -------------------------------------------------------------------------------- 1 | // This is a Preql file, used for setting up a database for development and testing 2 | // 3 | // It generates tables with various gaps in them, based on the "rating" dataset. 4 | // Assumes prepare_db.pql has already been run. 5 | 6 | 7 | // Declare table & functions 8 | func run_sql(code) { 9 | print code 10 | force_eval( SQL( nulltype, code )) 11 | } 12 | 13 | func drop_table(t) { 14 | run_sql("DROP TABLE IF EXISTS " + t) 15 | } 16 | 17 | func create_indices(tbl) { 18 | tbl.add_index("id", true) 19 | tbl.add_index("timestamp") 20 | tbl.add_index(["id", "timestamp"]) 21 | } 22 | 23 | // Assumes prepare_db already ran 24 | table rating {...} 25 | 26 | drop_table("rating_gap1") 27 | drop_table("rating_gap2") 28 | drop_table("rating_gap3") 29 | drop_table("rating_gap1_update0001p") 30 | drop_table("rating_gap2_update0001p") 31 | drop_table("rating_gap3_update0001p") 32 | 33 | const table rating_gap1 = rating 34 | const table rating_gap2 = rating 35 | const table rating_gap3 = rating 36 | 37 | create_indices(rating_gap1) 38 | create_indices(rating_gap2) 39 | create_indices(rating_gap3) 40 | commit() 41 | 42 | table rating_gap1 { 43 | userid: int 44 | movieid: int 45 | rating: float 46 | timestamp: int 47 | } 48 | 49 | table rating_gap2 { 50 | userid: int 51 | movieid: int 52 | rating: float 53 | timestamp: int 54 | } 55 | 56 | table rating_gap3 { 57 | userid: int 58 | movieid: int 59 | rating: float 60 | timestamp: int 61 | } 62 | 63 | rating_gap3[id == 1000] update {id: 2147483548} 64 | 65 | // Create many small gaps, for testing low bisection thresholds 66 | run_sql("UPDATE rating_gap1 SET id = id * 1000 + 25000000 where 100000 < id and id <= 500000 ") 67 | 68 | // Create increasing gaps, to test many gaps of various sizes at once 69 | run_sql("UPDATE rating_gap2 SET id = cast(id*0.1*id as int) + 26000000 WHERE 10 < id and id < 100000") 70 | 71 | // Create one very big gap, to test empty scans and excessive bisection. 
72 | run_sql("INSERT INTO rating_gap3(id, userid, movieid, rating, timestamp) VALUES (2047483548, 1, 1, 5.0, 27)") 73 | commit() 74 | 75 | print "Create more tables" 76 | const table rating_gap1_update0001p = rating_gap1 77 | const table rating_gap2_update0001p = rating_gap2 78 | const table rating_gap3_update0001p = rating_gap3 79 | create_indices(rating_gap1_update0001p) 80 | create_indices(rating_gap2_update0001p) 81 | create_indices(rating_gap3_update0001p) 82 | 83 | rating_gap1_update0001p[random() < 0.000001] update {timestamp: timestamp + 1} 84 | rating_gap2_update0001p[random() < 0.000001] update {timestamp: timestamp + 1} 85 | rating_gap3_update0001p[random() < 0.000001] update {timestamp: timestamp + 1} 86 | rating_gap3[id == 100000] delete [true] 87 | commit() 88 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/jmx.properties: -------------------------------------------------------------------------------- 1 | connector.name=jmx 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/memory.properties: -------------------------------------------------------------------------------- 1 | connector.name=memory 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/postgresql.properties: -------------------------------------------------------------------------------- 1 | connector.name=postgresql 2 | connection-url=jdbc:postgresql://postgres:5432/postgres 3 | connection-user=postgres 4 | connection-password=Password1 5 | allow-drop-table=true 6 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/tpcds.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpcds 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/tpch.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpch 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | query.max-memory=5GB 5 | query.max-memory-per-node=1GB 6 | query.max-total-memory-per-node=2GB 7 | discovery-server.enabled=true 8 | discovery.uri=http://127.0.0.1:8080 9 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx16G 3 | -XX:+UseG1GC 4 | -XX:G1HeapRegionSize=32M 5 | -XX:+UseGCOverheadLimit 6 | -XX:+ExplicitGCInvokesConcurrent 7 | -XX:+HeapDumpOnOutOfMemoryError 8 | -XX:+ExitOnOutOfMemoryError 9 | -XX:OnOutOfMemoryError=kill -9 %p 10 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/log.properties: -------------------------------------------------------------------------------- 1 | com.facebook.presto=INFO 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/node.properties: -------------------------------------------------------------------------------- 1 | 
node.environment=production 2 | node.data-dir=/data 3 | node.id=standalone 4 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/jms.properties: -------------------------------------------------------------------------------- 1 | connector.name=jmx 2 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/memory.properties: -------------------------------------------------------------------------------- 1 | connector.name=memory 2 | memory.max-data-per-node=128MB 3 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/postgresql.properties: -------------------------------------------------------------------------------- 1 | connector.name=postgresql 2 | connection-url=jdbc:postgresql://postgres:5432/postgres 3 | connection-user=postgres 4 | connection-password=Password1 5 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/tpcds.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpcds 2 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/tpch.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpch 2 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | discovery.uri=http://localhost:8080 5 | discovery-server.enabled=true 6 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx1G 3 | -XX:-UseBiasedLocking 4 | -XX:+UseG1GC 5 | -XX:G1HeapRegionSize=32M 6 | -XX:+ExplicitGCInvokesConcurrent 7 | -XX:+HeapDumpOnOutOfMemoryError 8 | -XX:+UseGCOverheadLimit 9 | -XX:+ExitOnOutOfMemoryError 10 | -XX:ReservedCodeCacheSize=256M 11 | -Djdk.attach.allowAttachSelf=true 12 | -Djdk.nio.maxCachedBufferSize=2000000 -------------------------------------------------------------------------------- /dev/trino-conf/etc/node.properties: -------------------------------------------------------------------------------- 1 | node.environment=docker 2 | node.data-dir=/data/trino 3 | plugin.dir=/usr/lib/trino/plugin 4 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | postgres: 5 | container_name: dd-postgresql 6 | image: postgres:14.1-alpine 7 | # work_mem: less tmp files 8 | # maintenance_work_mem: improve table-level op perf 9 | # max_wal_size: allow more time before merging to heap 10 | command: > 11 | -c work_mem=1GB 12 | -c maintenance_work_mem=1GB 13 | -c max_wal_size=8GB 14 | restart: always 15 | volumes: 16 | - postgresql-data:/var/lib/postgresql/data:delegated 17 | ports: 18 | - '5432:5432' 19 | expose: 20 | - '5432' 21 | env_file: 22 | - dev/dev.env 23 | tty: true 24 | networks: 25 | - local 26 | 27 | mysql: 28 | container_name: dd-mysql 29 | image: mysql:oracle 30 | # fsync less aggressively for insertion perf for test setup 31 | 
command: > 32 | --binlog-cache-size=16M 33 | --key_buffer_size=0 34 | --max_connections=1000 35 | --innodb_flush_log_at_trx_commit=2 36 | --innodb_flush_log_at_timeout=10 37 | --innodb_log_compressed_pages=OFF 38 | --sync_binlog=0 39 | restart: always 40 | volumes: 41 | - mysql-data:/var/lib/mysql:delegated 42 | user: mysql 43 | ports: 44 | - '3306:3306' 45 | expose: 46 | - '3306' 47 | env_file: 48 | - dev/dev.env 49 | tty: true 50 | networks: 51 | - local 52 | 53 | clickhouse: 54 | container_name: dd-clickhouse 55 | image: clickhouse/clickhouse-server:21.12.3.32 56 | restart: always 57 | volumes: 58 | - clickhouse-data:/var/lib/clickhouse:delegated 59 | ulimits: 60 | nproc: 65535 61 | nofile: 62 | soft: 262144 63 | hard: 262144 64 | ports: 65 | - '8123:8123' 66 | - '9000:9000' 67 | expose: 68 | - '8123' 69 | - '9000' 70 | env_file: 71 | - dev/dev.env 72 | tty: true 73 | networks: 74 | - local 75 | 76 | # prestodb.dbapi.connect(host="127.0.0.1", user="presto").cursor().execute('SELECT * FROM system.runtime.nodes') 77 | presto: 78 | container_name: dd-presto 79 | build: 80 | context: ./dev 81 | dockerfile: ./Dockerfile.prestosql.340 82 | volumes: 83 | - ./dev/presto-conf/standalone:/opt/presto/etc:ro 84 | ports: 85 | - '8080:8080' 86 | tty: true 87 | networks: 88 | - local 89 | 90 | trino: 91 | container_name: dd-trino 92 | image: 'trinodb/trino:389' 93 | hostname: trino 94 | ports: 95 | - '8081:8080' 96 | volumes: 97 | - ./dev/trino-conf/etc:/etc/trino:ro 98 | networks: 99 | - local 100 | 101 | vertica: 102 | container_name: dd-vertica 103 | image: vertica/vertica-ce:12.0.0-0 104 | restart: always 105 | volumes: 106 | - vertica-data:/data:delegated 107 | ports: 108 | - '5433:5433' 109 | - '5444:5444' 110 | expose: 111 | - '5433' 112 | - '5444' 113 | env_file: 114 | - dev/dev.env 115 | tty: true 116 | networks: 117 | - local 118 | 119 | 120 | 121 | volumes: 122 | postgresql-data: 123 | mysql-data: 124 | clickhouse-data: 125 | vertica-data: 126 | 127 | networks: 128 | local: 129 | driver: bridge 130 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = reladiff 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Aug 16 13:09:41 2020. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 
12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath("..")) 24 | sys.path.append(os.path.abspath("./_ext")) 25 | autodoc_member_order = "bysource" 26 | 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = [ 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.napoleon", 40 | "sphinx.ext.coverage", 41 | "recommonmark", 42 | "sphinx_markdown_tables", 43 | "sphinx_copybutton", 44 | "enum_tools.autoenum", 45 | # 'sphinx_gallery.gen_gallery' 46 | ] 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ["_templates"] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = {".rst": "restructuredtext", ".md": "markdown"} 56 | 57 | 58 | # The master toctree document. 59 | master_doc = "index" 60 | 61 | # General information about the project. 62 | project = "reladiff" 63 | copyright = "Erez Shinan" 64 | author = "Erez Shinan" 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | version = "" 72 | # The full version, including alpha/beta/rc tags. 73 | release = "" 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = "en" 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | # This patterns also effect to html_static_path and html_extra_path 85 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 86 | 87 | # The name of the Pygments (syntax highlighting) style to use. 88 | pygments_style = "sphinx" 89 | 90 | # If true, `todo` and `todoList` produce output, else they produce nothing. 91 | todo_include_todos = False 92 | 93 | 94 | # -- Options for HTML output ---------------------------------------------- 95 | 96 | # The theme to use for HTML and HTML Help pages. See the documentation for 97 | # a list of builtin themes. 98 | # 99 | html_theme = "sphinx_rtd_theme" 100 | 101 | # Theme options are theme-specific and customize the look and feel of a theme 102 | # further. For a list of options available for each theme, see the 103 | # documentation. 104 | # 105 | # html_theme_options = {} 106 | 107 | # Add any paths that contain custom static files (such as style sheets) here, 108 | # relative to this directory. 
They are copied after the builtin static files, 109 | # so a file named "default.css" will overwrite the builtin "default.css". 110 | html_static_path = ["_static"] 111 | 112 | html_css_files = [ 113 | "custom.css", 114 | ] 115 | 116 | # Custom sidebar templates, must be a dictionary that maps document names 117 | # to template names. 118 | # 119 | # This is required for the alabaster theme 120 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 121 | html_sidebars = { 122 | "**": [ 123 | "relations.html", # needs 'show_related': True theme option to display 124 | "searchbox.html", 125 | ] 126 | } 127 | 128 | 129 | # -- Options for HTMLHelp output ------------------------------------------ 130 | 131 | # Output file base name for HTML help builder. 132 | htmlhelp_basename = "reladiffdoc" 133 | 134 | 135 | # -- Options for LaTeX output --------------------------------------------- 136 | 137 | latex_elements = { 138 | # The paper size ('letterpaper' or 'a4paper'). 139 | # 140 | # 'papersize': 'letterpaper', 141 | # The font size ('10pt', '11pt' or '12pt'). 142 | # 143 | # 'pointsize': '10pt', 144 | # Additional stuff for the LaTeX preamble. 145 | # 146 | # 'preamble': '', 147 | # Latex figure (float) alignment 148 | # 149 | # 'figure_align': 'htbp', 150 | } 151 | 152 | # Grouping the document tree into LaTeX files. List of tuples 153 | # (source start file, target name, title, 154 | # author, documentclass [howto, manual, or own class]). 155 | latex_documents = [ 156 | (master_doc, "Reladiff.tex", "Reladiff Documentation", "Erez Shinan", "manual"), 157 | ] 158 | 159 | 160 | # -- Options for manual page output --------------------------------------- 161 | 162 | # One entry per manual page. List of tuples 163 | # (source start file, name, description, authors, manual section). 164 | man_pages = [(master_doc, "Reladiff", "Reladiff Documentation", [author], 1)] 165 | 166 | 167 | # -- Options for Texinfo output ------------------------------------------- 168 | 169 | # Grouping the document tree into Texinfo files. List of tuples 170 | # (source start file, target name, title, author, 171 | # dir menu entry, description, category) 172 | texinfo_documents = [ 173 | ( 174 | master_doc, 175 | "Reladiff", 176 | "Reladiff Documentation", 177 | author, 178 | "Reladiff", 179 | "One line description of project.", 180 | "Miscellaneous", 181 | ), 182 | ] 183 | 184 | # -- Sphinx gallery config ------------------------------------------- 185 | 186 | # sphinx_gallery_conf = { 187 | # 'examples_dirs': ['../examples'], 188 | # 'gallery_dirs': ['examples'], 189 | # } 190 | -------------------------------------------------------------------------------- /docs/how-to-use.md: -------------------------------------------------------------------------------- 1 | # User guide 2 | 3 | Once you've [installed](https://reladiff.readthedocs.io/en/latest/install.html) Reladiff, you can run it from the command-line, or from Python. 
4 | 5 | ## How to use from the shell / command-line 6 | 7 | The basic syntax for reladiff is: 8 | 9 | ```bash 10 | # Cross-DB diff, using hashes 11 | reladiff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS] 12 | ``` 13 | 14 | When both tables belong to the same database, a shorter syntax is available: 15 | 16 | ```bash 17 | # Same-DB diff, using outer join 18 | reladiff DB_URI TABLE1_NAME TABLE2_NAME [OPTIONS] 19 | ``` 20 | 21 | `DB_URL` is either a [database URL](supported-databases.md), or the name of a database definition that is specified in a [configuration file](https://reladiff.readthedocs.io/en/latest/how-to-use.html#how-to-use-with-a-configuration-file). Our database URLs conform to the same format as SQLAlchemy. 22 | 23 | We recommend using a configuration file, with the ``--conf`` switch, to keep the command simple and manageable. 24 | 25 | For a list of example URLs, see [list of supported databases](supported-databases.md). 26 | 27 | Note: Because URLs allow many special characters, and may collide with the syntax of your shell, 28 | it's recommended to surround them with quotes. 29 | 30 | ### Options 31 | 32 | - `--help` - Show help message and exit. 33 | - `-k` or `--key-columns` - Name of the primary key column. If none provided, default is 'id'. Can be used more than once, for a compound key. 34 | - `-t` or `--update-column` - Name of updated_at/last_updated column 35 | - `-c` or `--columns` - Names of extra columns to compare. Can be used more than once in the same command. 36 | Accepts a name or a pattern like in SQL. 37 | Example: `-c col% -c another_col -c %foob.r%` 38 | - `-l` or `--limit` - Maximum number of differences to find (limits maximum bandwidth and runtime) 39 | - `-s` or `--stats` - Print stats instead of a detailed diff 40 | - `-d` or `--debug` - Print debug info 41 | - `-v` or `--verbose` - Print extra info 42 | - `-i` or `--interactive` - Confirm queries, implies `--debug` 43 | - `--json` - Print JSONL output for machine readability 44 | - `--skip-sort-results` - Skip sorting the hashdiff output by key for better performance. 45 | Entries with the same key but different column values may not appear adjacent in the output. 46 | - `--min-age` - Considers only rows older than specified. Useful for specifying replication lag. 47 | Example: `--min-age=5min` ignores rows from the last 5 minutes. 48 | Valid units: `d, days, h, hours, min, minutes, mon, months, s, seconds, w, weeks, y, years` 49 | - `--max-age` - Considers only rows younger than specified. See `--min-age`. 50 | - `-j` or `--threads` - Number of worker threads to use per database. Default=1. 51 | - `-w`, `--where` - An additional 'where' expression to restrict the search space. 52 | - `--allow-empty-tables` - Allows diffing on empty tables. Otherwise, we raise an error. 53 | - `--case-sensitive` - Column names are treated as case-sensitive. Otherwise, reladiff corrects their case according to schema. 54 | - `--conf`, `--run` - Specify the run and configuration from a TOML file. (see below) 55 | - `--bisection-threshold` - Minimal size of segment to be split. Smaller segments will be downloaded and compared locally. 56 | - `--bisection-factor` - Segments per iteration. When set to 2, it performs binary search. 57 | - `-m`, `--materialize` - Materialize the diff results into a new table in the database. 58 | If a table exists by that name, it will be replaced. 59 | Use `%t` in the name to place a timestamp. 
60 | Example: `-m test_mat_%t` 61 | - `--assume-unique-key` - Skip validating the uniqueness of the key column during joindiff, which is costly in non-cloud dbs. 62 | Also, disables support for duplicate rows in hashdiff, offering a small performance gain. 63 | - `--sample-exclusive-rows` - Sample several rows that only appear in one of the tables, but not the other. Use with `-s`. 64 | - `--materialize-all-rows` - Materialize every row, even if they are the same, instead of just the differing rows. 65 | - `--table-write-limit` - Maximum number of rows to write when creating materialized or sample tables, per thread. Default=1000. 66 | - `-a`, `--algorithm` `[auto|joindiff|hashdiff]` - Force algorithm choice 67 | 68 | 69 | ### How to use with a configuration file 70 | 71 | Reladiff lets you load the configuration for a run from a TOML file. 72 | 73 | **Reasons to use a configuration file:** 74 | 75 | - Convenience: Set up the parameters for diffs that need to run often. 76 | 77 | - Easier and more readable: You can define the database connection settings as separate config values, instead of in a single URI. 78 | 79 | - Gives you fine-grained control over the settings switches, without requiring any Python code. 80 | 81 | Use `--conf` to specify the path to the configuration file. Reladiff will load the settings from `run.default`, if it's defined. 82 | 83 | Then you can, optionally, use `--run` to load the settings of a specific run, which override the settings of `run.default`. (All runs extend `run.default`, like inheritance.) 84 | 85 | Finally, CLI switches have the final say, and will override the settings defined by the configuration file, and the current run. 86 | 87 | Example TOML file: 88 | 89 | ```toml 90 | # Specify the connection params to the test database. 91 | [database.test_postgresql] 92 | driver = "postgresql" 93 | user = "postgres" 94 | password = "Password1" 95 | 96 | # Specify the default run params 97 | [run.default] 98 | update_column = "timestamp" 99 | verbose = true 100 | 101 | # Specify params for a run 'test_diff'. 102 | [run.test_diff] 103 | verbose = false 104 | # Source 1 ("left") 105 | 1.database = "test_postgresql" # Use options from database.test_postgresql 106 | 1.table = "rating" 107 | # Source 2 ("right") 108 | 2.database = "postgresql://postgres:Password1@/" # Use URI like in the CLI 109 | 2.table = "rating_del1" 110 | ``` 111 | 112 | In this example, running `reladiff --conf myconfig.toml --run test_diff` will compare `rating` and `rating_del1`. 113 | It will use the `timestamp` column as the update column, as specified in `run.default`. However, it won't be verbose, since that 114 | flag is overridden to `false`. 115 | 116 | Running it with `reladiff --conf myconfig.toml --run test_diff -v` will set verbose back to `true`. 117 | 118 | 119 | ## How to use from Python 120 | 121 | Import the `reladiff` module, and use the following functions: 122 | 123 | - `connect_to_table()` to connect to a specific table in the database 124 | 125 | - `diff_tables()` to diff those tables 126 | 127 | 128 | Example: 129 | 130 | ```python 131 | # Optional: Set logging to display the progress of the diff 132 | import logging 133 | logging.basicConfig(level=logging.INFO) 134 | from typing import Literal, Tuple 135 | from reladiff import connect_to_table, diff_tables 136 | 137 | table1 = connect_to_table("postgresql:///", "table_name", "id") 138 | table2 = connect_to_table("mysql:///", "table_name", "id") 139 | 140 | sign: Literal["+", "-"] 141 | row: Tuple[str, ...]
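# diff_tables() yields (sign, row) pairs: '-' marks a row (or row version) found only in table1,
# and '+' marks one found only in table2.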
142 | for sign, row in diff_tables(table1, table2): 143 | print(sign, row) 144 | ``` 145 | 146 | To learn more about the different options, [read the API reference](https://reladiff.readthedocs.io/en/latest/python-api.html) or run `help(diff_tables)`. 147 | 148 | 149 | ## Tips 150 | 151 | - If you are only interested in whether something changed, i.e. a yes/no answer, set `--limit 1`. Reladiff will return as soon as it finds the first difference. 152 | 153 | - Ensure that you have indexes on the columns you are comparing. Preferably a compound index, if relevant. You can run with `--interactive` to see an EXPLAIN for the queries. 154 | 155 | - Setting a higher thread count may help performance significantly, depending on the database. For databases that limit concurrency per query, such as PostgreSQL/MySQL, this can improve performance dramatically. 156 | 157 | - A low `--bisection-threshold` will minimize the amount of network transfer. But if network isn't an issue, a high `--bisection-threshold` will make Reladiff run a lot faster. 158 | 159 | - If you run into timeouts for very large tables, try increasing the `--bisection-factor`. 160 | 161 | - The fewer columns you verify, the faster Reladiff will be. If you're only interested in additions/deletions, verifying the primary key could be enough. If you have an automatic `updated` column, it might be enough to capture changes, i.e. comparing all the data isn't always necessary. 162 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :maxdepth: 2 3 | :caption: Reference 4 | :hidden: 5 | 6 | install 7 | how-to-use 8 | supported-databases 9 | python-api 10 | technical-explanation 11 | new-database-driver-guide 12 | 13 | Reladiff 14 | ------------ 15 | 16 | **Reladiff** is a high-performance tool and library designed for diffing large datasets across databases. By executing the diff calculation within the database itself, Reladiff minimizes data transfer and achieves optimal performance. 17 | 18 | This tool is specifically tailored for data professionals, DevOps engineers, and system administrators. 19 | 20 | Reladiff is free, open-source, user-friendly, extensively tested, and delivers fast results, even at massive scale. 21 | 22 | Key Features 23 | ============ 24 | 25 | 1. **Cross-Database Diff**: *Reladiff* employs a divide-and-conquer algorithm, based on matching hashes, to efficiently identify modified segments and download only the necessary data for comparison. This approach ensures exceptional performance when differences are minimal. 26 | 27 | - ⇄ Diffs across over a dozen different databases (e.g. *PostgreSQL* -> *Snowflake*)! 28 | 29 | - 🧠 Gracefully handles reduced precision (e.g., timestamp(9) -> timestamp(3)) by rounding according to the database specification. 30 | 31 | - 🔥 Benchmarked to diff over 25M rows in under 10 seconds and over 1B rows in approximately 5 minutes, given no differences. 32 | 33 | - ♾️ Capable of handling tables with tens of billions of rows. 34 | 35 | 2. **Intra-Database Diff**: When both tables reside in the same database, Reladiff compares them using a join operation, with additional optimizations for enhanced speed. 36 | 37 | - Supports materializing the diff into a local table. 38 | - Can collect various extra statistics about the tables. 39 | 40 | 3. **Threaded**: Utilizes multiple threads to significantly boost performance during diffing operations. 
41 | 42 | 4. **Configurable**: Offers numerous options for power-users to customize and optimize their usage. 43 | 44 | 5. **Automation-Friendly**: Outputs both JSON and git-like diffs (with + and -), facilitating easy integration into CI/CD pipelines. 45 | 46 | 6. **Over a dozen databases supported**: MySQL, Postgres, Snowflake, Bigquery, Oracle, Clickhouse, and more. `See full list `_. 47 | 48 | Reladiff is a fork of an archived project called `data-diff `_. Code that worked with data-diff should also work with reladiff, without any changes. However, there are a few differences: Reladiff doesn't contain any tracking code. Reladiff doesn't have DBT integration. 49 | 50 | Resources 51 | --------- 52 | 53 | 54 | - User Documentation 55 | - :doc:`install` 56 | - :doc:`how-to-use` 57 | - :doc:`supported-databases` 58 | - :doc:`python-api` 59 | - :doc:`technical-explanation` 60 | - Contributor Documentation 61 | - :doc:`new-database-driver-guide` 62 | 63 | - Other links 64 | - Github: ``_ 65 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Installation Guide 2 | 3 | ## Install library and CLI (no drivers) 4 | 5 | Reladiff is available on [PyPI](https://pypi.org/project/reladiff/). You may install it by running: 6 | 7 | ```sh 8 | pip install reladiff 9 | ``` 10 | 11 | Requirements: Python 3.8+ with pip. 12 | 13 | ## Install with database drivers 14 | 15 | You may install the necessary database drivers, at the same time as when installing Reladiff, using pip's "extra" syntax. 16 | 17 | We advise to install Reladiff within a virtual-env, because the drivers may bring many dependencies. 18 | 19 | ```sh 20 | # Install all database drivers 21 | pip install reladiff[all] 22 | 23 | # The above line is equivalent to: 24 | pip install reladiff[duckdb,mysql,postgresql,snowflake,presto,oracle,trino,clickhouse,vertica] 25 | ``` 26 | 27 | You may remove any database you don't plan to use. 28 | 29 | For example, if you only want to diff between Postgresql and DuckDB, install Reladiff thusly: 30 | 31 | ```sh 32 | pip install reladiff[duckdb,postgresql] 33 | ``` 34 | 35 | ### Notes for shell / command-line 36 | 37 | In some shells, like `bash` and `powershell`, you will have to use quotes, in order to allow the `[]` syntax. 38 | 39 | For example: 40 | 41 | ```sh 42 | pip install 'reladiff[all]' # will work on bash 43 | pip install "reladiff[all]" # will work on powershell (Windows) 44 | ``` 45 | 46 | Consult your shell environment to learn the correct way to quote or escape your command. 47 | 48 | ### Notes for BigQuery 49 | 50 | Reladiff currently doesn't auto-install the BigQuery drivers. 51 | 52 | For BigQuery, see: [https://pypi.org/project/google-cloud-bigquery](https://pypi.org/project/google-cloud-bigquery) 53 | 54 | 55 | ### Another way to install all the drivers 56 | 57 | For your convenience, you may also run these commands one after the other. You may omit drivers that you don't plan to use. 
58 | 59 | ```bash 60 | pip install reladiff[duckdb] 61 | pip install reladiff[mysql] 62 | pip install reladiff[postgresql] 63 | pip install reladiff[snowflake] 64 | pip install reladiff[presto] 65 | pip install reladiff[oracle] 66 | pip install reladiff[trino] 67 | pip install reladiff[clickhouse] 68 | pip install reladiff[vertica] 69 | ``` 70 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=reladiff 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/new-database-driver-guide.rst: -------------------------------------------------------------------------------- 1 | How to implement a new database driver for Reladiff 2 | ==================================================== 3 | 4 | **This guide is out-of-date!** New databases should be added first in `Sqeleton `_. 5 | 6 | First, read through the `CONTRIBUTING.md `_ document. 7 | 8 | Make sure Reladiff is set up for development, and that all the tests pass (try to at least set it up for MySQL and PostgreSQL). 9 | 10 | Look at the other database drivers for examples and inspiration. 11 | 12 | 13 | 1. Add dependencies to ``pyproject.toml`` 14 | ----------------------------------------- 15 | 16 | Most new drivers will require a 3rd-party library in order to connect to the database. 17 | 18 | These dependencies should be specified in the ``pyproject.toml`` file, in ``[tool.poetry.extras]``. Example: 19 | 20 | :: 21 | 22 | [tool.poetry.extras] 23 | postgresql = ["psycopg2"] 24 | 25 | Then, users can install the dependencies needed for your database driver, with ``pip install 'reladiff[postgresql]'``. 26 | 27 | This way, Reladiff can support a wide variety of drivers, without requiring our users to install libraries that they won't use. 28 | 29 | 2. Implement a database module 30 | ------------------------------ 31 | 32 | New database modules belong in the ``reladiff/databases`` directory. 33 | 34 | The module consists of: 35 | 1. A Dialect class, responsible for normalizing/casting fields (e.g. numbers and timestamps). 36 | 2. A Database class that handles connecting to the DB, querying (if the default doesn't work), closing connections, etc. 37 | 38 | Choosing a base class, based on the threading model 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 40 | 41 | You can choose to inherit from either ``base.Database`` or ``base.ThreadedDatabase``.
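For illustration, a driver module built on ``ThreadedDatabase`` (discussed below) might be laid out roughly as follows. The names and attributes here are illustrative only; see the existing modules under ``reladiff/databases/`` and the sections below for the exact structure::

    from .base import ThreadedDatabase, import_helper

    @import_helper("mynewdb")
    def import_mynewdb():
        import mynewdb_driver  # hypothetical 3rd-party client library
        return mynewdb_driver

    class MyNewDB(ThreadedDatabase):
        TYPE_CLASSES = {...}        # maps schema type names to ColType subclasses (see TYPE_CLASSES below)
        ROUNDS_ON_PREC_LOSS = True  # True if the db rounds on precision loss, False if it truncates

        def create_connection(self):
            mynewdb = import_mynewdb()
            return mynewdb.connect(...)  # connect using the options given to __init__()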
42 | 43 | Usually, databases with cursor-based connections, like MySQL or Postgresql, only allow connections to be used by the thread that created them. In order to support multithreading, we implement them by inheriting from ``ThreadedDatabase``, which holds a pool of worker threads, and creates a new connection per thread. 44 | 45 | Usually, cloud databases, such as Snowflake and BigQuery, open a new connection per request, and support simultaneous queries from any number of threads. In other words, they already support multithreading, so we can implement them by inheriting directly from ``Database``. 46 | 47 | Import on demand 48 | ~~~~~~~~~~~~~~~~~ 49 | 50 | Database drivers should not import any 3rd party library at the module level. 51 | 52 | Instead, they should be imported and initialized within a function. Example: 53 | 54 | :: 55 | 56 | from .base import import_helper 57 | 58 | @import_helper("postgresql") 59 | def import_postgresql(): 60 | import psycopg2 61 | import psycopg2.extras 62 | 63 | psycopg2.extensions.set_wait_callback(psycopg2.extras.wait_select) 64 | return psycopg2 65 | 66 | We use the ``import_helper()`` decorator to provide a uniform and informative error. The string argument should be the name of the package, as written in ``pyproject.toml``. 67 | 68 | :meth:`_query()` 69 | ~~~~~~~~~~~~~~~~~~ 70 | 71 | All queries to the database pass through ``_query()``. It takes SQL code, and returns a list of rows. Here is its signature: 72 | 73 | :: 74 | 75 | def _query(self, sql_code: str) -> list: ... 76 | 77 | For standard cursor connections, it's sufficient to implement it with a call to ``base._query_conn()``, like: 78 | 79 | :: 80 | return _query_conn(self._conn, sql_code) 81 | 82 | 83 | :meth:`select_table_schema()` / :meth:`query_table_schema()` 84 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 85 | 86 | If your database does not have a ``information_schema.columns`` table, or if its structure is unusual, you may have to implement your own ``select_table_schema()`` function, which returns the query needed to return column information in the form of a list of tuples, where each tuple is `column_name, data_type, datetime_precision, numeric_precision, numeric_scale`. 87 | 88 | If such a query isn't possible, you may have to implement ``query_table_schema()`` yourself, which extracts this information from the database, and returns it in the proper form. 89 | 90 | If the information returned from ``query_table_schema()`` requires slow or error-prone post-processing, you may delay that post-processing by overriding ``_process_table_schema()`` and implementing it there. The method ``_process_table_schema()`` only gets called for the columns that will be diffed. 91 | 92 | Documentation: 93 | 94 | - :meth:`reladiff.databases.database_types.AbstractDatabase.select_table_schema` 95 | 96 | - :meth:`reladiff.databases.database_types.AbstractDatabase.query_table_schema` 97 | 98 | :data:`TYPE_CLASSES` 99 | ~~~~~~~~~~~~~~~~~~~~~~ 100 | 101 | Each database class must have a ``TYPE_CLASSES`` dictionary, which maps between the string data-type, as returned by querying the table schema, into the appropriate Reladiff type class, i.e. a subclass of ``database_types.ColType``. 102 | 103 | :data:`ROUNDS_ON_PREC_LOSS` 104 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 105 | 106 | When providing a datetime or a timestamp to a database, the database may lower its precision to correspond with the target column type. 
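For example, a value written with microsecond precision into a ``timestamp(3)`` column may come back as either of the following, depending on the database::

    2022-06-01 12:34:56.123987  ->  2022-06-01 12:34:56.123   (truncated)
    2022-06-01 12:34:56.123987  ->  2022-06-01 12:34:56.124   (rounded)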
107 | 108 | Some databases will lower the precision of timestamp/datetime values by truncating them, and some by rounding them. 109 | 110 | ``ROUNDS_ON_PREC_LOSS`` should be ``True`` if the database rounds, or ``False`` if it truncates. 111 | 112 | :meth:`__init__`, :meth:`create_connection()` 113 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 114 | 115 | The options for the database connection will be given to the ``__init__()`` method as keywords. 116 | 117 | If you inherit from ``Database``, your ``__init__()`` method may create the database connection. 118 | 119 | If you inherit from ``ThreadedDatabase``, you should instead create the connection in the ``create_connection()`` method. 120 | 121 | :meth:`close()` 122 | ~~~~~~~~~~~~~~~~ 123 | 124 | If you inherit from ``Database``, you will need to implement this method to close the connection yourself. 125 | 126 | If you inherit from ``ThreadedDatabase``, you don't have to implement this method. 127 | 128 | Docs: 129 | 130 | - :meth:`reladiff.databases.database_types.AbstractDatabase.close` 131 | 132 | :meth:`quote()`, :meth:`to_string()` 133 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 134 | 135 | These methods are used when creating queries, to quote a value, or cast it to STRING/VARCHAR. 136 | 137 | For more information, read their docs: 138 | 139 | - :meth:`reladiff.databases.database_types.AbstractDatabase.quote` 140 | 141 | - :meth:`reladiff.databases.database_types.AbstractDatabase.to_string` 142 | 143 | :meth:`normalize_number()`, :meth:`normalize_timestamp()`, :meth:`md5_to_int()` 144 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 145 | Because comparing data between two databases requires the data to be in the same format, we provide normalization functions. 146 | 147 | Databases can have the same data in different formats, e.g. ``DECIMAL`` vs ``FLOAT`` vs ``VARCHAR``, with different precisions. 148 | Reladiff works by converting the values to ``VARCHAR`` and comparing them. 149 | Your ``normalize_number()``/``normalize_timestamp()`` functions should account for differing precisions between columns. 150 | 151 | These functions accept an SQL code fragment, and return a new code fragment representing the appropriate computation. 152 | 153 | :meth:`parse_type` 154 | ~~~~~~~~~~~~~~~~~~~~~~~~ 155 | 156 | This is used to parse types that the system cannot detect automatically. 157 | For example: 158 | ``DECIMAL(10,3)`` needs to be parsed with a custom algorithm, e.g. a regex that splits it into the type name, precision, and scale. 159 | 160 | 3. Debugging 161 | ----------------------- 162 | 163 | You can enable debug logging for tests by setting the logger level to ``DEBUG``, via the environment variable ``LOG_LEVEL``, or the ``LOG_LEVEL`` variable in /tests/common.py. 164 | This will display all the queries that are run, and the type detected for each column. 165 | 166 | 4. Add tests 167 | -------------- 168 | 169 | Add your new database to the ``DATABASE_TYPES`` dict in ``tests/test_database_types.py``. 170 | 171 | The key is the class itself, and the value is a dict of ``{category: [type1, type2, ...]}``. 172 | 173 | Categories supported are: ``int``, ``datetime``, ``float``, and ``uuid``. 174 | 175 | Example: 176 | 177 | :: 178 | 179 | DATABASE_TYPES = { 180 | ...
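        # A hypothetical entry for a new driver might look like:
        # db.MyNewDB: {
        #     "int": ["integer", "bigint"],
        #     "datetime": ["timestamp"],
        # },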
181 | db.PostgreSQL: { 182 | "int": [ "int", "bigint" ], 183 | "datetime": [ 184 | "timestamp(6) without time zone", 185 | "timestamp(3) without time zone", 186 | "timestamp(0) without time zone", 187 | "timestamp with time zone", 188 | ], 189 | ... 190 | }, 191 | 192 | 193 | Then run the tests and make sure your database driver is being tested. 194 | 195 | You can run the tests with ``unittest``. 196 | 197 | To save time, we recommend running them with ``unittest-parallel``. 198 | 199 | When debugging, we recommend using the ``-f`` flag, to stop on error. Also, use the ``-k`` flag to run only the individual test that you're trying to fix. 200 | 201 | 5. Create Pull-Request 202 | ----------------------- 203 | 204 | Open a pull-request on GitHub, and we'll take it from there! 205 | -------------------------------------------------------------------------------- /docs/python-api.rst: -------------------------------------------------------------------------------- 1 | Python API Reference 2 | ==================== 3 | 4 | .. py:module:: reladiff 5 | 6 | .. autofunction:: connect 7 | 8 | .. autofunction:: connect_to_table 9 | 10 | .. autofunction:: diff_tables 11 | 12 | .. autoclass:: HashDiffer 13 | :members: __init__, diff_tables 14 | 15 | .. autoclass:: JoinDiffer 16 | :members: __init__, diff_tables 17 | 18 | .. autoclass:: TableSegment 19 | :members: __init__, get_values, choose_checkpoints, segment_by_checkpoints, count, count_and_checksum, is_bounded, new, with_schema 20 | 21 | .. autoclass:: DiffResultWrapper 22 | :members: __iter__, close, get_stats_dict, get_stats_string 23 | 24 | .. autoclass:: reladiff.databases.database_types.AbstractDatabase 25 | :members: 26 | 27 | .. autoclass:: reladiff.databases.database_types.AbstractDialect 28 | :members: 29 | 30 | .. autodata:: DbKey 31 | .. autodata:: DbTime 32 | .. autodata:: DbPath 33 | .. autoenum:: Algorithm 34 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # https://docs.readthedocs.io/en/stable/guides/specifying-dependencies.html#specifying-a-requirements-file 2 | sphinx-gallery 3 | sphinx_markdown_tables 4 | sphinx-copybutton 5 | sphinx-rtd-theme 6 | recommonmark 7 | enum-tools[sphinx] 8 | 9 | reladiff 10 | -------------------------------------------------------------------------------- /docs/supported-databases.md: -------------------------------------------------------------------------------- 1 | # List of supported databases 2 | 3 | | Database | Status | Connection string | 4 | |---------------|-------------------------------------------------------------------------------------------------------------------------------------|--------| 5 | | PostgreSQL >=10 | 💚 | `postgresql://:@:5432/` | 6 | | MySQL | 💚 | `mysql://:@:5432/` | 7 | | Snowflake | 💚 | `"snowflake://[:]@//?warehouse=&role=[&authenticator=externalbrowser]"` | 8 | | Redshift | 💚 | `redshift://:@:5439/` | 9 | | DuckDB >= 0.6 | 💚 | `duckdb://` | 10 | | Trino | 💚 | `trino://:@:8080/` | 11 | | BigQuery | 💛 | `bigquery:///` | 12 | | Oracle | 💛 | `oracle://:@/database` | 13 | | Presto | 💛 | `presto://:@:8080/` | 14 | | Vertica | 💛 | `vertica://:@:5433/` | 15 | | Clickhouse | 💛 | `clickhouse://:@:9000/` | 16 | | Databricks | 💛 | `databricks://:@//` | 17 | | SQLite | 📝 | | 18 | 19 | * 💚: Implemented and thoroughly tested. 20 | * 💛: Implemented, but not thoroughly tested yet. 21 | * ⏳: Implementation in progress.
22 | * 📝: Implementation planned. Contributions welcome. 23 | 24 | 25 | #### Looking for a database not on the list? 26 | If a database is not on the list, we'd still love to support it. [Please open an issue](https://github.com/erezsh/reladiff/issues) to discuss it, or vote on existing requests to push them up our todo list. 27 | 28 | We also accept pull-requests! 29 | -------------------------------------------------------------------------------- /docs/technical-explanation.md: -------------------------------------------------------------------------------- 1 | # Technical explanation 2 | 3 | Reladiff can diff tables within the same database, or across different databases. 4 | 5 | **Same-DB Diff:** 6 | - Uses an outer-join to diff the rows as efficiently and accurately as possible. 7 | - Supports materializing the diff results to a database table. 8 | - Can also collect various extra statistics about the tables. 9 | 10 | **Cross-DB Diff:** Employs a divide and conquer algorithm based on hashing, optimized for few changes. 11 | 12 | The following is a technical explanation of the cross-db diff. 13 | 14 | ### Overview 15 | 16 | Reladiff divides the table into smaller segments and computes checksums for each segment in both databases. If the checksums for a segment do not match, it further subdivides that segment and continues checksumming until it identifies the differing row(s). 17 | 18 | This approach has performance within an order of magnitude of count(*) when there are few/no changes, but is able to output each differing row! By pushing the compute into the databases, it's much faster than querying for and comparing every row. 19 | 20 | ![Performance for 100M rows](https://user-images.githubusercontent.com/97400/175182987-a3900d4e-c097-4732-a4e9-19a40fac8cdc.png) 21 | 22 | **†:** The performance for downloading rows is fairly driver-specific. In our tests, PostgreSQL performed 10x 23 | better than MySQL. 24 | 25 | ### Deep Dive 26 | 27 | In this section we'll be doing a walk-through of exactly how Reladiff 28 | works, and how to tune `--bisection-factor` and `--bisection-threshold`. 29 | 30 | Let's consider a scenario with an `orders` table with 1M rows. Fivetran is 31 | replicating it contionously from PostgreSQL to Snowflake: 32 | 33 | ``` 34 | ┌─────────────┐ ┌─────────────┐ 35 | │ PostgreSQL │ │ Snowflake │ 36 | ├─────────────┤ ├─────────────┤ 37 | │ │ │ │ 38 | │ │ │ │ 39 | │ │ ┌─────────────┐ │ table with │ 40 | │ table with ├──┤ replication ├──────▶│ ?maybe? all │ 41 | │lots of rows!│ └─────────────┘ │ the same │ 42 | │ │ │ rows. │ 43 | │ │ │ │ 44 | │ │ │ │ 45 | │ │ │ │ 46 | └─────────────┘ └─────────────┘ 47 | ``` 48 | 49 | In order to check whether the two tables are the same, Reladiff splits 50 | the table into segemnts. We define `--bisection-factor=10`, so it will start with 10 segments. 51 | 52 | We also have to choose which columns we want to checksum. In our case, we care 53 | about the primary key, `--key-column=id` and the update column 54 | `--update-column=updated_at`. `updated_at` is updated every time the row is, and 55 | we have an index on it. 56 | 57 | Reladiff starts by querying both databases for the `min(id)` and `max(id)` 58 | of the table. 
Then it splits the table into `--bisection-factor=10` segments of 59 | `1M/10 = 100K` keys each: 60 | 61 | ``` 62 | ┌──────────────────────┐ ┌──────────────────────┐ 63 | │ PostgreSQL │ │ Snowflake │ 64 | ├──────────────────────┤ ├──────────────────────┤ 65 | │ id=1..100k │ │ id=1..100k │ 66 | ├──────────────────────┤ ├──────────────────────┤ 67 | │ id=100k..200k │ │ id=100k..200k │ 68 | ├──────────────────────┤ ├──────────────────────┤ 69 | │ id=200k..300k ├─────────────▶│ id=200k..300k │ 70 | ├──────────────────────┤ ├──────────────────────┤ 71 | │ id=300k..400k │ │ id=300k..400k │ 72 | ├──────────────────────┤ ├──────────────────────┤ 73 | │ ... │ │ ... │ 74 | ├──────────────────────┤ ├──────────────────────┤ 75 | │ 900k..100k │ │ 900k..100k │ 76 | └───────────────────▲──┘ └▲─────────────────────┘ 77 | ┃ ┃ 78 | ┃ ┃ 79 | ┃ checksum queries ┃ 80 | ┃ ┃ 81 | ┌─┻──────────────────┻────┐ 82 | │ Reladiff │ 83 | └─────────────────────────┘ 84 | ``` 85 | 86 | Now Reladiff will start running `--threads=1` queries in parallel that 87 | checksum each segment. The queries for checksumming each segment will look 88 | something like this, depending on the database: 89 | 90 | ```sql 91 | SELECT count(*), 92 | sum(cast(conv(substring(md5(concat(cast(id as char), cast(updated_at as char))), 18), 16, 10) as unsigned)) 93 | FROM `rating_del1` 94 | WHERE (id >= 1) AND (id < 100000) 95 | ``` 96 | 97 | This keeps the amount of data that has to be transferred between the databases 98 | to a minimum, making it very performant! Additionally, if you have an index on 99 | `updated_at` (highly recommended), then the query will be fast, as the database 100 | only has to do a partial index scan between `id=1..100k`. 101 | 102 | If you are not sure whether the queries are using an index, you can run it with 103 | `--interactive`. This puts Reladiff in interactive mode, where it shows an 104 | `EXPLAIN` before executing each query, requiring confirmation to proceed. 105 | 106 | After running the checksum queries on both sides, we see that all segments 107 | are the same except `id=100k..200k`: 108 | 109 | ``` 110 | ┌──────────────────────┐ ┌──────────────────────┐ 111 | │ PostgreSQL │ │ Snowflake │ 112 | ├──────────────────────┤ ├──────────────────────┤ 113 | │ checksum=0102 │ │ checksum=0102 │ 114 | ├──────────────────────┤ mismatch! ├──────────────────────┤ 115 | │ checksum=ffff ◀──────────────▶ checksum=aaab │ 116 | ├──────────────────────┤ ├──────────────────────┤ 117 | │ checksum=abab │ │ checksum=abab │ 118 | ├──────────────────────┤ ├──────────────────────┤ 119 | │ checksum=f0f0 │ │ checksum=f0f0 │ 120 | ├──────────────────────┤ ├──────────────────────┤ 121 | │ ... │ │ ... │ 122 | ├──────────────────────┤ ├──────────────────────┤ 123 | │ checksum=9494 │ │ checksum=9494 │ 124 | └──────────────────────┘ └──────────────────────┘ 125 | ``` 126 | 127 | Now Reladiff will do exactly as it just did for the _whole table_ for only 128 | this segment: Split it into `--bisection-factor` segments. 129 | 130 | However, this time, because each segment has `100k/10=10k` entries, which is 131 | less than the `--bisection-threshold`, it will pull down every row in the segment 132 | and compare them in memory in Reladiff. 
133 | 134 | ``` 135 | ┌──────────────────────┐ ┌──────────────────────┐ 136 | │ PostgreSQL │ │ Snowflake │ 137 | ├──────────────────────┤ ├──────────────────────┤ 138 | │ id=100k..110k │ │ id=100k..110k │ 139 | ├──────────────────────┤ ├──────────────────────┤ 140 | │ id=110k..120k │ │ id=110k..120k │ 141 | ├──────────────────────┤ ├──────────────────────┤ 142 | │ id=120k..130k │ │ id=120k..130k │ 143 | ├──────────────────────┤ ├──────────────────────┤ 144 | │ id=130k..140k │ │ id=130k..140k │ 145 | ├──────────────────────┤ ├──────────────────────┤ 146 | │ ... │ │ ... │ 147 | ├──────────────────────┤ ├──────────────────────┤ 148 | │ 190k..200k │ │ 190k..200k │ 149 | └──────────────────────┘ └──────────────────────┘ 150 | ``` 151 | 152 | Finally Reladiff will output the `(id, updated_at)` for each row that was different: 153 | 154 | ``` 155 | (122001, 1653672821) 156 | ``` 157 | 158 | If you pass `--stats` you'll see stats such as the % of rows were different. 159 | 160 | ### Performance Considerations 161 | 162 | * Ensure that you have indexes on the columns you are comparing. Preferably a 163 | compound index. You can run with `--interactive` to see an `EXPLAIN` for the 164 | queries. 165 | * Consider increasing the number of simultaneous threads executing 166 | queries per database with `--threads`. For databases that limit concurrency 167 | per query, such as PostgreSQL/MySQL, this can improve performance dramatically. 168 | * If you are only interested in _whether_ something changed, pass `--limit 1`. 169 | This can be useful if changes are very rare. This is often faster than doing a 170 | `count(*)`, for the reason mentioned above. 171 | * If the table is _very_ large, consider a larger `--bisection-factor`. Otherwise, you may run into timeouts. 172 | * If there are a lot of changes, consider a larger `--bisection-threshold`. 173 | * If there are very large gaps in your key column (e.g., 10s of millions of 174 | continuous rows missing), then Reladiff may perform poorly, doing lots of 175 | queries for ranges of rows that do not exist. We have ideas on how to tackle this issue, which we have yet to implement. If you're experiencing this effect, please open an issue, and we 176 | will prioritize it. 177 | * The fewer columns you verify (passed with `--columns`), the faster 178 | Reladiff will be. On one extreme, you can verify every column; on the 179 | other, you can verify _only_ `updated_at`, if you trust it enough. You can also 180 | _only_ verify `id` if you're interested in only presence, such as to detect 181 | missing hard deletes. You can do also do a hybrid where you verify 182 | `updated_at` and the most critical value, such as a money value in `amount`, but 183 | not verify a large serialized column like `json_settings`. 184 | * We have ideas for making Reladiff even faster that 185 | we haven't implemented yet: faster checksums by reducing type-casts 186 | and using a faster hash than MD5, dynamic adaptation of 187 | `bisection_factor`/`threads`/`bisection_threshold` (especially with large key 188 | gaps), and improvements to bypass Python/driver performance limitations when 189 | comparing huge amounts of rows locally (i.e. for very high `bisection_threshold` values). 
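To tie the walkthrough above together, here is a simplified sketch of the cross-DB (hashdiff) recursion in Python. It is illustrative only: `fetch()` and `checksum()` are stand-ins for the real queries, keys are assumed to be dense integers with the key in the first column, and the actual implementation (see `reladiff/hashdiff_tables.py`) also handles threading, compound keys, duplicate rows, and non-integer keys.

```python
def diff_segment(table1, table2, min_key, max_key, bisection_factor, bisection_threshold):
    """Recursively compare the key range [min_key, max_key), yielding ('-'/'+', row) pairs."""
    approx_size = max_key - min_key
    if approx_size <= bisection_threshold:
        # Small enough: download both segments and compare them locally.
        rows1 = {row[0]: row for row in table1.fetch(min_key, max_key)}
        rows2 = {row[0]: row for row in table2.fetch(min_key, max_key)}
        for key in rows1.keys() | rows2.keys():
            if rows1.get(key) != rows2.get(key):
                if key in rows1:
                    yield ("-", rows1[key])
                if key in rows2:
                    yield ("+", rows2[key])
        return

    # Otherwise, checksum each of the `bisection_factor` sub-segments on both sides,
    # and only recurse into the sub-segments whose checksums don't match.
    step = -(-approx_size // bisection_factor)  # ceiling division
    for lo in range(min_key, max_key, step):
        hi = min(lo + step, max_key)
        if table1.checksum(lo, hi) != table2.checksum(lo, hi):
            yield from diff_segment(table1, table2, lo, hi, bisection_factor, bisection_threshold)
```

With the numbers from the example above (1M rows, `--bisection-factor=10`, and a `--bisection-threshold` larger than 10k), the recursion stops after two levels: the first level checksums 100k-row segments, and the single mismatching segment is split into 10k-row segments that are downloaded and compared directly.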
190 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "reladiff" 3 | version = "0.6.0" 4 | description = "Command-line tool and Python library to efficiently diff rows across two different databases." 5 | authors = ["Erez Shinan "] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/erezsh/reladiff" 9 | documentation = "https://reladiff.readthedocs.io/en/latest/" 10 | classifiers = [ 11 | "Intended Audience :: Developers", 12 | "Intended Audience :: Information Technology", 13 | "Intended Audience :: System Administrators", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: 3.12", 19 | "Development Status :: 4 - Beta", 20 | "Environment :: Console", 21 | "Topic :: Database :: Database Engines/Servers", 22 | "Typing :: Typed" 23 | ] 24 | packages = [{ include = "reladiff" }] 25 | 26 | [tool.poetry.dependencies] 27 | python = "^3.8" 28 | runtype = ">=0.5.0" 29 | dsnparse = "*" 30 | click = ">=8.1" 31 | rich = "*" 32 | toml = ">=0.10.2" 33 | sqeleton = "^0.1.7" 34 | mysql-connector-python = {version=">=8.0.29", optional=true} 35 | psycopg2-binary = {version="*", optional=true} 36 | snowflake-connector-python = {version=">=2.7.2", optional=true} 37 | cryptography = {version="*", optional=true} 38 | trino = {version=">=0.314.0", optional=true} 39 | presto-python-client = {version="*", optional=true} 40 | clickhouse-driver = {version="*", optional=true} 41 | duckdb = {version=">=0.6.0", optional=true} 42 | 43 | [tool.poetry.dev-dependencies] 44 | parameterized = "*" 45 | unittest-parallel = "*" 46 | # preql = ">=0.2.19" 47 | mysql-connector-python = "*" 48 | psycopg2-binary = "*" 49 | snowflake-connector-python = ">=2.7.2" 50 | cryptography = "*" 51 | trino = ">=0.314.0" 52 | presto-python-client = "*" 53 | clickhouse-driver = "*" 54 | vertica-python = "*" 55 | duckdb = ">=0.6.0" 56 | # google-cloud-bigquery = "*" 57 | # databricks-sql-connector = "*" 58 | 59 | [tool.poetry.extras] 60 | # When adding, update also: README + dev deps just above 61 | preql = ["preql"] 62 | mysql = ["mysql-connector-python"] 63 | postgresql = ["psycopg2-binary"] 64 | snowflake = ["snowflake-connector-python", "cryptography"] 65 | presto = ["presto-python-client"] 66 | oracle = ["cx_Oracle"] 67 | # databricks = ["databricks-sql-connector"] 68 | trino = ["trino"] 69 | clickhouse = ["clickhouse-driver"] 70 | vertica = ["vertica-python"] 71 | duckdb = ["duckdb"] 72 | 73 | all = ["mysql-connector-python", "psycopg2-binary", "snowflake-connector-python", "cryptography", "presto-python-client", "cx_Oracle", "trino", "clickhouse-driver", "vertica-python", "duckdb"] 74 | 75 | [build-system] 76 | requires = ["poetry-core>=1.0.0"] 77 | build-backend = "poetry.core.masonry.api" 78 | 79 | [tool.poetry.scripts] 80 | reladiff = 'reladiff.__main__:main' 81 | 82 | [tool.mypy] 83 | no_implicit_optional=false 84 | 85 | [tool.ruff] 86 | line-length = 120 87 | 88 | [tool.black] 89 | line-length = 120 90 | target-version = ['py38'] 91 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | formats: all 4 | 5 | build: 6 | os: 
ubuntu-22.04 7 | tools: 8 | python: "3.8" 9 | 10 | python: 11 | install: 12 | - requirements: docs/requirements.txt 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | -------------------------------------------------------------------------------- /reladiff/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, Iterable, Optional, Union 2 | 3 | from sqeleton.abcs import DbTime, DbPath, AbstractDatabase 4 | 5 | from .databases import connect 6 | from .diff_tables import Algorithm, TableDiffer, DiffResultWrapper 7 | from .hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR 8 | from .joindiff_tables import JoinDiffer, TABLE_WRITE_LIMIT 9 | from .table_segment import TableSegment 10 | from .utils import eval_name_template, Vector 11 | 12 | __version__ = "0.6.0" 13 | 14 | 15 | def connect_to_table( 16 | db_info: Union[str, dict, AbstractDatabase], 17 | table_name: Union[DbPath, str], 18 | key_columns: Union[str, Sequence[str]] = ("id",), 19 | thread_count: Optional[int] = 1, 20 | **kwargs, 21 | ) -> TableSegment: 22 | """Connects to the given database, and creates a TableSegment instance 23 | 24 | Parameters: 25 | db_info: Either a URI string, dict of connection options or a reladiff AbstractDatabase type. 26 | table_name: Name of the table as a string, or a tuple that signifies the path. 27 | key_columns: Names of the key columns 28 | thread_count: Number of threads for this connection (only if using a threadpooled db implementation) 29 | 30 | See Also: 31 | :meth:`connect` 32 | """ 33 | if isinstance(key_columns, str): 34 | key_columns = (key_columns,) 35 | if isinstance(db_info, AbstractDatabase): 36 | db = db_info 37 | else: 38 | db = connect(db_info, thread_count=thread_count) 39 | 40 | if isinstance(table_name, str): 41 | table_name = db.parse_table_name(table_name) 42 | 43 | return TableSegment(db, tuple(table_name), tuple(key_columns), **kwargs) 44 | 45 | 46 | def diff_tables( 47 | table1: TableSegment, 48 | table2: TableSegment, 49 | *, 50 | # Name of the key column, which uniquely identifies each row (usually id) 51 | key_columns: Sequence[str] = None, 52 | # Name of updated column, which signals that rows changed (usually updated_at or last_update) 53 | update_column: str = None, 54 | # Extra columns to compare 55 | extra_columns: Tuple[str, ...] = None, 56 | # Start/end key_column values, used to restrict the segment 57 | min_key: Vector = None, 58 | max_key: Vector = None, 59 | # Start/end update_column values, used to restrict the segment 60 | min_update: DbTime = None, 61 | max_update: DbTime = None, 62 | # Enable/disable threaded diffing. Needed to take advantage of database threads. 63 | threaded: bool = True, 64 | # Maximum size of each threadpool. None = auto. Only relevant when threaded is True. 65 | # There may be many pools, so number of actual threads can be a lot higher. 66 | max_threadpool_size: Optional[int] = 1, 67 | # Algorithm 68 | algorithm: Algorithm = Algorithm.AUTO, 69 | # An additional 'where' expression to restrict the search space. 
70 | where: str = None, 71 | # Into how many segments to bisect per iteration (hashdiff only) 72 | bisection_factor: int = DEFAULT_BISECTION_FACTOR, 73 | # When should we stop bisecting and compare locally (in row count; hashdiff only) 74 | bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD, 75 | # Enable/disable validating that the key columns are unique. (joindiff only) 76 | # Enable/disable support for duplicate rows, offering a small performance gain. (hashdiff only) 77 | validate_unique_key: bool = True, 78 | # Enable/disable sampling of exclusive rows. Creates a temporary table. (joindiff only) 79 | sample_exclusive_rows: bool = False, 80 | # Path of new table to write diff results to. Disabled if not provided. (joindiff only) 81 | materialize_to_table: Union[str, DbPath] = None, 82 | # Materialize every row, not just those that are different. (joindiff only) 83 | materialize_all_rows: bool = False, 84 | # Maximum number of rows to write when materializing, per thread. (joindiff only) 85 | table_write_limit: int = TABLE_WRITE_LIMIT, 86 | # If false, diffing on empty tables raises an EmptyTable(ValueError) exception. 87 | allow_empty_tables: bool = False, 88 | # Skip sorting the hashdiff output by key for better performance. (hashdiff only) 89 | skip_sort_results: bool = False, 90 | ) -> DiffResultWrapper: 91 | """Finds the diff between table1 and table2. 92 | 93 | Parameters: 94 | key_columns (Tuple[str, ...]): Name of the key column, which uniquely identifies each row (usually id) 95 | update_column (str, optional): Name of updated column, which signals that rows changed. 96 | Usually updated_at or last_update. Used by `min_update` and `max_update`. 97 | extra_columns (Tuple[str, ...], optional): Extra columns to compare 98 | min_key (:data:`Vector`, optional): Lowest key value, used to restrict the segment 99 | max_key (:data:`Vector`, optional): Highest key value, used to restrict the segment 100 | min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment 101 | max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment 102 | threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads. 103 | max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto. 104 | Only relevant when `threaded` is ``True``. 105 | There may be many pools, so number of actual threads can be a lot higher. 106 | (Note: For best performance, we recommend setting this to at least twice the 107 | `thread_count` argument provided to the driver through `connect()`/`connect_to_table()`. 108 | where (str, optional): An additional 'where' expression to restrict the search space. 109 | algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`. Default=`AUTO`) 110 | bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`) 111 | bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download 112 | and compare locally. (Used when algorithm is `HASHDIFF`). 113 | validate_unique_key (bool): Enable/disable validating that the key columns are unique (`JOINDIFF`). 114 | Enable/disable support for duplicate rows, offering a small performance gain (`HASHDIFF`). 115 | (default: True) 116 | Single query, and can't be threaded, so it's very slow on non-cloud dbs. 117 | Future versions will detect UNIQUE constraints in the schema. 
118 | sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False) 119 | materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. Used for `JOINDIFF`. 120 | materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False) 121 | table_write_limit (int): Maximum number of rows to write when materializing, per thread. 122 | allow_empty_tables (bool): If false, diffing on empty tables raises an EmptyTable(ValueError) exception. 123 | skip_sort_results (bool): Skip sorting the hashdiff output by key for better performance. (used for `HASHDIFF`. default: False) 124 | 125 | Note: 126 | The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances: 127 | `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`, `where`. 128 | If different values are needed per table, it's possible to omit them here, and instead set 129 | them directly when creating each :class:`TableSegment`. 130 | 131 | Note: 132 | It is recommended to call .close() on the returned object when done, to release thread-pool. Alternatively, you may use it as a context manager. 133 | 134 | Example: 135 | >>> table1 = connect_to_table('postgresql:///', 'Rating', 'id') 136 | >>> list(diff_tables(table1, table1)) 137 | [] 138 | >>> with diff_tables(table1, table1) as diff: 139 | ... print(list(diff)) 140 | [] 141 | 142 | See Also: 143 | :class:`TableSegment` 144 | :class:`HashDiffer` 145 | :class:`JoinDiffer` 146 | 147 | """ 148 | if isinstance(key_columns, str): 149 | key_columns = (key_columns,) 150 | 151 | tables = [table1, table2] 152 | override_attrs = { 153 | k: v 154 | for k, v in dict( 155 | key_columns=key_columns, 156 | update_column=update_column, 157 | extra_columns=extra_columns, 158 | min_key=min_key, 159 | max_key=max_key, 160 | min_update=min_update, 161 | max_update=max_update, 162 | where=where, 163 | ).items() 164 | if v is not None 165 | } 166 | 167 | segments = [t.new(**override_attrs) for t in tables] if override_attrs else tables 168 | 169 | algorithm = Algorithm(algorithm) 170 | if algorithm == Algorithm.AUTO: 171 | algorithm = Algorithm.JOINDIFF if table1.database is table2.database else Algorithm.HASHDIFF 172 | 173 | differ: TableDiffer 174 | if algorithm == Algorithm.HASHDIFF: 175 | differ = HashDiffer( 176 | bisection_factor=bisection_factor, 177 | bisection_threshold=bisection_threshold, 178 | threaded=threaded, 179 | max_threadpool_size=max_threadpool_size, 180 | allow_empty_tables=allow_empty_tables, 181 | skip_sort_results=skip_sort_results, 182 | ) 183 | elif algorithm == Algorithm.JOINDIFF: 184 | if isinstance(materialize_to_table, str): 185 | materialize_to_table = table1.database.parse_table_name(eval_name_template(materialize_to_table)) 186 | differ = JoinDiffer( 187 | threaded=threaded, 188 | max_threadpool_size=max_threadpool_size, 189 | validate_unique_key=validate_unique_key, 190 | sample_exclusive_rows=sample_exclusive_rows, 191 | materialize_to_table=materialize_to_table, 192 | materialize_all_rows=materialize_all_rows, 193 | table_write_limit=table_write_limit, 194 | allow_empty_tables=allow_empty_tables, 195 | ) 196 | else: 197 | raise ValueError(f"Unknown algorithm: {algorithm}") 198 | 199 | return differ.diff_tables(*segments) 200 | -------------------------------------------------------------------------------- 
/reladiff/config.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from typing import Any, Dict 4 | import toml 5 | 6 | 7 | class ConfigParseError(Exception): 8 | pass 9 | 10 | 11 | def is_uri(s: str) -> bool: 12 | return "://" in s 13 | 14 | 15 | def _apply_config(config: Dict[str, Any], run_name: str, kw: Dict[str, Any]): 16 | _resolve_env(config) 17 | 18 | # Load config 19 | databases = config.pop("database", {}) 20 | runs = config.pop("run", {}) 21 | if config: 22 | raise ConfigParseError(f"Unknown option(s): {config}") 23 | 24 | # Init run_args 25 | run_args = runs.get("default") or {} 26 | if run_name: 27 | if run_name not in runs: 28 | raise ConfigParseError(f"Cannot find run '{run_name}' in configuration.") 29 | run_args.update(runs[run_name]) 30 | else: 31 | run_name = "default" 32 | 33 | if kw.get("database1") is not None: 34 | for attr in ("table1", "database2", "table2"): 35 | if kw[attr] is None: 36 | raise ValueError(f"Specified database1 but not {attr}. Must specify all 4 arguments, or neither.") 37 | 38 | for index in "12": 39 | run_args[index] = {attr: kw.pop(f"{attr}{index}") for attr in ("database", "table")} 40 | 41 | # Process databases + tables 42 | for index in "12": 43 | try: 44 | args = run_args.pop(index) 45 | except KeyError: 46 | raise ConfigParseError( 47 | f"Could not find source #{index}: Expecting a key of '{index}' containing '.database' and '.table'." 48 | ) 49 | for attr in ("database", "table"): 50 | if attr not in args: 51 | raise ConfigParseError(f"Running 'run.{run_name}': Connection #{index} is missing attribute '{attr}'.") 52 | 53 | database = args.pop("database") 54 | table = args.pop("table") 55 | threads = args.pop("threads", None) 56 | if args: 57 | raise ConfigParseError(f"Unexpected attributes for connection #{index}: {args}") 58 | 59 | if not is_uri(database): 60 | if database not in databases: 61 | raise ConfigParseError( 62 | f"Database '{database}' not found in list of databases. Available: {list(databases)}." 63 | ) 64 | database = dict(databases[database]) 65 | assert isinstance(database, dict) 66 | if "driver" not in database: 67 | raise ConfigParseError(f"Database '{database}' did not specify a driver.") 68 | 69 | run_args[f"database{index}"] = database 70 | run_args[f"table{index}"] = table 71 | if threads is not None: 72 | run_args[f"threads{index}"] = int(threads) 73 | 74 | # Update keywords 75 | new_kw = dict(kw) # Set defaults 76 | new_kw.update(run_args) # Apply config 77 | new_kw.update({k: v for k, v in kw.items() if v}) # Apply non-empty defaults 78 | 79 | new_kw["__conf__"] = run_args 80 | 81 | return new_kw 82 | 83 | 84 | # There are no strict requirements for the environment variable name format. 85 | # But most shells only allow alphanumeric characters and underscores. 86 | # https://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html 87 | # "Environment variable names (...) consist solely of uppercase letters, digits, and the '_' (underscore)" 88 | _ENV_VAR_PATTERN = r"\$\{([A-Za-z0-9_]+)\}" 89 | 90 | 91 | def _resolve_env(config: Dict[str, Any]): 92 | """ 93 | Resolve environment variables referenced as ${ENV_VAR_NAME}. 94 | Missing environment variables are replaced with an empty string. 
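    For example, `password = "${MY_PASSWORD}"` in the TOML config resolves to the
    value of the MY_PASSWORD environment variable, or to "" if it is not set.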
95 | """ 96 | for key, value in config.items(): 97 | if isinstance(value, dict): 98 | _resolve_env(value) 99 | elif isinstance(value, str): 100 | config[key] = re.sub(_ENV_VAR_PATTERN, _replace_match, value) 101 | 102 | 103 | def _replace_match(match: re.Match) -> str: 104 | # Lookup referenced variable in environment. 105 | # Replace with empty string if not found 106 | referenced_var = match.group(1) # group(0) is the whole string 107 | return os.environ.get(referenced_var, "") 108 | 109 | 110 | def apply_config_from_file(path: str, run_name: str, kw: Dict[str, Any]): 111 | with open(path) as f: 112 | return _apply_config(toml.load(f), run_name, kw) 113 | 114 | 115 | def apply_config_from_string(toml_config: str, run_name: str, kw: Dict[str, Any]): 116 | return _apply_config(toml.loads(toml_config), run_name, kw) 117 | -------------------------------------------------------------------------------- /reladiff/databases/__init__.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, QueryError, ConnectError 2 | 3 | from .postgresql import PostgreSQL 4 | from .mysql import MySQL 5 | from .oracle import Oracle 6 | from .snowflake import Snowflake 7 | from .bigquery import BigQuery 8 | from .redshift import Redshift 9 | from .presto import Presto 10 | from .databricks import Databricks 11 | from .trino import Trino 12 | from .clickhouse import Clickhouse 13 | from .vertica import Vertica 14 | from .duckdb import DuckDB 15 | 16 | from ._connect import connect 17 | -------------------------------------------------------------------------------- /reladiff/databases/_connect.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from sqeleton.databases import Connect 4 | 5 | from .postgresql import PostgreSQL 6 | from .mysql import MySQL 7 | from .oracle import Oracle 8 | from .snowflake import Snowflake 9 | from .bigquery import BigQuery 10 | from .redshift import Redshift 11 | from .presto import Presto 12 | from .databricks import Databricks 13 | from .trino import Trino 14 | from .clickhouse import Clickhouse 15 | from .vertica import Vertica 16 | from .duckdb import DuckDB 17 | 18 | 19 | DATABASE_BY_SCHEME = { 20 | "postgresql": PostgreSQL, 21 | "mysql": MySQL, 22 | "oracle": Oracle, 23 | "redshift": Redshift, 24 | "snowflake": Snowflake, 25 | "presto": Presto, 26 | "bigquery": BigQuery, 27 | "databricks": Databricks, 28 | "duckdb": DuckDB, 29 | "trino": Trino, 30 | "clickhouse": Clickhouse, 31 | "vertica": Vertica, 32 | } 33 | 34 | 35 | class Connect_SetUTC(Connect): 36 | __doc__ = Connect.__call__.__doc__ 37 | 38 | def _connection_created(self, db): 39 | db = super()._connection_created(db) 40 | try: 41 | db.query(db.dialect.set_timezone_to_utc()) 42 | except NotImplementedError: 43 | logging.debug( 44 | f"Database '{db}' does not allow setting timezone. We recommend making sure it's set to 'UTC'." 
45 | ) 46 | return db 47 | 48 | 49 | connect = Connect_SetUTC(DATABASE_BY_SCHEME) 50 | -------------------------------------------------------------------------------- /reladiff/databases/base.py: -------------------------------------------------------------------------------- 1 | from sqeleton.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue 2 | 3 | 4 | class ReladiffDialect(AbstractMixin_MD5, AbstractMixin_NormalizeValue): 5 | pass 6 | -------------------------------------------------------------------------------- /reladiff/databases/bigquery.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import bigquery 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(bigquery.Dialect, bigquery.Mixin_MD5, bigquery.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class BigQuery(bigquery.BigQuery): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/clickhouse.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import clickhouse 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(clickhouse.Dialect, clickhouse.Mixin_MD5, clickhouse.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Clickhouse(clickhouse.Clickhouse): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/databricks.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import databricks 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(databricks.Dialect, databricks.Mixin_MD5, databricks.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Databricks(databricks.Databricks): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/duckdb.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import duckdb 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(duckdb.Dialect, duckdb.Mixin_MD5, duckdb.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class DuckDB(duckdb.DuckDB): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/mysql.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import mysql 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(mysql.Dialect, mysql.Mixin_MD5, mysql.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class MySQL(mysql.MySQL): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/oracle.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import oracle 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(oracle.Dialect, oracle.Mixin_MD5, oracle.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Oracle(oracle.Oracle): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/postgresql.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import postgresql as pg 2 | from 
.base import ReladiffDialect 3 | 4 | 5 | class PostgresqlDialect(pg.PostgresqlDialect, pg.Mixin_MD5, pg.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class PostgreSQL(pg.PostgreSQL): 10 | dialect = PostgresqlDialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/presto.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import presto 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(presto.Dialect, presto.Mixin_MD5, presto.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Presto(presto.Presto): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/redshift.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import redshift 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(redshift.Dialect, redshift.Mixin_MD5, redshift.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Redshift(redshift.Redshift): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/snowflake.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import snowflake 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(snowflake.Dialect, snowflake.Mixin_MD5, snowflake.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Snowflake(snowflake.Snowflake): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/trino.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import trino 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(trino.Dialect, trino.Mixin_MD5, trino.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Trino(trino.Trino): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/vertica.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import vertica 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(vertica.Dialect, vertica.Mixin_MD5, vertica.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Vertica(vertica.Vertica): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/diff_tables.py: -------------------------------------------------------------------------------- 1 | """Provides classes for performing a table diff 2 | """ 3 | 4 | from abc import ABC, abstractmethod 5 | from enum import Enum 6 | from contextlib import contextmanager 7 | from operator import methodcaller 8 | from typing import Dict, Tuple, Iterator, Optional 9 | from concurrent.futures import ThreadPoolExecutor, as_completed 10 | 11 | from runtype import dataclass 12 | 13 | from .info_tree import InfoTree, SegmentInfo 14 | 15 | from .utils import safezip, getLogger, Vector 16 | from .thread_utils import ThreadedYielder 17 | from .table_segment import TableSegment, create_mesh_from_points, EmptyTable, EmptyTableSegment 18 | from sqeleton.abcs import IKey 19 | 20 | logger = getLogger(__name__) 21 | 22 | 23 | class Algorithm(Enum): 24 | AUTO = "auto" 25 | JOINDIFF = 
"joindiff" 26 | HASHDIFF = "hashdiff" 27 | 28 | 29 | DiffResult = Iterator[Tuple[str, tuple]] # Iterator[Tuple[Literal["+", "-"], tuple]] 30 | 31 | 32 | @dataclass 33 | class ThreadBase: 34 | "Provides utility methods for optional threading" 35 | 36 | threaded: bool = True 37 | max_threadpool_size: Optional[int] = 1 38 | 39 | def _thread_map(self, func, iterable): 40 | if not self.threaded: 41 | return map(func, iterable) 42 | 43 | with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool: 44 | return task_pool.map(func, iterable) 45 | 46 | def _threaded_call(self, func, iterable, **kw): 47 | "Calls a method for each object in iterable." 48 | return list(self._thread_map(methodcaller(func, **kw), iterable)) 49 | 50 | def _thread_as_completed(self, func, iterable): 51 | if not self.threaded: 52 | yield from map(func, iterable) 53 | return 54 | 55 | with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool: 56 | futures = [task_pool.submit(func, item) for item in iterable] 57 | for future in as_completed(futures): 58 | yield future.result() 59 | 60 | def _threaded_call_as_completed(self, func, iterable): 61 | "Calls a method for each object in iterable. Returned in order of completion." 62 | return self._thread_as_completed(methodcaller(func), iterable) 63 | 64 | @contextmanager 65 | def _run_in_background(self, *funcs): 66 | with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool: 67 | futures = [task_pool.submit(f) for f in funcs if f is not None] 68 | yield futures 69 | for f in futures: 70 | f.result() 71 | 72 | 73 | @dataclass 74 | class DiffStats: 75 | diff_by_sign: Dict[str, int] 76 | table1_count: int 77 | table2_count: int 78 | unchanged: int 79 | diff_percent: float 80 | 81 | 82 | @dataclass 83 | class DiffResultWrapper: 84 | """Wrapper for the diff result, with additional stats and info 85 | 86 | Supports reenterant iteration, context management, and immediate closing of the thread pool. 87 | 88 | Note: Once the threadpool is closed, the iterator will not be able to continue. 89 | """ 90 | diff: iter # DiffResult 91 | info_tree: InfoTree 92 | stats: dict 93 | _ti: ThreadedYielder 94 | result_list: list = [] 95 | 96 | def __iter__(self): 97 | """Iterate over the results of the diff. 98 | 99 | It's a "lazy-list": Repeated calls will return the same results, but will not re-run the diff. 100 | """ 101 | yield from self.result_list 102 | for i in self.diff: 103 | self.result_list.append(i) 104 | yield i 105 | 106 | def close(self): 107 | "Immediately stop diffing and close the thread pool" 108 | # TODO we should be able to wait for the thread pool to finish 109 | self._ti.shutdown(wait=False) 110 | 111 | def __enter__(self): 112 | return self 113 | def __exit__(self, *args): 114 | self.close() 115 | 116 | def _get_stats(self) -> DiffStats: 117 | list(self) # Consume the iterator into result_list, if we haven't already 118 | 119 | diff_by_key = {} 120 | for sign, values in self.result_list: 121 | k = values[: len(self.info_tree.info.tables[0].key_columns)] 122 | if k in diff_by_key: 123 | assert sign != diff_by_key[k] 124 | diff_by_key[k] = "!" 
125 | else: 126 | diff_by_key[k] = sign 127 | 128 | diff_by_sign = {k: 0 for k in "+-!"} 129 | for sign in diff_by_key.values(): 130 | diff_by_sign[sign] += 1 131 | 132 | table1_count = self.info_tree.info.rowcounts[1] 133 | table2_count = self.info_tree.info.rowcounts[2] 134 | unchanged = table1_count - diff_by_sign["-"] - diff_by_sign["!"] 135 | diff_percent = 1 - unchanged / max(table1_count, table2_count, 1) 136 | 137 | return DiffStats(diff_by_sign, table1_count, table2_count, unchanged, diff_percent) 138 | 139 | def get_stats_string(self): 140 | """Return a pretty string of the diff stats (used by the CLI)""" 141 | diff_stats = self._get_stats() 142 | string_output = "" 143 | string_output += f"{diff_stats.table1_count} rows in table A\n" 144 | string_output += f"{diff_stats.table2_count} rows in table B\n" 145 | string_output += f"{diff_stats.diff_by_sign['-']} rows exclusive to table A (not present in B)\n" 146 | string_output += f"{diff_stats.diff_by_sign['+']} rows exclusive to table B (not present in A)\n" 147 | string_output += f"{diff_stats.diff_by_sign['!']} rows updated\n" 148 | string_output += f"{diff_stats.unchanged} rows unchanged\n" 149 | string_output += f"{100*diff_stats.diff_percent:.2f}% difference score\n" 150 | return string_output 151 | 152 | def get_stats_dict(self): 153 | """Return a dictionary of the diff stats""" 154 | diff_stats = self._get_stats() 155 | json_output = { 156 | "rows_A": diff_stats.table1_count, 157 | "rows_B": diff_stats.table2_count, 158 | "exclusive_A": diff_stats.diff_by_sign["-"], 159 | "exclusive_B": diff_stats.diff_by_sign["+"], 160 | "updated": diff_stats.diff_by_sign["!"], 161 | "unchanged": diff_stats.unchanged, 162 | "total": sum(diff_stats.diff_by_sign.values()), 163 | "stats": self.stats, 164 | } 165 | 166 | return json_output 167 | 168 | 169 | @dataclass(frozen=True) 170 | class TableDiffer(ThreadBase, ABC): 171 | bisection_factor = 32 172 | stats: dict = {} 173 | allow_empty_tables: bool = False 174 | 175 | def diff_tables( 176 | self, table1: TableSegment, table2: TableSegment, *, info_tree: InfoTree = None 177 | ) -> DiffResultWrapper: 178 | """Diff the given tables. 179 | 180 | Parameters: 181 | table1 (TableSegment): The "before" table to compare. Or: source table 182 | table2 (TableSegment): The "after" table to compare. Or: target table 183 | 184 | Returns: 185 | An iterator that yield pair-tuples, representing the diff. Items can be either - 186 | ('-', row) for items in table1 but not in table2. 187 | ('+', row) for items in table2 but not in table1. 188 | Where `row` is a tuple of values, corresponding to the diffed columns. 
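        Example (an illustrative sketch, not part of the original docstring; the connection URI,
        table names and key column are placeholders for any supported database):
            ::
                >>> from reladiff import connect
                >>> from reladiff.table_segment import TableSegment
                >>> from reladiff.hashdiff_tables import HashDiffer
                >>> db = connect("postgresql://user:password@localhost/postgres", 1)
                >>> t1 = TableSegment(db, ("public", "table_a"), ("id",))
                >>> t2 = TableSegment(db, ("public", "table_b"), ("id",))
                >>> with HashDiffer().diff_tables(t1, t2) as diff:
                ...     for sign, row in diff:
                ...         print(sign, row)  # ('-', ...) = only in table_a; ('+', ...) = only in table_b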
189 | """ 190 | if info_tree is None: 191 | info_tree = InfoTree(SegmentInfo([table1, table2])) 192 | ti = ThreadedYielder(self.max_threadpool_size) 193 | return DiffResultWrapper(self._diff_tables_wrapper(table1, table2, info_tree, ti), info_tree, self.stats, ti) 194 | 195 | def _diff_tables_wrapper( 196 | self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree, ti: ThreadedYielder 197 | ) -> DiffResult: 198 | try: 199 | # Query and validate schema 200 | table1, table2 = self._threaded_call( 201 | "with_schema", [table1, table2], allow_empty_table=self.allow_empty_tables 202 | ) 203 | self._validate_and_adjust_columns(table1, table2) 204 | 205 | yield from self._diff_tables_root(table1, table2, info_tree, ti) 206 | finally: 207 | info_tree.aggregate_info() 208 | 209 | def _validate_and_adjust_columns(self, table1: TableSegment, table2: TableSegment) -> DiffResult: 210 | pass 211 | 212 | def _diff_tables_root( 213 | self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree, ti: ThreadedYielder 214 | ) -> DiffResult: 215 | return self._bisect_and_diff_tables(table1, table2, info_tree, ti) 216 | 217 | @abstractmethod 218 | def _diff_segments( 219 | self, 220 | ti: ThreadedYielder, 221 | table1: TableSegment, 222 | table2: TableSegment, 223 | info_tree: InfoTree, 224 | max_rows: int, 225 | level=0, 226 | segment_index=None, 227 | segment_count=None, 228 | ): ... 229 | 230 | def _bisect_and_diff_tables( 231 | self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree, ti: ThreadedYielder 232 | ): 233 | if len(table1.key_columns) != len(table2.key_columns): 234 | raise ValueError("Tables should have an equivalent number of key columns!") 235 | 236 | key_types1 = table1.key_types 237 | key_types2 = table2.key_types 238 | is_empty1 = isinstance(table1, EmptyTableSegment) 239 | is_empty2 = isinstance(table2, EmptyTableSegment) 240 | 241 | for kt in ([] if is_empty1 else key_types1) + ([] if is_empty2 else key_types2): 242 | if not isinstance(kt, IKey): 243 | raise NotImplementedError(f"Cannot use a column of type {kt} as a key") 244 | 245 | if not (is_empty1 or is_empty2): 246 | for kt1, kt2 in safezip(key_types1, key_types2): 247 | if kt1.python_type is not kt2.python_type: 248 | raise TypeError(f"Incompatible key types: {kt1} and {kt2}") 249 | 250 | # Query min/max values 251 | key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2]) 252 | 253 | # Start with the first completed value, so we don't waste time waiting 254 | try: 255 | min_key1, max_key1 = self._parse_key_range_result(key_types1, next(key_ranges)) 256 | except EmptyTable: 257 | if not self.allow_empty_tables: 258 | raise 259 | try: 260 | min_key1, max_key1 = self._parse_key_range_result(key_types2, next(key_ranges)) 261 | except EmptyTable: 262 | # Both tables are empty 263 | info_tree.info.set_diff([]) 264 | info_tree.info.max_rows = 0 265 | info_tree.info.rowcounts = {1: 0, 2: 0} 266 | return [] 267 | 268 | btable1, btable2 = [t.new_key_bounds(min_key=min_key1, max_key=max_key1) for t in (table1, table2)] 269 | 270 | logger.info( 271 | f"Diffing segments at key-range: {min_key1}..{max_key1}. " 272 | f"size: table1 <= {btable1.approximate_size()}, table2 <= {btable2.approximate_size()}" 273 | ) 274 | 275 | # Bisect (split) the table into segments, and diff them recursively. 276 | ti.submit(self._bisect_and_diff_segments, ti, btable1, btable2, info_tree) 277 | 278 | # Now we check for the second min-max, to diff the portions we "missed". 
279 | # This is achieved by subtracting the table ranges, and dividing the resulting space into aligned boxes. 280 | # For example, given tables A & B, and a 2D compound key, where A was queried first for key-range, 281 | # the regions of B we need to diff in this second pass are marked by B1..8: 282 | # ┌──┬──────┬──┐ 283 | # │B1│ B2 │B3│ 284 | # ├──┼──────┼──┤ 285 | # │B4│ A │B5│ 286 | # ├──┼──────┼──┤ 287 | # │B6│ B7 │B8│ 288 | # └──┴──────┴──┘ 289 | # Overall, the max number of new regions in this 2nd pass is 3^|k| - 1 290 | 291 | try: 292 | min_key2, max_key2 = self._parse_key_range_result(key_types1, next(key_ranges)) 293 | except StopIteration: # First table is empty 294 | return ti 295 | except EmptyTable: # Second table is empty 296 | if not self.allow_empty_tables: 297 | raise 298 | return ti 299 | 300 | points = [list(sorted(p)) for p in safezip(min_key1, min_key2, max_key1, max_key2)] 301 | box_mesh = create_mesh_from_points(*points) 302 | 303 | new_regions = [(p1, p2) for p1, p2 in box_mesh if p1 < p2 and not (p1 >= min_key1 and p2 <= max_key1)] 304 | 305 | for p1, p2 in new_regions: 306 | extra_tables = [t.new_key_bounds(min_key=p1, max_key=p2) for t in (table1, table2)] 307 | ti.submit(self._bisect_and_diff_segments, ti, *extra_tables, info_tree) 308 | 309 | return ti 310 | 311 | def _parse_key_range_result(self, key_types, key_range) -> Tuple[Vector, Vector]: 312 | if isinstance(key_range, Exception): 313 | raise key_range 314 | 315 | min_key_values, max_key_values = key_range 316 | 317 | # We add 1 because our ranges are exclusive of the end (like in Python) 318 | try: 319 | min_key = Vector(key_type.make_value(mn) for key_type, mn in safezip(key_types, min_key_values)) 320 | max_key = Vector(key_type.make_value(mx) + 1 for key_type, mx in safezip(key_types, max_key_values)) 321 | except (TypeError, ValueError) as e: 322 | raise type(e)(f"Cannot apply {key_types} to '{min_key_values}', '{max_key_values}'.") from e 323 | 324 | return min_key, max_key 325 | 326 | def _bisect_and_diff_segments( 327 | self, 328 | ti: ThreadedYielder, 329 | table1: TableSegment, 330 | table2: TableSegment, 331 | info_tree: InfoTree, 332 | level=0, 333 | max_rows=None, 334 | ): 335 | assert table1.is_bounded and table2.is_bounded 336 | 337 | # Choose evenly spaced checkpoints (according to min_key and max_key) 338 | biggest_table = max(table1, table2, key=methodcaller("approximate_size")) 339 | checkpoints = biggest_table.choose_checkpoints(self.bisection_factor - 1) 340 | 341 | # Create new instances of TableSegment between each checkpoint 342 | segmented1 = table1.segment_by_checkpoints(checkpoints) 343 | segmented2 = table2.segment_by_checkpoints(checkpoints) 344 | 345 | # Recursively compare each pair of corresponding segments between table1 and table2 346 | for i, (t1, t2) in enumerate(safezip(segmented1, segmented2)): 347 | info_node = info_tree.add_node(t1, t2, max_rows=max_rows) 348 | ti.submit( 349 | self._diff_segments, ti, t1, t2, info_node, max_rows, level + 1, i + 1, len(segmented1), priority=level 350 | ) 351 | -------------------------------------------------------------------------------- /reladiff/hashdiff_tables.py: -------------------------------------------------------------------------------- 1 | import os 2 | from numbers import Number 3 | import logging 4 | from typing import Iterator 5 | from operator import attrgetter 6 | from collections import Counter 7 | from itertools import chain 8 | 9 | from dataclasses import dataclass, field 10 | 11 | # from runtype import 
dataclass # TODO fix in runtype 12 | 13 | from sqeleton.abcs import ColType_UUID, NumericType, PrecisionType, StringType, Boolean 14 | 15 | from .info_tree import InfoTree 16 | from .utils import safezip 17 | from .thread_utils import ThreadedYielder 18 | from .table_segment import TableSegment, EmptyTableSegment 19 | 20 | from .diff_tables import TableDiffer 21 | 22 | BENCHMARK = os.environ.get("BENCHMARK", False) 23 | 24 | DEFAULT_BISECTION_THRESHOLD = 1024 * 16 25 | DEFAULT_BISECTION_FACTOR = 32 26 | 27 | logger = logging.getLogger("hashdiff_tables") 28 | 29 | 30 | def diff_sets(a: list, b: list, skip_sort_results: bool, duplicate_rows_support: bool) -> Iterator: 31 | if duplicate_rows_support: 32 | c = Counter(b) 33 | c.subtract(a) 34 | diff = (("+", k) if count > 0 else ("-", k) for k, count in c.items() for _ in range(abs(count))) 35 | else: 36 | sa = set(a) 37 | sb = set(b) 38 | diff = chain((("-", x) for x in sa - sb), (("+", x) for x in sb - sa)) 39 | 40 | return diff if skip_sort_results else sorted(diff, key=lambda i: i[1]) # sort by key 41 | 42 | 43 | @dataclass(frozen=True) 44 | class HashDiffer(TableDiffer): 45 | """Finds the diff between two SQL tables 46 | 47 | The algorithm uses hashing to quickly check if the tables are different, and then applies a 48 | bisection search recursively to find the differences efficiently. 49 | 50 | Works best for comparing tables that are mostly the same, with minor discrepancies. 51 | 52 | Parameters: 53 | bisection_factor (int): Into how many segments to bisect per iteration. 54 | bisection_threshold (Number): When should we stop bisecting and compare locally (in row count). 55 | threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads. 56 | max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto. 57 | Only relevant when `threaded` is ``True``. 58 | There may be many pools, so number of actual threads can be a lot higher. 59 | skip_sort_results (bool): Skip sorting the hashdiff output by key for better performance. 60 | Entries with the same key but different column values may not appear adjacent in the output. 61 | duplicate_rows_support (bool): If ``True``, the algorithm will support duplicate rows in the tables. 62 | """ 63 | 64 | bisection_factor: int = DEFAULT_BISECTION_FACTOR 65 | bisection_threshold: Number = DEFAULT_BISECTION_THRESHOLD # Accepts inf for tests 66 | skip_sort_results: bool = False 67 | duplicate_rows_support: bool = True 68 | 69 | stats: dict = field(default_factory=dict) 70 | 71 | def __post_init__(self): 72 | # Validate options 73 | if self.bisection_factor >= self.bisection_threshold: 74 | raise ValueError("Incorrect param values (bisection factor must be lower than threshold)") 75 | if self.bisection_factor < 2: 76 | raise ValueError("Must have at least two segments per iteration (i.e. 
bisection_factor >= 2)") 77 | 78 | def _validate_and_adjust_columns(self, table1, table2): 79 | if isinstance(table1, EmptyTableSegment) or isinstance(table2, EmptyTableSegment): 80 | # Skip all logic; it only pertains to column mismatch 81 | return 82 | 83 | for c1, c2 in safezip(table1.relevant_columns, table2.relevant_columns): 84 | if c1 not in table1._schema: 85 | raise ValueError(f"Column '{c1}' not found in schema for table {table1}") 86 | if c2 not in table2._schema: 87 | raise ValueError(f"Column '{c2}' not found in schema for table {table2}") 88 | 89 | # Update schemas to minimal mutual precision 90 | col1 = table1._schema[c1] 91 | col2 = table2._schema[c2] 92 | if isinstance(col1, PrecisionType): 93 | if not isinstance(col2, PrecisionType): 94 | raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}") 95 | 96 | lowest = min(col1, col2, key=attrgetter("precision")) 97 | 98 | if col1.precision != col2.precision: 99 | logger.warning(f"Using reduced precision {lowest} for column '{c1}'. Types={col1}, {col2}") 100 | 101 | table1._schema[c1] = col1.replace(precision=lowest.precision, rounds=lowest.rounds) 102 | table2._schema[c2] = col2.replace(precision=lowest.precision, rounds=lowest.rounds) 103 | 104 | elif isinstance(col1, (NumericType, Boolean)): 105 | if not isinstance(col2, (NumericType, Boolean)): 106 | raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}") 107 | 108 | lowest = min(col1, col2, key=attrgetter("precision")) 109 | 110 | if col1.precision != col2.precision: 111 | logger.warning(f"Using reduced precision {lowest} for column '{c1}'. Types={col1}, {col2}") 112 | 113 | if lowest.precision != col1.precision: 114 | table1._schema[c1] = col1.replace(precision=lowest.precision) 115 | if lowest.precision != col2.precision: 116 | table2._schema[c2] = col2.replace(precision=lowest.precision) 117 | 118 | elif isinstance(col1, ColType_UUID): 119 | if not isinstance(col2, ColType_UUID): 120 | raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}") 121 | 122 | elif isinstance(col1, StringType): 123 | if not isinstance(col2, StringType): 124 | raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}") 125 | 126 | for t in [table1, table2]: 127 | for c in t.relevant_columns: 128 | ctype = t._schema[c] 129 | if not ctype.supported: 130 | logger.warning( 131 | f"[{t.database.name}] Column '{c}' of type '{ctype}' has no compatibility handling. " 132 | "If encoding/formatting differs between databases, it may result in false positives." 133 | ) 134 | 135 | def _diff_segments( 136 | self, 137 | ti: ThreadedYielder, 138 | table1: TableSegment, 139 | table2: TableSegment, 140 | info_tree: InfoTree, 141 | max_rows: int, 142 | level=0, 143 | segment_index=None, 144 | segment_count=None, 145 | ): 146 | logger.info( 147 | ". " * level + f"Diffing segment {segment_index}/{segment_count}, " 148 | f"key-range: {table1.min_key}..{table2.max_key}, " 149 | f"size <= {max_rows}" 150 | ) 151 | 152 | # When benchmarking, we want the ability to skip checksumming. This 153 | # allows us to download all rows for comparison in performance. By 154 | # default, reladiff will checksum the section first (when it's below 155 | # the threshold) and _then_ download it. 
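        # (Illustrative note, not in the original source: benchmark mode is enabled through the
        #  BENCHMARK environment variable read at the top of this module -- e.g. `BENCHMARK=1`.
        #  tests/common.py reads the same variable.)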
156 | if BENCHMARK: 157 | if max_rows < self.bisection_threshold: 158 | return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max_rows) 159 | 160 | if isinstance(table1, EmptyTableSegment) or isinstance(table1, EmptyTableSegment): 161 | # Optimization: No need to checksum if one of the tables is empty 162 | count1, count2 = self._threaded_call("count", [table1, table2]) 163 | checksum1 = checksum2 = None 164 | else: 165 | (count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2]) 166 | 167 | assert not info_tree.info.rowcounts 168 | info_tree.info.rowcounts = {1: count1, 2: count2} 169 | 170 | if count1 == 0 and count2 == 0: 171 | logger.debug( 172 | "Uneven distribution of keys detected in segment %s..%s (big gaps in the key column). " 173 | "For better performance, we recommend to increase the bisection-threshold.", 174 | table1.min_key, 175 | table1.max_key, 176 | ) 177 | assert checksum1 is None and checksum2 is None 178 | info_tree.info.is_diff = False 179 | return 180 | 181 | if checksum1 == checksum2 and count1 == count2: 182 | info_tree.info.is_diff = False 183 | return 184 | 185 | info_tree.info.is_diff = True 186 | return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max(count1, count2)) 187 | 188 | def _bisect_and_diff_segments( 189 | self, 190 | ti: ThreadedYielder, 191 | table1: TableSegment, 192 | table2: TableSegment, 193 | info_tree: InfoTree, 194 | level=0, 195 | max_rows=None, 196 | ): 197 | assert table1.is_bounded and table2.is_bounded 198 | 199 | max_space_size = max(table1.approximate_size(), table2.approximate_size()) 200 | if max_rows is None: 201 | # We can be sure that row_count <= max_rows iff the table key is unique 202 | max_rows = max_space_size 203 | info_tree.info.max_rows = max_rows 204 | 205 | # If count is below the threshold, just download and compare the columns locally 206 | # This saves time, as bisection speed is limited by ping and query performance. 207 | if max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2: 208 | rows1, rows2 = self._threaded_call("get_values", [table1, table2]) 209 | diff = list(diff_sets(rows1, rows2, self.skip_sort_results, self.duplicate_rows_support)) 210 | 211 | info_tree.info.set_diff(diff) 212 | info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)} 213 | 214 | logger.info(". 
" * level + f"Diff found {len(diff)} different rows.") 215 | self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2)) 216 | return diff 217 | 218 | return super()._bisect_and_diff_segments(ti, table1, table2, info_tree, level, max_rows) 219 | -------------------------------------------------------------------------------- /reladiff/info_tree.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Union 2 | 3 | from runtype import dataclass 4 | 5 | from .table_segment import TableSegment, EmptyTableSegment 6 | 7 | 8 | @dataclass(frozen=False) 9 | class SegmentInfo: 10 | tables: List[Union[TableSegment, EmptyTableSegment]] 11 | 12 | diff: list = None 13 | is_diff: bool = None 14 | diff_count: int = None 15 | 16 | rowcounts: Dict[int, int] = {} 17 | max_rows: int = None 18 | 19 | def set_diff(self, diff: list): 20 | self.diff = diff 21 | self.diff_count = len(diff) 22 | self.is_diff = self.diff_count > 0 23 | 24 | def update_from_children(self, child_infos): 25 | child_infos = list(child_infos) 26 | assert child_infos 27 | 28 | # self.diff = list(chain(*[c.diff for c in child_infos])) 29 | self.diff_count = sum(c.diff_count for c in child_infos if c.diff_count is not None) 30 | self.is_diff = any(c.is_diff for c in child_infos) 31 | 32 | self.rowcounts = { 33 | 1: sum(c.rowcounts[1] for c in child_infos if c.rowcounts), 34 | 2: sum(c.rowcounts[2] for c in child_infos if c.rowcounts), 35 | } 36 | 37 | 38 | @dataclass 39 | class InfoTree: 40 | info: SegmentInfo 41 | children: List["InfoTree"] = [] 42 | 43 | def add_node(self, table1: TableSegment, table2: TableSegment, max_rows: int = None): 44 | node = InfoTree(SegmentInfo([table1, table2], max_rows=max_rows)) 45 | self.children.append(node) 46 | return node 47 | 48 | def aggregate_info(self): 49 | if self.children: 50 | for c in self.children: 51 | c.aggregate_info() 52 | self.info.update_from_children(c.info for c in self.children) 53 | -------------------------------------------------------------------------------- /reladiff/parse_time.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime, timedelta 3 | from difflib import SequenceMatcher 4 | 5 | 6 | class ParseError(ValueError): 7 | pass 8 | 9 | 10 | TIME_UNITS = dict( 11 | seconds="seconds", 12 | minutes="minutes", 13 | hours="hours", 14 | days="days", 15 | weeks="weeks", 16 | months="months", 17 | years="years", 18 | # Shortcuts 19 | s="seconds", 20 | min="minutes", 21 | h="hours", 22 | d="days", 23 | w="weeks", 24 | mon="months", 25 | y="years", 26 | ) 27 | 28 | EXTRAPOLATED = {"months": (30, "days"), "years": (365, "days")} 29 | assert set(EXTRAPOLATED) <= set(TIME_UNITS) 30 | 31 | TIME_RE = re.compile(r"(\d+)([a-z]+)") 32 | 33 | UNITS_STR = ", ".join(sorted(TIME_UNITS.keys())) 34 | 35 | 36 | def string_similarity(a, b): 37 | return SequenceMatcher(None, a, b).ratio() 38 | 39 | 40 | def parse_time_atom(count, unit): 41 | count = int(count) 42 | try: 43 | unit = TIME_UNITS[unit] 44 | except KeyError: 45 | most_similar = max(TIME_UNITS, key=lambda k: string_similarity(k, unit)) 46 | raise ParseError( 47 | f"'{unit}' is not a recognized time unit. Did you mean '{most_similar}'?" 
f"\nSupported units: {UNITS_STR}" 48 | ) 49 | 50 | if unit in EXTRAPOLATED: 51 | mul, unit = EXTRAPOLATED[unit] 52 | count *= mul 53 | return count, unit 54 | 55 | 56 | def parse_time_delta(t: str): 57 | time_dict = {} 58 | while t: 59 | m = TIME_RE.match(t) 60 | if not m: 61 | raise ParseError(f"Cannot parse '{t}': Not a recognized time delta") 62 | count, unit = parse_time_atom(*m.groups()) 63 | if unit in time_dict: 64 | raise ParseError(f"Time unit {unit} specified more than once") 65 | time_dict[unit] = count 66 | t = t[m.end() :] 67 | 68 | if not time_dict: 69 | raise ParseError("No time difference specified") 70 | return timedelta(**time_dict) 71 | 72 | 73 | def parse_time_before(time: datetime, delta: str): 74 | return time - parse_time_delta(delta) 75 | -------------------------------------------------------------------------------- /reladiff/query_utils.py: -------------------------------------------------------------------------------- 1 | "Module for query utilities that didn't make it into the query-builder (yet)" 2 | 3 | from contextlib import suppress 4 | 5 | from sqeleton.databases import DbPath, QueryError, Oracle 6 | from sqeleton.queries import table, commit, Expr 7 | 8 | 9 | def _drop_table_oracle(name: DbPath): 10 | t = table(name) 11 | # Experience shows double drop is necessary 12 | with suppress(QueryError): 13 | yield t.drop() 14 | yield t.drop() 15 | yield commit 16 | 17 | 18 | def _drop_table(name: DbPath): 19 | t = table(name) 20 | yield t.drop(if_exists=True) 21 | yield commit 22 | 23 | 24 | def drop_table(db, tbl): 25 | if isinstance(db, Oracle): 26 | db.query(_drop_table_oracle(tbl)) 27 | else: 28 | db.query(_drop_table(tbl)) 29 | 30 | 31 | def _append_to_table_oracle(path: DbPath, expr: Expr): 32 | """See append_to_table""" 33 | assert expr.schema, expr 34 | t = table(path, schema=expr.schema) 35 | with suppress(QueryError): 36 | yield t.create() # uses expr.schema 37 | yield commit 38 | yield t.insert_expr(expr) 39 | yield commit 40 | 41 | 42 | def _append_to_table(path: DbPath, expr: Expr): 43 | """Append to table""" 44 | assert expr.schema, expr 45 | t = table(path, schema=expr.schema) 46 | yield t.create(if_not_exists=True) # uses expr.schema 47 | yield commit 48 | yield t.insert_expr(expr) 49 | yield commit 50 | 51 | 52 | def append_to_table(db, path, expr): 53 | f = _append_to_table_oracle if isinstance(db, Oracle) else _append_to_table 54 | db.query(f(path, expr)) 55 | -------------------------------------------------------------------------------- /reladiff/table_segment.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import List, Tuple 3 | import logging 4 | from itertools import product 5 | 6 | from runtype import dataclass 7 | 8 | from .utils import safezip, Vector 9 | from sqeleton.utils import ArithString, split_space 10 | from sqeleton.databases import Database, DbPath, DbKey, DbTime 11 | from sqeleton.abcs.database_types import String_UUID 12 | from sqeleton.schema import Schema, create_schema 13 | from sqeleton.queries import Count, Checksum, SKIP, table, this, Expr, min_, max_, Code 14 | from sqeleton.queries.extras import ApplyFuncAndNormalizeAsString, NormalizeAsString 15 | 16 | logger = logging.getLogger("table_segment") 17 | 18 | RECOMMENDED_CHECKSUM_DURATION = 20 19 | 20 | 21 | class EmptyTable(ValueError): 22 | pass 23 | 24 | 25 | def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]: 26 | assert min_key < max_key 27 | 28 | if max_key - 
min_key <= count: 29 | count = 1 30 | 31 | if isinstance(min_key, ArithString): 32 | assert type(min_key) is type(max_key) 33 | checkpoints = min_key.range(max_key, count) 34 | else: 35 | checkpoints = split_space(min_key, max_key, count) 36 | 37 | assert all(min_key < x < max_key for x in checkpoints) 38 | return [min_key] + checkpoints + [max_key] 39 | 40 | 41 | def int_product(nums: List[int]) -> int: 42 | p = 1 43 | for n in nums: 44 | p *= n 45 | return p 46 | 47 | 48 | def split_compound_key_space(mn: Vector, mx: Vector, count: int) -> List[List[DbKey]]: 49 | """Returns a list of split-points for each key dimension, essentially returning an N-dimensional grid of split points.""" 50 | return [split_key_space(mn_k, mx_k, count) for mn_k, mx_k in safezip(mn, mx)] 51 | 52 | 53 | def create_mesh_from_points(*values_per_dim: list) -> List[Tuple[Vector, Vector]]: 54 | """Given a list of values along each axis of N dimensional space, 55 | return an array of boxes whose start-points & end-points align with the given values, 56 | and together consitute a mesh filling that space entirely (within the bounds of the given values). 57 | 58 | Assumes given values are already ordered ascending. 59 | 60 | len(boxes) == ∏i( len(i)-1 ) 61 | 62 | Example: 63 | :: 64 | >>> d1 = 'a', 'b', 'c' 65 | >>> d2 = 1, 2, 3 66 | >>> d3 = 'X', 'Y' 67 | >>> create_mesh_from_points(d1, d2, d3) 68 | [ 69 | [('a', 1, 'X'), ('b', 2, 'Y')], 70 | [('a', 2, 'X'), ('b', 3, 'Y')], 71 | [('b', 1, 'X'), ('c', 2, 'Y')], 72 | [('b', 2, 'X'), ('c', 3, 'Y')] 73 | ] 74 | """ 75 | assert all(len(v) >= 2 for v in values_per_dim), values_per_dim 76 | 77 | # Create tuples of (v1, v2) for each pair of adjacent values 78 | ranges = [list(zip(values[:-1], values[1:])) for values in values_per_dim] 79 | 80 | assert all(a <= b for r in ranges for a, b in r) 81 | 82 | # Create a product of all the ranges 83 | res = [tuple(Vector(a) for a in safezip(*r)) for r in product(*ranges)] 84 | 85 | expected_len = int_product(len(v) - 1 for v in values_per_dim) 86 | assert len(res) == expected_len, (len(res), expected_len) 87 | return res 88 | 89 | 90 | @dataclass 91 | class TableSegment: 92 | """Signifies a segment of rows (and selected columns) within a table 93 | 94 | Parameters: 95 | database (Database): Database instance. See :meth:`connect` 96 | table_path (:data:`DbPath`): Path to table in form of a tuple. e.g. `('my_dataset', 'table_name')` 97 | key_columns (Tuple[str]): Name of the key column, which uniquely identifies each row (usually id) 98 | update_column (str, optional): Name of updated column, which signals that rows changed. 99 | Usually updated_at or last_update. Used by `min_update` and `max_update`. 100 | extra_columns (Tuple[str, ...], optional): Extra columns to compare 101 | min_key (:data:`Vector`, optional): Lowest key value, used to restrict the segment 102 | max_key (:data:`Vector`, optional): Highest key value, used to restrict the segment 103 | min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment 104 | max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment 105 | where (str, optional): An additional 'where' expression to restrict the search space. 106 | 107 | case_sensitive (bool): If false, the case of column names will adjust according to the schema. Default is true. 108 | 109 | """ 110 | 111 | # Location of table 112 | database: Database 113 | table_path: DbPath 114 | 115 | # Columns 116 | key_columns: Tuple[str, ...] 
117 | update_column: str = None 118 | extra_columns: Tuple[str, ...] = () 119 | 120 | # Restrict the segment 121 | min_key: Vector = None 122 | max_key: Vector = None 123 | min_update: DbTime = None 124 | max_update: DbTime = None 125 | where: str = None 126 | 127 | case_sensitive: bool = True 128 | _schema: Schema = None 129 | 130 | def __post_init__(self): 131 | if not self.update_column and (self.min_update or self.max_update): 132 | raise ValueError("Error: the min_update/max_update feature requires 'update_column' to be set.") 133 | 134 | if self.min_key is not None and self.max_key is not None and self.min_key >= self.max_key: 135 | raise ValueError(f"Error: min_key expected to be smaller than max_key! ({self.min_key} >= {self.max_key})") 136 | 137 | if self.min_update is not None and self.max_update is not None and self.min_update >= self.max_update: 138 | raise ValueError( 139 | f"Error: min_update expected to be smaller than max_update! ({self.min_update} >= {self.max_update})" 140 | ) 141 | 142 | def _where(self): 143 | return f"({self.where})" if self.where else None 144 | 145 | def _with_raw_schema(self, raw_schema: dict, refine: bool = True, allow_empty_table=False) -> "TableSegment": 146 | # TODO validate all relevant columns are in the schema? 147 | cols = {c.lower() for c in self.relevant_columns} 148 | # We use v[0] to get the actual name (with correct case) 149 | raw_schema = {v[0]: v for k, v in raw_schema.items() if k.lower() in cols} 150 | schema, samples = self.database.process_query_table_schema( 151 | self.table_path, raw_schema, refine=refine, refine_where=self._where() 152 | ) 153 | assert refine or samples is None 154 | is_empty_table = samples is not None and not samples 155 | if is_empty_table and not allow_empty_table: 156 | raise EmptyTable(f"Table {self.table_path} is empty. Use --allow-empty-tables to disable this protection.", self) 157 | 158 | res = self.new(_schema=create_schema(self.database, self.table_path, schema, self.case_sensitive)) 159 | 160 | return EmptyTableSegment(res) if is_empty_table else res 161 | 162 | def with_schema(self, refine: bool = True, allow_empty_table: bool = False) -> "TableSegment": 163 | "Queries the table schema from the database, and returns a new instance of TableSegment, with a schema." 164 | if self._schema: 165 | return self 166 | 167 | return self._with_raw_schema( 168 | self.database.query_table_schema(self.table_path), refine=refine, allow_empty_table=allow_empty_table 169 | ) 170 | 171 | def _cast_col_value(self, col, value): 172 | """Cast the value to the right type, based on the type of the column 173 | 174 | Currently only used to support native vs string UUID values. 
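        For example (an illustrative note, not in the original docstring): a ``uuid.UUID`` key value
        is converted with ``str()`` when the column is a ``String_UUID``, so that the generated
        key-range predicates compare strings against the string-typed column.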
175 | """ 176 | assert self._schema 177 | t = self._schema[col] 178 | if isinstance(t, String_UUID): 179 | return str(value) 180 | return value 181 | 182 | def _make_key_range(self): 183 | if self.min_key is not None: 184 | for mn, k in safezip(self.min_key, self.key_columns): 185 | mn = self._cast_col_value(k, mn) 186 | yield mn <= this[k] 187 | if self.max_key is not None: 188 | for k, mx in safezip(self.key_columns, self.max_key): 189 | mx = self._cast_col_value(k, mx) 190 | yield this[k] < mx 191 | 192 | def _make_update_range(self): 193 | if self.min_update is not None: 194 | yield self.min_update <= this[self.update_column] 195 | if self.max_update is not None: 196 | yield this[self.update_column] < self.max_update 197 | 198 | @property 199 | def source_table(self): 200 | return table(*self.table_path, schema=self._schema) 201 | 202 | def make_select(self): 203 | return self.source_table.where( 204 | *self._make_key_range(), *self._make_update_range(), Code(self._where()) if self.where else SKIP 205 | ) 206 | 207 | def get_values(self) -> list: 208 | "Download all the relevant values of the segment from the database" 209 | select = self.make_select().select(*self._relevant_columns_repr) 210 | return self.database.query(select, List[Tuple]) 211 | 212 | def choose_checkpoints(self, count: int) -> List[List[DbKey]]: 213 | "Suggests a bunch of evenly-spaced checkpoints to split by, including start, end." 214 | 215 | assert self.is_bounded 216 | 217 | # Take Nth root of count, to approximate the appropriate box size 218 | count = int(count ** (1 / len(self.key_columns))) or 1 219 | 220 | return split_compound_key_space(self.min_key, self.max_key, count) 221 | 222 | def segment_by_checkpoints(self, checkpoints: List[List[DbKey]]) -> List["TableSegment"]: 223 | "Split the current TableSegment to a bunch of smaller ones, separated by the given checkpoints" 224 | 225 | return [self.new_key_bounds(min_key=s, max_key=e) for s, e in create_mesh_from_points(*checkpoints)] 226 | 227 | def new(self, **kwargs) -> "TableSegment": 228 | """Creates a copy of the instance using 'replace()'""" 229 | return self.replace(**kwargs) 230 | 231 | def new_key_bounds(self, min_key: Vector, max_key: Vector) -> "TableSegment": 232 | if self.min_key is not None: 233 | assert self.min_key <= min_key, (self.min_key, min_key) 234 | assert self.min_key < max_key 235 | 236 | if self.max_key is not None: 237 | assert min_key < self.max_key 238 | assert max_key <= self.max_key 239 | 240 | return self.replace(min_key=min_key, max_key=max_key) 241 | 242 | @property 243 | def relevant_columns(self) -> List[str]: 244 | extras = list(self.extra_columns) 245 | 246 | if self.update_column and self.update_column not in extras: 247 | extras = [self.update_column] + extras 248 | 249 | return list(self.key_columns) + extras 250 | 251 | @property 252 | def _relevant_columns_repr(self) -> List[Expr]: 253 | return [NormalizeAsString(this[c]) for c in self.relevant_columns] 254 | 255 | def count(self) -> int: 256 | """Count how many rows are in the segment, in one pass.""" 257 | return self.database.query(self.make_select().select(Count()), int) 258 | 259 | def count_and_checksum(self) -> Tuple[int, int]: 260 | """Count and checksum the rows in the segment, in one pass.""" 261 | start = time.monotonic() 262 | q = self.make_select().select(Count(), Checksum(self._relevant_columns_repr)) 263 | count, checksum = self.database.query(q, tuple) 264 | duration = time.monotonic() - start 265 | if duration > RECOMMENDED_CHECKSUM_DURATION: 266 | 
logger.warning( 267 | "Checksum is taking longer than expected (%.2f). " 268 | "We recommend increasing --bisection-factor or decreasing --threads.", 269 | duration, 270 | ) 271 | 272 | if count: 273 | assert checksum, (count, checksum) 274 | return count or 0, int(checksum) if count else None 275 | 276 | def query_key_range(self) -> Tuple[tuple, tuple]: 277 | """Query database for minimum and maximum key. This is used for setting the initial bounds.""" 278 | # Normalizes the result (needed for UUIDs) after the min/max computation 279 | select = self.make_select().select( 280 | ApplyFuncAndNormalizeAsString(this[k], f) for k in self.key_columns for f in (min_, max_) 281 | ) 282 | result = tuple(self.database.query(select, tuple)) 283 | 284 | if any(i is None for i in result): 285 | # We return EmptyTable instead of raising it, so that we can consume 286 | # the key_ranges as an iterator. 287 | # _parse_key_range_result() will raise the error we return. 288 | return EmptyTable(f"Table {self.table_path} appears to be empty.", self) 289 | 290 | # Min/max keys are interleaved 291 | min_key, max_key = result[::2], result[1::2] 292 | assert len(min_key) == len(max_key) 293 | 294 | return min_key, max_key 295 | 296 | @property 297 | def is_bounded(self): 298 | return self.min_key is not None and self.max_key is not None 299 | 300 | def approximate_size(self): 301 | if not self.is_bounded: 302 | raise RuntimeError("Cannot approximate the size of an unbounded segment. Must have min_key and max_key.") 303 | diff = self.max_key - self.min_key 304 | assert all(d > 0 for d in diff) 305 | return int_product(diff) 306 | 307 | @property 308 | def key_types(self): 309 | return [self._schema[i] for i in self.key_columns] 310 | 311 | 312 | @dataclass 313 | class EmptyTableSegment: 314 | _table_segment: TableSegment 315 | 316 | def approximate_size(self): 317 | return 0 318 | 319 | @property 320 | def is_bounded(self): 321 | return True 322 | 323 | def query_key_range(self) -> Tuple[tuple, tuple]: 324 | return EmptyTable() 325 | 326 | def count(self) -> int: 327 | return 0 328 | 329 | def count_and_checksum(self) -> Tuple[int, int]: 330 | return (0, None) 331 | 332 | def __getattr__(self, attr): 333 | assert attr in ("database", "key_columns", "key_types", "relevant_columns", "_schema") 334 | return getattr(self._table_segment, attr) 335 | 336 | @property 337 | def min_key(self): 338 | return None 339 | 340 | @property 341 | def max_key(self): 342 | return None 343 | 344 | def with_schema(self, refine: bool = True, allow_empty_table: bool = False) -> "TableSegment": 345 | assert self._table_segment._schema 346 | return self 347 | 348 | def new_key_bounds(self, min_key: Vector, max_key: Vector) -> "TableSegment": 349 | return self 350 | 351 | def segment_by_checkpoints(self, checkpoints: List[List[DbKey]]) -> List["TableSegment"]: 352 | "Split the current TableSegment to a bunch of smaller ones, separated by the given checkpoints" 353 | mesh = create_mesh_from_points(*checkpoints) 354 | return [self for s, e in mesh] 355 | 356 | def make_select(self): 357 | # XXX shouldn't be called 358 | return self._table_segment.make_select() 359 | 360 | def get_values(self) -> list: 361 | return [] 362 | -------------------------------------------------------------------------------- /reladiff/thread_utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from queue import PriorityQueue 3 | from collections import deque 4 | from collections.abc import Iterable 5 | 
from concurrent.futures import ThreadPoolExecutor 6 | from concurrent.futures.thread import _WorkItem 7 | from time import sleep 8 | from typing import Callable, Iterator, Optional 9 | 10 | 11 | class AutoPriorityQueue(PriorityQueue): 12 | """Overrides PriorityQueue to automatically get the priority from _WorkItem.kwargs 13 | 14 | We also assign a unique id for each item, to avoid making comparisons on _WorkItem. 15 | As a side effect, items with the same priority are returned FIFO. 16 | """ 17 | 18 | _counter = itertools.count().__next__ 19 | 20 | def put(self, item: Optional[_WorkItem], block=True, timeout=None): 21 | priority = item.kwargs.pop("priority") if item is not None else 0 22 | super().put((-priority, self._counter(), item), block, timeout) 23 | 24 | def get(self, block=True, timeout=None) -> Optional[_WorkItem]: 25 | _p, _c, work_item = super().get(block, timeout) 26 | return work_item 27 | 28 | 29 | class PriorityThreadPoolExecutor(ThreadPoolExecutor): 30 | """Overrides ThreadPoolExecutor to use AutoPriorityQueue 31 | 32 | XXX WARNING: Might break in future versions of Python 33 | """ 34 | 35 | def __init__(self, *args): 36 | super().__init__(*args) 37 | 38 | self._work_queue = AutoPriorityQueue() 39 | 40 | 41 | class ThreadedYielder(Iterable): 42 | """Yields results from multiple threads into a single iterator, ordered by priority. 43 | 44 | To add a source iterator, call ``submit()`` with a function that returns an iterator. 45 | Priority for the iterator can be provided via the keyword argument 'priority'. (higher runs first) 46 | 47 | max_workers set the maximum number of worker threads 48 | yield_buffer_size sets the size of the "lookahead" buffer for the yielder. Default=1. 49 | For lazy computation, set this to 1. Set this to a higher value to reduce latency. 50 | Set to 0 for unlimited size. 
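    Example (an illustrative sketch, not part of the original docstring):
        ::
            >>> ty = ThreadedYielder(max_workers=2)
            >>> ty.submit(lambda: [3, 4], priority=0)
            >>> ty.submit(lambda: [1, 2], priority=1)  # higher priority is scheduled first
            >>> sorted(ty)
            [1, 2, 3, 4]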
51 | """ 52 | 53 | def __init__(self, max_workers: Optional[int] = None, yield_buffer_size: int = 1): 54 | self._pool = PriorityThreadPoolExecutor(max_workers) 55 | self._futures = deque() 56 | self._yield = deque() 57 | self._exception = None 58 | self._yield_buffer_size = yield_buffer_size 59 | 60 | def _worker(self, fn, *args, _priority=0, **kwargs): 61 | if self._yield_buffer_size and len(self._yield) >= self._yield_buffer_size: 62 | self._idle() 63 | self._futures.append(self._pool.submit(self._worker, fn, *args, priority=_priority, _priority=_priority, **kwargs)) 64 | return 65 | 66 | try: 67 | res = fn(*args, **kwargs) 68 | if res is not None: 69 | self._yield.append(res) 70 | except Exception as e: 71 | self._exception = e 72 | 73 | def submit(self, fn: Callable, *args, priority: int = 0, **kwargs): 74 | self._futures.append(self._pool.submit(self._worker, fn, *args, priority=priority, _priority=priority, **kwargs)) 75 | 76 | def shutdown(self, wait=True): 77 | try: 78 | # Python 3.9+ 79 | self._pool.shutdown(wait, cancel_futures=True) 80 | except TypeError: 81 | # Python 3.8 doesn't support cancel_futures 82 | self._pool.shutdown(wait) 83 | 84 | def _idle(self): 85 | if self._exception: 86 | raise self._exception 87 | 88 | while self._futures and self._futures[0].done(): 89 | self._futures.popleft() 90 | 91 | if not self._futures: 92 | # No more tasks 93 | return True 94 | 95 | sleep(0.001) 96 | 97 | def __iter__(self) -> Iterator: 98 | if self._exception: 99 | raise self._exception 100 | 101 | while True: 102 | while self._yield: 103 | yield from self._yield.popleft() 104 | 105 | if self._idle(): 106 | break 107 | 108 | -------------------------------------------------------------------------------- /reladiff/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from typing import Iterable, Sequence 4 | from urllib.parse import urlparse 5 | import operator 6 | import threading 7 | from datetime import datetime 8 | 9 | 10 | def safezip(*args): 11 | "zip but makes sure all sequences are the same length" 12 | lens = list(map(len, args)) 13 | if len(set(lens)) != 1: 14 | raise ValueError(f"Mismatching lengths in arguments to safezip: {lens}") 15 | return zip(*args) 16 | 17 | 18 | def _join_if_any(sym, args): 19 | args = list(args) 20 | if not args: 21 | return "" 22 | return sym.join(str(a) for a in args if a) 23 | 24 | 25 | def remove_password_from_url(url: str, replace_with: str = "***") -> str: 26 | parsed = urlparse(url) 27 | account = parsed.username or "" 28 | if parsed.password: 29 | account += ":" + replace_with 30 | host = _join_if_any(":", filter(None, [parsed.hostname, parsed.port])) 31 | netloc = _join_if_any("@", filter(None, [account, host])) 32 | replaced = parsed._replace(netloc=netloc) 33 | return replaced.geturl() 34 | 35 | 36 | def match_like(pattern: str, strs: Sequence[str]) -> Iterable[str]: 37 | reo = re.compile(pattern.replace("%", ".*").replace("?", ".") + "$") 38 | for s in strs: 39 | if reo.match(s): 40 | yield s 41 | 42 | 43 | def accumulate(iterable, func=operator.add, *, initial=None): 44 | "Return running totals" 45 | # Taken from https://docs.python.org/3/library/itertools.html#itertools.accumulate, to backport 'initial' to 3.7 46 | it = iter(iterable) 47 | total = initial 48 | if initial is None: 49 | try: 50 | total = next(it) 51 | except StopIteration: 52 | return 53 | yield total 54 | for element in it: 55 | total = func(total, element) 56 | yield total 57 | 58 | 59 | def 
run_as_daemon(threadfunc, *args): 60 | th = threading.Thread(target=threadfunc, args=args) 61 | th.daemon = True 62 | th.start() 63 | return th 64 | 65 | 66 | def getLogger(name): 67 | return logging.getLogger(name.rsplit(".", 1)[-1]) 68 | 69 | 70 | def eval_name_template(name): 71 | def get_timestamp(_match): 72 | return datetime.now().isoformat("_", "seconds").replace(":", "_") 73 | 74 | return re.sub("%t", get_timestamp, name) 75 | 76 | 77 | def truncate_error(error: str): 78 | first_line = error.split("\n", 1)[0] 79 | return re.sub("'(.*?)'", "'***'", first_line) 80 | 81 | 82 | class Vector(tuple): 83 | 84 | """Immutable implementation of a regular vector over any arithmetic value 85 | 86 | Implements a product order - https://en.wikipedia.org/wiki/Product_order 87 | 88 | Partial implementation: Only the needed functionality is implemented 89 | """ 90 | 91 | def __lt__(self, other: "Vector"): 92 | if isinstance(other, Vector): 93 | return all(a < b for a, b in safezip(self, other)) 94 | return NotImplemented 95 | 96 | def __le__(self, other: "Vector"): 97 | if isinstance(other, Vector): 98 | return all(a <= b for a, b in safezip(self, other)) 99 | return NotImplemented 100 | 101 | def __gt__(self, other: "Vector"): 102 | if isinstance(other, Vector): 103 | return all(a > b for a, b in safezip(self, other)) 104 | return NotImplemented 105 | 106 | def __ge__(self, other: "Vector"): 107 | if isinstance(other, Vector): 108 | return all(a >= b for a, b in safezip(self, other)) 109 | return NotImplemented 110 | 111 | def __eq__(self, other: "Vector"): 112 | if isinstance(other, Vector): 113 | return all(a == b for a, b in safezip(self, other)) 114 | return NotImplemented 115 | 116 | def __sub__(self, other: "Vector"): 117 | if isinstance(other, Vector): 118 | return Vector((a - b) for a, b in safezip(self, other)) 119 | raise NotImplementedError() 120 | 121 | def __repr__(self) -> str: 122 | return "(%s)" % ", ".join(str(k) for k in self) 123 | -------------------------------------------------------------------------------- /reladiff_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erezsh/reladiff/f613504212d1f0a40e650238e25b19810f523825/tests/__init__.py -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import string 4 | import random 5 | from typing import Callable 6 | import unittest 7 | import logging 8 | import subprocess 9 | 10 | from parameterized import parameterized_class 11 | 12 | from sqeleton.queries import table 13 | from sqeleton.databases import Database 14 | 15 | from reladiff import databases as db 16 | from reladiff import connect 17 | from reladiff.table_segment import TableSegment 18 | from reladiff.query_utils import drop_table 19 | 20 | # We write 'or None' because Github sometimes creates empty env vars for secrets 21 | TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql" 22 | TEST_POSTGRESQL_CONN_STRING: str = "postgresql://postgres:Password1@localhost/postgres" 23 | TEST_SNOWFLAKE_CONN_STRING: str = os.environ.get("SNOWFLAKE_URI") or None 24 | TEST_PRESTO_CONN_STRING: str = os.environ.get("PRESTO_URI") 
or None 25 | TEST_BIGQUERY_CONN_STRING: str = os.environ.get("BIGQUERY_URI") or None 26 | TEST_REDSHIFT_CONN_STRING: str = os.environ.get("REDSHIFT_URI") or None 27 | TEST_ORACLE_CONN_STRING: str = None 28 | TEST_DATABRICKS_CONN_STRING: str = os.environ.get("DATABRICKS_URI") 29 | TEST_TRINO_CONN_STRING: str = os.environ.get("TRINO_URI") or None 30 | # clickhouse uri for provided docker - "clickhouse://clickhouse:Password1@localhost:9000/clickhouse" 31 | TEST_CLICKHOUSE_CONN_STRING: str = os.environ.get("CLICKHOUSE_URI") 32 | # vertica uri provided for docker - "vertica://vertica:Password1@localhost:5433/vertica" 33 | TEST_VERTICA_CONN_STRING: str = os.environ.get("VERTICA_URI") 34 | TEST_DUCKDB_CONN_STRING: str = "duckdb://main:@:memory:" 35 | 36 | 37 | DEFAULT_N_SAMPLES = 50 38 | N_SAMPLES = int(os.environ.get("N_SAMPLES", DEFAULT_N_SAMPLES)) 39 | BENCHMARK = os.environ.get("BENCHMARK", False) 40 | N_THREADS = int(os.environ.get("N_THREADS", 1)) 41 | TEST_ACROSS_ALL_DBS = os.environ.get("TEST_ACROSS_ALL_DBS", True) # Should we run the full db<->db test suite? 42 | 43 | 44 | def get_git_revision_short_hash() -> str: 45 | return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("ascii").strip() 46 | 47 | 48 | GIT_REVISION = get_git_revision_short_hash() 49 | 50 | level = logging.ERROR 51 | if os.environ.get("LOG_LEVEL", False): 52 | level = getattr(logging, os.environ["LOG_LEVEL"].upper()) 53 | 54 | logging.basicConfig(level=level) 55 | logging.getLogger("hashdiff_tables").setLevel(level) 56 | logging.getLogger("joindiff_tables").setLevel(level) 57 | logging.getLogger("diff_tables").setLevel(level) 58 | logging.getLogger("table_segment").setLevel(level) 59 | logging.getLogger("database").setLevel(level) 60 | 61 | try: 62 | from .local_settings import * 63 | except ImportError: 64 | pass # No local settings 65 | 66 | 67 | CONN_STRINGS = { 68 | db.BigQuery: TEST_BIGQUERY_CONN_STRING, 69 | db.MySQL: TEST_MYSQL_CONN_STRING, 70 | db.PostgreSQL: TEST_POSTGRESQL_CONN_STRING, 71 | db.Snowflake: TEST_SNOWFLAKE_CONN_STRING, 72 | db.Redshift: TEST_REDSHIFT_CONN_STRING, 73 | db.Oracle: TEST_ORACLE_CONN_STRING, 74 | db.Presto: TEST_PRESTO_CONN_STRING, 75 | db.Databricks: TEST_DATABRICKS_CONN_STRING, 76 | db.Trino: TEST_TRINO_CONN_STRING, 77 | db.Clickhouse: TEST_CLICKHOUSE_CONN_STRING, 78 | db.Vertica: TEST_VERTICA_CONN_STRING, 79 | db.DuckDB: TEST_DUCKDB_CONN_STRING, 80 | } 81 | 82 | _database_instances = {} 83 | 84 | 85 | def get_conn(cls: type, shared: bool = True) -> Database: 86 | if shared: 87 | if cls not in _database_instances: 88 | _database_instances[cls] = get_conn(cls, shared=False) 89 | return _database_instances[cls] 90 | 91 | return connect(CONN_STRINGS[cls], N_THREADS) 92 | 93 | 94 | def _print_used_dbs(): 95 | used = {k.__name__ for k, v in CONN_STRINGS.items() if v is not None} 96 | unused = {k.__name__ for k, v in CONN_STRINGS.items() if v is None} 97 | 98 | print(f"Testing databases: {', '.join(used)}") 99 | if unused: 100 | logging.info(f"Connection not configured; skipping tests for: {', '.join(unused)}") 101 | if TEST_ACROSS_ALL_DBS: 102 | logging.info( 103 | f"Full tests enabled (every db<->db). May take very long when many dbs are involved. 
={TEST_ACROSS_ALL_DBS}" 104 | ) 105 | 106 | 107 | _print_used_dbs() 108 | CONN_STRINGS = {k: v for k, v in CONN_STRINGS.items() if v is not None} 109 | 110 | 111 | def random_table_suffix() -> str: 112 | char_set = string.ascii_lowercase + string.digits 113 | suffix = "_" 114 | suffix += "".join(random.choice(char_set) for _ in range(5)) 115 | return suffix 116 | 117 | 118 | def str_to_checksum(str: str): 119 | # hello world 120 | # => 5eb63bbbe01eeed093cb22bb8f5acdc3 121 | # => cb22bb8f5acdc3 122 | # => 273350391345368515 123 | m = hashlib.md5() 124 | m.update(str.encode("utf-8")) # encode to binary 125 | md5 = m.hexdigest() 126 | # 0-indexed, unlike DBs which are 1-indexed here, so +1 in dbs 127 | half_pos = db.MD5_HEXDIGITS - db.CHECKSUM_HEXDIGITS 128 | return int(md5[half_pos:], 16) 129 | 130 | 131 | class DiffTestCase(unittest.TestCase): 132 | "Sets up two tables for diffing" 133 | db_cls = None 134 | src_schema = None 135 | dst_schema = None 136 | shared_connection = True 137 | 138 | def setUp(self): 139 | assert self.db_cls, self.db_cls 140 | 141 | self.connection = get_conn(self.db_cls, self.shared_connection) 142 | 143 | table_suffix = random_table_suffix() 144 | self.table_src_name = f"src{table_suffix}" 145 | self.table_dst_name = f"dst{table_suffix}" 146 | 147 | self.table_src_path = self.connection.parse_table_name(self.table_src_name) 148 | self.table_dst_path = self.connection.parse_table_name(self.table_dst_name) 149 | 150 | drop_table(self.connection, self.table_src_path) 151 | drop_table(self.connection, self.table_dst_path) 152 | 153 | self.src_table = table(self.table_src_path, schema=self.src_schema) 154 | self.dst_table = table(self.table_dst_path, schema=self.dst_schema) 155 | if self.src_schema: 156 | self.connection.query(self.src_table.create()) 157 | if self.dst_schema: 158 | self.connection.query(self.dst_table.create()) 159 | 160 | return super().setUp() 161 | 162 | def tearDown(self): 163 | drop_table(self.connection, self.table_src_path) 164 | drop_table(self.connection, self.table_dst_path) 165 | 166 | 167 | def _parameterized_class_per_conn(test_databases): 168 | test_databases = set(test_databases) 169 | names = [(cls.__name__, cls) for cls in CONN_STRINGS if cls in test_databases] 170 | return parameterized_class(("name", "db_cls"), names) 171 | 172 | 173 | def test_each_database_in_list(databases) -> Callable: 174 | def _test_per_database(cls): 175 | return _parameterized_class_per_conn(databases)(cls) 176 | 177 | return _test_per_database 178 | 179 | 180 | def table_segment(database, table_path, key_columns, *args, **kw): 181 | if isinstance(key_columns, str): 182 | key_columns = (key_columns,) 183 | return TableSegment(database, table_path, key_columns, *args, **kw) 184 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from reladiff import diff_tables, connect_to_table, Algorithm 4 | from reladiff.databases import MySQL 5 | from sqeleton.queries import commit 6 | 7 | from .common import TEST_MYSQL_CONN_STRING, DiffTestCase 8 | 9 | 10 | class TestApi(DiffTestCase): 11 | src_schema = {"id": int, "datetime": datetime, "text_comment": str} 12 | db_cls = MySQL 13 | 14 | def setUp(self) -> None: 15 | super().setUp() 16 | 17 | self.conn = self.connection 18 | 19 | self.now = now = datetime.now() 20 | 21 | rows = [ 22 | (now, "now"), 23 | (self.now - timedelta(seconds=10), 
"a"), 24 | (self.now - timedelta(seconds=7), "b"), 25 | (self.now - timedelta(seconds=6), "c"), 26 | ] 27 | 28 | self.conn.query( 29 | [ 30 | self.src_table.insert_rows((i, ts, s) for i, (ts, s) in enumerate(rows)), 31 | self.dst_table.create(self.src_table), 32 | self.src_table.insert_row(len(rows), self.now - timedelta(seconds=3), "3 seconds ago"), 33 | commit, 34 | ] 35 | ) 36 | 37 | def test_api(self): 38 | # test basic 39 | t1 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_src_name) 40 | t2 = connect_to_table(TEST_MYSQL_CONN_STRING, (self.table_dst_name,)) 41 | diff = list(diff_tables(t1, t2, algorithm=Algorithm.JOINDIFF)) 42 | assert len(diff) == 1 43 | 44 | # test algorithm 45 | # (also tests shared connection on connect_to_table) 46 | for algo in (Algorithm.HASHDIFF, Algorithm.JOINDIFF): 47 | diff = list(diff_tables(t1, t2, algorithm=algo)) 48 | assert len(diff) == 1 49 | 50 | # test where 51 | diff_id = diff[0][1][0] 52 | where = f"id != {diff_id} OR id = 90000000" 53 | 54 | t1 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_src_name, where=where) 55 | t2 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_dst_name, where=where) 56 | diff = list(diff_tables(t1, t2)) 57 | assert len(diff) == 0 58 | 59 | # test close and empty tables 60 | diff = diff_tables(t1, t2, allow_empty_tables=True) # Make sure the API exists 61 | diff.close() 62 | 63 | # test context manager 64 | with diff_tables(t1, t2) as diff: 65 | assert len(list(diff)) == 0 66 | 67 | 68 | def test_api_get_stats_dict(self): 69 | # XXX Likely to change in the future 70 | expected_dict = { 71 | "rows_A": 5, 72 | "rows_B": 4, 73 | "exclusive_A": 1, 74 | "exclusive_B": 0, 75 | "updated": 0, 76 | "unchanged": 4, 77 | "total": 1, 78 | # "stats": {"rows_downloaded": 5}, 79 | } 80 | t1 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_src_name) 81 | t2 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_dst_name) 82 | diff = diff_tables(t1, t2) 83 | 84 | output = diff.get_stats_dict() 85 | output.pop("stats") 86 | self.assertEqual(expected_dict, output) 87 | self.assertIsNotNone(diff) 88 | assert len(list(diff)) == 1 89 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import sys 4 | from datetime import datetime, timedelta 5 | 6 | from sqeleton.queries import commit, current_timestamp 7 | 8 | from .common import DiffTestCase, CONN_STRINGS 9 | from .test_diff_tables import test_each_database 10 | 11 | 12 | def run_reladiff_cli(*args): 13 | try: 14 | stdout = subprocess.check_output([sys.executable, "-m", "reladiff"] + list(args), stderr=subprocess.PIPE) 15 | except subprocess.CalledProcessError as e: 16 | logging.error(e.stderr) 17 | raise 18 | return stdout.splitlines() 19 | 20 | 21 | @test_each_database 22 | class TestCLI(DiffTestCase): 23 | src_schema = {"id": int, "datetime": datetime, "text_comment": str} 24 | 25 | def setUp(self) -> None: 26 | super().setUp() 27 | 28 | now = self.connection.query(current_timestamp(), datetime) 29 | 30 | rows = [ 31 | (now, "now"), 32 | (now - timedelta(seconds=10), "a"), 33 | (now - timedelta(seconds=7), "b"), 34 | (now - timedelta(seconds=6), "c"), 35 | ] 36 | 37 | self.connection.query( 38 | [ 39 | self.src_table.insert_rows((i, ts, s) for i, (ts, s) in enumerate(rows)), 40 | self.dst_table.create(self.src_table), 41 | self.src_table.insert_row(len(rows), now - timedelta(seconds=3), "3 
seconds ago"), 42 | commit, 43 | ] 44 | ) 45 | 46 | def test_basic(self): 47 | conn_str = CONN_STRINGS[self.db_cls] 48 | diff = run_reladiff_cli(conn_str, self.table_src_name, conn_str, self.table_dst_name) 49 | assert len(diff) == 1 50 | 51 | def test_options(self): 52 | conn_str = CONN_STRINGS[self.db_cls] 53 | diff = run_reladiff_cli( 54 | conn_str, 55 | self.table_src_name, 56 | conn_str, 57 | self.table_dst_name, 58 | "--bisection-factor", 59 | "16", 60 | "--bisection-threshold", 61 | "10000", 62 | "--limit", 63 | "5", 64 | "-t", 65 | "datetime", 66 | "--max-age", 67 | "1h", 68 | "--allow-empty-tables" 69 | ) 70 | assert len(diff) == 1, diff 71 | 72 | 73 | @test_each_database 74 | class TestCLI_CaseSensitive(DiffTestCase): 75 | src_schema = {"ID": int, "Datetime": datetime, "Text_Comment": str} 76 | 77 | def setUp(self) -> None: 78 | super().setUp() 79 | 80 | now = self.connection.query(current_timestamp(), datetime) 81 | 82 | rows = [ 83 | (now, "now"), 84 | (now - timedelta(seconds=10), "a"), 85 | (now - timedelta(seconds=7), "b"), 86 | (now - timedelta(seconds=6), "c"), 87 | ] 88 | 89 | self.connection.query( 90 | [ 91 | self.src_table.insert_rows((i, ts, s) for i, (ts, s) in enumerate(rows)), 92 | self.dst_table.create(self.src_table), 93 | self.src_table.insert_row(len(rows), now - timedelta(seconds=3), "3 seconds ago"), 94 | commit, 95 | ] 96 | ) 97 | 98 | def test_cli_case_sensitive(self): 99 | conn_str = CONN_STRINGS[self.db_cls] 100 | diff = run_reladiff_cli(conn_str, self.table_src_name, conn_str, self.table_dst_name) 101 | assert len(diff) == 1 -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from reladiff.config import apply_config_from_string, ConfigParseError 5 | from reladiff.utils import remove_password_from_url 6 | 7 | 8 | class TestConfig(unittest.TestCase): 9 | def test_basic(self): 10 | config = r""" 11 | [database.test_postgresql] 12 | driver = "postgresql" 13 | user = "postgres" 14 | password = "Password1" 15 | 16 | [run.default] 17 | update_column = "timestamp" 18 | verbose = true 19 | threads = 2 20 | 21 | [run.pg_pg] 22 | threads = 4 23 | 1.database = "test_postgresql" 24 | 1.table = "rating" 25 | 1.threads = 11 26 | 2.database = "postgresql://postgres:Password1@/" 27 | 2.table = "rating_del1" 28 | 2.threads = 22 29 | """ 30 | self.assertRaises(ConfigParseError, apply_config_from_string, config, "bla", {}) # No such run 31 | 32 | res = apply_config_from_string(config, "pg_pg", {}) 33 | assert res["update_column"] == "timestamp" # default 34 | assert res["verbose"] is True 35 | assert res["threads"] == 4 # overwritten by pg_pg 36 | assert res["database1"] == {"driver": "postgresql", "user": "postgres", "password": "Password1"} 37 | assert res["database2"] == "postgresql://postgres:Password1@/" 38 | assert res["table1"] == "rating" 39 | assert res["table2"] == "rating_del1" 40 | assert res["threads1"] == 11 41 | assert res["threads2"] == 22 42 | 43 | res = apply_config_from_string(config, "pg_pg", {"update_column": "foo", "table2": "bar"}) 44 | assert res["update_column"] == "foo" 45 | assert res["table2"] == "bar" 46 | 47 | def test_remove_password(self): 48 | replace_with = "*****" 49 | urls = [ 50 | "d://host/", 51 | "d://host:123/", 52 | "d://user@host:123/", 53 | "d://user:PASS@host:123/", 54 | "d://:PASS@host:123/", 55 | "d://:PASS@host:123/path", 56 | 
"d://:PASS@host:123/path?whatever#blabla", 57 | ] 58 | for url in urls: 59 | removed = remove_password_from_url(url, replace_with) 60 | expected = url.replace("PASS", replace_with) 61 | removed = remove_password_from_url(url, replace_with) 62 | self.assertEqual(removed, expected) 63 | 64 | def test_embed_env(self): 65 | env = { 66 | "DRIVER": "postgresql", 67 | "USER": "postgres", 68 | "PASSWORD": "Password1", 69 | "RUN_PG_1_DATABASE": "test_postgresql", 70 | "RUN_PG_1_TABLE": "rating", 71 | "RUN_PG_2_DATABASE": "postgresql://postgres:Password1@/", 72 | "RUN_PG_2_TABLE": "rating_del1", 73 | } 74 | config = r""" 75 | [database.test_postgresql] 76 | driver = "${DRIVER}" 77 | user = "${USER}" 78 | password = "${PASSWORD}" 79 | 80 | [run.default] 81 | update_column = "${UPDATE_COLUMN}" 82 | verbose = true 83 | threads = 2 84 | 85 | [run.pg_pg] 86 | threads = 4 87 | 1.database = "${RUN_PG_1_DATABASE}" 88 | 1.table = "${RUN_PG_1_TABLE}" 89 | 1.threads = 11 90 | 2.database = "${RUN_PG_2_DATABASE}" 91 | 2.table = "${RUN_PG_2_TABLE}" 92 | 2.threads = 22 93 | """ 94 | 95 | os.environ.update(env) 96 | res = apply_config_from_string(config, "pg_pg", {}) 97 | assert res["update_column"] == "" # missing env var 98 | assert res["verbose"] is True 99 | assert res["threads"] == 4 # overwritten by pg_pg 100 | assert res["database1"] == {"driver": "postgresql", "user": "postgres", "password": "Password1"} 101 | assert res["database2"] == "postgresql://postgres:Password1@/" 102 | assert res["table1"] == "rating" 103 | assert res["table2"] == "rating_del1" 104 | assert res["threads1"] == 11 105 | assert res["threads2"] == 22 106 | -------------------------------------------------------------------------------- /tests/test_joindiff.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from datetime import datetime 3 | 4 | from sqeleton.queries.ast_classes import TablePath 5 | from sqeleton.queries import table, commit 6 | from reladiff.table_segment import TableSegment 7 | from reladiff import databases as db 8 | from reladiff.joindiff_tables import JoinDiffer 9 | 10 | from .test_diff_tables import DiffTestCase 11 | 12 | from .common import ( 13 | random_table_suffix, 14 | test_each_database_in_list, 15 | ) 16 | 17 | 18 | TEST_DATABASES = { 19 | db.PostgreSQL, 20 | db.MySQL, 21 | db.Snowflake, 22 | db.BigQuery, 23 | db.Oracle, 24 | db.Redshift, 25 | db.Presto, 26 | db.Trino, 27 | db.Vertica, 28 | } 29 | 30 | test_each_database = test_each_database_in_list(TEST_DATABASES) 31 | 32 | 33 | @test_each_database_in_list({db.Snowflake, db.BigQuery}) 34 | class TestCompositeKey(DiffTestCase): 35 | src_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime} 36 | dst_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime} 37 | 38 | def setUp(self): 39 | super().setUp() 40 | 41 | self.differ = JoinDiffer() 42 | 43 | def test_composite_key(self): 44 | time = "2022-01-01 00:00:00" 45 | time_obj = datetime.fromisoformat(time) 46 | 47 | cols = "id userid movieid rating timestamp".split() 48 | 49 | self.connection.query( 50 | [ 51 | self.src_table.insert_rows([[1, 1, 1, 9, time_obj], [2, 2, 2, 9, time_obj]], columns=cols), 52 | self.dst_table.insert_rows([[1, 1, 1, 9, time_obj], [2, 3, 2, 9, time_obj]], columns=cols), 53 | commit, 54 | ] 55 | ) 56 | 57 | # Sanity 58 | table1 = TableSegment( 59 | self.connection, self.table_src_path, ("id",), "timestamp", ("userid",), case_sensitive=False 60 | ) 
61 | table2 = TableSegment( 62 | self.connection, self.table_dst_path, ("id",), "timestamp", ("userid",), case_sensitive=False 63 | ) 64 | diff = list(self.differ.diff_tables(table1, table2)) 65 | assert len(diff) == 2 66 | assert self.differ.stats["exclusive_count"] == 0 67 | 68 | # Test pks diffed, by checking exclusive_count 69 | table1 = TableSegment(self.connection, self.table_src_path, ("id", "userid"), "timestamp", case_sensitive=False) 70 | table2 = TableSegment(self.connection, self.table_dst_path, ("id", "userid"), "timestamp", case_sensitive=False) 71 | diff = list(self.differ.diff_tables(table1, table2)) 72 | assert len(diff) == 2 73 | assert self.differ.stats["exclusive_count"] == 2 74 | 75 | 76 | @test_each_database 77 | class TestJoindiff(DiffTestCase): 78 | src_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime} 79 | dst_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime} 80 | 81 | def setUp(self): 82 | super().setUp() 83 | 84 | self.table = TableSegment(self.connection, self.table_src_path, ("id",), "timestamp", case_sensitive=False) 85 | self.table2 = TableSegment(self.connection, self.table_dst_path, ("id",), "timestamp", case_sensitive=False) 86 | 87 | self.differ = JoinDiffer() 88 | 89 | def test_diff_small_tables(self): 90 | time = "2022-01-01 00:00:00" 91 | time_obj = datetime.fromisoformat(time) 92 | 93 | cols = "id userid movieid rating timestamp".split() 94 | 95 | self.connection.query( 96 | [ 97 | self.src_table.insert_rows([[1, 1, 1, 9, time_obj], [2, 2, 2, 9, time_obj]], columns=cols), 98 | self.dst_table.insert_rows([[1, 1, 1, 9, time_obj]], columns=cols), 99 | commit, 100 | ] 101 | ) 102 | 103 | diff_res = self.differ.diff_tables(self.table, self.table2) 104 | info = diff_res.info_tree.info 105 | diff = list(diff_res) 106 | 107 | expected_row = ("2", time + ".000000") 108 | expected = [("-", expected_row)] 109 | self.assertEqual(expected, diff) 110 | self.assertEqual(2, info.rowcounts[1]) 111 | self.assertEqual(1, info.rowcounts[2]) 112 | # self.assertEqual(2, self.differ.stats["table1_max_id"]) 113 | # self.assertEqual(1, self.differ.stats["table2_min_id"]) 114 | 115 | # Test materialize 116 | materialize_path = self.connection.parse_table_name(f"test_mat_{random_table_suffix()}") 117 | mdiffer = self.differ.replace(materialize_to_table=materialize_path) 118 | diff = list(mdiffer.diff_tables(self.table, self.table2)) 119 | self.assertEqual(expected, diff) 120 | 121 | t = TablePath(materialize_path) 122 | rows = self.connection.query(t.select(), List[tuple]) 123 | # is_xa, is_xb, is_diff1, is_diff2, row1, row2 124 | # assert rows == [(1, 0, 1, 1) + expected_row + (None, None)], rows 125 | assert rows == [(1, 0, 1, 1) + (expected_row[0], None, expected_row[1], None)], rows 126 | self.connection.query(t.drop()) 127 | 128 | # Test materialize all rows 129 | mdiffer = mdiffer.replace(materialize_all_rows=True) 130 | diff = list(mdiffer.diff_tables(self.table, self.table2)) 131 | self.assertEqual(expected, diff) 132 | rows = self.connection.query(t.select(), List[tuple]) 133 | assert len(rows) == 2, len(rows) 134 | self.connection.query(t.drop()) 135 | 136 | def test_diff_table_above_bisection_threshold(self): 137 | time = "2022-01-01 00:00:00" 138 | time_obj = datetime.fromisoformat(time) 139 | 140 | cols = "id userid movieid rating timestamp".split() 141 | 142 | self.connection.query( 143 | [ 144 | self.src_table.insert_rows( 145 | [ 146 | [1, 1, 1, 9, time_obj], 147 | [2, 2, 2, 9, 
time_obj], 148 | [3, 3, 3, 9, time_obj], 149 | [4, 4, 4, 9, time_obj], 150 | [5, 5, 5, 9, time_obj], 151 | ], 152 | columns=cols, 153 | ), 154 | self.dst_table.insert_rows( 155 | [ 156 | [1, 1, 1, 9, time_obj], 157 | [2, 2, 2, 9, time_obj], 158 | [3, 3, 3, 9, time_obj], 159 | [4, 4, 4, 9, time_obj], 160 | ], 161 | columns=cols, 162 | ), 163 | commit, 164 | ] 165 | ) 166 | 167 | diff_res = self.differ.diff_tables(self.table, self.table2) 168 | info = diff_res.info_tree.info 169 | diff = list(diff_res) 170 | expected = [("-", ("5", time + ".000000"))] 171 | self.assertEqual(expected, diff) 172 | self.assertEqual(5, info.rowcounts[1]) 173 | self.assertEqual(4, info.rowcounts[2]) 174 | 175 | def test_return_empty_array_when_same(self): 176 | time = "2022-01-01 00:00:00" 177 | time_obj = datetime.fromisoformat(time) 178 | 179 | cols = "id userid movieid rating timestamp".split() 180 | 181 | self.connection.query( 182 | [ 183 | self.src_table.insert_row(1, 1, 1, 9, time_obj, columns=cols), 184 | self.dst_table.insert_row(1, 1, 1, 9, time_obj, columns=cols), 185 | ] 186 | ) 187 | 188 | diff = list(self.differ.diff_tables(self.table, self.table2)) 189 | self.assertEqual([], diff) 190 | 191 | def test_diff_sorted_by_key(self): 192 | time = "2022-01-01 00:00:00" 193 | time2 = "2021-01-01 00:00:00" 194 | 195 | time_obj = datetime.fromisoformat(time) 196 | time_obj2 = datetime.fromisoformat(time2) 197 | 198 | cols = "id userid movieid rating timestamp".split() 199 | 200 | self.connection.query( 201 | [ 202 | self.src_table.insert_rows( 203 | [ 204 | [1, 1, 1, 9, time_obj], 205 | [2, 2, 2, 9, time_obj2], 206 | [3, 3, 3, 9, time_obj], 207 | [4, 4, 4, 9, time_obj2], 208 | [5, 5, 5, 9, time_obj], 209 | ], 210 | columns=cols, 211 | ), 212 | self.dst_table.insert_rows( 213 | [ 214 | [1, 1, 1, 9, time_obj], 215 | [2, 2, 2, 9, time_obj], 216 | [3, 3, 3, 9, time_obj], 217 | [4, 4, 4, 9, time_obj], 218 | [5, 5, 5, 9, time_obj], 219 | ], 220 | columns=cols, 221 | ), 222 | commit, 223 | ] 224 | ) 225 | 226 | diff = list(self.differ.diff_tables(self.table, self.table2)) 227 | expected = { 228 | ("-", ("2", time2 + ".000000")), 229 | ("+", ("2", time + ".000000")), 230 | ("-", ("4", time2 + ".000000")), 231 | ("+", ("4", time + ".000000")), 232 | } 233 | self.assertEqual(expected, set(diff)) 234 | keys = [k for _, (k, _) in diff] 235 | assert keys[0] == keys[1] and keys[2] == keys[3] # same keys 236 | 237 | def test_dup_pks(self): 238 | time = "2022-01-01 00:00:00" 239 | time_obj = datetime.fromisoformat(time) 240 | 241 | cols = "id rating timestamp".split() 242 | 243 | self.connection.query( 244 | [ 245 | self.src_table.insert_rows([[1, 9, time_obj], [1, 10, time_obj]], columns=cols), 246 | self.dst_table.insert_row(1, 9, time_obj, columns=cols), 247 | ] 248 | ) 249 | 250 | x = self.differ.diff_tables(self.table, self.table2) 251 | self.assertRaises(ValueError, list, x) 252 | 253 | def test_null_pks(self): 254 | time = "2022-01-01 00:00:00" 255 | time_obj = datetime.fromisoformat(time) 256 | 257 | cols = "id rating timestamp".split() 258 | 259 | self.connection.query( 260 | [ 261 | self.src_table.insert_row(None, 9, time_obj, columns=cols), 262 | self.dst_table.insert_row(1, 9, time_obj, columns=cols), 263 | ] 264 | ) 265 | 266 | x = self.differ.diff_tables(self.table, self.table2) 267 | self.assertRaises(ValueError, list, x) 268 | 269 | 270 | @test_each_database_in_list(d for d in TEST_DATABASES if d.dialect.SUPPORTS_PRIMARY_KEY and d.SUPPORTS_UNIQUE_CONSTAINT) 271 | class TestUniqueConstraint(DiffTestCase): 272 
| def setUp(self): 273 | super().setUp() 274 | 275 | self.src_table = table( 276 | self.table_src_path, 277 | schema={"id": int, "userid": int, "movieid": int, "rating": float}, 278 | ) 279 | self.dst_table = table( 280 | self.table_dst_path, 281 | schema={"id": int, "userid": int, "movieid": int, "rating": float}, 282 | ) 283 | 284 | self.connection.query( 285 | [self.src_table.create(primary_keys=["id"]), self.dst_table.create(primary_keys=["id", "userid"]), commit] 286 | ) 287 | 288 | self.differ = JoinDiffer() 289 | 290 | def test_unique_constraint(self): 291 | self.connection.query( 292 | [ 293 | self.src_table.insert_rows([[1, 1, 1, 9], [2, 2, 2, 9]]), 294 | self.dst_table.insert_rows([[1, 1, 1, 9], [2, 2, 2, 9]]), 295 | commit, 296 | ] 297 | ) 298 | 299 | # Test no active validation 300 | table = TableSegment(self.connection, self.table_src_path, ("id",), case_sensitive=False) 301 | table2 = TableSegment(self.connection, self.table_dst_path, ("id",), case_sensitive=False) 302 | 303 | res = list(self.differ.diff_tables(table, table2)) 304 | assert not res 305 | assert "validated_unique_keys" not in self.differ.stats 306 | 307 | # Test active validation 308 | table = TableSegment(self.connection, self.table_src_path, ("userid",), case_sensitive=False) 309 | table2 = TableSegment(self.connection, self.table_dst_path, ("userid",), case_sensitive=False) 310 | 311 | res = list(self.differ.diff_tables(table, table2)) 312 | assert not res 313 | self.assertEqual(self.differ.stats["validated_unique_keys"], [["userid"]]) 314 | -------------------------------------------------------------------------------- /tests/test_parse_time.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from datetime import timedelta 4 | 5 | from reladiff.parse_time import parse_time_delta 6 | 7 | 8 | class TestParseTime(unittest.TestCase): 9 | def setUp(self): 10 | pass 11 | 12 | def test_times(self): 13 | td = parse_time_delta("1w2d3h4min5s") 14 | assert td == timedelta(weeks=1, days=2, hours=3, minutes=4, seconds=5) 15 | 16 | assert parse_time_delta("1y") == timedelta(days=365) 17 | assert parse_time_delta("1mon") == timedelta(days=30) 18 | 19 | self.assertRaises(ValueError, parse_time_delta, "") 20 | self.assertRaises(ValueError, parse_time_delta, "1y1year") 21 | self.assertRaises(ValueError, parse_time_delta, "1x") 22 | -------------------------------------------------------------------------------- /tests/test_postgresql.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from sqeleton.queries import table, commit 4 | 5 | from reladiff import TableSegment, HashDiffer 6 | from reladiff import databases as db 7 | from .common import get_conn, random_table_suffix 8 | 9 | 10 | class TestUUID(unittest.TestCase): 11 | def setUp(self) -> None: 12 | self.connection = get_conn(db.PostgreSQL) 13 | 14 | table_suffix = random_table_suffix() 15 | 16 | self.table_src_name = f"src{table_suffix}" 17 | self.table_dst_name = f"dst{table_suffix}" 18 | 19 | self.table_src = table(self.table_src_name) 20 | self.table_dst = table(self.table_dst_name) 21 | 22 | def test_uuid(self): 23 | self.connection.query('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";', None) 24 | 25 | queries = [ 26 | self.table_src.drop(True), 27 | self.table_dst.drop(True), 28 | f"CREATE TABLE {self.table_src_name} (id uuid DEFAULT uuid_generate_v4 (), comment VARCHAR, PRIMARY KEY (id))", 29 | commit, 30 | self.table_src.insert_rows([[i] for i 
in range(100)], columns=["comment"]), 31 | commit, 32 | self.table_dst.create(self.table_src), 33 | commit, 34 | self.table_src.insert_row("This one is different", columns=["comment"]), 35 | commit, 36 | ] 37 | 38 | for query in queries: 39 | self.connection.query(query) 40 | 41 | a = TableSegment(self.connection, self.table_src.path, ("id",), "comment") 42 | b = TableSegment(self.connection, self.table_dst.path, ("id",), "comment") 43 | 44 | differ = HashDiffer() 45 | diff = list(differ.diff_tables(a, b)) 46 | uuid = diff[0][1][0] 47 | self.assertEqual(diff, [("-", (uuid, "This one is different"))]) 48 | 49 | # Compare with MySql 50 | mysql_conn = get_conn(db.MySQL) 51 | 52 | rows = self.connection.query(self.table_src.select(), list) 53 | 54 | queries = [ 55 | f"CREATE TABLE {self.table_dst_name} (id VARCHAR(128), comment VARCHAR(128))", 56 | commit, 57 | self.table_dst.insert_rows(rows, columns=["id", "comment"]), 58 | commit, 59 | ] 60 | 61 | for q in queries: 62 | mysql_conn.query(q) 63 | 64 | c = TableSegment(mysql_conn, (self.table_dst_name,), ("id",), "comment") 65 | diff = list(differ.diff_tables(a, c)) 66 | assert not diff, diff 67 | diff = list(differ.diff_tables(c, a)) 68 | assert not diff, diff 69 | 70 | self.connection.query(self.table_src.drop(True)) 71 | self.connection.query(self.table_dst.drop(True)) 72 | mysql_conn.query(self.table_dst.drop(True)) 73 | 74 | 75 | class Test100Fields(unittest.TestCase): 76 | def setUp(self) -> None: 77 | self.connection = get_conn(db.PostgreSQL) 78 | 79 | table_suffix = random_table_suffix() 80 | 81 | self.table_src_name = f"src{table_suffix}" 82 | self.table_dst_name = f"dst{table_suffix}" 83 | 84 | self.table_src = table(self.table_src_name) 85 | self.table_dst = table(self.table_dst_name) 86 | 87 | def test_100_fields(self): 88 | self.connection.query('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";', None) 89 | 90 | columns = [f"col{i}" for i in range(100)] 91 | fields = " ,".join(f'"{field}" TEXT' for field in columns) 92 | 93 | queries = [ 94 | self.table_src.drop(True), 95 | self.table_dst.drop(True), 96 | f"CREATE TABLE {self.table_src_name} (id uuid DEFAULT uuid_generate_v4 (), {fields})", 97 | commit, 98 | self.table_src.insert_rows([[f"{x * y}" for x in range(100)] for y in range(10)], columns=columns), 99 | commit, 100 | self.table_dst.create(self.table_src), 101 | commit, 102 | self.table_src.insert_rows([[1 for x in range(100)]], columns=columns), 103 | commit, 104 | ] 105 | 106 | for query in queries: 107 | self.connection.query(query) 108 | 109 | a = TableSegment(self.connection, self.table_src.path, ("id",), extra_columns=tuple(columns)) 110 | b = TableSegment(self.connection, self.table_dst.path, ("id",), extra_columns=tuple(columns)) 111 | 112 | differ = HashDiffer() 113 | diff = list(differ.diff_tables(a, b)) 114 | id_ = diff[0][1][0] 115 | result = (id_,) + tuple("1" for x in range(100)) 116 | self.assertEqual(diff, [("-", result)]) 117 | -------------------------------------------------------------------------------- /tests/waiting_for_stack_up.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -n "$VERTICA_URI" ] 4 | then 5 | echo "Check Vertica DB running..." 
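    # Poll the container logs every 10 seconds until Vertica reports that it is running.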
6 | while true 7 | do 8 | if docker logs dd-vertica | tail -n 100 | grep -q -i "vertica is now running" 9 | then 10 | echo "Vertica DB is ready"; 11 | break; 12 | else 13 | echo "Waiting for Vertica DB to start..."; 14 | sleep 10; 15 | fi 16 | done 17 | fi 18 | --------------------------------------------------------------------------------
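
A minimal end-to-end sketch of the public API exercised in tests/test_api.py above. It is illustrative only: the table names "src" and "dst", their "id" key column, and the reuse of the local PostgreSQL dev connection string from tests/common.py are assumptions, not something the repository itself ships.

from reladiff import connect_to_table, diff_tables, Algorithm

CONN = "postgresql://postgres:Password1@localhost/postgres"  # dev connection string from tests/common.py

# Hypothetical tables; both are assumed to exist and to share an "id" key column.
t1 = connect_to_table(CONN, "src")
t2 = connect_to_table(CONN, "dst")

# diff_tables is iterable and usable as a context manager, as TestApi demonstrates.
with diff_tables(t1, t2, algorithm=Algorithm.HASHDIFF) as diff:
    for sign, row in diff:  # sign is "+" or "-", row is a tuple of column values
        print(sign, row)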