├── .dockerignore ├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── request-support-for-a-database.md └── workflows │ ├── ci.yml │ └── ci_full.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── dev ├── Dockerfile.prestosql.340 ├── _bq_import_csv.py ├── benchmark.sh ├── dev.env ├── graph.py ├── prepare_db.pql ├── prepare_db_gaps.pql ├── presto-conf │ └── standalone │ │ ├── catalog │ │ ├── jmx.properties │ │ ├── memory.properties │ │ ├── postgresql.properties │ │ ├── tpcds.properties │ │ └── tpch.properties │ │ ├── config.properties │ │ ├── jvm.config │ │ ├── log.properties │ │ └── node.properties └── trino-conf │ └── etc │ ├── catalog │ ├── jms.properties │ ├── memory.properties │ ├── postgresql.properties │ ├── tpcds.properties │ └── tpch.properties │ ├── config.properties │ ├── jvm.config │ └── node.properties ├── docker-compose.yml ├── docs ├── Makefile ├── conf.py ├── how-to-use.md ├── index.rst ├── install.md ├── make.bat ├── new-database-driver-guide.rst ├── python-api.rst ├── requirements.txt ├── supported-databases.md └── technical-explanation.md ├── poetry.lock ├── pyproject.toml ├── readthedocs.yml ├── reladiff ├── __init__.py ├── __main__.py ├── config.py ├── databases │ ├── __init__.py │ ├── _connect.py │ ├── base.py │ ├── bigquery.py │ ├── clickhouse.py │ ├── databricks.py │ ├── duckdb.py │ ├── mysql.py │ ├── oracle.py │ ├── postgresql.py │ ├── presto.py │ ├── redshift.py │ ├── snowflake.py │ ├── trino.py │ └── vertica.py ├── diff_tables.py ├── hashdiff_tables.py ├── info_tree.py ├── joindiff_tables.py ├── parse_time.py ├── query_utils.py ├── table_segment.py ├── thread_utils.py └── utils.py ├── reladiff_logo.svg └── tests ├── __init__.py ├── common.py ├── test_api.py ├── test_cli.py ├── test_config.py ├── test_database_types.py ├── test_diff_tables.py ├── test_joindiff.py ├── test_parse_time.py ├── test_postgresql.py └── waiting_for_stack_up.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | ml-25m* 3 | dev/ml-25m* 4 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig spec for a consistent cross-editor style. 2 | # Read more: https://EditorConfig.org 3 | 4 | root = true 5 | 6 | [*] 7 | end_of_line = lf # Unix-style newlines with a newline ending every file 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | # 4 space indentation 11 | indent_style = space 12 | indent_size = 4 13 | 14 | [*.{md,py}] 15 | charset = utf-8 16 | 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | Make sure to include the following (minus sensitive information): 14 | - The command or code you used 15 | - The run output + error you're getting. (including tracestack) 16 | - Run reladiff with the `-d` switch for extra debug information. 17 | 18 | If possible, please paste these as text, and not a screenshot. 
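For example, a command like the sketch below (a hypothetical invocation; replace the URIs and table names with your own, and redact any credentials) produces the kind of output that is useful to paste:

```bash
# Hypothetical example invocation; substitute your own connection URIs and table names.
# The -d switch enables debug logging, and `tee` keeps a copy of the full output to attach.
reladiff \
  "postgresql://user:REDACTED@localhost/mydb" my_table \
  "mysql://user:REDACTED@localhost/mydb" my_table \
  -k id -d 2>&1 | tee reladiff_debug.log
```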
19 | 20 | **Describe the environment** 21 | 22 | Describe which OS you're using, which reladiff version, and any other information that might be relevant to this bug. 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/request-support-for-a-database.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request support for a database 3 | about: 'Request a driver to support a new database ' 4 | title: 'Add support for ' 5 | labels: new-db-driver 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI-COVER-VERSIONS 2 | 3 | on: 4 | push: 5 | paths: 6 | - '**.py' 7 | - 'pyproject.toml' 8 | - '.github/workflows/**' 9 | - 'docker-compose.yml' 10 | - '!dev/**' 11 | - '!docs/**' 12 | pull_request: 13 | paths: 14 | - '**.py' 15 | - 'pyproject.toml' 16 | - '.github/workflows/**' 17 | - 'docker-compose.yml' 18 | - '!dev/**' 19 | - '!docs/**' 20 | branches: [ master ] 21 | 22 | workflow_dispatch: 23 | 24 | jobs: 25 | unit_tests: 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | os: [ubuntu-latest] 30 | python-version: 31 | - "3.8" 32 | - "3.9" 33 | - "3.10" 34 | - "3.11" 35 | - "3.12" 36 | 37 | name: Check Python ${{ matrix.python-version }} on ${{ matrix.os }} 38 | runs-on: ${{ matrix.os }} 39 | steps: 40 | - uses: actions/checkout@v3 41 | 42 | - name: Setup Python ${{ matrix.python-version }} 43 | uses: actions/setup-python@v3 44 | with: 45 | python-version: ${{ matrix.python-version }} 46 | 47 | - name: Build the stack 48 | run: docker compose up -d mysql postgres trino clickhouse vertica 49 | 50 | - name: Install Poetry 51 | run: pip install poetry 52 | 53 | - name: Install package 54 | run: "poetry install" 55 | 56 | # BigQuery start 57 | # - id: 'auth' 58 | # uses: 'google-github-actions/auth@v1' 59 | # with: 60 | # credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' 61 | 62 | # - name: 'Set up BigQuery Cloud SDK' 63 | # uses: 'google-github-actions/setup-gcloud@v1' 64 | 65 | # - name: 'Use gcloud CLI' 66 | # run: "gcloud config configurations list" 67 | 68 | # - name: "Install BigQuery for Python" 69 | # run: poetry add google-cloud-bigquery 70 | 71 | # BigQuery end 72 | 73 | - name: Run unit tests 74 | env: 75 | # SNOWFLAKE_URI: '${{ secrets.SNOWFLAKE_URI }}' 76 | # PRESTO_URI: '${{ secrets.PRESTO_URI }}' 77 | TRINO_URI: 'trino://postgres@127.0.0.1:8081/postgresql/public' 78 | # BIGQUERY_URI: '${{ 
secrets.BIGQUERY_URI }}' 79 | CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse' 80 | VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica' 81 | REDSHIFT_URI: '${{ secrets.REDSHIFT_URI }}' 82 | run: | 83 | chmod +x tests/waiting_for_stack_up.sh 84 | ./tests/waiting_for_stack_up.sh && TEST_ACROSS_ALL_DBS=0 poetry run unittest-parallel -j 16 85 | -------------------------------------------------------------------------------- /.github/workflows/ci_full.yml: -------------------------------------------------------------------------------- 1 | name: CI-COVER-DATABASES 2 | 3 | on: 4 | # push: 5 | # paths: 6 | # - '**.py' 7 | # - '.github/workflows/**' 8 | # - '!dev/**' 9 | pull_request: 10 | paths: 11 | - '**.py' 12 | - 'pyproject.toml' 13 | - 'poetry.lock' 14 | - '.github/workflows/**' 15 | - 'docker-compose.yml' 16 | - '!dev/**' 17 | - '!docs/**' 18 | 19 | branches: [ master ] 20 | workflow_dispatch: 21 | 22 | permissions: 23 | id-token: write # This is required for requesting the JWT 24 | contents: read # This is required for actions/checkout 25 | 26 | jobs: 27 | unit_tests: 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | os: [ubuntu-latest] 32 | python-version: 33 | - "3.10" 34 | 35 | name: Check Python ${{ matrix.python-version }} on ${{ matrix.os }} 36 | runs-on: ${{ matrix.os }} 37 | steps: 38 | - uses: actions/checkout@v3 39 | 40 | - name: Setup Python ${{ matrix.python-version }} 41 | uses: actions/setup-python@v3 42 | with: 43 | python-version: ${{ matrix.python-version }} 44 | 45 | - name: Build the stack 46 | run: docker compose up -d mysql postgres trino vertica # presto clickhouse 47 | 48 | - name: Install Poetry 49 | run: pip install poetry 50 | 51 | - name: Install package 52 | run: "poetry install" 53 | 54 | # BigQuery start 55 | # - id: 'auth' 56 | # uses: 'google-github-actions/auth@v1' 57 | # with: 58 | # credentials_json: '${{ secrets.GOOGLE_CREDENTIALS }}' 59 | 60 | # - name: 'Set up BigQuery Cloud SDK' 61 | # uses: 'google-github-actions/setup-gcloud@v1' 62 | 63 | # - name: "Install BigQuery for Python" 64 | # run: poetry add google-cloud-bigquery 65 | 66 | # BigQuery end 67 | 68 | - name: Run unit tests 69 | env: 70 | TRINO_URI: 'trino://postgres@127.0.0.1:8081/postgresql/public' 71 | SNOWFLAKE_URI: '${{ secrets.SNOWFLAKE_URI }}' 72 | # PRESTO_URI: '${{ secrets.PRESTO_URI }}' 73 | # CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse' 74 | VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica' 75 | # BIGQUERY_URI: '${{ secrets.BIGQUERY_URI }}' 76 | REDSHIFT_URI: '${{ secrets.REDSHIFT_URI }}' 77 | run: | 78 | chmod +x tests/waiting_for_stack_up.sh 79 | ./tests/waiting_for_stack_up.sh && poetry run unittest-parallel -j 16 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos 
into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # dev 132 | ml-25m* 133 | ratings*.csv 134 | drive 135 | mysqltuner.pl 136 | benchmark_*.jsonl 137 | benchmark_*.png 138 | 139 | # Mac 140 | .DS_Store 141 | 142 | # IntelliJ 143 | .idea 144 | 145 | # VSCode 146 | .vscode 147 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | Treat everyone with respect and patience. 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Reladiff 2 | 3 | Contributions are very welcome! We'll be happy to help you in the process. 4 | 5 | ## What should I know before I get started? 6 | 7 | Go through the README and the documentation, and make sure that you understand how Reladiff works. 8 | 9 | ## How to contribute? 10 | 11 | ### Reporting bugs 12 | 13 | Please report the bug with as many details as you can. 14 | 15 | 1. Include the exact command that you used. Make sure to run Reladiff with the `-d` flag for debug output. 16 | 2. Provide the entire output of the command. (stdout, logs, exception) 17 | 3. If possible, show us how we could reproduce the bug. i.e. how to set up an environment in which it occurs. 18 | 19 | (When pasting, always make sure to redact sensitive information, like passwords.) 20 | 21 | If Reladiff returns incorrect results, i.e. false-positive or false-negative, please also include the original values. 22 | 23 | Before you report a bug, make sure it doesn't already exist. 24 | 25 | See [issues](/erezsh/reladiff/issues/). 
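If the bug is reproducible against the local docker-compose databases (described in the Development Setup section below), a short script like the following sketch makes it much easier for us to investigate. The table names here are placeholders; use whatever data triggers the problem:

```bash
# A minimal reproduction sketch, assuming the local docker-compose setup described below.
# Table names are hypothetical placeholders.
docker-compose up -d mysql postgres          # start the local test databases
poetry run reladiff \
  "postgresql://postgres:Password1@localhost/postgres" my_table \
  "mysql://mysql:Password1@localhost/mysql" my_table \
  -k id -d                                   # -d prints the debug info we need
```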
26 | 
27 | ### Suggesting Enhancements 
28 | 
29 | We are always interested to hear about how we can make Reladiff better! 
30 | 
31 | If you'd like us to support a new database, you should open an issue for it, if there isn't one already. If it already exists, make sure to vote for it with a :thumbsup:, to help us prioritize it. 
32 | 
33 | The same goes for other technical requests, like missing features, or gaps in the documentation. 
34 | 
35 | See [issues](/erezsh/reladiff/issues/). 
36 | 
37 | For questions, and non-technical discussions, see [discussions](https://github.com/erezsh/reladiff/discussions). 
38 | 
39 | ### Contributing code 
40 | 
41 | #### Code style 
42 | 
43 | All code should be formatted with `black -l 120`. 
44 | 
45 | When in doubt, use the existing code as a guideline, or ask. 
46 | 
47 | #### Get started (setup) 
48 | 
49 | To get started, first clone the repository. For example `git clone https://github.com/erezsh/reladiff`. 
50 | 
51 | Once inside, you can install the dependencies. 
52 | 
53 | - Option 1: Run `poetry install` to install them in a virtual env. You can then run Reladiff using `poetry run reladiff ...`. 
54 | 
55 | - Option 2: Run `pip install -e .` to install them, and Reladiff, in the global context. 
56 | 
57 | At the bare minimum, you need MySQL to run the tests. 
58 | 
59 | You can create a local MySQL instance using `docker-compose up mysql`. The URI for it will be `mysql://mysql:Password1@localhost/mysql`. If you're using a different server, make sure to update `TEST_MYSQL_CONN_STRING` in `tests/common.py`. For your convenience, we recommend creating `tests/local_settings.py` and overriding the value there. 
60 | 
61 | You can also run a few servers at once. For example `docker-compose up mysql postgres presto`. 
62 | 
63 | Make sure to update the appropriate `TEST_*_CONN_STRING`, so that it will be included in the tests. 
64 | 
65 | #### Run the tests 
66 | 
67 | You can run the tests with `unittest`. 
68 | 
69 | When running against multiple databases, the tests can take a long while. 
70 | 
71 | To save time, we recommend running them with `unittest-parallel`. 
72 | 
73 | When debugging, we recommend using the `-f` flag, to stop on the first error. Also, use the `-k` flag to run only the individual test that you're trying to fix. 
74 | 
75 | #### Implementing a new database 
76 | 
77 | New databases should be added as a new module in the `reladiff/databases/` folder. 
78 | 
79 | If possible, please also add the database setup to `docker-compose.yml`, so that we can run and test it for ourselves. If you do, also update the CI (`ci.yml`). 
80 | 
81 | Guide to implementing a new database driver: https://reladiff.readthedocs.io/en/latest/new-database-driver-guide.html 
82 | 
83 | ## Development Setup 
84 | 
85 | The development setup centers around using `docker-compose` to boot up various 
86 | databases, and then inserting data into them. 
87 | 
88 | On Mac, for better Docker performance, we suggest enabling the following in the Docker UI: 
89 | 
90 | * Use new Virtualization Framework 
91 | * Enable VirtioFS accelerated directory sharing 
92 | 
93 | **1. Install Reladiff** 
94 | 
95 | When developing/debugging, it's recommended to install dependencies and run it 
96 | directly with `poetry`, rather than going through the package. 
97 | 
98 | ``` 
99 | $ brew install mysql postgresql # MacOS dependencies for C bindings 
100 | $ apt-get install libpq-dev libmysqlclient-dev # Debian dependencies 
101 | $ pip install poetry # Python dependency isolation tool 
102 | $ poetry install # Install dependencies 
103 | ``` 
104 | **2. 
Start Databases** 105 | 106 | [Install **docker-compose**][docker-compose] if you haven't already. 107 | 108 | ```shell-session 109 | $ docker-compose up -d mysql postgres # run mysql and postgres dbs in background 110 | ``` 111 | 112 | [docker-compose]: https://docs.docker.com/compose/install/ 113 | 114 | **3. Run Unit Tests** 115 | 116 | There are more than 1000 tests for all the different type and database 117 | combinations, so we recommend using `unittest-parallel` that's installed as a 118 | development dependency. 119 | 120 | ```shell-session 121 | $ poetry run unittest-parallel -j 16 # run all tests 122 | $ poetry run python -m unittest -k # run individual test 123 | ``` 124 | 125 | **4. Seed the Database(s) (optional)** 126 | 127 | First, download the CSVs of seeding data: 128 | 129 | ```shell-session 130 | $ curl https://datafold-public.s3.us-west-2.amazonaws.com/1m.csv -o dev/ratings.csv 131 | # For a larger data-set (but takes 25x longer to import): 132 | # - curl https://datafold-public.s3.us-west-2.amazonaws.com/25m.csv -o dev/ratings.csv 133 | ``` 134 | 135 | Now you can insert it into the testing database(s): 136 | 137 | ```shell-session 138 | # It's optional to seed more than one to run reladiff(1) against. 139 | $ poetry run preql -f dev/prepare_db.pql mysql://mysql:Password1@127.0.0.1:3306/mysql 140 | $ poetry run preql -f dev/prepare_db.pql postgresql://postgres:Password1@127.0.0.1:5432/postgres 141 | # Cloud databases 142 | $ poetry run preql -f dev/prepare_db.pql snowflake:// 143 | $ poetry run preql -f dev/prepare_db.pql mssql:// 144 | $ poetry run preql -f dev/prepare_db.pql bigquery:/// 145 | ``` 146 | 147 | **5. Run **Reladiff** against seeded database (optional)** 148 | 149 | ```bash 150 | poetry run python3 -m reladiff postgresql://postgres:Password1@localhost/postgres rating postgresql://postgres:Password1@localhost/postgres rating_del1 --verbose 151 | ``` 152 | 153 | **6. Run benchmarks (optional)** 154 | 155 | ```shell-session 156 | $ dev/benchmark.sh # runs benchmarks and puts results in benchmark_.csv 157 | $ poetry run python3 dev/graph.py # create graphs from benchmark_*.csv files 158 | ``` 159 | 160 | You can adjust how many rows we benchmark with by passing `N_SAMPLES` to `dev/benchmark.sh`: 161 | 162 | ```shell-session 163 | $ N_SAMPLES=100000000 dev/benchmark.sh # 100m which is our canonical target 164 | ``` 165 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | RUN apt-get update && apt-get install -y \ 3 | python3-dev libpq-dev wget unzip \ 4 | python3-setuptools gcc bc 5 | RUN pip install --no-cache-dir poetry==1.1.13 6 | COPY . /app 7 | WORKDIR /app 8 | # For now while we are in heavy development we install the latest with Poetry 9 | # and execute directly with Poetry. Later, we'll move to the released Pip package. 
10 | RUN poetry install 11 | ENTRYPOINT ["poetry", "run", "python3", "-m", "reladiff"] 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 Erez Shinnan 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | -- 21 | 22 | Copyright 2022 DataFold Inc. 23 | 24 | Permission is hereby granted, free of charge, to any person obtaining a copy of 25 | this software and associated documentation files (the "Software"), to deal in 26 | the Software without restriction, including without limitation the rights to 27 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 28 | the Software, and to permit persons to whom the Software is furnished to do so, 29 | subject to the following conditions: 30 | 31 | The above copyright notice and this permission notice shall be included in all 32 | copies or substantial portions of the Software. 33 | 34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 36 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 37 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 38 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 39 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](reladiff_logo.svg) 2 | 3 |   4 |
5 |
6 | **Reladiff** is a high-performance tool and library designed for diffing large datasets across databases. By executing the diff calculation within the database itself, Reladiff minimizes data transfer and achieves optimal performance. 
7 | 
8 | This tool is specifically tailored for data professionals, DevOps engineers, and system administrators. 
9 | 
10 | Reladiff is free, open-source, user-friendly, extensively tested, and delivers fast results, even at massive scale. 
11 | 
12 | ### Key Features: 
13 | 
14 | 1. **Cross-Database Diff**: Reladiff employs a divide-and-conquer algorithm, based on matching hashes, to efficiently identify modified segments and download only the necessary data for comparison. This approach ensures exceptional performance when differences are minimal. 
15 | 
16 | - ⇄ Diffs across over a dozen different databases (e.g. *PostgreSQL* -> *Snowflake*)! 
17 | 
18 | - 🧠 Gracefully handles reduced precision (e.g., timestamp(9) -> timestamp(3)) by rounding according to the database specification. 
19 | 
20 | - 🔥 Benchmarked to diff over 25M rows in under 10 seconds and over 1B rows in approximately 5 minutes, given no differences. 
21 | 
22 | - ♾️ Capable of handling tables with tens of billions of rows. 
23 | 
24 | 
25 | 2. **Intra-Database Diff**: When both tables reside in the same database, Reladiff compares them using a join operation, with additional optimizations for enhanced speed. 
26 | 
27 | - Supports materializing the diff into a local table. 
28 | - Can collect various extra statistics about the tables. 
29 | 
30 | 3. **Threaded**: Utilizes multiple threads to significantly boost performance during diffing operations. 
31 | 
32 | 4. **Configurable**: Offers numerous options for power-users to customize and optimize their usage. 
33 | 
34 | 5. **Automation-Friendly**: Outputs both JSON and git-like diffs (with + and -), facilitating easy integration into CI/CD pipelines. 
35 | 
36 | 6. **Over a dozen databases supported**. MySQL, Postgres, Snowflake, Bigquery, Oracle, Clickhouse, and more. [See full list](https://reladiff.readthedocs.io/en/latest/supported-databases.html) 
37 | 
38 | 
39 | Reladiff is a fork of an archived project called [data-diff](https://github.com/datafold/data-diff). 
40 | 
41 | ## Get Started 
42 | 
43 | [**🗎 Read the Documentation**](https://reladiff.readthedocs.io/en/latest/) - our detailed documentation has everything you need to start diffing. 
44 | 
45 | ## Quickstart 
46 | 
47 | For the impatient ;) 
48 | 
49 | ### Install 
50 | 
51 | Reladiff is available on [PyPI](https://pypi.org/project/reladiff/). You may install it by running: 
52 | 
53 | ``` 
54 | pip install reladiff 
55 | ``` 
56 | 
57 | Requires Python 3.8+ with pip. 
58 | 
59 | We advise installing it within a virtual-env. 
60 | 
61 | ### How to Use 
62 | 
63 | Once you've installed Reladiff, you can run it from the command-line: 
64 | 
65 | ```bash 
66 | # Cross-DB diff, using hashes 
67 | reladiff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS] 
68 | ``` 
69 | 
70 | When both tables belong to the same database, a shorter syntax is available: 
71 | 
72 | ```bash 
73 | # Same-DB diff, using outer join 
74 | reladiff DB1_URI TABLE1_NAME TABLE2_NAME [OPTIONS] 
75 | ``` 
76 | 
77 | Or, you can import and run it from Python: 
78 | 
79 | ```python 
80 | from typing import Literal 
81 | from reladiff import connect_to_table, diff_tables 
82 | 
83 | table1 = connect_to_table("postgresql:///", "table_name", "id") 
84 | table2 = connect_to_table("mysql:///", "table_name", "id") 
85 | 
86 | sign: Literal["+", "-"] 
87 | row: tuple[str, ...] 
87 | for sign, row in diff_tables(table1, table2): 88 | print(sign, row) 89 | ``` 90 | 91 | Read our detailed instructions: 92 | 93 | * [How to use from the shell / command-line](https://reladiff.readthedocs.io/en/latest/how-to-use.html#how-to-use-from-the-shell-or-command-line) 94 | * [How to use with TOML configuration file](https://reladiff.readthedocs.io/en/latest/how-to-use.html#how-to-use-with-a-configuration-file) 95 | * [How to use from Python](https://reladiff.readthedocs.io/en/latest/how-to-use.html#how-to-use-from-python) 96 | 97 | 98 | #### "Real-world" example: Diff "events" table between Postgres and Snowflake 99 | 100 | ``` 101 | reladiff \ 102 | postgresql:/// \ 103 | events \ 104 | "snowflake://:@//?warehouse=&role=" \ 105 | events \ 106 | -k event_id \ # Identifier of event 107 | -c event_data \ # Extra column to compare 108 | -w "event_time < '2024-10-10'" # Filter the rows on both dbs 109 | ``` 110 | 111 | #### "Real-world" example: Diff "events" and "old_events" tables in the same Postgres DB 112 | 113 | Materializes the results into a new table, containing the current timestamp in its name. 114 | 115 | ``` 116 | reladiff \ 117 | postgresql:/// events old_events \ 118 | -k org_id \ 119 | -c created_at -c is_internal \ 120 | -w "org_id != 1 and org_id < 2000" \ 121 | -m test_results_%t \ 122 | --materialize-all-rows \ 123 | --table-write-limit 10000 124 | ``` 125 | 126 | ### Technical Explanation 127 | 128 | Check out this [technical explanation](https://reladiff.readthedocs.io/en/latest/technical-explanation.html) of how cross-database reladiff works. 129 | 130 | ### We're here to help! 131 | 132 | * Confused? Got a cool idea? Just want to share your thoughts? Let's discuss it in [GitHub Discussions](https://github.com/erezsh/reladiff/discussions). 133 | 134 | * Did you encounter a bug? [Open an issue](https://github.com/erezsh/reladiff/issues). 135 | 136 | ## How to Contribute 137 | * Please read the [contributing guidelines](https://github.com/erezsh/reladiff/blob/master/CONTRIBUTING.md) to get started. 138 | * Feel free to open a new issue or work on an existing one. 139 | 140 | Big thanks to everyone who contributed so far: 141 | 142 | 143 | 144 | 145 | 146 | 147 | ## License 148 | 149 | This project is licensed under the terms of the [MIT License](https://github.com/erezsh/reladiff/blob/master/LICENSE). 
150 | -------------------------------------------------------------------------------- /dev/Dockerfile.prestosql.340: -------------------------------------------------------------------------------- 1 | FROM openjdk:11-jdk-slim-buster 2 | 3 | ENV PRESTO_VERSION=340 4 | ENV PRESTO_SERVER_URL=https://repo1.maven.org/maven2/io/prestosql/presto-server/${PRESTO_VERSION}/presto-server-${PRESTO_VERSION}.tar.gz 5 | ENV PRESTO_CLI_URL=https://repo1.maven.org/maven2/io/prestosql/presto-cli/${PRESTO_VERSION}/presto-cli-${PRESTO_VERSION}-executable.jar 6 | ENV PRESTO_HOME=/opt/presto 7 | ENV PATH=${PRESTO_HOME}/bin:${PATH} 8 | 9 | WORKDIR $PRESTO_HOME 10 | 11 | RUN set -xe \ 12 | && apt-get update \ 13 | && apt-get install -y curl less python \ 14 | && curl -sSL $PRESTO_SERVER_URL | tar xz --strip 1 \ 15 | && curl -sSL $PRESTO_CLI_URL > ./bin/presto \ 16 | && chmod +x ./bin/presto \ 17 | && apt-get remove -y curl \ 18 | && rm -rf /var/lib/apt/lists/* 19 | 20 | VOLUME /data 21 | 22 | EXPOSE 8080 23 | 24 | ENTRYPOINT ["launcher"] 25 | CMD ["run"] 26 | -------------------------------------------------------------------------------- /dev/_bq_import_csv.py: -------------------------------------------------------------------------------- 1 | from google.cloud import bigquery 2 | 3 | client = bigquery.Client() 4 | 5 | table_id = "reladiff-dev-2.reladiff.tmp_rating" 6 | dataset_name = "reladiff" 7 | 8 | client.create_dataset(dataset_name, exists_ok=True) 9 | 10 | job_config = bigquery.LoadJobConfig( 11 | source_format=bigquery.SourceFormat.CSV, 12 | skip_leading_rows=1, 13 | autodetect=True, 14 | ) 15 | 16 | with open("ratings.csv", "rb") as source_file: 17 | job = client.load_table_from_file(source_file, table_id, job_config=job_config) 18 | 19 | job.result() # Waits for the job to complete. 20 | 21 | table = client.get_table(table_id) # Make an API request. 
22 | print("Loaded {} rows and {} columns to {}".format(table.num_rows, len(table.schema), table_id)) 23 | -------------------------------------------------------------------------------- /dev/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | run_test() { 4 | N_SAMPLES=${N_SAMPLES:-1000000} N_THREADS=${N_THREADS:-16} LOG_LEVEL=${LOG_LEVEL:-info} BENCHMARK=1 \ 5 | poetry run python3 -m unittest tests/test_database_types.py -v -k $1 6 | } 7 | 8 | run_test "postgresql_int_mysql_int" 9 | run_test "mysql_int_mysql_int" 10 | run_test "postgresql_int_postgresql_int" 11 | run_test "postgresql_ts6_n_tz_mysql_ts0" 12 | run_test "postgresql_ts6_n_tz_snowflake_ts9" 13 | run_test "postgresql_int_presto_int" 14 | run_test "postgresql_int_redshift_int" 15 | run_test "postgresql_int_snowflake_int" 16 | run_test "postgresql_int_bigquery_int" 17 | run_test "snowflake_int_snowflake_int" 18 | 19 | poetry run python dev/graph.py 20 | -------------------------------------------------------------------------------- /dev/dev.env: -------------------------------------------------------------------------------- 1 | POSTGRES_USER=postgres 2 | POSTGRES_PASSWORD=Password1 3 | POSTGRES_DB=postgres 4 | 5 | MYSQL_DATABASE=mysql 6 | MYSQL_USER=mysql 7 | MYSQL_PASSWORD=Password1 8 | MYSQL_ROOT_PASSWORD=RootPassword1 9 | 10 | CLICKHOUSE_USER=clickhouse 11 | CLICKHOUSE_PASSWORD=Password1 12 | CLICKHOUSE_DB=clickhouse 13 | CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1 14 | 15 | # Vertica credentials 16 | APP_DB_USER=vertica 17 | APP_DB_PASSWORD=Password1 18 | VERTICA_DB_NAME=vertica 19 | 20 | # To prevent generating sample demo VMart data (more about it here https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/GettingStartedGuide/IntroducingVMart/IntroducingVMart.htm), 21 | # leave VMART_DIR and VMART_ETL_SCRIPT empty. 
22 | VMART_DIR= 23 | VMART_ETL_SCRIPT= 24 | -------------------------------------------------------------------------------- /dev/graph.py: -------------------------------------------------------------------------------- 1 | # Use this to graph the benchmarking results (see benchmark.sh) 2 | # 3 | # To run this: 4 | # - pip install pandas 5 | # - pip install plotly 6 | # 7 | 8 | import pandas as pd 9 | import plotly.graph_objects as go 10 | from reladiff.utils import number_to_human 11 | import glob 12 | 13 | for benchmark_file in glob.glob("benchmark_*.jsonl"): 14 | rows = pd.read_json(benchmark_file, lines=True) 15 | rows["cloud"] = rows["test"].str.match(r".*(snowflake|redshift|presto|bigquery)") 16 | sha = benchmark_file.split("_")[1].split(".")[0] 17 | print(f"Generating graphs from {benchmark_file}..") 18 | 19 | for n_rows, group in rows.groupby(["rows"]): 20 | image_path = f"benchmark_{sha}_{number_to_human(n_rows)}.png" 21 | print(f"\t rows: {number_to_human(n_rows)}, image: {image_path}") 22 | 23 | r = group.drop_duplicates(subset=["name_human"]) 24 | r = r.sort_values(by=["cloud", "source_type", "target_type", "name_human"]) 25 | 26 | fig = go.Figure( 27 | data=[ 28 | go.Bar( 29 | name="count(*)", 30 | x=r["name_human"], 31 | y=r["count_max_sec"], 32 | text=r["count_max_sec"], 33 | textfont=dict(color="blue"), 34 | ), 35 | go.Bar( 36 | name="reladiff (checksum)", 37 | x=r["name_human"], 38 | y=r["checksum_sec"], 39 | text=r["checksum_sec"], 40 | textfont=dict(color="red"), 41 | ), 42 | go.Bar( 43 | name="Download and compare †", 44 | x=r["name_human"], 45 | y=r["download_sec"], 46 | text=r["download_sec"], 47 | textfont=dict(color="green"), 48 | ), 49 | ] 50 | ) 51 | # Change the bar mode 52 | fig.update_layout(title=f"reladiff {number_to_human(n_rows)} rows, sha: {sha}") 53 | fig.update_traces(texttemplate="%{text:.1f}", textposition="outside") 54 | fig.update_layout(uniformtext_minsize=2, uniformtext_mode="hide") 55 | fig.update_yaxes(title="Time") 56 | fig.write_image(image_path, scale=2) 57 | -------------------------------------------------------------------------------- /dev/prepare_db.pql: -------------------------------------------------------------------------------- 1 | // This is a Preql file, used for setting up a database for development and testing 2 | // 3 | // In loads a "rating" dataset and generates a set of tables from it, with various modifications. 
4 | 5 | // Declare table & functions 6 | func run_sql(code) { 7 | print code 8 | force_eval( SQL( nulltype, code )) 9 | } 10 | 11 | func drop_table(t) { 12 | run_sql("DROP TABLE IF EXISTS " + get_qualified_name(t)) 13 | } 14 | 15 | func create_indices(tbl) { 16 | tbl.add_index("id", true) 17 | tbl.add_index("timestamp") 18 | tbl.add_index(["id", "timestamp"]) 19 | } 20 | 21 | DATASET = "reladiff" // For BigQuery 22 | if (db_type == "bigquery") { 23 | set_active_dataset(DATASET) 24 | } 25 | 26 | // Cleanup 27 | func cleanup() { 28 | drop_table("rating") 29 | drop_table("tmp_rating") 30 | drop_table("rating_del1") 31 | drop_table("rating_update1") 32 | drop_table("rating_update001p") 33 | drop_table("rating_update1p") 34 | drop_table("rating_del1p") 35 | drop_table("rating_update50p") 36 | commit() 37 | } 38 | 39 | cleanup() 40 | 41 | // Import CSV 42 | if (db_type == "snowflake" or db_type == "redshift") { 43 | if (db_type == "snowflake") { 44 | print "Uploading ratings CSV" 45 | 46 | run_sql("RM @~/ratings.csv.gz") 47 | run_sql("PUT file://dev/ratings.csv @~") 48 | 49 | print "Loading ratings CSV" 50 | 51 | bare table tmp_rating { 52 | userid: int 53 | movieid: int 54 | rating: float 55 | timestamp: int 56 | } 57 | 58 | run_sql("COPY INTO tmp_rating FROM '@~/ratings.csv.gz' file_format=(skip_header=1)") 59 | 60 | } else if (db_type == "redshift") { 61 | // NOTE: Requires that the csv already exists on s3 in the given path 62 | print "Loading ratings CSV (already uploaded)" 63 | 64 | table tmp_rating { 65 | userid: int 66 | movieid: int 67 | rating: float 68 | timestamp: int 69 | } 70 | 71 | run_sql(""" 72 | COPY "public"."tmp_rating" (userid, movieid, rating, timestamp) 73 | FROM 's3://dev-cf-redshift-datafold-reladiff/ml/ratings.csv' 74 | IAM_ROLE 'arn:aws:iam::760878568205:role/dev-cf-redshift-reladiff' 75 | DELIMITER ',' 76 | IGNOREHEADER 1; 77 | """) 78 | 79 | } 80 | 81 | table rating { 82 | id: int // explicit id, instead of identity type 83 | userid: int 84 | movieid: int 85 | rating: float 86 | timestamp: int 87 | } 88 | 89 | run_sql(""" 90 | INSERT INTO rating(id, userid, movieid, rating, timestamp) 91 | SELECT row_number() over (order by userid, movieid, timestamp) AS id, userid, movieid, rating, timestamp 92 | FROM tmp_rating 93 | """) 94 | 95 | } else if (db_type == "mssql") { 96 | run_sql("drop table if exists tmp_rating") 97 | run_sql("create table tmp_rating(userid int, movieid int, rating float, timestamp int)") 98 | table tmp_rating {...} 99 | print "Loading ratings CSV" 100 | run_sql("BULK INSERT tmp_rating from 'dev/ratings.csv' with (fieldterminator = ',', rowterminator = '0x0a', FIRSTROW = 2);") 101 | print "Populating actual table" 102 | rating += tmp_rating 103 | commit() 104 | } else if (db_type == "bigquery") { 105 | print "Importing the CSV through the Python script (BigQuery)" 106 | PY("0", "import _bq_import_csv") 107 | 108 | table rating { 109 | id: int // explicit id, to avoid identity type 110 | userid: int 111 | movieid: int 112 | rating: float 113 | timestamp: int 114 | } 115 | 116 | run_sql(""" 117 | INSERT INTO reladiff.rating(id, userid, movieid, rating, timestamp) 118 | SELECT row_number() over (order by userid, movieid, timestamp) AS id, userid, movieid, rating, timestamp FROM reladiff.tmp_rating 119 | """) 120 | 121 | } else { 122 | print "Importing ratings CSV" 123 | 124 | table rating { 125 | userid: int 126 | movieid: int 127 | rating: float 128 | timestamp: int 129 | } 130 | import_csv(rating, 'dev/ratings.csv', true) 131 | create_indices(rating) 
132 | } 133 | 134 | drop_table("tmp_rating") 135 | commit() 136 | 137 | middle = count(rating) /~ 2 138 | 139 | // Code notes: 140 | // - We use 'const table' to avoid updating the ids 141 | 142 | print "Create tables" 143 | const table rating_del1 = rating 144 | const table rating_update1 = rating 145 | const table rating_update001p = rating 146 | const table rating_update1p = rating 147 | const table rating_del1p = rating 148 | const table rating_update50p = rating 149 | 150 | print "Create indexes" 151 | 152 | create_indices(rating_del1) 153 | create_indices(rating_update1) 154 | create_indices(rating_update001p) 155 | create_indices(rating_update1p) 156 | create_indices(rating_del1p) 157 | create_indices(rating_update50p) 158 | commit() 159 | 160 | print "Alter tables" 161 | rating_del1[middle..(middle+1)] delete [true] 162 | assert count(rating) == count(rating_del1) + 1 163 | rating_update1[middle..(middle+1)] update {timestamp: timestamp + 1} 164 | 165 | rating_update001p[random() < 0.0001] update {timestamp: timestamp + 1} 166 | rating_update1p[random() < 0.01] update {timestamp: timestamp + 1} 167 | rating_update50p[random() < 0.5] update {timestamp: timestamp + 1} 168 | rating_del1p[random() < 0.01] delete [true] 169 | 170 | commit() 171 | -------------------------------------------------------------------------------- /dev/prepare_db_gaps.pql: -------------------------------------------------------------------------------- 1 | // This is a Preql file, used for setting up a database for development and testing 2 | // 3 | // It generates tables with various gaps in them, based on the "rating" dataset. 4 | // Assumes prepare_db.pql has already been run. 5 | 6 | 7 | // Declare table & functions 8 | func run_sql(code) { 9 | print code 10 | force_eval( SQL( nulltype, code )) 11 | } 12 | 13 | func drop_table(t) { 14 | run_sql("DROP TABLE IF EXISTS " + t) 15 | } 16 | 17 | func create_indices(tbl) { 18 | tbl.add_index("id", true) 19 | tbl.add_index("timestamp") 20 | tbl.add_index(["id", "timestamp"]) 21 | } 22 | 23 | // Assumes prepare_db already ran 24 | table rating {...} 25 | 26 | drop_table("rating_gap1") 27 | drop_table("rating_gap2") 28 | drop_table("rating_gap3") 29 | drop_table("rating_gap1_update0001p") 30 | drop_table("rating_gap2_update0001p") 31 | drop_table("rating_gap3_update0001p") 32 | 33 | const table rating_gap1 = rating 34 | const table rating_gap2 = rating 35 | const table rating_gap3 = rating 36 | 37 | create_indices(rating_gap1) 38 | create_indices(rating_gap2) 39 | create_indices(rating_gap3) 40 | commit() 41 | 42 | table rating_gap1 { 43 | userid: int 44 | movieid: int 45 | rating: float 46 | timestamp: int 47 | } 48 | 49 | table rating_gap2 { 50 | userid: int 51 | movieid: int 52 | rating: float 53 | timestamp: int 54 | } 55 | 56 | table rating_gap3 { 57 | userid: int 58 | movieid: int 59 | rating: float 60 | timestamp: int 61 | } 62 | 63 | rating_gap3[id == 1000] update {id: 2147483548} 64 | 65 | // Create many small gaps, for testing low bisection thresholds 66 | run_sql("UPDATE rating_gap1 SET id = id * 1000 + 25000000 where 100000 < id and id <= 500000 ") 67 | 68 | // Create increasing gaps, to test many gaps of various sizes at once 69 | run_sql("UPDATE rating_gap2 SET id = cast(id*0.1*id as int) + 26000000 WHERE 10 < id and id < 100000") 70 | 71 | // Create one very big gap, to test empty scans and excessive bisection. 
72 | run_sql("INSERT INTO rating_gap3(id, userid, movieid, rating, timestamp) VALUES (2047483548, 1, 1, 5.0, 27)") 73 | commit() 74 | 75 | print "Create more tables" 76 | const table rating_gap1_update0001p = rating_gap1 77 | const table rating_gap2_update0001p = rating_gap2 78 | const table rating_gap3_update0001p = rating_gap3 79 | create_indices(rating_gap1_update0001p) 80 | create_indices(rating_gap2_update0001p) 81 | create_indices(rating_gap3_update0001p) 82 | 83 | rating_gap1_update0001p[random() < 0.000001] update {timestamp: timestamp + 1} 84 | rating_gap2_update0001p[random() < 0.000001] update {timestamp: timestamp + 1} 85 | rating_gap3_update0001p[random() < 0.000001] update {timestamp: timestamp + 1} 86 | rating_gap3[id == 100000] delete [true] 87 | commit() 88 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/jmx.properties: -------------------------------------------------------------------------------- 1 | connector.name=jmx 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/memory.properties: -------------------------------------------------------------------------------- 1 | connector.name=memory 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/postgresql.properties: -------------------------------------------------------------------------------- 1 | connector.name=postgresql 2 | connection-url=jdbc:postgresql://postgres:5432/postgres 3 | connection-user=postgres 4 | connection-password=Password1 5 | allow-drop-table=true 6 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/tpcds.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpcds 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/catalog/tpch.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpch 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | query.max-memory=5GB 5 | query.max-memory-per-node=1GB 6 | query.max-total-memory-per-node=2GB 7 | discovery-server.enabled=true 8 | discovery.uri=http://127.0.0.1:8080 9 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx16G 3 | -XX:+UseG1GC 4 | -XX:G1HeapRegionSize=32M 5 | -XX:+UseGCOverheadLimit 6 | -XX:+ExplicitGCInvokesConcurrent 7 | -XX:+HeapDumpOnOutOfMemoryError 8 | -XX:+ExitOnOutOfMemoryError 9 | -XX:OnOutOfMemoryError=kill -9 %p 10 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/log.properties: -------------------------------------------------------------------------------- 1 | com.facebook.presto=INFO 2 | -------------------------------------------------------------------------------- /dev/presto-conf/standalone/node.properties: -------------------------------------------------------------------------------- 1 | 
node.environment=production 2 | node.data-dir=/data 3 | node.id=standalone 4 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/jms.properties: -------------------------------------------------------------------------------- 1 | connector.name=jmx 2 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/memory.properties: -------------------------------------------------------------------------------- 1 | connector.name=memory 2 | memory.max-data-per-node=128MB 3 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/postgresql.properties: -------------------------------------------------------------------------------- 1 | connector.name=postgresql 2 | connection-url=jdbc:postgresql://postgres:5432/postgres 3 | connection-user=postgres 4 | connection-password=Password1 5 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/tpcds.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpcds 2 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/catalog/tpch.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpch 2 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | discovery.uri=http://localhost:8080 5 | discovery-server.enabled=true 6 | -------------------------------------------------------------------------------- /dev/trino-conf/etc/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx1G 3 | -XX:-UseBiasedLocking 4 | -XX:+UseG1GC 5 | -XX:G1HeapRegionSize=32M 6 | -XX:+ExplicitGCInvokesConcurrent 7 | -XX:+HeapDumpOnOutOfMemoryError 8 | -XX:+UseGCOverheadLimit 9 | -XX:+ExitOnOutOfMemoryError 10 | -XX:ReservedCodeCacheSize=256M 11 | -Djdk.attach.allowAttachSelf=true 12 | -Djdk.nio.maxCachedBufferSize=2000000 -------------------------------------------------------------------------------- /dev/trino-conf/etc/node.properties: -------------------------------------------------------------------------------- 1 | node.environment=docker 2 | node.data-dir=/data/trino 3 | plugin.dir=/usr/lib/trino/plugin 4 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | postgres: 5 | container_name: dd-postgresql 6 | image: postgres:14.1-alpine 7 | # work_mem: less tmp files 8 | # maintenance_work_mem: improve table-level op perf 9 | # max_wal_size: allow more time before merging to heap 10 | command: > 11 | -c work_mem=1GB 12 | -c maintenance_work_mem=1GB 13 | -c max_wal_size=8GB 14 | restart: always 15 | volumes: 16 | - postgresql-data:/var/lib/postgresql/data:delegated 17 | ports: 18 | - '5432:5432' 19 | expose: 20 | - '5432' 21 | env_file: 22 | - dev/dev.env 23 | tty: true 24 | networks: 25 | - local 26 | 27 | mysql: 28 | container_name: dd-mysql 29 | image: mysql:oracle 30 | # fsync less aggressively for insertion perf for test setup 31 | 
command: > 32 | --binlog-cache-size=16M 33 | --key_buffer_size=0 34 | --max_connections=1000 35 | --innodb_flush_log_at_trx_commit=2 36 | --innodb_flush_log_at_timeout=10 37 | --innodb_log_compressed_pages=OFF 38 | --sync_binlog=0 39 | restart: always 40 | volumes: 41 | - mysql-data:/var/lib/mysql:delegated 42 | user: mysql 43 | ports: 44 | - '3306:3306' 45 | expose: 46 | - '3306' 47 | env_file: 48 | - dev/dev.env 49 | tty: true 50 | networks: 51 | - local 52 | 53 | clickhouse: 54 | container_name: dd-clickhouse 55 | image: clickhouse/clickhouse-server:21.12.3.32 56 | restart: always 57 | volumes: 58 | - clickhouse-data:/var/lib/clickhouse:delegated 59 | ulimits: 60 | nproc: 65535 61 | nofile: 62 | soft: 262144 63 | hard: 262144 64 | ports: 65 | - '8123:8123' 66 | - '9000:9000' 67 | expose: 68 | - '8123' 69 | - '9000' 70 | env_file: 71 | - dev/dev.env 72 | tty: true 73 | networks: 74 | - local 75 | 76 | # prestodb.dbapi.connect(host="127.0.0.1", user="presto").cursor().execute('SELECT * FROM system.runtime.nodes') 77 | presto: 78 | container_name: dd-presto 79 | build: 80 | context: ./dev 81 | dockerfile: ./Dockerfile.prestosql.340 82 | volumes: 83 | - ./dev/presto-conf/standalone:/opt/presto/etc:ro 84 | ports: 85 | - '8080:8080' 86 | tty: true 87 | networks: 88 | - local 89 | 90 | trino: 91 | container_name: dd-trino 92 | image: 'trinodb/trino:389' 93 | hostname: trino 94 | ports: 95 | - '8081:8080' 96 | volumes: 97 | - ./dev/trino-conf/etc:/etc/trino:ro 98 | networks: 99 | - local 100 | 101 | vertica: 102 | container_name: dd-vertica 103 | image: vertica/vertica-ce:12.0.0-0 104 | restart: always 105 | volumes: 106 | - vertica-data:/data:delegated 107 | ports: 108 | - '5433:5433' 109 | - '5444:5444' 110 | expose: 111 | - '5433' 112 | - '5444' 113 | env_file: 114 | - dev/dev.env 115 | tty: true 116 | networks: 117 | - local 118 | 119 | 120 | 121 | volumes: 122 | postgresql-data: 123 | mysql-data: 124 | clickhouse-data: 125 | vertica-data: 126 | 127 | networks: 128 | local: 129 | driver: bridge 130 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = reladiff 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Documentation build configuration file, created by 5 | # sphinx-quickstart on Sun Aug 16 13:09:41 2020. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 
12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath("..")) 24 | sys.path.append(os.path.abspath("./_ext")) 25 | autodoc_member_order = "bysource" 26 | 27 | 28 | # -- General configuration ------------------------------------------------ 29 | 30 | # If your documentation needs a minimal Sphinx version, state it here. 31 | # 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = [ 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.napoleon", 40 | "sphinx.ext.coverage", 41 | "recommonmark", 42 | "sphinx_markdown_tables", 43 | "sphinx_copybutton", 44 | "enum_tools.autoenum", 45 | # 'sphinx_gallery.gen_gallery' 46 | ] 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ["_templates"] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = {".rst": "restructuredtext", ".md": "markdown"} 56 | 57 | 58 | # The master toctree document. 59 | master_doc = "index" 60 | 61 | # General information about the project. 62 | project = "reladiff" 63 | copyright = "Erez Shinan" 64 | author = "Erez Shinan" 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | version = "" 72 | # The full version, including alpha/beta/rc tags. 73 | release = "" 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = "en" 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | # This patterns also effect to html_static_path and html_extra_path 85 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 86 | 87 | # The name of the Pygments (syntax highlighting) style to use. 88 | pygments_style = "sphinx" 89 | 90 | # If true, `todo` and `todoList` produce output, else they produce nothing. 91 | todo_include_todos = False 92 | 93 | 94 | # -- Options for HTML output ---------------------------------------------- 95 | 96 | # The theme to use for HTML and HTML Help pages. See the documentation for 97 | # a list of builtin themes. 98 | # 99 | html_theme = "sphinx_rtd_theme" 100 | 101 | # Theme options are theme-specific and customize the look and feel of a theme 102 | # further. For a list of options available for each theme, see the 103 | # documentation. 104 | # 105 | # html_theme_options = {} 106 | 107 | # Add any paths that contain custom static files (such as style sheets) here, 108 | # relative to this directory. 
They are copied after the builtin static files, 109 | # so a file named "default.css" will overwrite the builtin "default.css". 110 | html_static_path = ["_static"] 111 | 112 | html_css_files = [ 113 | "custom.css", 114 | ] 115 | 116 | # Custom sidebar templates, must be a dictionary that maps document names 117 | # to template names. 118 | # 119 | # This is required for the alabaster theme 120 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 121 | html_sidebars = { 122 | "**": [ 123 | "relations.html", # needs 'show_related': True theme option to display 124 | "searchbox.html", 125 | ] 126 | } 127 | 128 | 129 | # -- Options for HTMLHelp output ------------------------------------------ 130 | 131 | # Output file base name for HTML help builder. 132 | htmlhelp_basename = "reladiffdoc" 133 | 134 | 135 | # -- Options for LaTeX output --------------------------------------------- 136 | 137 | latex_elements = { 138 | # The paper size ('letterpaper' or 'a4paper'). 139 | # 140 | # 'papersize': 'letterpaper', 141 | # The font size ('10pt', '11pt' or '12pt'). 142 | # 143 | # 'pointsize': '10pt', 144 | # Additional stuff for the LaTeX preamble. 145 | # 146 | # 'preamble': '', 147 | # Latex figure (float) alignment 148 | # 149 | # 'figure_align': 'htbp', 150 | } 151 | 152 | # Grouping the document tree into LaTeX files. List of tuples 153 | # (source start file, target name, title, 154 | # author, documentclass [howto, manual, or own class]). 155 | latex_documents = [ 156 | (master_doc, "Reladiff.tex", "Reladiff Documentation", "Erez Shinan", "manual"), 157 | ] 158 | 159 | 160 | # -- Options for manual page output --------------------------------------- 161 | 162 | # One entry per manual page. List of tuples 163 | # (source start file, name, description, authors, manual section). 164 | man_pages = [(master_doc, "Reladiff", "Reladiff Documentation", [author], 1)] 165 | 166 | 167 | # -- Options for Texinfo output ------------------------------------------- 168 | 169 | # Grouping the document tree into Texinfo files. List of tuples 170 | # (source start file, target name, title, author, 171 | # dir menu entry, description, category) 172 | texinfo_documents = [ 173 | ( 174 | master_doc, 175 | "Reladiff", 176 | "Reladiff Documentation", 177 | author, 178 | "Reladiff", 179 | "One line description of project.", 180 | "Miscellaneous", 181 | ), 182 | ] 183 | 184 | # -- Sphinx gallery config ------------------------------------------- 185 | 186 | # sphinx_gallery_conf = { 187 | # 'examples_dirs': ['../examples'], 188 | # 'gallery_dirs': ['examples'], 189 | # } 190 | -------------------------------------------------------------------------------- /docs/how-to-use.md: -------------------------------------------------------------------------------- 1 | # User guide 2 | 3 | Once you've [installed](https://reladiff.readthedocs.io/en/latest/install.html) Reladiff, you can run it from the command-line, or from Python. 
4 | 5 | ## How to use from the shell / command-line 6 | 7 | The basic syntax for reladiff is: 8 | 9 | ```bash 10 | # Cross-DB diff, using hashes 11 | reladiff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS] 12 | ``` 13 | 14 | When both tables belong to the same database, a shorter syntax is available: 15 | 16 | ```bash 17 | # Same-DB diff, using outer join 18 | reladiff DB_URI TABLE1_NAME TABLE2_NAME [OPTIONS] 19 | ``` 20 | 21 | `DB_URL` is either a [database URL](supported-databases.md), or the name of a database definition that is specified in a [configuration file](https://reladiff.readthedocs.io/en/latest/how-to-use.html#how-to-use-with-a-configuration-file). Our database URLs conform to the same format as SQLAlchemy. 22 | 23 | We recommend using a configuration file, with the ``--conf`` switch, to keep the command simple and manageable. 24 | 25 | For a list of example URLs, see [list of supported databases](supported-databases.md). 26 | 27 | Note: Because URLs allow many special characters, and may collide with the syntax of your shell, 28 | it's recommended to surround them with quotes. 29 | 30 | ### Options 31 | 32 | - `--help` - Show help message and exit. 33 | - `-k` or `--key-columns` - Name of the primary key column. If none provided, default is 'id'. Can be used more than once, for a compound key. 34 | - `-t` or `--update-column` - Name of updated_at/last_updated column 35 | - `-c` or `--columns` - Names of extra columns to compare. Can be used more than once in the same command. 36 | Accepts a name or a pattern like in SQL. 37 | Example: `-c col% -c another_col -c %foob.r%` 38 | - `-l` or `--limit` - Maximum number of differences to find (limits maximum bandwidth and runtime) 39 | - `-s` or `--stats` - Print stats instead of a detailed diff 40 | - `-d` or `--debug` - Print debug info 41 | - `-v` or `--verbose` - Print extra info 42 | - `-i` or `--interactive` - Confirm queries, implies `--debug` 43 | - `--json` - Print JSONL output for machine readability 44 | - `--skip-sort-results` - Skip sorting the hashdiff output by key for better performance. 45 | Entries with the same key but different column values may not appear adjacent in the output. 46 | - `--min-age` - Considers only rows older than specified. Useful for specifying replication lag. 47 | Example: `--min-age=5min` ignores rows from the last 5 minutes. 48 | Valid units: `d, days, h, hours, min, minutes, mon, months, s, seconds, w, weeks, y, years` 49 | - `--max-age` - Considers only rows younger than specified. See `--min-age`. 50 | - `-j` or `--threads` - Number of worker threads to use per database. Default=1. 51 | - `-w`, `--where` - An additional 'where' expression to restrict the search space. 52 | - `--allow-empty-tables` - Allows diffing on empty tables. Otherwise, we raise an error. 53 | - `--case-sensitive` - Column names are treated as case-sensitive. Otherwise, reladiff corrects their case according to schema. 54 | - `--conf`, `--run` - Specify the run and configuration from a TOML file. (see below) 55 | - `--bisection-threshold` - Minimal size of segment to be split. Smaller segments will be downloaded and compared locally. 56 | - `--bisection-factor` - Segments per iteration. When set to 2, it performs binary search. 57 | - `-m`, `--materialize` - Materialize the diff results into a new table in the database. 58 | If a table exists by that name, it will be replaced. 59 | Use `%t` in the name to place a timestamp. 
60 | Example: `-m test_mat_%t` 61 | - `--assume-unique-key` - Skip validating the uniqueness of the key column during joindiff, which is costly in non-cloud dbs. 62 | Also, disables support for duplicate rows in hashdiff, offering a small performance gain. 63 | - `--sample-exclusive-rows` - Sample several rows that only appear in one of the tables, but not the other. Use with `-s`. 64 | - `--materialize-all-rows` - Materialize every row, even if they are the same, instead of just the differing rows. 65 | - `--table-write-limit` - Maximum number of rows to write when creating materialized or sample tables, per thread. Default=1000. 66 | - `-a`, `--algorithm` `[auto|joindiff|hashdiff]` - Force algorithm choice 67 | 68 | 69 | ### How to use with a configuration file 70 | 71 | Reladiff lets you load the configuration for a run from a TOML file. 72 | 73 | **Reasons to use a configuration file:** 74 | 75 | - Convenience: Set up the parameters for diffs that need to run often. 76 | 77 | - Easier and more readable: You can define the database connection settings as separate config values, instead of in a single URI. 78 | 79 | - Gives you fine-grained control over the settings switches, without requiring any Python code. 80 | 81 | Use `--conf` to specify the path to the configuration file. Reladiff will load the settings from `run.default`, if it's defined. 82 | 83 | Then you can, optionally, use `--run` to load the settings of a specific run, which override the settings of `run.default`. (All runs extend `run.default`, like inheritance.) 84 | 85 | Finally, CLI switches have the final say, and will override the settings defined by the configuration file, and the current run. 86 | 87 | Example TOML file: 88 | 89 | ```toml 90 | # Specify the connection params to the test database. 91 | [database.test_postgresql] 92 | driver = "postgresql" 93 | user = "postgres" 94 | password = "Password1" 95 | 96 | # Specify the default run params 97 | [run.default] 98 | update_column = "timestamp" 99 | verbose = true 100 | 101 | # Specify params for a run 'test_diff'. 102 | [run.test_diff] 103 | verbose = false 104 | # Source 1 ("left") 105 | 1.database = "test_postgresql" # Use options from database.test_postgresql 106 | 1.table = "rating" 107 | # Source 2 ("right") 108 | 2.database = "postgresql://postgres:Password1@/" # Use URI like in the CLI 109 | 2.table = "rating_del1" 110 | ``` 111 | 112 | In this example, running `reladiff --conf myconfig.toml --run test_diff` will compare `rating` and `rating_del1`. 113 | It will use the `timestamp` column as the update column, as specified in `run.default`. However, it won't be verbose, since that 114 | flag is overridden to `false`. 115 | 116 | Running it with `reladiff --conf myconfig.toml --run test_diff -v` will set verbose back to `true`. 117 | 118 | 119 | ## How to use from Python 120 | 121 | Import the `reladiff` module, and use the following functions: 122 | 123 | - `connect_to_table()` to connect to a specific table in the database 124 | 125 | - `diff_tables()` to diff those tables 126 | 127 | 128 | Example: 129 | 130 | ```python 131 | # Optional: Set logging to display the progress of the diff 132 | import logging 133 | logging.basicConfig(level=logging.INFO) 134 | from typing import Literal, Tuple 135 | from reladiff import connect_to_table, diff_tables 136 | 137 | table1 = connect_to_table("postgresql:///", "table_name", "id") 138 | table2 = connect_to_table("mysql:///", "table_name", "id") 139 | 140 | sign: Literal["+", "-"] 141 | row: Tuple[str, ...]
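# diff_tables() yields (sign, row) pairs: '-' marks a row (or row version) found only in table1,
# and '+' marks one found only in table2.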
142 | for sign, row in diff_tables(table1, table2): 143 | print(sign, row) 144 | ``` 145 | 146 | To learn more about the different options, [read the API reference](https://reladiff.readthedocs.io/en/latest/python-api.html) or run `help(diff_tables)`. 147 | 148 | 149 | ## Tips 150 | 151 | - If you are only interested in whether something changed, i.e. a yes/no answer, set `--limit 1`. Reladiff will return as soon as it finds the first difference. 152 | 153 | - Ensure that you have indexes on the columns you are comparing. Preferably a compound index, if relevant. You can run with `--interactive` to see an EXPLAIN for the queries. 154 | 155 | - Setting a higher thread count may help performance significantly, depending on the database. For databases that limit concurrency per query, such as PostgreSQL/MySQL, this can improve performance dramatically. 156 | 157 | - A low `--bisection-threshold` will minimize the amount of network transfer. But if network isn't an issue, a high `--bisection-threshold` will make Reladiff run a lot faster. 158 | 159 | - If you run into timeouts for very large tables, try increasing the `--bisection-factor`. 160 | 161 | - The fewer columns you verify, the faster Reladiff will be. If you're only interested in additions/deletions, verifying the primary key could be enough. If you have an automatic `updated` column, it might be enough to capture changes, i.e. comparing all the data isn't always necessary. 162 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. toctree:: 2 | :maxdepth: 2 3 | :caption: Reference 4 | :hidden: 5 | 6 | install 7 | how-to-use 8 | supported-databases 9 | python-api 10 | technical-explanation 11 | new-database-driver-guide 12 | 13 | Reladiff 14 | ------------ 15 | 16 | **Reladiff** is a high-performance tool and library designed for diffing large datasets across databases. By executing the diff calculation within the database itself, Reladiff minimizes data transfer and achieves optimal performance. 17 | 18 | This tool is specifically tailored for data professionals, DevOps engineers, and system administrators. 19 | 20 | Reladiff is free, open-source, user-friendly, extensively tested, and delivers fast results, even at massive scale. 21 | 22 | Key Features 23 | ============ 24 | 25 | 1. **Cross-Database Diff**: *Reladiff* employs a divide-and-conquer algorithm, based on matching hashes, to efficiently identify modified segments and download only the necessary data for comparison. This approach ensures exceptional performance when differences are minimal. 26 | 27 | - ⇄ Diffs across over a dozen different databases (e.g. *PostgreSQL* -> *Snowflake*)! 28 | 29 | - 🧠 Gracefully handles reduced precision (e.g., timestamp(9) -> timestamp(3)) by rounding according to the database specification. 30 | 31 | - 🔥 Benchmarked to diff over 25M rows in under 10 seconds and over 1B rows in approximately 5 minutes, given no differences. 32 | 33 | - ♾️ Capable of handling tables with tens of billions of rows. 34 | 35 | 2. **Intra-Database Diff**: When both tables reside in the same database, Reladiff compares them using a join operation, with additional optimizations for enhanced speed. 36 | 37 | - Supports materializing the diff into a local table. 38 | - Can collect various extra statistics about the tables. 39 | 40 | 3. **Threaded**: Utilizes multiple threads to significantly boost performance during diffing operations. 
41 | 42 | 4. **Configurable**: Offers numerous options for power-users to customize and optimize their usage. 43 | 44 | 5. **Automation-Friendly**: Outputs both JSON and git-like diffs (with + and -), facilitating easy integration into CI/CD pipelines. 45 | 46 | 6. **Over a dozen databases supported**: MySQL, Postgres, Snowflake, Bigquery, Oracle, Clickhouse, and more. `See full list `_. 47 | 48 | Reladiff is a fork of an archived project called `data-diff `_. Code that worked with data-diff should also work with reladiff, without any changes. However, there are a few differences: Reladiff doesn't contain any tracking code. Reladiff doesn't have DBT integration. 49 | 50 | Resources 51 | --------- 52 | 53 | 54 | - User Documentation 55 | - :doc:`install` 56 | - :doc:`how-to-use` 57 | - :doc:`supported-databases` 58 | - :doc:`python-api` 59 | - :doc:`technical-explanation` 60 | - Contributor Documentation 61 | - :doc:`new-database-driver-guide` 62 | 63 | - Other links 64 | - Github: ``_ 65 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Installation Guide 2 | 3 | ## Install library and CLI (no drivers) 4 | 5 | Reladiff is available on [PyPI](https://pypi.org/project/reladiff/). You may install it by running: 6 | 7 | ```sh 8 | pip install reladiff 9 | ``` 10 | 11 | Requirements: Python 3.8+ with pip. 12 | 13 | ## Install with database drivers 14 | 15 | You may install the necessary database drivers, at the same time as when installing Reladiff, using pip's "extra" syntax. 16 | 17 | We advise to install Reladiff within a virtual-env, because the drivers may bring many dependencies. 18 | 19 | ```sh 20 | # Install all database drivers 21 | pip install reladiff[all] 22 | 23 | # The above line is equivalent to: 24 | pip install reladiff[duckdb,mysql,postgresql,snowflake,presto,oracle,trino,clickhouse,vertica] 25 | ``` 26 | 27 | You may remove any database you don't plan to use. 28 | 29 | For example, if you only want to diff between Postgresql and DuckDB, install Reladiff thusly: 30 | 31 | ```sh 32 | pip install reladiff[duckdb,postgresql] 33 | ``` 34 | 35 | ### Notes for shell / command-line 36 | 37 | In some shells, like `bash` and `powershell`, you will have to use quotes, in order to allow the `[]` syntax. 38 | 39 | For example: 40 | 41 | ```sh 42 | pip install 'reladiff[all]' # will work on bash 43 | pip install "reladiff[all]" # will work on powershell (Windows) 44 | ``` 45 | 46 | Consult your shell environment to learn the correct way to quote or escape your command. 47 | 48 | ### Notes for BigQuery 49 | 50 | Reladiff currently doesn't auto-install the BigQuery drivers. 51 | 52 | For BigQuery, see: [https://pypi.org/project/google-cloud-bigquery](https://pypi.org/project/google-cloud-bigquery) 53 | 54 | 55 | ### Another way to install all the drivers 56 | 57 | For your convenience, you may also run these commands one after the other. You may omit drivers that you don't plan to use. 
58 | 59 | ```bash 60 | pip install reladiff[duckdb] 61 | pip install reladiff[mysql] 62 | pip install reladiff[postgresql] 63 | pip install reladiff[snowflake] 64 | pip install reladiff[presto] 65 | pip install reladiff[oracle] 66 | pip install reladiff[trino] 67 | pip install reladiff[clickhouse] 68 | pip install reladiff[vertica] 69 | ``` 70 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=reladiff 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/new-database-driver-guide.rst: -------------------------------------------------------------------------------- 1 | How to implement a new database driver for Reladiff 2 | ==================================================== 3 | 4 | **This guide is out-of-date!** New databases should be added first in `Sqeleton `_. 5 | 6 | First, read through the `CONTRIBUTING.md `_ document. 7 | 8 | Make sure Reladiff is set up for development, and that all the tests pass (try to at least set it up for MySQL and PostgreSQL). 9 | 10 | Look at the other database drivers for examples and inspiration. 11 | 12 | 13 | 1. Add dependencies to ``pyproject.toml`` 14 | ----------------------------------------- 15 | 16 | Most new drivers will require a 3rd-party library in order to connect to the database. 17 | 18 | These dependencies should be specified in the ``pyproject.toml`` file, in ``[tool.poetry.extras]``. Example: 19 | 20 | :: 21 | 22 | [tool.poetry.extras] 23 | postgresql = ["psycopg2"] 24 | 25 | Then, users can install the dependencies needed for your database driver, with ``pip install 'reladiff[postgresql]'``. 26 | 27 | This way, Reladiff can support a wide variety of drivers, without requiring our users to install libraries that they won't use. 28 | 29 | 2. Implement a database module 30 | ------------------------------ 31 | 32 | New database modules belong in the ``reladiff/databases`` directory. 33 | 34 | The module consists of: 35 | 1. A Dialect class, responsible for normalizing/casting fields (e.g. numbers and timestamps). 36 | 2. A Database class that handles connecting to the DB, querying (if the default doesn't work), closing connections, etc. 37 | 38 | Choosing a base class, based on the threading model 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 40 | 41 | You can choose to inherit from either ``base.Database`` or ``base.ThreadedDatabase``.
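For illustration, a driver module built on ``ThreadedDatabase`` (discussed below) might be laid out roughly as follows. The names and attributes here are illustrative only; see the existing modules under ``reladiff/databases/`` and the sections below for the exact structure::

    from .base import ThreadedDatabase, import_helper

    @import_helper("mynewdb")
    def import_mynewdb():
        import mynewdb_driver  # hypothetical 3rd-party client library
        return mynewdb_driver

    class MyNewDB(ThreadedDatabase):
        TYPE_CLASSES = {...}        # maps schema type names to ColType subclasses (see TYPE_CLASSES below)
        ROUNDS_ON_PREC_LOSS = True  # True if the db rounds on precision loss, False if it truncates

        def create_connection(self):
            mynewdb = import_mynewdb()
            return mynewdb.connect(...)  # connect using the options given to __init__()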
42 | 43 | Usually, databases with cursor-based connections, like MySQL or Postgresql, only allow connections to be used by the thread that created them. In order to support multithreading, we implement them by inheriting from ``ThreadedDatabase``, which holds a pool of worker threads, and creates a new connection per thread. 44 | 45 | Usually, cloud databases, such as Snowflake and BigQuery, open a new connection per request, and support simultaneous queries from any number of threads. In other words, they already support multithreading, so we can implement them by inheriting directly from ``Database``. 46 | 47 | Import on demand 48 | ~~~~~~~~~~~~~~~~~ 49 | 50 | Database drivers should not import any 3rd party library at the module level. 51 | 52 | Instead, they should be imported and initialized within a function. Example: 53 | 54 | :: 55 | 56 | from .base import import_helper 57 | 58 | @import_helper("postgresql") 59 | def import_postgresql(): 60 | import psycopg2 61 | import psycopg2.extras 62 | 63 | psycopg2.extensions.set_wait_callback(psycopg2.extras.wait_select) 64 | return psycopg2 65 | 66 | We use the ``import_helper()`` decorator to provide a uniform and informative error. The string argument should be the name of the package, as written in ``pyproject.toml``. 67 | 68 | :meth:`_query()` 69 | ~~~~~~~~~~~~~~~~~~ 70 | 71 | All queries to the database pass through ``_query()``. It takes SQL code, and returns a list of rows. Here is its signature: 72 | 73 | :: 74 | 75 | def _query(self, sql_code: str) -> list: ... 76 | 77 | For standard cursor connections, it's sufficient to implement it with a call to ``base._query_conn()``, like: 78 | 79 | :: 80 | return _query_conn(self._conn, sql_code) 81 | 82 | 83 | :meth:`select_table_schema()` / :meth:`query_table_schema()` 84 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 85 | 86 | If your database does not have a ``information_schema.columns`` table, or if its structure is unusual, you may have to implement your own ``select_table_schema()`` function, which returns the query needed to return column information in the form of a list of tuples, where each tuple is `column_name, data_type, datetime_precision, numeric_precision, numeric_scale`. 87 | 88 | If such a query isn't possible, you may have to implement ``query_table_schema()`` yourself, which extracts this information from the database, and returns it in the proper form. 89 | 90 | If the information returned from ``query_table_schema()`` requires slow or error-prone post-processing, you may delay that post-processing by overriding ``_process_table_schema()`` and implementing it there. The method ``_process_table_schema()`` only gets called for the columns that will be diffed. 91 | 92 | Documentation: 93 | 94 | - :meth:`reladiff.databases.database_types.AbstractDatabase.select_table_schema` 95 | 96 | - :meth:`reladiff.databases.database_types.AbstractDatabase.query_table_schema` 97 | 98 | :data:`TYPE_CLASSES` 99 | ~~~~~~~~~~~~~~~~~~~~~~ 100 | 101 | Each database class must have a ``TYPE_CLASSES`` dictionary, which maps between the string data-type, as returned by querying the table schema, into the appropriate Reladiff type class, i.e. a subclass of ``database_types.ColType``. 102 | 103 | :data:`ROUNDS_ON_PREC_LOSS` 104 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 105 | 106 | When providing a datetime or a timestamp to a database, the database may lower its precision to correspond with the target column type. 
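For example, a value written with microsecond precision into a ``timestamp(3)`` column may come back as either of the following, depending on the database::

    2022-06-01 12:34:56.123987  ->  2022-06-01 12:34:56.123   (truncated)
    2022-06-01 12:34:56.123987  ->  2022-06-01 12:34:56.124   (rounded)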
107 | 108 | Some databases will lower the precision of timestamp/datetime values by truncating them, and some by rounding them. 109 | 110 | ``ROUNDS_ON_PREC_LOSS`` should be ``True`` if the database rounds, or ``False`` if it truncates. 111 | 112 | :meth:`__init__`, :meth:`create_connection()` 113 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 114 | 115 | The options for the database connection will be given to the ``__init__()`` method as keywords. 116 | 117 | If you inherit from ``Database``, your ``__init__()`` method may create the database connection. 118 | 119 | If you inherit from ``ThreadedDatabase``, you should instead create the connection in the ``create_connection()`` method. 120 | 121 | :meth:`close()` 122 | ~~~~~~~~~~~~~~~~ 123 | 124 | If you inherit from ``Database``, you will need to implement this method to close the connection yourself. 125 | 126 | If you inherit from ``ThreadedDatabase``, you don't have to implement this method. 127 | 128 | Docs: 129 | 130 | - :meth:`reladiff.databases.database_types.AbstractDatabase.close` 131 | 132 | :meth:`quote()`, :meth:`to_string()` 133 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 134 | 135 | These methods are used when creating queries, to quote a value, or cast it to STRING/VARCHAR. 136 | 137 | For more information, read their docs: 138 | 139 | - :meth:`reladiff.databases.database_types.AbstractDatabase.quote` 140 | 141 | - :meth:`reladiff.databases.database_types.AbstractDatabase.to_string` 142 | 143 | :meth:`normalize_number()`, :meth:`normalize_timestamp()`, :meth:`md5_to_int()` 144 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 145 | Because comparing data between two databases requires the data to be in the same format, we provide normalization functions. 146 | 147 | Databases can have the same data in different formats, e.g. ``DECIMAL`` vs ``FLOAT`` vs ``VARCHAR``, with different precisions. 148 | Reladiff works by converting the values to ``VARCHAR`` and comparing them. 149 | Your ``normalize_number()``/``normalize_timestamp()`` functions should account for differing precisions between columns. 150 | 151 | These functions accept an SQL code fragment, and return a new code fragment representing the appropriate computation. 152 | 153 | :meth:`parse_type` 154 | ~~~~~~~~~~~~~~~~~~~~~~~~ 155 | 156 | This is used to parse types that the system cannot detect automatically. 157 | For example: 158 | ``DECIMAL(10,3)`` needs to be parsed with a custom algorithm, e.g. a regex that splits it into the type name, precision, and scale. 159 | 160 | 3. Debugging 161 | ----------------------- 162 | 163 | You can enable debug logging for tests by setting the logger level to ``DEBUG``, via the environment variable ``LOG_LEVEL``, or the ``LOG_LEVEL`` variable in /tests/common.py. 164 | This will display all the queries that are run, and the type detected for each column. 165 | 166 | 4. Add tests 167 | -------------- 168 | 169 | Add your new database to the ``DATABASE_TYPES`` dict in ``tests/test_database_types.py``. 170 | 171 | The key is the class itself, and the value is a dict of ``{category: [type1, type2, ...]}``. 172 | 173 | Categories supported are: ``int``, ``datetime``, ``float``, and ``uuid``. 174 | 175 | Example: 176 | 177 | :: 178 | 179 | DATABASE_TYPES = { 180 | ...
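        # A hypothetical entry for a new driver might look like:
        # db.MyNewDB: {
        #     "int": ["integer", "bigint"],
        #     "datetime": ["timestamp"],
        # },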
181 | db.PostgreSQL: { 182 | "int": [ "int", "bigint" ], 183 | "datetime": [ 184 | "timestamp(6) without time zone", 185 | "timestamp(3) without time zone", 186 | "timestamp(0) without time zone", 187 | "timestamp with time zone", 188 | ], 189 | ... 190 | }, 191 | 192 | 193 | Then run the tests and make sure your database driver is being tested. 194 | 195 | You can run the tests with ``unittest``. 196 | 197 | To save time, we recommend running them with ``unittest-parallel``. 198 | 199 | When debugging, we recommend using the ``-f`` flag, to stop on error. Also, use the ``-k`` flag to run only the individual test that you're trying to fix. 200 | 201 | 5. Create Pull-Request 202 | ----------------------- 203 | 204 | Open a pull-request on GitHub, and we'll take it from there! 205 | -------------------------------------------------------------------------------- /docs/python-api.rst: -------------------------------------------------------------------------------- 1 | Python API Reference 2 | ==================== 3 | 4 | .. py:module:: reladiff 5 | 6 | .. autofunction:: connect 7 | 8 | .. autofunction:: connect_to_table 9 | 10 | .. autofunction:: diff_tables 11 | 12 | .. autoclass:: HashDiffer 13 | :members: __init__, diff_tables 14 | 15 | .. autoclass:: JoinDiffer 16 | :members: __init__, diff_tables 17 | 18 | .. autoclass:: TableSegment 19 | :members: __init__, get_values, choose_checkpoints, segment_by_checkpoints, count, count_and_checksum, is_bounded, new, with_schema 20 | 21 | .. autoclass:: DiffResultWrapper 22 | :members: __iter__, close, get_stats_dict, get_stats_string 23 | 24 | .. autoclass:: reladiff.databases.database_types.AbstractDatabase 25 | :members: 26 | 27 | .. autoclass:: reladiff.databases.database_types.AbstractDialect 28 | :members: 29 | 30 | .. autodata:: DbKey 31 | .. autodata:: DbTime 32 | .. autodata:: DbPath 33 | .. autoenum:: Algorithm 34 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # https://docs.readthedocs.io/en/stable/guides/specifying-dependencies.html#specifying-a-requirements-file 2 | sphinx-gallery 3 | sphinx_markdown_tables 4 | sphinx-copybutton 5 | sphinx-rtd-theme 6 | recommonmark 7 | enum-tools[sphinx] 8 | 9 | reladiff 10 | -------------------------------------------------------------------------------- /docs/supported-databases.md: -------------------------------------------------------------------------------- 1 | # List of supported databases 2 | 3 | | Database | Status | Connection string | 4 | |---------------|-------------------------------------------------------------------------------------------------------------------------------------|--------| 5 | | PostgreSQL >=10 | 💚 | `postgresql://:@:5432/` | 6 | | MySQL | 💚 | `mysql://:@:5432/` | 7 | | Snowflake | 💚 | `"snowflake://[:]@//?warehouse=&role=[&authenticator=externalbrowser]"` | 8 | | Redshift | 💚 | `redshift://:@:5439/` | 9 | | DuckDB >= 0.6 | 💚 | `duckdb://` | 10 | | Trino | 💚 | `trino://:@:8080/` | 11 | | BigQuery | 💛 | `bigquery:///` | 12 | | Oracle | 💛 | `oracle://:@/database` | 13 | | Presto | 💛 | `presto://:@:8080/` | 14 | | Vertica | 💛 | `vertica://:@:5433/` | 15 | | Clickhouse | 💛 | `clickhouse://:@:9000/` | 16 | | Databricks | 💛 | `databricks://:@//` | 17 | | SQLite | 📝 | | 18 | 19 | * 💚: Implemented and thoroughly tested. 20 | * 💛: Implemented, but not thoroughly tested yet. 21 | * ⏳: Implementation in progress.
22 | * 📝: Implementation planned. Contributions welcome. 23 | 24 | 25 | #### Looking for a database not on the list? 26 | If a database is not on the list, we'd still love to support it. [Please open an issue](https://github.com/erezsh/reladiff/issues) to discuss it, or vote on existing requests to push them up our todo list. 27 | 28 | We also accept pull-requests! 29 | -------------------------------------------------------------------------------- /docs/technical-explanation.md: -------------------------------------------------------------------------------- 1 | # Technical explanation 2 | 3 | Reladiff can diff tables within the same database, or across different databases. 4 | 5 | **Same-DB Diff:** 6 | - Uses an outer-join to diff the rows as efficiently and accurately as possible. 7 | - Supports materializing the diff results to a database table. 8 | - Can also collect various extra statistics about the tables. 9 | 10 | **Cross-DB Diff:** Employs a divide and conquer algorithm based on hashing, optimized for few changes. 11 | 12 | The following is a technical explanation of the cross-db diff. 13 | 14 | ### Overview 15 | 16 | Reladiff divides the table into smaller segments and computes checksums for each segment in both databases. If the checksums for a segment do not match, it further subdivides that segment and continues checksumming until it identifies the differing row(s). 17 | 18 | This approach has performance within an order of magnitude of count(*) when there are few/no changes, but is able to output each differing row! By pushing the compute into the databases, it's much faster than querying for and comparing every row. 19 | 20 | ![Performance for 100M rows](https://user-images.githubusercontent.com/97400/175182987-a3900d4e-c097-4732-a4e9-19a40fac8cdc.png) 21 | 22 | **†:** The performance for downloading rows is fairly driver-specific. In our tests, PostgreSQL performed 10x 23 | better than MySQL. 24 | 25 | ### Deep Dive 26 | 27 | In this section we'll be doing a walk-through of exactly how Reladiff 28 | works, and how to tune `--bisection-factor` and `--bisection-threshold`. 29 | 30 | Let's consider a scenario with an `orders` table with 1M rows. Fivetran is 31 | replicating it contionously from PostgreSQL to Snowflake: 32 | 33 | ``` 34 | ┌─────────────┐ ┌─────────────┐ 35 | │ PostgreSQL │ │ Snowflake │ 36 | ├─────────────┤ ├─────────────┤ 37 | │ │ │ │ 38 | │ │ │ │ 39 | │ │ ┌─────────────┐ │ table with │ 40 | │ table with ├──┤ replication ├──────▶│ ?maybe? all │ 41 | │lots of rows!│ └─────────────┘ │ the same │ 42 | │ │ │ rows. │ 43 | │ │ │ │ 44 | │ │ │ │ 45 | │ │ │ │ 46 | └─────────────┘ └─────────────┘ 47 | ``` 48 | 49 | In order to check whether the two tables are the same, Reladiff splits 50 | the table into segemnts. We define `--bisection-factor=10`, so it will start with 10 segments. 51 | 52 | We also have to choose which columns we want to checksum. In our case, we care 53 | about the primary key, `--key-column=id` and the update column 54 | `--update-column=updated_at`. `updated_at` is updated every time the row is, and 55 | we have an index on it. 56 | 57 | Reladiff starts by querying both databases for the `min(id)` and `max(id)` 58 | of the table. 
Then it splits the table into `--bisection-factor=10` segments of 59 | `1M/10 = 100K` keys each: 60 | 61 | ``` 62 | ┌──────────────────────┐ ┌──────────────────────┐ 63 | │ PostgreSQL │ │ Snowflake │ 64 | ├──────────────────────┤ ├──────────────────────┤ 65 | │ id=1..100k │ │ id=1..100k │ 66 | ├──────────────────────┤ ├──────────────────────┤ 67 | │ id=100k..200k │ │ id=100k..200k │ 68 | ├──────────────────────┤ ├──────────────────────┤ 69 | │ id=200k..300k ├─────────────▶│ id=200k..300k │ 70 | ├──────────────────────┤ ├──────────────────────┤ 71 | │ id=300k..400k │ │ id=300k..400k │ 72 | ├──────────────────────┤ ├──────────────────────┤ 73 | │ ... │ │ ... │ 74 | ├──────────────────────┤ ├──────────────────────┤ 75 | │ 900k..100k │ │ 900k..100k │ 76 | └───────────────────▲──┘ └▲─────────────────────┘ 77 | ┃ ┃ 78 | ┃ ┃ 79 | ┃ checksum queries ┃ 80 | ┃ ┃ 81 | ┌─┻──────────────────┻────┐ 82 | │ Reladiff │ 83 | └─────────────────────────┘ 84 | ``` 85 | 86 | Now Reladiff will start running `--threads=1` queries in parallel that 87 | checksum each segment. The queries for checksumming each segment will look 88 | something like this, depending on the database: 89 | 90 | ```sql 91 | SELECT count(*), 92 | sum(cast(conv(substring(md5(concat(cast(id as char), cast(updated_at as char))), 18), 16, 10) as unsigned)) 93 | FROM `rating_del1` 94 | WHERE (id >= 1) AND (id < 100000) 95 | ``` 96 | 97 | This keeps the amount of data that has to be transferred between the databases 98 | to a minimum, making it very performant! Additionally, if you have an index on 99 | `updated_at` (highly recommended), then the query will be fast, as the database 100 | only has to do a partial index scan between `id=1..100k`. 101 | 102 | If you are not sure whether the queries are using an index, you can run it with 103 | `--interactive`. This puts Reladiff in interactive mode, where it shows an 104 | `EXPLAIN` before executing each query, requiring confirmation to proceed. 105 | 106 | After running the checksum queries on both sides, we see that all segments 107 | are the same except `id=100k..200k`: 108 | 109 | ``` 110 | ┌──────────────────────┐ ┌──────────────────────┐ 111 | │ PostgreSQL │ │ Snowflake │ 112 | ├──────────────────────┤ ├──────────────────────┤ 113 | │ checksum=0102 │ │ checksum=0102 │ 114 | ├──────────────────────┤ mismatch! ├──────────────────────┤ 115 | │ checksum=ffff ◀──────────────▶ checksum=aaab │ 116 | ├──────────────────────┤ ├──────────────────────┤ 117 | │ checksum=abab │ │ checksum=abab │ 118 | ├──────────────────────┤ ├──────────────────────┤ 119 | │ checksum=f0f0 │ │ checksum=f0f0 │ 120 | ├──────────────────────┤ ├──────────────────────┤ 121 | │ ... │ │ ... │ 122 | ├──────────────────────┤ ├──────────────────────┤ 123 | │ checksum=9494 │ │ checksum=9494 │ 124 | └──────────────────────┘ └──────────────────────┘ 125 | ``` 126 | 127 | Now Reladiff will do exactly as it just did for the _whole table_ for only 128 | this segment: Split it into `--bisection-factor` segments. 129 | 130 | However, this time, because each segment has `100k/10=10k` entries, which is 131 | less than the `--bisection-threshold`, it will pull down every row in the segment 132 | and compare them in memory in Reladiff. 
133 | 134 | ``` 135 | ┌──────────────────────┐ ┌──────────────────────┐ 136 | │ PostgreSQL │ │ Snowflake │ 137 | ├──────────────────────┤ ├──────────────────────┤ 138 | │ id=100k..110k │ │ id=100k..110k │ 139 | ├──────────────────────┤ ├──────────────────────┤ 140 | │ id=110k..120k │ │ id=110k..120k │ 141 | ├──────────────────────┤ ├──────────────────────┤ 142 | │ id=120k..130k │ │ id=120k..130k │ 143 | ├──────────────────────┤ ├──────────────────────┤ 144 | │ id=130k..140k │ │ id=130k..140k │ 145 | ├──────────────────────┤ ├──────────────────────┤ 146 | │ ... │ │ ... │ 147 | ├──────────────────────┤ ├──────────────────────┤ 148 | │ 190k..200k │ │ 190k..200k │ 149 | └──────────────────────┘ └──────────────────────┘ 150 | ``` 151 | 152 | Finally Reladiff will output the `(id, updated_at)` for each row that was different: 153 | 154 | ``` 155 | (122001, 1653672821) 156 | ``` 157 | 158 | If you pass `--stats` you'll see stats such as the % of rows were different. 159 | 160 | ### Performance Considerations 161 | 162 | * Ensure that you have indexes on the columns you are comparing. Preferably a 163 | compound index. You can run with `--interactive` to see an `EXPLAIN` for the 164 | queries. 165 | * Consider increasing the number of simultaneous threads executing 166 | queries per database with `--threads`. For databases that limit concurrency 167 | per query, such as PostgreSQL/MySQL, this can improve performance dramatically. 168 | * If you are only interested in _whether_ something changed, pass `--limit 1`. 169 | This can be useful if changes are very rare. This is often faster than doing a 170 | `count(*)`, for the reason mentioned above. 171 | * If the table is _very_ large, consider a larger `--bisection-factor`. Otherwise, you may run into timeouts. 172 | * If there are a lot of changes, consider a larger `--bisection-threshold`. 173 | * If there are very large gaps in your key column (e.g., 10s of millions of 174 | continuous rows missing), then Reladiff may perform poorly, doing lots of 175 | queries for ranges of rows that do not exist. We have ideas on how to tackle this issue, which we have yet to implement. If you're experiencing this effect, please open an issue, and we 176 | will prioritize it. 177 | * The fewer columns you verify (passed with `--columns`), the faster 178 | Reladiff will be. On one extreme, you can verify every column; on the 179 | other, you can verify _only_ `updated_at`, if you trust it enough. You can also 180 | _only_ verify `id` if you're interested in only presence, such as to detect 181 | missing hard deletes. You can do also do a hybrid where you verify 182 | `updated_at` and the most critical value, such as a money value in `amount`, but 183 | not verify a large serialized column like `json_settings`. 184 | * We have ideas for making Reladiff even faster that 185 | we haven't implemented yet: faster checksums by reducing type-casts 186 | and using a faster hash than MD5, dynamic adaptation of 187 | `bisection_factor`/`threads`/`bisection_threshold` (especially with large key 188 | gaps), and improvements to bypass Python/driver performance limitations when 189 | comparing huge amounts of rows locally (i.e. for very high `bisection_threshold` values). 
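To tie the walkthrough above together, here is a simplified sketch of the cross-DB (hashdiff) recursion in Python. It is illustrative only: `fetch()` and `checksum()` are stand-ins for the real queries, keys are assumed to be dense integers with the key in the first column, and the actual implementation (see `reladiff/hashdiff_tables.py`) also handles threading, compound keys, duplicate rows, and non-integer keys.

```python
def diff_segment(table1, table2, min_key, max_key, bisection_factor, bisection_threshold):
    """Recursively compare the key range [min_key, max_key), yielding ('-'/'+', row) pairs."""
    approx_size = max_key - min_key
    if approx_size <= bisection_threshold:
        # Small enough: download both segments and compare them locally.
        rows1 = {row[0]: row for row in table1.fetch(min_key, max_key)}
        rows2 = {row[0]: row for row in table2.fetch(min_key, max_key)}
        for key in rows1.keys() | rows2.keys():
            if rows1.get(key) != rows2.get(key):
                if key in rows1:
                    yield ("-", rows1[key])
                if key in rows2:
                    yield ("+", rows2[key])
        return

    # Otherwise, checksum each of the `bisection_factor` sub-segments on both sides,
    # and only recurse into the sub-segments whose checksums don't match.
    step = -(-approx_size // bisection_factor)  # ceiling division
    for lo in range(min_key, max_key, step):
        hi = min(lo + step, max_key)
        if table1.checksum(lo, hi) != table2.checksum(lo, hi):
            yield from diff_segment(table1, table2, lo, hi, bisection_factor, bisection_threshold)
```

With the numbers from the example above (1M rows, `--bisection-factor=10`, and a `--bisection-threshold` larger than 10k), the recursion stops after two levels: the first level checksums 100k-row segments, and the single mismatching segment is split into 10k-row segments that are downloaded and compared directly.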
190 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "reladiff" 3 | version = "0.6.0" 4 | description = "Command-line tool and Python library to efficiently diff rows across two different databases." 5 | authors = ["Erez Shinan "] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/erezsh/reladiff" 9 | documentation = "https://reladiff.readthedocs.io/en/latest/" 10 | classifiers = [ 11 | "Intended Audience :: Developers", 12 | "Intended Audience :: Information Technology", 13 | "Intended Audience :: System Administrators", 14 | "Programming Language :: Python :: 3.8", 15 | "Programming Language :: Python :: 3.9", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: 3.12", 19 | "Development Status :: 4 - Beta", 20 | "Environment :: Console", 21 | "Topic :: Database :: Database Engines/Servers", 22 | "Typing :: Typed" 23 | ] 24 | packages = [{ include = "reladiff" }] 25 | 26 | [tool.poetry.dependencies] 27 | python = "^3.8" 28 | runtype = ">=0.5.0" 29 | dsnparse = "*" 30 | click = ">=8.1" 31 | rich = "*" 32 | toml = ">=0.10.2" 33 | sqeleton = "^0.1.7" 34 | mysql-connector-python = {version=">=8.0.29", optional=true} 35 | psycopg2-binary = {version="*", optional=true} 36 | snowflake-connector-python = {version=">=2.7.2", optional=true} 37 | cryptography = {version="*", optional=true} 38 | trino = {version=">=0.314.0", optional=true} 39 | presto-python-client = {version="*", optional=true} 40 | clickhouse-driver = {version="*", optional=true} 41 | duckdb = {version=">=0.6.0", optional=true} 42 | 43 | [tool.poetry.dev-dependencies] 44 | parameterized = "*" 45 | unittest-parallel = "*" 46 | # preql = ">=0.2.19" 47 | mysql-connector-python = "*" 48 | psycopg2-binary = "*" 49 | snowflake-connector-python = ">=2.7.2" 50 | cryptography = "*" 51 | trino = ">=0.314.0" 52 | presto-python-client = "*" 53 | clickhouse-driver = "*" 54 | vertica-python = "*" 55 | duckdb = ">=0.6.0" 56 | # google-cloud-bigquery = "*" 57 | # databricks-sql-connector = "*" 58 | 59 | [tool.poetry.extras] 60 | # When adding, update also: README + dev deps just above 61 | preql = ["preql"] 62 | mysql = ["mysql-connector-python"] 63 | postgresql = ["psycopg2-binary"] 64 | snowflake = ["snowflake-connector-python", "cryptography"] 65 | presto = ["presto-python-client"] 66 | oracle = ["cx_Oracle"] 67 | # databricks = ["databricks-sql-connector"] 68 | trino = ["trino"] 69 | clickhouse = ["clickhouse-driver"] 70 | vertica = ["vertica-python"] 71 | duckdb = ["duckdb"] 72 | 73 | all = ["mysql-connector-python", "psycopg2-binary", "snowflake-connector-python", "cryptography", "presto-python-client", "cx_Oracle", "trino", "clickhouse-driver", "vertica-python", "duckdb"] 74 | 75 | [build-system] 76 | requires = ["poetry-core>=1.0.0"] 77 | build-backend = "poetry.core.masonry.api" 78 | 79 | [tool.poetry.scripts] 80 | reladiff = 'reladiff.__main__:main' 81 | 82 | [tool.mypy] 83 | no_implicit_optional=false 84 | 85 | [tool.ruff] 86 | line-length = 120 87 | 88 | [tool.black] 89 | line-length = 120 90 | target-version = ['py38'] 91 | -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | formats: all 4 | 5 | build: 6 | os: 
ubuntu-22.04 7 | tools: 8 | python: "3.8" 9 | 10 | python: 11 | install: 12 | - requirements: docs/requirements.txt 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | -------------------------------------------------------------------------------- /reladiff/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple, Iterable, Optional, Union 2 | 3 | from sqeleton.abcs import DbTime, DbPath, AbstractDatabase 4 | 5 | from .databases import connect 6 | from .diff_tables import Algorithm, TableDiffer, DiffResultWrapper 7 | from .hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR 8 | from .joindiff_tables import JoinDiffer, TABLE_WRITE_LIMIT 9 | from .table_segment import TableSegment 10 | from .utils import eval_name_template, Vector 11 | 12 | __version__ = "0.6.0" 13 | 14 | 15 | def connect_to_table( 16 | db_info: Union[str, dict, AbstractDatabase], 17 | table_name: Union[DbPath, str], 18 | key_columns: Union[str, Sequence[str]] = ("id",), 19 | thread_count: Optional[int] = 1, 20 | **kwargs, 21 | ) -> TableSegment: 22 | """Connects to the given database, and creates a TableSegment instance 23 | 24 | Parameters: 25 | db_info: Either a URI string, dict of connection options or a reladiff AbstractDatabase type. 26 | table_name: Name of the table as a string, or a tuple that signifies the path. 27 | key_columns: Names of the key columns 28 | thread_count: Number of threads for this connection (only if using a threadpooled db implementation) 29 | 30 | See Also: 31 | :meth:`connect` 32 | """ 33 | if isinstance(key_columns, str): 34 | key_columns = (key_columns,) 35 | if isinstance(db_info, AbstractDatabase): 36 | db = db_info 37 | else: 38 | db = connect(db_info, thread_count=thread_count) 39 | 40 | if isinstance(table_name, str): 41 | table_name = db.parse_table_name(table_name) 42 | 43 | return TableSegment(db, tuple(table_name), tuple(key_columns), **kwargs) 44 | 45 | 46 | def diff_tables( 47 | table1: TableSegment, 48 | table2: TableSegment, 49 | *, 50 | # Name of the key column, which uniquely identifies each row (usually id) 51 | key_columns: Sequence[str] = None, 52 | # Name of updated column, which signals that rows changed (usually updated_at or last_update) 53 | update_column: str = None, 54 | # Extra columns to compare 55 | extra_columns: Tuple[str, ...] = None, 56 | # Start/end key_column values, used to restrict the segment 57 | min_key: Vector = None, 58 | max_key: Vector = None, 59 | # Start/end update_column values, used to restrict the segment 60 | min_update: DbTime = None, 61 | max_update: DbTime = None, 62 | # Enable/disable threaded diffing. Needed to take advantage of database threads. 63 | threaded: bool = True, 64 | # Maximum size of each threadpool. None = auto. Only relevant when threaded is True. 65 | # There may be many pools, so number of actual threads can be a lot higher. 66 | max_threadpool_size: Optional[int] = 1, 67 | # Algorithm 68 | algorithm: Algorithm = Algorithm.AUTO, 69 | # An additional 'where' expression to restrict the search space. 
70 | where: str = None, 71 | # Into how many segments to bisect per iteration (hashdiff only) 72 | bisection_factor: int = DEFAULT_BISECTION_FACTOR, 73 | # When should we stop bisecting and compare locally (in row count; hashdiff only) 74 | bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD, 75 | # Enable/disable validating that the key columns are unique. (joindiff only) 76 | # Enable/disable support for duplicate rows, offering a small performance gain. (hashdiff only) 77 | validate_unique_key: bool = True, 78 | # Enable/disable sampling of exclusive rows. Creates a temporary table. (joindiff only) 79 | sample_exclusive_rows: bool = False, 80 | # Path of new table to write diff results to. Disabled if not provided. (joindiff only) 81 | materialize_to_table: Union[str, DbPath] = None, 82 | # Materialize every row, not just those that are different. (joindiff only) 83 | materialize_all_rows: bool = False, 84 | # Maximum number of rows to write when materializing, per thread. (joindiff only) 85 | table_write_limit: int = TABLE_WRITE_LIMIT, 86 | # If false, diffing on empty tables raises an EmptyTable(ValueError) exception. 87 | allow_empty_tables: bool = False, 88 | # Skip sorting the hashdiff output by key for better performance. (hashdiff only) 89 | skip_sort_results: bool = False, 90 | ) -> DiffResultWrapper: 91 | """Finds the diff between table1 and table2. 92 | 93 | Parameters: 94 | key_columns (Tuple[str, ...]): Name of the key column, which uniquely identifies each row (usually id) 95 | update_column (str, optional): Name of updated column, which signals that rows changed. 96 | Usually updated_at or last_update. Used by `min_update` and `max_update`. 97 | extra_columns (Tuple[str, ...], optional): Extra columns to compare 98 | min_key (:data:`Vector`, optional): Lowest key value, used to restrict the segment 99 | max_key (:data:`Vector`, optional): Highest key value, used to restrict the segment 100 | min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment 101 | max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment 102 | threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads. 103 | max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto. 104 | Only relevant when `threaded` is ``True``. 105 | There may be many pools, so number of actual threads can be a lot higher. 106 | (Note: For best performance, we recommend setting this to at least twice the 107 | `thread_count` argument provided to the driver through `connect()`/`connect_to_table()`. 108 | where (str, optional): An additional 'where' expression to restrict the search space. 109 | algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`. Default=`AUTO`) 110 | bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`) 111 | bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download 112 | and compare locally. (Used when algorithm is `HASHDIFF`). 113 | validate_unique_key (bool): Enable/disable validating that the key columns are unique (`JOINDIFF`). 114 | Enable/disable support for duplicate rows, offering a small performance gain (`HASHDIFF`). 115 | (default: True) 116 | Single query, and can't be threaded, so it's very slow on non-cloud dbs. 117 | Future versions will detect UNIQUE constraints in the schema. 
118 | sample_exclusive_rows (bool): Enable/disable sampling of exclusive rows. Creates a temporary table. (used for `JOINDIFF`. default: False) 119 | materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. Used for `JOINDIFF`. 120 | materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False) 121 | table_write_limit (int): Maximum number of rows to write when materializing, per thread. 122 | allow_empty_tables (bool): If false, diffing on empty tables raises an EmptyTable(ValueError) exception. 123 | skip_sort_results (bool): Skip sorting the hashdiff output by key for better performance. (used for `HASHDIFF`. default: False) 124 | 125 | Note: 126 | The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances: 127 | `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`, `where`. 128 | If different values are needed per table, it's possible to omit them here, and instead set 129 | them directly when creating each :class:`TableSegment`. 130 | 131 | Note: 132 | It is recommended to call .close() on the returned object when done, to release thread-pool. Alternatively, you may use it as a context manager. 133 | 134 | Example: 135 | >>> table1 = connect_to_table('postgresql:///', 'Rating', 'id') 136 | >>> list(diff_tables(table1, table1)) 137 | [] 138 | >>> with diff_tables(table1, table1) as diff: 139 | ... print(list(diff)) 140 | [] 141 | 142 | See Also: 143 | :class:`TableSegment` 144 | :class:`HashDiffer` 145 | :class:`JoinDiffer` 146 | 147 | """ 148 | if isinstance(key_columns, str): 149 | key_columns = (key_columns,) 150 | 151 | tables = [table1, table2] 152 | override_attrs = { 153 | k: v 154 | for k, v in dict( 155 | key_columns=key_columns, 156 | update_column=update_column, 157 | extra_columns=extra_columns, 158 | min_key=min_key, 159 | max_key=max_key, 160 | min_update=min_update, 161 | max_update=max_update, 162 | where=where, 163 | ).items() 164 | if v is not None 165 | } 166 | 167 | segments = [t.new(**override_attrs) for t in tables] if override_attrs else tables 168 | 169 | algorithm = Algorithm(algorithm) 170 | if algorithm == Algorithm.AUTO: 171 | algorithm = Algorithm.JOINDIFF if table1.database is table2.database else Algorithm.HASHDIFF 172 | 173 | differ: TableDiffer 174 | if algorithm == Algorithm.HASHDIFF: 175 | differ = HashDiffer( 176 | bisection_factor=bisection_factor, 177 | bisection_threshold=bisection_threshold, 178 | threaded=threaded, 179 | max_threadpool_size=max_threadpool_size, 180 | allow_empty_tables=allow_empty_tables, 181 | skip_sort_results=skip_sort_results, 182 | ) 183 | elif algorithm == Algorithm.JOINDIFF: 184 | if isinstance(materialize_to_table, str): 185 | materialize_to_table = table1.database.parse_table_name(eval_name_template(materialize_to_table)) 186 | differ = JoinDiffer( 187 | threaded=threaded, 188 | max_threadpool_size=max_threadpool_size, 189 | validate_unique_key=validate_unique_key, 190 | sample_exclusive_rows=sample_exclusive_rows, 191 | materialize_to_table=materialize_to_table, 192 | materialize_all_rows=materialize_all_rows, 193 | table_write_limit=table_write_limit, 194 | allow_empty_tables=allow_empty_tables, 195 | ) 196 | else: 197 | raise ValueError(f"Unknown algorithm: {algorithm}") 198 | 199 | return differ.diff_tables(*segments) 200 | -------------------------------------------------------------------------------- 
/reladiff/config.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from typing import Any, Dict 4 | import toml 5 | 6 | 7 | class ConfigParseError(Exception): 8 | pass 9 | 10 | 11 | def is_uri(s: str) -> bool: 12 | return "://" in s 13 | 14 | 15 | def _apply_config(config: Dict[str, Any], run_name: str, kw: Dict[str, Any]): 16 | _resolve_env(config) 17 | 18 | # Load config 19 | databases = config.pop("database", {}) 20 | runs = config.pop("run", {}) 21 | if config: 22 | raise ConfigParseError(f"Unknown option(s): {config}") 23 | 24 | # Init run_args 25 | run_args = runs.get("default") or {} 26 | if run_name: 27 | if run_name not in runs: 28 | raise ConfigParseError(f"Cannot find run '{run_name}' in configuration.") 29 | run_args.update(runs[run_name]) 30 | else: 31 | run_name = "default" 32 | 33 | if kw.get("database1") is not None: 34 | for attr in ("table1", "database2", "table2"): 35 | if kw[attr] is None: 36 | raise ValueError(f"Specified database1 but not {attr}. Must specify all 4 arguments, or neither.") 37 | 38 | for index in "12": 39 | run_args[index] = {attr: kw.pop(f"{attr}{index}") for attr in ("database", "table")} 40 | 41 | # Process databases + tables 42 | for index in "12": 43 | try: 44 | args = run_args.pop(index) 45 | except KeyError: 46 | raise ConfigParseError( 47 | f"Could not find source #{index}: Expecting a key of '{index}' containing '.database' and '.table'." 48 | ) 49 | for attr in ("database", "table"): 50 | if attr not in args: 51 | raise ConfigParseError(f"Running 'run.{run_name}': Connection #{index} is missing attribute '{attr}'.") 52 | 53 | database = args.pop("database") 54 | table = args.pop("table") 55 | threads = args.pop("threads", None) 56 | if args: 57 | raise ConfigParseError(f"Unexpected attributes for connection #{index}: {args}") 58 | 59 | if not is_uri(database): 60 | if database not in databases: 61 | raise ConfigParseError( 62 | f"Database '{database}' not found in list of databases. Available: {list(databases)}." 63 | ) 64 | database = dict(databases[database]) 65 | assert isinstance(database, dict) 66 | if "driver" not in database: 67 | raise ConfigParseError(f"Database '{database}' did not specify a driver.") 68 | 69 | run_args[f"database{index}"] = database 70 | run_args[f"table{index}"] = table 71 | if threads is not None: 72 | run_args[f"threads{index}"] = int(threads) 73 | 74 | # Update keywords 75 | new_kw = dict(kw) # Set defaults 76 | new_kw.update(run_args) # Apply config 77 | new_kw.update({k: v for k, v in kw.items() if v}) # Apply non-empty defaults 78 | 79 | new_kw["__conf__"] = run_args 80 | 81 | return new_kw 82 | 83 | 84 | # There are no strict requirements for the environment variable name format. 85 | # But most shells only allow alphanumeric characters and underscores. 86 | # https://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html 87 | # "Environment variable names (...) consist solely of uppercase letters, digits, and the '_' (underscore)" 88 | _ENV_VAR_PATTERN = r"\$\{([A-Za-z0-9_]+)\}" 89 | 90 | 91 | def _resolve_env(config: Dict[str, Any]): 92 | """ 93 | Resolve environment variables referenced as ${ENV_VAR_NAME}. 94 | Missing environment variables are replaced with an empty string. 
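    For example, `password = "${MY_PASSWORD}"` in the TOML config resolves to the
    value of the MY_PASSWORD environment variable, or to "" if it is not set.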
95 | """ 96 | for key, value in config.items(): 97 | if isinstance(value, dict): 98 | _resolve_env(value) 99 | elif isinstance(value, str): 100 | config[key] = re.sub(_ENV_VAR_PATTERN, _replace_match, value) 101 | 102 | 103 | def _replace_match(match: re.Match) -> str: 104 | # Lookup referenced variable in environment. 105 | # Replace with empty string if not found 106 | referenced_var = match.group(1) # group(0) is the whole string 107 | return os.environ.get(referenced_var, "") 108 | 109 | 110 | def apply_config_from_file(path: str, run_name: str, kw: Dict[str, Any]): 111 | with open(path) as f: 112 | return _apply_config(toml.load(f), run_name, kw) 113 | 114 | 115 | def apply_config_from_string(toml_config: str, run_name: str, kw: Dict[str, Any]): 116 | return _apply_config(toml.loads(toml_config), run_name, kw) 117 | -------------------------------------------------------------------------------- /reladiff/databases/__init__.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, QueryError, ConnectError 2 | 3 | from .postgresql import PostgreSQL 4 | from .mysql import MySQL 5 | from .oracle import Oracle 6 | from .snowflake import Snowflake 7 | from .bigquery import BigQuery 8 | from .redshift import Redshift 9 | from .presto import Presto 10 | from .databricks import Databricks 11 | from .trino import Trino 12 | from .clickhouse import Clickhouse 13 | from .vertica import Vertica 14 | from .duckdb import DuckDB 15 | 16 | from ._connect import connect 17 | -------------------------------------------------------------------------------- /reladiff/databases/_connect.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from sqeleton.databases import Connect 4 | 5 | from .postgresql import PostgreSQL 6 | from .mysql import MySQL 7 | from .oracle import Oracle 8 | from .snowflake import Snowflake 9 | from .bigquery import BigQuery 10 | from .redshift import Redshift 11 | from .presto import Presto 12 | from .databricks import Databricks 13 | from .trino import Trino 14 | from .clickhouse import Clickhouse 15 | from .vertica import Vertica 16 | from .duckdb import DuckDB 17 | 18 | 19 | DATABASE_BY_SCHEME = { 20 | "postgresql": PostgreSQL, 21 | "mysql": MySQL, 22 | "oracle": Oracle, 23 | "redshift": Redshift, 24 | "snowflake": Snowflake, 25 | "presto": Presto, 26 | "bigquery": BigQuery, 27 | "databricks": Databricks, 28 | "duckdb": DuckDB, 29 | "trino": Trino, 30 | "clickhouse": Clickhouse, 31 | "vertica": Vertica, 32 | } 33 | 34 | 35 | class Connect_SetUTC(Connect): 36 | __doc__ = Connect.__call__.__doc__ 37 | 38 | def _connection_created(self, db): 39 | db = super()._connection_created(db) 40 | try: 41 | db.query(db.dialect.set_timezone_to_utc()) 42 | except NotImplementedError: 43 | logging.debug( 44 | f"Database '{db}' does not allow setting timezone. We recommend making sure it's set to 'UTC'." 
45 | ) 46 | return db 47 | 48 | 49 | connect = Connect_SetUTC(DATABASE_BY_SCHEME) 50 | -------------------------------------------------------------------------------- /reladiff/databases/base.py: -------------------------------------------------------------------------------- 1 | from sqeleton.abcs.mixins import AbstractMixin_MD5, AbstractMixin_NormalizeValue 2 | 3 | 4 | class ReladiffDialect(AbstractMixin_MD5, AbstractMixin_NormalizeValue): 5 | pass 6 | -------------------------------------------------------------------------------- /reladiff/databases/bigquery.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import bigquery 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(bigquery.Dialect, bigquery.Mixin_MD5, bigquery.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class BigQuery(bigquery.BigQuery): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/clickhouse.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import clickhouse 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(clickhouse.Dialect, clickhouse.Mixin_MD5, clickhouse.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Clickhouse(clickhouse.Clickhouse): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/databricks.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import databricks 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(databricks.Dialect, databricks.Mixin_MD5, databricks.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Databricks(databricks.Databricks): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/duckdb.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import duckdb 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(duckdb.Dialect, duckdb.Mixin_MD5, duckdb.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class DuckDB(duckdb.DuckDB): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/mysql.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import mysql 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(mysql.Dialect, mysql.Mixin_MD5, mysql.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class MySQL(mysql.MySQL): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/oracle.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import oracle 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(oracle.Dialect, oracle.Mixin_MD5, oracle.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Oracle(oracle.Oracle): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/postgresql.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import postgresql as pg 2 | from 
.base import ReladiffDialect 3 | 4 | 5 | class PostgresqlDialect(pg.PostgresqlDialect, pg.Mixin_MD5, pg.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class PostgreSQL(pg.PostgreSQL): 10 | dialect = PostgresqlDialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/presto.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import presto 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(presto.Dialect, presto.Mixin_MD5, presto.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Presto(presto.Presto): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/redshift.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import redshift 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(redshift.Dialect, redshift.Mixin_MD5, redshift.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Redshift(redshift.Redshift): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/snowflake.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import snowflake 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(snowflake.Dialect, snowflake.Mixin_MD5, snowflake.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Snowflake(snowflake.Snowflake): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/trino.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import trino 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(trino.Dialect, trino.Mixin_MD5, trino.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Trino(trino.Trino): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/databases/vertica.py: -------------------------------------------------------------------------------- 1 | from sqeleton.databases import vertica 2 | from .base import ReladiffDialect 3 | 4 | 5 | class Dialect(vertica.Dialect, vertica.Mixin_MD5, vertica.Mixin_NormalizeValue, ReladiffDialect): 6 | pass 7 | 8 | 9 | class Vertica(vertica.Vertica): 10 | dialect = Dialect() 11 | -------------------------------------------------------------------------------- /reladiff/diff_tables.py: -------------------------------------------------------------------------------- 1 | """Provides classes for performing a table diff 2 | """ 3 | 4 | from abc import ABC, abstractmethod 5 | from enum import Enum 6 | from contextlib import contextmanager 7 | from operator import methodcaller 8 | from typing import Dict, Tuple, Iterator, Optional 9 | from concurrent.futures import ThreadPoolExecutor, as_completed 10 | 11 | from runtype import dataclass 12 | 13 | from .info_tree import InfoTree, SegmentInfo 14 | 15 | from .utils import safezip, getLogger, Vector 16 | from .thread_utils import ThreadedYielder 17 | from .table_segment import TableSegment, create_mesh_from_points, EmptyTable, EmptyTableSegment 18 | from sqeleton.abcs import IKey 19 | 20 | logger = getLogger(__name__) 21 | 22 | 23 | class Algorithm(Enum): 24 | AUTO = "auto" 25 | JOINDIFF = 
"joindiff" 26 | HASHDIFF = "hashdiff" 27 | 28 | 29 | DiffResult = Iterator[Tuple[str, tuple]] # Iterator[Tuple[Literal["+", "-"], tuple]] 30 | 31 | 32 | @dataclass 33 | class ThreadBase: 34 | "Provides utility methods for optional threading" 35 | 36 | threaded: bool = True 37 | max_threadpool_size: Optional[int] = 1 38 | 39 | def _thread_map(self, func, iterable): 40 | if not self.threaded: 41 | return map(func, iterable) 42 | 43 | with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool: 44 | return task_pool.map(func, iterable) 45 | 46 | def _threaded_call(self, func, iterable, **kw): 47 | "Calls a method for each object in iterable." 48 | return list(self._thread_map(methodcaller(func, **kw), iterable)) 49 | 50 | def _thread_as_completed(self, func, iterable): 51 | if not self.threaded: 52 | yield from map(func, iterable) 53 | return 54 | 55 | with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool: 56 | futures = [task_pool.submit(func, item) for item in iterable] 57 | for future in as_completed(futures): 58 | yield future.result() 59 | 60 | def _threaded_call_as_completed(self, func, iterable): 61 | "Calls a method for each object in iterable. Returned in order of completion." 62 | return self._thread_as_completed(methodcaller(func), iterable) 63 | 64 | @contextmanager 65 | def _run_in_background(self, *funcs): 66 | with ThreadPoolExecutor(max_workers=self.max_threadpool_size) as task_pool: 67 | futures = [task_pool.submit(f) for f in funcs if f is not None] 68 | yield futures 69 | for f in futures: 70 | f.result() 71 | 72 | 73 | @dataclass 74 | class DiffStats: 75 | diff_by_sign: Dict[str, int] 76 | table1_count: int 77 | table2_count: int 78 | unchanged: int 79 | diff_percent: float 80 | 81 | 82 | @dataclass 83 | class DiffResultWrapper: 84 | """Wrapper for the diff result, with additional stats and info 85 | 86 | Supports reenterant iteration, context management, and immediate closing of the thread pool. 87 | 88 | Note: Once the threadpool is closed, the iterator will not be able to continue. 89 | """ 90 | diff: iter # DiffResult 91 | info_tree: InfoTree 92 | stats: dict 93 | _ti: ThreadedYielder 94 | result_list: list = [] 95 | 96 | def __iter__(self): 97 | """Iterate over the results of the diff. 98 | 99 | It's a "lazy-list": Repeated calls will return the same results, but will not re-run the diff. 100 | """ 101 | yield from self.result_list 102 | for i in self.diff: 103 | self.result_list.append(i) 104 | yield i 105 | 106 | def close(self): 107 | "Immediately stop diffing and close the thread pool" 108 | # TODO we should be able to wait for the thread pool to finish 109 | self._ti.shutdown(wait=False) 110 | 111 | def __enter__(self): 112 | return self 113 | def __exit__(self, *args): 114 | self.close() 115 | 116 | def _get_stats(self) -> DiffStats: 117 | list(self) # Consume the iterator into result_list, if we haven't already 118 | 119 | diff_by_key = {} 120 | for sign, values in self.result_list: 121 | k = values[: len(self.info_tree.info.tables[0].key_columns)] 122 | if k in diff_by_key: 123 | assert sign != diff_by_key[k] 124 | diff_by_key[k] = "!" 
125 | else: 126 | diff_by_key[k] = sign 127 | 128 | diff_by_sign = {k: 0 for k in "+-!"} 129 | for sign in diff_by_key.values(): 130 | diff_by_sign[sign] += 1 131 | 132 | table1_count = self.info_tree.info.rowcounts[1] 133 | table2_count = self.info_tree.info.rowcounts[2] 134 | unchanged = table1_count - diff_by_sign["-"] - diff_by_sign["!"] 135 | diff_percent = 1 - unchanged / max(table1_count, table2_count, 1) 136 | 137 | return DiffStats(diff_by_sign, table1_count, table2_count, unchanged, diff_percent) 138 | 139 | def get_stats_string(self): 140 | """Return a pretty string of the diff stats (used by the CLI)""" 141 | diff_stats = self._get_stats() 142 | string_output = "" 143 | string_output += f"{diff_stats.table1_count} rows in table A\n" 144 | string_output += f"{diff_stats.table2_count} rows in table B\n" 145 | string_output += f"{diff_stats.diff_by_sign['-']} rows exclusive to table A (not present in B)\n" 146 | string_output += f"{diff_stats.diff_by_sign['+']} rows exclusive to table B (not present in A)\n" 147 | string_output += f"{diff_stats.diff_by_sign['!']} rows updated\n" 148 | string_output += f"{diff_stats.unchanged} rows unchanged\n" 149 | string_output += f"{100*diff_stats.diff_percent:.2f}% difference score\n" 150 | return string_output 151 | 152 | def get_stats_dict(self): 153 | """Return a dictionary of the diff stats""" 154 | diff_stats = self._get_stats() 155 | json_output = { 156 | "rows_A": diff_stats.table1_count, 157 | "rows_B": diff_stats.table2_count, 158 | "exclusive_A": diff_stats.diff_by_sign["-"], 159 | "exclusive_B": diff_stats.diff_by_sign["+"], 160 | "updated": diff_stats.diff_by_sign["!"], 161 | "unchanged": diff_stats.unchanged, 162 | "total": sum(diff_stats.diff_by_sign.values()), 163 | "stats": self.stats, 164 | } 165 | 166 | return json_output 167 | 168 | 169 | @dataclass(frozen=True) 170 | class TableDiffer(ThreadBase, ABC): 171 | bisection_factor = 32 172 | stats: dict = {} 173 | allow_empty_tables: bool = False 174 | 175 | def diff_tables( 176 | self, table1: TableSegment, table2: TableSegment, *, info_tree: InfoTree = None 177 | ) -> DiffResultWrapper: 178 | """Diff the given tables. 179 | 180 | Parameters: 181 | table1 (TableSegment): The "before" table to compare. Or: source table 182 | table2 (TableSegment): The "after" table to compare. Or: target table 183 | 184 | Returns: 185 | An iterator that yield pair-tuples, representing the diff. Items can be either - 186 | ('-', row) for items in table1 but not in table2. 187 | ('+', row) for items in table2 but not in table1. 188 | Where `row` is a tuple of values, corresponding to the diffed columns. 
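        Example (an illustrative sketch, not part of the original docstring; the connection URI,
        table names and key column are placeholders for any supported database):
            ::
                >>> from reladiff import connect
                >>> from reladiff.table_segment import TableSegment
                >>> from reladiff.hashdiff_tables import HashDiffer
                >>> db = connect("postgresql://user:password@localhost/postgres", 1)
                >>> t1 = TableSegment(db, ("public", "table_a"), ("id",))
                >>> t2 = TableSegment(db, ("public", "table_b"), ("id",))
                >>> with HashDiffer().diff_tables(t1, t2) as diff:
                ...     for sign, row in diff:
                ...         print(sign, row)  # ('-', ...) = only in table_a; ('+', ...) = only in table_b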
189 | """ 190 | if info_tree is None: 191 | info_tree = InfoTree(SegmentInfo([table1, table2])) 192 | ti = ThreadedYielder(self.max_threadpool_size) 193 | return DiffResultWrapper(self._diff_tables_wrapper(table1, table2, info_tree, ti), info_tree, self.stats, ti) 194 | 195 | def _diff_tables_wrapper( 196 | self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree, ti: ThreadedYielder 197 | ) -> DiffResult: 198 | try: 199 | # Query and validate schema 200 | table1, table2 = self._threaded_call( 201 | "with_schema", [table1, table2], allow_empty_table=self.allow_empty_tables 202 | ) 203 | self._validate_and_adjust_columns(table1, table2) 204 | 205 | yield from self._diff_tables_root(table1, table2, info_tree, ti) 206 | finally: 207 | info_tree.aggregate_info() 208 | 209 | def _validate_and_adjust_columns(self, table1: TableSegment, table2: TableSegment) -> DiffResult: 210 | pass 211 | 212 | def _diff_tables_root( 213 | self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree, ti: ThreadedYielder 214 | ) -> DiffResult: 215 | return self._bisect_and_diff_tables(table1, table2, info_tree, ti) 216 | 217 | @abstractmethod 218 | def _diff_segments( 219 | self, 220 | ti: ThreadedYielder, 221 | table1: TableSegment, 222 | table2: TableSegment, 223 | info_tree: InfoTree, 224 | max_rows: int, 225 | level=0, 226 | segment_index=None, 227 | segment_count=None, 228 | ): ... 229 | 230 | def _bisect_and_diff_tables( 231 | self, table1: TableSegment, table2: TableSegment, info_tree: InfoTree, ti: ThreadedYielder 232 | ): 233 | if len(table1.key_columns) != len(table2.key_columns): 234 | raise ValueError("Tables should have an equivalent number of key columns!") 235 | 236 | key_types1 = table1.key_types 237 | key_types2 = table2.key_types 238 | is_empty1 = isinstance(table1, EmptyTableSegment) 239 | is_empty2 = isinstance(table2, EmptyTableSegment) 240 | 241 | for kt in ([] if is_empty1 else key_types1) + ([] if is_empty2 else key_types2): 242 | if not isinstance(kt, IKey): 243 | raise NotImplementedError(f"Cannot use a column of type {kt} as a key") 244 | 245 | if not (is_empty1 or is_empty2): 246 | for kt1, kt2 in safezip(key_types1, key_types2): 247 | if kt1.python_type is not kt2.python_type: 248 | raise TypeError(f"Incompatible key types: {kt1} and {kt2}") 249 | 250 | # Query min/max values 251 | key_ranges = self._threaded_call_as_completed("query_key_range", [table1, table2]) 252 | 253 | # Start with the first completed value, so we don't waste time waiting 254 | try: 255 | min_key1, max_key1 = self._parse_key_range_result(key_types1, next(key_ranges)) 256 | except EmptyTable: 257 | if not self.allow_empty_tables: 258 | raise 259 | try: 260 | min_key1, max_key1 = self._parse_key_range_result(key_types2, next(key_ranges)) 261 | except EmptyTable: 262 | # Both tables are empty 263 | info_tree.info.set_diff([]) 264 | info_tree.info.max_rows = 0 265 | info_tree.info.rowcounts = {1: 0, 2: 0} 266 | return [] 267 | 268 | btable1, btable2 = [t.new_key_bounds(min_key=min_key1, max_key=max_key1) for t in (table1, table2)] 269 | 270 | logger.info( 271 | f"Diffing segments at key-range: {min_key1}..{max_key1}. " 272 | f"size: table1 <= {btable1.approximate_size()}, table2 <= {btable2.approximate_size()}" 273 | ) 274 | 275 | # Bisect (split) the table into segments, and diff them recursively. 276 | ti.submit(self._bisect_and_diff_segments, ti, btable1, btable2, info_tree) 277 | 278 | # Now we check for the second min-max, to diff the portions we "missed". 
279 | # This is achieved by subtracting the table ranges, and dividing the resulting space into aligned boxes. 280 | # For example, given tables A & B, and a 2D compound key, where A was queried first for key-range, 281 | # the regions of B we need to diff in this second pass are marked by B1..8: 282 | # ┌──┬──────┬──┐ 283 | # │B1│ B2 │B3│ 284 | # ├──┼──────┼──┤ 285 | # │B4│ A │B5│ 286 | # ├──┼──────┼──┤ 287 | # │B6│ B7 │B8│ 288 | # └──┴──────┴──┘ 289 | # Overall, the max number of new regions in this 2nd pass is 3^|k| - 1 290 | 291 | try: 292 | min_key2, max_key2 = self._parse_key_range_result(key_types1, next(key_ranges)) 293 | except StopIteration: # First table is empty 294 | return ti 295 | except EmptyTable: # Second table is empty 296 | if not self.allow_empty_tables: 297 | raise 298 | return ti 299 | 300 | points = [list(sorted(p)) for p in safezip(min_key1, min_key2, max_key1, max_key2)] 301 | box_mesh = create_mesh_from_points(*points) 302 | 303 | new_regions = [(p1, p2) for p1, p2 in box_mesh if p1 < p2 and not (p1 >= min_key1 and p2 <= max_key1)] 304 | 305 | for p1, p2 in new_regions: 306 | extra_tables = [t.new_key_bounds(min_key=p1, max_key=p2) for t in (table1, table2)] 307 | ti.submit(self._bisect_and_diff_segments, ti, *extra_tables, info_tree) 308 | 309 | return ti 310 | 311 | def _parse_key_range_result(self, key_types, key_range) -> Tuple[Vector, Vector]: 312 | if isinstance(key_range, Exception): 313 | raise key_range 314 | 315 | min_key_values, max_key_values = key_range 316 | 317 | # We add 1 because our ranges are exclusive of the end (like in Python) 318 | try: 319 | min_key = Vector(key_type.make_value(mn) for key_type, mn in safezip(key_types, min_key_values)) 320 | max_key = Vector(key_type.make_value(mx) + 1 for key_type, mx in safezip(key_types, max_key_values)) 321 | except (TypeError, ValueError) as e: 322 | raise type(e)(f"Cannot apply {key_types} to '{min_key_values}', '{max_key_values}'.") from e 323 | 324 | return min_key, max_key 325 | 326 | def _bisect_and_diff_segments( 327 | self, 328 | ti: ThreadedYielder, 329 | table1: TableSegment, 330 | table2: TableSegment, 331 | info_tree: InfoTree, 332 | level=0, 333 | max_rows=None, 334 | ): 335 | assert table1.is_bounded and table2.is_bounded 336 | 337 | # Choose evenly spaced checkpoints (according to min_key and max_key) 338 | biggest_table = max(table1, table2, key=methodcaller("approximate_size")) 339 | checkpoints = biggest_table.choose_checkpoints(self.bisection_factor - 1) 340 | 341 | # Create new instances of TableSegment between each checkpoint 342 | segmented1 = table1.segment_by_checkpoints(checkpoints) 343 | segmented2 = table2.segment_by_checkpoints(checkpoints) 344 | 345 | # Recursively compare each pair of corresponding segments between table1 and table2 346 | for i, (t1, t2) in enumerate(safezip(segmented1, segmented2)): 347 | info_node = info_tree.add_node(t1, t2, max_rows=max_rows) 348 | ti.submit( 349 | self._diff_segments, ti, t1, t2, info_node, max_rows, level + 1, i + 1, len(segmented1), priority=level 350 | ) 351 | -------------------------------------------------------------------------------- /reladiff/hashdiff_tables.py: -------------------------------------------------------------------------------- 1 | import os 2 | from numbers import Number 3 | import logging 4 | from typing import Iterator 5 | from operator import attrgetter 6 | from collections import Counter 7 | from itertools import chain 8 | 9 | from dataclasses import dataclass, field 10 | 11 | # from runtype import 
dataclass # TODO fix in runtype 12 | 13 | from sqeleton.abcs import ColType_UUID, NumericType, PrecisionType, StringType, Boolean 14 | 15 | from .info_tree import InfoTree 16 | from .utils import safezip 17 | from .thread_utils import ThreadedYielder 18 | from .table_segment import TableSegment, EmptyTableSegment 19 | 20 | from .diff_tables import TableDiffer 21 | 22 | BENCHMARK = os.environ.get("BENCHMARK", False) 23 | 24 | DEFAULT_BISECTION_THRESHOLD = 1024 * 16 25 | DEFAULT_BISECTION_FACTOR = 32 26 | 27 | logger = logging.getLogger("hashdiff_tables") 28 | 29 | 30 | def diff_sets(a: list, b: list, skip_sort_results: bool, duplicate_rows_support: bool) -> Iterator: 31 | if duplicate_rows_support: 32 | c = Counter(b) 33 | c.subtract(a) 34 | diff = (("+", k) if count > 0 else ("-", k) for k, count in c.items() for _ in range(abs(count))) 35 | else: 36 | sa = set(a) 37 | sb = set(b) 38 | diff = chain((("-", x) for x in sa - sb), (("+", x) for x in sb - sa)) 39 | 40 | return diff if skip_sort_results else sorted(diff, key=lambda i: i[1]) # sort by key 41 | 42 | 43 | @dataclass(frozen=True) 44 | class HashDiffer(TableDiffer): 45 | """Finds the diff between two SQL tables 46 | 47 | The algorithm uses hashing to quickly check if the tables are different, and then applies a 48 | bisection search recursively to find the differences efficiently. 49 | 50 | Works best for comparing tables that are mostly the same, with minor discrepancies. 51 | 52 | Parameters: 53 | bisection_factor (int): Into how many segments to bisect per iteration. 54 | bisection_threshold (Number): When should we stop bisecting and compare locally (in row count). 55 | threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads. 56 | max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto. 57 | Only relevant when `threaded` is ``True``. 58 | There may be many pools, so number of actual threads can be a lot higher. 59 | skip_sort_results (bool): Skip sorting the hashdiff output by key for better performance. 60 | Entries with the same key but different column values may not appear adjacent in the output. 61 | duplicate_rows_support (bool): If ``True``, the algorithm will support duplicate rows in the tables. 62 | """ 63 | 64 | bisection_factor: int = DEFAULT_BISECTION_FACTOR 65 | bisection_threshold: Number = DEFAULT_BISECTION_THRESHOLD # Accepts inf for tests 66 | skip_sort_results: bool = False 67 | duplicate_rows_support: bool = True 68 | 69 | stats: dict = field(default_factory=dict) 70 | 71 | def __post_init__(self): 72 | # Validate options 73 | if self.bisection_factor >= self.bisection_threshold: 74 | raise ValueError("Incorrect param values (bisection factor must be lower than threshold)") 75 | if self.bisection_factor < 2: 76 | raise ValueError("Must have at least two segments per iteration (i.e. 
bisection_factor >= 2)") 77 | 78 | def _validate_and_adjust_columns(self, table1, table2): 79 | if isinstance(table1, EmptyTableSegment) or isinstance(table2, EmptyTableSegment): 80 | # Skip all logic; it only pertains to column mismatch 81 | return 82 | 83 | for c1, c2 in safezip(table1.relevant_columns, table2.relevant_columns): 84 | if c1 not in table1._schema: 85 | raise ValueError(f"Column '{c1}' not found in schema for table {table1}") 86 | if c2 not in table2._schema: 87 | raise ValueError(f"Column '{c2}' not found in schema for table {table2}") 88 | 89 | # Update schemas to minimal mutual precision 90 | col1 = table1._schema[c1] 91 | col2 = table2._schema[c2] 92 | if isinstance(col1, PrecisionType): 93 | if not isinstance(col2, PrecisionType): 94 | raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}") 95 | 96 | lowest = min(col1, col2, key=attrgetter("precision")) 97 | 98 | if col1.precision != col2.precision: 99 | logger.warning(f"Using reduced precision {lowest} for column '{c1}'. Types={col1}, {col2}") 100 | 101 | table1._schema[c1] = col1.replace(precision=lowest.precision, rounds=lowest.rounds) 102 | table2._schema[c2] = col2.replace(precision=lowest.precision, rounds=lowest.rounds) 103 | 104 | elif isinstance(col1, (NumericType, Boolean)): 105 | if not isinstance(col2, (NumericType, Boolean)): 106 | raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}") 107 | 108 | lowest = min(col1, col2, key=attrgetter("precision")) 109 | 110 | if col1.precision != col2.precision: 111 | logger.warning(f"Using reduced precision {lowest} for column '{c1}'. Types={col1}, {col2}") 112 | 113 | if lowest.precision != col1.precision: 114 | table1._schema[c1] = col1.replace(precision=lowest.precision) 115 | if lowest.precision != col2.precision: 116 | table2._schema[c2] = col2.replace(precision=lowest.precision) 117 | 118 | elif isinstance(col1, ColType_UUID): 119 | if not isinstance(col2, ColType_UUID): 120 | raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}") 121 | 122 | elif isinstance(col1, StringType): 123 | if not isinstance(col2, StringType): 124 | raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}") 125 | 126 | for t in [table1, table2]: 127 | for c in t.relevant_columns: 128 | ctype = t._schema[c] 129 | if not ctype.supported: 130 | logger.warning( 131 | f"[{t.database.name}] Column '{c}' of type '{ctype}' has no compatibility handling. " 132 | "If encoding/formatting differs between databases, it may result in false positives." 133 | ) 134 | 135 | def _diff_segments( 136 | self, 137 | ti: ThreadedYielder, 138 | table1: TableSegment, 139 | table2: TableSegment, 140 | info_tree: InfoTree, 141 | max_rows: int, 142 | level=0, 143 | segment_index=None, 144 | segment_count=None, 145 | ): 146 | logger.info( 147 | ". " * level + f"Diffing segment {segment_index}/{segment_count}, " 148 | f"key-range: {table1.min_key}..{table2.max_key}, " 149 | f"size <= {max_rows}" 150 | ) 151 | 152 | # When benchmarking, we want the ability to skip checksumming. This 153 | # allows us to download all rows for comparison in performance. By 154 | # default, reladiff will checksum the section first (when it's below 155 | # the threshold) and _then_ download it. 
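        # (Illustrative note, not in the original source: benchmark mode is enabled through the
        #  BENCHMARK environment variable read at the top of this module -- e.g. `BENCHMARK=1`.
        #  tests/common.py reads the same variable.)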
156 | if BENCHMARK: 157 | if max_rows < self.bisection_threshold: 158 | return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max_rows) 159 | 160 | if isinstance(table1, EmptyTableSegment) or isinstance(table1, EmptyTableSegment): 161 | # Optimization: No need to checksum if one of the tables is empty 162 | count1, count2 = self._threaded_call("count", [table1, table2]) 163 | checksum1 = checksum2 = None 164 | else: 165 | (count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2]) 166 | 167 | assert not info_tree.info.rowcounts 168 | info_tree.info.rowcounts = {1: count1, 2: count2} 169 | 170 | if count1 == 0 and count2 == 0: 171 | logger.debug( 172 | "Uneven distribution of keys detected in segment %s..%s (big gaps in the key column). " 173 | "For better performance, we recommend to increase the bisection-threshold.", 174 | table1.min_key, 175 | table1.max_key, 176 | ) 177 | assert checksum1 is None and checksum2 is None 178 | info_tree.info.is_diff = False 179 | return 180 | 181 | if checksum1 == checksum2 and count1 == count2: 182 | info_tree.info.is_diff = False 183 | return 184 | 185 | info_tree.info.is_diff = True 186 | return self._bisect_and_diff_segments(ti, table1, table2, info_tree, level=level, max_rows=max(count1, count2)) 187 | 188 | def _bisect_and_diff_segments( 189 | self, 190 | ti: ThreadedYielder, 191 | table1: TableSegment, 192 | table2: TableSegment, 193 | info_tree: InfoTree, 194 | level=0, 195 | max_rows=None, 196 | ): 197 | assert table1.is_bounded and table2.is_bounded 198 | 199 | max_space_size = max(table1.approximate_size(), table2.approximate_size()) 200 | if max_rows is None: 201 | # We can be sure that row_count <= max_rows iff the table key is unique 202 | max_rows = max_space_size 203 | info_tree.info.max_rows = max_rows 204 | 205 | # If count is below the threshold, just download and compare the columns locally 206 | # This saves time, as bisection speed is limited by ping and query performance. 207 | if max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2: 208 | rows1, rows2 = self._threaded_call("get_values", [table1, table2]) 209 | diff = list(diff_sets(rows1, rows2, self.skip_sort_results, self.duplicate_rows_support)) 210 | 211 | info_tree.info.set_diff(diff) 212 | info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)} 213 | 214 | logger.info(". 
" * level + f"Diff found {len(diff)} different rows.") 215 | self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2)) 216 | return diff 217 | 218 | return super()._bisect_and_diff_segments(ti, table1, table2, info_tree, level, max_rows) 219 | -------------------------------------------------------------------------------- /reladiff/info_tree.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Union 2 | 3 | from runtype import dataclass 4 | 5 | from .table_segment import TableSegment, EmptyTableSegment 6 | 7 | 8 | @dataclass(frozen=False) 9 | class SegmentInfo: 10 | tables: List[Union[TableSegment, EmptyTableSegment]] 11 | 12 | diff: list = None 13 | is_diff: bool = None 14 | diff_count: int = None 15 | 16 | rowcounts: Dict[int, int] = {} 17 | max_rows: int = None 18 | 19 | def set_diff(self, diff: list): 20 | self.diff = diff 21 | self.diff_count = len(diff) 22 | self.is_diff = self.diff_count > 0 23 | 24 | def update_from_children(self, child_infos): 25 | child_infos = list(child_infos) 26 | assert child_infos 27 | 28 | # self.diff = list(chain(*[c.diff for c in child_infos])) 29 | self.diff_count = sum(c.diff_count for c in child_infos if c.diff_count is not None) 30 | self.is_diff = any(c.is_diff for c in child_infos) 31 | 32 | self.rowcounts = { 33 | 1: sum(c.rowcounts[1] for c in child_infos if c.rowcounts), 34 | 2: sum(c.rowcounts[2] for c in child_infos if c.rowcounts), 35 | } 36 | 37 | 38 | @dataclass 39 | class InfoTree: 40 | info: SegmentInfo 41 | children: List["InfoTree"] = [] 42 | 43 | def add_node(self, table1: TableSegment, table2: TableSegment, max_rows: int = None): 44 | node = InfoTree(SegmentInfo([table1, table2], max_rows=max_rows)) 45 | self.children.append(node) 46 | return node 47 | 48 | def aggregate_info(self): 49 | if self.children: 50 | for c in self.children: 51 | c.aggregate_info() 52 | self.info.update_from_children(c.info for c in self.children) 53 | -------------------------------------------------------------------------------- /reladiff/parse_time.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime, timedelta 3 | from difflib import SequenceMatcher 4 | 5 | 6 | class ParseError(ValueError): 7 | pass 8 | 9 | 10 | TIME_UNITS = dict( 11 | seconds="seconds", 12 | minutes="minutes", 13 | hours="hours", 14 | days="days", 15 | weeks="weeks", 16 | months="months", 17 | years="years", 18 | # Shortcuts 19 | s="seconds", 20 | min="minutes", 21 | h="hours", 22 | d="days", 23 | w="weeks", 24 | mon="months", 25 | y="years", 26 | ) 27 | 28 | EXTRAPOLATED = {"months": (30, "days"), "years": (365, "days")} 29 | assert set(EXTRAPOLATED) <= set(TIME_UNITS) 30 | 31 | TIME_RE = re.compile(r"(\d+)([a-z]+)") 32 | 33 | UNITS_STR = ", ".join(sorted(TIME_UNITS.keys())) 34 | 35 | 36 | def string_similarity(a, b): 37 | return SequenceMatcher(None, a, b).ratio() 38 | 39 | 40 | def parse_time_atom(count, unit): 41 | count = int(count) 42 | try: 43 | unit = TIME_UNITS[unit] 44 | except KeyError: 45 | most_similar = max(TIME_UNITS, key=lambda k: string_similarity(k, unit)) 46 | raise ParseError( 47 | f"'{unit}' is not a recognized time unit. Did you mean '{most_similar}'?" 
f"\nSupported units: {UNITS_STR}" 48 | ) 49 | 50 | if unit in EXTRAPOLATED: 51 | mul, unit = EXTRAPOLATED[unit] 52 | count *= mul 53 | return count, unit 54 | 55 | 56 | def parse_time_delta(t: str): 57 | time_dict = {} 58 | while t: 59 | m = TIME_RE.match(t) 60 | if not m: 61 | raise ParseError(f"Cannot parse '{t}': Not a recognized time delta") 62 | count, unit = parse_time_atom(*m.groups()) 63 | if unit in time_dict: 64 | raise ParseError(f"Time unit {unit} specified more than once") 65 | time_dict[unit] = count 66 | t = t[m.end() :] 67 | 68 | if not time_dict: 69 | raise ParseError("No time difference specified") 70 | return timedelta(**time_dict) 71 | 72 | 73 | def parse_time_before(time: datetime, delta: str): 74 | return time - parse_time_delta(delta) 75 | -------------------------------------------------------------------------------- /reladiff/query_utils.py: -------------------------------------------------------------------------------- 1 | "Module for query utilities that didn't make it into the query-builder (yet)" 2 | 3 | from contextlib import suppress 4 | 5 | from sqeleton.databases import DbPath, QueryError, Oracle 6 | from sqeleton.queries import table, commit, Expr 7 | 8 | 9 | def _drop_table_oracle(name: DbPath): 10 | t = table(name) 11 | # Experience shows double drop is necessary 12 | with suppress(QueryError): 13 | yield t.drop() 14 | yield t.drop() 15 | yield commit 16 | 17 | 18 | def _drop_table(name: DbPath): 19 | t = table(name) 20 | yield t.drop(if_exists=True) 21 | yield commit 22 | 23 | 24 | def drop_table(db, tbl): 25 | if isinstance(db, Oracle): 26 | db.query(_drop_table_oracle(tbl)) 27 | else: 28 | db.query(_drop_table(tbl)) 29 | 30 | 31 | def _append_to_table_oracle(path: DbPath, expr: Expr): 32 | """See append_to_table""" 33 | assert expr.schema, expr 34 | t = table(path, schema=expr.schema) 35 | with suppress(QueryError): 36 | yield t.create() # uses expr.schema 37 | yield commit 38 | yield t.insert_expr(expr) 39 | yield commit 40 | 41 | 42 | def _append_to_table(path: DbPath, expr: Expr): 43 | """Append to table""" 44 | assert expr.schema, expr 45 | t = table(path, schema=expr.schema) 46 | yield t.create(if_not_exists=True) # uses expr.schema 47 | yield commit 48 | yield t.insert_expr(expr) 49 | yield commit 50 | 51 | 52 | def append_to_table(db, path, expr): 53 | f = _append_to_table_oracle if isinstance(db, Oracle) else _append_to_table 54 | db.query(f(path, expr)) 55 | -------------------------------------------------------------------------------- /reladiff/table_segment.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import List, Tuple 3 | import logging 4 | from itertools import product 5 | 6 | from runtype import dataclass 7 | 8 | from .utils import safezip, Vector 9 | from sqeleton.utils import ArithString, split_space 10 | from sqeleton.databases import Database, DbPath, DbKey, DbTime 11 | from sqeleton.abcs.database_types import String_UUID 12 | from sqeleton.schema import Schema, create_schema 13 | from sqeleton.queries import Count, Checksum, SKIP, table, this, Expr, min_, max_, Code 14 | from sqeleton.queries.extras import ApplyFuncAndNormalizeAsString, NormalizeAsString 15 | 16 | logger = logging.getLogger("table_segment") 17 | 18 | RECOMMENDED_CHECKSUM_DURATION = 20 19 | 20 | 21 | class EmptyTable(ValueError): 22 | pass 23 | 24 | 25 | def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]: 26 | assert min_key < max_key 27 | 28 | if max_key - 
min_key <= count: 29 | count = 1 30 | 31 | if isinstance(min_key, ArithString): 32 | assert type(min_key) is type(max_key) 33 | checkpoints = min_key.range(max_key, count) 34 | else: 35 | checkpoints = split_space(min_key, max_key, count) 36 | 37 | assert all(min_key < x < max_key for x in checkpoints) 38 | return [min_key] + checkpoints + [max_key] 39 | 40 | 41 | def int_product(nums: List[int]) -> int: 42 | p = 1 43 | for n in nums: 44 | p *= n 45 | return p 46 | 47 | 48 | def split_compound_key_space(mn: Vector, mx: Vector, count: int) -> List[List[DbKey]]: 49 | """Returns a list of split-points for each key dimension, essentially returning an N-dimensional grid of split points.""" 50 | return [split_key_space(mn_k, mx_k, count) for mn_k, mx_k in safezip(mn, mx)] 51 | 52 | 53 | def create_mesh_from_points(*values_per_dim: list) -> List[Tuple[Vector, Vector]]: 54 | """Given a list of values along each axis of N dimensional space, 55 | return an array of boxes whose start-points & end-points align with the given values, 56 | and together consitute a mesh filling that space entirely (within the bounds of the given values). 57 | 58 | Assumes given values are already ordered ascending. 59 | 60 | len(boxes) == ∏i( len(i)-1 ) 61 | 62 | Example: 63 | :: 64 | >>> d1 = 'a', 'b', 'c' 65 | >>> d2 = 1, 2, 3 66 | >>> d3 = 'X', 'Y' 67 | >>> create_mesh_from_points(d1, d2, d3) 68 | [ 69 | [('a', 1, 'X'), ('b', 2, 'Y')], 70 | [('a', 2, 'X'), ('b', 3, 'Y')], 71 | [('b', 1, 'X'), ('c', 2, 'Y')], 72 | [('b', 2, 'X'), ('c', 3, 'Y')] 73 | ] 74 | """ 75 | assert all(len(v) >= 2 for v in values_per_dim), values_per_dim 76 | 77 | # Create tuples of (v1, v2) for each pair of adjacent values 78 | ranges = [list(zip(values[:-1], values[1:])) for values in values_per_dim] 79 | 80 | assert all(a <= b for r in ranges for a, b in r) 81 | 82 | # Create a product of all the ranges 83 | res = [tuple(Vector(a) for a in safezip(*r)) for r in product(*ranges)] 84 | 85 | expected_len = int_product(len(v) - 1 for v in values_per_dim) 86 | assert len(res) == expected_len, (len(res), expected_len) 87 | return res 88 | 89 | 90 | @dataclass 91 | class TableSegment: 92 | """Signifies a segment of rows (and selected columns) within a table 93 | 94 | Parameters: 95 | database (Database): Database instance. See :meth:`connect` 96 | table_path (:data:`DbPath`): Path to table in form of a tuple. e.g. `('my_dataset', 'table_name')` 97 | key_columns (Tuple[str]): Name of the key column, which uniquely identifies each row (usually id) 98 | update_column (str, optional): Name of updated column, which signals that rows changed. 99 | Usually updated_at or last_update. Used by `min_update` and `max_update`. 100 | extra_columns (Tuple[str, ...], optional): Extra columns to compare 101 | min_key (:data:`Vector`, optional): Lowest key value, used to restrict the segment 102 | max_key (:data:`Vector`, optional): Highest key value, used to restrict the segment 103 | min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment 104 | max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment 105 | where (str, optional): An additional 'where' expression to restrict the search space. 106 | 107 | case_sensitive (bool): If false, the case of column names will adjust according to the schema. Default is true. 108 | 109 | """ 110 | 111 | # Location of table 112 | database: Database 113 | table_path: DbPath 114 | 115 | # Columns 116 | key_columns: Tuple[str, ...] 
117 | update_column: str = None 118 | extra_columns: Tuple[str, ...] = () 119 | 120 | # Restrict the segment 121 | min_key: Vector = None 122 | max_key: Vector = None 123 | min_update: DbTime = None 124 | max_update: DbTime = None 125 | where: str = None 126 | 127 | case_sensitive: bool = True 128 | _schema: Schema = None 129 | 130 | def __post_init__(self): 131 | if not self.update_column and (self.min_update or self.max_update): 132 | raise ValueError("Error: the min_update/max_update feature requires 'update_column' to be set.") 133 | 134 | if self.min_key is not None and self.max_key is not None and self.min_key >= self.max_key: 135 | raise ValueError(f"Error: min_key expected to be smaller than max_key! ({self.min_key} >= {self.max_key})") 136 | 137 | if self.min_update is not None and self.max_update is not None and self.min_update >= self.max_update: 138 | raise ValueError( 139 | f"Error: min_update expected to be smaller than max_update! ({self.min_update} >= {self.max_update})" 140 | ) 141 | 142 | def _where(self): 143 | return f"({self.where})" if self.where else None 144 | 145 | def _with_raw_schema(self, raw_schema: dict, refine: bool = True, allow_empty_table=False) -> "TableSegment": 146 | # TODO validate all relevant columns are in the schema? 147 | cols = {c.lower() for c in self.relevant_columns} 148 | # We use v[0] to get the actual name (with correct case) 149 | raw_schema = {v[0]: v for k, v in raw_schema.items() if k.lower() in cols} 150 | schema, samples = self.database.process_query_table_schema( 151 | self.table_path, raw_schema, refine=refine, refine_where=self._where() 152 | ) 153 | assert refine or samples is None 154 | is_empty_table = samples is not None and not samples 155 | if is_empty_table and not allow_empty_table: 156 | raise EmptyTable(f"Table {self.table_path} is empty. Use --allow-empty-tables to disable this protection.", self) 157 | 158 | res = self.new(_schema=create_schema(self.database, self.table_path, schema, self.case_sensitive)) 159 | 160 | return EmptyTableSegment(res) if is_empty_table else res 161 | 162 | def with_schema(self, refine: bool = True, allow_empty_table: bool = False) -> "TableSegment": 163 | "Queries the table schema from the database, and returns a new instance of TableSegment, with a schema." 164 | if self._schema: 165 | return self 166 | 167 | return self._with_raw_schema( 168 | self.database.query_table_schema(self.table_path), refine=refine, allow_empty_table=allow_empty_table 169 | ) 170 | 171 | def _cast_col_value(self, col, value): 172 | """Cast the value to the right type, based on the type of the column 173 | 174 | Currently only used to support native vs string UUID values. 
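        For example (an illustrative note, not in the original docstring): a ``uuid.UUID`` key value
        is converted with ``str()`` when the column is a ``String_UUID``, so that the generated
        key-range predicates compare strings against the string-typed column.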
175 | """ 176 | assert self._schema 177 | t = self._schema[col] 178 | if isinstance(t, String_UUID): 179 | return str(value) 180 | return value 181 | 182 | def _make_key_range(self): 183 | if self.min_key is not None: 184 | for mn, k in safezip(self.min_key, self.key_columns): 185 | mn = self._cast_col_value(k, mn) 186 | yield mn <= this[k] 187 | if self.max_key is not None: 188 | for k, mx in safezip(self.key_columns, self.max_key): 189 | mx = self._cast_col_value(k, mx) 190 | yield this[k] < mx 191 | 192 | def _make_update_range(self): 193 | if self.min_update is not None: 194 | yield self.min_update <= this[self.update_column] 195 | if self.max_update is not None: 196 | yield this[self.update_column] < self.max_update 197 | 198 | @property 199 | def source_table(self): 200 | return table(*self.table_path, schema=self._schema) 201 | 202 | def make_select(self): 203 | return self.source_table.where( 204 | *self._make_key_range(), *self._make_update_range(), Code(self._where()) if self.where else SKIP 205 | ) 206 | 207 | def get_values(self) -> list: 208 | "Download all the relevant values of the segment from the database" 209 | select = self.make_select().select(*self._relevant_columns_repr) 210 | return self.database.query(select, List[Tuple]) 211 | 212 | def choose_checkpoints(self, count: int) -> List[List[DbKey]]: 213 | "Suggests a bunch of evenly-spaced checkpoints to split by, including start, end." 214 | 215 | assert self.is_bounded 216 | 217 | # Take Nth root of count, to approximate the appropriate box size 218 | count = int(count ** (1 / len(self.key_columns))) or 1 219 | 220 | return split_compound_key_space(self.min_key, self.max_key, count) 221 | 222 | def segment_by_checkpoints(self, checkpoints: List[List[DbKey]]) -> List["TableSegment"]: 223 | "Split the current TableSegment to a bunch of smaller ones, separated by the given checkpoints" 224 | 225 | return [self.new_key_bounds(min_key=s, max_key=e) for s, e in create_mesh_from_points(*checkpoints)] 226 | 227 | def new(self, **kwargs) -> "TableSegment": 228 | """Creates a copy of the instance using 'replace()'""" 229 | return self.replace(**kwargs) 230 | 231 | def new_key_bounds(self, min_key: Vector, max_key: Vector) -> "TableSegment": 232 | if self.min_key is not None: 233 | assert self.min_key <= min_key, (self.min_key, min_key) 234 | assert self.min_key < max_key 235 | 236 | if self.max_key is not None: 237 | assert min_key < self.max_key 238 | assert max_key <= self.max_key 239 | 240 | return self.replace(min_key=min_key, max_key=max_key) 241 | 242 | @property 243 | def relevant_columns(self) -> List[str]: 244 | extras = list(self.extra_columns) 245 | 246 | if self.update_column and self.update_column not in extras: 247 | extras = [self.update_column] + extras 248 | 249 | return list(self.key_columns) + extras 250 | 251 | @property 252 | def _relevant_columns_repr(self) -> List[Expr]: 253 | return [NormalizeAsString(this[c]) for c in self.relevant_columns] 254 | 255 | def count(self) -> int: 256 | """Count how many rows are in the segment, in one pass.""" 257 | return self.database.query(self.make_select().select(Count()), int) 258 | 259 | def count_and_checksum(self) -> Tuple[int, int]: 260 | """Count and checksum the rows in the segment, in one pass.""" 261 | start = time.monotonic() 262 | q = self.make_select().select(Count(), Checksum(self._relevant_columns_repr)) 263 | count, checksum = self.database.query(q, tuple) 264 | duration = time.monotonic() - start 265 | if duration > RECOMMENDED_CHECKSUM_DURATION: 266 | 
logger.warning( 267 | "Checksum is taking longer than expected (%.2f). " 268 | "We recommend increasing --bisection-factor or decreasing --threads.", 269 | duration, 270 | ) 271 | 272 | if count: 273 | assert checksum, (count, checksum) 274 | return count or 0, int(checksum) if count else None 275 | 276 | def query_key_range(self) -> Tuple[tuple, tuple]: 277 | """Query database for minimum and maximum key. This is used for setting the initial bounds.""" 278 | # Normalizes the result (needed for UUIDs) after the min/max computation 279 | select = self.make_select().select( 280 | ApplyFuncAndNormalizeAsString(this[k], f) for k in self.key_columns for f in (min_, max_) 281 | ) 282 | result = tuple(self.database.query(select, tuple)) 283 | 284 | if any(i is None for i in result): 285 | # We return EmptyTable instead of raising it, so that we can consume 286 | # the key_ranges as an iterator. 287 | # _parse_key_range_result() will raise the error we return. 288 | return EmptyTable(f"Table {self.table_path} appears to be empty.", self) 289 | 290 | # Min/max keys are interleaved 291 | min_key, max_key = result[::2], result[1::2] 292 | assert len(min_key) == len(max_key) 293 | 294 | return min_key, max_key 295 | 296 | @property 297 | def is_bounded(self): 298 | return self.min_key is not None and self.max_key is not None 299 | 300 | def approximate_size(self): 301 | if not self.is_bounded: 302 | raise RuntimeError("Cannot approximate the size of an unbounded segment. Must have min_key and max_key.") 303 | diff = self.max_key - self.min_key 304 | assert all(d > 0 for d in diff) 305 | return int_product(diff) 306 | 307 | @property 308 | def key_types(self): 309 | return [self._schema[i] for i in self.key_columns] 310 | 311 | 312 | @dataclass 313 | class EmptyTableSegment: 314 | _table_segment: TableSegment 315 | 316 | def approximate_size(self): 317 | return 0 318 | 319 | @property 320 | def is_bounded(self): 321 | return True 322 | 323 | def query_key_range(self) -> Tuple[tuple, tuple]: 324 | return EmptyTable() 325 | 326 | def count(self) -> int: 327 | return 0 328 | 329 | def count_and_checksum(self) -> Tuple[int, int]: 330 | return (0, None) 331 | 332 | def __getattr__(self, attr): 333 | assert attr in ("database", "key_columns", "key_types", "relevant_columns", "_schema") 334 | return getattr(self._table_segment, attr) 335 | 336 | @property 337 | def min_key(self): 338 | return None 339 | 340 | @property 341 | def max_key(self): 342 | return None 343 | 344 | def with_schema(self, refine: bool = True, allow_empty_table: bool = False) -> "TableSegment": 345 | assert self._table_segment._schema 346 | return self 347 | 348 | def new_key_bounds(self, min_key: Vector, max_key: Vector) -> "TableSegment": 349 | return self 350 | 351 | def segment_by_checkpoints(self, checkpoints: List[List[DbKey]]) -> List["TableSegment"]: 352 | "Split the current TableSegment to a bunch of smaller ones, separated by the given checkpoints" 353 | mesh = create_mesh_from_points(*checkpoints) 354 | return [self for s, e in mesh] 355 | 356 | def make_select(self): 357 | # XXX shouldn't be called 358 | return self._table_segment.make_select() 359 | 360 | def get_values(self) -> list: 361 | return [] 362 | -------------------------------------------------------------------------------- /reladiff/thread_utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from queue import PriorityQueue 3 | from collections import deque 4 | from collections.abc import Iterable 5 | 
from concurrent.futures import ThreadPoolExecutor 6 | from concurrent.futures.thread import _WorkItem 7 | from time import sleep 8 | from typing import Callable, Iterator, Optional 9 | 10 | 11 | class AutoPriorityQueue(PriorityQueue): 12 | """Overrides PriorityQueue to automatically get the priority from _WorkItem.kwargs 13 | 14 | We also assign a unique id for each item, to avoid making comparisons on _WorkItem. 15 | As a side effect, items with the same priority are returned FIFO. 16 | """ 17 | 18 | _counter = itertools.count().__next__ 19 | 20 | def put(self, item: Optional[_WorkItem], block=True, timeout=None): 21 | priority = item.kwargs.pop("priority") if item is not None else 0 22 | super().put((-priority, self._counter(), item), block, timeout) 23 | 24 | def get(self, block=True, timeout=None) -> Optional[_WorkItem]: 25 | _p, _c, work_item = super().get(block, timeout) 26 | return work_item 27 | 28 | 29 | class PriorityThreadPoolExecutor(ThreadPoolExecutor): 30 | """Overrides ThreadPoolExecutor to use AutoPriorityQueue 31 | 32 | XXX WARNING: Might break in future versions of Python 33 | """ 34 | 35 | def __init__(self, *args): 36 | super().__init__(*args) 37 | 38 | self._work_queue = AutoPriorityQueue() 39 | 40 | 41 | class ThreadedYielder(Iterable): 42 | """Yields results from multiple threads into a single iterator, ordered by priority. 43 | 44 | To add a source iterator, call ``submit()`` with a function that returns an iterator. 45 | Priority for the iterator can be provided via the keyword argument 'priority'. (higher runs first) 46 | 47 | max_workers set the maximum number of worker threads 48 | yield_buffer_size sets the size of the "lookahead" buffer for the yielder. Default=1. 49 | For lazy computation, set this to 1. Set this to a higher value to reduce latency. 50 | Set to 0 for unlimited size. 
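    Example (an illustrative sketch, not part of the original docstring):
        ::
            >>> ty = ThreadedYielder(max_workers=2)
            >>> ty.submit(lambda: [3, 4], priority=0)
            >>> ty.submit(lambda: [1, 2], priority=1)  # higher priority is scheduled first
            >>> sorted(ty)
            [1, 2, 3, 4]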
51 | """ 52 | 53 | def __init__(self, max_workers: Optional[int] = None, yield_buffer_size: int = 1): 54 | self._pool = PriorityThreadPoolExecutor(max_workers) 55 | self._futures = deque() 56 | self._yield = deque() 57 | self._exception = None 58 | self._yield_buffer_size = yield_buffer_size 59 | 60 | def _worker(self, fn, *args, _priority=0, **kwargs): 61 | if self._yield_buffer_size and len(self._yield) >= self._yield_buffer_size: 62 | self._idle() 63 | self._futures.append(self._pool.submit(self._worker, fn, *args, priority=_priority, _priority=_priority, **kwargs)) 64 | return 65 | 66 | try: 67 | res = fn(*args, **kwargs) 68 | if res is not None: 69 | self._yield.append(res) 70 | except Exception as e: 71 | self._exception = e 72 | 73 | def submit(self, fn: Callable, *args, priority: int = 0, **kwargs): 74 | self._futures.append(self._pool.submit(self._worker, fn, *args, priority=priority, _priority=priority, **kwargs)) 75 | 76 | def shutdown(self, wait=True): 77 | try: 78 | # Python 3.9+ 79 | self._pool.shutdown(wait, cancel_futures=True) 80 | except TypeError: 81 | # Python 3.8 doesn't support cancel_futures 82 | self._pool.shutdown(wait) 83 | 84 | def _idle(self): 85 | if self._exception: 86 | raise self._exception 87 | 88 | while self._futures and self._futures[0].done(): 89 | self._futures.popleft() 90 | 91 | if not self._futures: 92 | # No more tasks 93 | return True 94 | 95 | sleep(0.001) 96 | 97 | def __iter__(self) -> Iterator: 98 | if self._exception: 99 | raise self._exception 100 | 101 | while True: 102 | while self._yield: 103 | yield from self._yield.popleft() 104 | 105 | if self._idle(): 106 | break 107 | 108 | -------------------------------------------------------------------------------- /reladiff/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from typing import Iterable, Sequence 4 | from urllib.parse import urlparse 5 | import operator 6 | import threading 7 | from datetime import datetime 8 | 9 | 10 | def safezip(*args): 11 | "zip but makes sure all sequences are the same length" 12 | lens = list(map(len, args)) 13 | if len(set(lens)) != 1: 14 | raise ValueError(f"Mismatching lengths in arguments to safezip: {lens}") 15 | return zip(*args) 16 | 17 | 18 | def _join_if_any(sym, args): 19 | args = list(args) 20 | if not args: 21 | return "" 22 | return sym.join(str(a) for a in args if a) 23 | 24 | 25 | def remove_password_from_url(url: str, replace_with: str = "***") -> str: 26 | parsed = urlparse(url) 27 | account = parsed.username or "" 28 | if parsed.password: 29 | account += ":" + replace_with 30 | host = _join_if_any(":", filter(None, [parsed.hostname, parsed.port])) 31 | netloc = _join_if_any("@", filter(None, [account, host])) 32 | replaced = parsed._replace(netloc=netloc) 33 | return replaced.geturl() 34 | 35 | 36 | def match_like(pattern: str, strs: Sequence[str]) -> Iterable[str]: 37 | reo = re.compile(pattern.replace("%", ".*").replace("?", ".") + "$") 38 | for s in strs: 39 | if reo.match(s): 40 | yield s 41 | 42 | 43 | def accumulate(iterable, func=operator.add, *, initial=None): 44 | "Return running totals" 45 | # Taken from https://docs.python.org/3/library/itertools.html#itertools.accumulate, to backport 'initial' to 3.7 46 | it = iter(iterable) 47 | total = initial 48 | if initial is None: 49 | try: 50 | total = next(it) 51 | except StopIteration: 52 | return 53 | yield total 54 | for element in it: 55 | total = func(total, element) 56 | yield total 57 | 58 | 59 | def 
run_as_daemon(threadfunc, *args): 60 | th = threading.Thread(target=threadfunc, args=args) 61 | th.daemon = True 62 | th.start() 63 | return th 64 | 65 | 66 | def getLogger(name): 67 | return logging.getLogger(name.rsplit(".", 1)[-1]) 68 | 69 | 70 | def eval_name_template(name): 71 | def get_timestamp(_match): 72 | return datetime.now().isoformat("_", "seconds").replace(":", "_") 73 | 74 | return re.sub("%t", get_timestamp, name) 75 | 76 | 77 | def truncate_error(error: str): 78 | first_line = error.split("\n", 1)[0] 79 | return re.sub("'(.*?)'", "'***'", first_line) 80 | 81 | 82 | class Vector(tuple): 83 | 84 | """Immutable implementation of a regular vector over any arithmetic value 85 | 86 | Implements a product order - https://en.wikipedia.org/wiki/Product_order 87 | 88 | Partial implementation: Only the needed functionality is implemented 89 | """ 90 | 91 | def __lt__(self, other: "Vector"): 92 | if isinstance(other, Vector): 93 | return all(a < b for a, b in safezip(self, other)) 94 | return NotImplemented 95 | 96 | def __le__(self, other: "Vector"): 97 | if isinstance(other, Vector): 98 | return all(a <= b for a, b in safezip(self, other)) 99 | return NotImplemented 100 | 101 | def __gt__(self, other: "Vector"): 102 | if isinstance(other, Vector): 103 | return all(a > b for a, b in safezip(self, other)) 104 | return NotImplemented 105 | 106 | def __ge__(self, other: "Vector"): 107 | if isinstance(other, Vector): 108 | return all(a >= b for a, b in safezip(self, other)) 109 | return NotImplemented 110 | 111 | def __eq__(self, other: "Vector"): 112 | if isinstance(other, Vector): 113 | return all(a == b for a, b in safezip(self, other)) 114 | return NotImplemented 115 | 116 | def __sub__(self, other: "Vector"): 117 | if isinstance(other, Vector): 118 | return Vector((a - b) for a, b in safezip(self, other)) 119 | raise NotImplementedError() 120 | 121 | def __repr__(self) -> str: 122 | return "(%s)" % ", ".join(str(k) for k in self) 123 | -------------------------------------------------------------------------------- /reladiff_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erezsh/reladiff/f613504212d1f0a40e650238e25b19810f523825/tests/__init__.py -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import string 4 | import random 5 | from typing import Callable 6 | import unittest 7 | import logging 8 | import subprocess 9 | 10 | from parameterized import parameterized_class 11 | 12 | from sqeleton.queries import table 13 | from sqeleton.databases import Database 14 | 15 | from reladiff import databases as db 16 | from reladiff import connect 17 | from reladiff.table_segment import TableSegment 18 | from reladiff.query_utils import drop_table 19 | 20 | # We write 'or None' because Github sometimes creates empty env vars for secrets 21 | TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql" 22 | TEST_POSTGRESQL_CONN_STRING: str = "postgresql://postgres:Password1@localhost/postgres" 23 | TEST_SNOWFLAKE_CONN_STRING: str = os.environ.get("SNOWFLAKE_URI") or None 24 | TEST_PRESTO_CONN_STRING: str = os.environ.get("PRESTO_URI") 
or None 25 | TEST_BIGQUERY_CONN_STRING: str = os.environ.get("BIGQUERY_URI") or None 26 | TEST_REDSHIFT_CONN_STRING: str = os.environ.get("REDSHIFT_URI") or None 27 | TEST_ORACLE_CONN_STRING: str = None 28 | TEST_DATABRICKS_CONN_STRING: str = os.environ.get("DATABRICKS_URI") 29 | TEST_TRINO_CONN_STRING: str = os.environ.get("TRINO_URI") or None 30 | # clickhouse uri for provided docker - "clickhouse://clickhouse:Password1@localhost:9000/clickhouse" 31 | TEST_CLICKHOUSE_CONN_STRING: str = os.environ.get("CLICKHOUSE_URI") 32 | # vertica uri provided for docker - "vertica://vertica:Password1@localhost:5433/vertica" 33 | TEST_VERTICA_CONN_STRING: str = os.environ.get("VERTICA_URI") 34 | TEST_DUCKDB_CONN_STRING: str = "duckdb://main:@:memory:" 35 | 36 | 37 | DEFAULT_N_SAMPLES = 50 38 | N_SAMPLES = int(os.environ.get("N_SAMPLES", DEFAULT_N_SAMPLES)) 39 | BENCHMARK = os.environ.get("BENCHMARK", False) 40 | N_THREADS = int(os.environ.get("N_THREADS", 1)) 41 | TEST_ACROSS_ALL_DBS = os.environ.get("TEST_ACROSS_ALL_DBS", True) # Should we run the full db<->db test suite? 42 | 43 | 44 | def get_git_revision_short_hash() -> str: 45 | return subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("ascii").strip() 46 | 47 | 48 | GIT_REVISION = get_git_revision_short_hash() 49 | 50 | level = logging.ERROR 51 | if os.environ.get("LOG_LEVEL", False): 52 | level = getattr(logging, os.environ["LOG_LEVEL"].upper()) 53 | 54 | logging.basicConfig(level=level) 55 | logging.getLogger("hashdiff_tables").setLevel(level) 56 | logging.getLogger("joindiff_tables").setLevel(level) 57 | logging.getLogger("diff_tables").setLevel(level) 58 | logging.getLogger("table_segment").setLevel(level) 59 | logging.getLogger("database").setLevel(level) 60 | 61 | try: 62 | from .local_settings import * 63 | except ImportError: 64 | pass # No local settings 65 | 66 | 67 | CONN_STRINGS = { 68 | db.BigQuery: TEST_BIGQUERY_CONN_STRING, 69 | db.MySQL: TEST_MYSQL_CONN_STRING, 70 | db.PostgreSQL: TEST_POSTGRESQL_CONN_STRING, 71 | db.Snowflake: TEST_SNOWFLAKE_CONN_STRING, 72 | db.Redshift: TEST_REDSHIFT_CONN_STRING, 73 | db.Oracle: TEST_ORACLE_CONN_STRING, 74 | db.Presto: TEST_PRESTO_CONN_STRING, 75 | db.Databricks: TEST_DATABRICKS_CONN_STRING, 76 | db.Trino: TEST_TRINO_CONN_STRING, 77 | db.Clickhouse: TEST_CLICKHOUSE_CONN_STRING, 78 | db.Vertica: TEST_VERTICA_CONN_STRING, 79 | db.DuckDB: TEST_DUCKDB_CONN_STRING, 80 | } 81 | 82 | _database_instances = {} 83 | 84 | 85 | def get_conn(cls: type, shared: bool = True) -> Database: 86 | if shared: 87 | if cls not in _database_instances: 88 | _database_instances[cls] = get_conn(cls, shared=False) 89 | return _database_instances[cls] 90 | 91 | return connect(CONN_STRINGS[cls], N_THREADS) 92 | 93 | 94 | def _print_used_dbs(): 95 | used = {k.__name__ for k, v in CONN_STRINGS.items() if v is not None} 96 | unused = {k.__name__ for k, v in CONN_STRINGS.items() if v is None} 97 | 98 | print(f"Testing databases: {', '.join(used)}") 99 | if unused: 100 | logging.info(f"Connection not configured; skipping tests for: {', '.join(unused)}") 101 | if TEST_ACROSS_ALL_DBS: 102 | logging.info( 103 | f"Full tests enabled (every db<->db). May take very long when many dbs are involved. 
={TEST_ACROSS_ALL_DBS}" 104 | ) 105 | 106 | 107 | _print_used_dbs() 108 | CONN_STRINGS = {k: v for k, v in CONN_STRINGS.items() if v is not None} 109 | 110 | 111 | def random_table_suffix() -> str: 112 | char_set = string.ascii_lowercase + string.digits 113 | suffix = "_" 114 | suffix += "".join(random.choice(char_set) for _ in range(5)) 115 | return suffix 116 | 117 | 118 | def str_to_checksum(str: str): 119 | # hello world 120 | # => 5eb63bbbe01eeed093cb22bb8f5acdc3 121 | # => cb22bb8f5acdc3 122 | # => 273350391345368515 123 | m = hashlib.md5() 124 | m.update(str.encode("utf-8")) # encode to binary 125 | md5 = m.hexdigest() 126 | # 0-indexed, unlike DBs which are 1-indexed here, so +1 in dbs 127 | half_pos = db.MD5_HEXDIGITS - db.CHECKSUM_HEXDIGITS 128 | return int(md5[half_pos:], 16) 129 | 130 | 131 | class DiffTestCase(unittest.TestCase): 132 | "Sets up two tables for diffing" 133 | db_cls = None 134 | src_schema = None 135 | dst_schema = None 136 | shared_connection = True 137 | 138 | def setUp(self): 139 | assert self.db_cls, self.db_cls 140 | 141 | self.connection = get_conn(self.db_cls, self.shared_connection) 142 | 143 | table_suffix = random_table_suffix() 144 | self.table_src_name = f"src{table_suffix}" 145 | self.table_dst_name = f"dst{table_suffix}" 146 | 147 | self.table_src_path = self.connection.parse_table_name(self.table_src_name) 148 | self.table_dst_path = self.connection.parse_table_name(self.table_dst_name) 149 | 150 | drop_table(self.connection, self.table_src_path) 151 | drop_table(self.connection, self.table_dst_path) 152 | 153 | self.src_table = table(self.table_src_path, schema=self.src_schema) 154 | self.dst_table = table(self.table_dst_path, schema=self.dst_schema) 155 | if self.src_schema: 156 | self.connection.query(self.src_table.create()) 157 | if self.dst_schema: 158 | self.connection.query(self.dst_table.create()) 159 | 160 | return super().setUp() 161 | 162 | def tearDown(self): 163 | drop_table(self.connection, self.table_src_path) 164 | drop_table(self.connection, self.table_dst_path) 165 | 166 | 167 | def _parameterized_class_per_conn(test_databases): 168 | test_databases = set(test_databases) 169 | names = [(cls.__name__, cls) for cls in CONN_STRINGS if cls in test_databases] 170 | return parameterized_class(("name", "db_cls"), names) 171 | 172 | 173 | def test_each_database_in_list(databases) -> Callable: 174 | def _test_per_database(cls): 175 | return _parameterized_class_per_conn(databases)(cls) 176 | 177 | return _test_per_database 178 | 179 | 180 | def table_segment(database, table_path, key_columns, *args, **kw): 181 | if isinstance(key_columns, str): 182 | key_columns = (key_columns,) 183 | return TableSegment(database, table_path, key_columns, *args, **kw) 184 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from reladiff import diff_tables, connect_to_table, Algorithm 4 | from reladiff.databases import MySQL 5 | from sqeleton.queries import commit 6 | 7 | from .common import TEST_MYSQL_CONN_STRING, DiffTestCase 8 | 9 | 10 | class TestApi(DiffTestCase): 11 | src_schema = {"id": int, "datetime": datetime, "text_comment": str} 12 | db_cls = MySQL 13 | 14 | def setUp(self) -> None: 15 | super().setUp() 16 | 17 | self.conn = self.connection 18 | 19 | self.now = now = datetime.now() 20 | 21 | rows = [ 22 | (now, "now"), 23 | (self.now - timedelta(seconds=10), 
"a"), 24 | (self.now - timedelta(seconds=7), "b"), 25 | (self.now - timedelta(seconds=6), "c"), 26 | ] 27 | 28 | self.conn.query( 29 | [ 30 | self.src_table.insert_rows((i, ts, s) for i, (ts, s) in enumerate(rows)), 31 | self.dst_table.create(self.src_table), 32 | self.src_table.insert_row(len(rows), self.now - timedelta(seconds=3), "3 seconds ago"), 33 | commit, 34 | ] 35 | ) 36 | 37 | def test_api(self): 38 | # test basic 39 | t1 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_src_name) 40 | t2 = connect_to_table(TEST_MYSQL_CONN_STRING, (self.table_dst_name,)) 41 | diff = list(diff_tables(t1, t2, algorithm=Algorithm.JOINDIFF)) 42 | assert len(diff) == 1 43 | 44 | # test algorithm 45 | # (also tests shared connection on connect_to_table) 46 | for algo in (Algorithm.HASHDIFF, Algorithm.JOINDIFF): 47 | diff = list(diff_tables(t1, t2, algorithm=algo)) 48 | assert len(diff) == 1 49 | 50 | # test where 51 | diff_id = diff[0][1][0] 52 | where = f"id != {diff_id} OR id = 90000000" 53 | 54 | t1 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_src_name, where=where) 55 | t2 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_dst_name, where=where) 56 | diff = list(diff_tables(t1, t2)) 57 | assert len(diff) == 0 58 | 59 | # test close and empty tables 60 | diff = diff_tables(t1, t2, allow_empty_tables=True) # Make sure the API exists 61 | diff.close() 62 | 63 | # test context manager 64 | with diff_tables(t1, t2) as diff: 65 | assert len(list(diff)) == 0 66 | 67 | 68 | def test_api_get_stats_dict(self): 69 | # XXX Likely to change in the future 70 | expected_dict = { 71 | "rows_A": 5, 72 | "rows_B": 4, 73 | "exclusive_A": 1, 74 | "exclusive_B": 0, 75 | "updated": 0, 76 | "unchanged": 4, 77 | "total": 1, 78 | # "stats": {"rows_downloaded": 5}, 79 | } 80 | t1 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_src_name) 81 | t2 = connect_to_table(TEST_MYSQL_CONN_STRING, self.table_dst_name) 82 | diff = diff_tables(t1, t2) 83 | 84 | output = diff.get_stats_dict() 85 | output.pop("stats") 86 | self.assertEqual(expected_dict, output) 87 | self.assertIsNotNone(diff) 88 | assert len(list(diff)) == 1 89 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | import sys 4 | from datetime import datetime, timedelta 5 | 6 | from sqeleton.queries import commit, current_timestamp 7 | 8 | from .common import DiffTestCase, CONN_STRINGS 9 | from .test_diff_tables import test_each_database 10 | 11 | 12 | def run_reladiff_cli(*args): 13 | try: 14 | stdout = subprocess.check_output([sys.executable, "-m", "reladiff"] + list(args), stderr=subprocess.PIPE) 15 | except subprocess.CalledProcessError as e: 16 | logging.error(e.stderr) 17 | raise 18 | return stdout.splitlines() 19 | 20 | 21 | @test_each_database 22 | class TestCLI(DiffTestCase): 23 | src_schema = {"id": int, "datetime": datetime, "text_comment": str} 24 | 25 | def setUp(self) -> None: 26 | super().setUp() 27 | 28 | now = self.connection.query(current_timestamp(), datetime) 29 | 30 | rows = [ 31 | (now, "now"), 32 | (now - timedelta(seconds=10), "a"), 33 | (now - timedelta(seconds=7), "b"), 34 | (now - timedelta(seconds=6), "c"), 35 | ] 36 | 37 | self.connection.query( 38 | [ 39 | self.src_table.insert_rows((i, ts, s) for i, (ts, s) in enumerate(rows)), 40 | self.dst_table.create(self.src_table), 41 | self.src_table.insert_row(len(rows), now - timedelta(seconds=3), "3 
seconds ago"), 42 | commit, 43 | ] 44 | ) 45 | 46 | def test_basic(self): 47 | conn_str = CONN_STRINGS[self.db_cls] 48 | diff = run_reladiff_cli(conn_str, self.table_src_name, conn_str, self.table_dst_name) 49 | assert len(diff) == 1 50 | 51 | def test_options(self): 52 | conn_str = CONN_STRINGS[self.db_cls] 53 | diff = run_reladiff_cli( 54 | conn_str, 55 | self.table_src_name, 56 | conn_str, 57 | self.table_dst_name, 58 | "--bisection-factor", 59 | "16", 60 | "--bisection-threshold", 61 | "10000", 62 | "--limit", 63 | "5", 64 | "-t", 65 | "datetime", 66 | "--max-age", 67 | "1h", 68 | "--allow-empty-tables" 69 | ) 70 | assert len(diff) == 1, diff 71 | 72 | 73 | @test_each_database 74 | class TestCLI_CaseSensitive(DiffTestCase): 75 | src_schema = {"ID": int, "Datetime": datetime, "Text_Comment": str} 76 | 77 | def setUp(self) -> None: 78 | super().setUp() 79 | 80 | now = self.connection.query(current_timestamp(), datetime) 81 | 82 | rows = [ 83 | (now, "now"), 84 | (now - timedelta(seconds=10), "a"), 85 | (now - timedelta(seconds=7), "b"), 86 | (now - timedelta(seconds=6), "c"), 87 | ] 88 | 89 | self.connection.query( 90 | [ 91 | self.src_table.insert_rows((i, ts, s) for i, (ts, s) in enumerate(rows)), 92 | self.dst_table.create(self.src_table), 93 | self.src_table.insert_row(len(rows), now - timedelta(seconds=3), "3 seconds ago"), 94 | commit, 95 | ] 96 | ) 97 | 98 | def test_cli_case_sensitive(self): 99 | conn_str = CONN_STRINGS[self.db_cls] 100 | diff = run_reladiff_cli(conn_str, self.table_src_name, conn_str, self.table_dst_name) 101 | assert len(diff) == 1 -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from reladiff.config import apply_config_from_string, ConfigParseError 5 | from reladiff.utils import remove_password_from_url 6 | 7 | 8 | class TestConfig(unittest.TestCase): 9 | def test_basic(self): 10 | config = r""" 11 | [database.test_postgresql] 12 | driver = "postgresql" 13 | user = "postgres" 14 | password = "Password1" 15 | 16 | [run.default] 17 | update_column = "timestamp" 18 | verbose = true 19 | threads = 2 20 | 21 | [run.pg_pg] 22 | threads = 4 23 | 1.database = "test_postgresql" 24 | 1.table = "rating" 25 | 1.threads = 11 26 | 2.database = "postgresql://postgres:Password1@/" 27 | 2.table = "rating_del1" 28 | 2.threads = 22 29 | """ 30 | self.assertRaises(ConfigParseError, apply_config_from_string, config, "bla", {}) # No such run 31 | 32 | res = apply_config_from_string(config, "pg_pg", {}) 33 | assert res["update_column"] == "timestamp" # default 34 | assert res["verbose"] is True 35 | assert res["threads"] == 4 # overwritten by pg_pg 36 | assert res["database1"] == {"driver": "postgresql", "user": "postgres", "password": "Password1"} 37 | assert res["database2"] == "postgresql://postgres:Password1@/" 38 | assert res["table1"] == "rating" 39 | assert res["table2"] == "rating_del1" 40 | assert res["threads1"] == 11 41 | assert res["threads2"] == 22 42 | 43 | res = apply_config_from_string(config, "pg_pg", {"update_column": "foo", "table2": "bar"}) 44 | assert res["update_column"] == "foo" 45 | assert res["table2"] == "bar" 46 | 47 | def test_remove_password(self): 48 | replace_with = "*****" 49 | urls = [ 50 | "d://host/", 51 | "d://host:123/", 52 | "d://user@host:123/", 53 | "d://user:PASS@host:123/", 54 | "d://:PASS@host:123/", 55 | "d://:PASS@host:123/path", 56 | 
"d://:PASS@host:123/path?whatever#blabla", 57 | ] 58 | for url in urls: 59 | removed = remove_password_from_url(url, replace_with) 60 | expected = url.replace("PASS", replace_with) 61 | removed = remove_password_from_url(url, replace_with) 62 | self.assertEqual(removed, expected) 63 | 64 | def test_embed_env(self): 65 | env = { 66 | "DRIVER": "postgresql", 67 | "USER": "postgres", 68 | "PASSWORD": "Password1", 69 | "RUN_PG_1_DATABASE": "test_postgresql", 70 | "RUN_PG_1_TABLE": "rating", 71 | "RUN_PG_2_DATABASE": "postgresql://postgres:Password1@/", 72 | "RUN_PG_2_TABLE": "rating_del1", 73 | } 74 | config = r""" 75 | [database.test_postgresql] 76 | driver = "${DRIVER}" 77 | user = "${USER}" 78 | password = "${PASSWORD}" 79 | 80 | [run.default] 81 | update_column = "${UPDATE_COLUMN}" 82 | verbose = true 83 | threads = 2 84 | 85 | [run.pg_pg] 86 | threads = 4 87 | 1.database = "${RUN_PG_1_DATABASE}" 88 | 1.table = "${RUN_PG_1_TABLE}" 89 | 1.threads = 11 90 | 2.database = "${RUN_PG_2_DATABASE}" 91 | 2.table = "${RUN_PG_2_TABLE}" 92 | 2.threads = 22 93 | """ 94 | 95 | os.environ.update(env) 96 | res = apply_config_from_string(config, "pg_pg", {}) 97 | assert res["update_column"] == "" # missing env var 98 | assert res["verbose"] is True 99 | assert res["threads"] == 4 # overwritten by pg_pg 100 | assert res["database1"] == {"driver": "postgresql", "user": "postgres", "password": "Password1"} 101 | assert res["database2"] == "postgresql://postgres:Password1@/" 102 | assert res["table1"] == "rating" 103 | assert res["table2"] == "rating_del1" 104 | assert res["threads1"] == 11 105 | assert res["threads2"] == 22 106 | -------------------------------------------------------------------------------- /tests/test_joindiff.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from datetime import datetime 3 | 4 | from sqeleton.queries.ast_classes import TablePath 5 | from sqeleton.queries import table, commit 6 | from reladiff.table_segment import TableSegment 7 | from reladiff import databases as db 8 | from reladiff.joindiff_tables import JoinDiffer 9 | 10 | from .test_diff_tables import DiffTestCase 11 | 12 | from .common import ( 13 | random_table_suffix, 14 | test_each_database_in_list, 15 | ) 16 | 17 | 18 | TEST_DATABASES = { 19 | db.PostgreSQL, 20 | db.MySQL, 21 | db.Snowflake, 22 | db.BigQuery, 23 | db.Oracle, 24 | db.Redshift, 25 | db.Presto, 26 | db.Trino, 27 | db.Vertica, 28 | } 29 | 30 | test_each_database = test_each_database_in_list(TEST_DATABASES) 31 | 32 | 33 | @test_each_database_in_list({db.Snowflake, db.BigQuery}) 34 | class TestCompositeKey(DiffTestCase): 35 | src_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime} 36 | dst_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime} 37 | 38 | def setUp(self): 39 | super().setUp() 40 | 41 | self.differ = JoinDiffer() 42 | 43 | def test_composite_key(self): 44 | time = "2022-01-01 00:00:00" 45 | time_obj = datetime.fromisoformat(time) 46 | 47 | cols = "id userid movieid rating timestamp".split() 48 | 49 | self.connection.query( 50 | [ 51 | self.src_table.insert_rows([[1, 1, 1, 9, time_obj], [2, 2, 2, 9, time_obj]], columns=cols), 52 | self.dst_table.insert_rows([[1, 1, 1, 9, time_obj], [2, 3, 2, 9, time_obj]], columns=cols), 53 | commit, 54 | ] 55 | ) 56 | 57 | # Sanity 58 | table1 = TableSegment( 59 | self.connection, self.table_src_path, ("id",), "timestamp", ("userid",), case_sensitive=False 60 | ) 
61 | table2 = TableSegment( 62 | self.connection, self.table_dst_path, ("id",), "timestamp", ("userid",), case_sensitive=False 63 | ) 64 | diff = list(self.differ.diff_tables(table1, table2)) 65 | assert len(diff) == 2 66 | assert self.differ.stats["exclusive_count"] == 0 67 | 68 | # Test pks diffed, by checking exclusive_count 69 | table1 = TableSegment(self.connection, self.table_src_path, ("id", "userid"), "timestamp", case_sensitive=False) 70 | table2 = TableSegment(self.connection, self.table_dst_path, ("id", "userid"), "timestamp", case_sensitive=False) 71 | diff = list(self.differ.diff_tables(table1, table2)) 72 | assert len(diff) == 2 73 | assert self.differ.stats["exclusive_count"] == 2 74 | 75 | 76 | @test_each_database 77 | class TestJoindiff(DiffTestCase): 78 | src_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime} 79 | dst_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime} 80 | 81 | def setUp(self): 82 | super().setUp() 83 | 84 | self.table = TableSegment(self.connection, self.table_src_path, ("id",), "timestamp", case_sensitive=False) 85 | self.table2 = TableSegment(self.connection, self.table_dst_path, ("id",), "timestamp", case_sensitive=False) 86 | 87 | self.differ = JoinDiffer() 88 | 89 | def test_diff_small_tables(self): 90 | time = "2022-01-01 00:00:00" 91 | time_obj = datetime.fromisoformat(time) 92 | 93 | cols = "id userid movieid rating timestamp".split() 94 | 95 | self.connection.query( 96 | [ 97 | self.src_table.insert_rows([[1, 1, 1, 9, time_obj], [2, 2, 2, 9, time_obj]], columns=cols), 98 | self.dst_table.insert_rows([[1, 1, 1, 9, time_obj]], columns=cols), 99 | commit, 100 | ] 101 | ) 102 | 103 | diff_res = self.differ.diff_tables(self.table, self.table2) 104 | info = diff_res.info_tree.info 105 | diff = list(diff_res) 106 | 107 | expected_row = ("2", time + ".000000") 108 | expected = [("-", expected_row)] 109 | self.assertEqual(expected, diff) 110 | self.assertEqual(2, info.rowcounts[1]) 111 | self.assertEqual(1, info.rowcounts[2]) 112 | # self.assertEqual(2, self.differ.stats["table1_max_id"]) 113 | # self.assertEqual(1, self.differ.stats["table2_min_id"]) 114 | 115 | # Test materialize 116 | materialize_path = self.connection.parse_table_name(f"test_mat_{random_table_suffix()}") 117 | mdiffer = self.differ.replace(materialize_to_table=materialize_path) 118 | diff = list(mdiffer.diff_tables(self.table, self.table2)) 119 | self.assertEqual(expected, diff) 120 | 121 | t = TablePath(materialize_path) 122 | rows = self.connection.query(t.select(), List[tuple]) 123 | # is_xa, is_xb, is_diff1, is_diff2, row1, row2 124 | # assert rows == [(1, 0, 1, 1) + expected_row + (None, None)], rows 125 | assert rows == [(1, 0, 1, 1) + (expected_row[0], None, expected_row[1], None)], rows 126 | self.connection.query(t.drop()) 127 | 128 | # Test materialize all rows 129 | mdiffer = mdiffer.replace(materialize_all_rows=True) 130 | diff = list(mdiffer.diff_tables(self.table, self.table2)) 131 | self.assertEqual(expected, diff) 132 | rows = self.connection.query(t.select(), List[tuple]) 133 | assert len(rows) == 2, len(rows) 134 | self.connection.query(t.drop()) 135 | 136 | def test_diff_table_above_bisection_threshold(self): 137 | time = "2022-01-01 00:00:00" 138 | time_obj = datetime.fromisoformat(time) 139 | 140 | cols = "id userid movieid rating timestamp".split() 141 | 142 | self.connection.query( 143 | [ 144 | self.src_table.insert_rows( 145 | [ 146 | [1, 1, 1, 9, time_obj], 147 | [2, 2, 2, 9, 
time_obj], 148 | [3, 3, 3, 9, time_obj], 149 | [4, 4, 4, 9, time_obj], 150 | [5, 5, 5, 9, time_obj], 151 | ], 152 | columns=cols, 153 | ), 154 | self.dst_table.insert_rows( 155 | [ 156 | [1, 1, 1, 9, time_obj], 157 | [2, 2, 2, 9, time_obj], 158 | [3, 3, 3, 9, time_obj], 159 | [4, 4, 4, 9, time_obj], 160 | ], 161 | columns=cols, 162 | ), 163 | commit, 164 | ] 165 | ) 166 | 167 | diff_res = self.differ.diff_tables(self.table, self.table2) 168 | info = diff_res.info_tree.info 169 | diff = list(diff_res) 170 | expected = [("-", ("5", time + ".000000"))] 171 | self.assertEqual(expected, diff) 172 | self.assertEqual(5, info.rowcounts[1]) 173 | self.assertEqual(4, info.rowcounts[2]) 174 | 175 | def test_return_empty_array_when_same(self): 176 | time = "2022-01-01 00:00:00" 177 | time_obj = datetime.fromisoformat(time) 178 | 179 | cols = "id userid movieid rating timestamp".split() 180 | 181 | self.connection.query( 182 | [ 183 | self.src_table.insert_row(1, 1, 1, 9, time_obj, columns=cols), 184 | self.dst_table.insert_row(1, 1, 1, 9, time_obj, columns=cols), 185 | ] 186 | ) 187 | 188 | diff = list(self.differ.diff_tables(self.table, self.table2)) 189 | self.assertEqual([], diff) 190 | 191 | def test_diff_sorted_by_key(self): 192 | time = "2022-01-01 00:00:00" 193 | time2 = "2021-01-01 00:00:00" 194 | 195 | time_obj = datetime.fromisoformat(time) 196 | time_obj2 = datetime.fromisoformat(time2) 197 | 198 | cols = "id userid movieid rating timestamp".split() 199 | 200 | self.connection.query( 201 | [ 202 | self.src_table.insert_rows( 203 | [ 204 | [1, 1, 1, 9, time_obj], 205 | [2, 2, 2, 9, time_obj2], 206 | [3, 3, 3, 9, time_obj], 207 | [4, 4, 4, 9, time_obj2], 208 | [5, 5, 5, 9, time_obj], 209 | ], 210 | columns=cols, 211 | ), 212 | self.dst_table.insert_rows( 213 | [ 214 | [1, 1, 1, 9, time_obj], 215 | [2, 2, 2, 9, time_obj], 216 | [3, 3, 3, 9, time_obj], 217 | [4, 4, 4, 9, time_obj], 218 | [5, 5, 5, 9, time_obj], 219 | ], 220 | columns=cols, 221 | ), 222 | commit, 223 | ] 224 | ) 225 | 226 | diff = list(self.differ.diff_tables(self.table, self.table2)) 227 | expected = { 228 | ("-", ("2", time2 + ".000000")), 229 | ("+", ("2", time + ".000000")), 230 | ("-", ("4", time2 + ".000000")), 231 | ("+", ("4", time + ".000000")), 232 | } 233 | self.assertEqual(expected, set(diff)) 234 | keys = [k for _, (k, _) in diff] 235 | assert keys[0] == keys[1] and keys[2] == keys[3] # same keys 236 | 237 | def test_dup_pks(self): 238 | time = "2022-01-01 00:00:00" 239 | time_obj = datetime.fromisoformat(time) 240 | 241 | cols = "id rating timestamp".split() 242 | 243 | self.connection.query( 244 | [ 245 | self.src_table.insert_rows([[1, 9, time_obj], [1, 10, time_obj]], columns=cols), 246 | self.dst_table.insert_row(1, 9, time_obj, columns=cols), 247 | ] 248 | ) 249 | 250 | x = self.differ.diff_tables(self.table, self.table2) 251 | self.assertRaises(ValueError, list, x) 252 | 253 | def test_null_pks(self): 254 | time = "2022-01-01 00:00:00" 255 | time_obj = datetime.fromisoformat(time) 256 | 257 | cols = "id rating timestamp".split() 258 | 259 | self.connection.query( 260 | [ 261 | self.src_table.insert_row(None, 9, time_obj, columns=cols), 262 | self.dst_table.insert_row(1, 9, time_obj, columns=cols), 263 | ] 264 | ) 265 | 266 | x = self.differ.diff_tables(self.table, self.table2) 267 | self.assertRaises(ValueError, list, x) 268 | 269 | 270 | @test_each_database_in_list(d for d in TEST_DATABASES if d.dialect.SUPPORTS_PRIMARY_KEY and d.SUPPORTS_UNIQUE_CONSTAINT) 271 | class TestUniqueConstraint(DiffTestCase): 272 
| def setUp(self): 273 | super().setUp() 274 | 275 | self.src_table = table( 276 | self.table_src_path, 277 | schema={"id": int, "userid": int, "movieid": int, "rating": float}, 278 | ) 279 | self.dst_table = table( 280 | self.table_dst_path, 281 | schema={"id": int, "userid": int, "movieid": int, "rating": float}, 282 | ) 283 | 284 | self.connection.query( 285 | [self.src_table.create(primary_keys=["id"]), self.dst_table.create(primary_keys=["id", "userid"]), commit] 286 | ) 287 | 288 | self.differ = JoinDiffer() 289 | 290 | def test_unique_constraint(self): 291 | self.connection.query( 292 | [ 293 | self.src_table.insert_rows([[1, 1, 1, 9], [2, 2, 2, 9]]), 294 | self.dst_table.insert_rows([[1, 1, 1, 9], [2, 2, 2, 9]]), 295 | commit, 296 | ] 297 | ) 298 | 299 | # Test no active validation 300 | table = TableSegment(self.connection, self.table_src_path, ("id",), case_sensitive=False) 301 | table2 = TableSegment(self.connection, self.table_dst_path, ("id",), case_sensitive=False) 302 | 303 | res = list(self.differ.diff_tables(table, table2)) 304 | assert not res 305 | assert "validated_unique_keys" not in self.differ.stats 306 | 307 | # Test active validation 308 | table = TableSegment(self.connection, self.table_src_path, ("userid",), case_sensitive=False) 309 | table2 = TableSegment(self.connection, self.table_dst_path, ("userid",), case_sensitive=False) 310 | 311 | res = list(self.differ.diff_tables(table, table2)) 312 | assert not res 313 | self.assertEqual(self.differ.stats["validated_unique_keys"], [["userid"]]) 314 | -------------------------------------------------------------------------------- /tests/test_parse_time.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from datetime import timedelta 4 | 5 | from reladiff.parse_time import parse_time_delta 6 | 7 | 8 | class TestParseTime(unittest.TestCase): 9 | def setUp(self): 10 | pass 11 | 12 | def test_times(self): 13 | td = parse_time_delta("1w2d3h4min5s") 14 | assert td == timedelta(weeks=1, days=2, hours=3, minutes=4, seconds=5) 15 | 16 | assert parse_time_delta("1y") == timedelta(days=365) 17 | assert parse_time_delta("1mon") == timedelta(days=30) 18 | 19 | self.assertRaises(ValueError, parse_time_delta, "") 20 | self.assertRaises(ValueError, parse_time_delta, "1y1year") 21 | self.assertRaises(ValueError, parse_time_delta, "1x") 22 | -------------------------------------------------------------------------------- /tests/test_postgresql.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from sqeleton.queries import table, commit 4 | 5 | from reladiff import TableSegment, HashDiffer 6 | from reladiff import databases as db 7 | from .common import get_conn, random_table_suffix 8 | 9 | 10 | class TestUUID(unittest.TestCase): 11 | def setUp(self) -> None: 12 | self.connection = get_conn(db.PostgreSQL) 13 | 14 | table_suffix = random_table_suffix() 15 | 16 | self.table_src_name = f"src{table_suffix}" 17 | self.table_dst_name = f"dst{table_suffix}" 18 | 19 | self.table_src = table(self.table_src_name) 20 | self.table_dst = table(self.table_dst_name) 21 | 22 | def test_uuid(self): 23 | self.connection.query('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";', None) 24 | 25 | queries = [ 26 | self.table_src.drop(True), 27 | self.table_dst.drop(True), 28 | f"CREATE TABLE {self.table_src_name} (id uuid DEFAULT uuid_generate_v4 (), comment VARCHAR, PRIMARY KEY (id))", 29 | commit, 30 | self.table_src.insert_rows([[i] for i 
in range(100)], columns=["comment"]), 31 | commit, 32 | self.table_dst.create(self.table_src), 33 | commit, 34 | self.table_src.insert_row("This one is different", columns=["comment"]), 35 | commit, 36 | ] 37 | 38 | for query in queries: 39 | self.connection.query(query) 40 | 41 | a = TableSegment(self.connection, self.table_src.path, ("id",), "comment") 42 | b = TableSegment(self.connection, self.table_dst.path, ("id",), "comment") 43 | 44 | differ = HashDiffer() 45 | diff = list(differ.diff_tables(a, b)) 46 | uuid = diff[0][1][0] 47 | self.assertEqual(diff, [("-", (uuid, "This one is different"))]) 48 | 49 | # Compare with MySql 50 | mysql_conn = get_conn(db.MySQL) 51 | 52 | rows = self.connection.query(self.table_src.select(), list) 53 | 54 | queries = [ 55 | f"CREATE TABLE {self.table_dst_name} (id VARCHAR(128), comment VARCHAR(128))", 56 | commit, 57 | self.table_dst.insert_rows(rows, columns=["id", "comment"]), 58 | commit, 59 | ] 60 | 61 | for q in queries: 62 | mysql_conn.query(q) 63 | 64 | c = TableSegment(mysql_conn, (self.table_dst_name,), ("id",), "comment") 65 | diff = list(differ.diff_tables(a, c)) 66 | assert not diff, diff 67 | diff = list(differ.diff_tables(c, a)) 68 | assert not diff, diff 69 | 70 | self.connection.query(self.table_src.drop(True)) 71 | self.connection.query(self.table_dst.drop(True)) 72 | mysql_conn.query(self.table_dst.drop(True)) 73 | 74 | 75 | class Test100Fields(unittest.TestCase): 76 | def setUp(self) -> None: 77 | self.connection = get_conn(db.PostgreSQL) 78 | 79 | table_suffix = random_table_suffix() 80 | 81 | self.table_src_name = f"src{table_suffix}" 82 | self.table_dst_name = f"dst{table_suffix}" 83 | 84 | self.table_src = table(self.table_src_name) 85 | self.table_dst = table(self.table_dst_name) 86 | 87 | def test_100_fields(self): 88 | self.connection.query('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";', None) 89 | 90 | columns = [f"col{i}" for i in range(100)] 91 | fields = " ,".join(f'"{field}" TEXT' for field in columns) 92 | 93 | queries = [ 94 | self.table_src.drop(True), 95 | self.table_dst.drop(True), 96 | f"CREATE TABLE {self.table_src_name} (id uuid DEFAULT uuid_generate_v4 (), {fields})", 97 | commit, 98 | self.table_src.insert_rows([[f"{x * y}" for x in range(100)] for y in range(10)], columns=columns), 99 | commit, 100 | self.table_dst.create(self.table_src), 101 | commit, 102 | self.table_src.insert_rows([[1 for x in range(100)]], columns=columns), 103 | commit, 104 | ] 105 | 106 | for query in queries: 107 | self.connection.query(query) 108 | 109 | a = TableSegment(self.connection, self.table_src.path, ("id",), extra_columns=tuple(columns)) 110 | b = TableSegment(self.connection, self.table_dst.path, ("id",), extra_columns=tuple(columns)) 111 | 112 | differ = HashDiffer() 113 | diff = list(differ.diff_tables(a, b)) 114 | id_ = diff[0][1][0] 115 | result = (id_,) + tuple("1" for x in range(100)) 116 | self.assertEqual(diff, [("-", result)]) 117 | -------------------------------------------------------------------------------- /tests/waiting_for_stack_up.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -n "$VERTICA_URI" ] 4 | then 5 | echo "Check Vertica DB running..." 
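    # Poll the container logs every 10 seconds until Vertica reports that it is running.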
6 | while true 7 | do 8 | if docker logs dd-vertica | tail -n 100 | grep -q -i "vertica is now running" 9 | then 10 | echo "Vertica DB is ready"; 11 | break; 12 | else 13 | echo "Waiting for Vertica DB to start..."; 14 | sleep 10; 15 | fi 16 | done 17 | fi 18 | --------------------------------------------------------------------------------
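
A minimal end-to-end sketch of the public API exercised in tests/test_api.py above. It is illustrative only: the table names "src" and "dst", their "id" key column, and the reuse of the local PostgreSQL dev connection string from tests/common.py are assumptions, not something the repository itself ships.

from reladiff import connect_to_table, diff_tables, Algorithm

CONN = "postgresql://postgres:Password1@localhost/postgres"  # dev connection string from tests/common.py

# Hypothetical tables; both are assumed to exist and to share an "id" key column.
t1 = connect_to_table(CONN, "src")
t2 = connect_to_table(CONN, "dst")

# diff_tables is iterable and usable as a context manager, as TestApi demonstrates.
with diff_tables(t1, t2, algorithm=Algorithm.HASHDIFF) as diff:
    for sign, row in diff:  # sign is "+" or "-", row is a tuple of column values
        print(sign, row)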