├── .editorconfig ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md └── workflows │ ├── dependency_checker.yml │ ├── install.yml │ ├── integration.yml │ ├── lint.yml │ ├── minimum.yml │ ├── prepare_release.yml │ ├── readme.yml │ └── unit.yml ├── .gitignore ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.md ├── INSTALL.md ├── LICENSE ├── Makefile ├── README.md ├── RELEASE.md ├── codecov.yml ├── docs └── images │ ├── column_comparison.png │ └── column_pairs.png ├── latest_requirements.txt ├── pyproject.toml ├── resources └── visualize.png ├── scripts └── release_notes_generator.py ├── sdmetrics ├── __init__.py ├── _utils_metadata.py ├── base.py ├── column_pairs │ ├── __init__.py │ ├── base.py │ └── statistical │ │ ├── __init__.py │ │ ├── cardinality_boundary_adherence.py │ │ ├── contingency_similarity.py │ │ ├── correlation_similarity.py │ │ ├── inter_row_msas.py │ │ ├── kl_divergence.py │ │ ├── referential_integrity.py │ │ └── statistic_msas.py ├── demos.py ├── demos │ ├── multi_table │ │ ├── metadata.json │ │ ├── sessions_real.csv │ │ ├── sessions_synthetic.csv │ │ ├── transactions_real.csv │ │ ├── transactions_synthetic.csv │ │ ├── users_real.csv │ │ └── users_synthetic.csv │ ├── single_table │ │ ├── metadata.json │ │ ├── real.csv │ │ └── synthetic.csv │ └── timeseries │ │ ├── metadata.json │ │ ├── real.csv │ │ └── synthetic.csv ├── errors.py ├── goal.py ├── multi_table │ ├── README.md │ ├── __init__.py │ ├── base.py │ ├── detection │ │ ├── __init__.py │ │ ├── base.py │ │ └── parent_child.py │ ├── multi_single_table.py │ └── statistical │ │ ├── __init__.py │ │ ├── cardinality_shape_similarity.py │ │ └── cardinality_statistic_similarity.py ├── reports │ ├── __init__.py │ ├── base_report.py │ ├── multi_table │ │ ├── __init__.py │ │ ├── _properties │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── boundary.py │ │ │ ├── cardinality.py │ │ │ ├── column_pair_trends.py │ │ │ ├── column_shapes.py │ │ │ ├── coverage.py │ │ │ ├── data_validity.py │ │ │ ├── inter_table_trends.py │ │ │ ├── relationship_validity.py │ │ │ ├── structure.py │ │ │ └── synthesis.py │ │ ├── base_multi_table_report.py │ │ ├── diagnostic_report.py │ │ └── quality_report.py │ ├── single_table │ │ ├── __init__.py │ │ ├── _properties │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── boundary.py │ │ │ ├── column_pair_trends.py │ │ │ ├── column_shapes.py │ │ │ ├── coverage.py │ │ │ ├── data_validity.py │ │ │ ├── structure.py │ │ │ └── synthesis.py │ │ ├── diagnostic_report.py │ │ ├── plot_utils.py │ │ └── quality_report.py │ └── utils.py ├── single_column │ ├── README.md │ ├── __init__.py │ ├── base.py │ └── statistical │ │ ├── __init__.py │ │ ├── boundary_adherence.py │ │ ├── category_adherence.py │ │ ├── category_coverage.py │ │ ├── cstest.py │ │ ├── key_uniqueness.py │ │ ├── kscomplement.py │ │ ├── missing_value_similarity.py │ │ ├── range_coverage.py │ │ ├── sequence_length_similarity.py │ │ ├── statistic_similarity.py │ │ └── tv_complement.py ├── single_table │ ├── README.md │ ├── __init__.py │ ├── base.py │ ├── bayesian_network.py │ ├── data_augmentation │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binary_classifier_precision_efficacy.py │ │ ├── binary_classifier_recall_efficacy.py │ │ └── utils.py │ ├── detection │ │ ├── __init__.py │ │ ├── base.py │ │ └── sklearn.py │ ├── efficacy │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binary.py │ │ ├── mlefficacy.py │ │ ├── multiclass.py │ │ └── regression.py │ ├── gaussian_mixture.py │ ├── multi_column_pairs.py │ ├── 
multi_single_column.py │ ├── new_row_synthesis.py │ ├── privacy │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cap.py │ │ ├── categorical_sklearn.py │ │ ├── dcr_baseline_protection.py │ │ ├── dcr_overfitting_protection.py │ │ ├── dcr_utils.py │ │ ├── disclosure_protection.py │ │ ├── ensemble.py │ │ ├── loss.py │ │ ├── numerical_sklearn.py │ │ ├── radius_nearest_neighbor.py │ │ └── util.py │ └── table_structure.py ├── timeseries │ ├── README.md │ ├── __init__.py │ ├── base.py │ ├── detection.py │ ├── efficacy │ │ ├── __init__.py │ │ ├── base.py │ │ └── classification.py │ └── ml_scorers.py ├── utils.py ├── visualization.py └── warnings.py ├── static_code_analysis.txt ├── tasks.py ├── tests ├── __init__.py ├── integration │ ├── __init__.py │ ├── column_pairs │ │ ├── __init__.py │ │ └── statistical │ │ │ ├── __init__.py │ │ │ ├── test_contingency_similarity.py │ │ │ └── test_kl_divergence.py │ ├── multi_table │ │ ├── __init__.py │ │ ├── test_multi_single_table.py │ │ ├── test_multi_table.py │ │ ├── test_parent_child.py │ │ └── test_statistical_metrics.py │ ├── reports │ │ ├── __init__.py │ │ ├── multi_table │ │ │ ├── __init__.py │ │ │ ├── _properties │ │ │ │ ├── __init__.py │ │ │ │ ├── test_boundary.py │ │ │ │ ├── test_cardinality.py │ │ │ │ ├── test_column_pair_trends.py │ │ │ │ ├── test_column_shapes.py │ │ │ │ ├── test_coverage.py │ │ │ │ ├── test_data_validity.py │ │ │ │ ├── test_inter_table_trends.py │ │ │ │ ├── test_relationship_validity.py │ │ │ │ ├── test_structure.py │ │ │ │ └── test_synthesis.py │ │ │ ├── test_diagnostic_report.py │ │ │ └── test_quality_report.py │ │ └── single_table │ │ │ ├── __init__.py │ │ │ ├── _properties │ │ │ ├── test_boundary.py │ │ │ ├── test_column_pair_trends.py │ │ │ ├── test_column_shapes.py │ │ │ ├── test_coverage.py │ │ │ ├── test_data_validity.py │ │ │ ├── test_structure.py │ │ │ └── test_synthesis.py │ │ │ ├── test_diagnostic_report.py │ │ │ └── test_quality_report.py │ ├── single_column │ │ ├── __init__.py │ │ └── statistical │ │ │ ├── __init__.py │ │ │ ├── test_cstest.py │ │ │ └── test_kscomplement.py │ ├── single_table │ │ ├── __init__.py │ │ ├── data_augmentation │ │ │ ├── __init__.py │ │ │ ├── test_binary_classifier_precision_efficacy.py │ │ │ └── test_binary_classifier_recall_efficacy.py │ │ ├── efficacy │ │ │ ├── __init__.py │ │ │ ├── test_binary.py │ │ │ ├── test_detection.py │ │ │ ├── test_multiclass.py │ │ │ └── test_regression.py │ │ ├── privacy │ │ │ ├── __init__.py │ │ │ ├── test_dcr_baseline_protection.py │ │ │ ├── test_dcr_overfitting_protection.py │ │ │ ├── test_dcr_utils.py │ │ │ ├── test_disclosure_protection.py │ │ │ └── test_privacy.py │ │ ├── test_gaussian_mixture.py │ │ └── test_single_table.py │ ├── test_base.py │ ├── test_property.py │ └── timeseries │ │ ├── __init__.py │ │ ├── efficacy │ │ ├── __init__.py │ │ └── test_classification.py │ │ └── test_timeseries.py ├── test_tasks.py ├── unit │ ├── __init__.py │ ├── column_pairs │ │ ├── __init__.py │ │ ├── statistical │ │ │ ├── __init__.py │ │ │ ├── test_cardinality_boundary_adherence.py │ │ │ ├── test_contingency_similarity.py │ │ │ ├── test_correlation_similarity.py │ │ │ ├── test_inter_row_msas.py │ │ │ ├── test_referential_integrity.py │ │ │ └── test_statistic_msas.py │ │ └── test_base.py │ ├── multi_table │ │ ├── __init__.py │ │ ├── statistical │ │ │ ├── __init__.py │ │ │ ├── test_cardinality_shape_similarity.py │ │ │ └── test_cardinality_statistic_similarity.py │ │ └── test_multi_single_table.py │ ├── reports │ │ ├── __init__.py │ │ ├── multi_table │ │ │ ├── __init__.py │ │ │ 
├── _properties │ │ │ │ ├── __init__.py │ │ │ │ ├── test_base.py │ │ │ │ ├── test_boundary.py │ │ │ │ ├── test_cardinality.py │ │ │ │ ├── test_column_pair_trends.py │ │ │ │ ├── test_column_shapes.py │ │ │ │ ├── test_coverage.py │ │ │ │ ├── test_inter_table_trends.py │ │ │ │ ├── test_relationship_validity.py │ │ │ │ ├── test_structure.py │ │ │ │ ├── test_synthesis.py │ │ │ │ └── test_validity.py │ │ │ ├── test_base_multi_table_report.py │ │ │ ├── test_diagnostic_report.py │ │ │ └── test_quality_report.py │ │ ├── single_table │ │ │ ├── __init__.py │ │ │ ├── _properties │ │ │ │ ├── __init__.py │ │ │ │ ├── test_base.py │ │ │ │ ├── test_boundary.py │ │ │ │ ├── test_column_pair_trends.py │ │ │ │ ├── test_column_shapes.py │ │ │ │ ├── test_coverage.py │ │ │ │ ├── test_data_validity.py │ │ │ │ ├── test_structure.py │ │ │ │ └── test_synthesis.py │ │ │ ├── test_diagnostic_report.py │ │ │ ├── test_quality_report.py │ │ │ └── test_single_table_plot_utils.py │ │ ├── test_base_report.py │ │ └── test_utils.py │ ├── single_column │ │ ├── __init__.py │ │ ├── statistical │ │ │ ├── __init__.py │ │ │ ├── test_boundary_adherence.py │ │ │ ├── test_category_adherence.py │ │ │ ├── test_category_coverage.py │ │ │ ├── test_key_uniqueness.py │ │ │ ├── test_missing_value_similarity.py │ │ │ ├── test_range_coverage.py │ │ │ ├── test_sequence_length_similarity.py │ │ │ ├── test_statistic_similarity.py │ │ │ └── test_tv_complement.py │ │ └── test_base.py │ ├── single_table │ │ ├── __init__.py │ │ ├── data_augmentation │ │ │ ├── __init__.py │ │ │ ├── test_base.py │ │ │ ├── test_binary_classifier_precision_efficacy.py │ │ │ ├── test_binary_classifier_recall_efficacy.py │ │ │ └── test_utils.py │ │ ├── detection │ │ │ ├── __init__.py │ │ │ └── test_detection.py │ │ ├── privacy │ │ │ ├── __init__.py │ │ │ ├── test_cap.py │ │ │ ├── test_dcr_baseline_protection.py │ │ │ ├── test_dcr_overfitting_protection.py │ │ │ ├── test_dcr_utils.py │ │ │ ├── test_disclosure_protection.py │ │ │ └── test_util.py │ │ ├── test_base.py │ │ ├── test_bayesian_network.py │ │ ├── test_multi_single_column.py │ │ ├── test_new_row_synthesis.py │ │ └── test_table_structure.py │ ├── test___init__.py │ ├── test__utils_metadata.py │ ├── test_base.py │ ├── test_demos.py │ ├── test_utils.py │ ├── test_visualization.py │ └── timeseries │ │ ├── __init__.py │ │ └── test_timeseries.py └── utils.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.py] 14 | max_line_length = 99 15 | 16 | [*.bat] 17 | indent_style = tab 18 | end_of_line = crlf 19 | 20 | [LICENSE] 21 | insert_final_newline = false 22 | 23 | [Makefile] 24 | indent_style = tab 25 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Global rule: 2 | * @sdv-dev/core-contributors 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report an error that you found when using SDMetrics 4 | title: '' 5 | labels: bug, new 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Environment Details 11 | 12 | Please 
indicate the following details about the environment in which you found the bug: 13 | 14 | * SDMetrics version: 15 | * Python version: 16 | * Operating System: 17 | 18 | ### Error Description 19 | 20 | 22 | 23 | ### Steps to reproduce 24 | 25 | 29 | 30 | ``` 31 | Paste the command(s) you ran and the output. 32 | If there was a crash, please include the traceback here. 33 | ``` 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Request a new feature that you would like to see implemented in SDMetrics 4 | title: '' 5 | labels: feature request, new 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Problem Description 11 | 12 | 14 | 15 | ### Expected behavior 16 | 17 | 20 | 21 | ### Additional context 22 | 23 | 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Doubts about SDMetrics usage 4 | title: '' 5 | labels: question, new 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Environment details 11 | 12 | If you are already running SDMetrics, please indicate the following details about the environment in 13 | which you are running it: 14 | 15 | * SDMetrics version: 16 | * Python version: 17 | * Operating System: 18 | 19 | ### Problem description 20 | 21 | 24 | 25 | ### What I already tried 26 | 27 | 29 | 30 | ``` 31 | Paste the command(s) you ran and the output. 32 | If there was a crash, please include the traceback here. 33 | ``` 34 | -------------------------------------------------------------------------------- /.github/workflows/dependency_checker.yml: -------------------------------------------------------------------------------- 1 | name: Dependency Checker 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: '0 0 * * 1' 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - name: Set up Python 3.9 12 | uses: actions/setup-python@v5 13 | with: 14 | python-version: 3.9 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install .[dev] 18 | make check-deps OUTPUT_FILEPATH=latest_requirements.txt 19 | make fix-lint 20 | - name: Create pull request 21 | id: cpr 22 | uses: peter-evans/create-pull-request@v4 23 | with: 24 | token: ${{ secrets.GH_ACCESS_TOKEN }} 25 | commit-message: Update latest dependencies 26 | author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" 27 | committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" 28 | title: Automated Latest Dependency Updates 29 | body: "This is an auto-generated PR with **latest** dependency updates." 
30 | branch: latest-dependency-update 31 | branch-suffix: short-commit-hash 32 | base: main 33 | -------------------------------------------------------------------------------- /.github/workflows/install.yml: -------------------------------------------------------------------------------- 1 | name: Install Tests 2 | on: 3 | pull_request: 4 | types: [opened, synchronize] 5 | push: 6 | branches: 7 | - main 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | install: 15 | name: ${{ matrix.python_version }} install 16 | strategy: 17 | fail-fast: true 18 | matrix: 19 | python_version: ["3.8", "3.13"] 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Set up python ${{ matrix.python_version }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python_version }} 26 | - uses: actions/checkout@v4 27 | - name: Build package 28 | run: | 29 | make package 30 | - name: Install package 31 | run: | 32 | python -m pip install "unpacked_sdist/." 33 | - name: Test by importing packages 34 | run: | 35 | python -c "import sdmetrics" 36 | - name: Check package conflicts 37 | run: | 38 | python -m pip check -------------------------------------------------------------------------------- /.github/workflows/integration.yml: -------------------------------------------------------------------------------- 1 | name: Integration Tests 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | integration: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 18 | os: [ubuntu-latest, windows-latest] 19 | include: 20 | - os: macos-latest 21 | python-version: '3.8' 22 | - os: macos-latest 23 | python-version: '3.13' 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install libomp (macOS only) 31 | if: matrix.os == 'macos-latest' 32 | run: | 33 | brew install libomp 34 | echo 'export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH' >> $GITHUB_ENV 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | python -m pip install invoke .[test] 39 | - name: Run integration tests 40 | run: invoke integration 41 | 42 | - if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.12 43 | name: Upload integration codecov report 44 | uses: codecov/codecov-action@v4 45 | with: 46 | flags: integration 47 | file: ${{ github.workspace }}/integration_cov.xml 48 | fail_ci_if_error: true 49 | token: ${{ secrets.CODECOV_TOKEN }} 50 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Style Checks 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python 3.9 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: 3.9 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | python -m pip install invoke .[dev] 21 | - name: Run lint checks 22 | run: invoke lint 23 | 
-------------------------------------------------------------------------------- /.github/workflows/minimum.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests Minimum Versions 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | minimum: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 18 | os: [ubuntu-latest, windows-latest] 19 | include: 20 | - os: macos-latest 21 | python-version: '3.8' 22 | - os: macos-latest 23 | python-version: '3.13' 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install libomp (macOS only) 31 | if: matrix.os == 'macos-latest' 32 | run: | 33 | brew install libomp 34 | echo 'export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH' >> $GITHUB_ENV 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | python -m pip install invoke .[test] 39 | - name: Test with minimum versions 40 | run: invoke minimum 41 | -------------------------------------------------------------------------------- /.github/workflows/prepare_release.yml: -------------------------------------------------------------------------------- 1 | name: Release Prep 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | branch: 7 | description: 'Branch to merge release notes and code analysis into.' 8 | required: true 9 | default: 'main' 10 | version: 11 | description: 12 | 'Version to use for the release. Must be in format: X.Y.Z.' 13 | date: 14 | description: 15 | 'Date of the release. Must be in format YYYY-MM-DD.' 16 | 17 | jobs: 18 | preparerelease: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: '3.10' 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install requests==2.31.0 31 | python -m pip install bandit==1.7.7 32 | python -m pip install .[test] 33 | 34 | - name: Generate release notes 35 | env: 36 | GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} 37 | run: > 38 | python scripts/release_notes_generator.py 39 | -v ${{ inputs.version }} 40 | -d ${{ inputs.date }} 41 | 42 | - name: Save static code analysis 43 | run: bandit -r . -x ./tests,./scripts,./build -f txt -o static_code_analysis.txt --exit-zero 44 | 45 | - name: Create pull request 46 | id: cpr 47 | uses: peter-evans/create-pull-request@v4 48 | with: 49 | token: ${{ secrets.GH_ACCESS_TOKEN }} 50 | commit-message: Prepare release for v${{ inputs.version }} 51 | author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" 52 | committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" 53 | title: v${{ inputs.version }} Release Preparation 54 | body: "This is an auto-generated PR to prepare the release." 
55 | branch: prepared-release 56 | branch-suffix: short-commit-hash 57 | base: ${{ inputs.branch }} 58 | -------------------------------------------------------------------------------- /.github/workflows/readme.yml: -------------------------------------------------------------------------------- 1 | name: Test README 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | jobs: 9 | readme: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | matrix: 13 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 14 | os: [ubuntu-latest, macos-latest] # skip windows bc rundoc fails 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install invoke rundoc . 25 | python -m pip install tomli 26 | python -m pip install packaging 27 | - name: Run the README.md 28 | run: invoke readme 29 | -------------------------------------------------------------------------------- /.github/workflows/unit.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | unit: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 18 | os: [ubuntu-latest, windows-latest] 19 | include: 20 | - os: macos-latest 21 | python-version: '3.8' 22 | - os: macos-latest 23 | python-version: '3.13' 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install libomp (macOS only) 31 | if: matrix.os == 'macos-latest' 32 | run: | 33 | brew install libomp 34 | echo 'export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH' >> $GITHUB_ENV 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | python -m pip install invoke .[test] 39 | - name: Run unit tests 40 | run: invoke unit 41 | 42 | - if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.13 43 | name: Upload unit codecov report 44 | uses: codecov/codecov-action@v4 45 | with: 46 | flags: unit 47 | file: ${{ github.workspace }}/unit_cov.xml 48 | fail_ci_if_error: true 49 | token: ${{ secrets.CODECOV_TOKEN }} 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | *_cov.xml 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | docs/api/ 69 | docs/tutorials/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | .venv 91 | venv/ 92 | ENV/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | # Vim 108 | .*.swp 109 | 110 | # OS Files 111 | .DS_Store 112 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | * Kevin Alex Zhang 6 | * Kalyan Veeramachaneni 7 | * Carles Sala 8 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installing SDMetrics 2 | 3 | ## Requirements 4 | 5 | **SDMetrics** has been developed and tested on [Python 3.8, 3.9, 3.10, 3.11, 3.12 and 3.13](https://www.python.org/downloads/) 6 | 7 | Also, although it is not strictly required, the usage of a [virtualenv]( 8 | https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid 9 | interfering with other software installed in the system where **SDMetrics** is run. 10 | 11 | ## Install with pip 12 | 13 | The easiest and recommended way to install **SDMetrics** is using [pip]( 14 | https://pip.pypa.io/en/stable/): 15 | 16 | ```bash 17 | pip install sdmetrics 18 | ``` 19 | 20 | This will pull and install the latest stable release from [PyPi](https://pypi.org/). 21 | 22 | ## Install with conda 23 | 24 | **SDMetrics** can also be installed using [conda](https://docs.conda.io/en/latest/): 25 | 26 | ```bash 27 | conda install -c sdv-dev -c conda-forge sdmetrics 28 | ``` 29 | 30 | This will pull and install the latest stable release from [Anaconda](https://anaconda.org/). 31 | 32 | ## Install from source 33 | 34 | If you want to install **SDMetrics** from source you need to first clone the repository 35 | and then execute the `make install` command inside the `stable` branch. Note that this 36 | command works only on Unix based systems like GNU/Linux and macOS: 37 | 38 | ```bash 39 | git clone https://github.com/sdv-dev/SDMetrics 40 | cd SDMetrics 41 | git checkout stable 42 | make install 43 | ``` 44 | 45 | ## Install for development 46 | 47 | If you intend to modify the source code or contribute to the project you will need to 48 | install it from the source using the `make install-develop` command. 
In this case, we 49 | recommend branching from `main` first: 50 | 51 | ```bash 52 | git clone git@github.com:sdv-dev/SDMetrics 53 | cd SDMetrics 54 | git checkout main 55 | git checkout -b <branch-name> 56 | make install-develop 57 | ``` 58 | 59 | For more details about how to contribute to the project please visit the [Contributing Guide]( 60 | CONTRIBUTING.rst). 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, MIT Data To AI Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | precision: 2 3 | range: "90...100" 4 | status: 5 | project: 6 | default: false 7 | patch: 8 | default: false -------------------------------------------------------------------------------- /docs/images/column_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdv-dev/SDMetrics/d52733646855d9d4606f95235f5c65e10afdc439/docs/images/column_comparison.png -------------------------------------------------------------------------------- /docs/images/column_pairs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdv-dev/SDMetrics/d52733646855d9d4606f95235f5c65e10afdc439/docs/images/column_pairs.png -------------------------------------------------------------------------------- /latest_requirements.txt: -------------------------------------------------------------------------------- 1 | copulas==0.12.2 2 | numpy==2.0.2 3 | pandas==2.2.3 4 | plotly==6.0.1 5 | scikit-learn==1.6.1 6 | scipy==1.13.1 7 | tqdm==4.67.1 8 | -------------------------------------------------------------------------------- /resources/visualize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdv-dev/SDMetrics/d52733646855d9d4606f95235f5c65e10afdc439/resources/visualize.png -------------------------------------------------------------------------------- /sdmetrics/column_pairs/__init__.py: -------------------------------------------------------------------------------- 1 | """Metrics to compare column pairs.""" 2 | 3 | from
sdmetrics.column_pairs.base import ColumnPairsMetric 4 | from sdmetrics.column_pairs.statistical.cardinality_boundary_adherence import ( 5 | CardinalityBoundaryAdherence, 6 | ) 7 | from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity 8 | from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity 9 | from sdmetrics.column_pairs.statistical.kl_divergence import ( 10 | ContinuousKLDivergence, 11 | DiscreteKLDivergence, 12 | ) 13 | from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity 14 | from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS 15 | from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS 16 | 17 | __all__ = [ 18 | 'CardinalityBoundaryAdherence', 19 | 'ColumnPairsMetric', 20 | 'ContingencySimilarity', 21 | 'ContinuousKLDivergence', 22 | 'CorrelationSimilarity', 23 | 'DiscreteKLDivergence', 24 | 'ReferentialIntegrity', 25 | 'InterRowMSAS', 26 | 'StatisticMSAS', 27 | ] 28 | -------------------------------------------------------------------------------- /sdmetrics/column_pairs/base.py: -------------------------------------------------------------------------------- 1 | """Base class for metrics that compare pairs of columns.""" 2 | 3 | from sdmetrics.base import BaseMetric 4 | 5 | 6 | class ColumnPairsMetric(BaseMetric): 7 | """Base class for metrics that compare pairs of columns. 8 | 9 | Attributes: 10 | name (str): 11 | Name to use when reports about this metric are printed. 12 | goal (sdmetrics.goal.Goal): 13 | The goal of this metric. 14 | min_value (Union[float, tuple[float]]): 15 | Minimum value or values that this metric can take. 16 | max_value (Union[float, tuple[float]]): 17 | Maximum value or values that this metric can take. 18 | """ 19 | 20 | name = None 21 | goal = None 22 | min_value = None 23 | max_value = None 24 | 25 | @staticmethod 26 | def compute(real_data, synthetic_data): 27 | """Compute this metric. 28 | 29 | Args: 30 | real_data (pandas.DataFrame): 31 | The values from the real dataset, passed as pandas.DataFrame 32 | with 2 columns. 33 | synthetic_data (pandas.DataFrame): 34 | The values from the synthetic dataset, passed as a 35 | pandas.DataFrame with 2 columns. 36 | 37 | Returns: 38 | float: 39 | Metric output. 40 | """ 41 | raise NotImplementedError() 42 | 43 | @classmethod 44 | def compute_breakdown(cls, real_data, synthetic_data): 45 | """Compute the breakdown of this metric. 46 | 47 | Args: 48 | real_data (pandas.DataFrame): 49 | The values from the real dataset, passed as pandas.DataFrame 50 | with 2 columns. 51 | synthetic_data (pandas.DataFrame): 52 | The values from the synthetic dataset, passed as a 53 | pandas.DataFrame with 2 columns. 54 | 55 | Returns: 56 | dict 57 | A mapping of the metric output. Must contain the key 'score'. 
58 | """ 59 | return {'score': cls.compute(real_data, synthetic_data)} 60 | -------------------------------------------------------------------------------- /sdmetrics/column_pairs/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Statistical Metrics to compare column pairs.""" 2 | 3 | from sdmetrics.column_pairs.statistical.cardinality_boundary_adherence import ( 4 | CardinalityBoundaryAdherence, 5 | ) 6 | from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity 7 | from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity 8 | from sdmetrics.column_pairs.statistical.kl_divergence import ( 9 | ContinuousKLDivergence, 10 | DiscreteKLDivergence, 11 | ) 12 | from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity 13 | from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS 14 | from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS 15 | 16 | __all__ = [ 17 | 'CardinalityBoundaryAdherence', 18 | 'ContingencySimilarity', 19 | 'ContinuousKLDivergence', 20 | 'CorrelationSimilarity', 21 | 'DiscreteKLDivergence', 22 | 'ReferentialIntegrity', 23 | 'InterRowMSAS', 24 | 'StatisticMSAS', 25 | ] 26 | -------------------------------------------------------------------------------- /sdmetrics/column_pairs/statistical/referential_integrity.py: -------------------------------------------------------------------------------- 1 | """Referential Integrity Metric.""" 2 | 3 | import logging 4 | 5 | import pandas as pd 6 | 7 | from sdmetrics.column_pairs.base import ColumnPairsMetric 8 | from sdmetrics.goal import Goal 9 | 10 | LOGGER = logging.getLogger(__name__) 11 | 12 | 13 | class ReferentialIntegrity(ColumnPairsMetric): 14 | """Referential Integrity metric. 15 | 16 | Compute the fraction of foreign key values that reference a value in the primary key column 17 | in the synthetic data. 18 | 19 | Attributes: 20 | name (str): 21 | Name to use when reports about this metric are printed. 22 | goal (sdmetrics.goal.Goal): 23 | The goal of this metric. 24 | min_value (Union[float, tuple[float]]): 25 | Minimum value or values that this metric can take. 26 | max_value (Union[float, tuple[float]]): 27 | Maximum value or values that this metric can take. 28 | """ 29 | 30 | name = 'ReferentialIntegrity' 31 | goal = Goal.MAXIMIZE 32 | min_value = 0.0 33 | max_value = 1.0 34 | 35 | @classmethod 36 | def compute_breakdown(cls, real_data, synthetic_data): 37 | """Compute the score breakdown of the referential integrity metric. 38 | 39 | Args: 40 | real_data (tuple of 2 pandas.Series): 41 | (primary_key, foreign_key) columns from the real data. 42 | synthetic_data (tuple of 2 pandas.Series): 43 | (primary_key, foreign_key) columns from the synthetic data. 44 | 45 | Returns: 46 | dict: 47 | The score breakdown of the key uniqueness metric. 48 | """ 49 | if pd.isna(real_data[1]).any(): 50 | synthetic_data = list(synthetic_data) 51 | synthetic_data[1] = synthetic_data[1].dropna() 52 | 53 | missing_parents = not real_data[1].isin(real_data[0]).all() 54 | if missing_parents: 55 | LOGGER.info("The real data has foreign keys that don't reference any primary key.") 56 | 57 | score = synthetic_data[1].isin(synthetic_data[0]).mean() 58 | 59 | return {'score': score} 60 | 61 | @classmethod 62 | def compute(cls, real_data, synthetic_data): 63 | """Compute the referential integrity of two columns. 
64 | 65 | Args: 66 | real_data (tuple of 2 pandas.Series): 67 | (primary_key, foreign_key) columns from the real data. 68 | synthetic_data (tuple of 2 pandas.Series): 69 | (primary_key, foreign_key) columns from the synthetic data. 70 | 71 | Returns: 72 | float: 73 | The key uniqueness of the two columns. 74 | """ 75 | return cls.compute_breakdown(real_data, synthetic_data)['score'] 76 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "users": { 4 | "primary_key": "user_id", 5 | "columns": { 6 | "user_id": { 7 | "sdtype": "id", 8 | "regex_format": "\\d{30}" 9 | }, 10 | "country": { 11 | "sdtype": "categorical" 12 | }, 13 | "gender": { 14 | "sdtype": "categorical" 15 | }, 16 | "age": { 17 | "sdtype": "numerical", 18 | "computer_representation": "Int64" 19 | } 20 | } 21 | }, 22 | "sessions": { 23 | "primary_key": "session_id", 24 | "columns": { 25 | "session_id": { 26 | "sdtype": "id", 27 | "regex_format": "\\d{30}" 28 | }, 29 | "user_id": { 30 | "sdtype": "id", 31 | "regex_format": "\\d{30}" 32 | }, 33 | "device": { 34 | "sdtype": "categorical" 35 | }, 36 | "os": { 37 | "sdtype": "categorical" 38 | } 39 | } 40 | }, 41 | "transactions": { 42 | "primary_key": "transaction_id", 43 | "columns": { 44 | "transaction_id": { 45 | "sdtype": "id", 46 | "regex_format": "\\d{30}" 47 | }, 48 | "session_id": { 49 | "sdtype": "id", 50 | "regex_format": "\\d{30}" 51 | }, 52 | "timestamp": { 53 | "sdtype": "datetime", 54 | "datetime_format": "%Y-%m-%d %H:%M:%S" 55 | }, 56 | "amount": { 57 | "sdtype": "numerical", 58 | "computer_representation": "Float" 59 | }, 60 | "approved": { 61 | "sdtype": "boolean" 62 | } 63 | } 64 | } 65 | }, 66 | "relationships": [ 67 | { 68 | "parent_table_name": "users", 69 | "parent_primary_key": "user_id", 70 | "child_table_name": "sessions", 71 | "child_foreign_key": "user_id" 72 | }, 73 | { 74 | "parent_table_name": "sessions", 75 | "parent_primary_key": "session_id", 76 | "child_table_name": "transactions", 77 | "child_foreign_key": "session_id" 78 | } 79 | ], 80 | "METADATA_SPEC_VERSION": "MULTI_TABLE_V1" 81 | } -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/sessions_real.csv: -------------------------------------------------------------------------------- 1 | session_id,user_id,device,os 2 | 0,0,mobile,android 3 | 1,1,tablet,ios 4 | 2,1,tablet,android 5 | 3,2,mobile,android 6 | 4,4,mobile,ios 7 | 5,5,mobile,android 8 | 6,6,mobile,ios 9 | 7,6,tablet,ios 10 | 8,6,mobile,ios 11 | 9,8,tablet,ios 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/sessions_synthetic.csv: -------------------------------------------------------------------------------- 1 | session_id,user_id,device,os 2 | 0,1,mobile,ios 3 | 1,4,mobile,android 4 | 2,0,mobile,ios 5 | 3,8,mobile,ios 6 | 4,9,tablet,android 7 | 5,5,tablet,ios 8 | 6,9,mobile,ios 9 | 7,8,mobile,ios 10 | 8,3,mobile,android 11 | 9,8,mobile,ios 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/transactions_real.csv: -------------------------------------------------------------------------------- 1 | transaction_id,session_id,timestamp,amount,approved 2 | 0,0,2019-01-01 12:34:32,100.0,True 3 | 1,0,2019-01-01 12:42:21,55.3,True 4 | 2,1,2019-01-07 17:23:11,79.5,True 5 | 
3,3,2019-01-10 11:08:57,112.1,False 6 | 4,5,2019-01-10 21:54:08,110.0,False 7 | 5,5,2019-01-11 11:21:20,76.3,True 8 | 6,7,2019-01-22 14:44:10,89.5,True 9 | 7,8,2019-01-23 10:14:09,132.1,False 10 | 8,9,2019-01-27 16:09:17,68.0,True 11 | 9,9,2019-01-29 12:10:48,99.9,True 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/transactions_synthetic.csv: -------------------------------------------------------------------------------- 1 | transaction_id,session_id,timestamp,amount,approved 2 | 0,4,2019-01-25 18:21:18,115.2,True 3 | 1,1,2019-01-08 09:17:33,75.0,True 4 | 2,8,2019-01-26 05:55:58,77.7,True 5 | 3,2,2019-01-12 14:32:23,102.3,True 6 | 4,7,2019-01-10 00:55:37,75.2,True 7 | 5,7,2019-01-20 03:21:23,72.7,True 8 | 6,9,2019-01-13 16:09:43,81.9,True 9 | 7,3,2019-01-16 17:56:36,73.7,True 10 | 8,6,2019-01-04 22:04:02,120.6,False 11 | 9,1,2019-01-11 01:00:26,110.9,False 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/users_real.csv: -------------------------------------------------------------------------------- 1 | user_id,country,gender,age 2 | 0,US,M,34 3 | 1,UK,F,23 4 | 2,ES,,44 5 | 3,UK,M,22 6 | 4,US,F,54 7 | 5,DE,M,57 8 | 6,BG,F,45 9 | 7,ES,,41 10 | 8,FR,F,23 11 | 9,UK,,30 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/users_synthetic.csv: -------------------------------------------------------------------------------- 1 | user_id,country,gender,age 2 | 0,UK,M,47 3 | 1,UK,,29 4 | 2,US,F,41 5 | 3,ES,F,36 6 | 4,US,F,42 7 | 5,US,F,45 8 | 6,ES,F,32 9 | 7,UK,F,35 10 | 8,DE,F,28 11 | 9,ES,F,34 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/single_table/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "primary_key": "student_id", 3 | "columns": { 4 | "start_date": { 5 | "sdtype": "datetime", 6 | "datetime_format": "%Y-%m-%d" 7 | }, 8 | "end_date": { 9 | "sdtype": "datetime", 10 | "datetime_format": "%Y-%m-%d" 11 | }, 12 | "salary": { 13 | "sdtype": "numerical", 14 | "computer_representation": "Int64" 15 | }, 16 | "duration": { 17 | "sdtype": "numerical", 18 | "computer_representation": "Int64" 19 | }, 20 | "student_id": { 21 | "sdtype": "id", 22 | "regex_format": "\\d{30}" 23 | }, 24 | "high_perc": { 25 | "sdtype": "numerical", 26 | "computer_representation": "Float" 27 | }, 28 | "high_spec": { 29 | "sdtype": "categorical" 30 | }, 31 | "mba_spec": { 32 | "sdtype": "categorical" 33 | }, 34 | "second_perc": { 35 | "sdtype": "numerical", 36 | "computer_representation": "Float" 37 | }, 38 | "gender": { 39 | "sdtype": "categorical" 40 | }, 41 | "degree_perc": { 42 | "sdtype": "numerical", 43 | "computer_representation": "Float" 44 | }, 45 | "placed": { 46 | "sdtype": "boolean" 47 | }, 48 | "experience_years": { 49 | "sdtype": "numerical", 50 | "computer_representation": "Float" 51 | }, 52 | "employability_perc": { 53 | "sdtype": "numerical", 54 | "computer_representation": "Float" 55 | }, 56 | "mba_perc": { 57 | "sdtype": "numerical", 58 | "computer_representation": "Float" 59 | }, 60 | "work_experience": { 61 | "sdtype": "boolean" 62 | }, 63 | "degree_type": { 64 | "sdtype": "categorical" 65 | } 66 | }, 67 | "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1" 68 | } -------------------------------------------------------------------------------- /sdmetrics/demos/timeseries/metadata.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "columns": { 3 | "region": { 4 | "sdtype": "categorical" 5 | }, 6 | "store_id": { 7 | "sdtype": "numerical", 8 | "computer_representation": "Int64" 9 | }, 10 | "nb_customers": { 11 | "sdtype": "numerical", 12 | "computer_representation": "Int64" 13 | }, 14 | "total_sales": { 15 | "sdtype": "numerical", 16 | "computer_representation": "Float" 17 | }, 18 | "date": { 19 | "sdtype": "datetime" 20 | }, 21 | "day_of_week": { 22 | "sdtype": "numerical", 23 | "computer_representation": "Int64" 24 | } 25 | }, 26 | "sequence_key": "store_id", 27 | "sequence_index": "date", 28 | "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1" 29 | } -------------------------------------------------------------------------------- /sdmetrics/errors.py: -------------------------------------------------------------------------------- 1 | """Custom errors for SDMetrics.""" 2 | 3 | 4 | class VisualizationUnavailableError(Exception): 5 | """Raised when a visualization is not available.""" 6 | 7 | 8 | class IncomputableMetricError(Exception): 9 | """Raised when a metric cannot be computed.""" 10 | 11 | 12 | class ConstantInputError(Exception): 13 | """Thrown when the input data has all the same values.""" 14 | 15 | 16 | class InvalidDataError(Exception): 17 | """Error to raise when data is not valid.""" 18 | -------------------------------------------------------------------------------- /sdmetrics/goal.py: -------------------------------------------------------------------------------- 1 | """SDMetrics Goal Enumeration.""" 2 | 3 | from enum import Enum 4 | 5 | 6 | class Goal(Enum): 7 | """Goal Enumeration. 8 | 9 | This enumerates the ``goal`` for a metric; the value of a metric can be ignored, 10 | minimized, or maximized. 
11 | """ 12 | 13 | IGNORE = 'ignore' 14 | MAXIMIZE = 'maximize' 15 | MINIMIZE = 'minimize' 16 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """Metrics for multi table datasets.""" 2 | 3 | from sdmetrics.multi_table import detection, multi_single_table 4 | from sdmetrics.multi_table.base import MultiTableMetric 5 | from sdmetrics.multi_table.detection.base import DetectionMetric 6 | from sdmetrics.multi_table.detection.parent_child import ( 7 | LogisticParentChildDetection, 8 | ParentChildDetectionMetric, 9 | SVCParentChildDetection, 10 | ) 11 | from sdmetrics.multi_table.multi_single_table import ( 12 | BNLikelihood, 13 | BNLogLikelihood, 14 | BoundaryAdherence, 15 | CategoryCoverage, 16 | ContingencySimilarity, 17 | CorrelationSimilarity, 18 | CSTest, 19 | KSComplement, 20 | LogisticDetection, 21 | MissingValueSimilarity, 22 | MultiSingleTableMetric, 23 | NewRowSynthesis, 24 | RangeCoverage, 25 | StatisticSimilarity, 26 | SVCDetection, 27 | TVComplement, 28 | ) 29 | from sdmetrics.multi_table.statistical.cardinality_shape_similarity import ( 30 | CardinalityShapeSimilarity, 31 | ) 32 | from sdmetrics.multi_table.statistical.cardinality_statistic_similarity import ( 33 | CardinalityStatisticSimilarity, 34 | ) 35 | 36 | __all__ = [ 37 | 'detection', 38 | 'multi_single_table', 39 | 'MultiTableMetric', 40 | 'DetectionMetric', 41 | 'ParentChildDetectionMetric', 42 | 'LogisticParentChildDetection', 43 | 'SVCParentChildDetection', 44 | 'BNLikelihood', 45 | 'BNLogLikelihood', 46 | 'CSTest', 47 | 'KSComplement', 48 | 'LogisticDetection', 49 | 'SVCDetection', 50 | 'MultiSingleTableMetric', 51 | 'CardinalityShapeSimilarity', 52 | 'CardinalityStatisticSimilarity', 53 | 'BoundaryAdherence', 54 | 'CategoryCoverage', 55 | 'CorrelationSimilarity', 56 | 'ContingencySimilarity', 57 | 'MissingValueSimilarity', 58 | 'StatisticSimilarity', 59 | 'TVComplement', 60 | 'RangeCoverage', 61 | 'NewRowSynthesis', 62 | ] 63 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/base.py: -------------------------------------------------------------------------------- 1 | """Base Multi Table metric class.""" 2 | 3 | from sdmetrics.base import BaseMetric 4 | 5 | 6 | class MultiTableMetric(BaseMetric): 7 | """Base class for metrics that apply to multiple tables. 8 | 9 | Attributes: 10 | name (str): 11 | Name to use when reports about this metric are printed. 12 | goal (sdmetrics.goal.Goal): 13 | The goal of this metric. 14 | min_value (Union[float, tuple[float]]): 15 | Minimum value or values that this metric can take. 16 | max_value (Union[float, tuple[float]]): 17 | Maximum value or values that this metric can take. 18 | """ 19 | 20 | name = None 21 | goal = None 22 | min_value = None 23 | max_value = None 24 | 25 | @staticmethod 26 | def compute(real_data, synthetic_data, metadata=None): 27 | """Compute this metric. 28 | 29 | Args: 30 | real_data (dict[str, pandas.DataFrame]): 31 | The tables from the real dataset, passed as a dictionary of 32 | table names and pandas.DataFrames. 33 | synthetic_data (dict[str, pandas.DataFrame]): 34 | The tables from the synthetic dataset, passed as a dictionary of 35 | table names and pandas.DataFrames. 36 | metadata (dict): 37 | Multi-table metadata dict. If not passed, it is build based on the 38 | real_data fields and dtypes. 
39 | 40 | Returns: 41 | Union[float, tuple[float]]: 42 | Metric output. 43 | """ 44 | raise NotImplementedError() 45 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/detection/__init__.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Detection metrics that work on multiple tables.""" 2 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/detection/base.py: -------------------------------------------------------------------------------- 1 | """Base class for Machine Learning Detection metrics that work on multiple tables.""" 2 | 3 | from sdmetrics.multi_table.base import MultiTableMetric 4 | 5 | 6 | class DetectionMetric(MultiTableMetric): 7 | """Base class for Machine Learning Detection based metrics on multiple tables. 8 | 9 | These metrics build a Machine Learning Classifier that learns to tell the synthetic 10 | data apart from the real data, which later on is evaluated using Cross Validation. 11 | 12 | The output of the metric is one minus the average ROC AUC score obtained. 13 | 14 | Attributes: 15 | name (str): 16 | Name to use when reports about this metric are printed. 17 | goal (sdmetrics.goal.Goal): 18 | The goal of this metric. 19 | min_value (Union[float, tuple[float]]): 20 | Minimum value or values that this metric can take. 21 | max_value (Union[float, tuple[float]]): 22 | Maximum value or values that this metric can take. 23 | """ 24 | 25 | name = None 26 | goal = None 27 | min_value = None 28 | max_value = None 29 | 30 | @classmethod 31 | def compute(cls, real_data, synthetic_data, metadata=None): 32 | """Compute this metric. 33 | 34 | Args: 35 | real_data (dict[str, pandas.DataFrame]): 36 | The tables from the real dataset. 37 | synthetic_data (dict[str, pandas.DataFrame]): 38 | The tables from the synthetic dataset. 39 | metadata (dict): 40 | Multi-table metadata dict. If not passed, it is build based on the 41 | real_data fields and dtypes. 42 | 43 | Returns: 44 | Union[float, tuple[float]]: 45 | Metric output. 46 | """ 47 | raise NotImplementedError() 48 | 49 | @classmethod 50 | def normalize(cls, raw_score): 51 | """Return the `raw_score` as is, since it is already normalized. 52 | 53 | Args: 54 | raw_score (float): 55 | The value of the metric from `compute`. 
56 | 57 | Returns: 58 | float: 59 | The normalized value of the metric 60 | """ 61 | return super().normalize(raw_score) 62 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Multi table statistical metrics.""" 2 | 3 | from sdmetrics.multi_table.statistical.cardinality_shape_similarity import ( 4 | CardinalityShapeSimilarity, 5 | ) 6 | from sdmetrics.multi_table.statistical.cardinality_statistic_similarity import ( 7 | CardinalityStatisticSimilarity, 8 | ) 9 | 10 | __all__ = ['CardinalityShapeSimilarity', 'CardinalityStatisticSimilarity'] 11 | -------------------------------------------------------------------------------- /sdmetrics/reports/__init__.py: -------------------------------------------------------------------------------- 1 | """Reports for sdmetrics.""" 2 | 3 | from sdmetrics.reports.multi_table import DiagnosticReport as MultiTableDiagnosticReport 4 | from sdmetrics.reports.multi_table import QualityReport as MultiTableQualityReport 5 | from sdmetrics.reports.single_table import DiagnosticReport as SingleTableDiagnosticReport 6 | from sdmetrics.reports.single_table import QualityReport as SingleTableQualityReport 7 | 8 | __all__ = [ 9 | 'SingleTableQualityReport', 10 | 'SingleTableDiagnosticReport', 11 | 'MultiTableQualityReport', 12 | 'MultiTableDiagnosticReport', 13 | ] 14 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """Multi table reports for sdmetrics.""" 2 | 3 | from sdmetrics.reports.multi_table.diagnostic_report import DiagnosticReport 4 | from sdmetrics.reports.multi_table.quality_report import QualityReport 5 | 6 | __all__ = [ 7 | 'DiagnosticReport', 8 | 'QualityReport', 9 | ] 10 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """Multi table properties for sdmetrics.""" 2 | 3 | from sdmetrics.reports.multi_table._properties.base import BaseMultiTableProperty 4 | from sdmetrics.reports.multi_table._properties.boundary import Boundary 5 | from sdmetrics.reports.multi_table._properties.cardinality import Cardinality 6 | from sdmetrics.reports.multi_table._properties.column_pair_trends import ColumnPairTrends 7 | from sdmetrics.reports.multi_table._properties.column_shapes import ColumnShapes 8 | from sdmetrics.reports.multi_table._properties.coverage import Coverage 9 | from sdmetrics.reports.multi_table._properties.data_validity import DataValidity 10 | from sdmetrics.reports.multi_table._properties.inter_table_trends import InterTableTrends 11 | from sdmetrics.reports.multi_table._properties.relationship_validity import RelationshipValidity 12 | from sdmetrics.reports.multi_table._properties.structure import Structure 13 | from sdmetrics.reports.multi_table._properties.synthesis import Synthesis 14 | 15 | __all__ = [ 16 | 'BaseMultiTableProperty', 17 | 'Boundary', 18 | 'Cardinality', 19 | 'ColumnShapes', 20 | 'ColumnPairTrends', 21 | 'Coverage', 22 | 'InterTableTrends', 23 | 'Synthesis', 24 | 'Structure', 25 | 'DataValidity', 26 | 'RelationshipValidity', 27 | ] 28 | -------------------------------------------------------------------------------- 
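For orientation, here is a minimal usage sketch of the two multi-table reports exported above. It is not taken from the repository itself: it assumes that `sdmetrics/demos.py` exposes a `load_demo(modality='multi_table')` helper returning the demo tables shipped under `sdmetrics/demos/multi_table/` together with their metadata, and that both report classes follow a `generate()` / `get_score()` interface defined in `reports/base_report.py`; check those modules for the exact names and signatures.

```python
# Hedged sketch: `load_demo` and the generate()/get_score() interface are assumptions
# based on the files listed in this repository, not verbatim API documentation.
from sdmetrics.demos import load_demo
from sdmetrics.reports.multi_table import DiagnosticReport, QualityReport

# Demo data: the users/sessions/transactions tables and their multi-table metadata.
real_data, synthetic_data, metadata = load_demo(modality='multi_table')

# Diagnostic report: data validity, data structure and relationship validity properties.
diagnostic = DiagnosticReport()
diagnostic.generate(real_data, synthetic_data, metadata)
print(diagnostic.get_score())

# Quality report: column shapes, column pair trends, cardinality and inter-table trends.
quality = QualityReport()
quality.generate(real_data, synthetic_data, metadata)
print(quality.get_score())
```

A per-property breakdown and plots are typically available through methods such as `get_details()` and `get_visualization()` on the same report objects (see `base_report.py` and the `_properties` classes below).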
/sdmetrics/reports/multi_table/_properties/boundary.py: -------------------------------------------------------------------------------- 1 | """Boundary property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import Boundary as SingleTableBoundary 5 | 6 | 7 | class Boundary(BaseMultiTableProperty): 8 | """Boundary property class for multi-table. 9 | 10 | This property assesses the boundary adherence of the synthetic data over the real data. 11 | The ``BoundaryAdherence`` metric is computed column-wise and the final score is the average 12 | over all columns. This metric is computed over numerical and datetime columns only. 13 | The other column types are ignored by this property. 14 | """ 15 | 16 | _single_table_property = SingleTableBoundary 17 | _num_iteration_case = 'column' 18 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/column_pair_trends.py: -------------------------------------------------------------------------------- 1 | """Column pair trends property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import ( 5 | ColumnPairTrends as SingleTableColumnPairTrends, 6 | ) 7 | 8 | 9 | class ColumnPairTrends(BaseMultiTableProperty): 10 | """Column pair trends property for multi-table. 11 | 12 | This property evaluates the matching in trends between pairs of real 13 | and synthetic data columns. Each pair's correlation is calculated and 14 | the final score represents the average of these measures across all column pairs 15 | """ 16 | 17 | _single_table_property = SingleTableColumnPairTrends 18 | _num_iteration_case = 'column_pair' 19 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/column_shapes.py: -------------------------------------------------------------------------------- 1 | """Column shapes property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import ColumnShapes as SingleTableColumnShapes 5 | 6 | 7 | class ColumnShapes(BaseMultiTableProperty): 8 | """Column Shapes property class for multi-table. 9 | 10 | This property assesses the shape similarity between the real and synthetic data. 11 | A metric score is computed column-wise and the final score is the average over all columns. 12 | The KSComplement metric is used for numerical and datetime columns while the TVComplement 13 | is used for categorical and boolean columns. 14 | The other column types are ignored by this property. 15 | """ 16 | 17 | _single_table_property = SingleTableColumnShapes 18 | _num_iteration_case = 'column' 19 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/coverage.py: -------------------------------------------------------------------------------- 1 | """Coverage property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import Coverage as SingleTableCoverage 5 | 6 | 7 | class Coverage(BaseMultiTableProperty): 8 | """Coverage property class for multi-table. 9 | 10 | This property assesses data coverage between the real and synthetic data. 
11 | A metric score is computed column-wise and the final score is the average over all columns. 12 | The ``RangeCoverage`` metric is used for numerical and datetime columns while the 13 | ``CategoryCoverage`` is used for categorical and boolean columns. 14 | The other column types are ignored by this property. 15 | """ 16 | 17 | _single_table_property = SingleTableCoverage 18 | _num_iteration_case = 'column' 19 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/data_validity.py: -------------------------------------------------------------------------------- 1 | """Data validity property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import DataValidity as SingleTableDataValidity 5 | 6 | 7 | class DataValidity(BaseMultiTableProperty): 8 | """Data Validity property class for multi-table. 9 | 10 | This property computes, at base, whether each column contains valid data. 11 | The metric is based on the data type of each column. 12 | A metric score is computed column-wise and the final score is the average over all columns. 13 | The BoundaryAdherence metric is used for numerical and datetime columns, the CategoryAdherence 14 | is used for categorical and boolean columns and the KeyUniqueness for primary and 15 | alternate keys. The other column types are ignored by this property. 16 | """ 17 | 18 | _single_table_property = SingleTableDataValidity 19 | _num_iteration_case = 'column' 20 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/structure.py: -------------------------------------------------------------------------------- 1 | """Structure property for multi-table.""" 2 | 3 | import plotly.express as px 4 | 5 | from sdmetrics.errors import VisualizationUnavailableError 6 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 7 | from sdmetrics.reports.single_table._properties import Structure as SingleTableStructure 8 | from sdmetrics.reports.utils import PlotConfig 9 | 10 | 11 | class Structure(BaseMultiTableProperty): 12 | """Structure property class for multi-table. 13 | 14 | This property checks to see whether the overall structure of the synthetic 15 | data is the same as the real data. The property is calculated for each table. 16 | """ 17 | 18 | _single_table_property = SingleTableStructure 19 | _num_iteration_case = 'table' 20 | 21 | def get_visualization(self, table_name=None): 22 | """Return a visualization for each score in the property. 23 | 24 | Args: 25 | table_name: 26 | If a table name is provided, an error is raised. 27 | 28 | Returns: 29 | plotly.graph_objects._figure.Figure 30 | The visualization for the property. 31 | """ 32 | if table_name: 33 | raise VisualizationUnavailableError( 34 | 'The Structure property does not have a supported visualization for' 35 | ' individual tables.'
36 | ) 37 | 38 | average_score = self._compute_average() 39 | fig = px.bar( 40 | data_frame=self.details, 41 | x='Table', 42 | y='Score', 43 | title=f'Data Diagnostic: Structure (Average Score={average_score})', 44 | category_orders={'group': list(self.details['Table'])}, 45 | color='Metric', 46 | color_discrete_map={ 47 | 'TableStructure': PlotConfig.DATACEBO_DARK, 48 | }, 49 | pattern_shape='Metric', 50 | pattern_shape_sequence=[''], 51 | hover_name='Table', 52 | hover_data={ 53 | 'Table': False, 54 | 'Metric': True, 55 | 'Score': True, 56 | }, 57 | ) 58 | 59 | fig.update_yaxes(range=[0, 1]) 60 | 61 | fig.update_layout( 62 | xaxis_categoryorder='total ascending', 63 | plot_bgcolor=PlotConfig.BACKGROUND_COLOR, 64 | margin={'t': 150}, 65 | font={'size': PlotConfig.FONT_SIZE}, 66 | ) 67 | 68 | return fig 69 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/synthesis.py: -------------------------------------------------------------------------------- 1 | """Synthesis property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import Synthesis as SingleTableSynthesis 5 | 6 | 7 | class Synthesis(BaseMultiTableProperty): 8 | """Synthesis property class for multi-table. 9 | 10 | This property assesses the novelty of the synthetic data over the real data. 11 | The ``NewRowSynthesis`` metric is computed over the real and synthetic data for each table 12 | to score the proportion of new rows in the synthetic data. 13 | The final score is the average over all tables. 14 | """ 15 | 16 | _single_table_property = SingleTableSynthesis 17 | _num_iteration_case = 'table' 18 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/diagnostic_report.py: -------------------------------------------------------------------------------- 1 | """Multi table diagnostic report.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import DataValidity, RelationshipValidity, Structure 4 | from sdmetrics.reports.multi_table.base_multi_table_report import BaseMultiTableReport 5 | 6 | 7 | class DiagnosticReport(BaseMultiTableReport): 8 | """Multi table diagnostic report. 9 | 10 | This class creates a diagnostic report for multi-table data. It calculates the diagnostic 11 | score along three properties - Relationship Validity, Data Structure, and Data Validity. 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self._properties = { 17 | 'Data Validity': DataValidity(), 18 | 'Data Structure': Structure(), 19 | 'Relationship Validity': RelationshipValidity(), 20 | } 21 | 22 | def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata): 23 | self._validate_relationships(real_data, synthetic_data, metadata) 24 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/quality_report.py: -------------------------------------------------------------------------------- 1 | """Multi table quality report.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import ( 4 | Cardinality, 5 | ColumnPairTrends, 6 | ColumnShapes, 7 | InterTableTrends, 8 | ) 9 | from sdmetrics.reports.multi_table.base_multi_table_report import BaseMultiTableReport 10 | 11 | 12 | class QualityReport(BaseMultiTableReport): 13 | """Multi table quality report. 14 | 15 | This class creates a quality report for multi-table data.
It calculates the quality 16 | score along four properties - Column Shapes, Column Pair Trends, Cardinality, and Intertable Trends. 17 | """ 18 | 19 | def __init__(self): 20 | super().__init__() 21 | self._properties = { 22 | 'Column Shapes': ColumnShapes(), 23 | 'Column Pair Trends': ColumnPairTrends(), 24 | 'Cardinality': Cardinality(), 25 | 'Intertable Trends': InterTableTrends(), 26 | } 27 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """Single table reports for sdmetrics.""" 2 | 3 | from sdmetrics.reports.single_table.diagnostic_report import DiagnosticReport 4 | from sdmetrics.reports.single_table.quality_report import QualityReport 5 | 6 | __all__ = [ 7 | 'DiagnosticReport', 8 | 'QualityReport', 9 | ] 10 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """Single table properties for sdmetrics.""" 2 | 3 | from sdmetrics.reports.single_table._properties.base import BaseSingleTableProperty 4 | from sdmetrics.reports.single_table._properties.boundary import Boundary 5 | from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends 6 | from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes 7 | from sdmetrics.reports.single_table._properties.coverage import Coverage 8 | from sdmetrics.reports.single_table._properties.data_validity import DataValidity 9 | from sdmetrics.reports.single_table._properties.structure import Structure 10 | from sdmetrics.reports.single_table._properties.synthesis import Synthesis 11 | 12 | __all__ = [ 13 | 'BaseSingleTableProperty', 14 | 'ColumnShapes', 15 | 'ColumnPairTrends', 16 | 'Coverage', 17 | 'Boundary', 18 | 'Synthesis', 19 | 'Structure', 20 | 'DataValidity', 21 | ] 22 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/_properties/base.py: -------------------------------------------------------------------------------- 1 | """Single table base property class.""" 2 | 3 | import pandas as pd 4 | 5 | 6 | class BaseSingleTableProperty: 7 | """Base class for single table properties. 8 | 9 | A property is a higher-level concept for a class that loops through all the base-level data 10 | and applies different base-level metrics based on the data type.
11 | """ 12 | 13 | _num_iteration_case = None 14 | 15 | def __init__(self): 16 | self.details = pd.DataFrame() 17 | 18 | def _compute_average(self): 19 | """Average the scores for each column.""" 20 | if not isinstance(self.details, pd.DataFrame) or 'Score' not in self.details.columns: 21 | raise ValueError("The property details must be a DataFrame with a 'Score' column.") 22 | 23 | return self.details['Score'].mean() 24 | 25 | def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None): 26 | """Generate the _details dataframe for the property.""" 27 | raise NotImplementedError() 28 | 29 | def _get_num_iterations(self, metadata): 30 | """Get the number of iterations for the property.""" 31 | if self._num_iteration_case == 'column': 32 | return len(metadata['columns']) 33 | elif self._num_iteration_case == 'table': 34 | return 1 35 | elif self._num_iteration_case == 'column_pair': 36 | return int(len(metadata['columns']) * (len(metadata['columns']) - 1) / 2) 37 | 38 | def get_score(self, real_data, synthetic_data, metadata, progress_bar=None): 39 | """Get the average score for the property on the data. 40 | 41 | Args: 42 | real_data (pandas.DataFrame): 43 | The real data. 44 | synthetic_data (pandas.DataFrame): 45 | The synthetic data. 46 | metadata (dict): 47 | The metadata, which contains each column's data type as well as relationships. 48 | progress_bar (tqdm.tqdm or None): 49 | The progress bar object. Defaults to None. 50 | 51 | Returns: 52 | float: 53 | The average score for the property. 54 | """ 55 | self.details = self._generate_details(real_data, synthetic_data, metadata, progress_bar) 56 | return self._compute_average() 57 | 58 | def get_visualization(self): 59 | """Return a visualization for each score in the property. 60 | 61 | Returns: 62 | plotly.graph_objects._figure.Figure 63 | The visualization for the property. 64 | """ 65 | raise NotImplementedError() 66 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/_properties/structure.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sdmetrics.errors import VisualizationUnavailableError 5 | from sdmetrics.reports.single_table._properties import BaseSingleTableProperty 6 | from sdmetrics.single_table import TableStructure 7 | 8 | 9 | class Structure(BaseSingleTableProperty): 10 | """Structure property class for single table. 11 | 12 | This property checks to see whether the overall structure of the synthetic 13 | data is the same as the real data. 14 | """ 15 | 16 | _num_iteration_case = 'table' 17 | 18 | def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None): 19 | """Generate the _details dataframe for the structure property. 20 | 21 | Args: 22 | real_data (pandas.DataFrame): 23 | The real data. 24 | synthetic_data (pandas.DataFrame): 25 | The synthetic data. 26 | metadata (dict): 27 | The metadata of the table 28 | progress_bar (tqdm.tqdm or None): 29 | The progress bar to use. Defaults to None. 
30 | 31 | Returns: 32 | pandas.DataFrame 33 | """ 34 | try: 35 | score = TableStructure.compute(real_data, synthetic_data) 36 | error_message = None 37 | 38 | except Exception as e: 39 | score = np.nan 40 | error_message = f'{type(e).__name__}: {e}' 41 | 42 | finally: 43 | if progress_bar: 44 | progress_bar.update() 45 | 46 | result = pd.DataFrame( 47 | { 48 | 'Metric': 'TableStructure', 49 | 'Score': score, 50 | 'Error': error_message, 51 | }, 52 | index=[0], 53 | ) 54 | 55 | if result['Error'].isna().all(): 56 | result = result.drop('Error', axis=1) 57 | 58 | return result 59 | 60 | def get_visualization(self): 61 | """Return the visualization for the property. 62 | 63 | Raise an error in this case because the single table Structure property 64 | does not have a supported visualization. 65 | """ 66 | raise VisualizationUnavailableError( 67 | 'The single table Structure property does not have a supported visualization.' 68 | ) 69 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/diagnostic_report.py: -------------------------------------------------------------------------------- 1 | """Single table diagnostic report.""" 2 | 3 | from sdmetrics.reports.base_report import BaseReport 4 | from sdmetrics.reports.single_table._properties import DataValidity, Structure 5 | 6 | 7 | class DiagnosticReport(BaseReport): 8 | """Single table diagnostic report. 9 | 10 | This class creates a diagnostic report for single-table data. It calculates the diagnostic 11 | score along two properties - Data Structure and Data Validity. 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self._properties = { 17 | 'Data Validity': DataValidity(), 18 | 'Data Structure': Structure(), 19 | } 20 | 21 | def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata): 22 | return 23 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/quality_report.py: -------------------------------------------------------------------------------- 1 | """Single table quality report.""" 2 | 3 | from sdmetrics.reports.base_report import BaseReport 4 | from sdmetrics.reports.single_table._properties import ColumnPairTrends, ColumnShapes 5 | 6 | 7 | class QualityReport(BaseReport): 8 | """Single table quality report. 9 | 10 | This class creates a quality report for single-table data. It calculates the quality 11 | score along two properties - Column Shapes and Column Pair Trends. 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self._properties = { 17 | 'Column Shapes': ColumnShapes(), 18 | 'Column Pair Trends': ColumnPairTrends(), 19 | } 20 | -------------------------------------------------------------------------------- /sdmetrics/single_column/README.md: -------------------------------------------------------------------------------- 1 | # Single Column Metrics 2 | 3 | The metrics found in this folder operate on individual columns (or univariate random variables), 4 | passed as two 1-dimensional arrays. 5 | 6 | Implemented metrics: 7 | 8 | * Statistical: Metrics that compare the arrays using statistical tests 9 | * `CSTest`: Chi-Squared test to compare the distributions of two categorical columns. 10 | * `KSComplement`: Complement to the Kolmogorov-Smirnov statistic to compare the distributions 11 | of two numerical columns using their empirical CDF.
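Both of the above can be called directly on two 1-dimensional arrays or `pandas.Series`; a minimal, self-contained sketch with toy values (not the demo data used in the examples below):

```python3
import pandas as pd

from sdmetrics.single_column import CSTest, KSComplement

# Categorical columns: CSTest returns the Chi-Squared p-value, in [0, 1]
real_categories = pd.Series(['a', 'b', 'b', 'c', 'a', 'b'])
synthetic_categories = pd.Series(['a', 'b', 'c', 'c', 'b', 'b'])
CSTest.compute(real_categories, synthetic_categories)

# Numerical columns: KSComplement returns 1 - KS D statistic, in [0, 1]
real_numbers = pd.Series([1.1, 2.4, 2.6, 3.0])
synthetic_numbers = pd.Series([1.0, 2.5, 2.7, 3.3])
KSComplement.compute(real_numbers, synthetic_numbers)
```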
12 | 13 | ## SingleColumnMetric 14 | 15 | All the single column metrics are subclasses of the `sdmetrics.single_column.SingleColumnMetric` 16 | class, which can be used to locate all of them: 17 | 18 | ```python3 19 | In [1]: from sdmetrics.single_column import SingleColumnMetric 20 | 21 | In [2]: SingleColumnMetric.get_subclasses() 22 | Out[2]: 23 | {'CSTest': sdmetrics.single_column.statistical.cstest.CSTest, 24 | 'KSComplement': sdmetrics.single_column.statistical.kscomplement.KSComplement} 25 | ``` 26 | 27 | ## Single Column Inputs and Outputs 28 | 29 | All the single column metrics operate on just two inputs: 30 | 31 | * `real_data`: A 1d numpy array, coming from the real dataset. 32 | * `synthetic_data`: A 1d numpy array, coming from the synthetic dataset. 33 | 34 | For example, this is how the KSComplement metric can be computed for the `age` column 35 | from the demo data: 36 | 37 | ```python3 38 | In [3]: from sdmetrics import load_demo 39 | 40 | In [4]: real_data, synthetic_data, metadata = load_demo() 41 | 42 | In [5]: from sdmetrics.single_column import KSComplement 43 | 44 | In [6]: real_column = real_data['users']['age'].to_numpy() 45 | 46 | In [7]: synthetic_column = synthetic_data['users']['age'].to_numpy() 47 | 48 | In [8]: KSComplement.compute(real_column, synthetic_column) 49 | Out[8]: 0.8 50 | ``` 51 | -------------------------------------------------------------------------------- /sdmetrics/single_column/__init__.py: -------------------------------------------------------------------------------- 1 | """Metrics for Single columns.""" 2 | 3 | from sdmetrics.single_column import base 4 | from sdmetrics.single_column.base import SingleColumnMetric 5 | from sdmetrics.single_column.statistical.boundary_adherence import BoundaryAdherence 6 | from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence 7 | from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage 8 | from sdmetrics.single_column.statistical.cstest import CSTest 9 | from sdmetrics.single_column.statistical.key_uniqueness import KeyUniqueness 10 | from sdmetrics.single_column.statistical.kscomplement import KSComplement 11 | from sdmetrics.single_column.statistical.missing_value_similarity import MissingValueSimilarity 12 | from sdmetrics.single_column.statistical.range_coverage import RangeCoverage 13 | from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity 14 | from sdmetrics.single_column.statistical.tv_complement import TVComplement 15 | from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity 16 | 17 | __all__ = [ 18 | 'base', 19 | 'SingleColumnMetric', 20 | 'BoundaryAdherence', 21 | 'CategoryCoverage', 22 | 'CategoryAdherence', 23 | 'CSTest', 24 | 'KeyUniqueness', 25 | 'KSComplement', 26 | 'MissingValueSimilarity', 27 | 'RangeCoverage', 28 | 'StatisticSimilarity', 29 | 'TVComplement', 30 | 'SequenceLengthSimilarity', 31 | ] 32 | -------------------------------------------------------------------------------- /sdmetrics/single_column/base.py: -------------------------------------------------------------------------------- 1 | """Base SingleColumnMetric class.""" 2 | 3 | from sdmetrics.base import BaseMetric 4 | 5 | 6 | class SingleColumnMetric(BaseMetric): 7 | """Base class for metrics that apply to individual columns. 8 | 9 | Attributes: 10 | name (str): 11 | Name to use when reports about this metric are printed. 12 | goal (sdmetrics.goal.Goal): 13 | The goal of this metric.
14 | min_value (Union[float, tuple[float]]): 15 | Minimum value or values that this metric can take. 16 | max_value (Union[float, tuple[float]]): 17 | Maximum value or values that this metric can take. 18 | """ 19 | 20 | name = None 21 | goal = None 22 | min_value = None 23 | max_value = None 24 | 25 | @staticmethod 26 | def compute(real_data, synthetic_data): 27 | """Compute this metric. 28 | 29 | Args: 30 | real_data (Union[numpy.ndarray, pandas.Series]): 31 | The values from the real dataset, passed as a 1d numpy 32 | array or as a pandas.Series. 33 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 34 | The values from the synthetic dataset, passed as a 1d numpy 35 | array or as a pandas.Series. 36 | 37 | Returns: 38 | float 39 | Metric output. 40 | """ 41 | raise NotImplementedError() 42 | 43 | @classmethod 44 | def compute_breakdown(cls, real_data, synthetic_data): 45 | """Compute this metric breakdown. 46 | 47 | Args: 48 | real_data (Union[numpy.ndarray, pandas.Series]): 49 | The values from the real dataset, passed as a 1d numpy 50 | array or as a pandas.Series. 51 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 52 | The values from the synthetic dataset, passed as a 1d numpy 53 | array or as a pandas.Series. 54 | 55 | Returns: 56 | dict 57 | Mapping of the metric output. Must include the key 'score'. 58 | """ 59 | return {'score': cls.compute(real_data, synthetic_data)} 60 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Univariate goodness-of-fit tests.""" 2 | 3 | from sdmetrics.single_column.statistical.boundary_adherence import BoundaryAdherence 4 | from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence 5 | from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage 6 | from sdmetrics.single_column.statistical.cstest import CSTest 7 | from sdmetrics.single_column.statistical.key_uniqueness import KeyUniqueness 8 | from sdmetrics.single_column.statistical.kscomplement import KSComplement 9 | from sdmetrics.single_column.statistical.missing_value_similarity import MissingValueSimilarity 10 | from sdmetrics.single_column.statistical.range_coverage import RangeCoverage 11 | from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity 12 | from sdmetrics.single_column.statistical.tv_complement import TVComplement 13 | from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity 14 | 15 | __all__ = [ 16 | 'BoundaryAdherence', 17 | 'CategoryCoverage', 18 | 'CategoryAdherence', 19 | 'CSTest', 20 | 'KeyUniqueness', 21 | 'KSComplement', 22 | 'MissingValueSimilarity', 23 | 'RangeCoverage', 24 | 'StatisticSimilarity', 25 | 'TVComplement', 26 | 'SequenceLengthSimilarity', 27 | ] 28 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/boundary_adherence.py: -------------------------------------------------------------------------------- 1 | """Boundary Adherence Metric.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | from sdmetrics.utils import is_datetime 8 | 9 | 10 | class BoundaryAdherence(SingleColumnMetric): 11 | """Boundary adherence metric. 
12 | 13 | Compute the fraction of rows in the synthetic data that are within the min and max 14 | bounds of the real data 15 | 16 | Attributes: 17 | name (str): 18 | Name to use when reports about this metric are printed. 19 | goal (sdmetrics.goal.Goal): 20 | The goal of this metric. 21 | min_value (Union[float, tuple[float]]): 22 | Minimum value or values that this metric can take. 23 | max_value (Union[float, tuple[float]]): 24 | Maximum value or values that this metric can take. 25 | """ 26 | 27 | name = 'BoundaryAdherence' 28 | goal = Goal.MAXIMIZE 29 | min_value = 0.0 30 | max_value = 1.0 31 | 32 | @classmethod 33 | def compute(cls, real_data, synthetic_data): 34 | """Compute the boundary adherence of two continuous columns. 35 | 36 | Args: 37 | real_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the real dataset. 39 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 40 | The values from the synthetic dataset. 41 | 42 | Returns: 43 | float: 44 | The boundary adherence of the two columns. 45 | """ 46 | real_data = pd.Series(real_data) 47 | synthetic_data = pd.Series(synthetic_data) 48 | if any(pd.isna(real_data)): 49 | real_data = real_data.dropna() 50 | synthetic_data = synthetic_data.dropna() 51 | 52 | if is_datetime(real_data): 53 | real_data = pd.to_numeric(real_data) 54 | synthetic_data = pd.to_numeric(synthetic_data) 55 | 56 | valid = synthetic_data.between(real_data.min(), real_data.max()) 57 | 58 | return valid.sum() / len(synthetic_data) 59 | 60 | @classmethod 61 | def normalize(cls, raw_score): 62 | """Return the `raw_score` as is, since it is already normalized. 63 | 64 | Args: 65 | raw_score (float): 66 | The value of the metric from `compute`. 67 | 68 | Returns: 69 | float: 70 | The normalized value of the metric 71 | """ 72 | return super().normalize(raw_score) 73 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/category_adherence.py: -------------------------------------------------------------------------------- 1 | """Category Adherence Metric.""" 2 | 3 | import numpy as np 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | 8 | 9 | class CategoryAdherence(SingleColumnMetric): 10 | """Category adherence metric. 11 | 12 | The proportion of synthetic data points that match an existing category from the real data. 13 | 14 | Attributes: 15 | name (str): 16 | Name to use when reports about this metric are printed. 17 | goal (sdmetrics.goal.Goal): 18 | The goal of this metric. 19 | min_value (Union[float, tuple[float]]): 20 | Minimum value or values that this metric can take. 21 | max_value (Union[float, tuple[float]]): 22 | Maximum value or values that this metric can take. 23 | """ 24 | 25 | name = 'CategoryAdherence' 26 | goal = Goal.MAXIMIZE 27 | min_value = 0.0 28 | max_value = 1.0 29 | 30 | @classmethod 31 | def compute_breakdown(cls, real_data, synthetic_data): 32 | """Compute the score breakdown of the category adherence metric. 33 | 34 | Args: 35 | real_data (pandas.Series): 36 | The real data. 37 | synthetic_data (pandas.Series): 38 | The synthetic data. 39 | 40 | Returns: 41 | dict: 42 | The score breakdown of the category adherence metric. 
43 | """ 44 | real_data = real_data.fillna(np.nan) 45 | synthetic_data = synthetic_data.fillna(np.nan) 46 | score = synthetic_data.isin(real_data).mean() 47 | 48 | return {'score': score} 49 | 50 | @classmethod 51 | def compute(cls, real_data, synthetic_data): 52 | """Compute the category adherence of two columns. 53 | 54 | Args: 55 | real_data (pandas.Series): 56 | The real data. 57 | synthetic_data (pandas.Series): 58 | The synthetic data. 59 | 60 | Returns: 61 | float: 62 | The category adherence metric score. 63 | """ 64 | return cls.compute_breakdown(real_data, synthetic_data)['score'] 65 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/category_coverage.py: -------------------------------------------------------------------------------- 1 | """Category Coverage Metric.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | 8 | 9 | class CategoryCoverage(SingleColumnMetric): 10 | """Category coverage metric. 11 | 12 | Compute the fraction of real data categories that are present in the synthetic data. 13 | 14 | Attributes: 15 | name (str): 16 | Name to use when reports about this metric are printed. 17 | goal (sdmetrics.goal.Goal): 18 | The goal of this metric. 19 | min_value (Union[float, tuple[float]]): 20 | Minimum value or values that this metric can take. 21 | max_value (Union[float, tuple[float]]): 22 | Maximum value or values that this metric can take. 23 | """ 24 | 25 | name = 'CategoryCoverage' 26 | goal = Goal.MAXIMIZE 27 | min_value = 0.0 28 | max_value = 1.0 29 | 30 | @classmethod 31 | def compute(cls, real_data, synthetic_data): 32 | """Compare the category coverage of two continuous columns. 33 | 34 | Args: 35 | real_data (Union[numpy.ndarray, pandas.Series]): 36 | The values from the real dataset. 37 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the synthetic dataset. 39 | 40 | Returns: 41 | float: 42 | The category coverage ratio of the two columns. 43 | """ 44 | results = cls.compute_breakdown(real_data, synthetic_data) 45 | return results['score'] 46 | 47 | @classmethod 48 | def compute_breakdown(cls, real_data, synthetic_data): 49 | """Compare the category coverage of two continuous columns. 50 | 51 | Args: 52 | real_data (Union[numpy.ndarray, pandas.Series]): 53 | The values from the real dataset. 54 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 55 | The values from the synthetic dataset. 56 | 57 | Returns: 58 | dict: 59 | A mapping of the category coverage results. 60 | """ 61 | real_data = pd.Series(real_data).dropna() 62 | synthetic_data = pd.Series(synthetic_data).dropna() 63 | 64 | real_data_values = set(real_data.value_counts().index) 65 | synthetic_data_values = set(synthetic_data.value_counts().index) 66 | synthetic_coverage = synthetic_data_values.intersection(real_data_values) 67 | 68 | return { 69 | 'score': len(synthetic_coverage) / len(real_data_values), 70 | 'real': len(real_data_values), 71 | 'synthetic': len(synthetic_coverage), 72 | } 73 | 74 | @classmethod 75 | def normalize(cls, raw_score): 76 | """Return the `raw_score` as is, since it is already normalized. 77 | 78 | Args: 79 | raw_score (float): 80 | The value of the metric from `compute`. 
81 | 82 | Returns: 83 | float: 84 | The normalized value of the metric 85 | """ 86 | return super().normalize(raw_score) 87 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/cstest.py: -------------------------------------------------------------------------------- 1 | """Chi-Squared test based metric.""" 2 | 3 | from scipy.stats import chisquare 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | from sdmetrics.utils import get_frequencies 8 | 9 | 10 | class CSTest(SingleColumnMetric): 11 | """Chi-Squared test based metric. 12 | 13 | This metric uses the Chi-Squared test to compare the distributions 14 | of the two categorical columns. It returns the resulting p-value so that 15 | a small value indicates that we can reject the null hypothesis (i.e. it 16 | suggests that the distributions are different). 17 | 18 | Attributes: 19 | name (str): 20 | Name to use when reports about this metric are printed. 21 | goal (sdmetrics.goal.Goal): 22 | The goal of this metric. 23 | min_value (Union[float, tuple[float]]): 24 | Minimum value or values that this metric can take. 25 | max_value (Union[float, tuple[float]]): 26 | Maximum value or values that this metric can take. 27 | """ 28 | 29 | name = 'Chi-Squared' 30 | goal = Goal.MAXIMIZE 31 | min_value = 0.0 32 | max_value = 1.0 33 | 34 | @staticmethod 35 | def compute(real_data, synthetic_data): 36 | """Compare two discrete columns using a Chi-Squared test. 37 | 38 | Args: 39 | real_data (Union[numpy.ndarray, pandas.Series]): 40 | The values from the real dataset. 41 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 42 | The values from the synthetic dataset. 43 | 44 | Returns: 45 | float: 46 | The Chi-Squared test p-value 47 | """ 48 | f_obs, f_exp = get_frequencies(real_data, synthetic_data) 49 | if len(f_obs) == len(f_exp) == 1: 50 | pvalue = 1.0 51 | else: 52 | _, pvalue = chisquare(f_obs, f_exp) 53 | 54 | return pvalue 55 | 56 | @classmethod 57 | def normalize(cls, raw_score): 58 | """Return the `raw_score` as is, since it is already normalized. 59 | 60 | Args: 61 | raw_score (float): 62 | The value of the metric from `compute`. 63 | 64 | Returns: 65 | float: 66 | The normalized value of the metric 67 | """ 68 | return super().normalize(raw_score) 69 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/key_uniqueness.py: -------------------------------------------------------------------------------- 1 | """Key Uniqueness Metric.""" 2 | 3 | import logging 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | 8 | LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class KeyUniqueness(SingleColumnMetric): 12 | """Key uniqueness metric. 13 | 14 | The proportion of data points in the synthetic data that are unique. 15 | 16 | Attributes: 17 | name (str): 18 | Name to use when reports about this metric are printed. 19 | goal (sdmetrics.goal.Goal): 20 | The goal of this metric. 21 | min_value (Union[float, tuple[float]]): 22 | Minimum value or values that this metric can take. 23 | max_value (Union[float, tuple[float]]): 24 | Maximum value or values that this metric can take.
25 | """ 26 | 27 | name = 'KeyUniqueness' 28 | goal = Goal.MAXIMIZE 29 | min_value = 0.0 30 | max_value = 1.0 31 | 32 | @classmethod 33 | def compute_breakdown(cls, real_data, synthetic_data): 34 | """Compute the score breakdown of the key uniqueness metric. 35 | 36 | Args: 37 | real_data (pandas.Series): 38 | The real data. 39 | synthetic_data (pandas.Series): 40 | The synthetic data. 41 | 42 | Returns: 43 | dict: 44 | The score breakdown of the key uniqueness metric. 45 | """ 46 | has_duplicates = real_data.duplicated().any() 47 | has_nans = real_data.isna().any() 48 | if has_duplicates or has_nans: 49 | LOGGER.info('The real data contains NA or duplicate values.') 50 | 51 | nans_or_duplicates_synthetic = synthetic_data.duplicated() | synthetic_data.isna() 52 | score = 1 - nans_or_duplicates_synthetic.sum() / len(synthetic_data) 53 | 54 | return {'score': score} 55 | 56 | @classmethod 57 | def compute(cls, real_data, synthetic_data): 58 | """Compute the key uniqueness metric. 59 | 60 | Args: 61 | real_data (pandas.Series): 62 | The real data. 63 | synthetic_data (pandas.Series): 64 | The synthetic data. 65 | 66 | Returns: 67 | float: 68 | The proportion of data points in the synthetic data that are unique. 69 | """ 70 | return cls.compute_breakdown(real_data, synthetic_data)['score'] 71 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/kscomplement.py: -------------------------------------------------------------------------------- 1 | """Kolmogorov-Smirnov test based Metric.""" 2 | 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy.stats import ks_2samp 8 | 9 | from sdmetrics.goal import Goal 10 | from sdmetrics.single_column.base import SingleColumnMetric 11 | from sdmetrics.utils import is_datetime 12 | 13 | MAX_DECIMALS = sys.float_info.dig - 1 14 | 15 | 16 | class KSComplement(SingleColumnMetric): 17 | """Kolmogorov-Smirnov statistic based metric. 18 | 19 | This function uses the two-sample Kolmogorov–Smirnov test to compare 20 | the distributions of the two continuous columns using the empirical CDF. 21 | It returns 1 minus the KS Test D statistic, which indicates the maximum 22 | distance between the expected CDF and the observed CDF values. 23 | 24 | As a result, the output value is 1.0 if the distributions are identical 25 | and 0.0 if they are completely different. 26 | 27 | Attributes: 28 | name (str): 29 | Name to use when reports about this metric are printed. 30 | goal (sdmetrics.goal.Goal): 31 | The goal of this metric. 32 | min_value (Union[float, tuple[float]]): 33 | Minimum value or values that this metric can take. 34 | max_value (Union[float, tuple[float]]): 35 | Maximum value or values that this metric can take. 36 | """ 37 | 38 | name = 'Inverted Kolmogorov-Smirnov D statistic' 39 | goal = Goal.MAXIMIZE 40 | min_value = 0.0 41 | max_value = 1.0 42 | 43 | @staticmethod 44 | def compute(real_data, synthetic_data): 45 | """Compare two continuous columns using a Kolmogorov–Smirnov test. 46 | 47 | Args: 48 | real_data (Union[numpy.ndarray, pandas.Series]): 49 | The values from the real dataset. 50 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 51 | The values from the synthetic dataset. 52 | 53 | Returns: 54 | float: 55 | 1 minus the Kolmogorov–Smirnov D statistic. 
56 | """ 57 | real_data = pd.Series(real_data).dropna() 58 | synthetic_data = pd.Series(synthetic_data).dropna() 59 | 60 | if is_datetime(real_data): 61 | real_data = pd.to_numeric(real_data) 62 | synthetic_data = pd.to_numeric(synthetic_data) 63 | 64 | real_data = real_data.round(MAX_DECIMALS) 65 | synthetic_data = synthetic_data.round(MAX_DECIMALS) 66 | 67 | try: 68 | statistic, _ = ks_2samp(real_data, synthetic_data) 69 | except ValueError as e: 70 | if str(e) == 'Data passed to ks_2samp must not be empty': 71 | return np.nan 72 | else: 73 | raise ValueError(e) 74 | 75 | return 1 - statistic 76 | 77 | @classmethod 78 | def normalize(cls, raw_score): 79 | """Return the `raw_score` as is, since it is already normalized. 80 | 81 | Args: 82 | raw_score (float): 83 | The value of the metric from `compute`. 84 | 85 | Returns: 86 | float: 87 | The normalized value of the metric 88 | """ 89 | return super().normalize(raw_score) 90 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/missing_value_similarity.py: -------------------------------------------------------------------------------- 1 | """Missing Value Similarity Metric.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | 8 | 9 | class MissingValueSimilarity(SingleColumnMetric): 10 | """Missing value similarity metric. 11 | 12 | Compute the percentage of missing values between the real and synthetic data. 13 | 14 | Attributes: 15 | name (str): 16 | Name to use when reports about this metric are printed. 17 | goal (sdmetrics.goal.Goal): 18 | The goal of this metric. 19 | min_value (Union[float, tuple[float]]): 20 | Minimum value or values that this metric can take. 21 | max_value (Union[float, tuple[float]]): 22 | Maximum value or values that this metric can take. 23 | """ 24 | 25 | name = 'MissingValueSimilarity' 26 | goal = Goal.MAXIMIZE 27 | min_value = 0.0 28 | max_value = 1.0 29 | 30 | @classmethod 31 | def compute_breakdown(cls, real_data, synthetic_data): 32 | """Compare the missing value similarity of two continuous columns. 33 | 34 | Args: 35 | real_data (Union[numpy.ndarray, pandas.Series]): 36 | The values from the real dataset. 37 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the synthetic dataset. 39 | 40 | Returns: 41 | dict: 42 | A mapping of the missing value similarity results. 43 | """ 44 | real_data = pd.Series(real_data) 45 | synthetic_data = pd.Series(synthetic_data) 46 | 47 | real_data_value = real_data.isna().sum() / len(real_data) 48 | synthetic_data_value = synthetic_data.isna().sum() / len(synthetic_data) 49 | 50 | return { 51 | 'score': 1 - abs(real_data_value - synthetic_data_value), 52 | 'real': real_data_value, 53 | 'synthetic': synthetic_data_value, 54 | } 55 | 56 | @classmethod 57 | def compute(cls, real_data, synthetic_data): 58 | """Compare the missing value similarity of two continuous columns. 59 | 60 | Args: 61 | real_data (Union[numpy.ndarray, pandas.Series]): 62 | The values from the real dataset. 63 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 64 | The values from the synthetic dataset. 65 | 66 | Returns: 67 | float: 68 | The missing value similarity of the two columns. 69 | """ 70 | results = cls.compute_breakdown(real_data, synthetic_data) 71 | return results['score'] 72 | 73 | @classmethod 74 | def normalize(cls, raw_score): 75 | """Return the `raw_score` as is, since it is already normalized. 
76 | 77 | Args: 78 | raw_score (float): 79 | The value of the metric from `compute`. 80 | 81 | Returns: 82 | float: 83 | The normalized value of the metric 84 | """ 85 | return super().normalize(raw_score) 86 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/range_coverage.py: -------------------------------------------------------------------------------- 1 | """Range Coverage Metric.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sdmetrics.goal import Goal 7 | from sdmetrics.single_column.base import SingleColumnMetric 8 | 9 | 10 | class RangeCoverage(SingleColumnMetric): 11 | """Range coverage metric. 12 | 13 | Compute whether a synthetic column covers the full range of values that are 14 | present in a real column 15 | 16 | Attributes: 17 | name (str): 18 | Name to use when reports about this metric are printed. 19 | goal (sdmetrics.goal.Goal): 20 | The goal of this metric. 21 | min_value (Union[float, tuple[float]]): 22 | Minimum value or values that this metric can take. 23 | max_value (Union[float, tuple[float]]): 24 | Maximum value or values that this metric can take. 25 | """ 26 | 27 | name = 'RangeCoverage' 28 | goal = Goal.MAXIMIZE 29 | min_value = 0.0 30 | max_value = 1.0 31 | 32 | @classmethod 33 | def compute(cls, real_data, synthetic_data): 34 | """Compute the range coverage of synthetic columns over the real column. 35 | 36 | Args: 37 | real_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the real dataset. 39 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 40 | The values from the synthetic dataset. 41 | 42 | Returns: 43 | float: 44 | The range coverage of the synthetic data over the real data. 45 | """ 46 | if not isinstance(real_data, pd.Series): 47 | real_data = pd.Series(real_data) 48 | 49 | if not isinstance(synthetic_data, pd.Series): 50 | synthetic_data = pd.Series(synthetic_data) 51 | 52 | min_r = real_data.min() 53 | max_r = real_data.max() 54 | min_s = synthetic_data.min() 55 | max_s = synthetic_data.max() 56 | 57 | if min_r == max_r: 58 | return np.nan 59 | 60 | normalized_min = max((min_s - min_r) / (max_r - min_r), 0) 61 | normalized_max = max((max_r - max_s) / (max_r - min_r), 0) 62 | return max(1 - (normalized_min + normalized_max), 0) 63 | 64 | @classmethod 65 | def normalize(cls, raw_score): 66 | """Return the `raw_score` as is, since it is already normalized. 67 | 68 | Args: 69 | raw_score (float): 70 | The value of the metric from `compute`. 71 | 72 | Returns: 73 | float: 74 | The normalized value of the metric 75 | """ 76 | return super().normalize(raw_score) 77 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/sequence_length_similarity.py: -------------------------------------------------------------------------------- 1 | """SequenceLengthSimilarity module.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | from sdmetrics.single_column.statistical.kscomplement import KSComplement 8 | 9 | 10 | class SequenceLengthSimilarity(SingleColumnMetric): 11 | """Sequence Length Similarity metric. 12 | 13 | Attributes: 14 | name (str): 15 | Name to use when reports about this metric are printed. 16 | goal (sdmetrics.goal.Goal): 17 | The goal of this metric. 18 | min_value (Union[float, tuple[float]]): 19 | Minimum value or values that this metric can take. 
20 | max_value (Union[float, tuple[float]]): 21 | Maximum value or values that this metric can take. 22 | """ 23 | 24 | name = 'Sequence Length Similarity' 25 | goal = Goal.MAXIMIZE 26 | min_value = 0.0 27 | max_value = 1.0 28 | 29 | @staticmethod 30 | def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float: 31 | """Compute this metric. 32 | 33 | The length of a sequence is determined by the number of times the same sequence key occurs. 34 | For example if id_09231 appeared 150 times in the sequence key, then the sequence is of 35 | length 150. This metric compares the lengths of all sequence keys in the 36 | real data vs. the synthetic data. 37 | 38 | It works as follows: 39 | - Calculate the length of each sequence in the real data 40 | - Calculate the length of each sequence in the synthetic data 41 | - Apply the KSComplement metric to compare the similarities of the distributions 42 | - Return this score 43 | 44 | Args: 45 | real_data (pd.Series): 46 | The values from the real dataset. 47 | synthetic_data (pd.Series): 48 | The values from the synthetic dataset. 49 | 50 | Returns: 51 | float: 52 | The score. 53 | """ 54 | return KSComplement.compute(real_data.value_counts(), synthetic_data.value_counts()) 55 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/tv_complement.py: -------------------------------------------------------------------------------- 1 | """Total Variation Complement Metric.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.errors import IncomputableMetricError 6 | from sdmetrics.goal import Goal 7 | from sdmetrics.single_column.base import SingleColumnMetric 8 | from sdmetrics.utils import get_frequencies 9 | 10 | 11 | class TVComplement(SingleColumnMetric): 12 | """Total Variation Complement metric. 13 | 14 | The complement of the total variation distance. 15 | 16 | Attributes: 17 | name (str): 18 | Name to use when reports about this metric are printed. 19 | goal (sdmetrics.goal.Goal): 20 | The goal of this metric. 21 | min_value (Union[float, tuple[float]]): 22 | Minimum value or values that this metric can take. 23 | max_value (Union[float, tuple[float]]): 24 | Maximum value or values that this metric can take. 25 | """ 26 | 27 | name = 'TVComplement' 28 | goal = Goal.MAXIMIZE 29 | min_value = 0.0 30 | max_value = 1.0 31 | 32 | @classmethod 33 | def compute(cls, real_data, synthetic_data): 34 | """Compute the complement of the total variation distance of two discrete columns. 35 | 36 | Args: 37 | real_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the real dataset. 39 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 40 | The values from the synthetic dataset. 41 | 42 | Returns: 43 | float: 44 | The complement of the total variation distance. 45 | """ 46 | real_data = pd.Series(real_data).dropna() 47 | synthetic_data = pd.Series(synthetic_data).dropna() 48 | 49 | if len(synthetic_data) == 0 or len(real_data) == 0: 50 | raise IncomputableMetricError( 51 | 'The TVComplement metric must have 1 or more non-null values.' 52 | ) 53 | 54 | f_obs, f_exp = get_frequencies(real_data, synthetic_data) 55 | total_variation = 0 56 | for i in range(len(f_obs)): 57 | total_variation += abs(f_obs[i] - f_exp[i]) 58 | 59 | return 1 - 0.5 * total_variation 60 | 61 | @classmethod 62 | def normalize(cls, raw_score): 63 | """Return the `raw_score` as is, since it is already normalized. 64 | 65 | Args: 66 | raw_score (float): 67 | The value of the metric from `compute`. 
68 | 69 | Returns: 70 | float: 71 | The normalized value of the metric 72 | """ 73 | return super().normalize(raw_score) 74 | -------------------------------------------------------------------------------- /sdmetrics/single_table/data_augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | """Data Augmentation Metric for single table datasets.""" 2 | 3 | from sdmetrics.single_table.data_augmentation.binary_classifier_precision_efficacy import ( 4 | BinaryClassifierPrecisionEfficacy, 5 | ) 6 | from sdmetrics.single_table.data_augmentation.binary_classifier_recall_efficacy import ( 7 | BinaryClassifierRecallEfficacy, 8 | ) 9 | 10 | __all__ = ['BinaryClassifierPrecisionEfficacy', 'BinaryClassifierRecallEfficacy'] 11 | -------------------------------------------------------------------------------- /sdmetrics/single_table/data_augmentation/binary_classifier_precision_efficacy.py: -------------------------------------------------------------------------------- 1 | """Binary classifier precision efficacy metric.""" 2 | 3 | from sdmetrics.single_table.data_augmentation.base import BaseDataAugmentationMetric 4 | 5 | 6 | class BinaryClassifierPrecisionEfficacy(BaseDataAugmentationMetric): 7 | """Binary classifier precision efficacy metric.""" 8 | 9 | name = 'Binary Classifier Precision Efficacy' 10 | metric_name = 'precision' 11 | 12 | @classmethod 13 | def compute_breakdown( 14 | cls, 15 | real_training_data, 16 | synthetic_data, 17 | real_validation_data, 18 | metadata, 19 | prediction_column_name, 20 | minority_class_label, 21 | classifier='XGBoost', 22 | fixed_recall_value=0.9, 23 | ): 24 | """Compute the score breakdown of the metric.""" 25 | return super().compute_breakdown( 26 | real_training_data, 27 | synthetic_data, 28 | real_validation_data, 29 | metadata, 30 | prediction_column_name, 31 | minority_class_label, 32 | classifier, 33 | fixed_recall_value, 34 | ) 35 | 36 | @classmethod 37 | def compute( 38 | cls, 39 | real_training_data, 40 | synthetic_data, 41 | real_validation_data, 42 | metadata, 43 | prediction_column_name, 44 | minority_class_label, 45 | classifier='xgboost', 46 | fixed_recall_value=0.9, 47 | ): 48 | """Compute the score of the metric. 49 | 50 | Args: 51 | real_training_data (pandas.DataFrame): 52 | The real training data. 53 | synthetic_data (pandas.DataFrame): 54 | The synthetic data. 55 | real_validation_data (pandas.DataFrame): 56 | The real validation data. 57 | metadata (dict): 58 | The metadata dictionary describing the table of data. 59 | prediction_column_name (str): 60 | The name of the column to be predicted. 61 | minority_class_label (int): 62 | The minority class label. 63 | classifier (str): 64 | The ML algorithm to use when building a Binary Classification model. 65 | Supported options are ``XGBoost``. Defaults to ``XGBoost``. 66 | fixed_recall_value (float): 67 | The fixed recall value to be used when calculating the precision score. Defaults to 0.9. 68 | 69 | Returns: 70 | float: 71 | The score of the metric.
72 | """ 73 | return super().compute( 74 | real_training_data, 75 | synthetic_data, 76 | real_validation_data, 77 | metadata, 78 | prediction_column_name, 79 | minority_class_label, 80 | classifier, 81 | fixed_recall_value, 82 | ) 83 | -------------------------------------------------------------------------------- /sdmetrics/single_table/data_augmentation/binary_classifier_recall_efficacy.py: -------------------------------------------------------------------------------- 1 | """Binary classifier recall efficacy metric.""" 2 | 3 | from sdmetrics.single_table.data_augmentation.base import BaseDataAugmentationMetric 4 | 5 | 6 | class BinaryClassifierRecallEfficacy(BaseDataAugmentationMetric): 7 | """Binary classifier recall efficacy metric.""" 8 | 9 | name = 'Binary Classifier Recall Efficacy' 10 | metric_name = 'recall' 11 | 12 | @classmethod 13 | def compute_breakdown( 14 | cls, 15 | real_training_data, 16 | synthetic_data, 17 | real_validation_data, 18 | metadata, 19 | prediction_column_name, 20 | minority_class_label, 21 | classifier='XGBoost', 22 | fixed_precision_value=0.9, 23 | ): 24 | """Compute the score breakdown of the metric.""" 25 | return super().compute_breakdown( 26 | real_training_data, 27 | synthetic_data, 28 | real_validation_data, 29 | metadata, 30 | prediction_column_name, 31 | minority_class_label, 32 | classifier, 33 | fixed_precision_value, 34 | ) 35 | 36 | @classmethod 37 | def compute( 38 | cls, 39 | real_training_data, 40 | synthetic_data, 41 | real_validation_data, 42 | metadata, 43 | prediction_column_name, 44 | minority_class_label, 45 | classifier='XGBoost', 46 | fixed_precision_value=0.9, 47 | ): 48 | """Compute the score of the metric. 49 | 50 | Args: 51 | real_training_data (pandas.DataFrame): 52 | The real training data. 53 | synthetic_data (pandas.DataFrame): 54 | The synthetic data. 55 | real_validation_data (pandas.DataFrame): 56 | The real validation data. 57 | metadata (dict): 58 | The metadata dictionary describing the table of data. 59 | prediction_column_name (str): 60 | The name of the column to be predicted. 61 | minority_class_label (int): 62 | The minority class label. 63 | classifier (str): 64 | The ML algorithm to use when building a Binary Classification model. 65 | Supported options are ``XGBoost``. Defaults to ``XGBoost``. 66 | fixed_precision_value (float): 67 | The fixed precision value to be used when calculating the recall score. 68 | Defaults to 0.9. 69 | 70 | Returns: 71 | float: 72 | The score of the metric.
73 | """ 74 | return super().compute( 75 | real_training_data, 76 | synthetic_data, 77 | real_validation_data, 78 | metadata, 79 | prediction_column_name, 80 | minority_class_label, 81 | classifier, 82 | fixed_precision_value, 83 | ) 84 | -------------------------------------------------------------------------------- /sdmetrics/single_table/detection/__init__.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Detection metrics for single table datasets.""" 2 | 3 | from sdmetrics.single_table.detection.sklearn import LogisticDetection, SVCDetection 4 | 5 | __all__ = ['LogisticDetection', 'SVCDetection'] 6 | -------------------------------------------------------------------------------- /sdmetrics/single_table/detection/sklearn.py: -------------------------------------------------------------------------------- 1 | """scikit-learn based DetectionMetrics for single table datasets.""" 2 | 3 | from sklearn.impute import SimpleImputer 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.preprocessing import RobustScaler 7 | from sklearn.svm import SVC 8 | 9 | from sdmetrics.single_table.detection.base import DetectionMetric 10 | 11 | 12 | class ScikitLearnClassifierDetectionMetric(DetectionMetric): 13 | """Base class for Detection metrics built using Scikit Learn Classifiers. 14 | 15 | The base class for these metrics makes a prediction using a scikit-learn 16 | pipeline which contains a SimpleImputer, a RobustScaler and finally 17 | the classifier, which is defined in the subclasses. 18 | """ 19 | 20 | name = 'Scikit-Learn Detection' 21 | 22 | @staticmethod 23 | def _get_classifier(): 24 | """Build and return an instance of a scikit-learn Classifier.""" 25 | raise NotImplementedError() 26 | 27 | @classmethod 28 | def _fit_predict(cls, X_train, y_train, X_test): 29 | """Fit a pipeline to the training data and then use it to make predictions on the test data.""" 30 | model = Pipeline([ 31 | ('imputer', SimpleImputer()), 32 | ('scalar', RobustScaler()), 33 | ('classifier', cls._get_classifier()), 34 | ]) 35 | model.fit(X_train, y_train) 36 | 37 | return model.predict_proba(X_test)[:, 1] 38 | 39 | 40 | class LogisticDetection(ScikitLearnClassifierDetectionMetric): 41 | """ScikitLearnClassifierDetectionMetric based on a LogisticRegression. 42 | 43 | This metric builds a LogisticRegression Classifier that learns to tell the synthetic 44 | data apart from the real data, which later on is evaluated using Cross Validation. 45 | 46 | The output of the metric is one minus the average ROC AUC score obtained. 47 | """ 48 | 49 | name = 'LogisticRegression Detection' 50 | 51 | @staticmethod 52 | def _get_classifier(): 53 | return LogisticRegression(solver='lbfgs') 54 | 55 | 56 | class SVCDetection(ScikitLearnClassifierDetectionMetric): 57 | """ScikitLearnClassifierDetectionMetric based on a SVC. 58 | 59 | This metric builds a SVC Classifier that learns to tell the synthetic 60 | data apart from the real data, which later on is evaluated using Cross Validation. 61 | 62 | The output of the metric is one minus the average ROC AUC score obtained.
63 | """ 64 | 65 | name = 'SVC Detection' 66 | 67 | @staticmethod 68 | def _get_classifier(): 69 | return SVC(probability=True, gamma='scale') 70 | -------------------------------------------------------------------------------- /sdmetrics/single_table/efficacy/__init__.py: -------------------------------------------------------------------------------- 1 | """Single table efficacy metrics module.""" 2 | 3 | from sdmetrics.single_table.efficacy import binary, multiclass, regression 4 | from sdmetrics.single_table.efficacy.base import MLEfficacyMetric 5 | from sdmetrics.single_table.efficacy.binary import ( 6 | BinaryAdaBoostClassifier, 7 | BinaryDecisionTreeClassifier, 8 | BinaryEfficacyMetric, 9 | BinaryLogisticRegression, 10 | BinaryMLPClassifier, 11 | ) 12 | from sdmetrics.single_table.efficacy.multiclass import ( 13 | MulticlassDecisionTreeClassifier, 14 | MulticlassEfficacyMetric, 15 | MulticlassMLPClassifier, 16 | ) 17 | from sdmetrics.single_table.efficacy.regression import ( 18 | LinearRegression, 19 | MLPRegressor, 20 | RegressionEfficacyMetric, 21 | ) 22 | 23 | __all__ = [ 24 | 'binary', 25 | 'multiclass', 26 | 'regression', 27 | 'MLEfficacyMetric', 28 | 'BinaryEfficacyMetric', 29 | 'BinaryDecisionTreeClassifier', 30 | 'BinaryAdaBoostClassifier', 31 | 'BinaryLogisticRegression', 32 | 'BinaryMLPClassifier', 33 | 'MulticlassEfficacyMetric', 34 | 'MulticlassDecisionTreeClassifier', 35 | 'MulticlassMLPClassifier', 36 | 'RegressionEfficacyMetric', 37 | 'LinearRegression', 38 | 'MLPRegressor', 39 | ] 40 | -------------------------------------------------------------------------------- /sdmetrics/single_table/efficacy/multiclass.py: -------------------------------------------------------------------------------- 1 | """Base class for Multiclass Classification Efficacy Metrics for single table datasets.""" 2 | 3 | from sklearn.metrics import f1_score 4 | from sklearn.neural_network import MLPClassifier 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | from sdmetrics.goal import Goal 8 | from sdmetrics.single_table.efficacy.base import MLEfficacyMetric 9 | 10 | 11 | def f1_macro(test_target, predictions): 12 | """Return the `f1_score` of the passed data.""" 13 | return f1_score(test_target, predictions, average='macro') 14 | 15 | 16 | class MulticlassEfficacyMetric(MLEfficacyMetric): 17 | """Base class for Multiclass Classification Efficacy Metrics.""" 18 | 19 | name = None 20 | goal = Goal.MAXIMIZE 21 | min_value = 0 22 | max_value = 1 23 | SCORER = f1_macro 24 | 25 | @classmethod 26 | def normalize(cls, raw_score): 27 | """Return the `raw_score` as is, since it is already normalized. 28 | 29 | Args: 30 | raw_score (float): 31 | The value of the metric from `compute`. 32 | 33 | Returns: 34 | float: 35 | The normalized value of the metric 36 | """ 37 | return super().normalize(raw_score) 38 | 39 | 40 | class MulticlassDecisionTreeClassifier(MulticlassEfficacyMetric): 41 | """Multiclass DecisionTreeClassifier Efficacy based metric. 42 | 43 | This fits a DecisionTreeClassifier to the training data and 44 | then evaluates it making predictions on the test data. 45 | """ 46 | 47 | MODEL = DecisionTreeClassifier 48 | MODEL_KWARGS = { 49 | 'max_depth': 30, 50 | 'class_weight': 'balanced', 51 | } 52 | 53 | 54 | class MulticlassMLPClassifier(MulticlassEfficacyMetric): 55 | """Multiclass MLPClassifier Efficacy based metric. 56 | 57 | This fits a MLPClassifier to the training data and 58 | then evaluates it making predictions on the test data. 
59 | """ 60 | 61 | MODEL = MLPClassifier 62 | MODEL_KWARGS = {'hidden_layer_sizes': (100,), 'max_iter': 50} 63 | -------------------------------------------------------------------------------- /sdmetrics/single_table/efficacy/regression.py: -------------------------------------------------------------------------------- 1 | """Regression Efficacy based metrics.""" 2 | 3 | import numpy as np 4 | from sklearn import linear_model, neural_network 5 | from sklearn.metrics import r2_score 6 | 7 | from sdmetrics.goal import Goal 8 | from sdmetrics.single_table.efficacy.base import MLEfficacyMetric 9 | 10 | 11 | class RegressionEfficacyMetric(MLEfficacyMetric): 12 | """RegressionEfficacy base class.""" 13 | 14 | name = None 15 | goal = Goal.MAXIMIZE 16 | min_value = -np.inf 17 | max_value = 1 18 | SCORER = r2_score 19 | 20 | @classmethod 21 | def normalize(cls, raw_score): 22 | """Return a normalized version of the R^2 score. 23 | 24 | Args: 25 | raw_score (float): 26 | The value of the metric from `compute`. 27 | 28 | Returns: 29 | float: 30 | The normalized value of the metric 31 | """ 32 | return super().normalize(raw_score) 33 | 34 | 35 | class LinearRegression(RegressionEfficacyMetric): 36 | """LinearRegression Efficacy based metric. 37 | 38 | This fits a LinearRegression to the training data and 39 | then evaluates it making predictions on the test data. 40 | """ 41 | 42 | MODEL = linear_model.LinearRegression 43 | 44 | 45 | class MLPRegressor(RegressionEfficacyMetric): 46 | """MLPRegressor Efficacy based metric. 47 | 48 | This fits a MLPRegressor to the training data and 49 | then evaluates it making predictions on the test data. 50 | """ 51 | 52 | MODEL = neural_network.MLPRegressor 53 | MODEL_KWARGS = {'hidden_layer_sizes': (100,), 'max_iter': 50} 54 | -------------------------------------------------------------------------------- /sdmetrics/single_table/privacy/__init__.py: -------------------------------------------------------------------------------- 1 | """Privacy metrics module.""" 2 | 3 | from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, NumericalPrivacyMetric 4 | from sdmetrics.single_table.privacy.cap import ( 5 | CategoricalCAP, 6 | CategoricalGeneralizedCAP, 7 | CategoricalZeroCAP, 8 | ) 9 | from sdmetrics.single_table.privacy.categorical_sklearn import ( 10 | CategoricalKNN, 11 | CategoricalNB, 12 | CategoricalRF, 13 | CategoricalSVM, 14 | ) 15 | from sdmetrics.single_table.privacy.disclosure_protection import ( 16 | DisclosureProtection, 17 | DisclosureProtectionEstimate, 18 | ) 19 | from sdmetrics.single_table.privacy.dcr_baseline_protection import DCRBaselineProtection 20 | from sdmetrics.single_table.privacy.dcr_overfitting_protection import DCROverfittingProtection 21 | from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble 22 | from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR 23 | from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor 24 | 25 | __all__ = [ 26 | 'CategoricalCAP', 27 | 'CategoricalEnsemble', 28 | 'CategoricalGeneralizedCAP', 29 | 'CategoricalKNN', 30 | 'CategoricalNB', 31 | 'CategoricalPrivacyMetric', 32 | 'CategoricalRF', 33 | 'CategoricalSVM', 34 | 'CategoricalZeroCAP', 35 | 'DisclosureProtection', 36 | 'DisclosureProtectionEstimate', 37 | 'NumericalLR', 38 | 'NumericalMLP', 39 | 'NumericalPrivacyMetric', 40 | 'NumericalRadiusNearestNeighbor', 41 | 'NumericalSVR', 42 | 'DCRBaselineProtection', 43 | 
'DCROverfittingProtection', 44 | ] 45 | -------------------------------------------------------------------------------- /sdmetrics/single_table/privacy/loss.py: -------------------------------------------------------------------------------- 1 | """Utilities for the single_table.privacy modules.""" 2 | 3 | import numpy as np 4 | from copulas.univariate.base import Univariate 5 | 6 | 7 | class LossFunction: 8 | """Base class for a loss function.""" 9 | 10 | def fit(self, data, cols): 11 | """Learn the metric on the value space. 12 | 13 | Args: 14 | real_data (pandas.DataFrame): 15 | The real data table. 16 | cols (list[str]): 17 | The names for the target columns (usually the sensitive cols). 18 | """ 19 | 20 | def measure(self, pred, real): 21 | """Calculate the loss of a single prediction. 22 | 23 | Args: 24 | pred (tuple): 25 | The predicted value. 26 | real (tuple): 27 | The actual value. 28 | """ 29 | raise NotImplementedError('Please implement the loss measuring algorithm!') 30 | 31 | 32 | class InverseCDFDistance(LossFunction): 33 | """Measure the distance between continuous key fields. 34 | 35 | This loss function first applies the fitted cdfs to every single entry (i.e. turning 36 | the numerical values into their respective percentiles) and then measures the Lp distance 37 | to the pth power, between the predicted value and the real value. 38 | 39 | Args: 40 | p (float): 41 | The p parameter in L_p metric. Must be positive. 42 | """ 43 | 44 | def __init__(self, p=2): 45 | self.p = p 46 | self.cdfs = [] 47 | 48 | def fit(self, data, cols): 49 | """Fits univariate distributions (automatically selected). 50 | 51 | Args: 52 | data (DataFrame): 53 | Data, where each column in `cols` is a continuous column. 54 | cols (list[str]): 55 | Column names. 56 | """ 57 | for col in cols: 58 | col_data = np.array(data[col]) 59 | dist_model = Univariate() 60 | dist_model.fit(col_data) 61 | self.cdfs.append(dist_model) 62 | 63 | def measure(self, pred, real): 64 | """Compute the distance (L_p norm) between the pred and real values. 65 | 66 | This uses the probability integral transform to map the pred/real values 67 | to a CDF value (between 0.0 and 1.0). Then, it computes the L_p norm 68 | between the CDF(pred) and CDF(real). 69 | 70 | Args: 71 | pred (tuple): 72 | Predicted value(s) corresponding to the columns specified in fit. 73 | real (tuple): 74 | Real value(s) corresponding to the columns specified in fit. 75 | 76 | Returns: 77 | float: 78 | The L_p norm of the CDF value. 79 | """ 80 | assert len(pred) == len(real) 81 | 82 | dist = 0 83 | for idx in range(len(real)): 84 | percentiles = self.cdfs[idx].cdf(np.array([pred[idx], real[idx]])) 85 | dist += abs(percentiles[0] - percentiles[1]) ** self.p 86 | 87 | return dist 88 | -------------------------------------------------------------------------------- /sdmetrics/single_table/table_structure.py: -------------------------------------------------------------------------------- 1 | """Table Format metric.""" 2 | 3 | from sdmetrics.goal import Goal 4 | from sdmetrics.single_table.base import SingleTableMetric 5 | 6 | 7 | class TableStructure(SingleTableMetric): 8 | """TableStructure Single Table metric. 9 | 10 | This metric computes whether the names and data types of each column are 11 | the same in the real and synthetic data. 12 | 13 | Attributes: 14 | name (str): 15 | Name to use when reports about this metric are printed. 16 | goal (sdmetrics.goal.Goal): 17 | The goal of this metric. 
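Because the percentile arithmetic in `InverseCDFDistance.measure` above is easy to misread, here is a small sketch of that loss function in isolation (not part of the library source). The salary values are invented, and `copulas` — already imported by the loss module — selects the univariate distribution automatically.

```python
import pandas as pd

from sdmetrics.single_table.privacy.loss import InverseCDFDistance

# Invented continuous sensitive column.
data = pd.DataFrame({'salary': [30_000, 45_000, 52_000, 61_000, 75_000, 90_000]})

loss = InverseCDFDistance(p=2)
loss.fit(data, ['salary'])

# Both values are mapped through the fitted CDF; the absolute difference of their
# percentiles, raised to the power p, is the reported distance.
print(loss.measure((50_000,), (62_000,)))
```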
18 | min_value (Union[float, tuple[float]]): 19 | Minimum value or values that this metric can take. 20 | max_value (Union[float, tuple[float]]): 21 | Maximum value or values that this metric can take. 22 | """ 23 | 24 | name = 'TableStructure' 25 | goal = Goal.MAXIMIZE 26 | min_value = 0 27 | max_value = 1 28 | 29 | @classmethod 30 | def compute_breakdown(cls, real_data, synthetic_data): 31 | """Compute the score breakdown of the table format metric. 32 | 33 | Args: 34 | real_data (pandas.DataFrame): 35 | The real data. 36 | synthetic_data (pandas.DataFrame): 37 | The synthetic data. 38 | """ 39 | real_columns_dtypes = set(zip(real_data.columns, map(str, real_data.dtypes))) 40 | synthetic_columns_dtypes = set(zip(synthetic_data.columns, map(str, synthetic_data.dtypes))) 41 | 42 | intersection = real_columns_dtypes & synthetic_columns_dtypes 43 | union = real_columns_dtypes | synthetic_columns_dtypes 44 | score = len(intersection) / len(union) 45 | 46 | return {'score': score} 47 | 48 | @classmethod 49 | def compute(cls, real_data, synthetic_data): 50 | """Compute the table format metric score. 51 | 52 | Args: 53 | real_data (pandas.DataFrame): 54 | The real data. 55 | synthetic_data (pandas.DataFrame): 56 | The synthetic data. 57 | 58 | Returns: 59 | float: 60 | The metric score. 61 | """ 62 | return cls.compute_breakdown(real_data, synthetic_data)['score'] 63 | -------------------------------------------------------------------------------- /sdmetrics/timeseries/__init__.py: -------------------------------------------------------------------------------- 1 | """Metrics for timeseries datasets.""" 2 | 3 | from sdmetrics.timeseries import base, detection, efficacy, ml_scorers 4 | from sdmetrics.timeseries.base import TimeSeriesMetric 5 | from sdmetrics.timeseries.detection import LSTMDetection, TimeSeriesDetectionMetric 6 | from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric 7 | from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy 8 | 9 | __all__ = [ 10 | 'base', 11 | 'detection', 12 | 'efficacy', 13 | 'ml_scorers', 14 | 'TimeSeriesMetric', 15 | 'TimeSeriesDetectionMetric', 16 | 'LSTMDetection', 17 | 'TimeSeriesEfficacyMetric', 18 | 'LSTMClassifierEfficacy', 19 | ] 20 | -------------------------------------------------------------------------------- /sdmetrics/timeseries/efficacy/__init__.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Efficacy metrics for Time Series.""" 2 | 3 | from sdmetrics.timeseries.efficacy.base import TimeSeriesEfficacyMetric 4 | from sdmetrics.timeseries.efficacy.classification import ( 5 | LSTMClassifierEfficacy, 6 | TimeSeriesClassificationEfficacyMetric, 7 | ) 8 | 9 | __all__ = [ 10 | 'TimeSeriesEfficacyMetric', 11 | 'TimeSeriesClassificationEfficacyMetric', 12 | 'LSTMClassifierEfficacy', 13 | ] 14 | -------------------------------------------------------------------------------- /sdmetrics/timeseries/efficacy/classification.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Classification Efficacy based metrics for Time Series.""" 2 | 3 | from sdmetrics.timeseries import ml_scorers 4 | from sdmetrics.timeseries.efficacy.base import TimeSeriesEfficacyMetric 5 | 6 | 7 | class TimeSeriesClassificationEfficacyMetric(TimeSeriesEfficacyMetric): 8 | """TimeSeriesEfficacy metrics for Time Series Classification problems.""" 9 | 10 | 11 | class LSTMClassifierEfficacy(TimeSeriesClassificationEfficacyMetric): 12 | 
"""TimeSeriesEfficacy metric based on an LSTM Classifier.""" 13 | 14 | _scorer = ml_scorers.lstm_classifier 15 | -------------------------------------------------------------------------------- /sdmetrics/timeseries/ml_scorers.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Detection based metrics for Time Series.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.preprocessing import LabelEncoder 6 | 7 | 8 | def _stack(row): 9 | return np.stack(row.to_numpy()) # noqa 10 | 11 | 12 | def _to_numpy(dataframe): 13 | return np.stack(dataframe.apply(_stack, axis=1)) # noqa 14 | 15 | 16 | def _x_to_packed_sequence(X, torch): 17 | sequences = [] 18 | for _, row in X.iterrows(): 19 | sequence = [] 20 | for _, values in row.items(): 21 | sequence.append(values) 22 | 23 | sequences.append(torch.FloatTensor(np.array(sequence)).T) 24 | 25 | return torch.nn.utils.rnn.pack_sequence(sequences, enforce_sorted=False) 26 | 27 | 28 | def lstm_classifier(X_train, X_test, y_train, y_test): 29 | """ML Scorer based on a simple LSTM based NN implemented using torch.""" 30 | try: 31 | import torch 32 | except ImportError: 33 | raise ImportError('Please install torch with `pip install torch`') 34 | 35 | input_dim = len(X_train.columns) 36 | output_dim = len(set(y_train)) 37 | hidden_dim = 32 38 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 39 | 40 | lstm = torch.nn.LSTM(input_dim, hidden_dim).to(device) 41 | linear = torch.nn.Linear(hidden_dim, output_dim).to(device) 42 | 43 | X_train = _x_to_packed_sequence(X_train, torch).to(device) 44 | X_test = _x_to_packed_sequence(X_test, torch).to(device) 45 | 46 | transformer = LabelEncoder() 47 | column = 'target' 48 | y_train = pd.DataFrame(y_train, columns=[column]) 49 | y_test = pd.DataFrame(y_test, columns=[column]) 50 | 51 | y_train = transformer.fit_transform(y_train[column]) 52 | y_train = torch.LongTensor(y_train).to(device) 53 | y_test = torch.LongTensor(transformer.transform(y_test[column])).to(device) 54 | 55 | optimizer = torch.optim.Adam(list(lstm.parameters()) + list(linear.parameters()), lr=1e-2) 56 | 57 | for _ in range(1024): 58 | _, (y, _) = lstm(X_train) 59 | y_pred = linear(y[0]) 60 | loss = torch.nn.functional.cross_entropy(y_pred, y_train) 61 | 62 | optimizer.zero_grad() 63 | loss.backward() 64 | optimizer.step() 65 | 66 | _, (y, _) = lstm(X_test) 67 | y_pred = linear(y[0]) 68 | y_pred = torch.argmax(y_pred, axis=1) 69 | return (y_test == y_pred).sum().item() / len(y_test) 70 | -------------------------------------------------------------------------------- /sdmetrics/warnings.py: -------------------------------------------------------------------------------- 1 | """Warnings for sdmetrics.""" 2 | 3 | 4 | class SDMetricsWarning(RuntimeWarning): 5 | """Class to represent SDMetrics warnings.""" 6 | 7 | 8 | class ConstantInputWarning(SDMetricsWarning): 9 | """Thrown when the input data has all the same values.""" 10 | 11 | def __init__(self, message): 12 | self.message = message 13 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics testing package.""" 2 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing package.""" 2 | 
-------------------------------------------------------------------------------- /tests/integration/column_pairs/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the column_pairs module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/column_pairs/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the column_pairs statistical module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/column_pairs/statistical/test_contingency_similarity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sdmetrics.column_pairs.statistical import ContingencySimilarity 4 | from sdmetrics.demos import load_demo 5 | 6 | 7 | def test_with_num_rows_subsample(): 8 | """Test the metric with `num_rows_subsample`. 9 | 10 | Here the `real_data` and `syntehtic_data` have 218 rows. 11 | """ 12 | # Setup 13 | np.random.seed(42) 14 | real_data, synthetic_data, _ = load_demo('single_table') 15 | real_data = real_data[['degree_type', 'high_spec']] 16 | synthetic_data = synthetic_data[['degree_type', 'high_spec']] 17 | num_rows_subsample = 100 18 | 19 | # Run 20 | result_1 = ContingencySimilarity.compute( 21 | real_data=real_data, 22 | synthetic_data=synthetic_data, 23 | num_rows_subsample=num_rows_subsample, 24 | ) 25 | result_2 = ContingencySimilarity.compute( 26 | real_data=real_data, 27 | synthetic_data=synthetic_data, 28 | num_rows_subsample=num_rows_subsample, 29 | ) 30 | result_entire_data = ContingencySimilarity.compute( 31 | real_data=real_data, 32 | synthetic_data=synthetic_data, 33 | num_rows_subsample=None, 34 | ) 35 | 36 | # Assert 37 | assert result_1 != result_2 38 | assert result_1 != result_entire_data 39 | assert result_2 != result_entire_data 40 | assert np.isclose(result_1, result_entire_data, atol=0.1) 41 | assert np.isclose(result_2, result_entire_data, atol=0.1) 42 | -------------------------------------------------------------------------------- /tests/integration/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the multi_table module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/multi_table/test_multi_single_table.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from sdmetrics.multi_table.multi_single_table import ( 6 | CSTest, 7 | KSComplement, 8 | LogisticDetection, 9 | SVCDetection, 10 | ) 11 | 12 | METRICS = [CSTest, KSComplement, LogisticDetection, SVCDetection] 13 | 14 | 15 | @pytest.fixture 16 | def ones(): 17 | data = pd.DataFrame({ 18 | 'a': [1] * 100, 19 | 'b': [True] * 100, 20 | }) 21 | return {'a': data, 'b': data.copy()} 22 | 23 | 24 | @pytest.fixture 25 | def zeros(): 26 | data = pd.DataFrame({ 27 | 'a': [0] * 100, 28 | 'b': [False] * 100, 29 | }) 30 | return {'a': data, 'b': data.copy()} 31 | 32 | 33 | @pytest.fixture 34 | def real_data(): 35 | data = pd.DataFrame({ 36 | 'a': np.random.normal(size=600), 37 | 'b': np.random.randint(0, 10, size=600), 38 | 'c': ['a', 'b', 'b', 'c', 'c', 'c'] * 100, 39 | 'd': [True, True, True, True, True, False] * 100, 40 | }) 41 | 
return {'a': data, 'b': data.copy()} 42 | 43 | 44 | @pytest.fixture 45 | def good_data(): 46 | data = pd.DataFrame({ 47 | 'a': np.random.normal(loc=0.01, size=600), 48 | 'b': np.random.randint(0, 10, size=600), 49 | 'c': ['a', 'b', 'b', 'b', 'c', 'c'] * 100, 50 | 'd': [True, True, True, True, False, False] * 100, 51 | }) 52 | return {'a': data, 'b': data.copy()} 53 | 54 | 55 | @pytest.fixture 56 | def bad_data(): 57 | data = pd.DataFrame({ 58 | 'a': np.random.normal(loc=5, scale=3, size=600), 59 | 'b': np.random.randint(5, 15, size=600), 60 | 'c': ['a', 'a', 'a', 'a', 'b', 'b'] * 100, 61 | 'd': [True, False, False, False, False, False] * 100, 62 | }) 63 | return {'a': data, 'b': data.copy()} 64 | 65 | 66 | @pytest.mark.parametrize('metric', METRICS) 67 | def test_max(metric, ones): 68 | output = metric.compute(ones, ones.copy()) 69 | normalized = metric.normalize(output) 70 | 71 | assert output == 1 72 | assert normalized == 1 73 | 74 | 75 | @pytest.mark.parametrize('metric', METRICS) 76 | def test_min(metric, ones, zeros): 77 | output = metric.compute(ones, zeros) 78 | normalized = metric.normalize(output) 79 | 80 | assert np.round(output, decimals=5) == 0 81 | assert np.round(normalized, decimals=5) == 0 82 | 83 | 84 | @pytest.mark.parametrize('metric', METRICS) 85 | def test_good(metric, real_data, good_data): 86 | output = metric.compute(real_data, good_data) 87 | normalized = metric.normalize(output) 88 | 89 | assert 0.5 < output <= 1 90 | assert 0.5 < normalized <= 1 91 | 92 | 93 | @pytest.mark.parametrize('metric', METRICS) 94 | def test_bad(metric, real_data, bad_data): 95 | output = metric.compute(real_data, bad_data) 96 | normalized = metric.normalize(output) 97 | 98 | assert 0 <= output < 0.5 99 | assert 0 <= normalized < 0.5 100 | 101 | 102 | @pytest.mark.parametrize('metric', METRICS) 103 | def test_fail(metric): 104 | error_msg = '`real_data` and `synthetic_data` must have the same tables' 105 | with pytest.raises(ValueError, match=error_msg): 106 | metric.compute({'a': None, 'b': None}, {'a': None}) 107 | -------------------------------------------------------------------------------- /tests/integration/multi_table/test_multi_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sdmetrics import compute_metrics 4 | from sdmetrics.demos import load_multi_table_demo 5 | from sdmetrics.multi_table.base import MultiTableMetric 6 | 7 | 8 | def test_compute_all(): 9 | real_data, synthetic_data, metadata = load_multi_table_demo() 10 | 11 | output = compute_metrics( 12 | MultiTableMetric.get_subclasses(), real_data, synthetic_data, metadata=metadata 13 | ) 14 | 15 | assert not pd.isna(output.raw_score.mean()) 16 | 17 | scores = output[output.raw_score.notna()] 18 | 19 | assert scores.raw_score.between(scores.min_value, scores.max_value).all() 20 | -------------------------------------------------------------------------------- /tests/integration/reports/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the reports module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the multi-table reports module.""" 2 | -------------------------------------------------------------------------------- 
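Outside of the `test_compute_all` test shown above, the same `compute_metrics` call is the usual entry point for scoring a whole multi-table dataset at once. A hedged sketch that relies only on the columns the test itself asserts on (`raw_score`, `min_value`, `max_value`):

```python
from sdmetrics import compute_metrics
from sdmetrics.demos import load_multi_table_demo
from sdmetrics.multi_table.base import MultiTableMetric

real_data, synthetic_data, metadata = load_multi_table_demo()

output = compute_metrics(
    MultiTableMetric.get_subclasses(), real_data, synthetic_data, metadata=metadata
)

# Drop metrics that could not be computed and inspect each raw score against its range.
scores = output[output.raw_score.notna()]
print(scores[['raw_score', 'min_value', 'max_value']])
```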
/tests/integration/reports/multi_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the reports multi_table _properties module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_boundary.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import Boundary 7 | 8 | 9 | class TestBoundary: 10 | def test_end_to_end(self): 11 | """Test the ``Boundary`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | boundary = Boundary() 15 | 16 | # Run 17 | result = boundary.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 1.0 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | boundary = Boundary() 27 | num_columns = sum(len(table['columns']) for table in metadata['tables'].values()) 28 | 29 | progress_bar = tqdm(total=num_columns) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = boundary.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 1.0 38 | assert mock_update.call_count == num_columns 39 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_cardinality.py: -------------------------------------------------------------------------------- 1 | """Test multi-table cardinality properties.""" 2 | 3 | import pandas as pd 4 | from plotly.graph_objs._figure import Figure 5 | 6 | from sdmetrics.demos import load_multi_table_demo 7 | from sdmetrics.reports.multi_table._properties import Cardinality 8 | 9 | 10 | def test_cardinality_property(): 11 | """Test the ``Cardinality`` with the multi table demo.""" 12 | # Setup 13 | cardinality_property = Cardinality() 14 | real_data, synthetic_data, metadata = load_multi_table_demo() 15 | 16 | # Run 17 | score = cardinality_property.get_score(real_data, synthetic_data, metadata) 18 | figure = cardinality_property.get_visualization('users') 19 | 20 | # Assert 21 | assert score == 0.95 22 | assert isinstance(figure, Figure) 23 | 24 | 25 | def test_with_multi_foreign_key(): 26 | """Test the ``Cardinality`` with multiple foreign keys.""" 27 | # Setup 28 | real_data = { 29 | 'bank': pd.DataFrame({ 30 | 'primary_key': [1, 2, 3, 4, 5], 31 | 'category': ['a', 'b', 'c', 'd', 'e'], 32 | 'numerical': [1, 2, 3, 4, 5], 33 | }), 34 | 'transactions': pd.DataFrame({ 35 | 'f_key_1': [1, 2, 3, 2, 1], 36 | 'f_key_2': [1, 5, 3, 2, 4], 37 | }), 38 | } 39 | 40 | synthetic_data = { 41 | 'bank': pd.DataFrame({ 42 | 'primary_key': [1, 2, 3, 4, 5], 43 | 'category': ['a', 'b', 'c', 'd', 'e'], 44 | 'numerical': [1, 2, 3, 4, 5], 45 | }), 46 | 'transactions': pd.DataFrame({ 47 | 'f_key_1': [5, 2, 3, 4, 1], 48 | 'f_key_2': [1, 5, 5, 2, 4], 49 | }), 50 | } 51 | 52 | metadata = { 53 | 'tables': { 54 | 'bank': { 55 | 'primary_key': 'primary_key', 56 | 'columns': { 57 | 'primary_key': {'sdtype': 'id'}, 58 | 'category': {'sdtype': 'categorical'}, 59 | 'numerical': {'sdtype': 
'numerical'}, 60 | }, 61 | }, 62 | 'transactions': {'columns': {'f_key_1': {'sdtype': 'id'}, 'f_key_2': {'sdtype': 'id'}}}, 63 | }, 64 | 'relationships': [ 65 | { 66 | 'parent_table_name': 'bank', 67 | 'child_table_name': 'transactions', 68 | 'parent_primary_key': 'primary_key', 69 | 'child_foreign_key': 'f_key_1', 70 | }, 71 | { 72 | 'parent_table_name': 'bank', 73 | 'child_table_name': 'transactions', 74 | 'parent_primary_key': 'primary_key', 75 | 'child_foreign_key': 'f_key_2', 76 | }, 77 | ], 78 | } 79 | 80 | cardinality_property = Cardinality() 81 | 82 | # Run 83 | cardinality_property.get_score(real_data, synthetic_data, metadata) 84 | fig = cardinality_property.get_visualization('bank') 85 | 86 | # Assert 87 | expected_labels = ['transactions (f_key_1) → bank', 'transactions (f_key_2) → bank'] 88 | assert fig.data[0].x.tolist() == expected_labels 89 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_column_pair_trends.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from sdmetrics.demos import load_demo 7 | from sdmetrics.reports.multi_table._properties import ColumnPairTrends 8 | 9 | 10 | class TestColumnPairTrends: 11 | def test_end_to_end(self): 12 | """Test ``ColumnPairTrends`` multi-table property end to end.""" 13 | # Setup 14 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 15 | column_pair_trends = ColumnPairTrends() 16 | 17 | # Run 18 | result = column_pair_trends.get_score(real_data, synthetic_data, metadata) 19 | 20 | # Assert 21 | assert np.isclose(result, 0.45654629583521095, atol=1e-8) 22 | 23 | def test_with_progress_bar(self): 24 | """Test that the progress bar is correctly updated.""" 25 | # Setup 26 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 27 | column_pair_trends = ColumnPairTrends() 28 | num_iter = sum( 29 | int(0.5 * len(table['columns']) * (len(table['columns']) - 1)) 30 | for table in metadata['tables'].values() 31 | ) 32 | 33 | progress_bar = tqdm(total=num_iter) 34 | mock_update = Mock() 35 | progress_bar.update = mock_update 36 | 37 | # Run 38 | result = column_pair_trends.get_score(real_data, synthetic_data, metadata, progress_bar) 39 | 40 | # Assert 41 | assert np.isclose(result, 0.45654629583521095, atol=1e-8) 42 | assert mock_update.call_count == num_iter 43 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_column_shapes.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import ColumnShapes 7 | 8 | 9 | class TestColumnShapes: 10 | def test_end_to_end(self): 11 | """Test the ``ColumnShapes`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | column_shapes = ColumnShapes() 15 | 16 | # Run 17 | result = column_shapes.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 0.7978174603174604 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | 
column_shapes = ColumnShapes() 27 | num_columns = sum(len(table['columns']) for table in metadata['tables'].values()) 28 | 29 | progress_bar = tqdm(total=num_columns) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = column_shapes.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 0.7978174603174604 38 | assert mock_update.call_count == num_columns 39 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_coverage.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import Coverage 7 | 8 | 9 | class TestCoverage: 10 | def test_end_to_end(self): 11 | """Test the ``Coverage`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | coverage = Coverage() 15 | 16 | # Run 17 | result = coverage.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 0.8244218804937835 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | coverage = Coverage() 27 | num_columns = sum(len(table['columns']) for table in metadata['tables'].values()) 28 | 29 | progress_bar = tqdm(total=num_columns) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = coverage.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 0.8244218804937835 38 | assert mock_update.call_count == num_columns 39 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_data_validity.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import DataValidity 7 | 8 | 9 | class TestDataValidity: 10 | def test_end_to_end(self): 11 | """Test the ``DataValidity`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | column_shapes = DataValidity() 15 | 16 | # Run 17 | result = column_shapes.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 1.0 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | column_shapes = DataValidity() 27 | num_columns = sum(len(table['columns']) for table in metadata['tables'].values()) 28 | 29 | progress_bar = tqdm(total=num_columns) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = column_shapes.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 1.0 38 | assert mock_update.call_count == num_columns 39 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_inter_table_trends.py: 
-------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import InterTableTrends 7 | 8 | 9 | class TestInterTableTrends: 10 | def test_end_to_end(self): 11 | """Test ``ColumnPairTrends`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | inter_table_trends = InterTableTrends() 15 | 16 | # Run 17 | result = inter_table_trends.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 0.4416666666666666 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | inter_table_trends = InterTableTrends() 27 | num_iter = sum( 28 | len(metadata['tables'][relationship['parent_table_name']]['columns']) 29 | * len(metadata['tables'][relationship['child_table_name']]['columns']) 30 | for relationship in metadata['relationships'] 31 | ) 32 | 33 | progress_bar = tqdm(total=num_iter) 34 | mock_update = Mock() 35 | progress_bar.update = mock_update 36 | 37 | # Run 38 | result = inter_table_trends.get_score(real_data, synthetic_data, metadata, progress_bar) 39 | 40 | # Assert 41 | assert result == 0.4416666666666666 42 | assert mock_update.call_count == num_iter 43 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_relationship_validity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import RelationshipValidity 7 | 8 | 9 | class TestRelationshipValidity: 10 | def test_end_to_end(self): 11 | """Test the ``RelationshipValidity`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | relationship_validity = RelationshipValidity() 15 | 16 | # Run 17 | result = relationship_validity.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 1.0 21 | 22 | def test_with_progress_bar(self, capsys): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | relationship_validity = RelationshipValidity() 27 | num_relationship = 2 28 | 29 | progress_bar = tqdm(total=num_relationship, file=sys.stdout) 30 | 31 | # Run 32 | result = relationship_validity.get_score(real_data, synthetic_data, metadata, progress_bar) 33 | progress_bar.close() 34 | captured = capsys.readouterr() 35 | output = captured.out 36 | 37 | # Assert 38 | assert result == 1.0 39 | assert '100%' in output 40 | assert f'{num_relationship}/{num_relationship}' in output 41 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_structure.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from sdmetrics.demos import load_demo 7 | from sdmetrics.reports.multi_table._properties import Structure 8 | 9 | 10 | class TestStructure: 11 | def test_end_to_end(self): 12 
| """Test Structure multi-table.""" 13 | # Setup 14 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 15 | structure = Structure() 16 | 17 | # Run 18 | result = structure.get_score(real_data, synthetic_data, metadata) 19 | 20 | # Assert 21 | assert result == 1.0 22 | 23 | expected_details = pd.DataFrame({ 24 | 'Table': ['users', 'sessions', 'transactions'], 25 | 'Metric': ['TableStructure', 'TableStructure', 'TableStructure'], 26 | 'Score': [1.0, 1.0, 1.0], 27 | }) 28 | pd.testing.assert_frame_equal(structure.details, expected_details) 29 | 30 | def test_with_progress_bar(self): 31 | """Test that the progress bar is correctly updated.""" 32 | # Setup 33 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 34 | structure = Structure() 35 | num_tables = len(metadata['tables']) 36 | 37 | progress_bar = tqdm(total=num_tables) 38 | mock_update = Mock() 39 | progress_bar.update = mock_update 40 | 41 | # Run 42 | result = structure.get_score(real_data, synthetic_data, metadata, progress_bar) 43 | 44 | # Assert 45 | assert result == 1.0 46 | assert mock_update.call_count == num_tables 47 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_synthesis.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import Synthesis 7 | 8 | 9 | class TestSynthesis: 10 | def test_end_to_end(self): 11 | """Test Synthesis multi-table.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | synthesis = Synthesis() 15 | 16 | # Run 17 | result = synthesis.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 0.6333333333333333 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | synthesis = Synthesis() 27 | num_tables = len(metadata['tables']) 28 | 29 | progress_bar = tqdm(total=num_tables) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = synthesis.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 0.6333333333333333 38 | assert mock_update.call_count == num_tables 39 | -------------------------------------------------------------------------------- /tests/integration/reports/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single-table reports module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/reports/single_table/_properties/test_boundary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sdmetrics.demos import load_demo 5 | from sdmetrics.reports.single_table._properties import Boundary 6 | 7 | 8 | class TestBoundary: 9 | def test_get_score(self): 10 | """Test the ``get_score`` method.""" 11 | # Setup 12 | real_data, synthetic_data, metadata = load_demo(modality='single_table') 13 | boundary_property = Boundary() 14 | 15 | # Run 16 | score = boundary_property.get_score(real_data, synthetic_data, metadata) 17 | 18 | # Assert 19 
| assert score == 1.0 20 | expected_details = pd.DataFrame({ 21 | 'Column': [ 22 | 'start_date', 23 | 'end_date', 24 | 'salary', 25 | 'duration', 26 | 'high_perc', 27 | 'second_perc', 28 | 'degree_perc', 29 | 'experience_years', 30 | 'employability_perc', 31 | 'mba_perc', 32 | ], 33 | 'Metric': ['BoundaryAdherence'] * 10, 34 | 'Score': [1.0] * 10, 35 | }) 36 | 37 | pd.testing.assert_frame_equal(boundary_property.details, expected_details) 38 | 39 | def test_get_score_error(self): 40 | """Test the ``get_score`` method with errors.""" 41 | # Setup 42 | real_data, synthetic_data, metadata = load_demo(modality='single_table') 43 | real_data['start_date'].iloc[0] = 0 44 | real_data['employability_perc'].iloc[2] = 'a' 45 | real_data['salary'] = np.nan 46 | 47 | boundary_property = Boundary() 48 | 49 | # Run 50 | score = boundary_property.get_score(real_data, synthetic_data, metadata) 51 | 52 | # Assert 53 | expected_message_1 = ( 54 | "TypeError: '<=' not supported between instances of 'int' and 'Timestamp'" 55 | ) 56 | expected_message_2 = 'InvalidDataError: All NaN values in real data.' 57 | expected_message_3 = "TypeError: '<=' not supported between instances of 'float' and 'str'" 58 | 59 | details = boundary_property.details 60 | details_nan = details.loc[pd.isna(details['Score'])] 61 | column_names_nan = details_nan['Column'].tolist() 62 | error_messages = details_nan['Error'].tolist() 63 | assert column_names_nan == ['start_date', 'salary', 'employability_perc'] 64 | assert error_messages[0] == expected_message_1 65 | assert error_messages[1] == expected_message_2 66 | assert error_messages[2] == expected_message_3 67 | assert score == 1.0 68 | -------------------------------------------------------------------------------- /tests/integration/reports/single_table/_properties/test_structure.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sdmetrics.demos import load_demo 5 | from sdmetrics.reports.single_table._properties import Structure 6 | 7 | 8 | class TestStructure: 9 | def test_get_score(self): 10 | """Test the ``get_score`` method.""" 11 | # Setup 12 | real_data, synthetic_data, metadata = load_demo('single_table') 13 | 14 | # Run 15 | synthesis_property = Structure() 16 | score = synthesis_property.get_score(real_data, synthetic_data, metadata) 17 | 18 | # Assert 19 | assert score == 1.0 20 | 21 | expected_details = pd.DataFrame( 22 | { 23 | 'Metric': 'TableStructure', 24 | 'Score': 1.0, 25 | }, 26 | index=[0], 27 | ) 28 | 29 | pd.testing.assert_frame_equal(synthesis_property.details, expected_details) 30 | 31 | def test_get_score_error(self): 32 | """Test the ``get_score`` method with an error. 33 | Give an empty synthetic data to get an error. 
34 | """ 35 | # Setup 36 | real_data, _, metadata = load_demo('single_table') 37 | 38 | # Run 39 | synthesis_property = Structure() 40 | score = synthesis_property.get_score(real_data.iloc[:20], [], metadata) 41 | 42 | # Assert 43 | assert pd.isna(score) 44 | 45 | expected_details = pd.DataFrame( 46 | { 47 | 'Metric': 'TableStructure', 48 | 'Score': np.nan, 49 | 'Error': "AttributeError: 'list' object has no attribute 'columns'", 50 | }, 51 | index=[0], 52 | ) 53 | 54 | pd.testing.assert_frame_equal(synthesis_property.details, expected_details) 55 | -------------------------------------------------------------------------------- /tests/integration/reports/single_table/_properties/test_synthesis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sdmetrics.demos import load_demo 5 | from sdmetrics.reports.single_table._properties import Synthesis 6 | 7 | 8 | class TestSynthesis: 9 | def test_get_score(self): 10 | """Test the ``get_score`` method.""" 11 | # Setup 12 | real_data, _, metadata = load_demo('single_table') 13 | 14 | # Run 15 | synthesis_property = Synthesis() 16 | score = synthesis_property.get_score(real_data.iloc[:20], real_data.iloc[10:30], metadata) 17 | 18 | # Assert 19 | assert score == 0.5 20 | 21 | def test_get_score_error(self): 22 | """Test the ``get_score`` method with an error. 23 | 24 | Give an empty synthetic data to get an error. 25 | """ 26 | # Setup 27 | real_data, _, metadata = load_demo('single_table') 28 | 29 | # Run 30 | synthesis_property = Synthesis() 31 | score = synthesis_property.get_score(real_data.iloc[:20], [], metadata) 32 | 33 | # Assert 34 | assert pd.isna(score) 35 | 36 | expected_details = pd.DataFrame( 37 | { 38 | 'Metric': 'NewRowSynthesis', 39 | 'Score': np.nan, 40 | 'Num Matched Rows': np.nan, 41 | 'Num New Rows': np.nan, 42 | 'Error': "AttributeError: 'list' object has no attribute 'columns'", 43 | }, 44 | index=[0], 45 | ) 46 | 47 | pd.testing.assert_frame_equal(synthesis_property.details, expected_details) 48 | -------------------------------------------------------------------------------- /tests/integration/single_column/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_column module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_column/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_column module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_column/statistical/test_cstest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from sdmetrics.single_column.statistical.cstest import CSTest 6 | 7 | 8 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 9 | def test_max(array_like): 10 | data = array_like(['a', 'b', 'b', 'c', 'c', 'c'] * 100) 11 | output = CSTest.compute(data, data) 12 | normalized = CSTest.normalize(output) 13 | 14 | assert output == 1 15 | assert normalized == 1 16 | 17 | 18 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 19 | def test_min(array_like): 20 | real = array_like(['a', 'b', 'b', 'c', 'c', 'c'] * 100) 21 | synth = array_like(['d', 'e', 'e', 'f', 'f', 'f'] * 
100) 22 | output = CSTest.compute(real, synth) 23 | normalized = CSTest.normalize(output) 24 | 25 | assert output == 0 26 | assert normalized == 0 27 | 28 | 29 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 30 | def test_good(array_like): 31 | real = array_like(['a', 'b', 'b', 'c', 'c', 'c'] * 100) 32 | synth = array_like(['a', 'b', 'b', 'b', 'c', 'c'] * 100) 33 | output = CSTest.compute(real, synth) 34 | normalized = CSTest.normalize(output) 35 | 36 | assert 0.5 < output <= 1.0 37 | assert 0.5 < normalized <= 1.0 38 | 39 | 40 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 41 | def test_bad(array_like): 42 | real = array_like(['a', 'b', 'b', 'c', 'c', 'c'] * 100) 43 | synth = array_like(['a', 'a', 'a', 'a', 'b', 'c'] * 100) 44 | output = CSTest.compute(real, synth) 45 | normalized = CSTest.normalize(output) 46 | 47 | assert 0.0 <= output < 0.5 48 | assert 0.0 <= normalized < 0.5 49 | -------------------------------------------------------------------------------- /tests/integration/single_column/statistical/test_kscomplement.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from sdmetrics.single_column.statistical.kscomplement import KSComplement 6 | 7 | 8 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 9 | def test_max(array_like): 10 | data = array_like(np.random.normal(size=1000)) 11 | output = KSComplement.compute(data, data) 12 | normalized = KSComplement.normalize(output) 13 | 14 | assert output == 1 15 | assert normalized == 1 16 | 17 | 18 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 19 | def test_min(array_like): 20 | real = array_like(np.random.normal(size=1000)) 21 | synth = array_like(np.random.normal(loc=1000, scale=10, size=1000)) 22 | output = KSComplement.compute(real, synth) 23 | normalized = KSComplement.normalize(output) 24 | 25 | assert output == 0 26 | assert normalized == 0 27 | 28 | 29 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 30 | def test_good(array_like): 31 | real = array_like(np.random.normal(size=1000)) 32 | synth = array_like(np.random.normal(loc=0.1, size=1000)) 33 | output = KSComplement.compute(real, synth) 34 | normalized = KSComplement.normalize(output) 35 | 36 | assert 0.5 < output <= 1.0 37 | assert 0.5 < normalized <= 1.0 38 | 39 | 40 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 41 | def test_bad(array_like): 42 | real = array_like(np.random.normal(size=1000)) 43 | synth = array_like(np.random.normal(loc=3, scale=3, size=1000)) 44 | output = KSComplement.compute(real, synth) 45 | normalized = KSComplement.normalize(output) 46 | 47 | assert 0.0 <= output < 0.5 48 | assert 0.0 <= normalized < 0.5 49 | 50 | 51 | def test_one_float_value(): 52 | """Test KSComplement.compute when both data have the same float values GH#652.""" 53 | # Setup 54 | real = pd.Series([0.3 - 0.2]) 55 | synth = pd.Series([0.2 - 0.1]) 56 | 57 | # Run 58 | output = KSComplement.compute(real, synth) 59 | 60 | # Assert 61 | assert output == 1 62 | -------------------------------------------------------------------------------- /tests/integration/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_table module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_table/data_augmentation/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdv-dev/SDMetrics/d52733646855d9d4606f95235f5c65e10afdc439/tests/integration/single_table/data_augmentation/__init__.py -------------------------------------------------------------------------------- /tests/integration/single_table/efficacy/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_table efficacy module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_table/efficacy/test_binary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from sklearn.datasets import load_breast_cancer 5 | 6 | from sdmetrics.single_table.efficacy.binary import ( 7 | BinaryAdaBoostClassifier, 8 | BinaryDecisionTreeClassifier, 9 | BinaryLogisticRegression, 10 | BinaryMLPClassifier, 11 | ) 12 | 13 | METRICS = [ 14 | BinaryAdaBoostClassifier, 15 | BinaryDecisionTreeClassifier, 16 | BinaryLogisticRegression, 17 | BinaryMLPClassifier, 18 | ] 19 | 20 | 21 | @pytest.fixture 22 | def test_data(): 23 | return load_breast_cancer(as_frame=True).frame 24 | 25 | 26 | @pytest.fixture 27 | def good_data(): 28 | breast_cancer = load_breast_cancer(as_frame=True) 29 | data = breast_cancer.data 30 | stds = data.std(axis=0) * 2.5 31 | columns = len(data.columns) 32 | rows = len(data) 33 | zeros = np.zeros(columns) 34 | noise = np.random.normal(loc=zeros, scale=stds, size=(rows, columns)) 35 | good = data + noise 36 | good['target'] = breast_cancer.target 37 | return good 38 | 39 | 40 | @pytest.fixture 41 | def bad_data(): 42 | breast_cancer = load_breast_cancer(as_frame=True) 43 | data = breast_cancer.data 44 | stds = data.std(axis=0) 45 | mus = data.mean(axis=0) 46 | columns = len(data.columns) 47 | rows = len(data) 48 | bad = np.random.normal(loc=mus, scale=stds, size=(rows, columns)) 49 | bad = pd.DataFrame(bad, columns=data.columns) 50 | bad['target'] = breast_cancer.target 51 | 52 | return bad 53 | 54 | 55 | @pytest.mark.parametrize('metric', METRICS) 56 | def test_rank(metric, test_data, bad_data, good_data): 57 | bad = metric.compute(test_data, bad_data, target='target') 58 | good = metric.compute(test_data, good_data, target='target') 59 | test = metric.compute(test_data, test_data, target='target') 60 | 61 | normalized_bad = metric.normalize(bad) 62 | normalized_good = metric.normalize(good) 63 | normalized_test = metric.normalize(test) 64 | 65 | assert metric.min_value <= bad < good <= test <= metric.max_value 66 | assert 0.0 <= normalized_bad < normalized_good <= normalized_test <= 1.0 67 | 68 | 69 | @pytest.mark.parametrize('metric', METRICS) 70 | def test_rank_object(metric, test_data, bad_data, good_data): 71 | bad = metric.compute(test_data, bad_data, target='target') 72 | good = metric.compute(test_data, good_data, target='target') 73 | test = metric.compute(test_data, test_data, target='target') 74 | 75 | normalized_bad = metric.normalize(bad) 76 | normalized_good = metric.normalize(good) 77 | normalized_test = metric.normalize(test) 78 | 79 | assert metric.min_value <= bad < good <= test <= metric.max_value 80 | assert 0.0 <= normalized_bad < normalized_good <= normalized_test <= 1.0 81 | -------------------------------------------------------------------------------- /tests/integration/single_table/efficacy/test_detection.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sdmetrics import load_demo 4 | from sdmetrics.single_table.detection import LogisticDetection, SVCDetection 5 | 6 | METRICS = [LogisticDetection, SVCDetection] 7 | 8 | 9 | @pytest.mark.parametrize('metric', METRICS) 10 | def test_primary_key(metric): 11 | """Test that primary keys don't affect detection metric.""" 12 | real_data_with_primary_key, synthetic_data_with_primary_key, metadata = load_demo( 13 | modality='single_table' 14 | ) 15 | 16 | real_data_sin_primary_key = real_data_with_primary_key.drop(metadata['primary_key'], axis=1) 17 | synthetic_data_sin_primary_key = synthetic_data_with_primary_key.drop( 18 | metadata['primary_key'], axis=1 19 | ) 20 | 21 | test_with_primary_key = metric.compute( 22 | real_data_with_primary_key, synthetic_data_with_primary_key, metadata 23 | ) 24 | test_sin_primary_key = metric.compute(real_data_sin_primary_key, synthetic_data_sin_primary_key) 25 | 26 | normalized_with_primary_key = metric.normalize(test_with_primary_key) 27 | normalized_sin_primary_key = metric.normalize(test_sin_primary_key) 28 | 29 | # Approximately equal because detection metrics vary when receiving the same data. 30 | assert pytest.approx(normalized_with_primary_key, abs=0.06) == normalized_sin_primary_key 31 | -------------------------------------------------------------------------------- /tests/integration/single_table/efficacy/test_multiclass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from sklearn.datasets import load_wine 5 | 6 | from sdmetrics.single_table.efficacy.multiclass import ( 7 | MulticlassDecisionTreeClassifier, 8 | MulticlassMLPClassifier, 9 | ) 10 | 11 | METRICS = [ 12 | MulticlassDecisionTreeClassifier, 13 | MulticlassMLPClassifier, 14 | ] 15 | 16 | 17 | @pytest.fixture 18 | def test_data(): 19 | return load_wine(as_frame=True).frame 20 | 21 | 22 | @pytest.fixture 23 | def good_data(): 24 | wine = load_wine(as_frame=True) 25 | data = wine.data 26 | stds = data.std(axis=0) * 2.5 27 | columns = len(data.columns) 28 | rows = len(data) 29 | zeros = np.zeros(columns) 30 | noise = np.random.normal(loc=zeros, scale=stds, size=(rows, columns)) 31 | good = data + noise 32 | good['target'] = wine.target 33 | return good 34 | 35 | 36 | @pytest.fixture 37 | def bad_data(): 38 | wine = load_wine(as_frame=True) 39 | data = wine.data 40 | stds = data.std(axis=0) 41 | mus = data.mean(axis=0) 42 | columns = len(data.columns) 43 | rows = len(data) 44 | bad = np.random.normal(loc=mus, scale=stds, size=(rows, columns)) 45 | bad = pd.DataFrame(bad, columns=data.columns) 46 | bad['target'] = wine.target 47 | 48 | return bad 49 | 50 | 51 | @pytest.mark.parametrize('metric', METRICS) 52 | def test_rank(metric, test_data, good_data, bad_data): 53 | bad = metric.compute(test_data, bad_data, target='target') 54 | good = metric.compute(test_data, good_data, target='target') 55 | test = metric.compute(test_data, test_data, target='target') 56 | 57 | normalized_bad = metric.normalize(bad) 58 | normalized_good = metric.normalize(good) 59 | normalized_test = metric.normalize(test) 60 | 61 | assert metric.min_value <= bad < good < test <= metric.max_value 62 | assert 0.0 <= normalized_bad < normalized_good <= normalized_test <= 1.0 63 | -------------------------------------------------------------------------------- 
/tests/integration/single_table/efficacy/test_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from sklearn.datasets import load_diabetes 5 | 6 | from sdmetrics.single_table.efficacy.regression import LinearRegression, MLPRegressor 7 | 8 | METRICS = [ 9 | LinearRegression, 10 | MLPRegressor, 11 | ] 12 | 13 | 14 | @pytest.fixture 15 | def test_data(): 16 | boston = load_diabetes() 17 | data = pd.DataFrame(boston.data, columns=boston.feature_names) 18 | data['target'] = boston.target 19 | return data 20 | 21 | 22 | @pytest.fixture 23 | def good_data(): 24 | boston = load_diabetes() 25 | data = pd.DataFrame(boston.data, columns=boston.feature_names) 26 | 27 | columns = len(data.columns) 28 | rows = len(data) 29 | data = boston.data 30 | 31 | stds = data.std(axis=0) / 4 32 | zeros = np.zeros(columns) 33 | noise = np.random.normal(loc=zeros, scale=stds, size=(rows, columns)) 34 | good = data + noise * 4 35 | 36 | good = pd.DataFrame(good, columns=boston.feature_names) 37 | good['target'] = boston.target 38 | return good 39 | 40 | 41 | @pytest.fixture 42 | def bad_data(): 43 | boston = load_diabetes() 44 | data = pd.DataFrame(boston.data, columns=boston.feature_names) 45 | 46 | stds = data.std(axis=0) 47 | mus = data.mean(axis=0) 48 | columns = len(data.columns) 49 | rows = len(data) 50 | bad = np.random.normal(loc=mus, scale=stds, size=(rows, columns)) 51 | bad = pd.DataFrame(bad, columns=data.columns) 52 | 53 | bad['target'] = boston.target 54 | 55 | return bad 56 | 57 | 58 | @pytest.mark.parametrize('metric', METRICS) 59 | def test_rank(metric, test_data, good_data, bad_data): 60 | bad = metric.compute(test_data, bad_data, target='target') 61 | good = metric.compute(test_data, good_data, target='target') 62 | test = metric.compute(test_data, test_data, target='target') 63 | 64 | normalized_bad = metric.normalize(bad) 65 | normalized_good = metric.normalize(good) 66 | normalized_test = metric.normalize(test) 67 | 68 | assert metric.min_value <= bad < good < test <= metric.max_value 69 | assert 0.0 <= normalized_bad <= normalized_good <= normalized_test <= 1.0 70 | -------------------------------------------------------------------------------- /tests/integration/single_table/privacy/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_table privacy module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_table/privacy/test_dcr_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.demos import load_single_table_demo 6 | from sdmetrics.single_table.privacy.dcr_utils import ( 7 | calculate_dcr, 8 | ) 9 | 10 | 11 | def test_calculate_dcr(): 12 | """Test calculate_dcr with numerical values.""" 13 | # Setup 14 | real_data_num = [0, 5, 8, 9, 10] 15 | synthetic_data_num_diff = [3, 5] 16 | 17 | real_df = pd.DataFrame({'num_col': real_data_num}) 18 | synthetic_df_diff = pd.DataFrame({ 19 | 'num_col': synthetic_data_num_diff, 20 | }) 21 | metadata = {'columns': {'num_col': {'sdtype': 'numerical'}}} 22 | 23 | # Run 24 | result = calculate_dcr(reference_dataset=real_df, dataset=synthetic_df_diff, metadata=metadata) 25 | 26 | # Assert 27 | expected_result = pd.Series([0.2, 0.0]) 28 | 
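# Worked arithmetic for the expected values above: the real column spans 0 to 10, so
# distances are scaled by that range. The closest real value to 3 is 5, giving
# |3 - 5| / 10 = 0.2, while 5 matches a real record exactly, giving 0.0.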
pd.testing.assert_series_equal(result, expected_result) 29 | 30 | 31 | def test_calculate_dcr_with_zero_col_range(): 32 | """Test calculate_dcr with a range of zero.""" 33 | # Setup 34 | real_data_num = [5.0] 35 | real_data_date = [datetime(2025, 1, 5)] 36 | synthetic_data_num_diff = [1, 2, 3, 5, 5] 37 | synthetic_data_date_diff = [ 38 | datetime(2025, 1, 1), 39 | datetime(2025, 1, 2), 40 | datetime(2025, 1, 3), 41 | datetime(2025, 1, 4), 42 | datetime(2025, 1, 5), 43 | ] 44 | 45 | real_df = pd.DataFrame({'num_col': real_data_num, 'date_col': real_data_date}) 46 | synthetic_df_diff = pd.DataFrame({ 47 | 'num_col': synthetic_data_num_diff, 48 | 'date_col': synthetic_data_date_diff, 49 | }) 50 | metadata = {'columns': {'num_col': {'sdtype': 'numerical'}, 'date_col': {'sdtype': 'datetime'}}} 51 | 52 | # Run 53 | result = calculate_dcr(reference_dataset=real_df, dataset=synthetic_df_diff, metadata=metadata) 54 | 55 | # Assert 56 | expected_result = pd.Series([1.0, 1.0, 1.0, 0.5, 0.0]) 57 | pd.testing.assert_series_equal(result, expected_result) 58 | 59 | 60 | def test_calculate_dcr_chunked(): 61 | """Test calculate_dcr with chunking calculations.""" 62 | # Setup 63 | real_data, synthetic_data, metadata = load_single_table_demo() 64 | 65 | # Run 66 | result = calculate_dcr( 67 | reference_dataset=real_data, 68 | dataset=synthetic_data, 69 | metadata=metadata, 70 | chunk_size=1000, 71 | ) 72 | chunked_result = calculate_dcr( 73 | reference_dataset=real_data, 74 | dataset=synthetic_data, 75 | metadata=metadata, 76 | chunk_size=50, 77 | ) 78 | 79 | # Assert 80 | assert len(result) == len(real_data) 81 | pd.testing.assert_series_equal(result, chunked_result) 82 | -------------------------------------------------------------------------------- /tests/integration/single_table/test_gaussian_mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from sdmetrics.single_table.gaussian_mixture import GMLogLikelihood 6 | 7 | 8 | @pytest.fixture 9 | def ones(): 10 | return pd.DataFrame({ 11 | 'a': [1] * 300, 12 | 'b': [True] * 300, 13 | 'c': [1.0] * 300, 14 | 'd': [True] * 300, 15 | }) 16 | 17 | 18 | @pytest.fixture 19 | def zeros(): 20 | return pd.DataFrame({ 21 | 'a': [0] * 300, 22 | 'b': [False] * 300, 23 | 'c': [0.0] * 300, 24 | 'd': [False] * 300, 25 | }) 26 | 27 | 28 | @pytest.fixture 29 | def real_data(): 30 | return pd.DataFrame({ 31 | 'a': np.random.normal(size=1800), 32 | 'b': np.random.randint(0, 10, size=1800), 33 | 'c': ['a', 'b', 'b', 'c', 'c', 'c'] * 300, 34 | 'd': [True, True, True, True, True, False] * 300, 35 | }) 36 | 37 | 38 | @pytest.fixture 39 | def good_data(): 40 | return pd.DataFrame({ 41 | 'a': np.random.normal(loc=0.01, size=1800), 42 | 'b': np.random.randint(0, 10, size=1800), 43 | 'c': ['a', 'b', 'b', 'b', 'c', 'c'] * 300, 44 | 'd': [True, True, True, True, False, False] * 300, 45 | }) 46 | 47 | 48 | @pytest.fixture 49 | def bad_data(): 50 | return pd.DataFrame({ 51 | 'a': np.random.normal(loc=5, scale=3, size=1800), 52 | 'b': np.random.randint(5, 15, size=1800), 53 | 'c': ['a', 'a', 'a', 'a', 'b', 'b'] * 300, 54 | 'd': [True, False, False, False, False, False] * 300, 55 | }) 56 | 57 | 58 | def test_rank(ones, zeros, real_data, good_data, bad_data): 59 | worst = GMLogLikelihood.compute(ones, zeros) 60 | normalized_worst = GMLogLikelihood.normalize(worst) 61 | best = GMLogLikelihood.compute(ones, ones) 62 | normalized_best = GMLogLikelihood.normalize(best) 63 | 64 | 
assert GMLogLikelihood.min_value <= worst < best <= GMLogLikelihood.max_value 65 | assert 0.0 <= normalized_worst < normalized_best <= 1.0 66 | -------------------------------------------------------------------------------- /tests/integration/test_base.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.single_table import SingleTableMetric 2 | 3 | SINGLE_TABLE_METRICS = [ 4 | 'BNLikelihood', 5 | 'BNLogLikelihood', 6 | 'LogisticDetection', 7 | 'SVCDetection', 8 | 'BinaryDecisionTreeClassifier', 9 | 'BinaryAdaBoostClassifier', 10 | 'BinaryLogisticRegression', 11 | 'BinaryMLPClassifier', 12 | 'MulticlassDecisionTreeClassifier', 13 | 'MulticlassMLPClassifier', 14 | 'LinearRegression', 15 | 'MLPRegressor', 16 | 'GMLogLikelihood', 17 | 'CSTest', 18 | 'KSComplement', 19 | 'StatisticSimilarity', 20 | 'BoundaryAdherence', 21 | 'MissingValueSimilarity', 22 | 'CategoryCoverage', 23 | 'TVComplement', 24 | 'RangeCoverage', 25 | 'CategoricalCAP', 26 | 'CategoricalZeroCAP', 27 | 'CategoricalGeneralizedCAP', 28 | 'CategoricalNB', 29 | 'CategoricalKNN', 30 | 'CategoricalRF', 31 | 'CategoricalSVM', 32 | 'CategoricalEnsemble', 33 | 'NumericalLR', 34 | 'NumericalMLP', 35 | 'NumericalSVR', 36 | 'NumericalRadiusNearestNeighbor', 37 | 'ContinuousKLDivergence', 38 | 'DiscreteKLDivergence', 39 | 'ContingencySimilarity', 40 | 'CorrelationSimilarity', 41 | 'NewRowSynthesis', 42 | ] 43 | 44 | 45 | def test_get_single_table_subclasses(): 46 | single_table_metrics = SingleTableMetric.get_subclasses() 47 | for single_table_metric in SINGLE_TABLE_METRICS: 48 | assert single_table_metric in single_table_metrics 49 | -------------------------------------------------------------------------------- /tests/integration/test_property.py: -------------------------------------------------------------------------------- 1 | """Tests that are common to all properties.""" 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from sdmetrics.demos import load_demo 7 | from sdmetrics.reports.multi_table import _properties as multi_table_properties 8 | from sdmetrics.reports.single_table import _properties as single_table_properties 9 | 10 | REAL_DATA_ST, SYNTHETIC_DATA_ST, METADATA_ST = load_demo(modality='single_table') 11 | REAL_DATA_MT, SYNTHETIC_DATA_MT, METADATA_MT = load_demo(modality='multi_table') 12 | SINGLE_TABLE_PROPERTIES = [ 13 | property 14 | for property_name, property in vars(single_table_properties).items() 15 | if property_name != 'BaseSingleTableProperty' and isinstance(property, type) 16 | ] 17 | MULTI_TABLE_PROPERTIES = [ 18 | property 19 | for property_name, property in vars(multi_table_properties).items() 20 | if property_name != 'BaseMultiTableProperty' and isinstance(property, type) 21 | ] 22 | 23 | 24 | @pytest.mark.parametrize('property', SINGLE_TABLE_PROPERTIES) 25 | def test_shuffling_data_single_table(property): 26 | """Test the property score is the same when shuffling the data for single-table.""" 27 | # Setup 28 | property_instance = property() 29 | 30 | # Run 31 | score = property_instance.get_score(REAL_DATA_ST, SYNTHETIC_DATA_ST, METADATA_ST) 32 | score_shuffled = property_instance.get_score( 33 | REAL_DATA_ST.sample(frac=1), SYNTHETIC_DATA_ST.sample(frac=1), METADATA_ST 34 | ) 35 | 36 | # Assert 37 | assert score_shuffled == score 38 | 39 | 40 | @pytest.mark.parametrize('property', MULTI_TABLE_PROPERTIES) 41 | def test_shuffling_data_multi_table(property): 42 | """Test the property score is the same when shuffling the data for multi-table.""" 43 | 
# Setup 44 | property_instance = property() 45 | real_data_shuffled = { 46 | table_name: table.sample(frac=1) for table_name, table in REAL_DATA_MT.items() 47 | } 48 | synthetic_data_shuffled = { 49 | table_name: SYNTHETIC_DATA_MT[table_name].sample(frac=1) for table_name in SYNTHETIC_DATA_MT 50 | } 51 | 52 | # Run 53 | score = property_instance.get_score(REAL_DATA_MT, SYNTHETIC_DATA_MT, METADATA_MT) 54 | score_shuffled = property_instance.get_score( 55 | real_data_shuffled, synthetic_data_shuffled, METADATA_MT 56 | ) 57 | 58 | # Assert 59 | assert np.isclose(score, score_shuffled, rtol=1e-12) 60 | -------------------------------------------------------------------------------- /tests/integration/timeseries/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the timeseries module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/timeseries/efficacy/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the timeseries efficacy module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/timeseries/efficacy/test_classification.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sdmetrics.demos import load_timeseries_demo 4 | from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy 5 | 6 | METRICS = [ 7 | LSTMClassifierEfficacy, 8 | ] 9 | 10 | 11 | @pytest.mark.parametrize('metric', METRICS) 12 | def test_rank(metric): 13 | real_data, synthetic_data, metadata = load_timeseries_demo() 14 | 15 | real_score = metric.compute(real_data, real_data, metadata, target='region') 16 | synthetic_score = metric.compute(real_data, synthetic_data, metadata, target='region') 17 | 18 | normalized_real_score = metric.normalize(real_score) 19 | normalized_synthetic_score = metric.normalize(synthetic_score) 20 | 21 | assert metric.min_value <= synthetic_score <= real_score <= metric.max_value 22 | assert 0.0 <= normalized_synthetic_score <= normalized_real_score <= 1.0 23 | -------------------------------------------------------------------------------- /tests/test_tasks.py: -------------------------------------------------------------------------------- 1 | """Tests for the ``tasks.py`` file.""" 2 | 3 | from tasks import _get_minimum_versions 4 | 5 | 6 | def test_get_minimum_versions(): 7 | """Test the ``_get_minimum_versions`` function. 8 | 9 | The function should return the minimum versions of the dependencies for the given Python version. 10 | If a library is linked to a URL, the minimum version should be the URL. 
11 | """ 12 | # Setup 13 | dependencies = [ 14 | "numpy>=1.20.0,<2;python_version<'3.10'", 15 | "numpy>=1.23.3,<2;python_version>='3.10'", 16 | "pandas>=1.2.0,<2;python_version<'3.10'", 17 | "pandas>=1.3.0,<2;python_version>='3.10'", 18 | 'humanfriendly>=8.2,<11', 19 | 'pandas @ git+https://github.com/pandas-dev/pandas.git@master', 20 | ] 21 | 22 | # Run 23 | minimum_versions_39 = _get_minimum_versions(dependencies, '3.9') 24 | minimum_versions_310 = _get_minimum_versions(dependencies, '3.10') 25 | 26 | # Assert 27 | expected_versions_39 = [ 28 | 'numpy==1.20.0', 29 | 'git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', 30 | 'humanfriendly==8.2', 31 | ] 32 | expected_versions_310 = [ 33 | 'numpy==1.23.3', 34 | 'git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', 35 | 'humanfriendly==8.2', 36 | ] 37 | 38 | assert minimum_versions_39 == expected_versions_39 39 | assert minimum_versions_310 == expected_versions_310 40 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing package.""" 2 | -------------------------------------------------------------------------------- /tests/unit/column_pairs/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the column pairs module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/column_pairs/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the column pairs statistical metrics.""" 2 | -------------------------------------------------------------------------------- /tests/unit/column_pairs/statistical/test_cardinality_boundary_adherence.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sdmetrics.column_pairs.statistical import CardinalityBoundaryAdherence 4 | 5 | 6 | class TestCardinalityBoundaryAdherence: 7 | def test_compute_breakdown(self): 8 | """Test the ``compute_breakdown`` method.""" 9 | # Setup 10 | real_parent_keys = pd.Series([1, 2, 3, 4, 5]) 11 | real_foreign_keys = pd.Series([1, 1, 2, 3, 4, 5, 5]) 12 | real_data = (real_parent_keys, real_foreign_keys) 13 | synthetic_parent_keys = pd.Series([1, 2, 3, 4, 5]) 14 | synthetic_foreign_keys = pd.Series([2, 2, 2, 3, 4, 5]) 15 | synthetic_data = (synthetic_parent_keys, synthetic_foreign_keys) 16 | 17 | metric = CardinalityBoundaryAdherence() 18 | 19 | # Run 20 | result = metric.compute_breakdown(real_data, synthetic_data) 21 | 22 | # Assert 23 | assert result == {'score': 0.6} 24 | 25 | def test_compute(self): 26 | """Test the ``compute`` method.""" 27 | # Setup 28 | real_parent_keys = pd.Series([1, 2, 3, 4, 5]) 29 | real_foreign_keys = pd.Series([1, 1, 2, 3, 4, 5, 5]) 30 | real_data = (real_parent_keys, real_foreign_keys) 31 | synthetic_parent_keys = pd.Series([1, 2, 3, 4, 5]) 32 | synthetic_foreign_keys = pd.Series([2, 2, 2, 3, 4, 5]) 33 | synthetic_data = (synthetic_parent_keys, synthetic_foreign_keys) 34 | 35 | metric = CardinalityBoundaryAdherence() 36 | 37 | # Run 38 | result = metric.compute(real_data, synthetic_data) 39 | 40 | # Assert 41 | assert result == 0.6 42 | -------------------------------------------------------------------------------- /tests/unit/column_pairs/test_base.py: 
-------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | from sdmetrics.column_pairs.base import ColumnPairsMetric 4 | 5 | 6 | class TestColumnPairsMetric: 7 | def test_compute_breakdown(self): 8 | """Test the ``compute_breakdown`` method. 9 | 10 | Expect a breakdown dictionary is returned that contains the score. 11 | 12 | Setup: 13 | - Mock the ``compute`` method to return a fake score. 14 | 15 | Input: 16 | - Real data. 17 | - Synthetic data. 18 | 19 | Output: 20 | - The evaluated metric. 21 | """ 22 | # Setup 23 | metric = ColumnPairsMetric() 24 | test_metric_score = 0.5 25 | 26 | # Run 27 | with patch.object(ColumnPairsMetric, 'compute', return_value=test_metric_score): 28 | result = metric.compute_breakdown(Mock(), Mock()) 29 | 30 | # Assert 31 | assert result == {'score': test_metric_score} 32 | -------------------------------------------------------------------------------- /tests/unit/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the multi_table module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/multi_table/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the multi_table statistical module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the reports module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the reports multi_table module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the reports multi_table _properties module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_boundary.py: -------------------------------------------------------------------------------- 1 | """Test Boundary multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import Boundary 4 | from sdmetrics.reports.single_table._properties import Boundary as SingleTableBoundary 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | boundary = Boundary() 11 | 12 | # Assert 13 | assert boundary._properties == {} 14 | assert boundary._single_table_property == SingleTableBoundary 15 | assert boundary._num_iteration_case == 'column' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_column_pair_trends.py: -------------------------------------------------------------------------------- 1 | """Test ColumnPairTrends multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import ColumnPairTrends 4 | from sdmetrics.reports.single_table._properties import ( 5 | ColumnPairTrends as SingleTableColumnPairTrends, 6 | ) 7 | 8 | 9 | def test__init__(): 10 | """Test the ``__init__`` 
method.""" 11 | # Setup 12 | column_pair_trends = ColumnPairTrends() 13 | 14 | # Assert 15 | assert column_pair_trends._properties == {} 16 | assert column_pair_trends._single_table_property == SingleTableColumnPairTrends 17 | assert column_pair_trends._num_iteration_case == 'column_pair' 18 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_column_shapes.py: -------------------------------------------------------------------------------- 1 | """Test ColumnShapes multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import ColumnShapes 4 | from sdmetrics.reports.single_table._properties import ColumnShapes as SingleTableColumnShapes 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | column_shapes = ColumnShapes() 11 | 12 | # Assert 13 | assert column_shapes._properties == {} 14 | assert column_shapes._single_table_property == SingleTableColumnShapes 15 | assert column_shapes._num_iteration_case == 'column' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_coverage.py: -------------------------------------------------------------------------------- 1 | """Test Coverage multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import Coverage 4 | from sdmetrics.reports.single_table._properties import Coverage as SingleTableCoverage 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | coverage = Coverage() 11 | 12 | # Assert 13 | assert coverage._properties == {} 14 | assert coverage._single_table_property == SingleTableCoverage 15 | assert coverage._num_iteration_case == 'column' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_synthesis.py: -------------------------------------------------------------------------------- 1 | """Test Synthesis multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import Synthesis 4 | from sdmetrics.reports.single_table._properties import Synthesis as SingleTableSynthesis 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | synthesis = Synthesis() 11 | 12 | # Assert 13 | assert synthesis._properties == {} 14 | assert synthesis._single_table_property == SingleTableSynthesis 15 | assert synthesis._num_iteration_case == 'table' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_validity.py: -------------------------------------------------------------------------------- 1 | """Test Data Validity multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import DataValidity 4 | from sdmetrics.reports.single_table._properties import DataValidity as SingleTableDataValidity 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | column_shapes = DataValidity() 11 | 12 | # Assert 13 | assert column_shapes._properties == {} 14 | assert column_shapes._single_table_property == SingleTableDataValidity 15 | assert column_shapes._num_iteration_case == 'column' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/test_diagnostic_report.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.reports.multi_table 
import DiagnosticReport 2 | from sdmetrics.reports.multi_table._properties import DataValidity, RelationshipValidity, Structure 3 | 4 | 5 | class TestDiagnosticReport: 6 | def test___init__(self): 7 | """Test the ``__init__`` method.""" 8 | # Setup 9 | report = DiagnosticReport() 10 | 11 | # Assert 12 | assert report._overall_score is None 13 | assert report.is_generated is False 14 | assert report.table_names == [] 15 | assert isinstance(report._properties['Data Validity'], DataValidity) 16 | assert isinstance(report._properties['Data Structure'], Structure) 17 | assert isinstance(report._properties['Relationship Validity'], RelationshipValidity) 18 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/test_quality_report.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.reports.multi_table import QualityReport 2 | from sdmetrics.reports.multi_table._properties import ( 3 | Cardinality, 4 | ColumnPairTrends, 5 | ColumnShapes, 6 | InterTableTrends, 7 | ) 8 | 9 | 10 | class TestQualityReport: 11 | def test___init__(self): 12 | """Test the ``__init__`` method.""" 13 | # Setup 14 | report = QualityReport() 15 | 16 | # Assert 17 | assert report._overall_score is None 18 | assert report.is_generated is False 19 | assert report.table_names == [] 20 | assert isinstance(report._properties['Column Shapes'], ColumnShapes) 21 | assert isinstance(report._properties['Column Pair Trends'], ColumnPairTrends) 22 | assert isinstance(report._properties['Cardinality'], Cardinality) 23 | assert isinstance(report._properties['Intertable Trends'], InterTableTrends) 24 | -------------------------------------------------------------------------------- /tests/unit/reports/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the reports single_table module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/single_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """Test package for single table properties.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/single_table/test_diagnostic_report.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.reports.single_table import DiagnosticReport 2 | from sdmetrics.reports.single_table._properties import DataValidity, Structure 3 | 4 | 5 | class TestDiagnosticReport: 6 | def test___init__(self): 7 | """Test the ``__init__`` method.""" 8 | # Run 9 | report = DiagnosticReport() 10 | 11 | # Assert 12 | assert report._overall_score is None 13 | assert report.is_generated is False 14 | assert isinstance(report._properties['Data Validity'], DataValidity) 15 | assert isinstance(report._properties['Data Structure'], Structure) 16 | -------------------------------------------------------------------------------- /tests/unit/reports/single_table/test_quality_report.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.reports.single_table import QualityReport 2 | from sdmetrics.reports.single_table._properties import ColumnPairTrends, ColumnShapes 3 | 4 | 5 | class TestQualityReport: 6 | def test___init__(self): 7 | """Test the ``__init__`` method.""" 8 | # Run 9 | report = QualityReport() 10 | 11 | 
# Assert 12 | assert report._overall_score is None 13 | assert not report.is_generated 14 | assert isinstance(report._properties['Column Shapes'], ColumnShapes) 15 | assert isinstance(report._properties['Column Pair Trends'], ColumnPairTrends) 16 | -------------------------------------------------------------------------------- /tests/unit/single_column/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the single column module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the statistical single column metrics.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_category_adherence.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sdmetrics.single_column.statistical import CategoryAdherence 7 | 8 | 9 | class TestCategoryAdherence: 10 | def test_compute_breakdown(self): 11 | """Test the ``compute_breakdown`` method.""" 12 | # Setup 13 | real_data = pd.Series(['A', 'B', 'C', 'B', 'A']) 14 | synthetic_data = pd.Series(['A', 'B', 'C', 'D', 'E']) 15 | 16 | metric = CategoryAdherence() 17 | 18 | # Run 19 | result = metric.compute_breakdown(real_data, synthetic_data) 20 | 21 | # Assert 22 | assert result == {'score': 0.6} 23 | 24 | def test_compute_breakdown_with_nans(self): 25 | """Test the ``compute_breakdown`` method with NaNs.""" 26 | # Setup 27 | real_data = pd.Series(['A', 'B', 'C', 'B', 'A', None]) 28 | synthetic_data = pd.Series(['A', 'B', np.nan, 'C', np.nan, 'B', 'A', None, 'D', 'C']) 29 | 30 | metric = CategoryAdherence() 31 | 32 | # Run 33 | result = metric.compute_breakdown(real_data, synthetic_data) 34 | 35 | # Assert 36 | assert result == {'score': 0.9} 37 | 38 | @patch( 39 | 'sdmetrics.single_column.statistical.category_adherence.CategoryAdherence.compute_breakdown' 40 | ) 41 | def test_compute(self, compute_breakdown_mock): 42 | """Test the ``compute`` method.""" 43 | # Setup 44 | real_data = pd.Series(['A', 'B', 'C', 'B', 'A']) 45 | synthetic_data = pd.Series(['A', 'B', 'C', 'D', 'E']) 46 | metric = CategoryAdherence() 47 | compute_breakdown_mock.return_value = {'score': 0.6} 48 | 49 | # Run 50 | result = metric.compute(real_data, synthetic_data) 51 | 52 | # Assert 53 | compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data) 54 | assert result == 0.6 55 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_category_coverage.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.single_column.statistical import CategoryCoverage 6 | 7 | 8 | class TestCategoryCoverage: 9 | def test_compute_breakdown(self): 10 | """Test the ``compute_breakdown`` method. 11 | 12 | Expect that the number of unique categories is computed for both real and synthetic data. 13 | 14 | Input: 15 | - Real data. 16 | - Synthetic data. 17 | 18 | Output: 19 | - A mapping of the metric results, containing the score and the real and synthetic results. 
20 | """ 21 | # Setup 22 | real_data = pd.Series(['a', 'b', 'a', 'b', 'c']) 23 | synthetic_data = pd.Series(['a', 'a', 'a', 'b', 'b']) 24 | 25 | metric = CategoryCoverage() 26 | 27 | # Run 28 | result = metric.compute_breakdown(real_data, synthetic_data) 29 | 30 | # Assert 31 | assert result == {'score': 2 / 3, 'real': 3, 'synthetic': 2} 32 | 33 | def test_compute_breakdown_missing_categories(self): 34 | """Test the ``compute_breakdown`` method with missing categorical values. 35 | 36 | Expect that the number of unique categories is computed for both real and synthetic data. 37 | """ 38 | # Setup 39 | real_data = pd.Series(['a', 'b', 'a', 'b', 'c']) 40 | synthetic_data = pd.Series(['d', 'e', 'f', 'f', 'e']) 41 | 42 | metric = CategoryCoverage() 43 | 44 | # Run 45 | result = metric.compute_breakdown(real_data, synthetic_data) 46 | 47 | # Assert 48 | assert result == {'score': 0, 'real': 3, 'synthetic': 0} 49 | 50 | def test_compute(self): 51 | """Test the ``compute`` method. 52 | 53 | Expect that the number of unique categories is computed for both real and synthetic data. 54 | 55 | Setup: 56 | - Patch the ``compute_breakdown`` method to return a mapping of the metric results. 57 | 58 | Input: 59 | - Real data. 60 | - Synthetic data. 61 | 62 | Output: 63 | - The evaluated metric. 64 | """ 65 | # Setup 66 | metric_breakdown = {'score': 2 / 3, 'real': 3, 'synthetic': 2} 67 | 68 | metric = CategoryCoverage() 69 | 70 | # Run 71 | with patch.object(CategoryCoverage, 'compute_breakdown', return_value=metric_breakdown): 72 | result = metric.compute(Mock(), Mock()) 73 | 74 | # Assert 75 | assert result == 2 / 3 76 | 77 | @patch('sdmetrics.single_column.statistical.category_coverage.SingleColumnMetric.normalize') 78 | def test_normalize(self, normalize_mock): 79 | """Test the ``normalize`` method. 80 | 81 | Expect that the inherited ``normalize`` method is called. 82 | 83 | Input: 84 | - Raw score 85 | 86 | Output: 87 | - The output of the inherited ``normalize`` method. 
88 | """ 89 | # Setup 90 | metric = CategoryCoverage() 91 | raw_score = 0.9 92 | 93 | # Run 94 | result = metric.normalize(raw_score) 95 | 96 | # Assert 97 | normalize_mock.assert_called_once_with(raw_score) 98 | assert result == normalize_mock.return_value 99 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_key_uniqueness.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sdmetrics.single_column.statistical import KeyUniqueness 7 | 8 | 9 | class TestKeyUniqueness: 10 | def test_compute_breakdown(self): 11 | """Test the ``compute_breakdown`` method.""" 12 | # Setup 13 | real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) 14 | synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None]) 15 | 16 | metric = KeyUniqueness() 17 | 18 | # Run 19 | result = metric.compute_breakdown(real_data, synthetic_data) 20 | 21 | # Assert 22 | assert result == {'score': 0.5} 23 | 24 | @patch('sdmetrics.single_column.statistical.key_uniqueness.LOGGER') 25 | def test_compute_breakdown_with_duplicates_in_real_data(self, logger_mock): 26 | """Test the ``compute_breakdown`` method with duplicates in the real data.""" 27 | # Setup 28 | real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2) 29 | synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None]) 30 | metric = KeyUniqueness() 31 | 32 | # Run 33 | metric.compute_breakdown(real_data, synthetic_data) 34 | 35 | # Assert 36 | expected_message = 'The real data contains NA or duplicate values.' 37 | logger_mock.info.assert_called_once_with(expected_message) 38 | 39 | @patch('sdmetrics.single_column.statistical.key_uniqueness.KeyUniqueness.compute_breakdown') 40 | def test_compute(self, compute_breakdown_mock): 41 | """Test the ``compute`` method.""" 42 | # Setup 43 | real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) 44 | synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None]) 45 | metric = KeyUniqueness() 46 | compute_breakdown_mock.return_value = {'score': 0.6} 47 | 48 | # Run 49 | result = metric.compute(real_data, synthetic_data) 50 | 51 | # Assert 52 | compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data) 53 | assert result == 0.6 54 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_missing_value_similarity.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sdmetrics.single_column.statistical import MissingValueSimilarity 7 | 8 | 9 | class TestMissingValueSimilarity: 10 | def test_compute_breakdown(self): 11 | """Test the ``compute_breakdown`` method. 12 | 13 | Expect that the number of missing values is computed for both real and synthetic data. 14 | 15 | Input: 16 | - Real data. 17 | - Synthetic data. 18 | 19 | Output: 20 | - A mapping of the metric results, containing the score and the real and synthetic results. 
21 | """ 22 | # Setup 23 | real_data = pd.Series([1.0, np.nan, 2.6, 0.8]) 24 | synthetic_data = pd.Series([0.9, 1.8, None, None]) 25 | 26 | metric = MissingValueSimilarity() 27 | 28 | # Run 29 | result = metric.compute_breakdown(real_data, synthetic_data) 30 | 31 | # Assert 32 | assert result == {'score': 0.75, 'real': 0.25, 'synthetic': 0.5} 33 | 34 | def test_compute(self): 35 | """Test the ``compute`` method. 36 | 37 | Expect that the number of missing values is computed for both real and synthetic data. 38 | 39 | Setup: 40 | - Patch the ``compute_breakdown`` method to return a mapping of the metric results. 41 | 42 | Input: 43 | - Real data. 44 | - Synthetic data. 45 | 46 | Output: 47 | - The evaluated metric. 48 | """ 49 | # Setup 50 | metric_breakdown = {'score': 0.75, 'real': 0.25, 'synthetic': 0.75} 51 | 52 | metric = MissingValueSimilarity() 53 | 54 | # Run 55 | with patch.object( 56 | MissingValueSimilarity, 57 | 'compute_breakdown', 58 | return_value=metric_breakdown, 59 | ): 60 | result = metric.compute(Mock(), Mock()) 61 | 62 | # Assert 63 | assert result == 0.75 64 | 65 | @patch( 66 | 'sdmetrics.single_column.statistical.missing_value_similarity.SingleColumnMetric.normalize' 67 | ) 68 | def test_normalize(self, normalize_mock): 69 | """Test the ``normalize`` method. 70 | 71 | Expect that the inherited ``normalize`` method is called. 72 | 73 | Input: 74 | - Raw score 75 | 76 | Output: 77 | - The output of the inherited ``normalize`` method. 78 | """ 79 | # Setup 80 | metric = MissingValueSimilarity() 81 | raw_score = 0.9 82 | 83 | # Run 84 | result = metric.normalize(raw_score) 85 | 86 | # Assert 87 | normalize_mock.assert_called_once_with(raw_score) 88 | assert result == normalize_mock.return_value 89 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_sequence_length_similarity.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sdmetrics.single_column import SequenceLengthSimilarity 4 | 5 | 6 | class TestSequenceLengthSimilarity: 7 | def test_compute_breakdown(self): 8 | """Test `compute_breakdown` works.""" 9 | # Setup 10 | real_data = pd.Series([1, 1, 2, 2, 2]) 11 | synthetic_data = pd.Series([3, 4, 5, 6, 6]) 12 | 13 | metric = SequenceLengthSimilarity() 14 | 15 | # Run 16 | result = metric.compute_breakdown(real_data, synthetic_data) 17 | 18 | # Assert 19 | assert result == {'score': 0.25} 20 | 21 | def test_compute(self): 22 | """Test it runs.""" 23 | # Setup 24 | real_data = pd.Series(['id1', 'id2', 'id2', 'id3']) 25 | synthetic_data = pd.Series(['id4', 'id5', 'id6']) 26 | 27 | # Run 28 | score = SequenceLengthSimilarity.compute(real_data, synthetic_data) 29 | 30 | # Assert 31 | assert score == 0.6666666666666667 32 | 33 | def test_compute_one(self): 34 | """Test it returns 1 when real and synthetic data have the same distribution.""" 35 | # Setup 36 | real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3']) 37 | synthetic_data = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6']) 38 | 39 | # Run 40 | score = SequenceLengthSimilarity.compute(real_data, synthetic_data) 41 | 42 | # Assert 43 | assert score == 1 44 | 45 | def test_compute_low_score(self): 46 | """Test it for distinct distributions.""" 47 | # Setup 48 | real_data = pd.Series([f'id{i}' for i in range(100)]) 49 | synthetic_data = pd.Series(['id100'] * 100) 50 | 51 | # Run 52 | score = SequenceLengthSimilarity.compute(real_data, synthetic_data) 53 | 54 | # 
Assert 55 | assert score == 0 56 | -------------------------------------------------------------------------------- /tests/unit/single_column/test_base.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | from sdmetrics.single_column.base import SingleColumnMetric 4 | 5 | 6 | class TestSingleColumnMetric: 7 | def test_compute_breakdown(self): 8 | """Test the ``compute_breakdown`` method. 9 | 10 | Expect a breakdown dictionary is returned that contains the score. 11 | 12 | Setup: 13 | - Mock the ``compute`` method to return a fake score. 14 | 15 | Input: 16 | - Real data. 17 | - Synthetic data. 18 | 19 | Output: 20 | - The evaluated metric. 21 | """ 22 | # Setup 23 | metric = SingleColumnMetric() 24 | test_metric_score = 0.5 25 | 26 | # Run 27 | with patch.object(SingleColumnMetric, 'compute', return_value=test_metric_score): 28 | result = metric.compute_breakdown(Mock(), Mock()) 29 | 30 | # Assert 31 | assert result == {'score': test_metric_score} 32 | -------------------------------------------------------------------------------- /tests/unit/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the single_table module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_table/data_augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the single_table data_augmentation module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_table/detection/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the single_table detection module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_table/privacy/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the single_table privacy module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_table/privacy/test_cap.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from sdmetrics.single_table.privacy.cap import ( 7 | CategoricalCAP, 8 | CategoricalGeneralizedCAP, 9 | CategoricalZeroCAP, 10 | ) 11 | 12 | 13 | @pytest.mark.parametrize('metric', [CategoricalCAP, CategoricalZeroCAP, CategoricalGeneralizedCAP]) 14 | def test_CAP_deprecation_message(metric): 15 | """Test deprecation warning is raised when running the metric directly.""" 16 | # Setup 17 | real_data = pd.DataFrame({'col1': range(5), 'col2': ['A', 'B', 'C', 'A', 'B']}) 18 | synthetic_data = pd.DataFrame({'col1': range(5), 'col2': ['C', 'A', 'A', 'B', 'C']}) 19 | 20 | # Run and Assert 21 | expected_warning = re.escape( 22 | 'Computing CAP metrics directly is deprecated. For improved privacy metrics, ' 23 | "please use the 'DisclosureProtection' and 'DisclosureProtectionEstimate' " 24 | 'metrics instead.' 
25 | ) 26 | with pytest.warns(DeprecationWarning, match=expected_warning): 27 | metric.compute(real_data, synthetic_data, key_fields=['col1'], sensitive_fields=['col2']) 28 | -------------------------------------------------------------------------------- /tests/unit/single_table/privacy/test_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | from sdmetrics.single_table.privacy.util import ( 6 | closest_neighbors, 7 | validate_num_samples_num_iteration, 8 | ) 9 | 10 | 11 | def test_closest_neighbors_exact(): 12 | samples = [ 13 | ('a', '1'), 14 | ('a', '2'), 15 | ('a', '3'), 16 | ('b', '1'), 17 | ('b', '2'), 18 | ('b', '3'), 19 | ] 20 | target = ('a', '2') 21 | results = closest_neighbors(samples, target) 22 | assert len(results) == 1 23 | assert results[0] == ('a', '2') 24 | 25 | 26 | def test_closest_neighbors_non_exact(): 27 | samples = [ 28 | ('a', '1'), 29 | ('a', '3'), 30 | ('b', '1'), 31 | ('b', '2'), 32 | ('b', '3'), 33 | ] 34 | target = ('a', '2') 35 | results = closest_neighbors(samples, target) 36 | assert len(results) == 3 37 | assert ('a', '1') in results 38 | assert ('a', '3') in results 39 | assert ('b', '2') in results 40 | 41 | 42 | def test_validate_num_samples_num_iteration(): 43 | # Run and Assert 44 | num_subsample_error_post = re.escape('must be an integer greater than 1.') 45 | 46 | with pytest.raises(ValueError, match=num_subsample_error_post): 47 | validate_num_samples_num_iteration(0, 1) 48 | 49 | with pytest.raises(ValueError, match=num_subsample_error_post): 50 | validate_num_samples_num_iteration('X', 1) 51 | 52 | subsample_none_msg = re.escape( 53 | 'num_iterations should not be greater than 1 if there is no subsampling.' 54 | ) 55 | num_iterations = 3 56 | with pytest.raises(ValueError, match=subsample_none_msg): 57 | validate_num_samples_num_iteration(None, num_iterations) 58 | 59 | zero_iteration_msg = re.escape('num_iterations (0) must be an integer greater than 1.') 60 | with pytest.raises(ValueError, match=zero_iteration_msg): 61 | validate_num_samples_num_iteration(1, 0) 62 | -------------------------------------------------------------------------------- /tests/unit/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sdmetrics.base import BaseMetric 4 | from sdmetrics.goal import Goal 5 | 6 | 7 | class TestBaseMetric: 8 | def test_normalize_bounded(self): 9 | BaseMetric.max_value = 1 10 | BaseMetric.min_value = -1 11 | BaseMetric.goal = Goal.MAXIMIZE 12 | 13 | raw_score = 0 14 | normalized = BaseMetric.normalize(raw_score) 15 | 16 | assert normalized == 0.5 17 | 18 | def test_normalize_high_bound(self): 19 | BaseMetric.max_value = 1 20 | BaseMetric.min_value = float('-inf') 21 | BaseMetric.goal = Goal.MAXIMIZE 22 | 23 | raw_score = 1 24 | normalized = BaseMetric.normalize(raw_score) 25 | 26 | assert normalized == 1 27 | 28 | def test_normalize_low_bound(self): 29 | BaseMetric.max_value = float('inf') 30 | BaseMetric.min_value = -1 31 | BaseMetric.goal = Goal.MAXIMIZE 32 | 33 | raw_score = -1 34 | normalized = BaseMetric.normalize(raw_score) 35 | 36 | assert normalized == 0 37 | 38 | def test_normalize_unbounded(self): 39 | BaseMetric.max_value = float('inf') 40 | BaseMetric.min_value = float('-inf') 41 | BaseMetric.goal = Goal.MAXIMIZE 42 | 43 | raw_score = 0 44 | normalized = BaseMetric.normalize(raw_score) 45 | 46 | assert normalized == 0.5 47 | 48 | def test_normalize_minimize(self): 49 | 
BaseMetric.max_value = 1 50 | BaseMetric.min_value = -1 51 | BaseMetric.goal = Goal.MINIMIZE 52 | 53 | raw_score = 1 54 | normalized = BaseMetric.normalize(raw_score) 55 | 56 | assert normalized == 0 57 | 58 | def test_normalize_out_of_bounds(self): 59 | BaseMetric.max_value = 1 60 | BaseMetric.min_value = -1 61 | BaseMetric.goal = Goal.MAXIMIZE 62 | 63 | raw_score = 2 64 | error_msg = '`raw_score` must be between `min_value` and `max_value`.' 65 | with pytest.raises(ValueError, match=error_msg): 66 | BaseMetric.normalize(raw_score) 67 | -------------------------------------------------------------------------------- /tests/unit/test_demos.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.demos import load_demo 2 | 3 | 4 | def test_load_single_table_demo(): 5 | """Test loading the single table demo data and expect the correct demo data to be returned.""" 6 | # Setup 7 | modality = 'single_table' 8 | 9 | # Run 10 | real_data, synthetic_data, metadata = load_demo(modality) 11 | 12 | # Assert 13 | assert metadata['columns']['duration'] == { 14 | 'sdtype': 'numerical', 15 | 'computer_representation': 'Int64', 16 | } 17 | assert real_data['duration'].dtype == 'float64' 18 | assert synthetic_data['duration'].dtype == 'float64' 19 | 20 | 21 | def test_load_multi_table_demo(): 22 | """Test loading the multi table demo data and expect the correct demo data to be returned.""" 23 | # Setup 24 | modality = 'multi_table' 25 | 26 | # Run 27 | real_data, synthetic_data, metadata = load_demo(modality) 28 | 29 | # Assert 30 | assert metadata['tables']['transactions']['columns']['timestamp'] == { 31 | 'sdtype': 'datetime', 32 | 'datetime_format': '%Y-%m-%d %H:%M:%S', 33 | } 34 | assert real_data['transactions']['timestamp'].dtype == 'datetime64[ns]' 35 | assert synthetic_data['transactions']['timestamp'].dtype == 'datetime64[ns]' 36 | -------------------------------------------------------------------------------- /tests/unit/timeseries/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the timeseries module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/timeseries/test_timeseries.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from sdmetrics.timeseries.base import TimeSeriesMetric 7 | 8 | 9 | def test__validate_inputs_for_TimeSeriesMetric(): 10 | """Test that ``_validate_inputs`` crashes when the datetime column doesn't match the metadata.""" 11 | # Setup 12 | df1 = pd.DataFrame({ 13 | 's_key': [1, 2, 3], 14 | 'visits': pd.to_datetime(['1/1/2019', '1/2/2019', '1/3/2019']), 15 | }) 16 | df1['visits'] = df1['visits'].dt.date 17 | df2 = pd.DataFrame({ 18 | 's_key': [1, 2, 3], 19 | 'visits': ['not', 'a', 'datetime'], 20 | }) 21 | metadata = { 22 | 'columns': { 23 | 's_key': {'sdtype': 'numerical'}, 24 | 'visits': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %H:%M:%S'}, 25 | }, 26 | 'sequence_key': 's_key', 27 | } 28 | 29 | # Run and Assert 30 | expected_msg = re.escape("Error converting column 'visits' to timestamp: ") 31 | with pytest.raises(ValueError, match=expected_msg): 32 | TimeSeriesMetric._validate_inputs( 33 | real_data=df1, synthetic_data=df2, sequence_key=['s_key'], metadata=metadata 34 | ) 35 | -------------------------------------------------------------------------------- /tests/utils.py: 
-------------------------------------------------------------------------------- 1 | """Utils for testing.""" 2 | 3 | import pandas as pd 4 | 5 | 6 | class DataFrameMatcher: 7 | """Match a given Pandas DataFrame in a mock function call.""" 8 | 9 | def __init__(self, df): 10 | """Initialize the DataFrame.""" 11 | self.df = df 12 | 13 | def __eq__(self, other): 14 | """Assert equality using pandas testing module.""" 15 | pd.testing.assert_frame_equal(self.df, other) 16 | return True 17 | 18 | 19 | class SeriesMatcher: 20 | """Match a given Pandas Series in a mock function call.""" 21 | 22 | def __init__(self, data): 23 | """Initialize the Series.""" 24 | self.data = data 25 | 26 | def __eq__(self, other): 27 | """Assert equality using pandas testing module.""" 28 | pd.testing.assert_series_equal(self.data, other) 29 | return True 30 | 31 | 32 | class IteratorMatcher: 33 | """Match a given iterator in a mock function call.""" 34 | 35 | def __init__(self, iterator): 36 | """Initialize the iterator.""" 37 | self.iterator = iterator 38 | 39 | def __eq__(self, other): 40 | """Assert equality by expanding the iterator.""" 41 | assert all(x == y for x, y in zip(self.iterator, other)) 42 | return True 43 | 44 | 45 | def get_error_type(error): 46 | if error is not None: 47 | colon_index = error.find(':') 48 | return error[:colon_index] 49 | return None 50 | 51 | 52 | def check_if_value_in_threshold(value, expected_value, threshold): 53 | assert abs(value - expected_value) < threshold 54 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py39-lint, py3{8,9,10,11,12,13}-{readme,integration,unit,minimum} 3 | 4 | [testenv] 5 | skipsdist = false 6 | skip_install = false 7 | deps = 8 | invoke 9 | readme: rundoc 10 | extras = 11 | lint: dev 12 | unit: test 13 | integration: test 14 | minimum: test 15 | commands = 16 | lint: invoke lint 17 | readme: invoke readme 18 | unit: invoke unit 19 | integration: invoke integration 20 | minimum: invoke minimum 21 | invoke rmdir --path {envdir} 22 | --------------------------------------------------------------------------------